@amaster.ai/pi-computer-use 0.1.2-beta.2 → 0.1.2-beta.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +136 -0
- package/bin/darwin-arm64/.version +2 -0
- package/bin/darwin-arm64/CuaDriver.app/Contents/CodeResources +0 -0
- package/bin/darwin-arm64/CuaDriver.app/Contents/Info.plist +32 -0
- package/bin/darwin-arm64/CuaDriver.app/Contents/MacOS/cua-driver +0 -0
- package/bin/darwin-arm64/CuaDriver.app/Contents/Resources/Skills/cua-driver/README.md +140 -0
- package/bin/darwin-arm64/CuaDriver.app/Contents/Resources/Skills/cua-driver/RECORDING.md +113 -0
- package/bin/darwin-arm64/CuaDriver.app/Contents/Resources/Skills/cua-driver/SKILL.md +887 -0
- package/bin/darwin-arm64/CuaDriver.app/Contents/Resources/Skills/cua-driver/TESTS.md +232 -0
- package/bin/darwin-arm64/CuaDriver.app/Contents/Resources/Skills/cua-driver/WEB_APPS.md +471 -0
- package/bin/darwin-arm64/CuaDriver.app/Contents/_CodeSignature/CodeResources +172 -0
- package/bin/darwin-x64/.version +2 -0
- package/bin/darwin-x64/CuaDriver.app/Contents/CodeResources +0 -0
- package/bin/darwin-x64/CuaDriver.app/Contents/Info.plist +32 -0
- package/bin/darwin-x64/CuaDriver.app/Contents/MacOS/cua-driver +0 -0
- package/bin/darwin-x64/CuaDriver.app/Contents/Resources/Skills/cua-driver/README.md +140 -0
- package/bin/darwin-x64/CuaDriver.app/Contents/Resources/Skills/cua-driver/RECORDING.md +113 -0
- package/bin/darwin-x64/CuaDriver.app/Contents/Resources/Skills/cua-driver/SKILL.md +887 -0
- package/bin/darwin-x64/CuaDriver.app/Contents/Resources/Skills/cua-driver/TESTS.md +232 -0
- package/bin/darwin-x64/CuaDriver.app/Contents/Resources/Skills/cua-driver/WEB_APPS.md +471 -0
- package/bin/darwin-x64/CuaDriver.app/Contents/_CodeSignature/CodeResources +172 -0
- package/bin/linux-x64/.version +2 -0
- package/bin/linux-x64/cua-driver +0 -0
- package/bin/win32-arm64/.version +2 -0
- package/bin/win32-arm64/cua-driver-uia.exe +0 -0
- package/bin/win32-arm64/cua-driver.exe +0 -0
- package/bin/win32-x64/.version +2 -0
- package/bin/win32-x64/cua-driver-uia.exe +0 -0
- package/bin/win32-x64/cua-driver.exe +0 -0
- package/dist/config.d.ts +5 -19
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +1 -6
- package/dist/config.js.map +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +578 -71
- package/dist/index.js.map +1 -1
- package/dist/mcp-client.d.ts +22 -0
- package/dist/mcp-client.d.ts.map +1 -0
- package/dist/mcp-client.js +91 -0
- package/dist/mcp-client.js.map +1 -0
- package/dist/vision.d.ts.map +1 -1
- package/dist/vision.js +19 -0
- package/dist/vision.js.map +1 -1
- package/package.json +20 -6
- package/preview.png +0 -0
- package/scripts/postinstall.js +29 -0
- package/dist/__tests__/computer-client.test.d.ts +0 -2
- package/dist/__tests__/computer-client.test.d.ts.map +0 -1
- package/dist/__tests__/computer-client.test.js +0 -174
- package/dist/__tests__/computer-client.test.js.map +0 -1
- package/dist/__tests__/index.test.d.ts +0 -2
- package/dist/__tests__/index.test.d.ts.map +0 -1
- package/dist/__tests__/index.test.js +0 -385
- package/dist/__tests__/index.test.js.map +0 -1
- package/dist/__tests__/server-process.test.d.ts +0 -2
- package/dist/__tests__/server-process.test.d.ts.map +0 -1
- package/dist/__tests__/server-process.test.js +0 -127
- package/dist/__tests__/server-process.test.js.map +0 -1
- package/dist/__tests__/vision.test.d.ts +0 -2
- package/dist/__tests__/vision.test.d.ts.map +0 -1
- package/dist/__tests__/vision.test.js +0 -36
- package/dist/__tests__/vision.test.js.map +0 -1
- package/dist/actions.d.ts +0 -15
- package/dist/actions.d.ts.map +0 -1
- package/dist/actions.js +0 -45
- package/dist/actions.js.map +0 -1
- package/dist/computer-client.d.ts +0 -13
- package/dist/computer-client.d.ts.map +0 -1
- package/dist/computer-client.js +0 -109
- package/dist/computer-client.js.map +0 -1
- package/dist/server-process.d.ts +0 -9
- package/dist/server-process.d.ts.map +0 -1
- package/dist/server-process.js +0 -76
- package/dist/server-process.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -1,103 +1,610 @@
|
|
|
1
1
|
import { Type } from 'typebox';
|
|
2
|
-
import { dispatchAction } from './actions.js';
|
|
3
|
-
import { ComputerClient } from './computer-client.js';
|
|
4
2
|
import { loadConfigFromFile, resolveConfig } from './config.js';
|
|
5
|
-
import {
|
|
3
|
+
import { CuaDriverClient } from './mcp-client.js';
|
|
6
4
|
import { createPiVisionCaller } from './vision.js';
|
|
7
5
|
export { loadConfigFromFile, resolveConfig };
|
|
6
|
+
/** Prefix prepended to the name of every tool this extension registers. */
const TOOL_PREFIX = 'computer_use_';

/** Value of `process.platform`, read once when the module is evaluated. */
const { platform: PLATFORM } = process;
|
|
8
|
+
/**
 * Build a generic, platform-specific troubleshooting hint for permission
 * problems.
 *
 * @param {string} [platform=PLATFORM] - Platform identifier in the form used by
 *   `process.platform`. Defaults to the module-level `PLATFORM` constant, so
 *   existing zero-argument callers behave exactly as before. The default is
 *   only evaluated when the argument is omitted.
 * @returns {string} Hint text: a macOS message for `'darwin'`, a Windows
 *   message for `'win32'`, and a display-server message for anything else.
 */
function permissionHint(platform = PLATFORM) {
    switch (platform) {
        case 'darwin':
            return 'Check that Accessibility and Screen Recording permissions are granted in System Settings → Privacy & Security.';
        case 'win32':
            return 'Try running the application as Administrator, or check that UI Automation access is not blocked by security software.';
        default:
            // Every other platform value falls through to the X11/Wayland wording.
            return 'Check that the process has access to the display server (X11/Wayland) and required input permissions are configured.';
    }
}
|
|
18
|
+
/**
 * Build the platform-specific message shown when accessibility / input
 * automation access is missing.
 *
 * @param {string} [platform=PLATFORM] - Platform identifier in the form used by
 *   `process.platform`. Defaults to the module-level `PLATFORM` constant, so
 *   existing zero-argument callers behave exactly as before. The default is
 *   only evaluated when the argument is omitted.
 * @returns {string} Message text: macOS wording for `'darwin'`, Windows wording
 *   for `'win32'`, and AT-SPI wording for anything else.
 */
function accessibilityHint(platform = PLATFORM) {
    switch (platform) {
        case 'darwin':
            return 'Accessibility permission not granted. The user needs to enable it in System Settings → Privacy & Security → Accessibility, then restart the app.';
        case 'win32':
            return 'UI Automation access denied. Try running the application as Administrator.';
        default:
            // Every other platform value falls through to the AT-SPI wording.
            return 'Input automation access denied. Check that AT-SPI or equivalent accessibility service is available.';
    }
}
|
|
28
|
+
/**
 * Build the platform-specific message shown when screen capture is not
 * permitted or fails.
 *
 * @param {string} [platform=PLATFORM] - Platform identifier in the form used by
 *   `process.platform`. Defaults to the module-level `PLATFORM` constant, so
 *   existing zero-argument callers behave exactly as before. The default is
 *   only evaluated when the argument is omitted.
 * @returns {string} Message text: macOS wording for `'darwin'`, Windows wording
 *   for `'win32'`, and compositor wording for anything else.
 */
function screenRecordingHint(platform = PLATFORM) {
    switch (platform) {
        case 'darwin':
            return 'Screen Recording permission not granted. The user needs to enable it in System Settings → Privacy & Security → Screen & System Audio Recording, then restart the app.';
        case 'win32':
            return 'Screen capture failed. Try running the application as Administrator, or check that screen capture is not blocked by DRM or security policy.';
        default:
            // Every other platform value falls through to the PipeWire/X11 wording.
            return 'Screen capture failed. Check that the compositor allows screen capture (PipeWire portal or X11 access).';
    }
}
|
|
38
|
+
/**
 * Tool names that `registerTools` skips: a tool whose `name` is in this set is
 * not registered with pi. Insertion order matches the original listing.
 */
const EXCLUDED_TOOLS = new Set([
    // Agent-cursor tools.
    ...['set_agent_cursor_enabled', 'set_agent_cursor_motion', 'set_agent_cursor_style', 'get_agent_cursor_state'],
    // Recording / replay tools.
    ...['set_recording', 'get_recording_state', 'replay_trajectory'],
    // Permission and configuration tools.
    ...['check_permissions', 'get_config', 'set_config'],
    // Remaining excluded tools. `screenshot` is still invoked directly by the
    // analyze_screenshot tool via `client.callTool('screenshot', …)`.
    ...['move_cursor', 'zoom', 'type_text_chars', 'page', 'browser_eval', 'screenshot'],
]);
|
|
56
|
+
/**
 * Built-in tool descriptors (`name`, `description`, JSON-Schema `inputSchema`)
 * that are registered when no upstream tool list is available, i.e. when
 * `registerTools(upstreamTools ?? FALLBACK_TOOLS)` falls back to this array.
 *
 * The array is assembled once at module load through small schema factories.
 * Every factory call returns a fresh object, so no schema fragment is shared
 * between two tools, and key order matches the hand-written literal it
 * replaces (relevant if the schemas are serialized).
 */
const FALLBACK_TOOLS = (() => {
    /** Integer property; `description` is omitted entirely when not given. */
    const int = (description) => (description === undefined ? { type: 'integer' } : { type: 'integer', description });
    /** Integer property constrained to [minimum, maximum]. */
    const intBetween = (minimum, maximum, description) => ({ type: 'integer', minimum, maximum, description });
    /** Number property. */
    const num = (description) => ({ type: 'number', description });
    /** String property. */
    const str = (description) => ({ type: 'string', description });
    /** String property restricted to a fixed list of values. */
    const oneOf = (values, description) => ({ type: 'string', enum: values, description });
    /** Boolean property; `description` is omitted entirely when not given. */
    const bool = (description) => (description === undefined ? { type: 'boolean' } : { type: 'boolean', description });
    /** Array-of-strings property. */
    const strList = (description) => ({ type: 'array', items: { type: 'string' }, description });
    /** Tool descriptor; `required` is only emitted when a list is supplied. */
    const tool = (name, description, properties = {}, required) => {
        const inputSchema = { type: 'object', additionalProperties: false, properties };
        if (required !== undefined) {
            inputSchema.required = required;
        }
        return { name, description, inputSchema };
    };

    return [
        tool('click', 'Left-click against a target pid via element_index or x/y coordinates', {
            pid: int('Target process ID'),
            x: num('Window-local screenshot X coordinate'),
            y: num('Window-local screenshot Y coordinate'),
            element_index: int('Element index from last get_window_state'),
            window_id: int('Target window ID. Required for element_index'),
            action: str('AX action: press, show_menu, pick, confirm, cancel, open'),
            modifier: strList('Modifier keys: cmd, shift, option/alt, ctrl'),
            from_zoom: bool('When true, x/y are in last zoom image coordinates'),
        }, ['pid']),
        tool('double_click', 'Double-click at x/y or on an AX element via element_index', {
            pid: int(),
            x: num('Screen X coordinate (pixel path)'),
            y: num('Screen Y coordinate (pixel path)'),
            element_index: int('Element index from last get_window_state'),
            window_id: int('CGWindowID. Required when element_index is used'),
        }, ['pid']),
        tool('right_click', 'Right-click against a target pid via element_index or x/y coordinates', {
            pid: int('Target process ID'),
            x: num('X in window-local screenshot pixels'),
            y: num('Y in window-local screenshot pixels'),
            element_index: int('Element index from last get_window_state. Routes through AXShowMenu'),
            window_id: int('CGWindowID. Required when element_index is used'),
            modifier: strList('Modifier keys held during the right-click (pixel path only)'),
        }, ['pid']),
        tool('type_text', 'Insert text into the target pid via AX or CGEvent fallback', {
            pid: int('Target process ID'),
            text: str('Text to insert at the target cursor'),
            element_index: int('Element index from last get_window_state'),
            window_id: int('CGWindowID. Required when element_index is used'),
            delay_ms: intBetween(0, 200, 'Milliseconds between characters in CGEvent fallback. Default 30'),
        }, ['pid', 'text']),
        tool('press_key', 'Press and release a single key, delivered to the target pid', {
            pid: int(),
            key: str('Key name: return, tab, escape, up, down, left, right, space, delete, etc.'),
            modifiers: strList('Modifier keys: cmd, shift, option/alt, ctrl, fn'),
            element_index: int(),
            window_id: int(),
        }, ['pid', 'key']),
        tool('hotkey', 'Press a combination of keys simultaneously, e.g. ["cmd", "c"] for Copy', {
            pid: int('Target process ID'),
            // Written out in full: the only list property carrying `minItems`.
            keys: {
                type: 'array',
                items: { type: 'string' },
                minItems: 2,
                description: 'Modifier(s) and one non-modifier key, e.g. ["cmd", "c"]',
            },
            window_id: int('When set, uses NSMenu path for native menu key dispatch'),
        }, ['pid', 'keys']),
        tool('scroll', 'Scroll the target pid focused region by synthesized keystrokes', {
            pid: int(),
            direction: oneOf(['up', 'down', 'left', 'right'], 'Scroll direction'),
            amount: intBetween(1, 50, 'Number of keystroke repetitions. Default: 3'),
            by: oneOf(['line', 'page'], 'Scroll granularity. Default: line'),
            element_index: int(),
            window_id: int(),
        }, ['pid', 'direction']),
        tool('drag', 'Press-drag-release gesture from one point to another in window-local pixels', {
            pid: int('Target process ID'),
            from_x: num('Drag-start X in window-local screenshot pixels'),
            from_y: num('Drag-start Y in window-local screenshot pixels'),
            to_x: num('Drag-end X in window-local screenshot pixels'),
            to_y: num('Drag-end Y in window-local screenshot pixels'),
            button: oneOf(['left', 'right', 'middle'], 'Mouse button. Default: left'),
            duration_ms: intBetween(0, 10000, 'Duration of drag path. Default: 500'),
            steps: intBetween(1, 200, 'Intermediate drag events. Default: 20'),
            modifier: strList('Modifier keys held across the gesture'),
            window_id: int(),
            from_zoom: bool(),
        }, ['pid', 'from_x', 'from_y', 'to_x', 'to_y']),
        tool('set_value', 'Set a value on a UI element (popups, sliders, steppers, date pickers)', {
            pid: int(),
            window_id: int('CGWindowID for the window whose get_window_state produced the element_index'),
            element_index: int(),
            value: str('New value. AX will coerce to the element native type'),
        }, ['pid', 'window_id', 'element_index', 'value']),
        tool('get_screen_size', 'Return the logical size of the main display in points plus backing scale factor'),
        tool('get_cursor_position', 'Return the current mouse cursor position in screen points'),
        tool('get_accessibility_tree', 'Return a lightweight desktop snapshot: running apps and visible windows with bounds and z-order'),
        tool('get_window_state', 'Walk an app AX tree and return a Markdown rendering of its UI with actionable element indices', {
            pid: int('Target process ID'),
            window_id: int('Target window ID from list_windows'),
            query: str('Case-insensitive filter for tree_markdown'),
            capture_mode: oneOf(['som', 'vision', 'ax'], 'som=AX+screenshot (default), vision=screenshot only, ax=AX only'),
        }, ['pid', 'window_id']),
        tool('list_windows', 'List all layer-0 top-level windows known to WindowServer', {
            pid: int('Optional pid filter'),
            on_screen_only: bool('When true, drop windows not on current Space. Default false'),
        }),
        tool('list_apps', 'List macOS apps (running and installed) with state flags, pid, bundle_id'),
        tool('launch_app', 'Launch a macOS app in the background without stealing focus', {
            bundle_id: str('App bundle identifier, e.g. com.apple.calculator. Preferred over name'),
            name: str('App display name. Used only when bundle_id is absent'),
            urls: strList('File paths or URLs to open with the app'),
            creates_new_application_instance: bool('Force a new app instance even if already running'),
        }),
        tool('kill_app', 'Force-terminate a process by pid (kill -9 equivalent)', {
            pid: int('PID of the process to terminate'),
        }, ['pid']),
    ];
})();
|
|
8
384
|
/**
 * Extension entry point: wires the cua-driver client into pi.
 *
 * Nothing is connected or registered when this function runs; it only installs
 * two event handlers on `pi`:
 *  - `session_start`: loads the config, creates a `CuaDriverClient`, tries to
 *    connect and list the upstream tools, registers them (or `FALLBACK_TOOLS`
 *    when listing failed) plus the optional vision tool, then runs a
 *    best-effort permission check.
 *  - `session_shutdown`: closes the client if it is connected.
 *
 * @param {object} pi - Extension host API; `pi.on(event, handler)` and
 *   `pi.registerTool(definition)` are the only members used here.
 * @returns {void}
 */
export default function computerUseExtension(pi) {
    let config;
    let client;
    let connected = false;
    // In-flight connect attempt, shared so that concurrent tool calls made
    // while disconnected trigger a single `client.connect()` rather than one each.
    let connecting;

    /** Start one connect attempt on the current client and publish it in `connecting`. */
    function startConnect() {
        const target = client;
        const attempt = (async () => {
            await target.connect();
            // Ignore a late result that belongs to a client which has since been replaced.
            if (client === target) {
                connected = true;
            }
        })();
        connecting = attempt;
        const release = () => {
            if (connecting === attempt) {
                connecting = undefined;
            }
        };
        // Clear the slot on success and on failure so a later call can retry.
        attempt.then(release, release);
        return attempt;
    }

    /**
     * Make sure the current client is connected.
     *
     * @throws {Error} When no session has been started, or when connecting fails.
     */
    async function ensureConnected() {
        if (!client)
            throw new Error('pi-computer-use: session not started');
        if (connected)
            return;
        await (connecting ?? startConnect());
    }

    /** Notify the UI about a failed connection and build the matching error tool result. */
    function connectionFailure(ctx, connErr) {
        const msg = connErr instanceof Error ? connErr.message : String(connErr);
        ctx.ui.notify(`pi-computer-use: cannot connect to cua-driver — ${msg}`, 'warning');
        return {
            content: [
                {
                    type: 'text',
                    text: `Failed to connect to cua-driver: ${msg}. ${permissionHint()}`,
                },
            ],
            details: undefined,
            isError: true,
        };
    }

    /**
     * Register one pi tool per descriptor, skipping names in `EXCLUDED_TOOLS`.
     * Each registered tool forwards its params to `client.callTool` under the
     * original (unprefixed) name and returns only the text items of the result.
     */
    function registerTools(tools) {
        for (const tool of tools) {
            if (EXCLUDED_TOOLS.has(tool.name))
                continue;
            const prefixedName = `${TOOL_PREFIX}${tool.name}`;
            const originalName = tool.name;
            pi.registerTool({
                name: prefixedName,
                label: prefixedName,
                description: tool.description ?? '',
                parameters: Type.Unsafe(tool.inputSchema),
                async execute(_toolCallId, params, _signal, _onUpdate, ctx) {
                    try {
                        await ensureConnected();
                    }
                    catch (connErr) {
                        return connectionFailure(ctx, connErr);
                    }
                    const result = await client.callTool(originalName, params);
                    if (result.isError) {
                        const errorText = result.content?.map((c) => c.text ?? '').join('') ?? '';
                        const friendlyError = formatToolError(originalName, errorText, params);
                        if (friendlyError) {
                            return {
                                content: [{ type: 'text', text: friendlyError }],
                                details: undefined,
                                isError: true,
                            };
                        }
                    }
                    // Only non-empty text items are forwarded; other item types are dropped.
                    const content = [];
                    if (result.content) {
                        for (const item of result.content) {
                            if (item.type === 'text' && item.text) {
                                content.push({ type: 'text', text: item.text });
                            }
                        }
                    }
                    if (content.length === 0) {
                        content.push({ type: 'text', text: 'Action executed.' });
                    }
                    return result.isError
                        ? { content, details: undefined, isError: true }
                        : { content, details: undefined };
                },
            });
        }
    }

    /**
     * Register the `analyze_screenshot` tool. Does nothing unless
     * `config.visionModel` is set. The tool captures a window via the driver's
     * `screenshot` tool and passes the image to the vision caller.
     */
    function registerVisionTool() {
        if (!config?.visionModel)
            return;
        const visionConfig = config.visionModel;
        pi.registerTool({
            name: `${TOOL_PREFIX}analyze_screenshot`,
            label: `${TOOL_PREFIX}analyze_screenshot`,
            // The description names the prefixed list tool because that is the name
            // it is registered under, and it no longer points at `check_permissions`,
            // which is in EXCLUDED_TOOLS and therefore never registered.
            description: `Capture a screenshot using ScreenCaptureKit and analyze it visually using a vision model. Returns analysis for a single window in the requested format (default png).\n\n\`window_id\` is required. Get window ids from \`${TOOL_PREFIX}list_windows\`.\n\nRequires the Screen Recording TCC grant; if it is missing, the error result explains how to enable it.`,
            parameters: Type.Object({
                window_id: Type.Number({
                    description: 'Required CGWindowID / kCGWindowNumber to capture.',
                }),
                instruction: Type.Optional(Type.String({
                    description: 'What to identify or analyze visually (e.g., "Find the coordinates of the blue submit button").',
                })),
                format: Type.Optional(Type.Union([Type.Literal('png'), Type.Literal('jpeg')], {
                    description: 'Image format. Default: png.',
                })),
                quality: Type.Optional(Type.Number({
                    description: 'JPEG quality 1-95; ignored for png.',
                    minimum: 1,
                    maximum: 95,
                })),
            }),
            async execute(_toolCallId, params, _signal, _onUpdate, ctx) {
                try {
                    await ensureConnected();
                }
                catch (connErr) {
                    return connectionFailure(ctx, connErr);
                }
                const screenshotArgs = { window_id: params.window_id };
                if (params.format)
                    screenshotArgs.format = params.format;
                if (params.quality)
                    screenshotArgs.quality = params.quality;
                const screenshotResult = await client.callTool('screenshot', screenshotArgs);
                const imageContent = screenshotResult.content?.find((c) => c.type === 'image' && c.data);
                // NOTE(review): diagnostic dump to stderr on every call — confirm it is
                // intended for released builds.
                console.error('[pi-computer-use analyze_screenshot] screenshot result', JSON.stringify({
                    window_id: params.window_id,
                    isError: screenshotResult.isError,
                    contentTypes: screenshotResult.content?.map((c) => c.type),
                    imageDataLength: imageContent?.data?.length,
                    imageMimeType: imageContent?.mimeType,
                }, null, 2));
                if (!imageContent?.data) {
                    // No image came back: surface the driver's text, or a generic message.
                    const errorText = screenshotResult.content
                        ?.filter((c) => c.type === 'text' && c.text)
                        .map((c) => c.text)
                        .join('\n') || 'Failed to capture screenshot.';
                    const formatted = formatToolError('screenshot', errorText, params);
                    return {
                        content: [{ type: 'text', text: formatted ?? errorText }],
                        details: undefined,
                        isError: true,
                    };
                }
                const callVision = createPiVisionCaller(visionConfig, ctx);
                const instruction = params.instruction ??
                    'Describe the full screen: identify all visible windows, UI elements, buttons, text fields, and their positions.';
                const analysis = await callVision(instruction, imageContent.data, imageContent.mimeType ?? 'image/png');
                console.error('[pi-computer-use analyze_screenshot] vision analysis', JSON.stringify({
                    analysisLength: analysis.length,
                    analysisPreview: analysis.slice(0, 200),
                }, null, 2));
                return {
                    content: [{ type: 'text', text: analysis }],
                    details: undefined,
                };
            },
        });
    }

    pi.on('session_start', async (_event, ctx) => {
        config = resolveConfig(loadConfigFromFile({ cwd: ctx.cwd }));
        client = new CuaDriverClient(config);
        connected = false;
        // Drop any attempt that belongs to a previous client.
        connecting = undefined;
        let upstreamTools;
        try {
            await ensureConnected();
            upstreamTools = await client.listAllTools();
        }
        catch (err) {
            const msg = err instanceof Error ? err.message : String(err);
            ctx.ui.notify(`pi-computer-use: cua-driver failed to start — ${msg}. Tools registered but may fail until permissions are granted.`, 'warning');
        }
        // Tools are registered even without a connection; each call retries it.
        registerTools(upstreamTools ?? FALLBACK_TOOLS);
        registerVisionTool();
        if (connected) {
            try {
                const permResult = await client.callTool('check_permissions', {});
                const structured = permResult.structuredContent;
                if (structured) {
                    if (!structured.accessibility) {
                        ctx.ui.notify(`pi-computer-use: ${accessibilityHint()}`, 'warning');
                    }
                    if (!structured.screen_recording) {
                        ctx.ui.notify(`pi-computer-use: ${screenRecordingHint()}`, 'warning');
                    }
                }
            }
            catch {
                // permission check is best-effort
            }
        }
    });

    pi.on('session_shutdown', async () => {
        if (connected && client) {
            try {
                await client.close();
            }
            finally {
                // Reset even when close() throws, so a later shutdown does not
                // try to close the same client again.
                connected = false;
            }
        }
    });
}
|
|
579
|
+
/**
 * Translate a raw driver error into an actionable, human-readable message.
 *
 * Checks run in this order:
 *  1. `ax_not_granted` in the text: the accessibility hint.
 *  2. `sc_not_granted` in the text: the screen-recording hint.
 *  3. For the `screenshot` tool, or any text containing `screencapture failed`:
 *     a screenshot-specific message, worded for a window when
 *     `params.window_id` is present and for the main display otherwise.
 *
 * @param {string} toolName - Unprefixed name of the tool that failed.
 * @param {string} errorText - Error text reported by the driver.
 * @param {object} [params] - Arguments the tool was called with; only
 *   `window_id` is read. May be `undefined` or `null`.
 * @returns {string|undefined} The friendly message, or `undefined` when the
 *   error is not recognized and the caller should show the original text.
 */
function formatToolError(toolName, errorText, params) {
    if (errorText.includes('ax_not_granted')) {
        return accessibilityHint();
    }
    if (errorText.includes('sc_not_granted')) {
        return screenRecordingHint();
    }
    const captureFailed = errorText.includes('screencapture failed');
    if (toolName !== 'screenshot' && !captureFailed) {
        return undefined;
    }
    // Optional chaining: a call made without an arguments object must not
    // turn an error report into a TypeError.
    const windowId = params?.window_id;
    if (windowId === undefined) {
        return captureFailed
            ? `Screenshot failed for the main display. ${screenRecordingHint()}`
            : undefined;
    }
    if (captureFailed) {
        return [
            `Screenshot failed for window ${windowId}. Possible causes:`,
            `1. ${screenRecordingHint()}`,
            '2. The window_id is stale — the window may have been closed or recreated (e.g. after navigation in Electron apps). Re-fetch window list to get current IDs.',
            '3. The window is minimized or not yet rendered.',
            `Try capturing without window_id (full screen) as a fallback, or verify the window still exists.`,
        ].join('\n');
    }
    if (errorText.includes('empty output')) {
        return `Screenshot captured an empty image for window ${windowId}. The window may be minimized, fully transparent, or off-screen. Try restoring the window first.`;
    }
    return undefined;
}
|
|
103
610
|
//# sourceMappingURL=index.js.map
|