@midscene/computer 1.2.1-beta-20260112081017.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +243 -0
- package/dist/es/index.mjs +438 -0
- package/dist/es/mcp-server.mjs +508 -0
- package/dist/lib/index.js +498 -0
- package/dist/lib/mcp-server.js +559 -0
- package/dist/types/index.d.ts +70 -0
- package/dist/types/mcp-server.d.ts +88 -0
- package/package.json +48 -0
- package/rslib.config.ts +26 -0
- package/src/agent.ts +17 -0
- package/src/device.ts +554 -0
- package/src/index.ts +8 -0
- package/src/mcp-server.ts +65 -0
- package/src/mcp-tools.ts +96 -0
- package/src/types/libnut.d.ts +36 -0
- package/src/utils.ts +51 -0
- package/tests/ai/ai-auto-todo.test.ts +85 -0
- package/tests/ai/ai-shop.test.ts +56 -0
- package/tests/ai/basic.test.ts +46 -0
- package/tests/ai/keyboard.test.ts +66 -0
- package/tests/ai/multi-display.test.ts +76 -0
- package/tests/ai/test-utils.ts +31 -0
- package/tests/ai/web-browser.test.ts +63 -0
- package/tests/unit-test/agent.test.ts +34 -0
- package/tests/unit-test/device.test.ts +53 -0
- package/tsconfig.json +18 -0
- package/tsconfig.tsbuildinfo +1 -0
- package/vitest.config.ts +47 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024-present Bytedance, Inc. and its affiliates.
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
# @midscene/computer
|
|
2
|
+
|
|
3
|
+
Midscene.js Computer Desktop Automation - AI-powered desktop automation for Windows, macOS, and Linux.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- 🖥️ **Desktop Automation**: Control mouse, keyboard, and screen
|
|
8
|
+
- 📸 **Screenshot Capture**: Take screenshots of any display
|
|
9
|
+
- 🖱️ **Mouse Operations**: Click, double-click, right-click, hover, drag & drop
|
|
10
|
+
- ⌨️ **Keyboard Input**: Type text, press keys, shortcuts
|
|
11
|
+
- 📜 **Scroll Operations**: Scroll in any direction
|
|
12
|
+
- 🖼️ **Multi-Display Support**: Work with multiple monitors
|
|
13
|
+
- 🤖 **AI-Powered**: Use natural language to control your desktop
|
|
14
|
+
- 🔌 **MCP Server**: Expose capabilities via Model Context Protocol
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
npm install @midscene/computer
|
|
20
|
+
# or
|
|
21
|
+
pnpm add @midscene/computer
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
### Platform Requirements
|
|
25
|
+
|
|
26
|
+
This package uses native modules for desktop control:
|
|
27
|
+
- `screenshot-desktop`: For capturing screenshots
|
|
28
|
+
- `@computer-use/libnut`: For mouse and keyboard control
|
|
29
|
+
|
|
30
|
+
These modules require compilation on installation. Make sure you have the necessary build tools:
|
|
31
|
+
|
|
32
|
+
**macOS**: Install Xcode Command Line Tools
|
|
33
|
+
```bash
|
|
34
|
+
xcode-select --install
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
**Linux**: Install build essentials and ImageMagick
|
|
38
|
+
```bash
|
|
39
|
+
# Ubuntu/Debian
|
|
40
|
+
sudo apt-get install build-essential libx11-dev libxtst-dev libpng-dev imagemagick
|
|
41
|
+
|
|
42
|
+
# Fedora/RHEL
|
|
43
|
+
sudo dnf install gcc-c++ libX11-devel libXtst-devel libpng-devel ImageMagick
|
|
44
|
+
|
|
45
|
+
# Arch
|
|
46
|
+
sudo pacman -S base-devel libx11 libxtst libpng imagemagick
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
**Note**: ImageMagick is required for screenshot capture on Linux.
|
|
50
|
+
|
|
51
|
+
**Windows**: Install Windows Build Tools
|
|
52
|
+
```bash
|
|
53
|
+
npm install --global windows-build-tools
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Quick Start
|
|
57
|
+
|
|
58
|
+
### Basic Usage
|
|
59
|
+
|
|
60
|
+
```typescript
|
|
61
|
+
import { agentFromComputer } from '@midscene/computer';
|
|
62
|
+
|
|
63
|
+
// Create an agent
|
|
64
|
+
const agent = await agentFromComputer({
|
|
65
|
+
aiActionContext: 'You are controlling a desktop computer.',
|
|
66
|
+
});
|
|
67
|
+
|
|
68
|
+
// Use AI to perform actions
|
|
69
|
+
await agent.aiAct('move mouse to center of screen');
|
|
70
|
+
await agent.aiAct('click on the desktop');
|
|
71
|
+
await agent.aiAct('type "Hello World"');
|
|
72
|
+
|
|
73
|
+
// Query information
|
|
74
|
+
const screenInfo = await agent.aiQuery(
|
|
75
|
+
'{width: number, height: number}, get screen resolution',
|
|
76
|
+
);
|
|
77
|
+
|
|
78
|
+
// Assert conditions
|
|
79
|
+
await agent.aiAssert('There is a desktop visible');
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Multi-Display Support
|
|
83
|
+
|
|
84
|
+
```typescript
|
|
85
|
+
import { ComputerDevice, agentFromComputer } from '@midscene/computer';
|
|
86
|
+
|
|
87
|
+
// List all displays
|
|
88
|
+
const displays = await ComputerDevice.listDisplays();
|
|
89
|
+
console.log('Available displays:', displays);
|
|
90
|
+
|
|
91
|
+
// Connect to a specific display
|
|
92
|
+
const agent = await agentFromComputer({
|
|
93
|
+
displayId: displays[0].id,
|
|
94
|
+
});
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Environment Check
|
|
98
|
+
|
|
99
|
+
```typescript
|
|
100
|
+
import { checkComputerEnvironment } from '@midscene/computer';
|
|
101
|
+
|
|
102
|
+
const env = await checkComputerEnvironment();
|
|
103
|
+
console.log('Platform:', env.platform);
|
|
104
|
+
console.log('Available:', env.available);
|
|
105
|
+
console.log('Displays:', env.displays);
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Available Actions
|
|
109
|
+
|
|
110
|
+
The ComputerDevice supports the following actions:
|
|
111
|
+
|
|
112
|
+
- **Tap**: Single click at element center
|
|
113
|
+
- **DoubleClick**: Double click at element center
|
|
114
|
+
- **RightClick**: Right click at element center
|
|
115
|
+
- **Hover**: Move mouse to element center
|
|
116
|
+
- **Input**: Type text with different modes (replace/clear/append)
|
|
117
|
+
- **Scroll**: Scroll in any direction (up/down/left/right)
|
|
118
|
+
- **KeyboardPress**: Press keyboard keys with modifiers
|
|
119
|
+
- **DragAndDrop**: Drag from one element to another
|
|
120
|
+
- **ClearInput**: Clear input field content
|
|
121
|
+
- **ListDisplays**: Get all available displays
|
|
122
|
+
|
|
123
|
+
## Platform-Specific Shortcuts
|
|
124
|
+
|
|
125
|
+
### macOS
|
|
126
|
+
- Modifier key: `Cmd` (Command)
|
|
127
|
+
- Open search: `Cmd+Space`
|
|
128
|
+
- Select all: `Cmd+A`
|
|
129
|
+
- Copy: `Cmd+C`
|
|
130
|
+
- Paste: `Cmd+V`
|
|
131
|
+
|
|
132
|
+
### Windows/Linux
|
|
133
|
+
- Modifier key: `Ctrl` (Control)
|
|
134
|
+
- Open search: `Windows key` or `Super key`
|
|
135
|
+
- Select all: `Ctrl+A`
|
|
136
|
+
- Copy: `Ctrl+C`
|
|
137
|
+
- Paste: `Ctrl+V`
|
|
138
|
+
|
|
139
|
+
## Testing
|
|
140
|
+
|
|
141
|
+
### Run Unit Tests
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
pnpm test
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### Run AI Tests
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
# Set AI_TEST_TYPE environment variable
|
|
151
|
+
AI_TEST_TYPE=computer pnpm test:ai
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
Available AI tests:
|
|
155
|
+
- `basic.test.ts`: Basic desktop interactions
|
|
156
|
+
- `multi-display.test.ts`: Multi-display support
|
|
157
|
+
- `web-browser.test.ts`: Browser automation
|
|
158
|
+
- `keyboard.test.ts`: Keyboard input operations
|
|
159
|
+
|
|
160
|
+
## MCP Server
|
|
161
|
+
|
|
162
|
+
Start the MCP server for AI assistant integration:
|
|
163
|
+
|
|
164
|
+
```typescript
|
|
165
|
+
import { mcpServerForAgent } from '@midscene/computer/mcp-server';
|
|
166
|
+
import { agentFromComputer } from '@midscene/computer';
|
|
167
|
+
|
|
168
|
+
const agent = await agentFromComputer();
|
|
169
|
+
const { server } = mcpServerForAgent(agent);
|
|
170
|
+
await server.launch();
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
Available MCP tools:
|
|
174
|
+
- `computer_connect`: Connect to desktop display
|
|
175
|
+
- `computer_list_displays`: List all available displays
|
|
176
|
+
- Plus all standard Midscene tools (aiAct, aiQuery, aiAssert, etc.)
|
|
177
|
+
|
|
178
|
+
## Architecture
|
|
179
|
+
|
|
180
|
+
This package follows the same architecture pattern as `@midscene/android` and `@midscene/ios`:
|
|
181
|
+
|
|
182
|
+
```
|
|
183
|
+
packages/computer/
|
|
184
|
+
├── src/
|
|
185
|
+
│ ├── device.ts # ComputerDevice - core device implementation
|
|
186
|
+
│ ├── agent.ts # ComputerAgent - agent wrapper
|
|
187
|
+
│ ├── utils.ts # Utility functions
|
|
188
|
+
│ ├── mcp-server.ts # MCP server
|
|
189
|
+
│ └── mcp-tools.ts # MCP tools definitions
|
|
190
|
+
├── tests/
|
|
191
|
+
│ ├── unit-test/ # Unit tests (no native dependencies)
|
|
192
|
+
│ └── ai/ # AI-powered integration tests
|
|
193
|
+
└── README.md
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
## API Reference
|
|
197
|
+
|
|
198
|
+
### ComputerDevice
|
|
199
|
+
|
|
200
|
+
```typescript
|
|
201
|
+
class ComputerDevice implements AbstractInterface {
|
|
202
|
+
constructor(options?: ComputerDeviceOpt);
|
|
203
|
+
|
|
204
|
+
static listDisplays(): Promise<DisplayInfo[]>;
|
|
205
|
+
|
|
206
|
+
async connect(): Promise<void>;
|
|
207
|
+
async screenshotBase64(): Promise<string>;
|
|
208
|
+
async size(): Promise<Size>;
|
|
209
|
+
actionSpace(): DeviceAction<any>[];
|
|
210
|
+
async destroy(): Promise<void>;
|
|
211
|
+
}
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
### ComputerAgent
|
|
215
|
+
|
|
216
|
+
```typescript
|
|
217
|
+
class ComputerAgent extends PageAgent<ComputerDevice> {
|
|
218
|
+
// Inherits all PageAgent methods
|
|
219
|
+
async aiAct(action: string): Promise<void>;
|
|
220
|
+
async aiQuery(query: string): Promise<any>;
|
|
221
|
+
async aiAssert(assertion: string): Promise<void>;
|
|
222
|
+
async aiWaitFor(condition: string): Promise<void>;
|
|
223
|
+
}
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
### Factory Functions
|
|
227
|
+
|
|
228
|
+
```typescript
|
|
229
|
+
async function agentFromComputer(
|
|
230
|
+
opts?: ComputerAgentOpt
|
|
231
|
+
): Promise<ComputerAgent>;
|
|
232
|
+
|
|
233
|
+
async function checkComputerEnvironment(): Promise<EnvironmentCheck>;
|
|
234
|
+
async function getConnectedDisplays(): Promise<DisplayInfo[]>;
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
## License
|
|
238
|
+
|
|
239
|
+
MIT
|
|
240
|
+
|
|
241
|
+
## Contributing
|
|
242
|
+
|
|
243
|
+
See the main [Midscene.js repository](https://github.com/web-infra-dev/midscene) for contributing guidelines.
|
|
@@ -0,0 +1,438 @@
|
|
|
1
|
+
import node_assert from "node:assert";
|
|
2
|
+
import { getMidsceneLocationSchema, z } from "@midscene/core";
|
|
3
|
+
import { defineAction, defineActionClearInput, defineActionDoubleClick, defineActionDragAndDrop, defineActionHover, defineActionKeyboardPress, defineActionRightClick, defineActionScroll, defineActionTap } from "@midscene/core/device";
|
|
4
|
+
import { sleep } from "@midscene/core/utils";
|
|
5
|
+
import { createImgBase64ByFormat } from "@midscene/shared/img";
|
|
6
|
+
import { getDebug } from "@midscene/shared/logger";
|
|
7
|
+
import screenshot_desktop from "screenshot-desktop";
|
|
8
|
+
import { Agent } from "@midscene/core/agent";
|
|
9
|
+
import { overrideAIConfig } from "@midscene/shared/env";
|
|
10
|
+
// Transpiler helper for class-field initialization (SWC/Babel style).
// For a key the object already owns, go through Object.defineProperty so the
// property descriptor is normalized; for a brand-new key, a plain assignment
// is sufficient. Always returns the object for chaining.
function _define_property(obj, key, value) {
    if (!(key in obj)) {
        obj[key] = value;
        return obj;
    }
    Object.defineProperty(obj, key, {
        value: value,
        enumerable: true,
        configurable: true,
        writable: true
    });
    return obj;
}
|
|
20
|
+
// Pointer-motion smoothing: number of interpolation steps used by
// smoothMoveMouse() before a tap / hover.
const SMOOTH_MOVE_STEPS_TAP = 8;
const SMOOTH_MOVE_STEPS_HOVER = 10;
// Milliseconds slept between interpolation steps (passed to sleep()).
const SMOOTH_MOVE_DELAY_TAP = 8;
const SMOOTH_MOVE_DELAY_HOVER = 10;
// Ms to wait after a hover move so hover effects (tooltips, menus) can appear.
const HOVER_EFFECT_WAIT = 300;
// Ms the left button is held down during a tap (press -> sleep -> release),
// so apps that ignore instantaneous clicks still register it.
const CLICK_HOLD_DURATION = 50;
// Ms to wait after clicking an input field before typing (focus settle time).
const INPUT_FOCUS_DELAY = 300;
// Ms to wait after select-all + backspace when clearing a field.
const INPUT_CLEAR_DELAY = 150;
// Scroll-to-edge: number of wheel events fired, and ms between them.
const SCROLL_REPEAT_COUNT = 10;
const SCROLL_STEP_DELAY = 100;
// Ms settle time after a single scroll action completes.
const SCROLL_COMPLETE_DELAY = 500;
|
|
31
|
+
// Shared libnut handle; null until the first successful getLibnut() call.
let device_libnut = null;
// Cached load failure so repeated calls fail fast with the same diagnostics.
let libnutLoadError = null;
/**
 * Lazily load the `@computer-use/libnut` native binding.
 *
 * The module is imported at most once; the loaded handle is cached in the
 * module-level `device_libnut`. A load failure is also cached so every
 * subsequent call rethrows the same wrapped, actionable error.
 *
 * Fix vs. previous version: the raw import error used to be cached, so the
 * first call threw a helpful "Failed to load…" message while retries threw a
 * bare module-resolution error. Now the wrapped Error (with the original
 * attached as `cause`) is cached and thrown consistently.
 *
 * @returns {Promise<object>} the libnut API object
 * @throws {Error} when the native module cannot be loaded or is malformed
 */
async function getLibnut() {
    if (device_libnut) return device_libnut;
    if (libnutLoadError) throw libnutLoadError;
    try {
        const libnutModule = await import("@computer-use/libnut/dist/import_libnut");
        device_libnut = libnutModule.libnut;
        if (!device_libnut) throw new Error('libnut module loaded but libnut object is undefined');
        return device_libnut;
    } catch (error) {
        // Wrap once, keep the original as `cause`, and cache the wrapper so
        // retries surface identical diagnostics.
        libnutLoadError = new Error(
            `Failed to load @computer-use/libnut. Make sure it is properly installed and compiled for your platform. Error: ${error}`,
            { cause: error },
        );
        throw libnutLoadError;
    }
}
|
|
46
|
+
const debugDevice = getDebug('computer:device');
/**
 * Glide the cursor from its current position to (targetX, targetY) in
 * `steps` equal linear increments, sleeping `stepDelay` ms between moves.
 * Ends exactly on the target (the final increment lands at step/steps === 1).
 * Requires libnut to have been initialized via connect()/getLibnut().
 */
async function smoothMoveMouse(targetX, targetY, steps, stepDelay) {
    node_assert(device_libnut, 'libnut not initialized');
    const start = device_libnut.getMousePos();
    const deltaX = targetX - start.x;
    const deltaY = targetY - start.y;
    for (let step = 1; step <= steps; step++) {
        const nextX = Math.round(start.x + deltaX * step / steps);
        const nextY = Math.round(start.y + deltaY * step / steps);
        device_libnut.moveMouse(nextX, nextY);
        await sleep(stepDelay);
    }
}
|
|
57
|
+
// Aliases for non-modifier keys: maps common/web-style key names (lowercased)
// to the names libnut's keyTap understands.
const KEY_NAME_MAP = {
    // OS / editing keys
    windows: 'win',
    win: 'win',
    ctrl: 'control',
    esc: 'escape',
    del: 'delete',
    ins: 'insert',
    pgup: 'pageup',
    pgdn: 'pagedown',
    // Arrow keys (DOM "ArrowUp" style -> libnut "up" style)
    arrowup: 'up',
    arrowdown: 'down',
    arrowleft: 'left',
    arrowright: 'right',
    // Volume keys
    volumedown: 'audio_vol_down',
    volumeup: 'audio_vol_up',
    mediavolumedown: 'audio_vol_down',
    mediavolumeup: 'audio_vol_up',
    mute: 'audio_mute',
    mediamute: 'audio_mute',
    // Media-transport keys
    mediaplay: 'audio_play',
    mediapause: 'audio_pause',
    mediaplaypause: 'audio_play',
    mediastop: 'audio_stop',
    medianexttrack: 'audio_next',
    mediaprevioustrack: 'audio_prev',
    medianext: 'audio_next',
    mediaprev: 'audio_prev'
};
// Canonical names for keys that may appear as the *primary* (last) key of a
// shortcut; checked before KEY_NAME_MAP in normalizePrimaryKey.
const PRIMARY_KEY_MAP = {
    command: 'cmd',
    cmd: 'cmd',
    meta: 'meta',
    control: 'control',
    ctrl: 'control',
    shift: 'shift',
    alt: 'alt',
    option: 'alt'
};
/**
 * Lowercase a key name and translate it through KEY_NAME_MAP;
 * unknown keys pass through lowercased (e.g. "F5" -> "f5").
 */
function normalizeKeyName(key) {
    const lowered = key.toLowerCase();
    return KEY_NAME_MAP[lowered] || lowered;
}
/**
 * Normalize the primary key of a shortcut: modifier-style names win
 * (PRIMARY_KEY_MAP), then the general alias table, then pass-through.
 */
function normalizePrimaryKey(key) {
    const lowered = key.toLowerCase();
    return PRIMARY_KEY_MAP[lowered] ?? KEY_NAME_MAP[lowered] ?? lowered;
}
|
|
104
|
+
// Desktop device implementation. Drives the local machine through two native
// modules: `screenshot-desktop` (capture + display enumeration) and
// `@computer-use/libnut` (mouse/keyboard synthesis, lazily loaded via
// getLibnut into the shared module-level `device_libnut` handle).
// NOTE(review): presumably implements the AbstractInterface contract from
// @midscene/core — confirm against the package's type declarations.
class ComputerDevice {
    // Human-readable summary; populated by connect(), fallback before that.
    describe() {
        return this.description || 'Computer Device';
    }
    // Enumerate displays via screenshot-desktop, normalized to
    // { id: string, name, primary }. Best-effort: returns [] on failure
    // rather than throwing, so callers can treat "unknown" gracefully.
    static async listDisplays() {
        try {
            const displays = await screenshot_desktop.listDisplays();
            return displays.map((d)=>({
                id: String(d.id),
                name: d.name || `Display ${d.id}`,
                primary: d.primary || false
            }));
        } catch (error) {
            debugDevice(`Failed to list displays: ${error}`);
            return [];
        }
    }
    // Load libnut, probe the screen, and build the description used by
    // describe(). Throws a wrapped Error when the native stack is unusable.
    async connect() {
        debugDevice('Connecting to computer device');
        try {
            // Assigns the module-level libnut handle shared by all action
            // callbacks (and by any other ComputerDevice instance).
            device_libnut = await getLibnut();
            const size = await this.size();
            const displays = await ComputerDevice.listDisplays();
            // Template-literal lines are deliberately left-aligned: any
            // indentation would leak into the description text.
            this.description = `
Type: Computer
Platform: ${process.platform}
Display: ${this.displayId || 'Primary'}
Screen Size: ${size.width}x${size.height}
Available Displays: ${displays.length > 0 ? displays.map((d)=>d.name).join(', ') : 'Unknown'}
`;
            debugDevice('Computer device connected', this.description);
        } catch (error) {
            debugDevice(`Failed to connect: ${error}`);
            throw new Error(`Unable to connect to computer device: ${error}`);
        }
    }
    // Capture the configured display (or the default) and return it as a
    // base64 PNG data string.
    async screenshotBase64() {
        debugDevice('Taking screenshot', {
            displayId: this.displayId
        });
        try {
            const options = {
                format: 'png'
            };
            // On macOS the `screen` option is passed as a number; elsewhere
            // the raw display id is forwarded as-is. TODO(review): confirm
            // against screenshot-desktop's docs for the pinned version.
            if (void 0 !== this.displayId) if ('darwin' === process.platform) {
                const screenIndex = Number(this.displayId);
                if (!Number.isNaN(screenIndex)) options.screen = screenIndex;
            } else options.screen = this.displayId;
            debugDevice('Screenshot options', options);
            const buffer = await screenshot_desktop(options);
            return createImgBase64ByFormat('png', buffer.toString('base64'));
        } catch (error) {
            debugDevice(`Screenshot failed: ${error}`);
            throw new Error(`Failed to take screenshot: ${error}`);
        }
    }
    // Screen size as reported by libnut. dpr is hard-coded to 1;
    // NOTE(review): on HiDPI/Retina displays the screenshot's pixel size may
    // differ from these logical coordinates — verify on scaled displays.
    async size() {
        node_assert(device_libnut, 'libnut not initialized');
        try {
            const screenSize = device_libnut.getScreenSize();
            return {
                width: screenSize.width,
                height: screenSize.height,
                dpr: 1
            };
        } catch (error) {
            debugDevice(`Failed to get screen size: ${error}`);
            throw new Error(`Failed to get screen size: ${error}`);
        }
    }
    // Declare every action the planner may invoke: the built-in pointer /
    // keyboard actions, platform-specific ones, and user-supplied extras.
    actionSpace() {
        const defaultActions = [
            // Tap: glide to the element, then press-hold-release so apps that
            // ignore instantaneous clicks still register it.
            defineActionTap(async (param)=>{
                node_assert(device_libnut, 'libnut not initialized');
                const element = param.locate;
                node_assert(element, 'Element not found, cannot tap');
                const [x, y] = element.center;
                const targetX = Math.round(x);
                const targetY = Math.round(y);
                await smoothMoveMouse(targetX, targetY, SMOOTH_MOVE_STEPS_TAP, SMOOTH_MOVE_DELAY_TAP);
                device_libnut.mouseToggle('down', 'left');
                await sleep(CLICK_HOLD_DURATION);
                device_libnut.mouseToggle('up', 'left');
            }),
            // Double click: direct move (no smoothing) + libnut double-click.
            defineActionDoubleClick(async (param)=>{
                node_assert(device_libnut, 'libnut not initialized');
                const element = param.locate;
                node_assert(element, 'Element not found, cannot double click');
                const [x, y] = element.center;
                device_libnut.moveMouse(Math.round(x), Math.round(y));
                // second arg `true` requests a double click
                device_libnut.mouseClick('left', true);
            }),
            // Right click at the element center.
            defineActionRightClick(async (param)=>{
                node_assert(device_libnut, 'libnut not initialized');
                const element = param.locate;
                node_assert(element, 'Element not found, cannot right click');
                const [x, y] = element.center;
                device_libnut.moveMouse(Math.round(x), Math.round(y));
                device_libnut.mouseClick('right');
            }),
            // Hover: smooth move, then linger so hover effects can appear.
            defineActionHover(async (param)=>{
                node_assert(device_libnut, 'libnut not initialized');
                const element = param.locate;
                node_assert(element, 'Element not found, cannot hover');
                const [x, y] = element.center;
                const targetX = Math.round(x);
                const targetY = Math.round(y);
                await smoothMoveMouse(targetX, targetY, SMOOTH_MOVE_STEPS_HOVER, SMOOTH_MOVE_DELAY_HOVER);
                await sleep(HOVER_EFFECT_WAIT);
            }),
            // Input: optionally focus + clear a field, then type the text.
            defineAction({
                name: 'Input',
                description: 'Input text into the input field',
                interfaceAlias: 'aiInput',
                paramSchema: z.object({
                    value: z.string().describe('The text to input'),
                    mode: z["enum"]([
                        'replace',
                        'clear',
                        'append'
                    ]).default('replace').optional().describe('Input mode: replace, clear, or append'),
                    locate: getMidsceneLocationSchema().describe('The input field to be filled').optional()
                }),
                call: async (param)=>{
                    node_assert(device_libnut, 'libnut not initialized');
                    const element = param.locate;
                    // For 'replace'/'clear' with a located field: click to
                    // focus, select-all, delete. 'append' skips the clearing.
                    if (element && 'append' !== param.mode) {
                        const [x, y] = element.center;
                        device_libnut.moveMouse(Math.round(x), Math.round(y));
                        device_libnut.mouseClick('left');
                        await sleep(INPUT_FOCUS_DELAY);
                        // Cmd+A on macOS, Ctrl+A elsewhere
                        const modifier = 'darwin' === process.platform ? 'command' : 'control';
                        device_libnut.keyTap('a', [
                            modifier
                        ]);
                        await sleep(50);
                        device_libnut.keyTap('backspace');
                        await sleep(INPUT_CLEAR_DELAY);
                    }
                    if ('clear' === param.mode) return;
                    if (!param.value) return;
                    device_libnut.typeString(param.value);
                }
            }),
            // Scroll: either repeat wheel events toward an edge, or emit one
            // wheel burst sized from the requested distance.
            defineActionScroll(async (param)=>{
                node_assert(device_libnut, 'libnut not initialized');
                // Position the cursor over the target first so the wheel
                // events land on the right widget.
                if (param.locate) {
                    const element = param.locate;
                    const [x, y] = element.center;
                    device_libnut.moveMouse(Math.round(x), Math.round(y));
                }
                const scrollType = param?.scrollType;
                // [dx, dy] per wheel event; the map's names imply positive dy
                // scrolls toward the top in libnut's convention.
                const scrollToEdgeActions = {
                    scrollToTop: [
                        0,
                        10
                    ],
                    scrollToBottom: [
                        0,
                        -10
                    ],
                    scrollToLeft: [
                        -10,
                        0
                    ],
                    scrollToRight: [
                        10,
                        0
                    ]
                };
                const edgeAction = scrollToEdgeActions[scrollType || ''];
                if (edgeAction) {
                    const [dx, dy] = edgeAction;
                    for(let i = 0; i < SCROLL_REPEAT_COUNT; i++){
                        device_libnut.scrollMouse(dx, dy);
                        await sleep(SCROLL_STEP_DELAY);
                    }
                    return;
                }
                if ('singleAction' === scrollType || !scrollType) {
                    // ~100 px of requested distance per wheel tick
                    const distance = param?.distance || 500;
                    const ticks = Math.ceil(distance / 100);
                    const direction = param?.direction || 'down';
                    const directionMap = {
                        up: [
                            0,
                            ticks
                        ],
                        down: [
                            0,
                            -ticks
                        ],
                        left: [
                            -ticks,
                            0
                        ],
                        right: [
                            ticks,
                            0
                        ]
                    };
                    // unknown direction falls back to scrolling down
                    const [dx, dy] = directionMap[direction] || [
                        0,
                        -ticks
                    ];
                    device_libnut.scrollMouse(dx, dy);
                    await sleep(SCROLL_COMPLETE_DELAY);
                    return;
                }
                throw new Error(`Unknown scroll type: ${scrollType}, param: ${JSON.stringify(param)}`);
            }),
            // KeyboardPress: optionally click to focus, then tap "mods+key"
            // (e.g. "Ctrl+Shift+T"); last segment is the primary key.
            defineActionKeyboardPress(async (param)=>{
                node_assert(device_libnut, 'libnut not initialized');
                if (param.locate) {
                    const [x, y] = param.locate.center;
                    device_libnut.moveMouse(Math.round(x), Math.round(y));
                    device_libnut.mouseClick('left');
                    await sleep(50);
                }
                const keys = param.keyName.split('+');
                const modifiers = keys.slice(0, -1).map(normalizeKeyName);
                const key = normalizePrimaryKey(keys[keys.length - 1]);
                debugDevice('KeyboardPress', {
                    original: param.keyName,
                    key,
                    modifiers
                });
                if (modifiers.length > 0) device_libnut.keyTap(key, modifiers);
                else device_libnut.keyTap(key);
            }),
            // DragAndDrop: press at `from`, move to `to`, release; the sleeps
            // give the target app time to register each phase.
            defineActionDragAndDrop(async (param)=>{
                node_assert(device_libnut, 'libnut not initialized');
                const from = param.from;
                const to = param.to;
                node_assert(from, 'missing "from" param for drag and drop');
                node_assert(to, 'missing "to" param for drag and drop');
                const [fromX, fromY] = from.center;
                const [toX, toY] = to.center;
                device_libnut.moveMouse(Math.round(fromX), Math.round(fromY));
                device_libnut.mouseToggle('down', 'left');
                await sleep(100);
                device_libnut.moveMouse(Math.round(toX), Math.round(toY));
                await sleep(100);
                device_libnut.mouseToggle('up', 'left');
            }),
            // ClearInput: focus the field, select-all, delete.
            defineActionClearInput(async (param)=>{
                node_assert(device_libnut, 'libnut not initialized');
                const element = param.locate;
                node_assert(element, 'Element not found, cannot clear input');
                const [x, y] = element.center;
                device_libnut.moveMouse(Math.round(x), Math.round(y));
                device_libnut.mouseClick('left');
                await sleep(100);
                const modifier = 'darwin' === process.platform ? 'command' : 'control';
                device_libnut.keyTap('a', [
                    modifier
                ]);
                device_libnut.keyTap('backspace');
                await sleep(50);
            })
        ];
        const platformActions = Object.values(createPlatformActions());
        const customActions = this.options?.customActions || [];
        return [
            ...defaultActions,
            ...platformActions,
            ...customActions
        ];
    }
    // Idempotent teardown. No native resources to release; the flag only
    // guards against double logging / repeated destroy calls.
    async destroy() {
        if (this.destroyed) return;
        this.destroyed = true;
        debugDevice('Computer device destroyed');
    }
    // A desktop has no URL concept; always the empty string.
    async url() {
        return '';
    }
    // Transpiled class fields (via _define_property). `uri` is declared but
    // never assigned in this file — NOTE(review): possibly set by callers.
    constructor(options){
        _define_property(this, "interfaceType", 'computer');
        _define_property(this, "options", void 0);
        _define_property(this, "displayId", void 0);
        _define_property(this, "description", void 0);
        _define_property(this, "destroyed", false);
        _define_property(this, "uri", void 0);
        this.options = options;
        this.displayId = options?.displayId;
    }
}
|
|
392
|
+
// Actions specific to the desktop platform (beyond the generic pointer and
// keyboard actions). Returned as a name -> action map; actionSpace() flattens
// it with Object.values().
function createPlatformActions() {
    const listDisplaysAction = defineAction({
        name: 'ListDisplays',
        description: 'List all available displays/monitors',
        call: async ()=>{
            const displays = await ComputerDevice.listDisplays();
            return displays;
        }
    });
    return {
        ListDisplays: listDisplaysAction
    };
}
|
|
401
|
+
// Agent specialization bound to ComputerDevice. It adds no behavior of its
// own — everything comes from the base Agent in @midscene/core/agent; the
// subclass exists so the desktop flavor has a distinct exported type.
class ComputerAgent extends Agent {
}
|
|
403
|
+
/**
 * Create a ComputerAgent wired to a freshly-connected ComputerDevice.
 * Connects (loading libnut and probing the screen) before constructing the
 * agent, so a returned agent is always usable. The same `opts` object is
 * forwarded to both the device and the agent.
 *
 * @param {object} [opts] device + agent options (e.g. displayId)
 * @returns {Promise<ComputerAgent>}
 * @throws {Error} when the device cannot connect (native stack unavailable)
 */
async function agentFromComputer(opts) {
    const device = new ComputerDevice(opts ? opts : {});
    await device.connect();
    const agent = new ComputerAgent(device, opts);
    return agent;
}
|
|
408
|
+
/**
 * Probe whether desktop automation can work on this machine. Never throws:
 * any failure (libnut missing/uncompiled, unusable screen size) is reported
 * as { available: false, error, platform, displays: 0 }; success reports
 * { available: true, platform, displays }.
 */
async function checkComputerEnvironment() {
    try {
        const { libnut } = await import("@computer-use/libnut/dist/import_libnut");
        const screenSize = libnut.getScreenSize();
        // A missing or zero-width screen means libnut loaded but cannot
        // actually drive this display.
        if (!screenSize || screenSize.width <= 0) {
            return {
                available: false,
                error: 'libnut cannot get screen size',
                platform: process.platform,
                displays: 0
            };
        }
        const displays = await ComputerDevice.listDisplays();
        return {
            available: true,
            platform: process.platform,
            displays: displays.length
        };
    } catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        return {
            available: false,
            error: message,
            platform: process.platform,
            displays: 0
        };
    }
}
|
|
435
|
+
/**
 * Convenience alias for ComputerDevice.listDisplays().
 * Best-effort like the underlying call: resolves to [] when display
 * enumeration fails.
 */
async function getConnectedDisplays() {
    const displays = await ComputerDevice.listDisplays();
    return displays;
}
|
|
438
|
+
export { ComputerAgent, ComputerDevice, agentFromComputer, checkComputerEnvironment, getConnectedDisplays, overrideAIConfig };
|