@midscene/computer 1.2.1-beta-20260112081017.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/device.ts ADDED
@@ -0,0 +1,554 @@
1
+ import assert from 'node:assert';
2
+ import {
3
+ type DeviceAction,
4
+ type InterfaceType,
5
+ type LocateResultElement,
6
+ type Size,
7
+ getMidsceneLocationSchema,
8
+ z,
9
+ } from '@midscene/core';
10
+ import {
11
+ type AbstractInterface,
12
+ type ActionTapParam,
13
+ defineAction,
14
+ defineActionClearInput,
15
+ defineActionDoubleClick,
16
+ defineActionDragAndDrop,
17
+ defineActionHover,
18
+ defineActionKeyboardPress,
19
+ defineActionRightClick,
20
+ defineActionScroll,
21
+ defineActionTap,
22
+ } from '@midscene/core/device';
23
+ import { sleep } from '@midscene/core/utils';
24
+ import { createImgBase64ByFormat } from '@midscene/shared/img';
25
+ import { getDebug } from '@midscene/shared/logger';
26
+ import screenshot from 'screenshot-desktop';
27
+
28
+ // Type definitions
29
/**
 * Typing for the subset of the libnut bindings that this file calls.
 * Only the members used below are declared.
 */
interface LibNut {
  /** Screen dimensions as reported by libnut. */
  getScreenSize(): { width: number; height: number };
  /** Current pointer position. */
  getMousePos(): { x: number; y: number };
  /** Move the pointer to the given screen coordinates. */
  moveMouse(x: number, y: number): void;
  /** Click a mouse button; `double` is passed as `true` by the double-click action. */
  mouseClick(button?: 'left' | 'right' | 'middle', double?: boolean): void;
  /** Press (`'down'`) or release (`'up'`) a mouse button without the other half of the click. */
  mouseToggle(state: 'up' | 'down', button?: 'left' | 'right' | 'middle'): void;
  /** Scroll by the given horizontal / vertical amounts. */
  scrollMouse(x: number, y: number): void;
  /** Tap a key, optionally together with modifier keys. */
  keyTap(key: string, modifiers?: string[]): void;
  /** Type a string of text. */
  typeString(text: string): void;
}
39
+
40
/** Options object handed to the `screenshot-desktop` default export by this file. */
interface ScreenshotOptions {
  /** Image format requested from the screenshot library. */
  format: 'png' | 'jpg';
  /** Display to capture; left unset to let the library pick its default display. */
  screen?: string | number;
}
44
+
45
/** Shape of one entry returned by `screenshot.listDisplays()` as consumed in this file. */
interface ScreenshotDisplay {
  /** Display identifier; may be numeric or a string depending on the platform. */
  id: string | number;
  /** Human-readable display name, when the library provides one. */
  name?: string;
  /** Whether this is the primary display, when the library reports it. */
  primary?: boolean;
}
50
+
51
// Constants
// NOTE(review): every *_DELAY / *_WAIT / *_DURATION value is passed to `sleep()`;
// presumably milliseconds — confirm against @midscene/core/utils.

/** Number of interpolation steps when gliding the pointer to a tap target. */
const SMOOTH_MOVE_STEPS_TAP = 8;
/** Number of interpolation steps when gliding the pointer to a hover target. */
const SMOOTH_MOVE_STEPS_HOVER = 10;
/** Pause after each interpolation step of a tap movement. */
const SMOOTH_MOVE_DELAY_TAP = 8;
/** Pause after each interpolation step of a hover movement. */
const SMOOTH_MOVE_DELAY_HOVER = 10;
/** Pause after the pointer reaches a hover target. */
const HOVER_EFFECT_WAIT = 300;
/** How long the left button stays pressed during a tap. */
const CLICK_HOLD_DURATION = 50;
/** Pause after clicking an input field, before selecting its content. */
const INPUT_FOCUS_DELAY = 300;
/** Pause after deleting an input field's content, before typing. */
const INPUT_CLEAR_DELAY = 150;
/** Number of scroll calls issued for a "scroll to edge" request. */
const SCROLL_REPEAT_COUNT = 10;
/** Pause between the repeated scroll calls of a "scroll to edge" request. */
const SCROLL_STEP_DELAY = 100;
/** Pause after a single scroll action. */
const SCROLL_COMPLETE_DELAY = 500;
63
+
64
// Lazy load libnut with fallback.
// `libnut` holds the loaded bindings once available; `libnutLoadError` remembers
// a failed attempt so the import is not retried on every call.
let libnut: LibNut | null = null;
let libnutLoadError: Error | null = null;

/**
 * Load the libnut bindings on first use and cache the outcome.
 *
 * @returns The cached libnut object.
 * @throws Error with an installation hint when the module cannot be imported or
 *   does not expose a `libnut` object. The same error instance is rethrown on
 *   every later call, so callers always see the descriptive message.
 */
async function getLibnut(): Promise<LibNut> {
  if (libnut) return libnut;
  if (libnutLoadError) throw libnutLoadError;

  try {
    const libnutModule = await import(
      '@computer-use/libnut/dist/import_libnut'
    );
    const loaded = libnutModule.libnut as LibNut | undefined;
    if (!loaded) {
      throw new Error('libnut module loaded but libnut object is undefined');
    }
    // Only publish a validated object to the module-level cache.
    libnut = loaded;
    return loaded;
  } catch (error) {
    // Cache the wrapped error (not the raw one) so the first and all later
    // failures report the same actionable message.
    libnutLoadError = new Error(
      `Failed to load @computer-use/libnut. Make sure it is properly installed and compiled for your platform. Error: ${error}`,
    );
    throw libnutLoadError;
  }
}
88
+
89
/** Debug logger for this module, created with the 'computer:device' label. */
const debugDevice = getDebug('computer:device');
90
+
91
/**
 * Smooth mouse movement to trigger mousemove events.
 *
 * Moves the pointer from its current position to the target in evenly spaced
 * increments, pausing after each one. The last increment always lands exactly
 * on the (rounded) target.
 *
 * @param targetX Destination x coordinate.
 * @param targetY Destination y coordinate.
 * @param steps Requested number of increments; values below 1, fractional
 *   values and NaN are clamped to a whole number of at least 1 so the pointer
 *   is guaranteed to reach the target.
 * @param stepDelay Pause passed to `sleep()` after each increment.
 * @throws AssertionError when libnut has not been loaded yet.
 */
async function smoothMoveMouse(
  targetX: number,
  targetY: number,
  steps: number,
  stepDelay: number,
): Promise<void> {
  assert(libnut, 'libnut not initialized');
  const nut = libnut;
  const origin = nut.getMousePos();

  // A loop bounded by a non-positive or fractional `steps` would stop short of
  // the target, leaving the following click at the wrong position.
  const totalSteps = Number.isFinite(steps)
    ? Math.max(1, Math.floor(steps))
    : 1;

  for (let i = 1; i <= totalSteps; i++) {
    const stepX = Math.round(
      origin.x + ((targetX - origin.x) * i) / totalSteps,
    );
    const stepY = Math.round(
      origin.y + ((targetY - origin.y) * i) / totalSteps,
    );
    nut.moveMouse(stepX, stepY);
    await sleep(stepDelay);
  }
}
113
+
114
// Key name mapping for cross-platform compatibility
// Note: Modifier keys have different names when used as primary key vs modifier
const KEY_NAME_MAP: Record<string, string> = {
  // Modifier keys (for use in modifiers array)
  windows: 'win',
  win: 'win',
  ctrl: 'control',
  // Editing / control key abbreviations
  esc: 'escape',
  del: 'delete',
  ins: 'insert',
  // Navigation keys
  pgup: 'pageup',
  pgdn: 'pagedown',
  arrowup: 'up',
  arrowdown: 'down',
  arrowleft: 'left',
  arrowright: 'right',
  // Media keys
  volumedown: 'audio_vol_down',
  volumeup: 'audio_vol_up',
  mediavolumedown: 'audio_vol_down',
  mediavolumeup: 'audio_vol_up',
  mute: 'audio_mute',
  mediamute: 'audio_mute',
  mediaplay: 'audio_play',
  mediapause: 'audio_pause',
  mediaplaypause: 'audio_play',
  mediastop: 'audio_stop',
  medianexttrack: 'audio_next',
  mediaprevioustrack: 'audio_prev',
  medianext: 'audio_next',
  mediaprev: 'audio_prev',
};

// When pressing modifier keys alone (as primary key), use these names
// This is needed because libnut requires different key names for modifiers
// when they are the main key vs when they are in the modifiers array
const PRIMARY_KEY_MAP: Record<string, string> = {
  command: 'cmd',
  cmd: 'cmd',
  meta: 'meta',
  control: 'control',
  ctrl: 'control',
  shift: 'shift',
  alt: 'alt',
  option: 'alt',
};

/**
 * Look up an alias using own properties only, so names that happen to exist on
 * `Object.prototype` (e.g. 'constructor', '__proto__') are not mistaken for
 * aliases and returned as non-string values.
 */
function lookupKeyAlias(
  aliases: Record<string, string>,
  name: string,
): string | undefined {
  return Object.prototype.hasOwnProperty.call(aliases, name)
    ? aliases[name]
    : undefined;
}

/**
 * Lower-case a key name and strip surrounding whitespace (as produced by
 * splitting "Ctrl + A" on '+'). A whitespace-only name is kept verbatim so it
 * is not reduced to an empty string.
 */
function canonicalizeKeyInput(key: string): string {
  const trimmed = key.trim();
  return (trimmed === '' ? key : trimmed).toLowerCase();
}

/**
 * Normalize a key name for use in a modifiers array.
 *
 * @param key Key name in any letter case, optionally padded with whitespace.
 * @returns The alias from KEY_NAME_MAP, or the lower-cased name when no alias exists.
 */
function normalizeKeyName(key: string): string {
  const lowerKey = canonicalizeKeyInput(key);
  return lookupKeyAlias(KEY_NAME_MAP, lowerKey) || lowerKey;
}

/**
 * Normalize a key name for use as the primary (main) key of a key press.
 *
 * @param key Key name in any letter case, optionally padded with whitespace.
 * @returns The alias from PRIMARY_KEY_MAP if present, otherwise the alias from
 *   KEY_NAME_MAP, otherwise the lower-cased name.
 */
function normalizePrimaryKey(key: string): string {
  const lowerKey = canonicalizeKeyInput(key);
  return (
    // First check PRIMARY_KEY_MAP for modifier keys pressed alone
    lookupKeyAlias(PRIMARY_KEY_MAP, lowerKey) ||
    // Then use regular KEY_NAME_MAP
    lookupKeyAlias(KEY_NAME_MAP, lowerKey) ||
    lowerKey
  );
}
176
+
177
/** Description of one display as returned by `ComputerDevice.listDisplays()`. */
export interface DisplayInfo {
  /** Display identifier, always stringified. */
  id: string;
  /** Display name; falls back to `Display <id>` when the library gives none. */
  name: string;
  /** Whether the display is the primary one. */
  primary?: boolean;
}
182
+
183
/** Construction options for `ComputerDevice`. */
export interface ComputerDeviceOpt {
  /** Display to capture screenshots from; the default display is used when omitted. */
  displayId?: string;
  /** Extra actions appended after the built-in and platform actions. */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  customActions?: DeviceAction<any>[];
}
188
+
189
/**
 * Desktop device that drives the local machine: pointer and keyboard input go
 * through libnut, screenshots through `screenshot-desktop`.
 *
 * `connect()` must be called before `size()` or any action is used, because it
 * is what loads libnut.
 */
export class ComputerDevice implements AbstractInterface {
  interfaceType: InterfaceType = 'computer';
  private options?: ComputerDeviceOpt;
  private displayId?: string;
  private description?: string;
  private destroyed = false;
  uri?: string;

  /**
   * @param options Optional display selection and custom actions.
   */
  constructor(options?: ComputerDeviceOpt) {
    this.options = options;
    this.displayId = options?.displayId;
  }

  /** Returns the description built by `connect()`, or a generic label before that. */
  describe(): string {
    return this.description || 'Computer Device';
  }

  /**
   * Parse a key combination such as "Ctrl+Shift+A" into libnut arguments.
   *
   * The last '+'-separated segment is the primary key, the others are
   * modifiers. Whitespace around segments is ignored ("Ctrl + A"), and a
   * trailing literal plus ("Ctrl++" or "+") is read as the '+' key rather than
   * an empty key name.
   *
   * @throws AssertionError when no primary key can be determined.
   */
  private static parseKeyCombo(keyName: string): {
    key: string;
    modifiers: string[];
  } {
    const segments = keyName.split('+');
    let rawKey = segments.pop() ?? '';

    // "x++" splits into [..., '', '']: the two empty tails stand for a literal '+'.
    if (
      rawKey === '' &&
      segments.length > 0 &&
      segments[segments.length - 1] === ''
    ) {
      segments.pop();
      rawKey = '+';
    }

    // Keep a whitespace-only key verbatim instead of trimming it to nothing.
    const trimmedKey = rawKey.trim();
    const primary = trimmedKey === '' ? rawKey : trimmedKey;
    assert(primary !== '', `Invalid key combination: "${keyName}"`);

    const modifiers = segments
      .map((segment) => segment.trim())
      .filter((segment) => segment !== '')
      .map((segment) => normalizeKeyName(segment));

    // Use normalizePrimaryKey for the main key to handle modifier keys pressed alone
    return { key: normalizePrimaryKey(primary), modifiers };
  }

  /**
   * Get all available displays.
   *
   * Best-effort: when the screenshot library fails, the failure is logged and
   * an empty list is returned.
   */
  static async listDisplays(): Promise<DisplayInfo[]> {
    try {
      const displays: ScreenshotDisplay[] = await screenshot.listDisplays();
      return displays.map((d) => ({
        id: String(d.id),
        name: d.name || `Display ${d.id}`,
        primary: d.primary || false,
      }));
    } catch (error) {
      debugDevice(`Failed to list displays: ${error}`);
      return [];
    }
  }

  /**
   * Load libnut and build the device description (platform, display, screen
   * size, available displays).
   *
   * @throws Error when libnut cannot be loaded or the screen size cannot be read.
   */
  async connect(): Promise<void> {
    debugDevice('Connecting to computer device');

    try {
      // Load libnut on first connect
      libnut = await getLibnut();

      const size = await this.size();
      const displays = await ComputerDevice.listDisplays();

      this.description = `
Type: Computer
Platform: ${process.platform}
Display: ${this.displayId || 'Primary'}
Screen Size: ${size.width}x${size.height}
Available Displays: ${displays.length > 0 ? displays.map((d) => d.name).join(', ') : 'Unknown'}
`;
      debugDevice('Computer device connected', this.description);
    } catch (error) {
      debugDevice(`Failed to connect: ${error}`);
      throw new Error(`Unable to connect to computer device: ${error}`);
    }
  }

  /**
   * Capture the configured display (or the library default) as a PNG and
   * return it in the base64 image format produced by `createImgBase64ByFormat`.
   *
   * @throws Error when the screenshot library fails.
   */
  async screenshotBase64(): Promise<string> {
    debugDevice('Taking screenshot', { displayId: this.displayId });

    try {
      const options: ScreenshotOptions = { format: 'png' };
      if (this.displayId !== undefined) {
        // On macOS: displayId is numeric (CGDirectDisplayID)
        // On Windows: displayId is string like "\\.\DISPLAY1"
        // On Linux: displayId is string like ":0.0"
        if (process.platform === 'darwin') {
          const screenIndex = Number(this.displayId);
          if (!Number.isNaN(screenIndex)) {
            options.screen = screenIndex;
          } else {
            // Keep the best-effort fallback, but make the ignored id visible.
            debugDevice(
              `Ignoring non-numeric displayId "${this.displayId}" on macOS; using default display`,
            );
          }
        } else {
          // Windows and Linux use string IDs directly
          options.screen = this.displayId;
        }
      }

      debugDevice('Screenshot options', options);
      const buffer: Buffer = await screenshot(options);
      return createImgBase64ByFormat('png', buffer.toString('base64'));
    } catch (error) {
      debugDevice(`Screenshot failed: ${error}`);
      throw new Error(`Failed to take screenshot: ${error}`);
    }
  }

  /**
   * Screen size as reported by libnut, with `dpr` fixed to 1.
   *
   * @throws AssertionError when libnut has not been loaded (call `connect()` first).
   * @throws Error when libnut fails to report the size.
   */
  async size(): Promise<Size> {
    assert(libnut, 'libnut not initialized');
    try {
      const screenSize = libnut.getScreenSize();
      return {
        width: screenSize.width,
        height: screenSize.height,
        dpr: 1, // Desktop typically uses logical pixels
      };
    } catch (error) {
      debugDevice(`Failed to get screen size: ${error}`);
      throw new Error(`Failed to get screen size: ${error}`);
    }
  }

  /**
   * Build the list of actions this device supports: the built-in pointer and
   * keyboard actions, then the platform actions, then any custom actions from
   * the constructor options.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  actionSpace(): DeviceAction<any>[] {
    const defaultActions: DeviceAction<any>[] = [
      // Tap (single click)
      defineActionTap(async (param: ActionTapParam) => {
        assert(libnut, 'libnut not initialized');
        const nut = libnut;
        const element = param.locate as LocateResultElement;
        assert(element, 'Element not found, cannot tap');
        const [x, y] = element.center;
        const targetX = Math.round(x);
        const targetY = Math.round(y);

        await smoothMoveMouse(
          targetX,
          targetY,
          SMOOTH_MOVE_STEPS_TAP,
          SMOOTH_MOVE_DELAY_TAP,
        );
        // Use mouseToggle for more realistic click behavior
        nut.mouseToggle('down', 'left');
        try {
          await sleep(CLICK_HOLD_DURATION);
        } finally {
          // Never leave the button held down, even if the wait fails.
          nut.mouseToggle('up', 'left');
        }
      }),

      // DoubleClick
      defineActionDoubleClick(async (param) => {
        assert(libnut, 'libnut not initialized');
        const nut = libnut;
        const element = param.locate as LocateResultElement;
        assert(element, 'Element not found, cannot double click');
        const [x, y] = element.center;
        nut.moveMouse(Math.round(x), Math.round(y));
        nut.mouseClick('left', true);
      }),

      // RightClick
      defineActionRightClick(async (param) => {
        assert(libnut, 'libnut not initialized');
        const nut = libnut;
        const element = param.locate as LocateResultElement;
        assert(element, 'Element not found, cannot right click');
        const [x, y] = element.center;
        nut.moveMouse(Math.round(x), Math.round(y));
        nut.mouseClick('right');
      }),

      // Hover
      defineActionHover(async (param) => {
        assert(libnut, 'libnut not initialized');
        const element = param.locate as LocateResultElement;
        assert(element, 'Element not found, cannot hover');
        const [x, y] = element.center;
        const targetX = Math.round(x);
        const targetY = Math.round(y);

        await smoothMoveMouse(
          targetX,
          targetY,
          SMOOTH_MOVE_STEPS_HOVER,
          SMOOTH_MOVE_DELAY_HOVER,
        );
        await sleep(HOVER_EFFECT_WAIT);
      }),

      // Input
      defineAction({
        name: 'Input',
        description: 'Input text into the input field',
        interfaceAlias: 'aiInput',
        paramSchema: z.object({
          value: z.string().describe('The text to input'),
          mode: z
            .enum(['replace', 'clear', 'append'])
            .default('replace')
            .optional()
            .describe('Input mode: replace, clear, or append'),
          locate: getMidsceneLocationSchema()
            .describe('The input field to be filled')
            .optional(),
        }),
        call: async (param) => {
          assert(libnut, 'libnut not initialized');
          const nut = libnut;
          const element = param.locate as LocateResultElement | undefined;

          if (element && param.mode !== 'append') {
            // Click and clear
            const [x, y] = element.center;
            nut.moveMouse(Math.round(x), Math.round(y));
            nut.mouseClick('left');
            await sleep(INPUT_FOCUS_DELAY);

            // Select all and delete
            const modifier =
              process.platform === 'darwin' ? 'command' : 'control';
            nut.keyTap('a', [modifier]);
            await sleep(50);
            nut.keyTap('backspace');
            await sleep(INPUT_CLEAR_DELAY);
          }

          // 'clear' stops after emptying the field; an empty value types nothing.
          if (param.mode === 'clear') {
            return;
          }

          if (!param.value) {
            return;
          }

          nut.typeString(param.value);
        },
      }),

      // Scroll
      defineActionScroll(async (param) => {
        assert(libnut, 'libnut not initialized');
        const nut = libnut;

        if (param.locate) {
          // Scroll events go to whatever is under the pointer, so move there first.
          const element = param.locate as LocateResultElement;
          const [x, y] = element.center;
          nut.moveMouse(Math.round(x), Math.round(y));
        }

        const scrollType = param?.scrollType;

        // Scroll to edge actions
        const scrollToEdgeActions: Record<string, [number, number]> = {
          scrollToTop: [0, 10],
          scrollToBottom: [0, -10],
          scrollToLeft: [-10, 0],
          scrollToRight: [10, 0],
        };

        const edgeAction = scrollToEdgeActions[scrollType || ''];
        if (edgeAction) {
          const [dx, dy] = edgeAction;
          for (let i = 0; i < SCROLL_REPEAT_COUNT; i++) {
            nut.scrollMouse(dx, dy);
            await sleep(SCROLL_STEP_DELAY);
          }
          return;
        }

        // Single scroll action
        if (scrollType === 'singleAction' || !scrollType) {
          const distance = param?.distance || 500;
          // One scroll tick per 100 units of requested distance, rounded up.
          const ticks = Math.ceil(distance / 100);
          const direction = param?.direction || 'down';

          const directionMap: Record<string, [number, number]> = {
            up: [0, ticks],
            down: [0, -ticks],
            left: [-ticks, 0],
            right: [ticks, 0],
          };

          const [dx, dy] = directionMap[direction] || [0, -ticks];
          nut.scrollMouse(dx, dy);
          await sleep(SCROLL_COMPLETE_DELAY);
          return;
        }

        throw new Error(
          `Unknown scroll type: ${scrollType}, param: ${JSON.stringify(param)}`,
        );
      }),

      // KeyboardPress
      defineActionKeyboardPress(async (param) => {
        assert(libnut, 'libnut not initialized');
        const nut = libnut;

        if (param.locate) {
          // Click the target first so the key press goes to it.
          const [x, y] = param.locate.center;
          nut.moveMouse(Math.round(x), Math.round(y));
          nut.mouseClick('left');
          await sleep(50);
        }

        const { key, modifiers } = ComputerDevice.parseKeyCombo(param.keyName);

        debugDevice('KeyboardPress', {
          original: param.keyName,
          key,
          modifiers,
        });

        if (modifiers.length > 0) {
          nut.keyTap(key, modifiers);
        } else {
          nut.keyTap(key);
        }
      }),

      // DragAndDrop
      defineActionDragAndDrop(async (param) => {
        assert(libnut, 'libnut not initialized');
        const nut = libnut;
        const from = param.from as LocateResultElement;
        const to = param.to as LocateResultElement;
        assert(from, 'missing "from" param for drag and drop');
        assert(to, 'missing "to" param for drag and drop');

        const [fromX, fromY] = from.center;
        const [toX, toY] = to.center;

        nut.moveMouse(Math.round(fromX), Math.round(fromY));
        nut.mouseToggle('down', 'left');
        try {
          await sleep(100);
          nut.moveMouse(Math.round(toX), Math.round(toY));
          await sleep(100);
        } finally {
          // Always release, otherwise a failed drag leaves the button stuck down.
          nut.mouseToggle('up', 'left');
        }
      }),

      // ClearInput
      defineActionClearInput(async (param) => {
        assert(libnut, 'libnut not initialized');
        const nut = libnut;
        const element = param.locate as LocateResultElement;
        assert(element, 'Element not found, cannot clear input');

        const [x, y] = element.center;
        nut.moveMouse(Math.round(x), Math.round(y));
        nut.mouseClick('left');
        await sleep(100);

        const modifier = process.platform === 'darwin' ? 'command' : 'control';
        nut.keyTap('a', [modifier]);
        // Same pause as the Input action: let the selection register before deleting.
        await sleep(50);
        nut.keyTap('backspace');
        await sleep(50);
      }),
    ];

    const platformActions = Object.values(createPlatformActions());
    const customActions = this.options?.customActions || [];

    return [...defaultActions, ...platformActions, ...customActions];
  }

  /** Mark the device as destroyed; repeated calls are no-ops. */
  async destroy(): Promise<void> {
    if (this.destroyed) {
      return;
    }

    this.destroyed = true;
    debugDevice('Computer device destroyed');
  }

  /** Always resolves to an empty string for this device. */
  async url(): Promise<string> {
    return '';
  }
}
540
+
541
/**
 * Platform-specific actions.
 *
 * @returns An object keyed by action name; currently only `ListDisplays`,
 *   which resolves to the result of `ComputerDevice.listDisplays()`.
 */
function createPlatformActions() {
  return {
    ListDisplays: defineAction({
      name: 'ListDisplays',
      description: 'List all available displays/monitors',
      call: async () => {
        return await ComputerDevice.listDisplays();
      },
    }),
  } as const;
}
package/src/index.ts ADDED
@@ -0,0 +1,8 @@
1
+ export { ComputerDevice } from './device';
2
+ export type { ComputerDeviceOpt, DisplayInfo } from './device';
3
+
4
+ export { ComputerAgent, agentFromComputer } from './agent';
5
+ export type { ComputerAgentOpt } from './agent';
6
+
7
+ export { overrideAIConfig } from '@midscene/shared/env';
8
+ export { checkComputerEnvironment, getConnectedDisplays } from './utils';
@@ -0,0 +1,65 @@
1
+ import type { Agent } from '@midscene/core/agent';
2
+ import {
3
+ BaseMCPServer,
4
+ type Tool,
5
+ createMCPServerLauncher,
6
+ } from '@midscene/shared/mcp';
7
+ import { ComputerAgent } from './agent';
8
+ import { ComputerMidsceneTools } from './mcp-tools.js';
9
+
10
+ declare const __VERSION__: string;
11
+
12
/**
 * Computer MCP Server
 * Provides MCP tools for computer desktop automation
 */
export class ComputerMCPServer extends BaseMCPServer {
  /**
   * @param toolsManager Optional tools manager forwarded to the base class
   *   together with this server's name, version and description.
   */
  constructor(toolsManager?: ComputerMidsceneTools) {
    super(
      {
        name: '@midscene/computer-mcp',
        version: __VERSION__,
        description:
          'Control the computer desktop using natural language commands',
      },
      toolsManager,
    );
  }

  /**
   * Creates a fresh `ComputerMidsceneTools` instance.
   * NOTE(review): presumably invoked by `BaseMCPServer` when no tools manager
   * was passed to the constructor — confirm in @midscene/shared/mcp.
   */
  protected createToolsManager(): ComputerMidsceneTools {
    return new ComputerMidsceneTools();
  }
}
33
+
34
/**
 * Create an MCP server launcher for a specific Computer Agent.
 *
 * @param agent The agent handed to `createMCPServerLauncher`.
 * @returns Whatever `createMCPServerLauncher` returns for the 'Computer'
 *   platform, configured with `ComputerMidsceneTools` and `ComputerMCPServer`.
 */
export function mcpServerForAgent(agent: Agent | ComputerAgent) {
  return createMCPServerLauncher({
    agent,
    platformName: 'Computer',
    ToolsManagerClass: ComputerMidsceneTools,
    MCPServerClass: ComputerMCPServer,
  });
}
45
+
46
/**
 * Create MCP kit for a specific Computer Agent.
 *
 * Binds the agent to a new `ComputerMidsceneTools` manager, initializes its
 * tools and returns their definitions.
 *
 * @param agent The agent the tools operate on.
 * @returns A fixed description plus the tool definitions from the manager.
 */
export async function mcpKitForAgent(agent: Agent | ComputerAgent): Promise<{
  description: string;
  tools: Tool[];
}> {
  const toolsManager = new ComputerMidsceneTools();

  // No conversion happens here: a plain `Agent` is passed through unchanged and
  // only its static type is narrowed to what `setAgent` expects.
  toolsManager.setAgent(agent as ComputerAgent);
  await toolsManager.initTools();

  return {
    description: 'Midscene MCP Kit for computer desktop automation',
    tools: toolsManager.getToolDefinitions(),
  };
}