@midscene/computer 1.2.1-beta-20260112081017.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,96 @@
+ import { z } from '@midscene/core';
+ import { getDebug } from '@midscene/shared/logger';
+ import { BaseMidsceneTools, type ToolDefinition } from '@midscene/shared/mcp';
+ import { type ComputerAgent, agentFromComputer } from './agent';
+ import { ComputerDevice } from './device';
+
+ const debug = getDebug('mcp:computer-tools');
+
+ /**
+  * Computer-specific tools manager
+  * Extends BaseMidsceneTools to provide desktop automation tools
+  */
+ export class ComputerMidsceneTools extends BaseMidsceneTools<ComputerAgent> {
+   protected createTemporaryDevice() {
+     // Create minimal temporary instance
+     return new ComputerDevice({});
+   }
+
+   protected async ensureAgent(displayId?: string): Promise<ComputerAgent> {
+     if (this.agent && displayId) {
+       // If a specific displayId is requested and we have an agent,
+       // destroy it to create a new one with the new display
+       try {
+         await this.agent.destroy?.();
+       } catch (error) {
+         debug('Failed to destroy agent during cleanup:', error);
+       }
+       this.agent = undefined;
+     }
+
+     if (this.agent) {
+       return this.agent;
+     }
+
+     debug('Creating Computer agent with displayId:', displayId || 'primary');
+     const opts = displayId ? { displayId } : undefined;
+     const agent = await agentFromComputer(opts);
+     this.agent = agent;
+     return agent;
+   }
+
+   /**
+    * Provide Computer-specific platform tools
+    */
+   protected preparePlatformTools(): ToolDefinition[] {
+     return [
+       {
+         name: 'computer_connect',
+         description:
+           'Connect to computer desktop. If displayId not provided, uses the primary display.',
+         schema: {
+           displayId: z
+             .string()
+             .optional()
+             .describe('Display ID (from list_displays)'),
+         },
+         handler: async ({ displayId }: { displayId?: string }) => {
+           const agent = await this.ensureAgent(displayId);
+           const screenshot = await agent.interface.screenshotBase64();
+
+           return {
+             content: [
+               {
+                 type: 'text',
+                 text: `Connected to computer${displayId ? ` (Display: ${displayId})` : ' (Primary display)'}`,
+               },
+               ...this.buildScreenshotContent(screenshot),
+             ],
+           };
+         },
+       },
+       {
+         name: 'computer_disconnect',
+         description: 'Disconnect from computer and release resources',
+         schema: {},
+         handler: this.createDisconnectHandler('computer'),
+       },
+       {
+         name: 'computer_list_displays',
+         description: 'List all available displays/monitors',
+         schema: {},
+         handler: async () => {
+           const displays = await ComputerDevice.listDisplays();
+           return {
+             content: [
+               {
+                 type: 'text',
+                 text: `Available displays:\n${displays.map((d) => `- ${d.name} (ID: ${d.id})${d.primary ? ' [PRIMARY]' : ''}`).join('\n')}`,
+               },
+             ],
+           };
+         },
+       },
+     ];
+   }
+ }
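The `computer_connect` handler above is a thin wrapper around `agentFromComputer`. A minimal sketch of the equivalent direct call, assuming `agentFromComputer` is re-exported from the package root (the test files below import it from `../../src`); the `displayId` parameter is optional, exactly as in the handler:

import { agentFromComputer } from '@midscene/computer';

async function connectSketch(displayId?: string) {
  // Mirror the handler: pass { displayId } for a specific monitor, or nothing for the primary display
  const agent = await agentFromComputer(displayId ? { displayId } : undefined);
  // The handler returns this capture as screenshot content
  const screenshot = await agent.interface.screenshotBase64();
  console.log('Connected, base64 screenshot length:', screenshot.length);
  return agent;
}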
@@ -0,0 +1,36 @@
+ declare module '@computer-use/libnut/dist/import_libnut' {
+   interface ScreenSize {
+     width: number;
+     height: number;
+   }
+
+   interface Point {
+     x: number;
+     y: number;
+   }
+
+   type MouseButton = 'left' | 'right' | 'middle';
+   type ToggleState = 'up' | 'down';
+
+   interface LibNut {
+     getScreenSize(): ScreenSize;
+     getMousePos(): Point;
+     moveMouse(x: number, y: number): void;
+     mouseClick(button?: MouseButton, double?: boolean): void;
+     mouseToggle(state: ToggleState, button?: MouseButton): void;
+     scrollMouse(x: number, y: number): void;
+     keyTap(key: string, modifiers?: string[]): void;
+     typeString(text: string): void;
+   }
+
+   export const libnut: LibNut;
+ }
+
+ declare module '@computer-use/libnut' {
+   interface ScreenSize {
+     width: number;
+     height: number;
+   }
+
+   export function getScreenSize(): ScreenSize;
+ }
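These ambient declarations type the native `@computer-use/libnut` binding. A minimal sketch of driving it through the declared surface only (coordinates and text are arbitrary; the native module must be installed and loadable on the current platform):

import { libnut } from '@computer-use/libnut/dist/import_libnut';

// Center the cursor on the primary screen, click, then type a line of text
const { width, height } = libnut.getScreenSize();
libnut.moveMouse(Math.round(width / 2), Math.round(height / 2));
libnut.mouseClick('left');
libnut.typeString('hello from libnut');
libnut.keyTap('enter');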
package/src/utils.ts ADDED
@@ -0,0 +1,51 @@
+ import { ComputerDevice, type DisplayInfo } from './device';
+
+ export interface EnvironmentCheck {
+   available: boolean;
+   error?: string;
+   platform: string;
+   displays: number;
+ }
+
+ /**
+  * Check if the computer environment is available
+  */
+ export async function checkComputerEnvironment(): Promise<EnvironmentCheck> {
+   try {
+     const libnutModule = await import(
+       '@computer-use/libnut/dist/import_libnut'
+     );
+     const libnut = libnutModule.libnut;
+     const screenSize = libnut.getScreenSize();
+     if (!screenSize || screenSize.width <= 0) {
+       return {
+         available: false,
+         error: 'libnut cannot get screen size',
+         platform: process.platform,
+         displays: 0,
+       };
+     }
+
+     const displays = await ComputerDevice.listDisplays();
+     return {
+       available: true,
+       platform: process.platform,
+       displays: displays.length,
+     };
+   } catch (error) {
+     const errorMessage = error instanceof Error ? error.message : String(error);
+     return {
+       available: false,
+       error: errorMessage,
+       platform: process.platform,
+       displays: 0,
+     };
+   }
+ }
+
+ /**
+  * Get all connected displays
+  */
+ export async function getConnectedDisplays(): Promise<DisplayInfo[]> {
+   return ComputerDevice.listDisplays();
+ }
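A minimal sketch of gating automation on the environment probe above. The relative import assumes package-internal usage (as in `src/utils.ts`); whether these helpers are also exposed from the package root is not shown in this diff:

import { checkComputerEnvironment, getConnectedDisplays } from './utils';

async function preflight() {
  const env = await checkComputerEnvironment();
  if (!env.available) {
    // e.g. libnut failed to load, or it reported a zero-sized screen
    throw new Error(`Desktop automation unavailable on ${env.platform}: ${env.error}`);
  }
  const displays = await getConnectedDisplays();
  console.log(`${displays.length} display(s) detected on ${env.platform}`);
}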
@@ -0,0 +1,85 @@
+ import { beforeAll, describe, expect, it, vi } from 'vitest';
+ import { type ComputerAgent, agentFromComputer } from '../../src';
+ import { openBrowserAndNavigate } from './test-utils';
+
+ vi.setConfig({
+   testTimeout: 120 * 1000,
+ });
+
+ const isCacheEnabled = process.env.MIDSCENE_CACHE;
+
+ describe('computer todo app automation', () => {
+   let agent: ComputerAgent;
+
+   beforeAll(async () => {
+     agent = await agentFromComputer({
+       aiActionContext:
+         'You are testing a web application on a desktop browser.',
+     });
+   });
+
+   it(
+     'should automate todo list operations',
+     async () => {
+       if (isCacheEnabled) {
+         vi.setConfig({ testTimeout: 1000 * 1000 });
+       }
+
+       await openBrowserAndNavigate(
+         agent,
+         'https://todomvc.com/examples/react/dist/',
+       );
+
+       // Wait for page to load
+       await agent.aiAssert('The todo input box is visible');
+
+       // Add tasks
+       await agent.aiAct('Enter "Happy Birthday" in the task box');
+       await agent.aiAct(
+         'Enter "Learn JS today" in the task box, then press Enter to create',
+       );
+
+       await agent.aiAct(
+         'Enter "Learn Rust tomorrow" in the task box, then press Enter to create',
+       );
+       await agent.aiAct(
+         'Enter "Learning AI the day after tomorrow" in the task box, then press Enter to create',
+       );
+
+       // Verify tasks were created
+       const allTaskList = await agent.aiQuery<string[]>(
+         'string[], tasks in the list',
+       );
+       console.log('allTaskList', allTaskList);
+       expect(allTaskList).toContain('Learn JS today');
+       expect(allTaskList).toContain('Learn Rust tomorrow');
+       expect(allTaskList).toContain('Learning AI the day after tomorrow');
+
+       // Interact with tasks - hover to show delete button, then click it
+       await agent.aiAct(
+         'Move your mouse over the second item in the task list',
+       );
+       await agent.aiAct(
+         'Click the delete button to the right of the second task',
+       );
+       await agent.aiAct('Click the checkbox next to the second task');
+       await agent.aiAct(
+         'Click the "completed" Status button below the task list',
+       );
+
+       // Verify remaining tasks
+       const taskList = await agent.aiQuery<string[]>(
+         'string[], Extract all task names from the list',
+       );
+       expect(taskList.length).toBe(1);
+       expect(taskList[0]).toBe('Learning AI the day after tomorrow');
+
+       // Verify placeholder text
+       const placeholder = await agent.aiQuery(
+         'string, return the placeholder text in the input box',
+       );
+       expect(placeholder).toBe('What needs to be done?');
+     },
+     360 * 1000,
+   );
+ });
@@ -0,0 +1,56 @@
+ import { sleep } from '@midscene/core/utils';
+ import { beforeAll, describe, it, vi } from 'vitest';
+ import { type ComputerAgent, agentFromComputer } from '../../src';
+ import { openBrowserAndNavigate } from './test-utils';
+
+ vi.setConfig({
+   testTimeout: 120 * 1000,
+ });
+
+ const isCacheEnabled = process.env.MIDSCENE_CACHE;
+
+ describe('computer shop app automation', () => {
+   let agent: ComputerAgent;
+
+   beforeAll(async () => {
+     agent = await agentFromComputer({
+       aiActionContext:
+         'You are testing a web application on a desktop browser.',
+     });
+   });
+
+   it(
+     'should automate shop login and cart operations',
+     async () => {
+       if (isCacheEnabled) {
+         vi.setConfig({ testTimeout: 1000 * 1000 });
+       }
+
+       await openBrowserAndNavigate(agent, 'https://www.saucedemo.com/');
+
+       // Wait for page to load
+       await agent.aiAssert('The login form is visible');
+
+       // Login
+       await agent.aiAct('type "standard_user" in user name input');
+       await agent.aiAct('type "secret_sauce" in password input');
+       await agent.aiAct('click Login Button');
+       await sleep(2000);
+
+       // Check the login success
+       await agent.aiAssert('the page title is "Swag Labs"');
+
+       // Add to cart
+       await agent.aiAct('click "add to cart" for black t-shirt products');
+       await sleep(500);
+
+       // Click cart icon
+       await agent.aiAct('click right top cart icon');
+       await sleep(1000);
+
+       // Verify cart page loaded
+       await agent.aiAssert('The cart page is displayed');
+     },
+     360 * 1000,
+   );
+ });
@@ -0,0 +1,46 @@
+ import { sleep } from '@midscene/core/utils';
+ import { beforeAll, describe, it, vi } from 'vitest';
+ import { ComputerAgent, ComputerDevice } from '../../src';
+
+ vi.setConfig({
+   testTimeout: 120 * 1000,
+ });
+
+ describe('computer basic operations', () => {
+   let agent: ComputerAgent;
+
+   beforeAll(async () => {
+     const device = new ComputerDevice({});
+     agent = new ComputerAgent(device, {
+       aiActionContext:
+         'You are controlling a desktop computer. This is a test environment.',
+     });
+     await device.connect();
+   });
+
+   it(
+     'basic desktop interactions',
+     async () => {
+       // Take screenshot and query screen info
+       const screenInfo = await agent.aiQuery(
+         '{width: number, height: number, hasContent: boolean}, get current screen resolution and check if screen has visible content',
+       );
+       console.log('Screen info:', screenInfo);
+
+       // Move mouse
+       await agent.aiAct('move mouse to the center of the screen');
+       await sleep(500);
+
+       // Verify screen has content
+       await agent.aiAssert('The screen has visible content');
+
+       // Test moving mouse to corners
+       await agent.aiAct('move mouse to the top-left corner of the screen');
+       await sleep(500);
+
+       await agent.aiAct('move mouse to the bottom-right corner of the screen');
+       await sleep(500);
+     },
+     360 * 1000,
+   );
+ });
@@ -0,0 +1,66 @@
+ import { sleep } from '@midscene/core/utils';
+ import { beforeAll, describe, it, vi } from 'vitest';
+ import { ComputerAgent, ComputerDevice } from '../../src';
+
+ vi.setConfig({
+   testTimeout: 120 * 1000,
+ });
+
+ describe('computer keyboard operations', () => {
+   let agent: ComputerAgent;
+
+   beforeAll(async () => {
+     const device = new ComputerDevice({});
+     agent = new ComputerAgent(device, {
+       aiActionContext:
+         'You are testing keyboard operations on a desktop computer.',
+     });
+     await device.connect();
+   });
+
+   it(
+     'keyboard shortcuts test',
+     async () => {
+       const isMac = process.platform === 'darwin';
+
+       // Take screenshot to verify current screen state
+       const initialState = await agent.aiQuery(
+         '{hasVisibleContent: boolean}, check if there is visible content on screen',
+       );
+       console.log('Initial screen state:', initialState);
+
+       // Verify screen has content
+       await agent.aiAssert('The screen has visible content');
+
+       // Test mouse movement
+       await agent.aiAct('move mouse to center of screen');
+       await sleep(300);
+
+       // Test modifier key combinations
+       if (isMac) {
+         // Cmd+Tab to show app switcher
+         await agent.aiAct('press Command+Tab');
+         await sleep(500);
+         // Press Command alone to release and dismiss app switcher
+         await agent.aiAct('press Command');
+         await sleep(300);
+       } else {
+         // Alt+Tab to show app switcher
+         await agent.aiAct('press Alt+Tab');
+         await sleep(500);
+         // Click to dismiss (Alt alone may not work on Windows)
+         await agent.aiAct('click mouse');
+         await sleep(300);
+       }
+
+       // Take screenshot to verify final state
+       const finalState = await agent.aiQuery(
+         '{hasVisibleContent: boolean}, check current screen state',
+       );
+       console.log('Final screen state:', finalState);
+
+       await agent.aiAssert('Screen content is still visible');
+     },
+     360 * 1000,
+   );
+ });
@@ -0,0 +1,76 @@
+ import { sleep } from '@midscene/core/utils';
+ import { describe, it, vi } from 'vitest';
+ import { ComputerDevice, agentFromComputer } from '../../src';
+
+ vi.setConfig({
+   testTimeout: 120 * 1000,
+ });
+
+ describe('computer multi display', () => {
+   it(
+     'connect to multiple displays',
+     async () => {
+       // List all displays
+       const displays = await ComputerDevice.listDisplays();
+       console.log('Available displays:', displays);
+
+       if (displays.length < 2) {
+         console.warn(
+           `Only ${displays.length} display(s) found, need at least 2 for multi-display test`,
+         );
+         // Still test single display
+         if (displays.length === 1) {
+           const agent = await agentFromComputer({
+             displayId: displays[0].id,
+           });
+           await agent.aiAct('move mouse to center of screen');
+           await agent.aiAssert('Screen has visible content');
+         }
+         return;
+       }
+
+       // Connect to first display
+       const display1 = displays[0];
+       console.log(
+         `Connecting to display 1: ${display1.name} (ID: ${display1.id})`,
+       );
+       const agent1 = await agentFromComputer({
+         displayId: display1.id,
+         aiActionContext: `You are controlling display 1: ${display1.name}`,
+       });
+
+       // Operate on first display
+       await agent1.aiAct('move mouse to center of screen');
+       await sleep(500);
+       const screen1Info = await agent1.aiQuery(
+         '{hasContent: boolean}, check if display has visible content',
+       );
+       console.log('Display 1 info:', screen1Info);
+
+       // Connect to second display
+       const display2 = displays[1];
+       console.log(
+         `Connecting to display 2: ${display2.name} (ID: ${display2.id})`,
+       );
+       const agent2 = await agentFromComputer({
+         displayId: display2.id,
+         aiActionContext: `You are controlling display 2: ${display2.name}`,
+       });
+
+       // Operate on second display
+       await agent2.aiAct('move mouse to center of screen');
+       await sleep(500);
+       const screen2Info = await agent2.aiQuery(
+         '{hasContent: boolean}, check if display has visible content',
+       );
+       console.log('Display 2 info:', screen2Info);
+
+       // Verify both displays have content
+       await agent1.aiAssert('This display has visible content');
+       await agent2.aiAssert('This display has visible content');
+
+       console.log('Multi-display test completed successfully');
+     },
+     360 * 1000,
+   );
+ });
@@ -0,0 +1,31 @@
+ import { sleep } from '@midscene/core/utils';
+ import type { ComputerAgent } from '../../src';
+
+ const IS_MAC = process.platform === 'darwin';
+
+ /**
+  * Opens a browser and navigates to the specified URL
+  */
+ export async function openBrowserAndNavigate(
+   agent: ComputerAgent,
+   url: string,
+ ): Promise<void> {
+   if (IS_MAC) {
+     await agent.aiAct('press Cmd+Space');
+     await sleep(500);
+     await agent.aiAct('type "Safari" and press Enter');
+     await sleep(2000);
+     await agent.aiAct('press Cmd+L to focus address bar');
+   } else {
+     await agent.aiAct('press Windows key');
+     await sleep(500);
+     await agent.aiAct('type "Chrome" and press Enter');
+     await sleep(2000);
+     await agent.aiAct('press Ctrl+L to focus address bar');
+   }
+   await sleep(300);
+
+   await agent.aiAct(`type "${url}"`);
+   await agent.aiAct('press Enter');
+   await sleep(3000);
+ }
@@ -0,0 +1,63 @@
+ import { sleep } from '@midscene/core/utils';
+ import { beforeAll, describe, it, vi } from 'vitest';
+ import { ComputerAgent, ComputerDevice } from '../../src';
+
+ vi.setConfig({
+   testTimeout: 240 * 1000,
+ });
+
+ describe('computer web browser automation', () => {
+   let agent: ComputerAgent;
+
+   beforeAll(async () => {
+     const device = new ComputerDevice({});
+     agent = new ComputerAgent(device, {
+       aiActionContext:
+         'You are automating a web browser on a desktop computer. If any popup appears, close it.',
+     });
+     await device.connect();
+   });
+
+   it(
+     'open browser and navigate',
+     async () => {
+       const isMac = process.platform === 'darwin';
+
+       // Open browser (using platform-specific shortcuts)
+       if (isMac) {
+         await agent.aiAct('press Cmd+Space to open Spotlight');
+         await sleep(1000);
+         await agent.aiAct('type "Safari" and press Enter');
+       } else {
+         await agent.aiAct('press Windows key');
+         await sleep(1000);
+         await agent.aiAct('type "Chrome" and press Enter');
+       }
+
+       await sleep(3000);
+
+       // Wait for browser to open
+       await agent.aiWaitFor('Browser window is open');
+
+       // Navigate to website
+       await agent.aiAct('click on address bar and type "example.com"');
+       await sleep(1000);
+       await agent.aiAct('press Enter');
+
+       await sleep(3000);
+
+       // Verify page loaded
+       await agent.aiWaitFor('Page has loaded');
+
+       // Extract page info
+       const pageInfo = await agent.aiQuery(
+         '{title: string, hasContent: boolean}, extract page title and check if content exists',
+       );
+       console.log('Page info:', pageInfo);
+
+       // Assert page content
+       await agent.aiAssert('The page has text content');
+     },
+     720 * 1000,
+   );
+ });
@@ -0,0 +1,34 @@
+ import { describe, expect, it } from 'vitest';
+ import { ComputerAgent, ComputerDevice } from '../../src';
+
+ describe('ComputerAgent', () => {
+   it('should create agent instance', () => {
+     const device = new ComputerDevice({});
+     const agent = new ComputerAgent(device);
+
+     expect(agent).toBeDefined();
+     expect(agent.interface).toBe(device);
+   });
+
+   it('should create agent with options', () => {
+     const device = new ComputerDevice({ displayId: 'test' });
+     const agent = new ComputerAgent(device, {
+       aiActionContext: 'Test context',
+     });
+
+     expect(agent).toBeDefined();
+   });
+
+   it('should create agent with custom actions', () => {
+     const device = new ComputerDevice({
+       customActions: [],
+     });
+     const agent = new ComputerAgent(device);
+
+     expect(agent).toBeDefined();
+     expect(agent.interface).toBeDefined();
+   });
+
+   // Note: Tests that require actual libnut functionality (like agentFromComputer with connect)
+   // should be run as AI tests or integration tests where native modules are available
+ });
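Following the note above, a sketch of how an integration suite could skip itself when the native module cannot load, reusing `checkComputerEnvironment` from `src/utils.ts` (`describe.skipIf` is standard Vitest; the relative import path is an assumption):

import { describe, it } from 'vitest';
import { checkComputerEnvironment } from '../../src/utils';

// Probe once at module load; top-level await is available in Vitest's ESM test files
const env = await checkComputerEnvironment();

describe.skipIf(!env.available)('computer integration (requires libnut)', () => {
  it('runs only where the native binding can load', async () => {
    // place tests that actually drive the desktop here
  });
});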