@midscene/computer 1.8.1-beta-20260513084557.0 → 1.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/es/index.mjs CHANGED
@@ -4,8 +4,7 @@ import { existsSync } from "node:fs";
4
4
  import { createRequire } from "node:module";
5
5
  import { dirname, resolve as external_node_path_resolve } from "node:path";
6
6
  import { fileURLToPath } from "node:url";
7
- import { getMidsceneLocationSchema, z } from "@midscene/core";
8
- import { actionHoverParamSchema, actionTapParamSchema, defineAction, defineActionClearInput, defineActionDoubleClick, defineActionDragAndDrop, defineActionHover, defineActionInput, defineActionKeyboardPress, defineActionRightClick, defineActionScroll, defineActionTap } from "@midscene/core/device";
7
+ import { defineAction, defineActionsFromInputPrimitives } from "@midscene/core/device";
9
8
  import { sleep } from "@midscene/core/utils";
10
9
  import { createImgBase64ByFormat } from "@midscene/shared/img";
11
10
  import { getDebug } from "@midscene/shared/logger";
@@ -13,6 +12,7 @@ import screenshot_desktop from "screenshot-desktop";
13
12
  import { Agent } from "@midscene/core/agent";
14
13
  import { once } from "node:events";
15
14
  import { createInterface } from "node:readline";
15
+ import { z } from "@midscene/core";
16
16
  import { BaseMidsceneTools } from "@midscene/shared/mcp/base-tools";
17
17
  import { overrideAIConfig } from "@midscene/shared/env";
18
18
  const debugXvfb = getDebug('computer:xvfb');
@@ -92,15 +92,6 @@ function _define_property(obj, key, value) {
92
92
  else obj[key] = value;
93
93
  return obj;
94
94
  }
95
- const computerInputParamSchema = z.object({
96
- value: z.string().describe('The text to input'),
97
- mode: z["enum"]([
98
- 'replace',
99
- 'clear',
100
- 'append'
101
- ]).default('replace').optional().describe('Input mode: replace, clear, or append'),
102
- locate: getMidsceneLocationSchema().describe('The input field to be filled').optional()
103
- });
104
95
  const SMOOTH_MOVE_STEPS_TAP = 8;
105
96
  const SMOOTH_MOVE_STEPS_MOUSE_MOVE = 10;
106
97
  const SMOOTH_MOVE_DELAY_TAP = 8;
@@ -411,7 +402,7 @@ Available Displays: ${displays.length > 0 ? displays.map((d)=>d.name).join(', ')
411
402
  }
412
403
  async healthCheck() {
413
404
  console.log('[HealthCheck] Starting health check...');
414
- console.log("[HealthCheck] @midscene/computer v1.8.1-beta-20260513084557.0");
405
+ console.log("[HealthCheck] @midscene/computer v1.8.1");
415
406
  console.log('[HealthCheck] Taking screenshot...');
416
407
  const screenshotTimeout = 15000;
417
408
  let timeoutId;
@@ -477,21 +468,38 @@ Available Displays: ${displays.length > 0 ? displays.map((d)=>d.name).join(', ')
477
468
  debugDevice('Taking screenshot', {
478
469
  displayId: this.displayId
479
470
  });
480
- try {
481
- const options = {
482
- format: 'png'
483
- };
484
- if (void 0 !== this.displayId) if ('darwin' === process.platform) {
485
- const screenIndex = Number(this.displayId);
486
- if (!Number.isNaN(screenIndex)) options.screen = screenIndex;
487
- } else options.screen = this.displayId;
488
- debugDevice('Screenshot options', options);
471
+ const options = {
472
+ format: 'png'
473
+ };
474
+ if (void 0 !== this.displayId) if ('darwin' === process.platform) {
475
+ const screenIndex = Number(this.displayId);
476
+ if (!Number.isNaN(screenIndex)) options.screen = screenIndex;
477
+ } else options.screen = this.displayId;
478
+ debugDevice('Screenshot options', options);
479
+ const MAX_ATTEMPTS = 3;
480
+ const RETRY_DELAY_MS = 300;
481
+ let lastRawMessage = '';
482
+ for(let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++)try {
489
483
  const buffer = await screenshot_desktop(options);
484
+ if (attempt > 1) debugDevice(`Screenshot succeeded on attempt ${attempt}`);
490
485
  return createImgBase64ByFormat('png', buffer.toString('base64'));
491
486
  } catch (error) {
492
- debugDevice(`Screenshot failed: ${error}`);
493
- throw new Error(`Failed to take screenshot: ${error}`);
487
+ lastRawMessage = error instanceof Error ? error.message : String(error);
488
+ const isMacTransient = 'darwin' === process.platform && /could not create image from display/i.test(lastRawMessage);
489
+ const willRetry = isMacTransient && attempt < MAX_ATTEMPTS;
490
+ debugDevice(`Screenshot attempt ${attempt} failed: ${lastRawMessage}${willRetry ? ' — retrying' : ''}`);
491
+ if (!willRetry) break;
492
+ await sleep(RETRY_DELAY_MS);
494
493
  }
494
+ if ('darwin' === process.platform && /could not create image from display/i.test(lastRawMessage)) throw new Error(`Failed to take screenshot on macOS: the host process is missing Screen Recording permission, or the target display is locked/sleeping.
495
+
496
+ Please follow these steps:
497
+ 1. Open System Settings > Privacy & Security > Screen Recording
498
+ 2. Enable the application running this script (e.g., Terminal, iTerm2, VS Code, WebStorm, or Midscene Studio)
499
+ 3. Fully quit and relaunch that application after granting permission — macOS only re-reads this permission on process launch.
500
+
501
+ Original error: ${lastRawMessage}`);
502
+ throw new Error(`Failed to take screenshot: ${lastRawMessage}`);
495
503
  }
496
504
  async size() {
497
505
  node_assert(device_libnut, 'libnut not initialized');
@@ -537,228 +545,111 @@ Available Displays: ${displays.length > 0 ? displays.map((d)=>d.name).join(', ')
537
545
  node_assert(device_libnut, 'libnut not initialized');
538
546
  await this.typeViaClipboard(text);
539
547
  }
548
+ async selectAllAndDelete() {
549
+ node_assert(device_libnut, 'libnut not initialized');
550
+ if (this.useAppleScript) {
551
+ sendKeyViaAppleScript('a', [
552
+ 'command'
553
+ ]);
554
+ await sleep(50);
555
+ sendKeyViaAppleScript('backspace', []);
556
+ return;
557
+ }
558
+ const modifier = 'darwin' === process.platform ? 'command' : 'control';
559
+ device_libnut.keyTap('a', [
560
+ modifier
561
+ ]);
562
+ await sleep(50);
563
+ device_libnut.keyTap('backspace');
564
+ }
565
+ async pressKeyboardShortcut(keyName) {
566
+ node_assert(device_libnut, 'libnut not initialized');
567
+ const keys = keyName.split('+');
568
+ const modifiers = keys.slice(0, -1).map(normalizeKeyName);
569
+ const key = normalizePrimaryKey(keys[keys.length - 1]);
570
+ debugDevice('KeyboardPress', {
571
+ original: keyName,
572
+ key,
573
+ modifiers,
574
+ driver: this.useAppleScript ? "applescript" : 'libnut'
575
+ });
576
+ if (this.useAppleScript) sendKeyViaAppleScript(key, modifiers);
577
+ else if (modifiers.length > 0) device_libnut.keyTap(key, modifiers);
578
+ else device_libnut.keyTap(key);
579
+ }
580
+ async performScroll(param) {
581
+ node_assert(device_libnut, 'libnut not initialized');
582
+ if (param.locate) {
583
+ const element = param.locate;
584
+ const [x, y] = element.center;
585
+ device_libnut.moveMouse(Math.round(x), Math.round(y));
586
+ }
587
+ const scrollType = param?.scrollType;
588
+ const edgeSpec = scrollType && scrollType in EDGE_SCROLL_SPEC ? EDGE_SCROLL_SPEC[scrollType] : null;
589
+ if (edgeSpec) {
590
+ if (runPhasedScroll(edgeSpec.direction, EDGE_SCROLL_TOTAL_PX, EDGE_SCROLL_STEPS)) return void await sleep(SCROLL_COMPLETE_DELAY);
591
+ if (this.useAppleScript) {
592
+ sendKeyViaAppleScript(edgeSpec.key);
593
+ await sleep(SCROLL_COMPLETE_DELAY);
594
+ return;
595
+ }
596
+ const [dx, dy] = edgeSpec.libnut;
597
+ for(let i = 0; i < SCROLL_REPEAT_COUNT; i++){
598
+ device_libnut.scrollMouse(dx, dy);
599
+ await sleep(SCROLL_STEP_DELAY);
600
+ }
601
+ return;
602
+ }
603
+ if ('singleAction' === scrollType || !scrollType) {
604
+ const distance = param?.distance || 500;
605
+ const direction = param?.direction || 'down';
606
+ const isKnownDirection = 'up' === direction || 'down' === direction || 'left' === direction || 'right' === direction;
607
+ if (isKnownDirection) {
608
+ const steps = Math.max(PHASED_MIN_STEPS, Math.round(distance / PHASED_PIXELS_PER_STEP));
609
+ if (runPhasedScroll(direction, distance, steps)) return void await sleep(SCROLL_COMPLETE_DELAY);
610
+ }
611
+ if (this.useAppleScript && ('up' === direction || 'down' === direction)) {
612
+ const pages = Math.max(1, Math.round(distance / APPROX_VIEWPORT_HEIGHT_PX));
613
+ const key = 'up' === direction ? 'pageup' : 'pagedown';
614
+ for(let i = 0; i < pages; i++){
615
+ sendKeyViaAppleScript(key);
616
+ await sleep(SCROLL_STEP_DELAY);
617
+ }
618
+ await sleep(SCROLL_COMPLETE_DELAY);
619
+ return;
620
+ }
621
+ const ticks = Math.ceil(distance / 100);
622
+ const directionMap = {
623
+ up: [
624
+ 0,
625
+ ticks
626
+ ],
627
+ down: [
628
+ 0,
629
+ -ticks
630
+ ],
631
+ left: [
632
+ -ticks,
633
+ 0
634
+ ],
635
+ right: [
636
+ ticks,
637
+ 0
638
+ ]
639
+ };
640
+ const [dx, dy] = directionMap[direction] || [
641
+ 0,
642
+ -ticks
643
+ ];
644
+ device_libnut.scrollMouse(dx, dy);
645
+ await sleep(SCROLL_COMPLETE_DELAY);
646
+ return;
647
+ }
648
+ throw new Error(`Unknown scroll type: ${scrollType}, param: ${JSON.stringify(param)}`);
649
+ }
540
650
  actionSpace() {
541
651
  const defaultActions = [
542
- defineActionTap(async (param)=>{
543
- node_assert(device_libnut, 'libnut not initialized');
544
- const element = param.locate;
545
- node_assert(element, 'Element not found, cannot tap');
546
- const [x, y] = element.center;
547
- const targetX = Math.round(x);
548
- const targetY = Math.round(y);
549
- await smoothMoveMouse(targetX, targetY, SMOOTH_MOVE_STEPS_TAP, SMOOTH_MOVE_DELAY_TAP);
550
- device_libnut.mouseToggle('down', 'left');
551
- await sleep(CLICK_HOLD_DURATION);
552
- device_libnut.mouseToggle('up', 'left');
553
- }),
554
- defineActionDoubleClick(async (param)=>{
555
- node_assert(device_libnut, 'libnut not initialized');
556
- const element = param.locate;
557
- node_assert(element, 'Element not found, cannot double click');
558
- const [x, y] = element.center;
559
- device_libnut.moveMouse(Math.round(x), Math.round(y));
560
- device_libnut.mouseClick('left', true);
561
- }),
562
- defineActionRightClick(async (param)=>{
563
- node_assert(device_libnut, 'libnut not initialized');
564
- const element = param.locate;
565
- node_assert(element, 'Element not found, cannot right click');
566
- const [x, y] = element.center;
567
- device_libnut.moveMouse(Math.round(x), Math.round(y));
568
- device_libnut.mouseClick('right');
569
- }),
570
- defineAction({
571
- name: 'MouseMove',
572
- description: 'Move the mouse to the element',
573
- interfaceAlias: 'aiHover',
574
- paramSchema: actionHoverParamSchema,
575
- sample: {
576
- locate: {
577
- prompt: 'the navigation menu item "Products"'
578
- }
579
- },
580
- call: async (param)=>{
581
- node_assert(device_libnut, 'libnut not initialized');
582
- const element = param.locate;
583
- node_assert(element, 'Element not found, cannot move mouse');
584
- const [x, y] = element.center;
585
- const targetX = Math.round(x);
586
- const targetY = Math.round(y);
587
- await smoothMoveMouse(targetX, targetY, SMOOTH_MOVE_STEPS_MOUSE_MOVE, SMOOTH_MOVE_DELAY_MOUSE_MOVE);
588
- await sleep(MOUSE_MOVE_EFFECT_WAIT);
589
- }
590
- }),
591
- defineAction({
592
- name: 'Input',
593
- description: 'Input text into the input field',
594
- interfaceAlias: 'aiInput',
595
- paramSchema: computerInputParamSchema,
596
- sample: {
597
- value: 'test@example.com',
598
- locate: {
599
- prompt: 'the email input field'
600
- }
601
- },
602
- call: async (param)=>{
603
- node_assert(device_libnut, 'libnut not initialized');
604
- const element = param.locate;
605
- if (element) {
606
- const [x, y] = element.center;
607
- device_libnut.moveMouse(Math.round(x), Math.round(y));
608
- device_libnut.mouseClick('left');
609
- await sleep(INPUT_FOCUS_DELAY);
610
- if ('append' !== param.mode) {
611
- if (this.useAppleScript) {
612
- sendKeyViaAppleScript('a', [
613
- 'command'
614
- ]);
615
- await sleep(50);
616
- sendKeyViaAppleScript('backspace', []);
617
- } else {
618
- const modifier = 'darwin' === process.platform ? 'command' : 'control';
619
- device_libnut.keyTap('a', [
620
- modifier
621
- ]);
622
- await sleep(50);
623
- device_libnut.keyTap('backspace');
624
- }
625
- await sleep(INPUT_CLEAR_DELAY);
626
- }
627
- }
628
- if ('clear' === param.mode) return;
629
- if (!param.value) return;
630
- await this.smartTypeString(param.value);
631
- }
632
- }),
633
- defineActionScroll(async (param)=>{
634
- node_assert(device_libnut, 'libnut not initialized');
635
- if (param.locate) {
636
- const element = param.locate;
637
- const [x, y] = element.center;
638
- device_libnut.moveMouse(Math.round(x), Math.round(y));
639
- }
640
- const scrollType = param?.scrollType;
641
- const edgeSpec = scrollType && scrollType in EDGE_SCROLL_SPEC ? EDGE_SCROLL_SPEC[scrollType] : null;
642
- if (edgeSpec) {
643
- if (runPhasedScroll(edgeSpec.direction, EDGE_SCROLL_TOTAL_PX, EDGE_SCROLL_STEPS)) return void await sleep(SCROLL_COMPLETE_DELAY);
644
- if (this.useAppleScript) {
645
- sendKeyViaAppleScript(edgeSpec.key);
646
- await sleep(SCROLL_COMPLETE_DELAY);
647
- return;
648
- }
649
- const [dx, dy] = edgeSpec.libnut;
650
- for(let i = 0; i < SCROLL_REPEAT_COUNT; i++){
651
- device_libnut.scrollMouse(dx, dy);
652
- await sleep(SCROLL_STEP_DELAY);
653
- }
654
- return;
655
- }
656
- if ('singleAction' === scrollType || !scrollType) {
657
- const distance = param?.distance || 500;
658
- const direction = param?.direction || 'down';
659
- const isKnownDirection = 'up' === direction || 'down' === direction || 'left' === direction || 'right' === direction;
660
- if (isKnownDirection) {
661
- const steps = Math.max(PHASED_MIN_STEPS, Math.round(distance / PHASED_PIXELS_PER_STEP));
662
- if (runPhasedScroll(direction, distance, steps)) return void await sleep(SCROLL_COMPLETE_DELAY);
663
- }
664
- if (this.useAppleScript && ('up' === direction || 'down' === direction)) {
665
- const pages = Math.max(1, Math.round(distance / APPROX_VIEWPORT_HEIGHT_PX));
666
- const key = 'up' === direction ? 'pageup' : 'pagedown';
667
- for(let i = 0; i < pages; i++){
668
- sendKeyViaAppleScript(key);
669
- await sleep(SCROLL_STEP_DELAY);
670
- }
671
- await sleep(SCROLL_COMPLETE_DELAY);
672
- return;
673
- }
674
- const ticks = Math.ceil(distance / 100);
675
- const directionMap = {
676
- up: [
677
- 0,
678
- ticks
679
- ],
680
- down: [
681
- 0,
682
- -ticks
683
- ],
684
- left: [
685
- -ticks,
686
- 0
687
- ],
688
- right: [
689
- ticks,
690
- 0
691
- ]
692
- };
693
- const [dx, dy] = directionMap[direction] || [
694
- 0,
695
- -ticks
696
- ];
697
- device_libnut.scrollMouse(dx, dy);
698
- await sleep(SCROLL_COMPLETE_DELAY);
699
- return;
700
- }
701
- throw new Error(`Unknown scroll type: ${scrollType}, param: ${JSON.stringify(param)}`);
702
- }),
703
- defineActionKeyboardPress(async (param)=>{
704
- node_assert(device_libnut, 'libnut not initialized');
705
- if (param.locate) {
706
- const [x, y] = param.locate.center;
707
- device_libnut.moveMouse(Math.round(x), Math.round(y));
708
- device_libnut.mouseClick('left');
709
- await sleep(50);
710
- }
711
- const keys = param.keyName.split('+');
712
- const modifiers = keys.slice(0, -1).map(normalizeKeyName);
713
- const key = normalizePrimaryKey(keys[keys.length - 1]);
714
- debugDevice('KeyboardPress', {
715
- original: param.keyName,
716
- key,
717
- modifiers,
718
- driver: this.useAppleScript ? "applescript" : 'libnut'
719
- });
720
- if (this.useAppleScript) sendKeyViaAppleScript(key, modifiers);
721
- else if (modifiers.length > 0) device_libnut.keyTap(key, modifiers);
722
- else device_libnut.keyTap(key);
723
- }),
724
- defineActionDragAndDrop(async (param)=>{
725
- node_assert(device_libnut, 'libnut not initialized');
726
- const from = param.from;
727
- const to = param.to;
728
- node_assert(from, 'missing "from" param for drag and drop');
729
- node_assert(to, 'missing "to" param for drag and drop');
730
- const [fromX, fromY] = from.center;
731
- const [toX, toY] = to.center;
732
- device_libnut.moveMouse(Math.round(fromX), Math.round(fromY));
733
- device_libnut.mouseToggle('down', 'left');
734
- await sleep(100);
735
- device_libnut.moveMouse(Math.round(toX), Math.round(toY));
736
- await sleep(100);
737
- device_libnut.mouseToggle('up', 'left');
738
- }),
739
- defineActionClearInput(async (param)=>{
740
- node_assert(device_libnut, 'libnut not initialized');
741
- const element = param.locate;
742
- node_assert(element, 'Element not found, cannot clear input');
743
- const [x, y] = element.center;
744
- device_libnut.moveMouse(Math.round(x), Math.round(y));
745
- device_libnut.mouseClick('left');
746
- await sleep(100);
747
- if (this.useAppleScript) {
748
- sendKeyViaAppleScript('a', [
749
- 'command'
750
- ]);
751
- await sleep(50);
752
- sendKeyViaAppleScript('backspace', []);
753
- } else {
754
- const modifier = 'darwin' === process.platform ? 'command' : 'control';
755
- device_libnut.keyTap('a', [
756
- modifier
757
- ]);
758
- device_libnut.keyTap('backspace');
759
- }
760
- await sleep(50);
761
- })
652
+ ...defineActionsFromInputPrimitives(this.inputPrimitives)
762
653
  ];
763
654
  const platformActions = Object.values(createPlatformActions());
764
655
  const customActions = this.options?.customActions || [];
@@ -796,6 +687,88 @@ Available Displays: ${displays.length > 0 ? displays.map((d)=>d.name).join(', ')
796
687
  _define_property(this, "xvfbCleanup", void 0);
797
688
  _define_property(this, "useAppleScript", void 0);
798
689
  _define_property(this, "uri", void 0);
690
+ _define_property(this, "inputPrimitives", {
691
+ pointer: {
692
+ tap: async ({ x, y })=>{
693
+ node_assert(device_libnut, 'libnut not initialized');
694
+ const targetX = Math.round(x);
695
+ const targetY = Math.round(y);
696
+ await smoothMoveMouse(targetX, targetY, SMOOTH_MOVE_STEPS_TAP, SMOOTH_MOVE_DELAY_TAP);
697
+ device_libnut.mouseToggle('down', 'left');
698
+ await sleep(CLICK_HOLD_DURATION);
699
+ device_libnut.mouseToggle('up', 'left');
700
+ },
701
+ doubleClick: async ({ x, y })=>{
702
+ node_assert(device_libnut, 'libnut not initialized');
703
+ device_libnut.moveMouse(Math.round(x), Math.round(y));
704
+ device_libnut.mouseClick('left', true);
705
+ },
706
+ rightClick: async ({ x, y })=>{
707
+ node_assert(device_libnut, 'libnut not initialized');
708
+ device_libnut.moveMouse(Math.round(x), Math.round(y));
709
+ device_libnut.mouseClick('right');
710
+ },
711
+ hover: async ({ x, y })=>{
712
+ node_assert(device_libnut, 'libnut not initialized');
713
+ await smoothMoveMouse(Math.round(x), Math.round(y), SMOOTH_MOVE_STEPS_MOUSE_MOVE, SMOOTH_MOVE_DELAY_MOUSE_MOVE);
714
+ await sleep(MOUSE_MOVE_EFFECT_WAIT);
715
+ },
716
+ dragAndDrop: async (from, to)=>{
717
+ node_assert(device_libnut, 'libnut not initialized');
718
+ device_libnut.moveMouse(Math.round(from.x), Math.round(from.y));
719
+ device_libnut.mouseToggle('down', 'left');
720
+ await sleep(100);
721
+ device_libnut.moveMouse(Math.round(to.x), Math.round(to.y));
722
+ await sleep(100);
723
+ device_libnut.mouseToggle('up', 'left');
724
+ }
725
+ },
726
+ keyboard: {
727
+ typeText: async (value, opts)=>{
728
+ node_assert(device_libnut, 'libnut not initialized');
729
+ const element = opts?.target;
730
+ if (element) {
731
+ const [x, y] = element.center;
732
+ device_libnut.moveMouse(Math.round(x), Math.round(y));
733
+ device_libnut.mouseClick('left');
734
+ await sleep(INPUT_FOCUS_DELAY);
735
+ if (opts?.replace !== false) {
736
+ await this.selectAllAndDelete();
737
+ await sleep(INPUT_CLEAR_DELAY);
738
+ }
739
+ }
740
+ await this.smartTypeString(value);
741
+ },
742
+ keyboardPress: async (keyName, opts)=>{
743
+ node_assert(device_libnut, 'libnut not initialized');
744
+ const target = opts?.target;
745
+ if (target) {
746
+ const [x, y] = target.center;
747
+ device_libnut.moveMouse(Math.round(x), Math.round(y));
748
+ device_libnut.mouseClick('left');
749
+ await sleep(50);
750
+ }
751
+ await this.pressKeyboardShortcut(keyName);
752
+ },
753
+ clearInput: async (target)=>{
754
+ node_assert(device_libnut, 'libnut not initialized');
755
+ if (target) {
756
+ const element = target;
757
+ const [x, y] = element.center;
758
+ device_libnut.moveMouse(Math.round(x), Math.round(y));
759
+ device_libnut.mouseClick('left');
760
+ await sleep(100);
761
+ }
762
+ await this.selectAllAndDelete();
763
+ await sleep(50);
764
+ }
765
+ },
766
+ scroll: {
767
+ scroll: async (param)=>{
768
+ await this.performScroll(param);
769
+ }
770
+ }
771
+ });
799
772
  this.options = options;
800
773
  this.displayId = options?.displayId;
801
774
  this.useAppleScript = 'darwin' === process.platform && options?.keyboardDriver !== 'libnut';
@@ -1214,132 +1187,7 @@ class RDPDevice {
1214
1187
  }
1215
1188
  actionSpace() {
1216
1189
  const defaultActions = [
1217
- defineActionTap(async ({ locate })=>{
1218
- const element = this.requireLocate(locate, 'tap');
1219
- await this.moveToElement(element, {
1220
- steps: device_SMOOTH_MOVE_STEPS_TAP,
1221
- stepDelayMs: device_SMOOTH_MOVE_DELAY_TAP
1222
- });
1223
- await this.backend.mouseButton('left', 'down');
1224
- await sleep(device_CLICK_HOLD_DURATION);
1225
- await this.backend.mouseButton('left', 'up');
1226
- }),
1227
- defineActionDoubleClick(async ({ locate })=>{
1228
- const element = this.requireLocate(locate, 'double click');
1229
- await this.moveToElement(element, {
1230
- steps: device_SMOOTH_MOVE_STEPS_TAP,
1231
- stepDelayMs: device_SMOOTH_MOVE_DELAY_TAP
1232
- });
1233
- await this.backend.mouseButton('left', 'doubleClick');
1234
- }),
1235
- defineActionRightClick(async ({ locate })=>{
1236
- const element = this.requireLocate(locate, 'right click');
1237
- await this.moveToElement(element, {
1238
- steps: device_SMOOTH_MOVE_STEPS_TAP,
1239
- stepDelayMs: device_SMOOTH_MOVE_DELAY_TAP
1240
- });
1241
- await this.backend.mouseButton('right', 'click');
1242
- }),
1243
- defineActionHover(async ({ locate })=>{
1244
- const element = this.requireLocate(locate, 'hover');
1245
- await this.moveToElement(element, {
1246
- steps: device_SMOOTH_MOVE_STEPS_MOUSE_MOVE,
1247
- stepDelayMs: device_SMOOTH_MOVE_DELAY_MOUSE_MOVE,
1248
- settleDelayMs: device_MOUSE_MOVE_EFFECT_WAIT
1249
- });
1250
- }),
1251
- defineActionInput(async (param)=>{
1252
- this.assertConnected();
1253
- if (param.locate) {
1254
- await this.moveToElement(param.locate, {
1255
- steps: device_SMOOTH_MOVE_STEPS_TAP,
1256
- stepDelayMs: device_SMOOTH_MOVE_DELAY_TAP
1257
- });
1258
- await this.backend.mouseButton('left', 'click');
1259
- await sleep(device_INPUT_FOCUS_DELAY);
1260
- }
1261
- if ('typeOnly' !== param.mode) {
1262
- await this.clearInput();
1263
- await sleep(device_INPUT_CLEAR_DELAY);
1264
- }
1265
- if ('clear' === param.mode) return;
1266
- if (param.value) await this.backend.typeText(param.value);
1267
- }),
1268
- defineActionClearInput(async ({ locate })=>{
1269
- this.assertConnected();
1270
- if (locate) {
1271
- await this.moveToElement(locate, {
1272
- steps: device_SMOOTH_MOVE_STEPS_TAP,
1273
- stepDelayMs: device_SMOOTH_MOVE_DELAY_TAP
1274
- });
1275
- await this.backend.mouseButton('left', 'click');
1276
- await sleep(device_INPUT_FOCUS_DELAY);
1277
- }
1278
- await this.clearInput();
1279
- await sleep(device_INPUT_CLEAR_DELAY);
1280
- }),
1281
- defineActionKeyboardPress(async ({ locate, keyName })=>{
1282
- this.assertConnected();
1283
- if (locate) {
1284
- await this.moveToElement(locate, {
1285
- steps: device_SMOOTH_MOVE_STEPS_TAP,
1286
- stepDelayMs: device_SMOOTH_MOVE_DELAY_TAP
1287
- });
1288
- await this.backend.mouseButton('left', 'click');
1289
- }
1290
- await this.backend.keyPress(keyName);
1291
- }),
1292
- defineActionScroll(async (param)=>{
1293
- this.assertConnected();
1294
- const target = param.locate;
1295
- if (target) await this.moveToElement(target, {
1296
- steps: device_SMOOTH_MOVE_STEPS_MOUSE_MOVE,
1297
- stepDelayMs: device_SMOOTH_MOVE_DELAY_MOUSE_MOVE
1298
- });
1299
- if (param.scrollType && 'singleAction' !== param.scrollType) {
1300
- const direction = this.edgeScrollDirection(param.scrollType);
1301
- for(let i = 0; i < device_EDGE_SCROLL_STEPS; i++)await this.performWheel(direction, DEFAULT_SCROLL_DISTANCE, target?.center[0], target?.center[1]);
1302
- await sleep(device_SCROLL_COMPLETE_DELAY);
1303
- return;
1304
- }
1305
- await this.performWheel(param.direction || 'down', param.distance || DEFAULT_SCROLL_DISTANCE, target?.center[0], target?.center[1]);
1306
- await sleep(device_SCROLL_COMPLETE_DELAY);
1307
- }),
1308
- defineActionDragAndDrop(async ({ from, to })=>{
1309
- this.assertConnected();
1310
- const source = this.requireLocate(from, 'drag source');
1311
- const target = this.requireLocate(to, 'drag target');
1312
- await this.moveToElement(source, {
1313
- steps: device_SMOOTH_MOVE_STEPS_TAP,
1314
- stepDelayMs: device_SMOOTH_MOVE_DELAY_TAP
1315
- });
1316
- await this.backend.mouseButton('left', 'down');
1317
- await sleep(DRAG_HOLD_DURATION);
1318
- await this.moveToElement(target, {
1319
- steps: SMOOTH_MOVE_STEPS_DRAG,
1320
- stepDelayMs: SMOOTH_MOVE_DELAY_DRAG
1321
- });
1322
- await sleep(DRAG_HOLD_DURATION);
1323
- await this.backend.mouseButton('left', 'up');
1324
- }),
1325
- defineAction({
1326
- name: 'MiddleClick',
1327
- description: 'Middle click the element',
1328
- sample: {
1329
- locate: {
1330
- prompt: 'the browser tab close target'
1331
- }
1332
- },
1333
- paramSchema: actionTapParamSchema,
1334
- call: async ({ locate })=>{
1335
- const element = this.requireLocate(locate, 'middle click');
1336
- await this.moveToElement(element, {
1337
- steps: device_SMOOTH_MOVE_STEPS_TAP,
1338
- stepDelayMs: device_SMOOTH_MOVE_DELAY_TAP
1339
- });
1340
- await this.backend.mouseButton('middle', 'click');
1341
- }
1342
- }),
1190
+ ...defineActionsFromInputPrimitives(this.inputPrimitives),
1343
1191
  defineAction({
1344
1192
  name: 'ListDisplays',
1345
1193
  description: 'List all available displays/monitors',
@@ -1368,10 +1216,6 @@ class RDPDevice {
1368
1216
  throwIfDestroyed() {
1369
1217
  if (this.destroyed) throw new Error('RDPDevice has been destroyed');
1370
1218
  }
1371
- requireLocate(locate, actionName) {
1372
- if (!locate) throw new Error(`Missing target element for ${actionName}`);
1373
- return locate;
1374
- }
1375
1219
  async moveToElement(element, options) {
1376
1220
  this.assertConnected();
1377
1221
  const targetX = Math.round(element.center[0]);
@@ -1437,6 +1281,113 @@ class RDPDevice {
1437
1281
  device_define_property(this, "destroyed", false);
1438
1282
  device_define_property(this, "cursorPosition", void 0);
1439
1283
  device_define_property(this, "uri", void 0);
1284
+ device_define_property(this, "inputPrimitives", {
1285
+ pointer: {
1286
+ tap: async ({ x, y })=>{
1287
+ await this.movePointer(Math.round(x), Math.round(y), {
1288
+ steps: device_SMOOTH_MOVE_STEPS_TAP,
1289
+ stepDelayMs: device_SMOOTH_MOVE_DELAY_TAP
1290
+ });
1291
+ await this.backend.mouseButton('left', 'down');
1292
+ await sleep(device_CLICK_HOLD_DURATION);
1293
+ await this.backend.mouseButton('left', 'up');
1294
+ },
1295
+ doubleClick: async ({ x, y })=>{
1296
+ await this.movePointer(Math.round(x), Math.round(y), {
1297
+ steps: device_SMOOTH_MOVE_STEPS_TAP,
1298
+ stepDelayMs: device_SMOOTH_MOVE_DELAY_TAP
1299
+ });
1300
+ await this.backend.mouseButton('left', 'doubleClick');
1301
+ },
1302
+ rightClick: async ({ x, y })=>{
1303
+ await this.movePointer(Math.round(x), Math.round(y), {
1304
+ steps: device_SMOOTH_MOVE_STEPS_TAP,
1305
+ stepDelayMs: device_SMOOTH_MOVE_DELAY_TAP
1306
+ });
1307
+ await this.backend.mouseButton('right', 'click');
1308
+ },
1309
+ hover: async ({ x, y })=>{
1310
+ await this.movePointer(Math.round(x), Math.round(y), {
1311
+ steps: device_SMOOTH_MOVE_STEPS_MOUSE_MOVE,
1312
+ stepDelayMs: device_SMOOTH_MOVE_DELAY_MOUSE_MOVE,
1313
+ settleDelayMs: device_MOUSE_MOVE_EFFECT_WAIT
1314
+ });
1315
+ },
1316
+ dragAndDrop: async (from, to)=>{
1317
+ await this.movePointer(Math.round(from.x), Math.round(from.y), {
1318
+ steps: device_SMOOTH_MOVE_STEPS_TAP,
1319
+ stepDelayMs: device_SMOOTH_MOVE_DELAY_TAP
1320
+ });
1321
+ await this.backend.mouseButton('left', 'down');
1322
+ await sleep(DRAG_HOLD_DURATION);
1323
+ await this.movePointer(Math.round(to.x), Math.round(to.y), {
1324
+ steps: SMOOTH_MOVE_STEPS_DRAG,
1325
+ stepDelayMs: SMOOTH_MOVE_DELAY_DRAG
1326
+ });
1327
+ await sleep(DRAG_HOLD_DURATION);
1328
+ await this.backend.mouseButton('left', 'up');
1329
+ }
1330
+ },
1331
+ keyboard: {
1332
+ typeText: async (value, opts)=>{
1333
+ this.assertConnected();
1334
+ const target = opts?.target;
1335
+ if (target) {
1336
+ await this.inputPrimitives.pointer.tap({
1337
+ x: target.center[0],
1338
+ y: target.center[1]
1339
+ });
1340
+ await sleep(device_INPUT_FOCUS_DELAY);
1341
+ }
1342
+ if (opts?.replace !== false) {
1343
+ await this.clearInput();
1344
+ await sleep(device_INPUT_CLEAR_DELAY);
1345
+ }
1346
+ if (opts?.focusOnly || !value) return;
1347
+ await this.backend.typeText(value);
1348
+ },
1349
+ clearInput: async (target)=>{
1350
+ this.assertConnected();
1351
+ const element = target;
1352
+ if (element) {
1353
+ await this.inputPrimitives.pointer.tap({
1354
+ x: element.center[0],
1355
+ y: element.center[1]
1356
+ });
1357
+ await sleep(device_INPUT_FOCUS_DELAY);
1358
+ }
1359
+ await this.clearInput();
1360
+ await sleep(device_INPUT_CLEAR_DELAY);
1361
+ },
1362
+ keyboardPress: async (keyName, opts)=>{
1363
+ this.assertConnected();
1364
+ const target = opts?.target;
1365
+ if (target) await this.inputPrimitives.pointer.tap({
1366
+ x: target.center[0],
1367
+ y: target.center[1]
1368
+ });
1369
+ await this.backend.keyPress(keyName);
1370
+ }
1371
+ },
1372
+ scroll: {
1373
+ scroll: async (param)=>{
1374
+ this.assertConnected();
1375
+ const target = param.locate;
1376
+ if (target) await this.moveToElement(target, {
1377
+ steps: device_SMOOTH_MOVE_STEPS_MOUSE_MOVE,
1378
+ stepDelayMs: device_SMOOTH_MOVE_DELAY_MOUSE_MOVE
1379
+ });
1380
+ if (param.scrollType && 'singleAction' !== param.scrollType) {
1381
+ const direction = this.edgeScrollDirection(param.scrollType);
1382
+ for(let i = 0; i < device_EDGE_SCROLL_STEPS; i++)await this.performWheel(direction, DEFAULT_SCROLL_DISTANCE, target?.center[0], target?.center[1]);
1383
+ await sleep(device_SCROLL_COMPLETE_DELAY);
1384
+ return;
1385
+ }
1386
+ await this.performWheel(param.direction || 'down', param.distance || DEFAULT_SCROLL_DISTANCE, target?.center[0], target?.center[1]);
1387
+ await sleep(device_SCROLL_COMPLETE_DELAY);
1388
+ }
1389
+ }
1390
+ });
1440
1391
  this.options = {
1441
1392
  port: 3389,
1442
1393
  securityProtocol: 'auto',
@@ -1662,26 +1613,38 @@ class ComputerMidsceneTools extends BaseMidsceneTools {
1662
1613
  }
1663
1614
  }
1664
1615
  function version() {
1665
- const currentVersion = "1.8.1-beta-20260513084557.0";
1616
+ const currentVersion = "1.8.1";
1666
1617
  console.log(`@midscene/computer v${currentVersion}`);
1667
1618
  return currentVersion;
1668
1619
  }
1620
+ function loadMacPermissions() {
1621
+ if ('darwin' !== process.platform) return {
1622
+ permissions: null
1623
+ };
1624
+ try {
1625
+ const dynamicRequire = createRequire(import.meta.url);
1626
+ return {
1627
+ permissions: dynamicRequire('node-mac-permissions')
1628
+ };
1629
+ } catch (error) {
1630
+ return {
1631
+ permissions: null,
1632
+ loadError: error instanceof Error ? error.message : String(error)
1633
+ };
1634
+ }
1635
+ }
1669
1636
  function checkAccessibilityPermission(promptIfNeeded = false) {
1670
1637
  if ('darwin' !== process.platform) return {
1671
1638
  hasPermission: true,
1672
1639
  platform: process.platform
1673
1640
  };
1674
1641
  try {
1675
- let permissions;
1676
- try {
1677
- const dynamicRequire = createRequire(import.meta.url);
1678
- permissions = dynamicRequire('node-mac-permissions');
1679
- } catch {
1680
- return {
1681
- hasPermission: true,
1682
- platform: process.platform
1683
- };
1684
- }
1642
+ const { permissions, loadError } = loadMacPermissions();
1643
+ if (!permissions) return {
1644
+ hasPermission: false,
1645
+ platform: process.platform,
1646
+ error: `Cannot verify macOS Accessibility permission: node-mac-permissions is unavailable${loadError ? ` (${loadError})` : ''}. The native module may need to be rebuilt for the current Node/Electron ABI.`
1647
+ };
1685
1648
  const status = permissions.getAuthStatus('accessibility');
1686
1649
  if ('authorized' === status) return {
1687
1650
  hasPermission: true,
@@ -1701,6 +1664,37 @@ function checkAccessibilityPermission(promptIfNeeded = false) {
1701
1664
  };
1702
1665
  }
1703
1666
  }
1667
+ function checkScreenRecordingPermission(promptIfNeeded = false) {
1668
+ if ('darwin' !== process.platform) return {
1669
+ hasPermission: true,
1670
+ platform: process.platform
1671
+ };
1672
+ try {
1673
+ const { permissions, loadError } = loadMacPermissions();
1674
+ if (!permissions) return {
1675
+ hasPermission: false,
1676
+ platform: process.platform,
1677
+ error: `Cannot verify macOS Screen Recording permission: node-mac-permissions is unavailable${loadError ? ` (${loadError})` : ''}. The native module may need to be rebuilt for the current Node/Electron ABI.`
1678
+ };
1679
+ const status = permissions.getAuthStatus('screen');
1680
+ if ('authorized' === status) return {
1681
+ hasPermission: true,
1682
+ platform: process.platform
1683
+ };
1684
+ if (promptIfNeeded) permissions.askForScreenCaptureAccess(true);
1685
+ return {
1686
+ hasPermission: false,
1687
+ platform: process.platform,
1688
+ error: `macOS Screen Recording permission is required (current status: ${status}).\n\nPlease follow these steps:\n1. Open System Settings > Privacy & Security > Screen Recording\n2. Enable the application running this script (e.g., Terminal, iTerm2, VS Code, WebStorm, or Midscene Studio)\n3. Fully quit and relaunch that application after granting permission — macOS only re-reads this permission on process launch.`
1689
+ };
1690
+ } catch (error) {
1691
+ return {
1692
+ hasPermission: false,
1693
+ platform: process.platform,
1694
+ error: `Failed to check screen recording permission: ${error instanceof Error ? error.message : String(error)}`
1695
+ };
1696
+ }
1697
+ }
1704
1698
  async function checkComputerEnvironment() {
1705
1699
  try {
1706
1700
  const libnutModule = await import("@computer-use/libnut/dist/import_libnut.js");
@@ -1731,4 +1725,4 @@ async function checkComputerEnvironment() {
1731
1725
  async function getConnectedDisplays() {
1732
1726
  return ComputerDevice.listDisplays();
1733
1727
  }
1734
- export { ComputerAgent, ComputerDevice, ComputerMidsceneTools, HelperProcessRDPBackendClient, RDPDevice, UnsupportedRDPBackendClient, agentForComputer, agentForRDPComputer, agentFromComputer, checkAccessibilityPermission, checkComputerEnvironment, checkXvfbInstalled, createDefaultRDPBackendClient, getConnectedDisplays, needsXvfb, overrideAIConfig, version };
1728
+ export { ComputerAgent, ComputerDevice, ComputerMidsceneTools, HelperProcessRDPBackendClient, RDPDevice, UnsupportedRDPBackendClient, agentForComputer, agentForRDPComputer, agentFromComputer, checkAccessibilityPermission, checkComputerEnvironment, checkScreenRecordingPermission, checkXvfbInstalled, createDefaultRDPBackendClient, getConnectedDisplays, needsXvfb, overrideAIConfig, version };