@midscene/ios 1.8.0 → 1.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,8 +4,8 @@ import { MIDSCENE_IOS_DEVICE_CLASS_OVERRIDE } from "@midscene/shared/env";
4
4
  import { getDebug } from "@midscene/shared/logger";
5
5
  import { mergeAndNormalizeAppNameMapping, normalizeForComparison } from "@midscene/shared/utils";
6
6
  import node_assert from "node:assert";
7
- import { getMidsceneLocationSchema, z } from "@midscene/core";
8
- import { defineAction, defineActionClearInput, defineActionCursorMove, defineActionDoubleClick, defineActionDragAndDrop, defineActionKeyboardPress, defineActionLongPress, defineActionPinch, defineActionScroll, defineActionSwipe, defineActionTap, normalizeMobileSwipeParam, normalizePinchParam } from "@midscene/core/device";
7
+ import { z } from "@midscene/core";
8
+ import { createDefaultMobileActions, defineAction } from "@midscene/core/device";
9
9
  import { sleep } from "@midscene/core/utils";
10
10
  import { DEFAULT_WDA_PORT } from "@midscene/shared/constants";
11
11
  import { createImgBase64ByFormat } from "@midscene/shared/img";
@@ -649,16 +649,6 @@ function _define_property(obj, key, value) {
649
649
  return obj;
650
650
  }
651
651
  const debugDevice = getDebug('ios:device');
652
- const iosInputParamSchema = z.object({
653
- value: z.string().describe('The text to input. Provide the final content for replace/append modes, or an empty string when using clear mode to remove existing text.'),
654
- autoDismissKeyboard: z.boolean().optional().describe('Whether to dismiss the keyboard after input. Defaults to true if not specified. Set to false to keep the keyboard visible after input.'),
655
- mode: z.preprocess((val)=>'append' === val ? 'typeOnly' : val, z["enum"]([
656
- 'replace',
657
- 'clear',
658
- 'typeOnly'
659
- ]).default('replace').optional().describe('Input mode: "replace" (default) - clear the field and input the value; "typeOnly" - type the value directly without clearing the field first; "clear" - clear the field without inputting new text.')),
660
- locate: getMidsceneLocationSchema().describe('The input field to be filled').optional()
661
- });
662
652
  const WDA_HTTP_METHODS = [
663
653
  'GET',
664
654
  'POST',
@@ -667,96 +657,39 @@ const WDA_HTTP_METHODS = [
667
657
  ];
668
658
  const DEFAULT_WDA_MJPEG_PORT = 9100;
669
659
  class IOSDevice {
660
+ async tapPoint(point) {
661
+ debugDevice(`tap at coordinates (${point.x}, ${point.y})`);
662
+ await this.wdaBackend.tap(Math.round(point.x), Math.round(point.y));
663
+ }
664
+ async doubleTapPoint(point) {
665
+ await this.wdaBackend.doubleTap(Math.round(point.x), Math.round(point.y));
666
+ }
667
+ async longPressPoint(point, duration = 1000) {
668
+ await this.wdaBackend.longPress(Math.round(point.x), Math.round(point.y), duration);
669
+ }
670
+ async swipePoint(start, end, duration = 500) {
671
+ await this.wdaBackend.swipe(Math.round(start.x), Math.round(start.y), Math.round(end.x), Math.round(end.y), duration);
672
+ }
673
+ async clearInputAt(point) {
674
+ if (point) {
675
+ await this.tapPoint(point);
676
+ await sleep(100);
677
+ }
678
+ debugDevice('Attempting to clear input with WebDriver Clear API');
679
+ const cleared = await this.wdaBackend.clearActiveElement();
680
+ cleared ? debugDevice('Successfully cleared input with WebDriver Clear API') : debugDevice('WebDriver Clear API returned false (no active element or clear failed)');
681
+ }
670
682
  actionSpace() {
683
+ const mobileActionContext = {
684
+ input: this.inputPrimitives,
685
+ size: ()=>this.size(),
686
+ sleep: async (timeMs)=>{
687
+ await sleep(timeMs);
688
+ },
689
+ getDefaultAutoDismissKeyboard: ()=>this.options?.autoDismissKeyboard
690
+ };
671
691
  const defaultActions = [
672
- defineActionTap(async (param)=>{
673
- const element = param.locate;
674
- node_assert(element, 'Element not found, cannot tap');
675
- await this.mouseClick(element.center[0], element.center[1]);
676
- }),
677
- defineActionDoubleClick(async (param)=>{
678
- const element = param.locate;
679
- node_assert(element, 'Element not found, cannot double click');
680
- await this.doubleTap(element.center[0], element.center[1]);
681
- }),
682
- defineAction({
683
- name: 'Input',
684
- description: 'Input text into the input field',
685
- interfaceAlias: 'aiInput',
686
- paramSchema: iosInputParamSchema,
687
- sample: {
688
- value: 'test@example.com',
689
- locate: {
690
- prompt: 'the email input field'
691
- }
692
- },
693
- call: async (param)=>{
694
- const element = param.locate;
695
- if ('typeOnly' !== param.mode) await this.clearInput(element);
696
- if ('clear' === param.mode) return;
697
- if (!param || !param.value) return;
698
- const autoDismissKeyboard = param.autoDismissKeyboard ?? this.options?.autoDismissKeyboard;
699
- await this.typeText(param.value, {
700
- autoDismissKeyboard
701
- });
702
- }
703
- }),
704
- defineActionScroll(async (param)=>{
705
- const element = param.locate;
706
- const startingPoint = element ? {
707
- left: element.center[0],
708
- top: element.center[1]
709
- } : void 0;
710
- const scrollToEventName = param?.scrollType;
711
- if ('scrollToTop' === scrollToEventName) await this.scrollUntilTop(startingPoint);
712
- else if ('scrollToBottom' === scrollToEventName) await this.scrollUntilBottom(startingPoint);
713
- else if ('scrollToRight' === scrollToEventName) await this.scrollUntilRight(startingPoint);
714
- else if ('scrollToLeft' === scrollToEventName) await this.scrollUntilLeft(startingPoint);
715
- else if ('singleAction' !== scrollToEventName && scrollToEventName) throw new Error(`Unknown scroll event type: ${scrollToEventName}, param: ${JSON.stringify(param)}`);
716
- else {
717
- if (param?.direction !== 'down' && param && param.direction) if ('up' === param.direction) await this.scrollUp(param.distance || void 0, startingPoint);
718
- else if ('left' === param.direction) await this.scrollLeft(param.distance || void 0, startingPoint);
719
- else if ('right' === param.direction) await this.scrollRight(param.distance || void 0, startingPoint);
720
- else throw new Error(`Unknown scroll direction: ${param.direction}`);
721
- else await this.scrollDown(param?.distance || void 0, startingPoint);
722
- await sleep(500);
723
- }
724
- }),
725
- defineActionDragAndDrop(async (param)=>{
726
- const from = param.from;
727
- const to = param.to;
728
- node_assert(from, 'missing "from" param for drag and drop');
729
- node_assert(to, 'missing "to" param for drag and drop');
730
- await this.swipe(from.center[0], from.center[1], to.center[0], to.center[1], 1000);
731
- }),
732
- defineActionSwipe(async (param)=>{
733
- const { startPoint, endPoint, duration, repeatCount } = normalizeMobileSwipeParam(param, await this.size());
734
- for(let i = 0; i < repeatCount; i++)await this.swipe(startPoint.x, startPoint.y, endPoint.x, endPoint.y, duration);
735
- }),
736
- defineActionKeyboardPress(async (param)=>{
737
- await this.pressKey(param.keyName);
738
- }),
739
- defineActionCursorMove(async (param)=>{
740
- const arrowKey = 'left' === param.direction ? 'ArrowLeft' : 'ArrowRight';
741
- const times = param.times ?? 1;
742
- for(let i = 0; i < times; i++){
743
- await this.pressKey(arrowKey);
744
- await sleep(100);
745
- }
746
- }),
747
- defineActionLongPress(async (param)=>{
748
- const element = param.locate;
749
- node_assert(element, 'LongPress requires an element to be located');
750
- const [x, y] = element.center;
751
- await this.longPress(x, y, param?.duration);
752
- }),
753
- defineActionPinch(async (param)=>{
754
- const { centerX, centerY, startDistance, endDistance, duration } = normalizePinchParam(param, await this.size());
755
- await this.wdaBackend.pinch(centerX, centerY, startDistance, endDistance, duration);
756
- }),
757
- defineActionClearInput(async (param)=>{
758
- await this.clearInput(param.locate);
759
- })
692
+ ...createDefaultMobileActions(mobileActionContext)
760
693
  ];
761
694
  const platformSpecificActions = Object.values(createPlatformActions(this));
762
695
  const customActions = this.customActions || [];
@@ -766,6 +699,27 @@ class IOSDevice {
766
699
  ...customActions
767
700
  ];
768
701
  }
702
+ async performActionScroll(param) {
703
+ const element = param.locate;
704
+ const startingPoint = element ? {
705
+ left: element.center[0],
706
+ top: element.center[1]
707
+ } : void 0;
708
+ const scrollToEventName = param?.scrollType;
709
+ if ('scrollToTop' === scrollToEventName) await this.scrollUntilTop(startingPoint);
710
+ else if ('scrollToBottom' === scrollToEventName) await this.scrollUntilBottom(startingPoint);
711
+ else if ('scrollToRight' === scrollToEventName) await this.scrollUntilRight(startingPoint);
712
+ else if ('scrollToLeft' === scrollToEventName) await this.scrollUntilLeft(startingPoint);
713
+ else if ('singleAction' !== scrollToEventName && scrollToEventName) throw new Error(`Unknown scroll event type: ${scrollToEventName}, param: ${JSON.stringify(param)}`);
714
+ else {
715
+ if (param?.direction !== 'down' && param && param.direction) if ('up' === param.direction) await this.scrollUp(param.distance || void 0, startingPoint);
716
+ else if ('left' === param.direction) await this.scrollLeft(param.distance || void 0, startingPoint);
717
+ else if ('right' === param.direction) await this.scrollRight(param.distance || void 0, startingPoint);
718
+ else throw new Error(`Unknown scroll direction: ${param.direction}`);
719
+ else await this.scrollDown(param?.distance || void 0, startingPoint);
720
+ await sleep(500);
721
+ }
722
+ }
769
723
  describe() {
770
724
  return this.description || `Device ID: ${this.deviceId}`;
771
725
  }
@@ -877,35 +831,31 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale})
877
831
  }
878
832
  }
879
833
  async clearInput(element) {
880
- if (element) {
881
- await this.tap(element.center[0], element.center[1]);
882
- await sleep(100);
883
- }
884
- debugDevice('Attempting to clear input with WebDriver Clear API');
885
- const cleared = await this.wdaBackend.clearActiveElement();
886
- cleared ? debugDevice('Successfully cleared input with WebDriver Clear API') : debugDevice('WebDriver Clear API returned false (no active element or clear failed)');
834
+ await this.clearInputAt(element ? {
835
+ x: element.center[0],
836
+ y: element.center[1]
837
+ } : void 0);
887
838
  }
888
839
  async url() {
889
840
  return '';
890
841
  }
891
842
  async tap(x, y) {
892
- await this.wdaBackend.tap(Math.round(x), Math.round(y));
893
- }
894
- async mouseClick(x, y) {
895
- debugDevice(`mouseClick at coordinates (${x}, ${y})`);
896
- await this.tap(x, y);
897
- }
898
- async doubleTap(x, y) {
899
- await this.wdaBackend.doubleTap(Math.round(x), Math.round(y));
900
- }
901
- async tripleTap(x, y) {
902
- await this.wdaBackend.tripleTap(Math.round(x), Math.round(y));
903
- }
904
- async longPress(x, y, duration = 1000) {
905
- await this.wdaBackend.longPress(Math.round(x), Math.round(y), duration);
843
+ await this.tapPoint({
844
+ x,
845
+ y
846
+ });
906
847
  }
907
848
  async swipe(fromX, fromY, toX, toY, duration = 500) {
908
- await this.wdaBackend.swipe(Math.round(fromX), Math.round(fromY), Math.round(toX), Math.round(toY), duration);
849
+ await this.swipeCoordinates(fromX, fromY, toX, toY, duration);
850
+ }
851
+ async swipeCoordinates(fromX, fromY, toX, toY, duration = 500) {
852
+ await this.swipePoint({
853
+ x: fromX,
854
+ y: fromY
855
+ }, {
856
+ x: toX,
857
+ y: toY
858
+ }, duration);
909
859
  }
910
860
  async typeText(text, options) {
911
861
  if (!text) return;
@@ -934,7 +884,7 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale})
934
884
  y: Math.round(height / 2)
935
885
  };
936
886
  const scrollDistance = Math.round(distance || height / 3);
937
- await this.swipe(start.x, start.y, start.x, start.y + scrollDistance);
887
+ await this.swipeCoordinates(start.x, start.y, start.x, start.y + scrollDistance);
938
888
  }
939
889
  async scrollDown(distance, startPoint) {
940
890
  const { width, height } = await this.size();
@@ -946,7 +896,7 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale})
946
896
  y: Math.round(height / 2)
947
897
  };
948
898
  const scrollDistance = Math.round(distance || height / 3);
949
- await this.swipe(start.x, start.y, start.x, start.y - scrollDistance);
899
+ await this.swipeCoordinates(start.x, start.y, start.x, start.y - scrollDistance);
950
900
  }
951
901
  async scrollLeft(distance, startPoint) {
952
902
  const { width, height } = await this.size();
@@ -958,7 +908,7 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale})
958
908
  y: Math.round(height / 2)
959
909
  };
960
910
  const scrollDistance = Math.round(distance || 0.7 * width);
961
- await this.swipe(start.x, start.y, start.x + scrollDistance, start.y);
911
+ await this.swipeCoordinates(start.x, start.y, start.x + scrollDistance, start.y);
962
912
  }
963
913
  async scrollRight(distance, startPoint) {
964
914
  const { width, height } = await this.size();
@@ -970,7 +920,7 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale})
970
920
  y: Math.round(height / 2)
971
921
  };
972
922
  const scrollDistance = Math.round(distance || 0.7 * width);
973
- await this.swipe(start.x, start.y, start.x - scrollDistance, start.y);
923
+ await this.swipeCoordinates(start.x, start.y, start.x - scrollDistance, start.y);
974
924
  }
975
925
  async scrollUntilTop(startPoint) {
976
926
  debugDevice('Using screenshot-based scroll detection for better reliability');
@@ -1066,16 +1016,16 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale})
1066
1016
  debugDevice(`Performing scroll: ${direction}, distance: ${scrollDistance}`);
1067
1017
  switch(direction){
1068
1018
  case 'up':
1069
- await this.swipe(start.x, start.y, start.x, start.y + scrollDistance, 300);
1019
+ await this.swipeCoordinates(start.x, start.y, start.x, start.y + scrollDistance, 300);
1070
1020
  break;
1071
1021
  case 'down':
1072
- await this.swipe(start.x, start.y, start.x, start.y - scrollDistance, 300);
1022
+ await this.swipeCoordinates(start.x, start.y, start.x, start.y - scrollDistance, 300);
1073
1023
  break;
1074
1024
  case 'left':
1075
- await this.swipe(start.x, start.y, start.x + scrollDistance, start.y, 300);
1025
+ await this.swipeCoordinates(start.x, start.y, start.x + scrollDistance, start.y, 300);
1076
1026
  break;
1077
1027
  case 'right':
1078
- await this.swipe(start.x, start.y, start.x - scrollDistance, start.y, 300);
1028
+ await this.swipeCoordinates(start.x, start.y, start.x - scrollDistance, start.y, 300);
1079
1029
  break;
1080
1030
  }
1081
1031
  debugDevice('Waiting for scroll and inertia to complete...');
@@ -1132,7 +1082,7 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale})
1132
1082
  const centerX = Math.round(windowSize.width / 2);
1133
1083
  const startY = Math.round(0.9 * windowSize.height);
1134
1084
  const endY = Math.round(0.5 * windowSize.height);
1135
- await this.swipe(centerX, startY, centerX, endY, 300);
1085
+ await this.swipeCoordinates(centerX, startY, centerX, endY, 300);
1136
1086
  debugDevice('Dismissed keyboard with swipe up gesture from bottom of screen');
1137
1087
  await sleep(500);
1138
1088
  return true;
@@ -1209,6 +1159,45 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale})
1209
1159
  _define_property(this, "interfaceType", 'ios');
1210
1160
  _define_property(this, "uri", void 0);
1211
1161
  _define_property(this, "options", void 0);
1162
+ _define_property(this, "inputPrimitives", {
1163
+ pointer: {
1164
+ tap: (point)=>this.tapPoint(point),
1165
+ doubleClick: (point)=>this.doubleTapPoint(point),
1166
+ longPress: (point, opts)=>this.longPressPoint(point, opts?.duration),
1167
+ dragAndDrop: (from, to)=>this.swipePoint(from, to, 1000)
1168
+ },
1169
+ keyboard: {
1170
+ keyboardPress: (keyName)=>this.pressKey(keyName),
1171
+ typeText: async (value, opts)=>{
1172
+ const target = opts?.target;
1173
+ if (target && opts?.replace !== false) await this.clearInput(target);
1174
+ else if (target) await this.tapPoint({
1175
+ x: target.center[0],
1176
+ y: target.center[1]
1177
+ });
1178
+ if (opts?.focusOnly) return;
1179
+ await this.typeText(value, opts);
1180
+ },
1181
+ clearInput: (target)=>this.clearInput(target),
1182
+ cursorMove: async (direction, times = 1)=>{
1183
+ const arrowKey = 'left' === direction ? 'ArrowLeft' : 'ArrowRight';
1184
+ for(let i = 0; i < times; i++)await this.pressKey(arrowKey);
1185
+ }
1186
+ },
1187
+ touch: {
1188
+ swipe: async (start, end, opts)=>{
1189
+ const duration = opts?.duration ?? 300;
1190
+ const repeat = opts?.repeat ?? 1;
1191
+ for(let i = 0; i < repeat; i++)await this.swipePoint(start, end, duration);
1192
+ },
1193
+ pinch: async (center, opts)=>{
1194
+ await this.wdaBackend.pinch(Math.round(center.x), Math.round(center.y), opts.startDistance, opts.endDistance, opts.duration);
1195
+ }
1196
+ },
1197
+ scroll: {
1198
+ scroll: (param)=>this.performActionScroll(param)
1199
+ }
1200
+ });
1212
1201
  this.deviceId = 'pending-connection';
1213
1202
  this.options = options;
1214
1203
  this.customActions = options?.customActions;
@@ -1451,7 +1440,7 @@ class IOSMCPServer extends BaseMCPServer {
1451
1440
  constructor(toolsManager){
1452
1441
  super({
1453
1442
  name: '@midscene/ios-mcp',
1454
- version: "1.8.0",
1443
+ version: "1.8.1",
1455
1444
  description: 'Control the iOS device using natural language commands'
1456
1445
  }, toolsManager);
1457
1446
  }