appium-mcp 1.74.2 → 1.74.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,140 @@
1
+ /**
2
+ * Helpers for the special "ai-element:" UUIDs produced by the appium_ai
3
+ * find_element handler.
4
+ *
5
+ * The format is:
6
+ *
7
+ * ai-element:<cx>,<cy>:<x0>,<y0>,<x1>,<y1>
8
+ *
9
+ * where (cx, cy) is the visual centre of the target and (x0,y0,x1,y1) is
10
+ * the bounding box returned by the vision model. Both are pixel
11
+ * coordinates in the screenshot's coordinate space. Such a UUID is NOT a real
11
+ * WebDriver element id, so it MUST NOT be passed to driver.getElementRect or
13
+ * to platform-native commands like `mobile: doubleTap`, `mobile: pinch`,
14
+ * etc., which require a real element id.
15
+ *
16
+ * Centralising the prefix, the parser, and the rect-resolution helper
17
+ * here keeps every gesture handler honest: if it ever needs a rect for
18
+ * a UUID, it goes through `resolveTargetRect` and gets the right thing
19
+ * for both AI and traditional UUIDs.
20
+ */
21
+ import type { ContentResult } from 'fastmcp';
22
+ import type { Rect } from '@appium/types';
23
+ import { getElementRect } from '../../../command.js';
24
+ import type { DriverInstance } from '../../../session-store.js';
25
+ import { errorResult } from '../../tool-response.js';
26
+
27
+ export const AI_ELEMENT_PREFIX = 'ai-element:';
28
+
29
+ export const AI_DISABLED_REJECTION =
30
+ `Received an ai-element: UUID, but the appium_ai tool is not registered ` +
31
+ `(AI_VISION_ENABLED is not set to true). Use appium_find_element to get a real ` +
32
+ `element UUID, or enable AI_VISION_ENABLED=true with the required AI_VISION_* keys.`;
33
+
34
+ export type ParsedAiElement = {
35
+ center: { x: number; y: number };
36
+ rect: Rect;
37
+ };
38
+
39
+ /**
40
+ * A centre-only AI UUID carries no bbox. Direction-based swipe/scroll derives
41
+ * its start/end points from `rect.width` / `rect.height` (0.2 / 0.8 edges), so
42
+ * a 1x1 rect would make them identical after rounding. Use a fallback rect of
43
+ * this size so those gestures still move visibly, centred on the parsed point.
44
+ *
45
+ * Values are screenshot pixels; callers can clamp against the window if needed.
46
+ */
47
+ const AI_FALLBACK_RECT_WIDTH = 100;
48
+ const AI_FALLBACK_RECT_HEIGHT = 100;
49
+
50
+ export function isAiElementUUID(uuid: string | undefined): uuid is string {
51
+ return typeof uuid === 'string' && uuid.startsWith(AI_ELEMENT_PREFIX);
52
+ }
53
+
54
+ /**
55
+ * Parse an `ai-element:` UUID into a centre point and a synthetic rect.
56
+ *
57
+ * If the bbox segment is present and well-formed, the rect describes the
58
+ * full bounding box. Otherwise we fall back to a small rect centred on
59
+ * `(cx, cy)`, so directional swipe/scroll still has usable start/end deltas.
60
+ */
61
+ export function parseAiElement(
62
+ uuid: string
63
+ ): ParsedAiElement | { error: string } {
64
+ const parts = uuid.split(':');
65
+ if (parts.length < 2) {
66
+ return { error: 'Invalid ai-element UUID: missing coordinate segment.' };
67
+ }
68
+
69
+ const [cxStr, cyStr] = (parts[1] ?? '').split(',');
70
+ const cx = Number.parseInt(cxStr ?? '', 10);
71
+ const cy = Number.parseInt(cyStr ?? '', 10);
72
+ if (!Number.isFinite(cx) || !Number.isFinite(cy)) {
73
+ return {
74
+ error: `Invalid ai-element UUID: centre coordinates are not numbers ('${parts[1]}').`,
75
+ };
76
+ }
77
+
78
+ let rect = rectAroundCenter(cx, cy);
79
+ if (parts[2]) {
80
+ const bbox = parts[2].split(',').map((v) => Number.parseInt(v, 10));
81
+ if (
82
+ bbox.length === 4 &&
83
+ bbox.every((n) => Number.isFinite(n)) &&
84
+ bbox[2] > bbox[0] &&
85
+ bbox[3] > bbox[1]
86
+ ) {
87
+ rect = {
88
+ x: bbox[0],
89
+ y: bbox[1],
90
+ width: bbox[2] - bbox[0],
91
+ height: bbox[3] - bbox[1],
92
+ };
93
+ }
94
+ }
95
+
96
+ return { center: { x: cx, y: cy }, rect };
97
+ }
98
+
99
+ /**
100
+ * Single dispatcher for "give me a rect for this UUID":
101
+ * - ai-element UUID → rect synthesised from the bbox / centre fallback, no driver call
102
+ * - traditional UUID → `getElementRect` on the driver
103
+ *
104
+ * Returns `{ error }` for malformed AI UUIDs or for driver lookup failures (`getElementRect`
105
+ * rejects). Call sites avoid try/catch and handle both cases the same way.
106
+ */
107
+ export async function resolveTargetRect(
108
+ driver: DriverInstance,
109
+ elementUUID: string
110
+ ): Promise<Rect | { error: string }> {
111
+ if (isAiElementUUID(elementUUID)) {
112
+ const parsed = parseAiElement(elementUUID);
113
+ if ('error' in parsed) {
114
+ return parsed;
115
+ }
116
+ return parsed.rect;
117
+ }
118
+ try {
119
+ return await getElementRect(driver, elementUUID);
120
+ } catch (err: unknown) {
121
+ return {
122
+ error: err instanceof Error ? err.message : String(err),
123
+ };
124
+ }
125
+ }
126
+
127
+ export function aiDisabledResult(): ContentResult {
128
+ return errorResult(AI_DISABLED_REJECTION);
129
+ }
130
+
131
+ function rectAroundCenter(cx: number, cy: number): Rect {
132
+ const w = AI_FALLBACK_RECT_WIDTH;
133
+ const h = AI_FALLBACK_RECT_HEIGHT;
134
+ return {
135
+ x: cx - Math.floor(w / 2),
136
+ y: cy - Math.floor(h / 2),
137
+ width: w,
138
+ height: h,
139
+ };
140
+ }
@@ -1,17 +1,18 @@
1
1
  import type { ContentResult } from 'fastmcp';
2
2
  import type { DriverInstance } from '../../../session-store.js';
3
3
  import { getPlatformName, PLATFORM } from '../../../session-store.js';
4
- import {
5
- execute,
6
- getElementRect,
7
- getWindowRect,
8
- performActions,
9
- } from '../../../command.js';
4
+ import { execute, getWindowRect, performActions } from '../../../command.js';
10
5
  import {
11
6
  errorResult,
12
7
  textResult,
13
8
  toolErrorMessage,
14
9
  } from '../../tool-response.js';
10
+ import { isAIEnabled } from '../../ai/config.js';
11
+ import {
12
+ aiDisabledResult,
13
+ isAiElementUUID,
14
+ resolveTargetRect,
15
+ } from './ai-element.js';
15
16
  import type { GestureArgs } from '../schema.js';
16
17
 
17
18
  const DEFAULT_VELOCITY = 2.2;
@@ -96,6 +97,14 @@ export async function handlePinchZoom(
96
97
  }
97
98
  const useCustomCoords =
98
99
  !args.elementUUID && args.x !== undefined && args.y !== undefined;
100
+ // ai-element UUIDs are coordinate UUIDs, not real element ids: drive the
101
+ // gesture from the bbox and skip the iOS/Android native paths that need
102
+ // a real `elementId`.
103
+ const isAiUUID = isAiElementUUID(args.elementUUID);
104
+ if (isAiUUID && !isAIEnabled()) {
105
+ return aiDisabledResult();
106
+ }
107
+ const useCoordPath = useCustomCoords || isAiUUID;
99
108
 
100
109
  try {
101
110
  const platform = getPlatformName(driver);
@@ -107,7 +116,10 @@ export async function handlePinchZoom(
107
116
 
108
117
  if (args.elementUUID) {
109
118
  windowRect = await getWindowRect(driver);
110
- const rect = await getElementRect(driver, args.elementUUID);
119
+ const rect = await resolveTargetRect(driver, args.elementUUID);
120
+ if ('error' in rect) {
121
+ return errorResult(rect.error);
122
+ }
111
123
  ({ cx, cy, spread } = resolveElementPinchTarget(rect, windowRect));
112
124
  } else if (useCustomCoords) {
113
125
  windowRect = await getWindowRect(driver);
@@ -151,10 +163,11 @@ export async function handlePinchZoom(
151
163
  ],
152
164
  },
153
165
  ]);
154
- } else if (useCustomCoords && platform === PLATFORM.android) {
155
- // Zoom in at a custom center on Android: use the native pinchOpenGesture
156
- // with a region centered at (cx, cy). spread is pre-clamped so the region
157
- // fits within the window.
166
+ } else if (useCoordPath && platform === PLATFORM.android) {
167
+ // Zoom in at a coordinate-driven center on Android: use the native
168
+ // pinchOpenGesture with a region centered at (cx, cy). spread is
169
+ // pre-clamped so the region fits within the window. Also covers
170
+ // ai-element UUIDs, whose synthetic rect is used to pick the region.
158
171
  const percent = Math.min(0.99, 1 - 1 / scale);
159
172
  await execute(driver, 'mobile: pinchOpenGesture', {
160
173
  left: cx - spread,
@@ -163,8 +176,10 @@ export async function handlePinchZoom(
163
176
  height: 2 * spread,
164
177
  percent,
165
178
  });
166
- } else if (useCustomCoords) {
167
- // Zoom in at a custom center on iOS using W3C Actions
179
+ } else if (useCoordPath) {
180
+ // Zoom in at a coordinate-driven center on iOS using W3C Actions.
181
+ // Same path is used for ai-element UUIDs because `mobile: pinch`
182
+ // requires a real elementId we don't have.
168
183
  const startSpread = Math.max(1, Math.floor(spread / scale));
169
184
  const endSpread = spread;
170
185
  const duration = Math.floor((1 / Math.abs(velocity)) * 1000);
@@ -224,11 +239,13 @@ export async function handlePinchZoom(
224
239
  }
225
240
 
226
241
  const direction = scale < 1 ? 'out' : 'in';
227
- const target = args.elementUUID
228
- ? `element ${args.elementUUID}`
229
- : useCustomCoords
230
- ? `coordinates (${cx}, ${cy})`
231
- : 'screen';
242
+ const target = isAiUUID
243
+ ? `AI element coordinates (${cx}, ${cy})`
244
+ : args.elementUUID
245
+ ? `element ${args.elementUUID}`
246
+ : useCustomCoords
247
+ ? `coordinates (${cx}, ${cy})`
248
+ : 'screen';
232
249
  return textResult(
233
250
  `Successfully pinched ${direction} (scale=${scale}) on ${target}.`
234
251
  );
@@ -1,17 +1,18 @@
1
1
  import type { ContentResult } from 'fastmcp';
2
2
  import type { DriverInstance } from '../../../session-store.js';
3
3
  import { getPlatformName, PLATFORM } from '../../../session-store.js';
4
- import {
5
- execute,
6
- getElementRect,
7
- getWindowRect,
8
- performActions,
9
- } from '../../../command.js';
4
+ import { execute, getWindowRect, performActions } from '../../../command.js';
10
5
  import {
11
6
  errorResult,
12
7
  textResult,
13
8
  toolErrorMessage,
14
9
  } from '../../tool-response.js';
10
+ import { isAIEnabled } from '../../ai/config.js';
11
+ import {
12
+ aiDisabledResult,
13
+ isAiElementUUID,
14
+ resolveTargetRect,
15
+ } from './ai-element.js';
15
16
  import type { GestureArgs } from '../schema.js';
16
17
 
17
18
  type Direction = 'up' | 'down' | 'left' | 'right';
@@ -123,6 +124,9 @@ export async function handleScroll(
123
124
  args: GestureArgs
124
125
  ): Promise<ContentResult> {
125
126
  try {
127
+ if (isAiElementUUID(args.elementUUID) && !isAIEnabled()) {
128
+ return aiDisabledResult();
129
+ }
126
130
  const platform = getPlatformName(driver);
127
131
  // Android scroll follows the scrollbar convention (down = reveal content below),
128
132
  // so flip the user's direction before computing the W3C drag coords.
@@ -164,6 +168,9 @@ export async function handleSwipe(
164
168
  args: GestureArgs
165
169
  ): Promise<ContentResult> {
166
170
  try {
171
+ if (isAiElementUUID(args.elementUUID) && !isAIEnabled()) {
172
+ return aiDisabledResult();
173
+ }
167
174
  const coordsResult = await resolveCoords(driver, args);
168
175
  if ('error' in coordsResult) {
169
176
  return errorResult(`swipe: ${coordsResult.error}`);
@@ -252,7 +259,13 @@ async function resolveCoords(
252
259
  ): Promise<Coords | { error: string }> {
253
260
  if (args.direction) {
254
261
  if (args.elementUUID) {
255
- const rect = await getElementRect(driver, args.elementUUID);
262
+ // ai-element UUIDs are coordinate UUIDs, not real element ids; their
263
+ // rect is synthesised from the bbox in the UUID itself rather than
264
+ // fetched from the driver.
265
+ const rect = await resolveTargetRect(driver, args.elementUUID);
266
+ if ('error' in rect) {
267
+ return rect;
268
+ }
256
269
  return coordsForDirection(args.direction, {
257
270
  x: rect.x,
258
271
  y: rect.y,
@@ -14,33 +14,32 @@ import {
14
14
  toolErrorMessage,
15
15
  } from '../../tool-response.js';
16
16
  import { isAIEnabled } from '../../ai/config.js';
17
+ import {
18
+ aiDisabledResult,
19
+ isAiElementUUID,
20
+ parseAiElement,
21
+ } from './ai-element.js';
17
22
  import type { GestureArgs } from '../schema.js';
18
23
 
19
- const AI_ELEMENT_PREFIX = 'ai-element:';
20
-
21
- const AI_DISABLED_REJECTION =
22
- `Received an ai-element: UUID, but the appium_ai tool is not registered ` +
23
- `(AI_VISION_ENABLED is not set to true). Use appium_find_element to get a real ` +
24
- `element UUID, or enable AI_VISION_ENABLED=true with the required AI_VISION_* keys.`;
25
-
26
24
  export async function handleTap(
27
25
  driver: DriverInstance,
28
26
  args: GestureArgs
29
27
  ): Promise<ContentResult> {
30
28
  try {
31
29
  if (args.elementUUID) {
32
- if (args.elementUUID.startsWith(AI_ELEMENT_PREFIX)) {
30
+ if (isAiElementUUID(args.elementUUID)) {
33
31
  if (!isAIEnabled()) {
34
- return errorResult(AI_DISABLED_REJECTION);
32
+ return aiDisabledResult();
35
33
  }
36
- const parsed = parseAiElementCoords(args.elementUUID);
34
+ const parsed = parseAiElement(args.elementUUID);
37
35
  if ('error' in parsed) {
38
36
  return errorResult(parsed.error);
39
37
  }
40
- await performActions(driver, w3cTapAt(parsed.x, parsed.y));
38
+ const { x, y } = parsed.center;
39
+ await performActions(driver, w3cTapAt(x, y));
41
40
  return textResultWithPrimaryElementId(
42
41
  args.elementUUID,
43
- `Successfully tapped at AI element coordinates (${parsed.x}, ${parsed.y}).`
42
+ `Successfully tapped at AI element coordinates (${x}, ${y}).`
44
43
  );
45
44
  }
46
45
  await elementClick(driver, args.elementUUID);
@@ -77,9 +76,22 @@ export async function handleDoubleTap(
77
76
  // other JS-bridged) gesture handlers. Resolve elements to coordinates
78
77
  // and use the same W3C sequence everywhere, verified against vodqa.
79
78
  if (args.elementUUID) {
80
- const coords = await resolveCoordsFromElement(driver, args.elementUUID);
81
- x = coords.x;
82
- y = coords.y;
79
+ if (isAiElementUUID(args.elementUUID)) {
80
+ if (!isAIEnabled()) {
81
+ return aiDisabledResult();
82
+ }
83
+ const parsed = parseAiElement(args.elementUUID);
84
+ if ('error' in parsed) {
85
+ return errorResult(parsed.error);
86
+ }
87
+ // ai-element UUIDs are coordinates, not real element ids — use W3C
88
+ // double-tap at the parsed centre (the same W3C sequence real elements use).
89
+ ({ x, y } = parsed.center);
90
+ } else {
91
+ const coords = await resolveCoordsFromElement(driver, args.elementUUID);
92
+ x = coords.x;
93
+ y = coords.y;
94
+ }
83
95
  } else if (args.x !== undefined && args.y !== undefined) {
84
96
  x = args.x;
85
97
  y = args.y;
@@ -128,24 +140,37 @@ export async function handleLongPress(
128
140
  let y: number;
129
141
 
130
142
  if (args.elementUUID) {
131
- const platform = getPlatformName(driver);
132
- if (platform === PLATFORM.ios) {
133
- try {
134
- await execute(driver, 'mobile: touchAndHold', {
135
- elementId: args.elementUUID,
136
- duration: duration / 1000,
137
- });
138
- return textResultWithPrimaryElementId(
139
- args.elementUUID,
140
- `Successfully long pressed element ${args.elementUUID} for ${duration}ms.`
141
- );
142
- } catch {
143
- // fall through to W3C Actions fallback
143
+ if (isAiElementUUID(args.elementUUID)) {
144
+ if (!isAIEnabled()) {
145
+ return aiDisabledResult();
144
146
  }
147
+ const parsed = parseAiElement(args.elementUUID);
148
+ if ('error' in parsed) {
149
+ return errorResult(parsed.error);
150
+ }
151
+ // ai-element UUIDs are coordinates, not real element ids — skip the
152
+ // iOS `mobile: touchAndHold` fast-path and long-press at the parsed centre.
153
+ ({ x, y } = parsed.center);
154
+ } else {
155
+ const platform = getPlatformName(driver);
156
+ if (platform === PLATFORM.ios) {
157
+ try {
158
+ await execute(driver, 'mobile: touchAndHold', {
159
+ elementId: args.elementUUID,
160
+ duration: duration / 1000,
161
+ });
162
+ return textResultWithPrimaryElementId(
163
+ args.elementUUID,
164
+ `Successfully long pressed element ${args.elementUUID} for ${duration}ms.`
165
+ );
166
+ } catch {
167
+ // fall through to W3C Actions fallback
168
+ }
169
+ }
170
+ const coords = await resolveCoordsFromElement(driver, args.elementUUID);
171
+ x = coords.x;
172
+ y = coords.y;
145
173
  }
146
- const coords = await resolveCoordsFromElement(driver, args.elementUUID);
147
- x = coords.x;
148
- y = coords.y;
149
174
  } else if (args.x !== undefined && args.y !== undefined) {
150
175
  x = args.x;
151
176
  y = args.y;
@@ -178,25 +203,6 @@ export async function handleLongPress(
178
203
  }
179
204
  }
180
205
 
181
- function parseAiElementCoords(
182
- uuid: string
183
- ): { x: number; y: number } | { error: string } {
184
- const parts = uuid.split(':');
185
- if (parts.length < 2) {
186
- return { error: 'Invalid AI element UUID format.' };
187
- }
188
- const coords = parts[1].split(',');
189
- if (coords.length < 2) {
190
- return { error: 'Invalid AI element coordinates format.' };
191
- }
192
- const x = parseInt(coords[0], 10);
193
- const y = parseInt(coords[1], 10);
194
- if (isNaN(x) || isNaN(y)) {
195
- return { error: 'Invalid AI element coordinates: not numbers.' };
196
- }
197
- return { x, y };
198
- }
199
-
200
206
  function w3cTapAt(x: number, y: number) {
201
207
  return [
202
208
  {