replicant-mcp 1.0.4 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/tools/ui.js CHANGED
@@ -1,12 +1,14 @@
1
1
  import { z } from "zod";
2
2
  import { CACHE_TTLS } from "../types/index.js";
3
+ import { flattenTree } from "../parsers/ui-dump.js";
3
4
  export const uiInputSchema = z.object({
4
- operation: z.enum(["dump", "find", "tap", "input", "screenshot", "accessibility-check"]),
5
+ operation: z.enum(["dump", "find", "tap", "input", "screenshot", "accessibility-check", "visual-snapshot"]),
5
6
  selector: z.object({
6
7
  resourceId: z.string().optional(),
7
8
  text: z.string().optional(),
8
9
  textContains: z.string().optional(),
9
10
  className: z.string().optional(),
11
+ nearestTo: z.string().optional(),
10
12
  }).optional(),
11
13
  x: z.number().optional(),
12
14
  y: z.number().optional(),
@@ -15,24 +17,83 @@ export const uiInputSchema = z.object({
15
17
  localPath: z.string().optional(),
16
18
  inline: z.boolean().optional(),
17
19
  debug: z.boolean().optional(),
20
+ gridCell: z.number().min(1).max(24).optional(),
21
+ gridPosition: z.number().min(1).max(5).optional(),
18
22
  });
19
23
  // Store last find results for elementIndex reference
20
- // Updated to support both accessibility and OCR elements
24
+ // Updated to support accessibility, OCR, and grid elements
21
25
  let lastFindResults = [];
22
- // Helper to get center coordinates from either element type
26
+ // Type guards for different element types
27
/**
 * Type guard: reports whether `el` has the shape of an accessibility-tree node.
 *
 * A value qualifies when it exposes both a `centerX` and a `className` property.
 * Values that cannot carry properties (null, undefined, primitives) are rejected
 * up front, because the `in` operator throws a TypeError on them.
 *
 * @param {unknown} el - Candidate element, e.g. an entry of a find result.
 * @returns {boolean} True when both marker properties are present.
 */
function isAccessibilityNode(el) {
    if (el === null || (typeof el !== "object" && typeof el !== "function")) {
        return false;
    }
    return "centerX" in el && "className" in el;
}
30
/**
 * Type guard: reports whether `el` has the shape of an OCR result element.
 *
 * A value qualifies when it exposes both a `confidence` and a `center` property.
 * Values that cannot carry properties (null, undefined, primitives) are rejected
 * up front, because the `in` operator throws a TypeError on them.
 *
 * @param {unknown} el - Candidate element, e.g. an entry of a find result.
 * @returns {boolean} True when both marker properties are present.
 */
function isOcrElement(el) {
    if (el === null || (typeof el !== "object" && typeof el !== "function")) {
        return false;
    }
    return "confidence" in el && "center" in el;
}
33
/**
 * Type guard: reports whether `el` has the shape of a grid element.
 *
 * A value qualifies when it has `center` and `bounds` but neither `confidence`
 * (which marks an OCR element) nor `centerX` (which marks an accessibility node).
 * Values that cannot carry properties (null, undefined, primitives) are rejected
 * up front, because the `in` operator throws a TypeError on them.
 *
 * @param {unknown} el - Candidate element, e.g. an entry of a find result.
 * @returns {boolean} True when `el` matches the grid-element shape.
 */
function isGridElement(el) {
    if (el === null || (typeof el !== "object" && typeof el !== "function")) {
        return false;
    }
    const hasGridFields = "center" in el && "bounds" in el;
    return hasGridFields && !("confidence" in el) && !("centerX" in el);
}
36
/**
 * Returns the center point of a find-result element as `{ x, y }`.
 *
 * Accessibility nodes store their center as separate `centerX` / `centerY`
 * fields; every other element kind handled here (OCR or grid) carries a
 * ready-made `center` object, which is returned as-is.
 *
 * @param {object} element - Accessibility node, OCR element, or grid element.
 * @returns {{x: number, y: number}} Center coordinates of the element.
 */
function getElementCenter(element) {
    if (!isAccessibilityNode(element)) {
        // OcrElement or GridElement - both have center property
        return element.center;
    }
    const { centerX: x, centerY: y } = element;
    return { x, y };
}
33
- export async function handleUiTool(input, context) {
46
/**
 * Computes the Euclidean distance between two points.
 *
 * Math.hypot is used instead of sqrt(dx^2 + dy^2) so that very large or very
 * small deltas do not overflow or underflow while being squared.
 *
 * @param {{x: number, y: number}} p1 - First point.
 * @param {{x: number, y: number}} p2 - Second point.
 * @returns {number} Straight-line distance between p1 and p2.
 */
function calculateDistance(p1, p2) {
    return Math.hypot(p2.x - p1.x, p2.y - p1.y);
}
50
/**
 * Tests whether a point lies inside a rectangle, edges included.
 *
 * @param {{x: number, y: number}} point - Point to test.
 * @param {{left: number, right: number, top: number, bottom: number}} bounds - Rectangle to test against.
 * @returns {boolean} True when the point is within or on the rectangle's edges.
 */
function isPointInBounds(point, bounds) {
    const withinHorizontal = point.x >= bounds.left && point.x <= bounds.right;
    if (!withinHorizontal) {
        return false;
    }
    return point.y >= bounds.top && point.y <= bounds.bottom;
}
57
/**
 * Computes the area of a rectangle given by its edge coordinates.
 *
 * @param {{left: number, right: number, top: number, bottom: number}} bounds - Rectangle edges.
 * @returns {number} Width times height; no clamping is applied to inverted edges.
 */
function boundsArea(bounds) {
    const { left, right, top, bottom } = bounds;
    const width = right - left;
    const height = bottom - top;
    return width * height;
}
61
/**
 * Selects the targets whose tightest enclosing ViewGroup also encloses the anchor.
 *
 * For each target, the flattened tree is searched for the smallest-area node whose
 * className includes "ViewGroup" and whose bounds contain the target's center
 * (the first such node wins when areas tie). The target is kept when that same
 * node's bounds also contain `anchorPoint`.
 *
 * @param {object} tree - UI tree, passed to flattenTree.
 * @param {{x: number, y: number}} anchorPoint - Center of the anchor element.
 * @param {Iterable<{centerX: number, centerY: number}>} targetElements - Candidate targets.
 * @returns {Array<object>} The targets that share their smallest ViewGroup with the anchor, in input order.
 */
function findContainingSiblingTargets(tree, anchorPoint, targetElements) {
    // Which nodes are ViewGroups does not depend on the target, so select them
    // once instead of re-testing every node of the tree for every target.
    const viewGroups = [];
    for (const node of flattenTree(tree)) {
        if (node.className?.includes("ViewGroup")) {
            viewGroups.push(node);
        }
    }
    const containingTargets = [];
    for (const target of targetElements) {
        const targetCenter = { x: target.centerX, y: target.centerY };
        // Find the smallest ViewGroup that contains the target
        let smallestContainerForTarget = null;
        let smallestArea = Infinity;
        for (const node of viewGroups) {
            if (!isPointInBounds(targetCenter, node.bounds)) {
                continue;
            }
            const area = boundsArea(node.bounds);
            if (area < smallestArea) {
                smallestArea = area;
                smallestContainerForTarget = node;
            }
        }
        // Keep the target only if that smallest container also contains the anchor point
        if (smallestContainerForTarget && isPointInBounds(anchorPoint, smallestContainerForTarget.bounds)) {
            containingTargets.push(target);
        }
    }
    return containingTargets;
}
88
+ export async function handleUiTool(input, context, uiConfig) {
34
89
  const device = await context.deviceState.ensureDevice(context.adb);
35
90
  const deviceId = device.id;
91
+ // Get config - use provided or defaults
92
+ const config = uiConfig ?? {
93
+ visualModePackages: [],
94
+ autoFallbackScreenshot: true,
95
+ includeBase64: false,
96
+ };
36
97
  switch (input.operation) {
37
98
  case "dump": {
38
99
  const tree = await context.ui.dump(deviceId);
@@ -59,14 +120,71 @@ export async function handleUiTool(input, context) {
59
120
  throw new Error("selector is required for find operation");
60
121
  }
61
122
  const debug = input.debug ?? false;
62
- // Use findWithOcrFallback for text-based selectors
123
+ const nearestTo = input.selector.nearestTo;
124
+ // Use findWithFallbacks for text-based selectors
63
125
  if (input.selector.text || input.selector.textContains) {
64
- const result = await context.ui.findWithOcrFallback(deviceId, input.selector, { debug });
126
+ // If nearestTo is specified, first find the anchor element
127
+ let anchorCenter = null;
128
+ if (nearestTo) {
129
+ const anchorResult = await context.ui.findWithFallbacks(deviceId, { text: nearestTo }, {
130
+ debug: false,
131
+ includeVisualFallback: false,
132
+ });
133
+ if (anchorResult.elements.length > 0) {
134
+ anchorCenter = getElementCenter(anchorResult.elements[0]);
135
+ }
136
+ }
137
+ const result = await context.ui.findWithFallbacks(deviceId, input.selector, {
138
+ debug,
139
+ includeVisualFallback: config.autoFallbackScreenshot,
140
+ includeBase64: config.includeBase64,
141
+ gridCell: input.gridCell,
142
+ gridPosition: input.gridPosition,
143
+ });
144
+ // If we have an anchor, use containment-based matching
145
+ let usedContainment = false;
146
+ if (anchorCenter && result.elements.length > 0) {
147
+ // Filter to AccessibilityNode elements for containment check
148
+ const accessibilityElements = result.elements.filter(isAccessibilityNode);
149
+ if (accessibilityElements.length > 0) {
150
+ // Get the full tree for containment analysis
151
+ const tree = await context.ui.dump(deviceId);
152
+ // Find elements whose parent container contains the anchor point
153
+ const containingMatches = findContainingSiblingTargets(tree, anchorCenter, accessibilityElements);
154
+ if (containingMatches.length > 0) {
155
+ // Prioritize containment matches, then sort remaining by distance
156
+ usedContainment = true;
157
+ const containingCenters = new Set(containingMatches.map((el) => `${el.centerX},${el.centerY}`));
158
+ result.elements.sort((a, b) => {
159
+ const aCenter = getElementCenter(a);
160
+ const bCenter = getElementCenter(b);
161
+ const aContains = containingCenters.has(`${aCenter.x},${aCenter.y}`);
162
+ const bContains = containingCenters.has(`${bCenter.x},${bCenter.y}`);
163
+ // Containment matches come first
164
+ if (aContains && !bContains)
165
+ return -1;
166
+ if (!aContains && bContains)
167
+ return 1;
168
+ // Within same group, sort by distance
169
+ const distA = calculateDistance(aCenter, anchorCenter);
170
+ const distB = calculateDistance(bCenter, anchorCenter);
171
+ return distA - distB;
172
+ });
173
+ }
174
+ else {
175
+ // Fallback to pure distance sorting if no containment matches
176
+ result.elements.sort((a, b) => {
177
+ const distA = calculateDistance(getElementCenter(a), anchorCenter);
178
+ const distB = calculateDistance(getElementCenter(b), anchorCenter);
179
+ return distA - distB;
180
+ });
181
+ }
182
+ }
183
+ }
65
184
  lastFindResults = result.elements;
66
185
  const response = {
67
186
  elements: result.elements.map((el, index) => {
68
- if ("centerX" in el) {
69
- // AccessibilityNode
187
+ if (isAccessibilityNode(el)) {
70
188
  return {
71
189
  index,
72
190
  text: el.text,
@@ -78,8 +196,7 @@ export async function handleUiTool(input, context) {
78
196
  clickable: el.clickable,
79
197
  };
80
198
  }
81
- else {
82
- // OcrElement
199
+ else if (isOcrElement(el)) {
83
200
  return {
84
201
  index,
85
202
  text: el.text,
@@ -88,22 +205,63 @@ export async function handleUiTool(input, context) {
88
205
  confidence: debug ? el.confidence : undefined,
89
206
  };
90
207
  }
208
+ else {
209
+ // GridElement
210
+ return {
211
+ index,
212
+ center: el.center,
213
+ bounds: el.bounds,
214
+ };
215
+ }
91
216
  }),
92
217
  count: result.elements.length,
93
218
  deviceId,
94
219
  };
220
+ // Always include tier and confidence when available
221
+ if (result.tier !== undefined)
222
+ response.tier = result.tier;
223
+ if (result.confidence)
224
+ response.confidence = result.confidence;
95
225
  if (debug) {
96
226
  response.source = result.source;
97
227
  if (result.fallbackReason) {
98
228
  response.fallbackReason = result.fallbackReason;
99
229
  }
100
230
  }
231
+ // Include nearestTo info when used
232
+ if (nearestTo && anchorCenter) {
233
+ response.sortedByProximityTo = {
234
+ query: nearestTo,
235
+ anchor: anchorCenter,
236
+ method: usedContainment ? "containment" : "distance",
237
+ };
238
+ }
239
+ else if (nearestTo && !anchorCenter) {
240
+ response.nearestToWarning = `Could not find anchor element: "${nearestTo}"`;
241
+ }
242
+ // Include Tier 4 visual candidates if present
243
+ if (result.candidates) {
244
+ response.candidates = result.candidates;
245
+ if (result.truncated)
246
+ response.truncated = result.truncated;
247
+ if (result.totalCandidates)
248
+ response.totalCandidates = result.totalCandidates;
249
+ }
250
+ // Include Tier 5 grid fields if present
251
+ if (result.gridImage)
252
+ response.gridImage = result.gridImage;
253
+ if (result.gridPositions)
254
+ response.gridPositions = result.gridPositions;
255
+ // Include visual fallback if present (when count is 0 and autoFallbackScreenshot is enabled)
256
+ if (result.visualFallback) {
257
+ response.visualFallback = result.visualFallback;
258
+ }
101
259
  return response;
102
260
  }
103
261
  // Non-text selectors use regular find (no OCR fallback)
104
262
  const elements = await context.ui.find(deviceId, input.selector);
105
263
  lastFindResults = elements;
106
- return {
264
+ const response = {
107
265
  elements: elements.map((el, index) => ({
108
266
  index,
109
267
  text: el.text,
@@ -117,6 +275,17 @@ export async function handleUiTool(input, context) {
117
275
  count: elements.length,
118
276
  deviceId,
119
277
  };
278
+ // Include visual fallback for non-text selectors when no results and config allows
279
+ if (elements.length === 0 && config.autoFallbackScreenshot) {
280
+ const snapshot = await context.ui.visualSnapshot(deviceId, {
281
+ includeBase64: config.includeBase64,
282
+ });
283
+ response.visualFallback = {
284
+ ...snapshot,
285
+ hint: "No elements matched selector. Use screenshot to identify tap coordinates.",
286
+ };
287
+ }
288
+ return response;
120
289
  }
121
290
  case "tap": {
122
291
  let x, y;
@@ -157,19 +326,25 @@ export async function handleUiTool(input, context) {
157
326
  const result = await context.ui.accessibilityCheck(deviceId);
158
327
  return { ...result, deviceId };
159
328
  }
329
+ case "visual-snapshot": {
330
+ const snapshot = await context.ui.visualSnapshot(deviceId, {
331
+ includeBase64: input.inline ?? config.includeBase64,
332
+ });
333
+ return { ...snapshot, deviceId };
334
+ }
160
335
  default:
161
336
  throw new Error(`Unknown operation: ${input.operation}`);
162
337
  }
163
338
  }
164
339
  export const uiToolDefinition = {
165
340
  name: "ui",
166
- description: "Interact with app UI via accessibility tree. Auto-selects device if only one connected. Operations: dump, find, tap, input, screenshot, accessibility-check.",
341
+ description: "Interact with app UI via accessibility tree. Auto-selects device if only one connected. Operations: dump, find, tap, input, screenshot, accessibility-check, visual-snapshot.",
167
342
  inputSchema: {
168
343
  type: "object",
169
344
  properties: {
170
345
  operation: {
171
346
  type: "string",
172
- enum: ["dump", "find", "tap", "input", "screenshot", "accessibility-check"],
347
+ enum: ["dump", "find", "tap", "input", "screenshot", "accessibility-check", "visual-snapshot"],
173
348
  },
174
349
  selector: {
175
350
  type: "object",
@@ -178,6 +353,7 @@ export const uiToolDefinition = {
178
353
  text: { type: "string" },
179
354
  textContains: { type: "string" },
180
355
  className: { type: "string" },
356
+ nearestTo: { type: "string", description: "Find elements nearest to this text (spatial proximity)" },
181
357
  },
182
358
  description: "Element selector (for find)",
183
359
  },
@@ -185,9 +361,11 @@ export const uiToolDefinition = {
185
361
  y: { type: "number", description: "Y coordinate (for tap)" },
186
362
  elementIndex: { type: "number", description: "Element index from last find (for tap)" },
187
363
  text: { type: "string", description: "Text to input" },
188
- localPath: { type: "string", description: "Local path for screenshot (default: /tmp/replicant-screenshot-{timestamp}.png)" },
364
+ localPath: { type: "string", description: "Local path for screenshot (default: .replicant/screenshots/screenshot-{timestamp}.png)" },
189
365
  inline: { type: "boolean", description: "Return base64 instead of file path (token-heavy, use sparingly)" },
190
366
  debug: { type: "boolean", description: "Include source (accessibility/ocr) and confidence in response" },
367
+ gridCell: { type: "number", minimum: 1, maximum: 24, description: "Grid cell number (1-24) for Tier 5 refinement" },
368
+ gridPosition: { type: "number", minimum: 1, maximum: 5, description: "Position within cell (1=TL, 2=TR, 3=Center, 4=BL, 5=BR)" },
191
369
  },
192
370
  required: ["operation"],
193
371
  },
@@ -0,0 +1,34 @@
1
+ /**
2
+ * Configuration types for replicant-mcp
3
+ * Loaded from REPLICANT_CONFIG environment variable path
4
+ */
5
/**
 * Settings for the `ui` section of the replicant-mcp configuration.
 */
export interface UiConfig {
    /** Packages for which accessibility is always skipped in favour of visual mode. */
    visualModePackages: Array<string>;
    /** Attach a screenshot automatically when find returns no results (default: true). */
    autoFallbackScreenshot: boolean;
    /** Embed the screenshot as base64 in the response (default: false). */
    includeBase64: boolean;
}
13
/**
 * Root configuration shape; as declared here it holds a single `ui` section.
 */
export interface ReplicantConfig {
    /** Settings for the UI section. */
    ui: UiConfig;
}
/** Default configuration value exported by the companion JavaScript module. */
export declare const DEFAULT_CONFIG: ReplicantConfig;
17
/**
 * Payload of a visual snapshot. It is returned when accessibility fails, or
 * when the visual-snapshot operation is requested explicitly.
 */
export interface VisualSnapshot {
    /** Location of the screenshot (presumably a local file path; confirm against the producer). */
    screenshotPath: string;
    /** Base64 form of the screenshot, when included. */
    screenshotBase64?: string;
    /** Screen metrics reported with the snapshot. */
    screen: { width: number; height: number; density: number };
    /** Package and activity names reported with the snapshot. */
    app: { packageName: string; activityName: string };
    /** Optional free-text guidance attached to the snapshot. */
    hint?: string;
}
@@ -0,0 +1,11 @@
1
+ /**
2
+ * Configuration types for replicant-mcp
3
+ * Loaded from REPLICANT_CONFIG environment variable path
4
+ */
5
/** Built-in values for the `ui` section of the configuration. */
const defaultUiSettings = {
    visualModePackages: [],
    autoFallbackScreenshot: true,
    includeBase64: false,
};
/** Default replicant-mcp configuration: the `ui` section with its built-in values. */
export const DEFAULT_CONFIG = { ui: defaultUiSettings };
@@ -0,0 +1,50 @@
1
/** Confidence bucket that can accompany a find result. */
export type ConfidenceLevel =
    | "high"
    | "medium"
    | "low";
/** Mechanism that produced a find result. */
export type FindSource =
    | "accessibility"
    | "ocr"
    | "visual"
    | "grid";
/** Numeric tier (1 through 5) that can accompany a find result. */
export type FindTier =
    | 1
    | 2
    | 3
    | 4
    | 5;
4
/**
 * One visual candidate entry of a find result.
 */
export interface VisualCandidate {
    /** Position of the candidate in the candidate list. */
    index: number;
    /** Bounds of the candidate, in string form. */
    bounds: string;
    /** Center point of the candidate. */
    center: { x: number; y: number };
    /** Image data for the candidate (encoding not visible here; confirm against the producer). */
    image: string;
}
13
/**
 * A position inside a grid cell together with its coordinates.
 */
export interface GridPosition {
    /** Grid cell number. */
    cell: number;
    /** Position within the cell, one of five slots. */
    position: 1 | 2 | 3 | 4 | 5;
    /** X coordinate of the position. */
    x: number;
    /** Y coordinate of the position. */
    y: number;
}
19
+ import { AccessibilityNode } from "../parsers/ui-dump.js";
20
+ import { OcrElement } from "./ocr.js";
21
/**
 * Element produced by grid-based lookup: it carries only an index, bounds and a
 * center point.
 */
export interface GridElement {
    /** Position of the element in its result list. */
    index: number;
    /** Bounds of the element, in string form. */
    bounds: string;
    /** Center point of the element. */
    center: { x: number; y: number };
}
29
/** Any element kind that a find result may contain. */
export type FindElement =
    | AccessibilityNode
    | OcrElement
    | GridElement;
30
/**
 * Result of a find that may fall back through several lookup mechanisms.
 * Only `elements` and `source` are always present; everything else is optional.
 */
export interface FindWithFallbacksResult {
    /** Matched elements, of any supported kind. */
    elements: Array<FindElement>;
    /** Mechanism that produced `elements`. */
    source: FindSource;
    /** Tier associated with the result, when reported. */
    tier?: FindTier;
    /** Confidence bucket for the result, when reported. */
    confidence?: ConfidenceLevel;
    /** Explanation of why a fallback was used, when reported. */
    fallbackReason?: string;
    /** Visual candidates, when present. */
    candidates?: Array<VisualCandidate>;
    /** Presumably flags that `candidates` was cut short; confirm against the producer. */
    truncated?: boolean;
    /** Presumably the candidate count before truncation; confirm against the producer. */
    totalCandidates?: number;
    /** Grid image data, when present. */
    gridImage?: string;
    /** Grid cell associated with the result, when present. */
    gridCell?: number;
    /** Grid position descriptions, when present. */
    gridPositions?: Array<string>;
    /** Visual snapshot attached as a fallback, when present. */
    visualFallback?: import("./config.js").VisualSnapshot;
}
44
/**
 * Optional switches accepted by a find with fallbacks.
 */
export interface FindOptions {
    /** Request debug details in the result. */
    debug?: boolean;
    /** Allow a visual fallback to be attached to the result. */
    includeVisualFallback?: boolean;
    /** Request base64 image data in the result. */
    includeBase64?: boolean;
    /** Grid cell to refine. */
    gridCell?: number;
    /** Position within the grid cell, one of five slots. */
    gridPosition?: 1 | 2 | 3 | 4 | 5;
}
@@ -0,0 +1 @@
1
+ export {};
@@ -2,3 +2,5 @@ export * from "./errors.js";
2
2
  export * from "./cache.js";
3
3
  export * from "./device.js";
4
4
  export * from "./ocr.js";
5
+ export * from "./config.js";
6
+ export * from "./icon-recognition.js";
@@ -2,3 +2,5 @@ export * from "./errors.js";
2
2
  export * from "./cache.js";
3
3
  export * from "./device.js";
4
4
  export * from "./ocr.js";
5
+ export * from "./config.js";
6
+ export * from "./icon-recognition.js";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "replicant-mcp",
3
- "version": "1.0.4",
3
+ "version": "1.1.0",
4
4
  "description": "Android MCP server for AI-assisted Android development",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -51,11 +51,14 @@
51
51
  "@modelcontextprotocol/sdk": "^1.25.3",
52
52
  "commander": "^14.0.2",
53
53
  "execa": "^9.6.1",
54
+ "sharp": "^0.34.5",
54
55
  "tesseract.js": "^7.0.0",
56
+ "yaml": "^2.8.2",
55
57
  "zod": "^4.3.5"
56
58
  },
57
59
  "devDependencies": {
58
60
  "@types/node": "^25.0.9",
61
+ "@types/sharp": "^0.31.1",
59
62
  "@vitest/coverage-v8": "^4.0.17",
60
63
  "tsx": "^4.21.0",
61
64
  "typescript": "^5.9.3",