mobai-mcp 1.0.2 → 1.1.0

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/index.js +44 -3
  2. package/package.json +1 -1
package/dist/index.js CHANGED
@@ -198,11 +198,15 @@ const TOOLS = [
198
198
  },
199
199
  verbose: {
200
200
  type: "boolean",
201
- description: "Include full element coordinates (default: false)",
201
+ description: "Include detailed elements array with bounds (default: false)",
202
202
  },
203
203
  only_visible: {
204
204
  type: "boolean",
205
- description: "Filter to visible elements only (default: true)",
205
+ description: "Filter to only visible elements (default: true)",
206
+ },
207
+ include_keyboard: {
208
+ type: "boolean",
209
+ description: "Include keyboard elements in the tree (default: false). Useful for interacting with on-screen keyboards.",
206
210
  },
207
211
  },
208
212
  required: ["device_id"],
@@ -332,6 +336,20 @@ const TOOLS = [
332
336
  required: ["device_id"],
333
337
  },
334
338
  },
339
+ {
340
+ name: "get_ocr",
341
+ description: "Perform OCR text recognition on the current screen (iOS only). Returns detected text with screen coordinates for tapping (already adjusted for tapping).",
342
+ inputSchema: {
343
+ type: "object",
344
+ properties: {
345
+ device_id: {
346
+ type: "string",
347
+ description: "Device ID",
348
+ },
349
+ },
350
+ required: ["device_id"],
351
+ },
352
+ },
335
353
  {
336
354
  name: "execute_dsl",
337
355
  description: `Execute a batch of automation steps using the DSL (Domain Specific Language).
@@ -368,6 +386,7 @@ Example DSL script:
368
386
  steps: {
369
387
  type: "array",
370
388
  description: "Array of action steps",
389
+ items: { type: "object" },
371
390
  },
372
391
  on_fail: {
373
392
  type: "object",
@@ -579,6 +598,8 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
579
598
  params.set("verbose", "true");
580
599
  if (args?.only_visible === false)
581
600
  params.set("onlyVisible", "false");
601
+ if (args?.include_keyboard)
602
+ params.set("includeKeyboard", "true");
582
603
  const queryString = params.toString();
583
604
  const endpoint = `/devices/${args?.device_id}/ui-tree${queryString ? `?${queryString}` : ""}`;
584
605
  result = await makeRequest("GET", endpoint);
@@ -618,6 +639,9 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
618
639
  case "list_apps":
619
640
  result = await makeRequest("GET", `/devices/${args?.device_id}/apps`);
620
641
  break;
642
+ case "get_ocr":
643
+ result = await makeRequest("GET", `/devices/${args?.device_id}/ocr`);
644
+ break;
621
645
  case "execute_dsl":
622
646
  result = await makeRequest("POST", `/devices/${args?.device_id}/dsl/execute`, args?.script, 300000 // 5 minutes
623
647
  );
@@ -784,6 +808,7 @@ const API_REFERENCE = `# MobAI API Reference
784
808
  | /devices/{id}/screenshot | GET | Capture screenshot (saved to /tmp/mobai/screenshots/) |
785
809
  | /devices/{id}/ui-tree | GET | Get UI accessibility tree |
786
810
  | /devices/{id}/apps | GET | List installed apps |
811
+ | /devices/{id}/ocr | GET | OCR text recognition (iOS only) |
787
812
 
788
813
  ## Bridge Control
789
814
 
@@ -888,7 +913,7 @@ The DSL (Domain Specific Language) enables batch execution of multiple automatio
888
913
 
889
914
  | Action | Description | Key Fields |
890
915
  |--------|-------------|------------|
891
- | observe | Get UI tree/screenshot | context, include (ui_tree, screenshot, installed_apps) |
916
+ | observe | Get UI tree/screenshot/OCR | context, include (ui_tree, screenshot, installed_apps, ocr) |
892
917
  | tap | Tap element | predicate or coords |
893
918
  | type | Type text | text, predicate (if keyboard not open), dismiss_keyboard (default: false) |
894
919
  | press_key | Press keyboard key | key (return, tab, delete, etc.), context (optional: "web") |
@@ -955,6 +980,16 @@ Note: \`predicate\` is required if keyboard is not already open. Use \`dismiss_k
955
980
  - \`abort\`: Stop on failure (default)
956
981
  - \`skip\`: Skip failed step, continue
957
982
  - \`retry\`: Retry with delay
983
+
984
+ ## OCR (iOS only)
985
+
986
+ Use \`include: ["ocr"]\` in observe to get text recognition when UI tree is empty:
987
+
988
+ \`\`\`json
989
+ {"action": "observe", "context": "native", "include": ["ocr"]}
990
+ \`\`\`
991
+
992
+ Returns text with coordinates for tapping (already adjusted for tapping).
958
993
  `;
959
994
  const NATIVE_RUNNER_GUIDE = `# Native App Automation Guide
960
995
 
@@ -1025,6 +1060,11 @@ The \`type\` action requires either:
1025
1060
  {"action": "type", "text": "username", "predicate": {"type": "input", "label": "Username"}}
1026
1061
  \`\`\`
1027
1062
 
1063
+ ### Dismissing Keyboard
1064
+ - Use \`press_key: return\` to submit and close the keyboard
1065
+ - If submit is not desired, look for a "Close", "Cancel", "Done" or "Back" button in the UI tree and tap it
1066
+ - On Android, \`press_key: back\` also dismisses the keyboard
1067
+
1028
1068
  ## Common Patterns
1029
1069
 
1030
1070
  ### Open App and Navigate
@@ -1096,6 +1136,7 @@ The \`type\` action requires either:
1096
1136
  - **Add delays after navigation** - Apps need time to render
1097
1137
  - **Use retry strategy** - Transient failures are common
1098
1138
  - **Use press_key for form navigation** - Tab between fields, Return to submit
1139
+ - **Use OCR for system dialogs (iOS)** - When UI tree is empty, use \`include: ["ocr"]\`
1099
1140
  `;
1100
1141
  const WEB_RUNNER_GUIDE = `# Web Automation Guide
1101
1142
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mobai-mcp",
3
- "version": "1.0.2",
3
+ "version": "1.1.0",
4
4
  "description": "MCP server for MobAI - AI-powered mobile device automation",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",