mobai-mcp 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +44 -3
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -198,11 +198,15 @@ const TOOLS = [
|
|
|
198
198
|
},
|
|
199
199
|
verbose: {
|
|
200
200
|
type: "boolean",
|
|
201
|
-
description: "Include
|
|
201
|
+
description: "Include detailed elements array with bounds (default: false)",
|
|
202
202
|
},
|
|
203
203
|
only_visible: {
|
|
204
204
|
type: "boolean",
|
|
205
|
-
description: "Filter to visible elements
|
|
205
|
+
description: "Filter to only visible elements (default: true)",
|
|
206
|
+
},
|
|
207
|
+
include_keyboard: {
|
|
208
|
+
type: "boolean",
|
|
209
|
+
description: "Include keyboard elements in the tree (default: false). Useful for interacting with on-screen keyboards.",
|
|
206
210
|
},
|
|
207
211
|
},
|
|
208
212
|
required: ["device_id"],
|
|
@@ -332,6 +336,20 @@ const TOOLS = [
|
|
|
332
336
|
required: ["device_id"],
|
|
333
337
|
},
|
|
334
338
|
},
|
|
339
|
+
{
|
|
340
|
+
name: "get_ocr",
|
|
341
|
+
description: "Perform OCR text recognition on the current screen (iOS only). Returns detected text with screen coordinates for tapping (already adjusted for tapping).",
|
|
342
|
+
inputSchema: {
|
|
343
|
+
type: "object",
|
|
344
|
+
properties: {
|
|
345
|
+
device_id: {
|
|
346
|
+
type: "string",
|
|
347
|
+
description: "Device ID",
|
|
348
|
+
},
|
|
349
|
+
},
|
|
350
|
+
required: ["device_id"],
|
|
351
|
+
},
|
|
352
|
+
},
|
|
335
353
|
{
|
|
336
354
|
name: "execute_dsl",
|
|
337
355
|
description: `Execute a batch of automation steps using the DSL (Domain Specific Language).
|
|
@@ -368,6 +386,7 @@ Example DSL script:
|
|
|
368
386
|
steps: {
|
|
369
387
|
type: "array",
|
|
370
388
|
description: "Array of action steps",
|
|
389
|
+
items: { type: "object" },
|
|
371
390
|
},
|
|
372
391
|
on_fail: {
|
|
373
392
|
type: "object",
|
|
@@ -579,6 +598,8 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
579
598
|
params.set("verbose", "true");
|
|
580
599
|
if (args?.only_visible === false)
|
|
581
600
|
params.set("onlyVisible", "false");
|
|
601
|
+
if (args?.include_keyboard)
|
|
602
|
+
params.set("includeKeyboard", "true");
|
|
582
603
|
const queryString = params.toString();
|
|
583
604
|
const endpoint = `/devices/${args?.device_id}/ui-tree${queryString ? `?${queryString}` : ""}`;
|
|
584
605
|
result = await makeRequest("GET", endpoint);
|
|
@@ -618,6 +639,9 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
618
639
|
case "list_apps":
|
|
619
640
|
result = await makeRequest("GET", `/devices/${args?.device_id}/apps`);
|
|
620
641
|
break;
|
|
642
|
+
case "get_ocr":
|
|
643
|
+
result = await makeRequest("GET", `/devices/${args?.device_id}/ocr`);
|
|
644
|
+
break;
|
|
621
645
|
case "execute_dsl":
|
|
622
646
|
result = await makeRequest("POST", `/devices/${args?.device_id}/dsl/execute`, args?.script, 300000 // 5 minutes
|
|
623
647
|
);
|
|
@@ -784,6 +808,7 @@ const API_REFERENCE = `# MobAI API Reference
|
|
|
784
808
|
| /devices/{id}/screenshot | GET | Capture screenshot (saved to /tmp/mobai/screenshots/) |
|
|
785
809
|
| /devices/{id}/ui-tree | GET | Get UI accessibility tree |
|
|
786
810
|
| /devices/{id}/apps | GET | List installed apps |
|
|
811
|
+
| /devices/{id}/ocr | GET | OCR text recognition (iOS only) |
|
|
787
812
|
|
|
788
813
|
## Bridge Control
|
|
789
814
|
|
|
@@ -888,7 +913,7 @@ The DSL (Domain Specific Language) enables batch execution of multiple automatio
|
|
|
888
913
|
|
|
889
914
|
| Action | Description | Key Fields |
|
|
890
915
|
|--------|-------------|------------|
|
|
891
|
-
| observe | Get UI tree/screenshot | context, include (ui_tree, screenshot, installed_apps) |
|
|
916
|
+
| observe | Get UI tree/screenshot/OCR | context, include (ui_tree, screenshot, installed_apps, ocr) |
|
|
892
917
|
| tap | Tap element | predicate or coords |
|
|
893
918
|
| type | Type text | text, predicate (if keyboard not open), dismiss_keyboard (default: false) |
|
|
894
919
|
| press_key | Press keyboard key | key (return, tab, delete, etc.), context (optional: "web") |
|
|
@@ -955,6 +980,16 @@ Note: \`predicate\` is required if keyboard is not already open. Use \`dismiss_k
|
|
|
955
980
|
- \`abort\`: Stop on failure (default)
|
|
956
981
|
- \`skip\`: Skip failed step, continue
|
|
957
982
|
- \`retry\`: Retry with delay
|
|
983
|
+
|
|
984
|
+
## OCR (iOS only)
|
|
985
|
+
|
|
986
|
+
Use \`include: ["ocr"]\` in an \`observe\` step to get OCR text recognition when the UI tree is empty:
|
|
987
|
+
|
|
988
|
+
\`\`\`json
|
|
989
|
+
{"action": "observe", "context": "native", "include": ["ocr"]}
|
|
990
|
+
\`\`\`
|
|
991
|
+
|
|
992
|
+
Returns the recognized text with screen coordinates that are already adjusted and ready to use for tapping.
|
|
958
993
|
`;
|
|
959
994
|
const NATIVE_RUNNER_GUIDE = `# Native App Automation Guide
|
|
960
995
|
|
|
@@ -1025,6 +1060,11 @@ The \`type\` action requires either:
|
|
|
1025
1060
|
{"action": "type", "text": "username", "predicate": {"type": "input", "label": "Username"}}
|
|
1026
1061
|
\`\`\`
|
|
1027
1062
|
|
|
1063
|
+
### Dismissing Keyboard
|
|
1064
|
+
- Use \`press_key: return\` to submit and close the keyboard
|
|
1065
|
+
- If you do not want to submit, look for a "Close", "Cancel", "Done", or "Back" button in the UI tree and tap it
|
|
1066
|
+
- On Android, \`press_key: back\` also dismisses the keyboard
|
|
1067
|
+
|
|
1028
1068
|
## Common Patterns
|
|
1029
1069
|
|
|
1030
1070
|
### Open App and Navigate
|
|
@@ -1096,6 +1136,7 @@ The \`type\` action requires either:
|
|
|
1096
1136
|
- **Add delays after navigation** - Apps need time to render
|
|
1097
1137
|
- **Use retry strategy** - Transient failures are common
|
|
1098
1138
|
- **Use press_key for form navigation** - Tab between fields, Return to submit
|
|
1139
|
+
- **Use OCR for system dialogs (iOS)** - When the UI tree is empty, use \`include: ["ocr"]\` in an \`observe\` step
|
|
1099
1140
|
`;
|
|
1100
1141
|
const WEB_RUNNER_GUIDE = `# Web Automation Guide
|
|
1101
1142
|
|