mobile-debug-mcp 0.24.2 → 0.24.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -185,9 +185,10 @@ export class ToolsInteract {
185
185
  return await interact.tap(x, y, resolved.id);
186
186
  }
187
187
  static async tapElementHandler({ elementId }) {
188
- const timestamp = Date.now();
188
+ const timestampMs = Date.now();
189
+ const timestamp = new Date(timestampMs).toISOString();
189
190
  const actionType = 'tap_element';
190
- const actionId = nextActionId(actionType, timestamp);
191
+ const actionId = nextActionId(actionType, timestampMs);
191
192
  const selector = { elementId };
192
193
  const resolved = ToolsInteract._resolvedUiElements.get(elementId);
193
194
  if (!resolved) {
@@ -225,6 +226,7 @@ export class ToolsInteract {
225
226
  action_id: actionId,
226
227
  timestamp,
227
228
  action_type: actionType,
229
+ ...(tree?.device ? { device: tree.device } : {}),
228
230
  target: {
229
231
  selector,
230
232
  resolved: resolvedTarget
@@ -49,9 +49,10 @@ export function inferScrollFailure(message) {
49
49
  return { failureCode: 'UNKNOWN', retryable: false };
50
50
  }
51
51
  export function buildActionExecutionResult({ actionType, device, selector, resolved, success, uiFingerprintBefore, uiFingerprintAfter, failure, details }) {
52
- const timestamp = Date.now();
52
+ const timestampMs = Date.now();
53
+ const timestamp = new Date(timestampMs).toISOString();
53
54
  return {
54
- action_id: nextActionId(actionType, timestamp),
55
+ action_id: nextActionId(actionType, timestampMs),
55
56
  timestamp,
56
57
  action_type: actionType,
57
58
  ...(device ? { device } : {}),
@@ -66,3 +67,28 @@ export function buildActionExecutionResult({ actionType, device, selector, resol
66
67
  ...(details ? { details } : {})
67
68
  };
68
69
  }
70
+ export function wrapToolError(name, error) {
71
+ const message = error instanceof Error
72
+ ? error.message
73
+ : typeof error === 'object' && error !== null
74
+ ? (() => {
75
+ try {
76
+ return JSON.stringify(error, null, 2);
77
+ }
78
+ catch {
79
+ return '[unserializable error object]';
80
+ }
81
+ })()
82
+ : String(error);
83
+ return {
84
+ content: [{
85
+ type: 'text',
86
+ text: JSON.stringify({
87
+ error: {
88
+ tool: name,
89
+ message
90
+ }
91
+ }, null, 2)
92
+ }]
93
+ };
94
+ }
@@ -10,7 +10,7 @@ Inputs:
10
10
  - deviceId (optional)
11
11
 
12
12
  Output Structure:
13
- - action_id, timestamp, action_type
13
+ - action_id, timestamp (ISO 8601), action_type
14
14
  - target.selector = { appId }
15
15
  - success = true when launch was dispatched successfully
16
16
  - failure_code/retryable when launch dispatch fails
@@ -83,7 +83,7 @@ Inputs:
83
83
  - deviceId (optional)
84
84
 
85
85
  Output Structure:
86
- - action_id, timestamp, action_type
86
+ - action_id, timestamp (ISO 8601), action_type
87
87
  - target.selector = { appId }
88
88
  - success = true when the restart command completed
89
89
  - failure_code/retryable when restart dispatch fails
@@ -344,6 +344,7 @@ Capabilities:
344
344
  Constraints:
345
345
  - Does not verify correctness of the resulting state
346
346
  - Must not be used alone to confirm action success when an applicable expect_* tool exists
347
+ - Use classify_action_outcome + get_network_activity when the expected outcome is backend/API activity without a visible UI change
347
348
 
348
349
  Recommended Usage:
349
350
  1. Capture or define the expected outcome
@@ -532,7 +533,7 @@ Inputs:
532
533
  - deviceId (optional)
533
534
 
534
535
  Output Structure:
535
- - action_id, timestamp, action_type
536
+ - action_id, timestamp (ISO 8601), action_type
536
537
  - target.selector = { x, y }
537
538
  - success = true when the tap was dispatched
538
539
  - failure_code/retryable when dispatch fails
@@ -587,7 +588,7 @@ Inputs:
587
588
 
588
589
  Output Structure:
589
590
  - action_id: unique timestamp-based action identifier
590
- - timestamp: epoch milliseconds for the action attempt
591
+ - timestamp: ISO 8601 timestamp for the action attempt
591
592
  - action_type: "tap_element"
592
593
  - target.selector: original target handle ({ elementId })
593
594
  - target.resolved: minimal resolved element info used for the tap
@@ -640,7 +641,7 @@ Inputs:
640
641
  - platform/deviceId (optional)
641
642
 
642
643
  Output Structure:
643
- - action_id, timestamp, action_type
644
+ - action_id, timestamp (ISO 8601), action_type
644
645
  - target.selector = { x1, y1, x2, y2, duration }
645
646
  - success = true when the swipe was dispatched
646
647
  - failure_code/retryable when dispatch fails
@@ -692,7 +693,7 @@ Inputs:
692
693
  - direction, maxScrolls, scrollAmount, deviceId (optional)
693
694
 
694
695
  Output Structure:
695
- - action_id, timestamp, action_type
696
+ - action_id, timestamp (ISO 8601), action_type
696
697
  - target.selector = original selector
697
698
  - target.resolved = minimal resolved element info when found
698
699
  - success = true when scrolling produced a visible target element
@@ -746,7 +747,7 @@ Inputs:
746
747
  - platform/deviceId (optional)
747
748
 
748
749
  Output Structure:
749
- - action_id, timestamp, action_type
750
+ - action_id, timestamp (ISO 8601), action_type
750
751
  - target.selector = { text }
751
752
  - success = true when text input was dispatched
752
753
  - failure_code/retryable when dispatch fails
@@ -795,7 +796,7 @@ Inputs:
795
796
  - platform/deviceId (optional)
796
797
 
797
798
  Output Structure:
798
- - action_id, timestamp, action_type
799
+ - action_id, timestamp (ISO 8601), action_type
799
800
  - target.selector = { key: "back" }
800
801
  - success = true when the back action was dispatched
801
802
  - failure_code/retryable when dispatch fails
@@ -835,6 +836,8 @@ Failure Handling:
835
836
  description: `Classify the outcome of the most recent action into exactly one of: success, no_op, backend_failure, ui_failure, unknown.
836
837
 
837
838
  MUST be called after every action (tap, swipe, type_text, press_back, start_app, etc). Never skip.
839
+ Use this with get_network_activity when the expected outcome is backend/API activity without a visible UI change.
840
+ For backend/API activity, compare get_screen_fingerprint before and after the action and call get_network_activity immediately after the action instead of waiting for wait_for_screen_change.
838
841
 
839
842
  HOW TO GATHER INPUTS before calling:
840
843
  1. Call wait_for_screen_change or compare get_screen_fingerprint before/after — set uiChanged accordingly.
@@ -868,7 +871,7 @@ BEHAVIOUR after outcome:
868
871
  },
869
872
  networkRequests: {
870
873
  type: 'array',
871
- description: 'Pass this only after calling get_network_activity as instructed by nextAction. Map each request to endpoint + status.',
874
+ description: 'Pass this only after calling get_network_activity as instructed by nextAction. Also use it when the expected outcome is backend/API activity without a visible UI change.',
872
875
  items: {
873
876
  type: 'object',
874
877
  properties: {
@@ -890,7 +893,7 @@ BEHAVIOUR after outcome:
890
893
  name: 'get_network_activity',
891
894
  description: `Returns structured network events captured from platform logs since the last action.
892
895
 
893
- Call this only when classify_action_outcome returns nextAction="call_get_network_activity".
896
+ Call this when classify_action_outcome returns nextAction="call_get_network_activity" or immediately after an action whose expected outcome is backend/API activity without a visible UI change.
894
897
  Do not call more than once per action.
895
898
 
896
899
  Events are filtered to significant (non-background) requests only.
@@ -4,7 +4,7 @@ import { ToolsObserve } from '../observe/index.js';
4
4
  import { classifyActionOutcome } from '../interact/classify.js';
5
5
  import { ToolsNetwork } from '../network/index.js';
6
6
  import { getSystemStatus } from '../system/index.js';
7
- import { buildActionExecutionResult, captureActionFingerprint, inferGenericFailure, inferScrollFailure, wrapResponse } from './common.js';
7
+ import { buildActionExecutionResult, captureActionFingerprint, inferGenericFailure, inferScrollFailure, wrapResponse, wrapToolError } from './common.js';
8
8
  async function handleStartApp(args) {
9
9
  const { platform, appId, deviceId } = args;
10
10
  const uiFingerprintBefore = await captureActionFingerprint(platform, deviceId);
@@ -330,8 +330,7 @@ export async function handleToolCall(name, args = {}) {
330
330
  return await handler(args);
331
331
  }
332
332
  catch (error) {
333
- return {
334
- content: [{ type: 'text', text: `Error executing tool ${name}: ${error instanceof Error ? error.message : String(error)}` }]
335
- };
333
+ console.error(`Error executing tool ${name}:`, error);
334
+ return wrapToolError(name, error);
336
335
  }
337
336
  }
package/docs/CHANGELOG.md CHANGED
@@ -2,6 +2,12 @@
2
2
 
3
3
  All notable changes to the **Mobile Debug MCP** project will be documented in this file.
4
4
 
5
+ ## [0.24.4]
6
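+ - Updated tool guidance to steer agents away from `wait_for_screen_change` when the expected outcome is backend/API activity without a visible UI change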
7
+
8
+ ## [0.24.3]
9
+ - Improved output consistency
10
+
5
11
  ## [0.24.2]
6
12
  - Fixed Android install issue
7
13
  - Updated tools to have more detailed responses
@@ -0,0 +1,312 @@
1
+ # Baseline Spec v0
2
+
3
+ ## 1. System Overview
4
+
5
+ The MCP surface is defined in `src/server/tool-definitions.ts` and dispatched in `src/server/tool-handlers.ts`. Tools are grouped in code by module, not by an explicit runtime taxonomy: **manage**, **observe**, **interact**, **network/classification**, and **system**.
6
+
7
+ Agents interact with tools by name through `handleToolCall(name, args)`. Most handlers return a **single text content block containing JSON** via `wrapResponse(...)`. Exceptions are observable in code:
8
+
9
+ | Tool | MCP content shape |
10
+ | --- | --- |
11
+ | most tools | one text block with JSON |
12
+ | `get_logs` | two text blocks: metadata JSON, then logs JSON |
13
+ | `capture_screenshot` | one text block with JSON metadata, then one or more image blocks |
14
+ | `build_and_install` | one NDJSON text block, then one JSON text block |
15
+ | uncaught handler error | one plain text error string, not wrapped JSON |
16
+
17
+ Observable execution flow for state-mutating action tools at the MCP boundary:
18
+
19
+ 1. resolve device/platform
20
+ 2. call `ToolsNetwork.notifyActionStart()`
21
+ 3. capture UI fingerprint before the action
22
+ 4. execute the platform action
23
+ 5. capture UI fingerprint after the action
24
+ 6. wrap the result into an action envelope
25
+
26
+ That flow is applied to `start_app`, `restart_app`, `tap`, `swipe`, `scroll_to_element`, `type_text`, and `press_back`. `tap_element` builds a similar envelope inside `src/interact/index.ts` rather than through the shared wrapper.
27
+
28
+ ## 2. Tool Inventory
29
+
30
+ ### Manage / lifecycle
31
+
32
+ | Tool | Purpose | Inputs | Outputs | Side effects |
33
+ | --- | --- | --- | --- | --- |
34
+ | `start_app` | Launch app on Android or iOS. | `{ platform: 'android'\|'ios', appId: string, deviceId?: string }` | `ActionExecutionResult` JSON with `device` and `details` (`launch_time_ms`, `device_id`, `output?`, `observed_app?`, `error?`). | Launches app, captures fingerprints, resets network window. |
35
+ | `terminate_app` | Stop app process. | `{ platform: 'android'\|'ios', appId: string, deviceId?: string }` | `{ terminated: boolean, device: DeviceInfo }` | Terminates app. |
36
+ | `restart_app` | Terminate then relaunch app. | `{ platform: 'android'\|'ios', appId: string, deviceId?: string }` | `ActionExecutionResult` JSON with `device` and restart `details` (`terminated_before_restart`, `terminate_error?`, `output?`, `observed_app?`, `error?`). | Stops and launches app, captures fingerprints, resets network window. |
37
+ | `reset_app_data` | Clear app storage / simulator container data. | `{ platform: 'android'\|'ios', appId: string, deviceId?: string }` | `{ reset: boolean, device: DeviceInfo }` | Clears app state. |
38
+ | `install_app` | Install built artifact or project output. | `{ platform: 'android'\|'ios', projectType: 'native'\|'kmp'\|'react-native'\|'flutter', appPath: string, deviceId?: string }` | `{ device: DeviceInfo, installed: boolean, output?: string, error?: string }` | Installs app; Android may push APK/AAB and run `pm install`; iOS may use `simctl` or `idb`. |
39
+ | `build_app` | Build project and return artifact path. | `{ platform: 'android'\|'ios', projectType: ..., projectPath: string, variant?: string }` | Build result JSON from platform builder, including artifact path on success or `error`. | Runs Gradle or Xcode build. |
40
+ | `build_and_install` | Build then install, streaming progress. | `{ platform: 'android'\|'ios', projectType: ..., projectPath: string, deviceId?: string, variant?: string }` | MCP response has NDJSON event block plus result JSON `{ success: boolean, artifactPath?: string, device?: DeviceInfo, output?: string, error?: string }`. | Builds, installs, emits progress events. |
41
+ | `list_devices` | Enumerate available devices. | `{ platform?: 'android'\|'ios', appId?: string }` | `{ devices: DeviceInfo[] }` (runtime objects may also include `appInstalled`/`booted`). | Reads device lists. |
42
+
43
+ ### Observe / inspect
44
+
45
+ | Tool | Purpose | Inputs | Outputs | Side effects |
46
+ | --- | --- | --- | --- | --- |
47
+ | `get_logs` | Fetch recent device logs. | `{ platform: 'android'\|'ios', appId?: string, deviceId?: string, pid?: number, tag?: string, level?: string, contains?: string, since_seconds?: number, limit?: number, lines?: number }` | Two text blocks: metadata `{ device, result: { count, filtered, crashLines, source, meta } }`, then `{ logs: [...] }`. | Reads platform logs. |
48
+ | `capture_screenshot` | Capture current screenshot. | `{ platform: 'android'\|'ios', deviceId?: string }` | Text metadata block plus image block(s). | Captures screenshot; uses temp files. |
49
+ | `capture_debug_snapshot` | Bundle screenshot, UI tree, screen, fingerprint, and logs. | `{ reason?: string, includeLogs?: boolean, logLines?: number, platform?: 'android'\|'ios', appId?: string, deviceId?: string, sessionId?: string }` | Wrapped JSON snapshot object with device metadata, screenshot metadata, UI tree, fingerprint, current screen, and logs/errors. | Captures multiple observations. |
50
+ | `start_log_stream` | Start background structured log stream. | `{ platform?: 'android'\|'ios', packageName: string, level?: 'error'\|'warn'\|'info'\|'debug', deviceId?: string, sessionId?: string }` | `{ success: boolean, stream_started?: boolean, device_id?: string, pid?: number, error?: string }` | Starts long-lived log process, writes NDJSON file. |
51
+ | `read_log_stream` | Read accumulated streamed logs. | `{ sessionId?: string }` | `{ entries: any[], crash_summary?: { crash_detected: boolean, exception?: string, sample?: string } }` | Reads stream file; no new device action. |
52
+ | `stop_log_stream` | Stop background log stream. | `{ sessionId?: string }` | `{ success: boolean }` | Stops stream process and clears session entry. |
53
+ | `get_ui_tree` | Return current UI hierarchy. | `{ platform: 'android'\|'ios', deviceId?: string }` | `GetUITreeResponse` with `device`, `elements`, `resolution`, optional `error`. | Dumps UI hierarchy; Android writes/pulls XML; iOS queries via `idb`. |
54
+ | `get_current_screen` | Return visible Android activity. | `{ deviceId?: string }` | `GetCurrentScreenResponse` with `device`, `activity`, `package`, `shortActivity?`, `error?`. | Reads `dumpsys`; Android only. |
55
+ | `get_screen_fingerprint` | Compute stable screen fingerprint from UI tree and current screen. | `{ platform?: 'android'\|'ios', deviceId?: string }` | `{ fingerprint: string\|null, activity?: string, error?: string }` | Reads UI tree and, on Android, current screen. |
56
+
57
+ ### Interact / wait / verify
58
+
59
+ | Tool | Purpose | Inputs | Outputs | Side effects |
60
+ | --- | --- | --- | --- | --- |
61
+ | `wait_for_screen_change` | Wait until fingerprint differs from provided previous fingerprint. | `{ platform?: 'android'\|'ios', previousFingerprint: string, timeoutMs?: number, pollIntervalMs?: number, deviceId?: string }` | `{ success: boolean, previousFingerprint, newFingerprint?\|lastFingerprint?, elapsedMs, observed_screen: { fingerprint, activity }, reason?: 'timeout' }` | Polls fingerprints. |
62
+ | `expect_screen` | Exact check against expected fingerprint or screen name. | `{ platform?: 'android'\|'ios', fingerprint?: string, screen?: string, deviceId?: string }` | `{ success, observed_screen, expected_screen, confidence, comparison: { basis, matched, reason } }` | Reads fingerprint/current screen. |
63
+ | `expect_element_visible` | Binary visible check for selector. | `{ selector: { text?: string, resource_id?: string, accessibility_id?: string, contains?: boolean }, element_id?: string, timeout_ms?: number, poll_interval_ms?: number, platform?: 'android'\|'ios', deviceId?: string }` | `{ success, selector, element_id, expected_condition: 'visible', element?, observed, reason, failure_code?, retryable? }` | Polls UI tree through `wait_for_ui`. |
64
+ | `wait_for_ui` | Deterministic UI wait and element resolution. | `{ selector?: { text?: string, resource_id?: string, accessibility_id?: string, contains?: boolean }, condition?: 'exists'\|'not_exists'\|'visible'\|'clickable', timeout_ms?: number, poll_interval_ms?: number, match?: { index?: number }, retry?: { max_attempts?: number, backoff_ms?: number }, platform?: 'android'\|'ios', deviceId?: string }` | Success: `{ status:'success', matched, element, metrics, requested, observed }`; failure: `{ status:'timeout', error:{code,message}, metrics, requested, observed }`. | Polls UI tree; resolves actionable ancestor for `clickable`. |
65
+ | `find_element` | Heuristic semantic element search. | `{ query: string, exact?: boolean, timeoutMs?: number, platform?: 'android'\|'ios', deviceId?: string }` | `{ found: true, element, score, confidence }` or `{ found: false, error }` | Polls UI tree; no mutation. |
66
+
67
+ ### Action / mutation
68
+
69
+ | Tool | Purpose | Inputs | Outputs | Side effects |
70
+ | --- | --- | --- | --- | --- |
71
+ | `tap` | Tap coordinates. | `{ x: number, y: number, platform?: 'android'\|'ios', deviceId?: string }` | `ActionExecutionResult` | Taps screen; captures fingerprints; resets network window. |
72
+ | `tap_element` | Tap resolved UI element by `elementId`. | `{ elementId: string }` | Action-style JSON with `action_type: 'tap_element'`, target selector/resolved element, `success`, fingerprints, `failure_code?`, `retryable?`. | Reads cached element/UI context, validates element, taps it, resets network window. |
73
+ | `swipe` | Swipe coordinates. | `{ platform?: 'android'\|'ios', x1, y1, x2, y2, duration, deviceId?: string }` | `ActionExecutionResult` | Swipes screen; captures fingerprints; resets network window. |
74
+ | `scroll_to_element` | Repeatedly scroll until matching visible element is found. | `{ platform: 'android'\|'ios', selector: { text?: string, resourceId?: string, contentDesc?: string, className?: string }, direction?: 'down'\|'up', maxScrolls?: number, scrollAmount?: number, deviceId?: string }` | `ActionExecutionResult` | Repeated swipes plus UI tree checks; resets network window. |
75
+ | `type_text` | Type text into focused field. | `{ platform?: 'android', text: string, deviceId?: string }` | `ActionExecutionResult` | Android text input; captures fingerprints; resets network window. |
76
+ | `press_back` | Send Android Back key. | `{ platform?: 'android', deviceId?: string }` | `ActionExecutionResult` | Android back action; captures fingerprints; resets network window. |
77
+
78
+ ### Classification / network / system
79
+
80
+ | Tool | Purpose | Inputs | Outputs | Side effects |
81
+ | --- | --- | --- | --- | --- |
82
+ | `classify_action_outcome` | Deterministic rule-based classifier over supplied signals. | `{ uiChanged: boolean, expectedElementVisible?: boolean, networkRequests?: { url?: string, status: 'success'\|'failure'\|'retryable' }[], hasLogErrors?: boolean }` | `{ outcome: 'success'\|'no_op'\|'backend_failure'\|'ui_failure'\|'unknown', reasoning: string, nextAction?: 'call_get_network_activity' }` | Pure computation. |
83
+ | `get_network_activity` | Return normalized request events since last action window. | `{}` | `{ requests: NetworkRequestSummary[], count: number }` | Reads logs, advances internal `lastConsumedTimestamp`. |
84
+ | `get_system_status` | Aggregate Android/iOS/Gradle readiness. | `{}` | `{ success, status: 'ready'\|'degraded'\|'blocked', adbAvailable, adbVersion, devices, deviceStates, logsAvailable, envValid, issues, appInstalled, iosAvailable, iosDevices, gradleJavaHome, gradleValid, gradleFilesChecked, gradleSuggestedFixes, summary }` | Reads toolchain/device state. |
85
+
86
+ ## 3. Action Tools (Mutation Tools)
87
+
88
+ | Tool | Actual output shape | Success reporting | Failure structure | Retry logic |
89
+ | --- | --- | --- | --- | --- |
90
+ | `start_app` | `ActionExecutionResult` + `device` + `details` | `success` mirrors underlying launch success | `failure_code` inferred generically; raw launch `error` only appears in `details` | none |
91
+ | `terminate_app` | `{ terminated: boolean, device }` | `terminated === true` | no standardized failure code; boolean only at MCP layer | none |
92
+ | `restart_app` | `ActionExecutionResult` + `device` + restart `details` | `success` mirrors underlying restart success | `failure_code` inferred generically; terminate/start details kept in `details` | no retry; always does terminate then start |
93
+ | `reset_app_data` | `{ reset: boolean, device }` | `reset === true` | no standardized failure code | none |
94
+ | `install_app` | `{ device, installed, output?, error? }` | `installed === true` | unstructured `error` string; no action envelope | Android has internal fallback paths; iOS may fall back from `simctl` to `idb` |
95
+ | `build_and_install` | NDJSON event stream + `{ success, artifactPath?, device?, output?, error? }` | final `success === true` | unstructured `error`; build/install phases encoded in NDJSON | build and install internals may retry depending on platform helpers |
96
+ | `tap` | `ActionExecutionResult` | `success` means command executed | `failure_code`/`retryable` inferred from generic error text; raw error omitted | none |
97
+ | `tap_element` | action-style JSON built in `src/interact/index.ts` | `success` means element was resolved and tap dispatched | structured `failure_code` from `ActionFailureCode`; includes `retryable` | none |
98
+ | `swipe` | `ActionExecutionResult` | command executed | generic inferred `failure_code` | none |
99
+ | `scroll_to_element` | `ActionExecutionResult` | **different semantics**: success means target element became visible during scroll loop | `failure_code` inferred by scroll-specific string matching | internal loop up to `maxScrolls` |
100
+ | `type_text` | `ActionExecutionResult` | command executed | generic inferred `failure_code` | none |
101
+ | `press_back` | `ActionExecutionResult` | command executed | generic inferred `failure_code` | none |
102
+
103
+ **Observed inconsistency:** `start_app`/`restart_app` expose `device` and rich `details`; `tap`/`swipe`/`type_text`/`press_back` do not. `scroll_to_element` reports an outcome-oriented success, while the others mostly report execution success.
104
+
105
+ ## 4. Observation and Wait Tools
106
+
107
+ ### `wait_for_ui`
108
+
109
+ - **Role:** both waits and resolves.
110
+ - **Signals used:** only the current UI tree from `get_ui_tree`.
111
+ - **Behavior:** filters elements by selector, supports `match.index`, evaluates `exists` / `not_exists` / `visible` / `clickable`, and resolves an actionable ancestor for `clickable`.
112
+ - **Output:** descriptive, not binary. Returns `requested`, `observed`, `metrics`, and optionally `element`.
113
+ - **Success model:** `status: 'success'`; otherwise `status: 'timeout'` with structured `error`.
114
+
115
+ ### `wait_for_screen_change`
116
+
117
+ - **Role:** wait only.
118
+ - **Signals used:** screen fingerprints from `get_screen_fingerprint`.
119
+ - **Behavior:** polls until fingerprint differs from `previousFingerprint`, then performs a confirmation read for stability.
120
+ - **Output:** binary `success` plus descriptive `observed_screen`, elapsed time, and either `newFingerprint` or `lastFingerprint`.
121
+
122
+ ### `find_element`
123
+
124
+ - **Role:** resolve only.
125
+ - **Signals used:** UI tree.
126
+ - **Behavior:** heuristic scoring over text/content/resource/class; if best element is not interactable it tries to resolve a clickable ancestor.
127
+ - **Output:** descriptive, scored result (`score`, `confidence`) or `{ found:false, error }`.
128
+
129
+ ### `get_ui_tree`
130
+
131
+ - **Role:** inspect only.
132
+ - **Signals used:** platform accessibility/UI dump.
133
+ - **Output:** raw tree data with `elements`, `resolution`, and `device`.
134
+ - **Notes:** Android and iOS each retry internally up to three attempts.
135
+
136
+ ### `get_current_screen`
137
+
138
+ - **Role:** inspect only.
139
+ - **Signals used:** Android activity manager / window dumps.
140
+ - **Output:** current package/activity object.
141
+ - **Notes:** Android-only.
142
+
143
+ ### `get_screen_fingerprint`
144
+
145
+ - **Role:** inspect only.
146
+ - **Signals used:** UI tree plus current screen on Android.
147
+ - **Behavior:** normalizes a subset of visible, structurally significant elements and hashes them.
148
+ - **Output:** `{ fingerprint, activity?, error? }`.
149
+ - **Notes:** iOS fingerprint omits activity in the hash payload.
150
+
151
+ ### Log/snapshot observation
152
+
153
+ - `get_logs` returns structured metadata plus raw/structured log entries.
154
+ - `start_log_stream` / `read_log_stream` / `stop_log_stream` manage background NDJSON log capture.
155
+ - `capture_screenshot` and `capture_debug_snapshot` provide point-in-time observation artifacts.
156
+
157
+ ## 5. Existing Verification Mechanisms
158
+
159
+ | Mechanism | Success rule | Determinism | Ambiguity |
160
+ | --- | --- | --- | --- |
161
+ | `expect_screen` | exact fingerprint equality, else exact screen-name equality | binary and deterministic | if only `screen` is provided, Android may use either fingerprint-derived `activity` or `get_current_screen` label |
162
+ | `expect_element_visible` | delegated `wait_for_ui(condition:'visible')` reaches success | binary wrapper over deterministic wait | failure collapses to `TIMEOUT` or `UNKNOWN` |
163
+ | `wait_for_ui` used as verification | requested condition becomes true | deterministic per poll inputs | descriptive output, not a dedicated verification result |
164
+ | `wait_for_screen_change` | fingerprint changes and stays stable for one confirmation pass | deterministic | verifies change, not correctness of destination |
165
+ | `classify_action_outcome` | ordered rule evaluation over provided UI/network/log inputs | deterministic pure function | if `networkRequests` omitted, it returns `unknown` with `nextAction: 'call_get_network_activity'`; `hasLogErrors` does not change the enum outcome |
166
+
167
+ ## 6. Action Result Semantics
168
+
169
+ Across action tools, **success is not uniform**:
170
+
171
+ 1. **Execution success:** `tap`, `swipe`, `type_text`, `press_back`, `start_app`, `restart_app`, and `tap_element` mainly report that the command ran or the tap was dispatched.
172
+ 2. **Outcome success:** `scroll_to_element` reports success only if the target element was actually found during scrolling.
173
+ 3. **Boolean operation success:** `install_app`, `terminate_app`, and `reset_app_data` use tool-specific booleans (`installed`, `terminated`, `reset`) instead of the action envelope.
174
+
175
+ Failure handling is **partly standardized**:
176
+
177
+ - action-envelope tools use `failure_code` and `retryable`
178
+ - manage tools often use plain booleans plus `error` strings
179
+ - some handlers drop underlying diagnostics before the MCP response is built
180
+
181
+ ## 7. Failure Handling
182
+
183
+ ### Structured failure signals
184
+
185
+ | Source | Structured signals |
186
+ | --- | --- |
187
+ | action envelope | `ELEMENT_NOT_FOUND`, `ELEMENT_NOT_INTERACTABLE`, `TIMEOUT`, `NAVIGATION_NO_CHANGE`, `AMBIGUOUS_TARGET`, `STALE_REFERENCE`, `UNKNOWN` |
188
+ | `wait_for_ui` | `INVALID_SELECTOR`, `INVALID_CONDITION`, `PLATFORM_NOT_SUPPORTED`, `ELEMENT_NOT_FOUND`, `INTERNAL_ERROR` |
189
+ | `expect_element_visible` | `failure_code: 'TIMEOUT'\|'UNKNOWN'`, `retryable` |
190
+ | `classify_action_outcome` | `outcome: success\|no_op\|backend_failure\|ui_failure\|unknown` |
191
+ | `get_network_activity` | per-request `status: success\|failure\|retryable` |
192
+
193
+ ### Unstructured failure signals
194
+
195
+ - plain `error` strings from `install_app`, `build_app`, `build_and_install`, `find_element`, `start_log_stream`, many platform helpers
196
+ - boolean-only failures from `terminate_app` and `reset_app_data`
197
+ - top-level handler fallback: `Error executing tool <name>: ...` as plain text, not JSON
198
+
199
+ ### Retry / recovery logic present in implementation
200
+
201
+ | Area | Observed logic |
202
+ | --- | --- |
203
+ | `wait_for_ui` | `retry.max_attempts` and `retry.backoff_ms` |
204
+ | `scroll_to_element` | repeated swipes up to `maxScrolls` |
205
+ | Android `install_app` | retries `pm install` with `-t` on test-only failure; has push + shell fallback |
206
+ | iOS `install_app` | tries `simctl install`, may fall back to `idb` |
207
+ | `get_ui_tree` | platform handlers retry up to three times |
208
+ | `wait_for_screen_change` | one stability confirmation pass after a detected change |
209
+
210
+ ## 8. Execution Patterns (Observed)
211
+
212
+ 1. **Generic action wrapper**
213
+ `notifyActionStart()` → fingerprint before → platform action → fingerprint after → action envelope.
214
+
215
+ 2. **Resolved tap flow**
216
+ `wait_for_ui` returns `element.elementId` → `tap_element` uses cached element and current UI tree to validate it → tap → fingerprints before/after.
217
+
218
+ 3. **Visibility verification flow**
219
+ `expect_element_visible` is implemented as `wait_for_ui(... condition:'visible' ...)` plus a narrower binary result.
220
+
221
+ 4. **Screen verification flow**
222
+ `wait_for_screen_change` and `expect_screen` both depend on `get_screen_fingerprint`; `expect_screen` may additionally call `get_current_screen` on Android when matching by screen name.
223
+
224
+ 5. **Network correlation flow**
225
+ action tools that call `notifyActionStart()` create the time window used by `get_network_activity`; `classify_action_outcome` can then classify using supplied request summaries.
226
+
227
+ 6. **Snapshot/debug flow**
228
+ `capture_debug_snapshot` aggregates screenshot, current screen, fingerprint, UI tree, and logs in one call.
229
+
230
+ ## 9. Inconsistencies and Gaps
231
+
232
+ 1. **Response envelope mismatch:** most tools return wrapped JSON, but `get_logs`, `capture_screenshot`, and `build_and_install` use multi-block responses.
233
+ 2. **Unexpected-error shape mismatch:** uncaught handler failures become plain text strings, not structured JSON.
234
+ 3. **Action result mismatch:** some mutation tools use `ActionExecutionResult`; `install_app`, `terminate_app`, `reset_app_data`, and `build_and_install` do not.
235
+ 4. **Success semantics mismatch:** `scroll_to_element` success is outcome-based; most other action tools are execution-based.
236
+ 5. **Detail richness mismatch:** `start_app` and `restart_app` include `device` and rich `details`; other action-envelope tools usually omit raw error/details.
237
+ 6. **Failure-code derivation mismatch:** generic action wrappers infer `failure_code` by matching substrings in error text; `tap_element` assigns codes directly.
238
+ 7. **Dropped diagnostics:** handler-level MCP responses omit some underlying `diagnostics`/`error` detail, especially for `terminate_app`, `reset_app_data`, and `get_logs`.
239
+ 8. **`expect_element_visible` type/implementation mismatch:** the type allows `ELEMENT_NOT_FOUND`, but the implementation only emits `TIMEOUT` or `UNKNOWN`.
240
+ 9. **Platform mismatch:** `get_current_screen`, `type_text`, and `press_back` are Android-only; other tools are dual-platform.
241
+ 10. **Observation helper gap:** `waitForUICore` supports `ui`/`log`/`screen`/`idle` modes internally, but only the newer selector-based `wait_for_ui` is exposed as a tool.
242
+ 11. **Network-window coverage gap:** only tools that call `notifyActionStart()` reset the network activity window; `install_app`, `terminate_app`, and `reset_app_data` do not.
243
+ 12. **`classify_action_outcome` log input is a signal in name only:** `hasLogErrors` affects the reasoning text for `no_op` but never changes the enum outcome.
244
+ 13. **`build_and_install` has dead autodetect code:** handler requires `platform` and `projectType`, but later still contains unreachable fallback autodetection branches.
245
+ 14. **Runtime object shape drift:** `list_devices` may return extra runtime fields like `appInstalled` and `booted` beyond the base `DeviceInfo` shape.
246
+
247
+ ## 10. Minimal Canonical Model (Derived, Not Invented)
248
+
249
+ ### Common action shape already present
250
+
251
+ ```ts
252
+ {
253
+ action_id: string,
254
+ timestamp: string,
255
+ action_type: string,
256
+ target: {
257
+ selector: Record<string, unknown>,
258
+ resolved: Record<string, unknown> | null
259
+ },
260
+ success: boolean,
261
+ failure_code?: string,
262
+ retryable?: boolean,
263
+ ui_fingerprint_before: string | null,
264
+ ui_fingerprint_after: string | null,
265
+ device?: DeviceInfo,
266
+ details?: Record<string, unknown>
267
+ }
268
+ ```
269
+
270
+ This shape is already used directly or closely approximated by:
271
+
272
+ - `start_app`
273
+ - `restart_app`
274
+ - `tap`
275
+ - `tap_element`
276
+ - `swipe`
277
+ - `scroll_to_element`
278
+ - `type_text`
279
+ - `press_back`
280
+
281
+ ### Common observation/verification pattern already present
282
+
283
+ ```ts
284
+ {
285
+ requested|expected: ...,
286
+ observed: ...,
287
+ success|status: boolean | 'success' | 'timeout',
288
+ metrics?|confidence?|comparison?|reason?
289
+ }
290
+ ```
291
+
292
+ Examples:
293
+
294
+ - `wait_for_ui` → `requested`, `observed`, `metrics`
295
+ - `expect_screen` → `expected_screen`, `observed_screen`, `comparison`
296
+ - `expect_element_visible` → `selector`, `observed`, `reason`
297
+ - `wait_for_screen_change` → previous vs observed/new fingerprint
298
+
299
+ ### Common failure signals already present
300
+
301
+ - action failure codes from `ActionFailureCode`
302
+ - wait/expect codes (`INVALID_*`, `ELEMENT_NOT_FOUND`, `TIMEOUT`, `UNKNOWN`)
303
+ - network request statuses (`success`, `failure`, `retryable`)
304
+ - fallback unstructured `error` strings
305
+
306
+ ### Common flow already present
307
+
308
+ - resolve device
309
+ - perform platform operation
310
+ - optionally capture fingerprints before/after
311
+ - return structured JSON, usually in one text block
312
+ - perform verification in separate tools rather than as part of most actions
@@ -0,0 +1,281 @@
1
+ # MCP Tooling Specification — Spec v1 (Refined)
2
+
3
+ ## 1. Scope
4
+
5
+ This specification defines the runtime contract for MCP tools used to interact with mobile applications.
6
+
7
+ It standardizes:
8
+
9
+ - action execution semantics
10
+ - verification model
11
+ - failure handling
12
+ - response shape constraints
13
+
14
+ This spec is incremental and aligned with the current implementation. It does not introduce new tools or require architectural redesign.
15
+
16
+ ## 2. Core Model
17
+
18
+ The system is based on a strict separation:
19
+
20
+ - Action tools perform execution
21
+ - Verification tools determine outcome
22
+ - `wait_for_*` tools resolve and synchronize
23
+ - Observation tools inspect state
24
+
25
+ ## 3. Execution Model
26
+
27
+ Canonical flow for verifiable interactions:
28
+
29
+ `RESOLVE -> ACT -> WAIT (optional) -> EXPECT`
30
+
31
+ This flow applies when outcome verification is required.
32
+
33
+ It does not apply to:
34
+
35
+ - pure inspection tools
36
+ - observation-only flows
37
+ - non-verifiable or exploratory actions
38
+
39
+ Outcome-specific guidance:
40
+
41
+ - visible navigation expected -> `wait_for_screen_change` (optional) -> `expect_screen`
42
+ - local UI change expected -> `wait_for_ui` (optional) -> `expect_element_visible`
43
+ - backend/API activity expected without a visible UI change -> compare `get_screen_fingerprint` results from before and after the action, call `get_network_activity` immediately after the action, then pass the observed requests to `classify_action_outcome`
44
+
45
+ For backend/API activity, `wait_for_screen_change` is not the right verification tool unless a visible transition is also expected.
46
+
47
+ ## 4. Action Tools
48
+
49
+ ### 4.1 Definition
50
+
51
+ Action tools mutate application state.
52
+
53
+ Includes:
54
+ `start_app`, `restart_app`, `tap`, `tap_element`, `swipe`, `scroll_to_element`, `type_text`, `press_back`
55
+
56
+ ### 4.2 Required Semantics
57
+
58
+ - `success` MUST represent execution success only
59
+ - execution success means the platform command was dispatched without error
60
+ - `success` MUST NOT imply outcome success
61
+
62
+ ### 4.3 Action Envelope
63
+
64
+ Action tools MUST return their result in this structure:
65
+
66
+ ```ts
67
+ {
68
+ action_id: string,
69
+ timestamp: string,
70
+ action_type: string,
71
+ target: {
72
+ selector: object,
73
+ resolved: object | null
74
+ },
75
+ success: boolean,
76
+ ui_fingerprint_before: string | null,
77
+ ui_fingerprint_after: string | null,
78
+ failure_code?: string,
79
+ retryable?: boolean,
80
+ device?: DeviceInfo,
81
+ details?: object
82
+ }
83
+ ```
84
+
85
+ Rules:
86
+
87
+ - `success` is at the top level, not nested
88
+ - `target` contains only selection and resolution context
89
+ - fingerprints represent observed pre/post UI state on a best-effort basis
90
+ - `failure_code` is optional but MUST be used when a structured mapping exists
91
+
92
+ ### 4.4 Allowed Deviations
93
+
94
+ Explicit temporary exceptions:
95
+
96
+ - `install_app`, `terminate_app`, `reset_app_data` do not use this envelope
97
+ - `scroll_to_element` may temporarily retain outcome-based success semantics
98
+ - partial `failure_code` coverage is allowed
99
+ - detail richness may vary across tools
100
+
101
+ ## 5. Verification Tools
102
+
103
+ ### 5.1 Definition
104
+
105
+ Verification tools determine whether the intended outcome occurred.
106
+
107
+ Primary:
108
+
109
+ - `expect_screen`
110
+ - `expect_element_visible`
111
+
112
+ ### 5.2 Required Semantics
113
+
114
+ - MUST return `success` as a boolean
115
+ - `success` MUST represent outcome truth
116
+ - MUST be binary and deterministic
117
+
118
+ Optional fields do not affect `success`:
119
+ `observed`, `expected`, `comparison`, `reason`, `confidence`
120
+
121
+ ### 5.3 Authoritative Role
122
+
123
+ Verification tools are the only authoritative source of outcome truth.
124
+
125
+ Action tools MUST NOT be used to infer outcome success.
126
+
127
+ ### 5.4 Applicability Rules
128
+
129
+ An `expect_*` tool is applicable when:
130
+
131
+ - expected destination screen is known -> `expect_screen`
132
+ - expected UI element state is known -> `expect_element_visible`
133
+ - outcome is explicitly defined or testable
134
+
135
+ Rules:
136
+
137
+ - `wait_for_*` MAY be used before `expect_*` for synchronization
138
+ - `wait_for_*` MUST NOT replace `expect_*` when an applicable `expect_*` tool exists
139
+ - when no applicable `expect_*` tool exists, `expect_*` MAY be skipped
140
+
141
+ ## 6. wait_for_* Tools
142
+
143
+ ### 6.1 Definition
144
+
145
+ `wait_for_*` tools provide deterministic resolution and synchronization.
146
+
147
+ Examples:
148
+
149
+ - `wait_for_ui`
150
+ - `wait_for_screen_change`
151
+
152
+ ### 6.2 Rules
153
+
154
+ - MAY resolve UI elements
155
+ - MAY synchronize UI/system state
156
+ - MUST NOT be treated as final verification when `expect_*` is applicable
157
+
158
+ ### 6.3 Semantics
159
+
160
+ - `success` indicates condition met or resolution succeeded
161
+ - `success` does NOT indicate outcome correctness
162
+
163
+ ## 7. Failure Semantics
164
+
165
+ ### 7.1 Canonical Codes
166
+
167
+ - `ELEMENT_NOT_FOUND`
168
+ - `ELEMENT_NOT_INTERACTABLE`
169
+ - `TIMEOUT`
170
+ - `NAVIGATION_NO_CHANGE`
171
+ - `AMBIGUOUS_TARGET`
172
+ - `STALE_REFERENCE`
173
+ - `UNKNOWN`
174
+
175
+ ### 7.2 Rules
176
+
177
+ - `failure_code` MUST be used when a structured mapping exists
178
+ - `failure_code` MUST NOT be replaced by string errors
179
+ - string errors MAY exist for diagnostics only
180
+ - not all tools must emit all codes
181
+
182
+ ### 7.3 Scope
183
+
184
+ Applies to:
185
+
186
+ - action tools
187
+ - verification tools
188
+ - `wait_for_ui`-style tools
189
+
190
+ ## 8. Response Shape
191
+
192
+ ### 8.1 Default
193
+
194
+ All responses MUST be a single JSON text block.
195
+
196
+ ### 8.2 Allowed Exceptions
197
+
198
+ Multi-block responses are allowed only for:
199
+
200
+ - `get_logs`
201
+ - `capture_screenshot`
202
+ - `build_and_install`
203
+
204
+ ### 8.3 Errors
205
+
206
+ All handler/runtime errors MUST be JSON-wrapped.
207
+
208
+ String-only errors are not allowed, including fallback handler errors.
209
+
210
+ Note: string diagnostics may still appear inside structured JSON payloads where explicitly defined by a tool.
211
+
212
+ ## 9. Classification
213
+
214
+ Tool: `classify_action_outcome`
215
+
216
+ Rules:
217
+
218
+ - MAY use UI, network, and log signals
219
+ - MUST be deterministic
220
+ - MUST NOT replace `expect_*` tools
221
+ - MUST be treated as a supplementary signal only
222
+ - SHOULD be used with `get_network_activity` when the expected outcome is backend/API activity without a visible UI change
223
+
224
+ `classify_action_outcome` is not a verification mechanism.
225
+
226
+ ## 10. Execution Patterns
227
+
228
+ Canonical pattern:
229
+
230
+ `wait_for_ui -> tap_element -> wait_for_screen_change (optional) -> expect_screen`
231
+
232
+ Interpretation:
233
+
234
+ - `tap_element.success` = executed
235
+ - `wait_for_screen_change.success` = UI changed
236
+ - `expect_screen.success` = correct outcome verified
237
+
238
+ ## 11. Known Deviations
239
+
240
+ Explicitly allowed:
241
+
242
+ - `install_app`, `terminate_app`, `reset_app_data` not using the action envelope
243
+ - `build_and_install` streaming NDJSON
244
+ - platform-specific tools
245
+ - partial `failure_code` coverage
246
+ - `scroll_to_element` outcome-based success (temporary exception)
247
+ - extended runtime fields in `list_devices`
248
+
249
+ ## 12. Migration Rules
250
+
251
+ Must change now:
252
+
253
+ - uncaught errors must be JSON-wrapped
254
+
255
+ Should align when touched:
256
+
257
+ - `tap`, `swipe`, `type_text`, `press_back`
258
+ - `start_app`, `restart_app`
259
+ - `scroll_to_element`
260
+ - `wait_for_ui`
261
+
262
+ No change required:
263
+
264
+ - `tap_element`
265
+ - `expect_screen`
266
+ - `expect_element_visible`
267
+ - `wait_for_screen_change`
268
+
269
+ ## 13. Guiding Principles
270
+
271
+ - Actions execute
272
+ - Verification proves
273
+ - Waiting synchronizes
274
+ - Classification assists
275
+
276
+ ## Final Definition
277
+
278
+ Action success equals execution success.
279
+ Outcome success equals verification success.
280
+
281
+ Verification tools are authoritative when the expected outcome is defined.
@@ -33,7 +33,7 @@ Example response:
33
33
  ```json
34
34
  {
35
35
  "action_id": "tap_1710000000000_1",
36
- "timestamp": 1710000000000,
36
+ "timestamp": "2026-04-23T08:00:00.000Z",
37
37
  "action_type": "tap",
38
38
  "target": { "selector": { "x": 100, "y": 200 }, "resolved": null },
39
39
  "success": true,
@@ -53,6 +53,10 @@ Preferred verification:
53
53
 
54
54
  - navigation outcome known -> `expect_screen`
55
55
  - local UI change known -> `expect_element_visible`
56
+ - backend/API activity expected -> `classify_action_outcome` + `get_network_activity`
57
+
58
+ Use `wait_for_screen_change` only when a visible transition is the expected outcome. If a button should trigger an API request but the screen should stay the same, rely on network activity and classification instead.
59
+ For backend-only actions, prefer comparing `get_screen_fingerprint` before/after and calling `get_network_activity` immediately after the action; do not wait on `wait_for_screen_change` if no visible transition is expected.
56
60
 
57
61
  ---
58
62
 
@@ -139,6 +143,7 @@ Notes:
139
143
  - Treats `null` fingerprints as transient and keeps polling.
140
144
  - Adds a stability confirmation before returning success to avoid transient animation frames.
141
145
  - Follow with `expect_screen` when the expected destination is known.
146
+ - Do not use this as the main success check for backend/API activity that does not change the visible UI.
142
147
 
143
148
  ---
144
149
 
@@ -303,7 +308,7 @@ Success response:
303
308
  ```json
304
309
  {
305
310
  "action_id": "tap_element_1710000000000_1",
306
- "timestamp": 1710000000000,
311
+ "timestamp": "2026-04-23T08:00:00.000Z",
307
312
  "action_type": "tap_element",
308
313
  "target": {
309
314
  "selector": { "elementId": "el_123" },
@@ -328,7 +333,7 @@ Failure response:
328
333
  ```json
329
334
  {
330
335
  "action_id": "tap_element_1710000000001_2",
331
- "timestamp": 1710000000001,
336
+ "timestamp": "2026-04-23T08:00:00.001Z",
332
337
  "action_type": "tap_element",
333
338
  "target": { "selector": { "elementId": "el_123" }, "resolved": null },
334
339
  "success": false,
@@ -451,3 +456,22 @@ Notes:
451
456
  - The tool resolves the selector internally when needed.
452
457
  - On failure, `reason` and `observed` tell you whether the selector was missing entirely or present but not yet visible.
453
458
  - Use when the screen should remain on the same destination but a specific element should appear or become visible.
459
+
460
+ ---
461
+
462
+ ## classify_action_outcome + get_network_activity
463
+
464
+ Use this pair when the action is expected to trigger network/backend work and the screen may not visibly change.
465
+
466
+ Pattern:
467
+
468
+ 1. perform the action
469
+ 2. call `classify_action_outcome` with `uiChanged` from a `get_screen_fingerprint` before/after comparison (or from `wait_for_screen_change` when a visible transition is also expected)
470
+ 3. if the classifier asks for it, call `get_network_activity`
471
+ 4. call `classify_action_outcome` again with `networkRequests`
472
+
473
+ Guidance:
474
+
475
+ - `uiChanged=true` or `expectedElementVisible=true` means the action outcome is already verified
476
+ - `nextAction="call_get_network_activity"` means the UI signal was inconclusive and the agent should inspect network activity
477
+ - if network requests succeed but the UI stays unchanged, treat the outcome as a backend/API result rather than a screen transition
@@ -121,7 +121,7 @@ start_app response example:
121
121
  ```json
122
122
  {
123
123
  "action_id": "start_app_1710000000000_1",
124
- "timestamp": 1710000000000,
124
+ "timestamp": "2026-04-23T08:00:00.000Z",
125
125
  "action_type": "start_app",
126
126
  "device": { "platform": "android", "id": "emulator-5554", "osVersion": "14", "model": "Pixel", "simulator": true },
127
127
  "target": { "selector": { "appId": "com.example.app" }, "resolved": null },
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mobile-debug-mcp",
3
- "version": "0.24.2",
3
+ "version": "0.24.4",
4
4
  "description": "MCP server for mobile app debugging (Android + iOS), with focus on security and reliability",
5
5
  "type": "module",
6
6
  "bin": {
@@ -146,7 +146,7 @@ export class ToolsInteract {
146
146
 
147
147
  private static _actionFailure(
148
148
  actionId: string,
149
- timestamp: number,
149
+ timestamp: string,
150
150
  actionType: string,
151
151
  selector: Record<string, unknown> | null,
152
152
  resolved: ActionTargetResolved | null,
@@ -254,9 +254,10 @@ export class ToolsInteract {
254
254
  }
255
255
 
256
256
  static async tapElementHandler({ elementId }: { elementId: string }): Promise<TapElementResponse> {
257
- const timestamp = Date.now()
257
+ const timestampMs = Date.now()
258
+ const timestamp = new Date(timestampMs).toISOString()
258
259
  const actionType = 'tap_element'
259
- const actionId = nextActionId(actionType, timestamp)
260
+ const actionId = nextActionId(actionType, timestampMs)
260
261
  const selector = { elementId }
261
262
  const resolved = ToolsInteract._resolvedUiElements.get(elementId)
262
263
  if (!resolved) {
@@ -304,6 +305,7 @@ export class ToolsInteract {
304
305
  action_id: actionId,
305
306
  timestamp,
306
307
  action_type: actionType,
308
+ ...(tree?.device ? { device: tree.device } : {}),
307
309
  target: {
308
310
  selector,
309
311
  resolved: resolvedTarget
@@ -82,9 +82,10 @@ export function buildActionExecutionResult({
82
82
  failure?: { failureCode: ActionFailureCode; retryable: boolean }
83
83
  details?: Record<string, unknown>
84
84
  }): ActionExecutionResult {
85
- const timestamp = Date.now()
85
+ const timestampMs = Date.now()
86
+ const timestamp = new Date(timestampMs).toISOString()
86
87
  return {
87
- action_id: nextActionId(actionType, timestamp),
88
+ action_id: nextActionId(actionType, timestampMs),
88
89
  timestamp,
89
90
  action_type: actionType,
90
91
  ...(device ? { device } : {}),
@@ -99,3 +100,28 @@ export function buildActionExecutionResult({
99
100
  ...(details ? { details } : {})
100
101
  }
101
102
  }
103
+
104
+ export function wrapToolError(name: string, error: unknown) { // Wraps any thrown value in a single JSON text-block tool response of the form { error: { tool, message } }.
105
+ const message = error instanceof Error // Error instances contribute only their .message (no stack, no name).
106
+ ? error.message
107
+ : typeof error === 'object' && error !== null // Non-Error objects are pretty-printed as JSON so their fields stay readable.
108
+ ? (() => { // IIFE so the try/catch can live inside the conditional expression.
109
+ try {
110
+ return JSON.stringify(error, null, 2)
111
+ } catch { // JSON.stringify throws on circular references and BigInt values; fall back to a fixed placeholder.
112
+ return '[unserializable error object]'
113
+ }
114
+ })()
115
+ : String(error) // Primitives (string, number, null, undefined, ...) are coerced with String().
116
+ return { // NOTE(review): no isError flag is set on this response — confirm whether MCP clients should be told the call failed.
117
+ content: [{
118
+ type: 'text' as const,
119
+ text: JSON.stringify({ // The payload itself is serialized to JSON, so the text block always parses as JSON.
120
+ error: {
121
+ tool: name, // Name of the tool as passed in by the caller.
122
+ message
123
+ }
124
+ }, null, 2)
125
+ }]
126
+ }
127
+ }
@@ -10,7 +10,7 @@ Inputs:
10
10
  - deviceId (optional)
11
11
 
12
12
  Output Structure:
13
- - action_id, timestamp, action_type
13
+ - action_id, timestamp (ISO 8601), action_type
14
14
  - target.selector = { appId }
15
15
  - success = true when launch was dispatched successfully
16
16
  - failure_code/retryable when launch dispatch fails
@@ -83,7 +83,7 @@ Inputs:
83
83
  - deviceId (optional)
84
84
 
85
85
  Output Structure:
86
- - action_id, timestamp, action_type
86
+ - action_id, timestamp (ISO 8601), action_type
87
87
  - target.selector = { appId }
88
88
  - success = true when the restart command completed
89
89
  - failure_code/retryable when restart dispatch fails
@@ -344,6 +344,7 @@ Capabilities:
344
344
  Constraints:
345
345
  - Does not verify correctness of the resulting state
346
346
  - Must not be used alone to confirm action success when an applicable expect_* tool exists
347
+ - Use classify_action_outcome + get_network_activity when the expected outcome is backend/API activity without a visible UI change
347
348
 
348
349
  Recommended Usage:
349
350
  1. Capture or define the expected outcome
@@ -532,7 +533,7 @@ Inputs:
532
533
  - deviceId (optional)
533
534
 
534
535
  Output Structure:
535
- - action_id, timestamp, action_type
536
+ - action_id, timestamp (ISO 8601), action_type
536
537
  - target.selector = { x, y }
537
538
  - success = true when the tap was dispatched
538
539
  - failure_code/retryable when dispatch fails
@@ -587,7 +588,7 @@ Inputs:
587
588
 
588
589
  Output Structure:
589
590
  - action_id: unique timestamp-based action identifier
590
- - timestamp: epoch milliseconds for the action attempt
591
+ - timestamp: ISO 8601 timestamp for the action attempt
591
592
  - action_type: "tap_element"
592
593
  - target.selector: original target handle ({ elementId })
593
594
  - target.resolved: minimal resolved element info used for the tap
@@ -640,7 +641,7 @@ Inputs:
640
641
  - platform/deviceId (optional)
641
642
 
642
643
  Output Structure:
643
- - action_id, timestamp, action_type
644
+ - action_id, timestamp (ISO 8601), action_type
644
645
  - target.selector = { x1, y1, x2, y2, duration }
645
646
  - success = true when the swipe was dispatched
646
647
  - failure_code/retryable when dispatch fails
@@ -692,7 +693,7 @@ Inputs:
692
693
  - direction, maxScrolls, scrollAmount, deviceId (optional)
693
694
 
694
695
  Output Structure:
695
- - action_id, timestamp, action_type
696
+ - action_id, timestamp (ISO 8601), action_type
696
697
  - target.selector = original selector
697
698
  - target.resolved = minimal resolved element info when found
698
699
  - success = true when scrolling produced a visible target element
@@ -746,7 +747,7 @@ Inputs:
746
747
  - platform/deviceId (optional)
747
748
 
748
749
  Output Structure:
749
- - action_id, timestamp, action_type
750
+ - action_id, timestamp (ISO 8601), action_type
750
751
  - target.selector = { text }
751
752
  - success = true when text input was dispatched
752
753
  - failure_code/retryable when dispatch fails
@@ -795,7 +796,7 @@ Inputs:
795
796
  - platform/deviceId (optional)
796
797
 
797
798
  Output Structure:
798
- - action_id, timestamp, action_type
799
+ - action_id, timestamp (ISO 8601), action_type
799
800
  - target.selector = { key: "back" }
800
801
  - success = true when the back action was dispatched
801
802
  - failure_code/retryable when dispatch fails
@@ -835,6 +836,8 @@ Failure Handling:
835
836
  description: `Classify the outcome of the most recent action into exactly one of: success, no_op, backend_failure, ui_failure, unknown.
836
837
 
837
838
  MUST be called after every action (tap, swipe, type_text, press_back, start_app, etc). Never skip.
839
+ Use this with get_network_activity when the expected outcome is backend/API activity without a visible UI change.
840
+ For backend/API activity, compare get_screen_fingerprint before and after the action and call get_network_activity immediately after the action instead of waiting for wait_for_screen_change.
838
841
 
839
842
  HOW TO GATHER INPUTS before calling:
840
843
  1. Call wait_for_screen_change or compare get_screen_fingerprint before/after — set uiChanged accordingly.
@@ -868,7 +871,7 @@ BEHAVIOUR after outcome:
868
871
  },
869
872
  networkRequests: {
870
873
  type: 'array',
871
- description: 'Pass this only after calling get_network_activity as instructed by nextAction. Map each request to endpoint + status.',
874
+ description: 'Pass this only after calling get_network_activity as instructed by nextAction. Also use it when the expected outcome is backend/API activity without a visible UI change.',
872
875
  items: {
873
876
  type: 'object',
874
877
  properties: {
@@ -890,7 +893,7 @@ BEHAVIOUR after outcome:
890
893
  name: 'get_network_activity',
891
894
  description: `Returns structured network events captured from platform logs since the last action.
892
895
 
893
- Call this only when classify_action_outcome returns nextAction="call_get_network_activity".
896
+ Call this when classify_action_outcome returns nextAction="call_get_network_activity" or immediately after an action whose expected outcome is backend/API activity without a visible UI change.
894
897
  Do not call more than once per action.
895
898
 
896
899
  Events are filtered to significant (non-background) requests only.
@@ -16,7 +16,8 @@ import {
16
16
  inferScrollFailure,
17
17
  ToolCallArgs,
18
18
  ToolHandler,
19
- wrapResponse
19
+ wrapResponse,
20
+ wrapToolError
20
21
  } from './common.js'
21
22
 
22
23
  async function handleStartApp(args: ToolCallArgs) {
@@ -375,8 +376,7 @@ export async function handleToolCall(name: string, args: ToolCallArgs = {}) {
375
376
  try {
376
377
  return await handler(args)
377
378
  } catch (error) {
378
- return {
379
- content: [{ type: 'text' as const, text: `Error executing tool ${name}: ${error instanceof Error ? error.message : String(error)}` }]
380
- }
379
+ console.error(`Error executing tool ${name}:`, error)
380
+ return wrapToolError(name, error)
381
381
  }
382
382
  }
package/src/types.ts CHANGED
@@ -173,7 +173,7 @@ export interface ActionTargetResolved {
173
173
 
174
174
  export interface ActionExecutionResult {
175
175
  action_id: string;
176
- timestamp: number;
176
+ timestamp: string;
177
177
  action_type: string;
178
178
  device?: DeviceInfo;
179
179
  target: {
@@ -26,6 +26,7 @@ async function run() {
26
26
  assert(waitForScreenChange, 'wait_for_screen_change should be registered')
27
27
  assert.match((waitForScreenChange as any).description, /does not verify correctness of the resulting state/i)
28
28
  assert.match((waitForScreenChange as any).description, /follow with expect_screen/i)
29
+ assert.match((waitForScreenChange as any).description, /backend\/API activity without a visible UI change/i)
29
30
 
30
31
  const captureDebugSnapshot = toolDefinitions.find((tool) => tool.name === 'capture_debug_snapshot')
31
32
  assert(captureDebugSnapshot, 'capture_debug_snapshot should be registered')
@@ -60,6 +61,18 @@ async function run() {
60
61
  assert.match((expectElementVisible as any).description, /selector is the primary input/i)
61
62
  assert.match((expectElementVisible as any).description, /Returns structured binary success\/failure only/i)
62
63
 
64
+ const classifyActionOutcome = toolDefinitions.find((tool) => tool.name === 'classify_action_outcome')
65
+ assert(classifyActionOutcome, 'classify_action_outcome should be registered')
66
+ assert.match((classifyActionOutcome as any).description, /backend\/API activity without a visible UI change/i)
67
+ assert.match((classifyActionOutcome as any).description, /get_network_activity/i)
68
+ assert.match((classifyActionOutcome as any).description, /immediately after the action/i)
69
+
70
+ const getNetworkActivity = toolDefinitions.find((tool) => tool.name === 'get_network_activity')
71
+ assert(getNetworkActivity, 'get_network_activity should be registered')
72
+ assert.match((getNetworkActivity as any).description, /backend\/API activity without a visible UI change/i)
73
+ assert.doesNotMatch((getNetworkActivity as any).description, /Call this only when/i)
74
+ assert.match((getNetworkActivity as any).description, /immediately after an action/i)
75
+
63
76
  await assert.rejects(() => handleToolCall('unknown_tool'), /Unknown tool: unknown_tool/)
64
77
 
65
78
  console.log('server contract tests passed')
@@ -47,7 +47,7 @@ async function run() {
47
47
 
48
48
  ;(ToolsInteract as any).tapElementHandler = async () => ({
49
49
  action_id: 'tap_element_1',
50
- timestamp: 1234567890,
50
+ timestamp: '2026-04-23T08:00:00.000Z',
51
51
  action_type: 'tap_element',
52
52
  target: {
53
53
  selector: { elementId: 'el_ready' },
@@ -62,6 +62,7 @@ async function run() {
62
62
  const tapElementPayload = JSON.parse((tapElementResponse as any).content[0].text)
63
63
  assert.strictEqual(tapElementPayload.success, true)
64
64
  assert.strictEqual(tapElementPayload.action_type, 'tap_element')
65
+ assert.match(tapElementPayload.timestamp, /^\d{4}-\d{2}-\d{2}T/)
65
66
  assert.strictEqual(tapElementPayload.target.resolved.elementId, 'el_ready')
66
67
  assert.strictEqual(tapElementPayload.ui_fingerprint_before, 'fp_before')
67
68
 
@@ -71,6 +72,7 @@ async function run() {
71
72
  const tapPayload = JSON.parse((tapResponse as any).content[0].text)
72
73
  assert.strictEqual(tapPayload.success, true)
73
74
  assert.strictEqual(tapPayload.action_type, 'tap')
75
+ assert.match(tapPayload.timestamp, /^\d{4}-\d{2}-\d{2}T/)
74
76
  assert.deepStrictEqual(tapPayload.target.selector, { x: 1, y: 2 })
75
77
  assert.strictEqual(tapPayload.ui_fingerprint_before, 'fp_mock')
76
78
 
@@ -93,6 +95,7 @@ async function run() {
93
95
  const startAppPayload = JSON.parse((startAppResponse as any).content[0].text)
94
96
  assert.strictEqual(startAppPayload.success, true)
95
97
  assert.strictEqual(startAppPayload.action_type, 'start_app')
98
+ assert.match(startAppPayload.timestamp, /^\d{4}-\d{2}-\d{2}T/)
96
99
  assert.strictEqual(startAppPayload.device.id, 'emulator-5554')
97
100
  assert.deepStrictEqual(startAppPayload.target.selector, { appId: 'com.example.app' })
98
101
  assert.strictEqual(startAppPayload.details.launch_time_ms, 123)
@@ -128,6 +131,30 @@ async function run() {
128
131
  assert.strictEqual(expectElementPayload.element_id, 'el_ready')
129
132
  assert.strictEqual(expectElementPayload.expected_condition, 'visible')
130
133
 
134
+ ;(ToolsInteract as any).tapHandler = async () => {
135
+ throw new Error('boom')
136
+ }
137
+
138
+ const failingTapResponse = await handleToolCall('tap', { platform: 'android', x: 1, y: 2 })
139
+ assert.strictEqual((failingTapResponse as any).content.length, 1)
140
+ const failingTapPayload = JSON.parse((failingTapResponse as any).content[0].text)
141
+ assert.deepStrictEqual(failingTapPayload, {
142
+ error: {
143
+ tool: 'tap',
144
+ message: 'boom'
145
+ }
146
+ })
147
+
148
+ ;(ToolsInteract as any).tapHandler = async () => {
149
+ throw { code: 'E_CUSTOM', detail: { field: 'value' } }
150
+ }
151
+
152
+ const objectTapResponse = await handleToolCall('tap', { platform: 'android', x: 1, y: 2 })
153
+ const objectTapPayload = JSON.parse((objectTapResponse as any).content[0].text)
154
+ assert.strictEqual(objectTapPayload.error.tool, 'tap')
155
+ assert.match(objectTapPayload.error.message, /"code": "E_CUSTOM"/)
156
+ assert.match(objectTapPayload.error.message, /"field": "value"/)
157
+
131
158
  ;(ToolsObserve as any).captureScreenshotHandler = async () => ({
132
159
  device: { platform: 'ios', id: 'booted', osVersion: '18.0', model: 'Simulator', simulator: true },
133
160
  screenshot: Buffer.from('png-data').toString('base64'),