mobile-debug-mcp 0.24.3 → 0.24.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,111 @@ import { AndroidObserve } from './android.js';
3
3
  import { iOSObserve } from './ios.js';
4
4
  export { AndroidObserve } from './android.js';
5
5
  export { iOSObserve } from './ios.js';
6
// Log text that indicates an error condition ("fatal exception" is already covered by "exception").
const SNAPSHOT_ERROR_LOG_PATTERN = /error|exception|failed/i;
// On-screen text that suggests content is still loading.
const SNAPSHOT_LOADING_TEXT_PATTERN = /loading|please wait|spinner|progress/i;
// On-screen text typical of a primary, recognisable screen.
const SNAPSHOT_PRIMARY_TEXT_PATTERN = /sign[ -]?in|log[ -]?in|home|checkout|settings|menu|profile|search/i;

/**
 * Normalizes an arbitrary value into a comparable hint string.
 *
 * @param {unknown} value - Any value; null/undefined become the empty string.
 * @returns {string} Trimmed, whitespace-collapsed, lower-cased string form of `value`.
 */
function normalizeHint(value) {
  if (value === null || value === undefined) {
    return '';
  }
  return String(value).trim().replace(/\s+/g, ' ').toLowerCase();
}

/**
 * Converts a label to Title Case, treating underscores and hyphens as spaces.
 *
 * Only a letter that starts a word is upper-cased. A letter that follows another
 * letter, a digit or an apostrophe is left alone, so "don't" becomes "Don't" and
 * "école" becomes "École" (the ASCII-only `\b\w` approach mangles both).
 *
 * @param {string} value - Label to convert.
 * @returns {string} Title-cased label with collapsed whitespace.
 */
function titleCase(value) {
  return value
    .replace(/[_-]+/g, ' ')
    .replace(/\s+/g, ' ')
    .trim()
    .replace(/(^|[^\p{L}\p{N}'\u2019])(\p{L})/gu, (_match, lead, letter) => `${lead}${letter.toUpperCase()}`);
}

/**
 * Derives a short, human-readable screen name from an activity identifier.
 *
 * Takes the last segment of a dotted name (also splitting on "/" so Android
 * component notation such as "com.app/.ui.MainActivity" works) and drops a
 * trailing "Activity" suffix when something remains afterwards.
 *
 * @param {string | null | undefined} activity - Activity identifier.
 * @returns {string | null} Title-cased short name, or null when `activity` is empty.
 */
function shortActivityName(activity) {
  if (!activity) {
    return null;
  }
  const trimmed = String(activity).trim();
  if (!trimmed) {
    return null;
  }
  const lastSegment = trimmed.split(/[./]/).pop() || trimmed;
  const withoutSuffix = lastSegment.replace(/Activity$/, '');
  return titleCase(withoutSuffix || lastSegment);
}

/**
 * Collects the unique normalized labels found in a UI tree.
 *
 * For each element the first non-empty candidate among text, content description,
 * accessibility label, resource id and id is used. Empty strings are skipped so an
 * icon button with `text: ''` still contributes its content description.
 *
 * @param {{ elements?: Array<object | null> } | null | undefined} tree - UI tree.
 * @returns {{ texts: string[], actionables: string[] }} All labels, and the labels of
 *   elements that are clickable and not explicitly disabled. Both lists are de-duplicated
 *   and keep first-seen order.
 */
function collectSnapshotTexts(tree) {
  const elements = Array.isArray(tree?.elements) ? tree.elements : [];
  const texts = new Set();
  const actionables = new Set();
  for (const element of elements) {
    const candidates = [
      element?.text,
      element?.contentDescription,
      element?.contentDesc,
      element?.accessibilityLabel,
      element?.resourceId,
      element?.id,
    ];
    let text = '';
    for (const candidate of candidates) {
      text = normalizeHint(candidate);
      if (text) {
        break;
      }
    }
    if (!text) {
      continue;
    }
    texts.add(text);
    if (element?.clickable && element?.enabled !== false) {
      actionables.add(text);
    }
  }
  return {
    texts: [...texts],
    actionables: [...actionables],
  };
}

/**
 * Infers a display name for the current screen from a raw snapshot.
 *
 * Preference order: `ui_tree.screen`, then the short activity name, then the
 * first label found in the UI tree.
 *
 * @param {object} raw - Raw snapshot; `ui_tree` and `activity` are read.
 * @param {string[]} [knownTexts] - Labels already collected from `raw.ui_tree`;
 *   when omitted they are collected on demand.
 * @returns {string | null} Title-cased screen name, or null when nothing is available.
 */
function inferSnapshotScreen(raw, knownTexts) {
  const tree = raw.ui_tree;
  const treeScreen = normalizeHint(tree?.screen);
  if (treeScreen) {
    return titleCase(treeScreen);
  }
  const activity = shortActivityName(raw.activity);
  if (activity) {
    return activity;
  }
  const texts = knownTexts ?? collectSnapshotTexts(tree).texts;
  return texts.length > 0 ? titleCase(texts[0]) : null;
}

/**
 * Reduces a screen/activity hint to a key suitable for equality checks:
 * lower-cased, letters and digits only, without a trailing "activity".
 *
 * @param {unknown} value - Screen or activity hint.
 * @returns {string} Comparison key (may be empty).
 */
function screenHintKey(value) {
  const key = normalizeHint(value).replace(/[^\p{L}\p{N}]+/gu, '');
  return key.replace(/activity$/, '') || key;
}

/**
 * Derives the best-effort semantic layer from a raw debug snapshot.
 *
 * @param {object} raw - Raw snapshot; `ui_tree`, `activity`, `screenshot` and `logs` are read.
 * @returns {{ screen: string | null, signals: Record<string, boolean>,
 *   actions_available: string[] | null, confidence: number, warnings: string[] } | null}
 *   The semantic layer, or null when there is too little evidence to derive one.
 */
function deriveSnapshotSemantic(raw) {
  const tree = raw.ui_tree;
  const { texts, actionables } = collectSnapshotTexts(tree);
  const screenFromTree = normalizeHint(tree?.screen);
  const activityHint = normalizeHint(raw.activity);
  const screen = inferSnapshotScreen(raw, texts);
  // Tolerate a missing or malformed log list instead of throwing.
  const logs = Array.isArray(raw.logs) ? raw.logs : [];
  if (!screen && !activityHint && texts.length === 0 && logs.length === 0) {
    return null;
  }
  const hasErrorLogs = logs.some((entry) => {
    const message = typeof entry === 'string' ? entry : String(entry?.message ?? '');
    return SNAPSHOT_ERROR_LOG_PATTERN.test(message);
  });
  const hasLoadingSignals = texts.some((text) => SNAPSHOT_LOADING_TEXT_PATTERN.test(text));
  const hasPrimaryText = texts.some((text) => SNAPSHOT_PRIMARY_TEXT_PATTERN.test(text));
  const hasScreenshot = typeof raw.screenshot === 'string' && raw.screenshot.length > 0;
  const hasUiTree = !!tree && Array.isArray(tree.elements);
  const signals = {
    has_activity: !!activityHint,
    has_ui_tree: hasUiTree,
    has_screenshot: hasScreenshot,
    has_visible_text: texts.length > 0,
    has_clickable_elements: actionables.length > 0,
    has_error_logs: hasErrorLogs,
    has_loading_signals: hasLoadingSignals,
    has_primary_text: hasPrimaryText,
  };
  const warnings = [];
  // Compare short names: a fully-qualified activity ("com.shop.CheckoutActivity")
  // never equals a screen label ("Checkout") verbatim, which would warn every time.
  if (screenFromTree && activityHint &&
    screenHintKey(screenFromTree) !== screenHintKey(shortActivityName(raw.activity))) {
    warnings.push('ui_tree.screen and activity hints differ');
  }
  if (!hasUiTree) {
    warnings.push('ui tree unavailable');
  }
  if (!activityHint) {
    warnings.push('activity unavailable');
  }
  if (hasErrorLogs) {
    warnings.push('error signals present in logs');
  }
  const evidenceScore = (hasUiTree ? 0.35 : 0) +
    (screen ? 0.2 : 0) +
    (activityHint ? 0.15 : 0) +
    (actionables.length > 0 ? 0.15 : 0) +
    (texts.length > 0 ? 0.1 : 0) +
    (hasScreenshot ? 0.05 : 0) +
    (hasErrorLogs ? -0.15 : 0) +
    (hasLoadingSignals ? -0.05 : 0);
  // Round before clamping so floating-point noise (e.g. 1.0000000000000002) cannot leak out.
  const confidence = Math.max(0, Math.min(1, Number(evidenceScore.toFixed(2))));
  if (!screen && confidence < 0.3) {
    return null;
  }
  return {
    screen,
    signals,
    actions_available: actionables.length > 0 ? actionables.slice(0, 10) : null,
    confidence,
    warnings,
  };
}
6
111
  export class ToolsObserve {
7
112
  // Resolve a target device and return the appropriate observe instance and resolved info.
8
113
  static async resolveObserve(platform, deviceId, appId) {
@@ -95,7 +200,7 @@ export class ToolsObserve {
95
200
  }
96
201
  static async captureDebugSnapshotHandler({ reason, includeLogs = true, logLines = 200, platform, appId, deviceId, sessionId } = {}) {
97
202
  const timestamp = Date.now();
98
- const out = { timestamp, reason: reason || '', activity: null, fingerprint: null, screenshot: null, ui_tree: null, logs: [] };
203
+ const raw = { timestamp, reason: reason || '', activity: null, fingerprint: null, screenshot: null, ui_tree: null, logs: [] };
99
204
  // Parallel fetches for performance: screenshot, current screen, fingerprint, ui tree, and log stream/get logs
100
205
  const sid = sessionId || 'default';
101
206
  const tasks = {
@@ -114,59 +219,59 @@ export class ToolsObserve {
114
219
  if (res.status === 'fulfilled') {
115
220
  const val = res.value;
116
221
  if (key === 'screenshot') {
117
- out.screenshot = val && val.screenshot ? val.screenshot : null;
222
+ raw.screenshot = val && val.screenshot ? val.screenshot : null;
118
223
  }
119
224
  else if (key === 'currentScreen') {
120
- out.activity = val && ((val.activity || val.shortActivity)) ? (val.activity || val.shortActivity) : out.activity || '';
225
+ raw.activity = val && ((val.activity || val.shortActivity)) ? (val.activity || val.shortActivity) : raw.activity || '';
121
226
  }
122
227
  else if (key === 'fingerprint') {
123
228
  if (val && val.fingerprint)
124
- out.fingerprint = val.fingerprint;
229
+ raw.fingerprint = val.fingerprint;
125
230
  if (val && val.activity)
126
- out.activity = out.activity || val.activity;
231
+ raw.activity = raw.activity || val.activity;
127
232
  if (val && val.error)
128
- out.fingerprint_error = val.error;
233
+ raw.fingerprint_error = val.error;
129
234
  }
130
235
  else if (key === 'uiTree') {
131
- out.ui_tree = val;
236
+ raw.ui_tree = val;
132
237
  if (val && val.error)
133
- out.ui_tree_error = val.error;
238
+ raw.ui_tree_error = val.error;
134
239
  }
135
240
  else if (key === 'readLogStream') {
136
241
  // handle below after evaluating fallback
137
242
  // stream entries are staged directly on raw.logs (the temporary _streamEntries field no longer exists)
138
- out._streamEntries = val && val.entries ? val.entries : [];
243
+ raw.logs = Array.isArray(val?.entries) ? val.entries : [];
139
244
  }
140
245
  }
141
246
  else {
142
247
  const errMsg = res.reason instanceof Error ? res.reason.message : String(res.reason);
143
248
  if (key === 'screenshot')
144
- out.screenshot_error = errMsg;
249
+ raw.screenshot_error = errMsg;
145
250
  if (key === 'currentScreen')
146
- out.activity_error = errMsg;
251
+ raw.activity_error = errMsg;
147
252
  if (key === 'fingerprint') {
148
- out.fingerprint = null;
149
- out.fingerprint_error = errMsg;
253
+ raw.fingerprint = null;
254
+ raw.fingerprint_error = errMsg;
150
255
  }
151
256
  if (key === 'uiTree') {
152
- out.ui_tree = null;
153
- out.ui_tree_error = errMsg;
257
+ raw.ui_tree = null;
258
+ raw.ui_tree_error = errMsg;
154
259
  }
155
260
  if (key === 'readLogStream') {
156
- out._streamEntries = [];
157
- out.logs_error = errMsg;
261
+ raw.logs = [];
262
+ raw.logs_error = errMsg;
158
263
  }
159
264
  }
160
265
  }
161
266
  // Logs: prefer stream entries, fallback to snapshot logs when empty
162
267
  if (includeLogs) {
163
268
  try {
164
- let entries = Array.isArray(out._streamEntries) ? out._streamEntries : [];
269
+ let entries = Array.isArray(raw.logs) ? raw.logs : [];
165
270
  if (!entries || entries.length === 0) {
166
271
  const gl = await ToolsObserve.getLogsHandler({ platform, appId, deviceId, lines: logLines });
167
- const raw = (gl && gl.logs) ? gl.logs : [];
272
+ const snapshotLogs = (gl && gl.logs) ? gl.logs : [];
168
273
  // snapshotLogs may be structured entries or strings
169
- entries = raw.slice(-Math.max(0, logLines)).map(item => {
274
+ entries = snapshotLogs.slice(-Math.max(0, logLines)).map(item => {
170
275
  if (!item)
171
276
  return { timestamp: null, level: 'INFO', message: '' };
172
277
  if (typeof item === 'string') {
@@ -196,15 +301,14 @@ export class ToolsObserve {
196
301
  return { timestamp: tsNum, level, message: msg };
197
302
  });
198
303
  }
199
- out.logs = entries;
304
+ raw.logs = entries;
200
305
  }
201
306
  catch (e) {
202
- out.logs = [];
203
- out.logs_error = e instanceof Error ? e.message : String(e);
307
+ raw.logs = [];
308
+ raw.logs_error = e instanceof Error ? e.message : String(e);
204
309
  }
205
310
  }
206
- // Clean up internal temporary field
207
- delete out._streamEntries;
208
- return out;
311
+ const semantic = deriveSnapshotSemantic(raw);
312
+ return semantic ? { raw, semantic } : { raw };
209
313
  }
210
314
  }
@@ -240,7 +240,7 @@ Failure Handling:
240
240
  },
241
241
  {
242
242
  name: 'capture_debug_snapshot',
243
- description: 'Capture a complete debug snapshot (screenshot, ui tree, activity, fingerprint, logs). Returns structured JSON.',
243
+ description: 'Capture a complete debug snapshot (raw observation layer plus optional derived semantic layer). Returns structured JSON.',
244
244
  inputSchema: {
245
245
  type: 'object',
246
246
  properties: {
@@ -344,6 +344,7 @@ Capabilities:
344
344
  Constraints:
345
345
  - Does not verify correctness of the resulting state
346
346
  - Must not be used alone to confirm action success when an applicable expect_* tool exists
347
+ - Use classify_action_outcome + get_network_activity when the expected outcome is backend/API activity without a visible UI change
347
348
 
348
349
  Recommended Usage:
349
350
  1. Capture or define the expected outcome
@@ -835,6 +836,8 @@ Failure Handling:
835
836
  description: `Classify the outcome of the most recent action into exactly one of: success, no_op, backend_failure, ui_failure, unknown.
836
837
 
837
838
  MUST be called after every action (tap, swipe, type_text, press_back, start_app, etc). Never skip.
839
+ Use this with get_network_activity when the expected outcome is backend/API activity without a visible UI change.
840
+ For backend/API activity, compare get_screen_fingerprint before and after the action and call get_network_activity immediately after the action instead of waiting for wait_for_screen_change.
838
841
 
839
842
  HOW TO GATHER INPUTS before calling:
840
843
  1. Call wait_for_screen_change or compare get_screen_fingerprint before/after — set uiChanged accordingly.
@@ -868,7 +871,7 @@ BEHAVIOUR after outcome:
868
871
  },
869
872
  networkRequests: {
870
873
  type: 'array',
871
- description: 'Pass this only after calling get_network_activity as instructed by nextAction. Map each request to endpoint + status.',
874
+ description: 'Pass this only after calling get_network_activity as instructed by nextAction. Also use it when the expected outcome is backend/API activity without a visible UI change.',
872
875
  items: {
873
876
  type: 'object',
874
877
  properties: {
@@ -890,7 +893,7 @@ BEHAVIOUR after outcome:
890
893
  name: 'get_network_activity',
891
894
  description: `Returns structured network events captured from platform logs since the last action.
892
895
 
893
- Call this only when classify_action_outcome returns nextAction="call_get_network_activity".
896
+ Call this when classify_action_outcome returns nextAction="call_get_network_activity" or immediately after an action whose expected outcome is backend/API activity without a visible UI change.
894
897
  Do not call more than once per action.
895
898
 
896
899
  Events are filtered to significant (non-background) requests only.
package/docs/CHANGELOG.md CHANGED
@@ -2,6 +2,12 @@
2
2
 
3
3
  All notable changes to the **Mobile Debug MCP** project will be documented in this file.
4
4
 
5
+ ## [0.24.5]
6
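+ - `capture_debug_snapshot` now returns a dual-layer payload: the authoritative `raw` observation data plus an optional derived `semantic` layer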
7
+
8
+ ## [0.24.4]
9
+ - Tool guidance now steers agents away from `wait_for_screen_change` when the expected outcome is backend/API activity without a visible UI change
10
+
5
11
  ## [0.24.3]
6
12
  - Improved output consistency
7
13
 
@@ -36,6 +36,14 @@ It does not apply to:
36
36
  - observation-only flows
37
37
  - non-verifiable or exploratory actions
38
38
 
39
+ Outcome-specific guidance:
40
+
41
+ - visible navigation expected -> `wait_for_screen_change` (optional) -> `expect_screen`
42
+ - local UI change expected -> `wait_for_ui` (optional) -> `expect_element_visible`
43
+ - backend/API activity expected without a visible UI change -> compare `get_screen_fingerprint` before/after, then call `get_network_activity` immediately after the action and `classify_action_outcome` with the observed requests
44
+
45
+ For backend/API activity, `wait_for_screen_change` is not the right verification tool unless a visible transition is also expected.
46
+
39
47
  ## 4. Action Tools
40
48
 
41
49
  ### 4.1 Definition
@@ -201,7 +209,78 @@ String-only errors are not allowed, including fallback handler errors.
201
209
 
202
210
  Note: string diagnostics may still appear inside structured JSON payloads where explicitly defined by a tool.
203
211
 
204
- ## 9. Classification
212
+ ## 9. Observation Tools (Extended Semantics)
213
+
214
+ Observation tools inspect application state without mutating it.
215
+
216
+ Examples:
217
+
218
+ - `capture_debug_snapshot`
219
+ - `get_screen_fingerprint`
220
+ - `get_network_activity`
221
+ - `get_logs`
222
+
223
+ ### 9.1 Snapshot Response Model
224
+
225
+ `capture_debug_snapshot` MUST return a dual-layer response:
226
+
227
+ - `raw`: required object
228
+ - `semantic`: optional object
229
+
230
+ The raw layer is authoritative and MUST remain unchanged from the underlying observation data. It is the source of truth and MUST NOT be interpreted or rewritten.
231
+
232
+ The semantic layer is derived, best-effort, and MUST be generated exclusively from the raw layer.
233
+
234
+ Raw layer contents include:
235
+
236
+ - UI hierarchy or accessibility tree
237
+ - screenshot when available
238
+ - element-level attributes
239
+ - logs and fingerprint/activity observations
240
+ - raw error fields when partial collection fails
241
+
242
+ Semantic layer shape when present:
243
+
244
+ ```ts
245
+ {
246
+ screen: string | null,
247
+ signals: Record<string, string | number | boolean> | null,
248
+ actions_available: string[] | null,
249
+ confidence: number,
250
+ warnings: string[]
251
+ }
252
+ ```
253
+
254
+ Rules:
255
+
256
+ - `confidence` MUST be between 0 and 1
257
+ - `warnings` MUST be present when `semantic` is present
258
+ - `semantic` MAY be omitted entirely when derivation is not reliable
259
+ - `semantic` MUST be treated as unreliable if it conflicts with raw data
260
+ - `actions_available` are hints only and MUST NOT be treated as guaranteed executable actions
261
+
262
+ ### 9.2 Agent Usage Contract
263
+
264
+ Agents SHOULD use `semantic` for primary decision-making when present.
265
+
266
+ Agents MUST fall back to `raw` when:
267
+
268
+ - `semantic` is missing
269
+ - `confidence < 0.7`
270
+ - `warnings` is non-empty
271
+ - semantic output conflicts with expected state or raw data
272
+
273
+ `semantic` is for planning only and MUST NOT be used for verification.
274
+
275
+ ### 9.3 Relationship to Classification
276
+
277
+ Semantic signals MAY be used as input to `classify_action_outcome`.
278
+
279
+ Semantic output MUST NOT replace classification or verification.
280
+
281
+ Classification remains a supplementary, post-action interpretation mechanism.
282
+
283
+ ## 10. Classification
205
284
 
206
285
  Tool: `classify_action_outcome`
207
286
 
@@ -211,10 +290,11 @@ Rules:
211
290
  - MUST be deterministic
212
291
  - MUST NOT replace `expect_*` tools
213
292
  - MUST be treated as a supplementary signal only
293
+ - SHOULD be used with `get_network_activity` when the expected outcome is backend/API activity without a visible UI change
214
294
 
215
295
  It is not a verification mechanism.
216
296
 
217
- ## 10. Execution Patterns
297
+ ## 11. Execution Patterns
218
298
 
219
299
  Canonical pattern:
220
300
 
@@ -226,7 +306,7 @@ Interpretation:
226
306
  - `wait_for_screen_change.success` = UI changed
227
307
  - `expect_screen.success` = correct outcome verified
228
308
 
229
- ## 11. Known Deviations
309
+ ## 12. Known Deviations
230
310
 
231
311
  Explicitly allowed:
232
312
 
@@ -237,7 +317,7 @@ Explicitly allowed:
237
317
  - `scroll_to_element` outcome-based success (temporary exception)
238
318
  - extended runtime fields in `list_devices`
239
319
 
240
- ## 12. Migration Rules
320
+ ## 13. Migration Rules
241
321
 
242
322
  Must change now:
243
323
 
@@ -249,6 +329,7 @@ Should align when touched:
249
329
  - `start_app`, `restart_app`
250
330
  - `scroll_to_element`
251
331
  - `wait_for_ui`
332
+ - `capture_debug_snapshot`
252
333
 
253
334
  No change required:
254
335
 
@@ -257,7 +338,7 @@ No change required:
257
338
  - `expect_element_visible`
258
339
  - `wait_for_screen_change`
259
340
 
260
- ## 13. Guiding Principles
341
+ ## 14. Guiding Principles
261
342
 
262
343
  - Actions execute
263
344
  - Verification proves
@@ -53,6 +53,10 @@ Preferred verification:
53
53
 
54
54
  - navigation outcome known -> `expect_screen`
55
55
  - local UI change known -> `expect_element_visible`
56
+ - backend/API activity expected -> `classify_action_outcome` + `get_network_activity`
57
+
58
+ Use `wait_for_screen_change` only when a visible transition is the expected outcome. If a button should trigger an API request but the screen should stay the same, rely on network activity and classification instead.
59
+ For backend-only actions, prefer comparing `get_screen_fingerprint` before/after and call `get_network_activity` immediately after the action; do not wait on `wait_for_screen_change` if no visible transition is expected.
56
60
 
57
61
  ---
58
62
 
@@ -139,6 +143,7 @@ Notes:
139
143
  - Treats `null` fingerprints as transient and keeps polling.
140
144
  - Adds a stability confirmation before returning success to avoid transient animation frames.
141
145
  - Follow with `expect_screen` when the expected destination is known.
146
+ - Do not use this as the main success check for backend/API activity that does not change the visible UI.
142
147
 
143
148
  ---
144
149
 
@@ -451,3 +456,22 @@ Notes:
451
456
  - The tool resolves the selector internally when needed.
452
457
  - On failure, `reason` and `observed` tell you whether the selector was missing entirely or present but not yet visible.
453
458
  - Use when the screen should remain on the same destination but a specific element should appear or become visible.
459
+
460
+ ---
461
+
462
+ ## classify_action_outcome + get_network_activity
463
+
464
+ Use this pair when the action is expected to trigger network/backend work and the screen may not visibly change.
465
+
466
+ Pattern:
467
+
468
+ 1. perform the action
469
+ 2. call `classify_action_outcome` with `uiChanged` from `wait_for_screen_change` or a screen fingerprint comparison
470
+ 3. if the classifier asks for it, call `get_network_activity`
471
+ 4. call `classify_action_outcome` again with `networkRequests`
472
+
473
+ Guidance:
474
+
475
+ - `uiChanged=true` or `expectedElementVisible=true` means the action outcome is already verified
476
+ - `nextAction="call_get_network_activity"` means the UI signal was inconclusive and the agent should inspect network activity
477
+ - if network requests succeed but the UI stays unchanged, treat the outcome as a backend/API result rather than a screen transition
@@ -132,24 +132,40 @@ Behavior:
132
132
  - Returns partial data when components fail and includes per-part error fields (e.g. `screenshot_error`, `ui_tree_error`).
133
133
  - Caps logs to `logLines` entries and prefers recent entries.
134
134
  - Fast by default: does not wait for new logs and avoids long blocking operations.
135
+ - Returns a dual-layer payload:
136
+ - `raw` is authoritative and contains the underlying observation data unchanged.
137
+ - `semantic` is optional, derived from `raw`, and intended for planning only.
135
138
 
136
139
  Response (example):
137
140
 
138
141
  ```json
139
142
  {
140
- "timestamp": 1710000000,
141
- "reason": "Crash after tapping checkout",
142
- "activity": "CheckoutActivity",
143
- "fingerprint": "abc123",
144
- "screenshot": "<base64 PNG string>",
145
- "ui_tree": { ... },
146
- "logs": [ { "timestamp": 1710000000, "level": "ERROR", "message": "NullPointerException at CheckoutViewModel" } ]
143
+ "raw": {
144
+ "timestamp": 1710000000,
145
+ "reason": "Crash after tapping checkout",
146
+ "activity": "CheckoutActivity",
147
+ "fingerprint": "abc123",
148
+ "screenshot": "<base64 PNG string>",
149
+ "ui_tree": { ... },
150
+ "logs": [ { "timestamp": 1710000000, "level": "ERROR", "message": "NullPointerException at CheckoutViewModel" } ]
151
+ },
152
+ "semantic": {
153
+ "screen": "Checkout",
154
+ "signals": {
155
+ "has_error_logs": true,
156
+ "has_clickable_elements": false
157
+ },
158
+ "actions_available": null,
159
+ "confidence": 0.7,
160
+ "warnings": ["error signals present in logs"]
161
+ }
147
162
  }
148
163
  ```
149
164
 
150
165
  Notes:
151
166
  - Useful immediately after detecting crashes or unexpected UI behaviour.
152
167
  - Do not expect perfect data during a crash; tool is designed to return best-effort context and include errors for failed parts.
168
+ - Treat `semantic` as planning guidance only; `raw` remains the source of truth.
153
169
 
154
170
  ---
155
171
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mobile-debug-mcp",
3
- "version": "0.24.3",
3
+ "version": "0.24.5",
4
4
  "description": "MCP server for mobile app debugging (Android + iOS), with focus on security and reliability",
5
5
  "type": "module",
6
6
  "bin": {
@@ -1,10 +1,146 @@
1
1
  import { resolveTargetDevice } from '../utils/resolve-device.js'
2
2
  import { AndroidObserve } from './android.js'
3
3
  import { iOSObserve } from './ios.js'
4
+ import type {
5
+ CaptureDebugSnapshotRawResponse,
6
+ SnapshotSemanticResponse
7
+ } from '../types.js'
4
8
 
5
9
  export { AndroidObserve } from './android.js'
6
10
  export { iOSObserve } from './ios.js'
7
11
 
12
// Loosely-typed view of a single UI tree element; only the fields read below matter.
interface SnapshotTreeElementLike {
  text?: string | null
  contentDescription?: string | null
  contentDesc?: string | null
  accessibilityLabel?: string | null
  resourceId?: string | null
  id?: string | null
  type?: string | null
  class?: string | null
  clickable?: boolean
  enabled?: boolean
  visible?: boolean
}

// Loosely-typed view of the UI tree attached to a raw snapshot.
interface SnapshotTreeLike {
  screen?: string | null
  elements?: Array<SnapshotTreeElementLike | null>
}

// Log text that indicates an error condition ("fatal exception" is already covered by "exception").
const SNAPSHOT_ERROR_LOG_PATTERN = /error|exception|failed/i
// On-screen text that suggests content is still loading.
const SNAPSHOT_LOADING_TEXT_PATTERN = /loading|please wait|spinner|progress/i
// On-screen text typical of a primary, recognisable screen.
const SNAPSHOT_PRIMARY_TEXT_PATTERN = /sign[ -]?in|log[ -]?in|home|checkout|settings|menu|profile|search/i

/**
 * Normalizes an arbitrary value into a comparable hint string.
 *
 * @param value Any value; null/undefined become the empty string.
 * @returns Trimmed, whitespace-collapsed, lower-cased string form of `value`.
 */
function normalizeHint(value: unknown): string {
  if (value === null || value === undefined) return ''
  return String(value).trim().replace(/\s+/g, ' ').toLowerCase()
}

/**
 * Converts a label to Title Case, treating underscores and hyphens as spaces.
 *
 * Only a letter that starts a word is upper-cased. A letter that follows another
 * letter, a digit or an apostrophe is left alone, so "don't" becomes "Don't" and
 * "école" becomes "École" (the ASCII-only `\b\w` approach mangles both).
 */
function titleCase(value: string): string {
  return value
    .replace(/[_-]+/g, ' ')
    .replace(/\s+/g, ' ')
    .trim()
    .replace(/(^|[^\p{L}\p{N}'\u2019])(\p{L})/gu, (_match: string, lead: string, letter: string) => `${lead}${letter.toUpperCase()}`)
}

/**
 * Derives a short, human-readable screen name from an activity identifier.
 *
 * Takes the last segment of a dotted name (also splitting on "/" so Android
 * component notation such as "com.app/.ui.MainActivity" works) and drops a
 * trailing "Activity" suffix when something remains afterwards.
 *
 * @returns Title-cased short name, or null when `activity` is empty.
 */
function shortActivityName(activity: string | null | undefined): string | null {
  if (!activity) return null
  const trimmed = String(activity).trim()
  if (!trimmed) return null
  const lastSegment = trimmed.split(/[./]/).pop() || trimmed
  const withoutSuffix = lastSegment.replace(/Activity$/, '')
  return titleCase(withoutSuffix || lastSegment)
}

/**
 * Collects the unique normalized labels found in a UI tree.
 *
 * For each element the first non-empty candidate among text, content description,
 * accessibility label, resource id and id is used. Empty strings are skipped so an
 * icon button with `text: ''` still contributes its content description.
 *
 * @returns All labels (`texts`) and the labels of elements that are clickable and not
 *   explicitly disabled (`actionables`); both de-duplicated in first-seen order.
 */
function collectSnapshotTexts(tree: SnapshotTreeLike | null | undefined): { texts: string[]; actionables: string[] } {
  const candidateElements = tree?.elements
  const elements = Array.isArray(candidateElements) ? candidateElements : []
  const texts = new Set<string>()
  const actionables = new Set<string>()

  for (const element of elements) {
    const candidates = [
      element?.text,
      element?.contentDescription,
      element?.contentDesc,
      element?.accessibilityLabel,
      element?.resourceId,
      element?.id
    ]
    let text = ''
    for (const candidate of candidates) {
      text = normalizeHint(candidate)
      if (text) break
    }
    if (!text) continue
    texts.add(text)
    if (element?.clickable && element?.enabled !== false) {
      actionables.add(text)
    }
  }

  return {
    texts: Array.from(texts),
    actionables: Array.from(actionables)
  }
}

/**
 * Infers a display name for the current screen from a raw snapshot.
 *
 * Preference order: `ui_tree.screen`, then the short activity name, then the
 * first label found in the UI tree.
 *
 * @param knownTexts Labels already collected from `raw.ui_tree`; collected on demand when omitted.
 */
function inferSnapshotScreen(raw: CaptureDebugSnapshotRawResponse, knownTexts?: string[]): string | null {
  const tree = raw.ui_tree as SnapshotTreeLike | null | undefined
  const treeScreen = normalizeHint(tree?.screen)
  if (treeScreen) return titleCase(treeScreen)

  const activity = shortActivityName(raw.activity)
  if (activity) return activity

  const texts = knownTexts ?? collectSnapshotTexts(tree).texts
  return texts.length > 0 ? titleCase(texts[0]) : null
}

/**
 * Reduces a screen/activity hint to a key suitable for equality checks:
 * lower-cased, letters and digits only, without a trailing "activity".
 */
function screenHintKey(value: unknown): string {
  const key = normalizeHint(value).replace(/[^\p{L}\p{N}]+/gu, '')
  return key.replace(/activity$/, '') || key
}

/**
 * Derives the best-effort semantic layer from a raw debug snapshot.
 *
 * Reads `ui_tree`, `activity`, `screenshot` and `logs` from `raw`.
 *
 * @returns The semantic layer, or null when there is too little evidence to derive one.
 */
function deriveSnapshotSemantic(raw: CaptureDebugSnapshotRawResponse): SnapshotSemanticResponse | null {
  const tree = raw.ui_tree as SnapshotTreeLike | null | undefined
  const { texts, actionables } = collectSnapshotTexts(tree)
  const screenFromTree = normalizeHint(tree?.screen)
  const activityHint = normalizeHint(raw.activity)
  const screen = inferSnapshotScreen(raw, texts)
  // Tolerate a missing or malformed log list instead of throwing.
  const logs: unknown[] = Array.isArray(raw.logs) ? raw.logs : []

  if (!screen && !activityHint && texts.length === 0 && logs.length === 0) return null

  const hasErrorLogs = logs.some((entry) => {
    const message = typeof entry === 'string' ? entry : String((entry as { message?: unknown } | null)?.message ?? '')
    return SNAPSHOT_ERROR_LOG_PATTERN.test(message)
  })
  const hasLoadingSignals = texts.some((text) => SNAPSHOT_LOADING_TEXT_PATTERN.test(text))
  const hasPrimaryText = texts.some((text) => SNAPSHOT_PRIMARY_TEXT_PATTERN.test(text))
  const hasScreenshot = typeof raw.screenshot === 'string' && raw.screenshot.length > 0
  const hasUiTree = !!tree && Array.isArray(tree.elements)

  const signals: Record<string, string | number | boolean> = {
    has_activity: !!activityHint,
    has_ui_tree: hasUiTree,
    has_screenshot: hasScreenshot,
    has_visible_text: texts.length > 0,
    has_clickable_elements: actionables.length > 0,
    has_error_logs: hasErrorLogs,
    has_loading_signals: hasLoadingSignals,
    has_primary_text: hasPrimaryText
  }

  const warnings: string[] = []
  // Compare short names: a fully-qualified activity ("com.shop.CheckoutActivity")
  // never equals a screen label ("Checkout") verbatim, which would warn every time.
  if (screenFromTree && activityHint && screenHintKey(screenFromTree) !== screenHintKey(shortActivityName(raw.activity))) {
    warnings.push('ui_tree.screen and activity hints differ')
  }
  if (!hasUiTree) warnings.push('ui tree unavailable')
  if (!activityHint) warnings.push('activity unavailable')
  if (hasErrorLogs) warnings.push('error signals present in logs')

  const evidenceScore =
    (hasUiTree ? 0.35 : 0) +
    (screen ? 0.2 : 0) +
    (activityHint ? 0.15 : 0) +
    (actionables.length > 0 ? 0.15 : 0) +
    (texts.length > 0 ? 0.1 : 0) +
    (hasScreenshot ? 0.05 : 0) +
    (hasErrorLogs ? -0.15 : 0) +
    (hasLoadingSignals ? -0.05 : 0)

  // Round before clamping so floating-point noise (e.g. 1.0000000000000002) cannot leak out.
  const confidence = Math.max(0, Math.min(1, Number(evidenceScore.toFixed(2))))

  if (!screen && confidence < 0.3) return null

  return {
    screen,
    signals,
    actions_available: actionables.length > 0 ? actionables.slice(0, 10) : null,
    confidence,
    warnings
  }
}
143
+
8
144
  export class ToolsObserve {
9
145
  // Resolve a target device and return the appropriate observe instance and resolved info.
10
146
  private static async resolveObserve(platform?: 'android' | 'ios', deviceId?: string, appId?: string) {
@@ -103,7 +239,7 @@ export class ToolsObserve {
103
239
 
104
240
  static async captureDebugSnapshotHandler({ reason, includeLogs = true, logLines = 200, platform, appId, deviceId, sessionId }: { reason?: string; includeLogs?: boolean; logLines?: number; platform?: 'android' | 'ios'; appId?: string; deviceId?: string; sessionId?: string } = {}) {
105
241
  const timestamp = Date.now()
106
- const out: any = { timestamp, reason: reason || '', activity: null, fingerprint: null, screenshot: null, ui_tree: null, logs: [] }
242
+ const raw: CaptureDebugSnapshotRawResponse = { timestamp, reason: reason || '', activity: null, fingerprint: null, screenshot: null, ui_tree: null, logs: [] }
107
243
 
108
244
  // Parallel fetches for performance: screenshot, current screen, fingerprint, ui tree, and log stream/get logs
109
245
  const sid = sessionId || 'default'
@@ -125,40 +261,40 @@ export class ToolsObserve {
125
261
  if (res.status === 'fulfilled') {
126
262
  const val = res.value
127
263
  if (key === 'screenshot') {
128
- out.screenshot = val && val.screenshot ? val.screenshot : null
264
+ raw.screenshot = val && val.screenshot ? val.screenshot : null
129
265
  } else if (key === 'currentScreen') {
130
- out.activity = val && ((val.activity || val.shortActivity)) ? (val.activity || val.shortActivity) : out.activity || ''
266
+ raw.activity = val && ((val.activity || val.shortActivity)) ? (val.activity || val.shortActivity) : raw.activity || ''
131
267
  } else if (key === 'fingerprint') {
132
- if (val && val.fingerprint) out.fingerprint = val.fingerprint
133
- if (val && val.activity) out.activity = out.activity || val.activity
134
- if (val && val.error) out.fingerprint_error = val.error
268
+ if (val && val.fingerprint) raw.fingerprint = val.fingerprint
269
+ if (val && val.activity) raw.activity = raw.activity || val.activity
270
+ if (val && val.error) raw.fingerprint_error = val.error
135
271
  } else if (key === 'uiTree') {
136
- out.ui_tree = val
137
- if (val && val.error) out.ui_tree_error = val.error
272
+ raw.ui_tree = val
273
+ if (val && val.error) raw.ui_tree_error = val.error
138
274
  } else if (key === 'readLogStream') {
139
275
  // handle below after evaluating fallback
140
276
  // stream entries are staged directly on raw.logs (the temporary _streamEntries field no longer exists)
141
- out._streamEntries = val && val.entries ? val.entries : []
277
+ raw.logs = Array.isArray(val?.entries) ? val.entries : []
142
278
  }
143
279
  } else {
144
280
  const errMsg = res.reason instanceof Error ? res.reason.message : String(res.reason)
145
- if (key === 'screenshot') out.screenshot_error = errMsg
146
- if (key === 'currentScreen') out.activity_error = errMsg
147
- if (key === 'fingerprint') { out.fingerprint = null; out.fingerprint_error = errMsg }
148
- if (key === 'uiTree') { out.ui_tree = null; out.ui_tree_error = errMsg }
149
- if (key === 'readLogStream') { out._streamEntries = [] ; out.logs_error = errMsg }
281
+ if (key === 'screenshot') raw.screenshot_error = errMsg
282
+ if (key === 'currentScreen') raw.activity_error = errMsg
283
+ if (key === 'fingerprint') { raw.fingerprint = null; raw.fingerprint_error = errMsg }
284
+ if (key === 'uiTree') { raw.ui_tree = null; raw.ui_tree_error = errMsg }
285
+ if (key === 'readLogStream') { raw.logs = []; raw.logs_error = errMsg }
150
286
  }
151
287
  }
152
288
 
153
289
  // Logs: prefer stream entries, fallback to snapshot logs when empty
154
290
  if (includeLogs) {
155
291
  try {
156
- let entries: any[] = Array.isArray(out._streamEntries) ? out._streamEntries : []
292
+ let entries: any[] = Array.isArray(raw.logs) ? raw.logs : []
157
293
  if (!entries || entries.length === 0) {
158
294
  const gl = await ToolsObserve.getLogsHandler({ platform, appId, deviceId, lines: logLines })
159
- const raw: any[] = (gl && (gl as any).logs) ? (gl as any).logs : []
295
+ const snapshotLogs: any[] = (gl && (gl as any).logs) ? (gl as any).logs : []
160
296
  // snapshotLogs may be structured entries or strings
161
- entries = raw.slice(-Math.max(0, logLines)).map(item => {
297
+ entries = snapshotLogs.slice(-Math.max(0, logLines)).map(item => {
162
298
  if (!item) return { timestamp: null, level: 'INFO', message: '' }
163
299
  if (typeof item === 'string') {
164
300
  const level = /\b(FATAL EXCEPTION|ERROR| E )\b/i.test(item) ? 'ERROR' : /\b(WARN| W )\b/i.test(item) ? 'WARN' : 'INFO'
@@ -186,16 +322,14 @@ export class ToolsObserve {
186
322
  })
187
323
  }
188
324
 
189
- out.logs = entries
325
+ raw.logs = entries
190
326
  } catch (e) {
191
- out.logs = []
192
- out.logs_error = e instanceof Error ? e.message : String(e)
327
+ raw.logs = []
328
+ raw.logs_error = e instanceof Error ? e.message : String(e)
193
329
  }
194
330
  }
195
331
 
196
- // Clean up internal temporary field
197
- delete out._streamEntries
198
-
199
- return out
332
+ const semantic = deriveSnapshotSemantic(raw)
333
+ return semantic ? { raw, semantic } : { raw }
200
334
  }
201
335
  }
@@ -240,7 +240,7 @@ Failure Handling:
240
240
  },
241
241
  {
242
242
  name: 'capture_debug_snapshot',
243
- description: 'Capture a complete debug snapshot (screenshot, ui tree, activity, fingerprint, logs). Returns structured JSON.',
243
+ description: 'Capture a complete debug snapshot (raw observation layer plus optional derived semantic layer). Returns structured JSON.',
244
244
  inputSchema: {
245
245
  type: 'object',
246
246
  properties: {
@@ -344,6 +344,7 @@ Capabilities:
344
344
  Constraints:
345
345
  - Does not verify correctness of the resulting state
346
346
  - Must not be used alone to confirm action success when an applicable expect_* tool exists
347
+ - Use classify_action_outcome + get_network_activity when the expected outcome is backend/API activity without a visible UI change
347
348
 
348
349
  Recommended Usage:
349
350
  1. Capture or define the expected outcome
@@ -835,6 +836,8 @@ Failure Handling:
835
836
  description: `Classify the outcome of the most recent action into exactly one of: success, no_op, backend_failure, ui_failure, unknown.
836
837
 
837
838
  MUST be called after every action (tap, swipe, type_text, press_back, start_app, etc). Never skip.
839
+ Use this with get_network_activity when the expected outcome is backend/API activity without a visible UI change.
840
+ For backend/API activity, compare get_screen_fingerprint before and after the action and call get_network_activity immediately after the action instead of waiting for wait_for_screen_change.
838
841
 
839
842
  HOW TO GATHER INPUTS before calling:
840
843
  1. Call wait_for_screen_change or compare get_screen_fingerprint before/after — set uiChanged accordingly.
@@ -868,7 +871,7 @@ BEHAVIOUR after outcome:
868
871
  },
869
872
  networkRequests: {
870
873
  type: 'array',
871
- description: 'Pass this only after calling get_network_activity as instructed by nextAction. Map each request to endpoint + status.',
874
+ description: 'Pass this only after calling get_network_activity as instructed by nextAction. Also use it when the expected outcome is backend/API activity without a visible UI change.',
872
875
  items: {
873
876
  type: 'object',
874
877
  properties: {
@@ -890,7 +893,7 @@ BEHAVIOUR after outcome:
890
893
  name: 'get_network_activity',
891
894
  description: `Returns structured network events captured from platform logs since the last action.
892
895
 
893
- Call this only when classify_action_outcome returns nextAction="call_get_network_activity".
896
+ Call this when classify_action_outcome returns nextAction="call_get_network_activity" or immediately after an action whose expected outcome is backend/API activity without a visible UI change.
894
897
  Do not call more than once per action.
895
898
 
896
899
  Events are filtered to significant (non-background) requests only.
package/src/types.ts CHANGED
@@ -137,6 +137,35 @@ export interface GetCurrentScreenResponse {
137
137
  error?: string;
138
138
  }
139
139
 
140
+ export interface SnapshotSemanticResponse {
141
+ screen: string | null;
142
+ signals: Record<string, string | number | boolean> | null;
143
+ actions_available: string[] | null;
144
+ confidence: number;
145
+ warnings: string[];
146
+ }
147
+
148
+ export interface CaptureDebugSnapshotRawResponse {
149
+ timestamp: number;
150
+ reason: string;
151
+ activity: string | null;
152
+ fingerprint: string | null;
153
+ screenshot: string | null;
154
+ ui_tree: unknown | null;
155
+ logs: StructuredLogEntry[];
156
+ device?: DeviceInfo;
157
+ screenshot_error?: string;
158
+ activity_error?: string;
159
+ fingerprint_error?: string;
160
+ ui_tree_error?: string;
161
+ logs_error?: string;
162
+ }
163
+
164
+ export interface CaptureDebugSnapshotResponse {
165
+ raw: CaptureDebugSnapshotRawResponse;
166
+ semantic?: SnapshotSemanticResponse | null;
167
+ }
168
+
140
169
  export interface WaitForElementResponse {
141
170
  device: DeviceInfo;
142
171
  found: boolean;
@@ -35,8 +35,11 @@ async function run() {
35
35
 
36
36
  const res1: any = await ToolsObserve.captureDebugSnapshotHandler({ platform: 'android', includeLogs: true, logLines: 50, sessionId: 's1' })
37
37
  console.log('res1:', JSON.stringify(res1, null, 2))
38
- const pass1 = res1 && res1.screenshot === 'BASE64PNG' && res1.activity && res1.fingerprint === 'abc123' && Array.isArray(res1.logs) && res1.logs.length === 1
38
+ const pass1 = res1 && res1.raw && res1.raw.screenshot === 'BASE64PNG' && res1.raw.activity && res1.raw.fingerprint === 'abc123' && Array.isArray(res1.raw.logs) && res1.raw.logs.length === 1
39
39
  assert.ok(pass1, 'captureDebugSnapshot should aggregate successful handler results')
40
+ assert.strictEqual(res1.semantic.screen, 'Main')
41
+ assert.strictEqual(res1.semantic.confidence >= 0.7, true)
42
+ assert.deepStrictEqual(res1.semantic.actions_available, null)
40
43
  console.log('Test 1:', pass1 ? 'PASS' : 'FAIL')
41
44
 
42
45
  // Restore handlers before next test
@@ -55,7 +58,7 @@ async function run() {
55
58
 
56
59
  const res2: any = await ToolsObserve.captureDebugSnapshotHandler({ platform: 'android', includeLogs: true, logLines: 10, appId: 'com.example' })
57
60
  console.log('res2:', JSON.stringify(res2, null, 2))
58
- const pass2 = res2 && res2.screenshot_error && res2.ui_tree_error && Array.isArray(res2.logs) && res2.logs.length === 2
61
+ const pass2 = res2 && res2.raw && res2.raw.screenshot_error && res2.raw.ui_tree_error && Array.isArray(res2.raw.logs) && res2.raw.logs.length === 2
59
62
  assert.ok(pass2, 'captureDebugSnapshot should surface partial failures and fallback logs')
60
63
  console.log('Test 2:', pass2 ? 'PASS' : 'FAIL')
61
64
 
@@ -76,7 +79,7 @@ async function run() {
76
79
 
77
80
  const res3: any = await ToolsObserve.captureDebugSnapshotHandler({ platform: 'android', includeLogs: false })
78
81
  console.log('res3:', JSON.stringify(res3, null, 2))
79
- const pass3 = res3 && typeof res3.logs !== 'undefined' && res3.logs.length === 0
82
+ const pass3 = res3 && res3.raw && typeof res3.raw.logs !== 'undefined' && res3.raw.logs.length === 0
80
83
  assert.ok(pass3, 'captureDebugSnapshot should return an empty logs array when includeLogs is false')
81
84
  console.log('Test 3:', pass3 ? 'PASS' : 'FAIL')
82
85
 
@@ -26,11 +26,14 @@ async function run() {
26
26
  assert(waitForScreenChange, 'wait_for_screen_change should be registered')
27
27
  assert.match((waitForScreenChange as any).description, /does not verify correctness of the resulting state/i)
28
28
  assert.match((waitForScreenChange as any).description, /follow with expect_screen/i)
29
+ assert.match((waitForScreenChange as any).description, /backend\/API activity without a visible UI change/i)
29
30
 
30
31
  const captureDebugSnapshot = toolDefinitions.find((tool) => tool.name === 'capture_debug_snapshot')
31
32
  assert(captureDebugSnapshot, 'capture_debug_snapshot should be registered')
32
33
  assert.strictEqual((captureDebugSnapshot as any).inputSchema.properties.includeLogs.default, true)
33
34
  assert.strictEqual((captureDebugSnapshot as any).inputSchema.properties.logLines.default, 200)
35
+ assert.match((captureDebugSnapshot as any).description, /raw observation layer/i)
36
+ assert.match((captureDebugSnapshot as any).description, /optional derived semantic layer/i)
34
37
 
35
38
  const startLogStream = toolDefinitions.find((tool) => tool.name === 'start_log_stream')
36
39
  assert(startLogStream, 'start_log_stream should be registered')
@@ -60,6 +63,18 @@ async function run() {
60
63
  assert.match((expectElementVisible as any).description, /selector is the primary input/i)
61
64
  assert.match((expectElementVisible as any).description, /Returns structured binary success\/failure only/i)
62
65
 
66
+ const classifyActionOutcome = toolDefinitions.find((tool) => tool.name === 'classify_action_outcome')
67
+ assert(classifyActionOutcome, 'classify_action_outcome should be registered')
68
+ assert.match((classifyActionOutcome as any).description, /backend\/API activity without a visible UI change/i)
69
+ assert.match((classifyActionOutcome as any).description, /get_network_activity/i)
70
+ assert.match((classifyActionOutcome as any).description, /immediately after the action/i)
71
+
72
+ const getNetworkActivity = toolDefinitions.find((tool) => tool.name === 'get_network_activity')
73
+ assert(getNetworkActivity, 'get_network_activity should be registered')
74
+ assert.match((getNetworkActivity as any).description, /backend\/API activity without a visible UI change/i)
75
+ assert.doesNotMatch((getNetworkActivity as any).description, /Call this only when/i)
76
+ assert.match((getNetworkActivity as any).description, /immediately after an action/i)
77
+
63
78
  await assert.rejects(() => handleToolCall('unknown_tool'), /Unknown tool: unknown_tool/)
64
79
 
65
80
  console.log('server contract tests passed')
@@ -16,6 +16,7 @@ async function run() {
16
16
  const originalCaptureScreenshotHandler = (ToolsObserve as any).captureScreenshotHandler
17
17
  const originalGetUITreeHandler = (ToolsObserve as any).getUITreeHandler
18
18
  const originalGetScreenFingerprintHandler = (ToolsObserve as any).getScreenFingerprintHandler
19
+ const originalCaptureDebugSnapshotHandler = (ToolsObserve as any).captureDebugSnapshotHandler
19
20
 
20
21
  try {
21
22
  ;(ToolsManage as any).installAppHandler = async () => ({
@@ -181,6 +182,32 @@ async function run() {
181
182
  assert.strictEqual(uiTreePayload.resolution.height, 2400)
182
183
  assert.strictEqual(uiTreePayload.elements[0].text, 'Login')
183
184
 
185
+ ;(ToolsObserve as any).captureDebugSnapshotHandler = async () => ({
186
+ raw: {
187
+ timestamp: 1710000000000,
188
+ reason: 'manual',
189
+ activity: 'com.example.MainActivity',
190
+ fingerprint: 'fp_raw',
191
+ screenshot: 'base64',
192
+ ui_tree: { screen: 'Home', elements: [] },
193
+ logs: [],
194
+ device: { platform: 'android', id: 'mock', osVersion: '14', model: 'Pixel', simulator: true }
195
+ },
196
+ semantic: {
197
+ screen: 'Home',
198
+ signals: { has_activity: true },
199
+ actions_available: ['open settings'],
200
+ confidence: 0.8,
201
+ warnings: []
202
+ }
203
+ })
204
+
205
+ const snapshotResponse = await handleToolCall('capture_debug_snapshot', { platform: 'android' })
206
+ const snapshotPayload = JSON.parse((snapshotResponse as any).content[0].text)
207
+ assert.strictEqual(snapshotPayload.raw.fingerprint, 'fp_raw')
208
+ assert.strictEqual(snapshotPayload.semantic.screen, 'Home')
209
+ assert.strictEqual(snapshotPayload.semantic.confidence, 0.8)
210
+
184
211
  console.log('server response-shape tests passed')
185
212
  } finally {
186
213
  ;(ToolsManage as any).installAppHandler = originalInstallAppHandler
@@ -193,6 +220,7 @@ async function run() {
193
220
  ;(ToolsObserve as any).captureScreenshotHandler = originalCaptureScreenshotHandler
194
221
  ;(ToolsObserve as any).getUITreeHandler = originalGetUITreeHandler
195
222
  ;(ToolsObserve as any).getScreenFingerprintHandler = originalGetScreenFingerprintHandler
223
+ ;(ToolsObserve as any).captureDebugSnapshotHandler = originalCaptureDebugSnapshotHandler
196
224
  }
197
225
  }
198
226