npm - mobile-debug-mcp - Versions diffs - 0.24.3 → 0.24.5 - Mend

mobile-debug-mcp 0.24.3 → 0.24.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/dist/observe/index.js +130 -26
package/dist/server/tool-definitions.js +6 -3
package/docs/CHANGELOG.md +6 -0
package/docs/specs/mcp-tooling-spec-v1.md +86 -5
package/docs/tools/interact.md +24 -0
package/docs/tools/observe.md +23 -7
package/package.json +1 -1
package/src/observe/index.ts +158 -24
package/src/server/tool-definitions.ts +6 -3
package/src/types.ts +29 -0
package/test/unit/observe/capture_debug_snapshot.test.ts +6 -3
package/test/unit/server/contract.test.ts +15 -0
package/test/unit/server/response_shapes.test.ts +28 -0

package/dist/observe/index.js CHANGED Viewed

@@ -3,6 +3,111 @@ import { AndroidObserve } from './android.js';
 import { iOSObserve } from './ios.js';
 export { AndroidObserve } from './android.js';
 export { iOSObserve } from './ios.js';
+function normalizeHint(value) {
+    if (value === null || value === undefined)
+        return '';
+    return String(value).trim().replace(/\s+/g, ' ').toLowerCase();
+}
+function titleCase(value) {
+    return value
+        .replace(/[_-]+/g, ' ')
+        .replace(/\s+/g, ' ')
+        .trim()
+        .replace(/\b\w/g, (match) => match.toUpperCase());
+}
+function shortActivityName(activity) {
+    if (!activity)
+        return null;
+    const trimmed = String(activity).trim();
+    if (!trimmed)
+        return null;
+    const lastSegment = trimmed.split('.').pop() || trimmed;
+    const withoutSuffix = lastSegment.replace(/Activity$/, '');
+    return withoutSuffix ? titleCase(withoutSuffix) : titleCase(lastSegment);
+}
+function collectSnapshotTexts(tree) {
+    const elements = Array.isArray(tree?.elements) ? tree.elements : [];
+    const texts = [];
+    const actionables = [];
+    for (const element of elements) {
+        const rawText = element?.text ?? element?.contentDescription ?? element?.contentDesc ?? element?.accessibilityLabel ?? element?.resourceId ?? element?.id ?? '';
+        const text = normalizeHint(rawText);
+        if (text)
+            texts.push(text);
+        if (element?.clickable && element?.enabled !== false && text) {
+            actionables.push(text);
+        }
+    }
+    return {
+        texts: Array.from(new Set(texts)),
+        actionables: Array.from(new Set(actionables))
+    };
+}
+function inferSnapshotScreen(raw) {
+    const tree = raw.ui_tree;
+    const treeScreen = normalizeHint(tree?.screen);
+    if (treeScreen)
+        return titleCase(treeScreen);
+    const activity = shortActivityName(raw.activity);
+    if (activity)
+        return activity;
+    const { texts } = collectSnapshotTexts(tree);
+    if (texts.length > 0)
+        return titleCase(texts[0]);
+    return null;
+}
+function deriveSnapshotSemantic(raw) {
+    const tree = raw.ui_tree;
+    const { texts, actionables } = collectSnapshotTexts(tree);
+    const screenFromTree = normalizeHint(tree?.screen);
+    const activityHint = normalizeHint(raw.activity);
+    const screen = inferSnapshotScreen(raw);
+    if (!screen && !activityHint && texts.length === 0 && !raw.logs.length)
+        return null;
+    const hasErrorLogs = raw.logs.some((entry) => /error|fatal exception|exception|failed/i.test(entry.message));
+    const hasLoadingSignals = texts.some((text) => /loading|please wait|spinner|progress/i.test(text));
+    const hasPrimaryText = texts.some((text) => /sign in|log in|log in|login|home|checkout|settings|menu|profile|search/i.test(text));
+    const hasScreenshot = typeof raw.screenshot === 'string' && raw.screenshot.length > 0;
+    const hasUiTree = !!tree && Array.isArray(tree.elements);
+    const signals = {
+        has_activity: !!activityHint,
+        has_ui_tree: hasUiTree,
+        has_screenshot: hasScreenshot,
+        has_visible_text: texts.length > 0,
+        has_clickable_elements: actionables.length > 0,
+        has_error_logs: hasErrorLogs,
+        has_loading_signals: hasLoadingSignals,
+        has_primary_text: hasPrimaryText
+    };
+    const warnings = [];
+    if (screenFromTree && activityHint && screenFromTree !== activityHint) {
+        warnings.push('ui_tree.screen and activity hints differ');
+    }
+    if (!hasUiTree)
+        warnings.push('ui tree unavailable');
+    if (!activityHint)
+        warnings.push('activity unavailable');
+    if (hasErrorLogs)
+        warnings.push('error signals present in logs');
+    const evidenceScore = (hasUiTree ? 0.35 : 0) +
+        (screen ? 0.2 : 0) +
+        (activityHint ? 0.15 : 0) +
+        (actionables.length > 0 ? 0.15 : 0) +
+        (texts.length > 0 ? 0.1 : 0) +
+        (hasScreenshot ? 0.05 : 0) +
+        (hasErrorLogs ? -0.15 : 0) +
+        (hasLoadingSignals ? -0.05 : 0);
+    const confidence = Math.max(0, Math.min(1, Number(evidenceScore.toFixed(2))));
+    if (!screen && confidence < 0.3)
+        return null;
+    return {
+        screen,
+        signals,
+        actions_available: actionables.length > 0 ? actionables.slice(0, 10) : null,
+        confidence,
+        warnings: confidence >= 0.7 && warnings.length === 0 ? [] : warnings
+    };
+}
 export class ToolsObserve {
     // Resolve a target device and return the appropriate observe instance and resolved info.
     static async resolveObserve(platform, deviceId, appId) {
@@ -95,7 +200,7 @@ export class ToolsObserve {
     }
     static async captureDebugSnapshotHandler({ reason, includeLogs = true, logLines = 200, platform, appId, deviceId, sessionId } = {}) {
         const timestamp = Date.now();
-        const out = { timestamp, reason: reason || '', activity: null, fingerprint: null, screenshot: null, ui_tree: null, logs: [] };
+        const raw = { timestamp, reason: reason || '', activity: null, fingerprint: null, screenshot: null, ui_tree: null, logs: [] };
         // Parallel fetches for performance: screenshot, current screen, fingerprint, ui tree, and log stream/get logs
         const sid = sessionId || 'default';
         const tasks = {
@@ -114,59 +219,59 @@ export class ToolsObserve {
             if (res.status === 'fulfilled') {
                 const val = res.value;
                 if (key === 'screenshot') {
-                    out.screenshot = val && val.screenshot ? val.screenshot : null;
+                    raw.screenshot = val && val.screenshot ? val.screenshot : null;
                 }
                 else if (key === 'currentScreen') {
-                    out.activity = val && ((val.activity || val.shortActivity)) ? (val.activity || val.shortActivity) : out.activity || '';
+                    raw.activity = val && ((val.activity || val.shortActivity)) ? (val.activity || val.shortActivity) : raw.activity || '';
                 }
                 else if (key === 'fingerprint') {
                     if (val && val.fingerprint)
-                        out.fingerprint = val.fingerprint;
+                        raw.fingerprint = val.fingerprint;
                     if (val && val.activity)
-                        out.activity = out.activity || val.activity;
+                        raw.activity = raw.activity || val.activity;
                     if (val && val.error)
-                        out.fingerprint_error = val.error;
+                        raw.fingerprint_error = val.error;
                 }
                 else if (key === 'uiTree') {
-                    out.ui_tree = val;
+                    raw.ui_tree = val;
                     if (val && val.error)
-                        out.ui_tree_error = val.error;
+                        raw.ui_tree_error = val.error;
                 }
                 else if (key === 'readLogStream') {
                     // stream entries are stored on raw.logs for now;
                     // the fallback to snapshot logs is evaluated further below
-                    out._streamEntries = val && val.entries ? val.entries : [];
+                    raw.logs = Array.isArray(val?.entries) ? val.entries : [];
                 }
             }
             else {
                 const errMsg = res.reason instanceof Error ? res.reason.message : String(res.reason);
                 if (key === 'screenshot')
-                    out.screenshot_error = errMsg;
+                    raw.screenshot_error = errMsg;
                 if (key === 'currentScreen')
-                    out.activity_error = errMsg;
+                    raw.activity_error = errMsg;
                 if (key === 'fingerprint') {
-                    out.fingerprint = null;
-                    out.fingerprint_error = errMsg;
+                    raw.fingerprint = null;
+                    raw.fingerprint_error = errMsg;
                 }
                 if (key === 'uiTree') {
-                    out.ui_tree = null;
-                    out.ui_tree_error = errMsg;
+                    raw.ui_tree = null;
+                    raw.ui_tree_error = errMsg;
                 }
                 if (key === 'readLogStream') {
-                    out._streamEntries = [];
-                    out.logs_error = errMsg;
+                    raw.logs = [];
+                    raw.logs_error = errMsg;
                 }
             }
         }
         // Logs: prefer stream entries, fallback to snapshot logs when empty
         if (includeLogs) {
             try {
-                let entries = Array.isArray(out._streamEntries) ? out._streamEntries : [];
+                let entries = Array.isArray(raw.logs) ? raw.logs : [];
                 if (!entries || entries.length === 0) {
                     const gl = await ToolsObserve.getLogsHandler({ platform, appId, deviceId, lines: logLines });
-                    const raw = (gl && gl.logs) ? gl.logs : [];
+                    const snapshotLogs = (gl && gl.logs) ? gl.logs : [];
                     // snapshotLogs may be structured entries or strings
-                    entries = raw.slice(-Math.max(0, logLines)).map(item => {
+                    entries = snapshotLogs.slice(-Math.max(0, logLines)).map(item => {
                         if (!item)
                             return { timestamp: null, level: 'INFO', message: '' };
                         if (typeof item === 'string') {
@@ -196,15 +301,14 @@ export class ToolsObserve {
                         return { timestamp: tsNum, level, message: msg };
                     });
                 }
-                out.logs = entries;
+                raw.logs = entries;
             }
             catch (e) {
-                out.logs = [];
-                out.logs_error = e instanceof Error ? e.message : String(e);
+                raw.logs = [];
+                raw.logs_error = e instanceof Error ? e.message : String(e);
             }
         }
-        // Clean up internal temporary field
-        delete out._streamEntries;
-        return out;
+        const semantic = deriveSnapshotSemantic(raw);
+        return semantic ? { raw, semantic } : { raw };
     }
 }

package/dist/server/tool-definitions.js CHANGED Viewed

@@ -240,7 +240,7 @@ Failure Handling:
     },
     {
         name: 'capture_debug_snapshot',
-        description: 'Capture a complete debug snapshot (screenshot, ui tree, activity, fingerprint, logs). Returns structured JSON.',
+        description: 'Capture a complete debug snapshot (raw observation layer plus optional derived semantic layer). Returns structured JSON.',
         inputSchema: {
             type: 'object',
             properties: {
@@ -344,6 +344,7 @@ Capabilities:
 Constraints:
 - Does not verify correctness of the resulting state
 - Must not be used alone to confirm action success when an applicable expect_* tool exists
+- Use classify_action_outcome + get_network_activity when the expected outcome is backend/API activity without a visible UI change
 Recommended Usage:
 1. Capture or define the expected outcome
@@ -835,6 +836,8 @@ Failure Handling:
         description: `Classify the outcome of the most recent action into exactly one of: success, no_op, backend_failure, ui_failure, unknown.
 MUST be called after every action (tap, swipe, type_text, press_back, start_app, etc). Never skip.
+Use this with get_network_activity when the expected outcome is backend/API activity without a visible UI change.
+For backend/API activity, compare get_screen_fingerprint before and after the action and call get_network_activity immediately after the action instead of waiting for wait_for_screen_change.
 HOW TO GATHER INPUTS before calling:
 1. Call wait_for_screen_change or compare get_screen_fingerprint before/after — set uiChanged accordingly.
@@ -868,7 +871,7 @@ BEHAVIOUR after outcome:
                 },
                 networkRequests: {
                     type: 'array',
-                    description: 'Pass this only after calling get_network_activity as instructed by nextAction. Map each request to endpoint + status.',
+                    description: 'Pass this only after calling get_network_activity as instructed by nextAction. Also use it when the expected outcome is backend/API activity without a visible UI change.',
                     items: {
                         type: 'object',
                         properties: {
@@ -890,7 +893,7 @@ BEHAVIOUR after outcome:
         name: 'get_network_activity',
         description: `Returns structured network events captured from platform logs since the last action.
-Call this only when classify_action_outcome returns nextAction="call_get_network_activity".
+Call this when classify_action_outcome returns nextAction="call_get_network_activity" or immediately after an action whose expected outcome is backend/API activity without a visible UI change.
 Do not call more than once per action.
 Events are filtered to significant (non-background) requests only.

package/docs/CHANGELOG.md CHANGED Viewed

@@ -2,6 +2,12 @@
 All notable changes to the **Mobile Debug MCP** project will be documented in this file.
+## [0.24.5]
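+- Improved snapshots: `capture_debug_snapshot` now returns a dual-layer payload (`raw` plus an optional derived `semantic` layer)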
+## [0.24.4]
+- Moving agents away from `wait_for_screen_change`
 ## [0.24.3]
 - Improved output consistency

package/docs/specs/mcp-tooling-spec-v1.md CHANGED Viewed

@@ -36,6 +36,14 @@ It does not apply to:
 - observation-only flows
 - non-verifiable or exploratory actions
+Outcome-specific guidance:
+- visible navigation expected -> `wait_for_screen_change` (optional) -> `expect_screen`
+- local UI change expected -> `wait_for_ui` (optional) -> `expect_element_visible`
+- backend/API activity expected without a visible UI change -> compare `get_screen_fingerprint` before/after, then call `get_network_activity` immediately after the action and `classify_action_outcome` with the observed requests
+For backend/API activity, `wait_for_screen_change` is not the right verification tool unless a visible transition is also expected.
 ## 4. Action Tools
 ### 4.1 Definition
@@ -201,7 +209,78 @@ String-only errors are not allowed, including fallback handler errors.
 Note: string diagnostics may still appear inside structured JSON payloads where explicitly defined by a tool.
-## 9. Classification
+## 9. Observation Tools (Extended Semantics)
+Observation tools inspect application state without mutating it.
+Examples:
+- `capture_debug_snapshot`
+- `get_screen_fingerprint`
+- `get_network_activity`
+- `get_logs`
+### 9.1 Snapshot Response Model
+`capture_debug_snapshot` MUST return a dual-layer response:
+- `raw`: required object
+- `semantic`: optional object
+The raw layer is authoritative and MUST remain unchanged from the underlying observation data. It is the source of truth and MUST NOT be interpreted or rewritten.
+The semantic layer is derived, best-effort, and MUST be generated exclusively from the raw layer.
+Raw layer contents include:
+- UI hierarchy or accessibility tree
+- screenshot when available
+- element-level attributes
+- logs and fingerprint/activity observations
+- raw error fields when partial collection fails
+Semantic layer shape when present:
+```ts
+{
+  screen: string | null,
+  signals: Record<string, string | number | boolean> | null,
+  actions_available: string[] | null,
+  confidence: number,
+  warnings: string[]
+}
+```
+Rules:
+- `confidence` MUST be between 0 and 1
+- `warnings` MUST be present when `semantic` is present
+- `semantic` MAY be omitted entirely when derivation is not reliable
+- `semantic` MUST be treated as unreliable if it conflicts with raw data
+- `actions_available` are hints only and MUST NOT be treated as guaranteed executable actions
+### 9.2 Agent Usage Contract
+Agents SHOULD use `semantic` for primary decision-making when present.
+Agents MUST fall back to `raw` when:
+- `semantic` is missing
+- `confidence < 0.7`
+- `warnings` is non-empty
+- semantic output conflicts with expected state or raw data
+`semantic` is for planning only and MUST NOT be used for verification.
+### 9.3 Relationship to Classification
+Semantic signals MAY be used as input to `classify_action_outcome`.
+Semantic output MUST NOT replace classification or verification.
+Classification remains a supplementary, post-action interpretation mechanism.
+## 10. Classification
 Tool: `classify_action_outcome`
@@ -211,10 +290,11 @@ Rules:
 - MUST be deterministic
 - MUST NOT replace `expect_*` tools
 - MUST be treated as a supplementary signal only
+- SHOULD be used with `get_network_activity` when the expected outcome is backend/API activity without a visible UI change
 It is not a verification mechanism.
-## 10. Execution Patterns
+## 11. Execution Patterns
 Canonical pattern:
@@ -226,7 +306,7 @@ Interpretation:
 - `wait_for_screen_change.success` = UI changed
 - `expect_screen.success` = correct outcome verified
-## 11. Known Deviations
+## 12. Known Deviations
 Explicitly allowed:
@@ -237,7 +317,7 @@ Explicitly allowed:
 - `scroll_to_element` outcome-based success (temporary exception)
 - extended runtime fields in `list_devices`
-## 12. Migration Rules
+## 13. Migration Rules
 Must change now:
@@ -249,6 +329,7 @@ Should align when touched:
 - `start_app`, `restart_app`
 - `scroll_to_element`
 - `wait_for_ui`
+- `capture_debug_snapshot`
 No change required:
@@ -257,7 +338,7 @@ No change required:
 - `expect_element_visible`
 - `wait_for_screen_change`
-## 13. Guiding Principles
+## 14. Guiding Principles
 - Actions execute
 - Verification proves

package/docs/tools/interact.md CHANGED Viewed

@@ -53,6 +53,10 @@ Preferred verification:
 - navigation outcome known -> `expect_screen`
 - local UI change known -> `expect_element_visible`
+- backend/API activity expected -> `classify_action_outcome` + `get_network_activity`
+Use `wait_for_screen_change` only when a visible transition is the expected outcome. If a button should trigger an API request but the screen should stay the same, rely on network activity and classification instead.
+For backend-only actions, prefer comparing `get_screen_fingerprint` before/after and call `get_network_activity` immediately after the action; do not wait on `wait_for_screen_change` if no visible transition is expected.
 ---
@@ -139,6 +143,7 @@ Notes:
 - Treats `null` fingerprints as transient and keeps polling.
 - Adds a stability confirmation before returning success to avoid transient animation frames.
 - Follow with `expect_screen` when the expected destination is known.
+- Do not use this as the main success check for backend/API activity that does not change the visible UI.
 ---
@@ -451,3 +456,22 @@ Notes:
 - The tool resolves the selector internally when needed.
 - On failure, `reason` and `observed` tell you whether the selector was missing entirely or present but not yet visible.
 - Use when the screen should remain on the same destination but a specific element should appear or become visible.
+---
+## classify_action_outcome + get_network_activity
+Use this pair when the action is expected to trigger network/backend work and the screen may not visibly change.
+Pattern:
+1. perform the action
+2. call `classify_action_outcome` with `uiChanged` from a `get_screen_fingerprint` before/after comparison (use `wait_for_screen_change` only when a visible transition is also expected)
+3. if the classifier asks for it, call `get_network_activity`
+4. call `classify_action_outcome` again with `networkRequests`
+Guidance:
+- `uiChanged=true` or `expectedElementVisible=true` means the action outcome is already verified
+- `nextAction="call_get_network_activity"` means the UI signal was inconclusive and the agent should inspect network activity
+- if network requests succeed but the UI stays unchanged, treat the outcome as a backend/API result rather than a screen transition

package/docs/tools/observe.md CHANGED Viewed

@@ -132,24 +132,40 @@ Behavior:
 - Returns partial data when components fail and includes per-part error fields (e.g. `screenshot_error`, `ui_tree_error`).
 - Caps logs to `logLines` entries and prefers recent entries.
 - Fast by default: does not wait for new logs and avoids long blocking operations.
+- Returns a dual-layer payload:
+  - `raw` is authoritative and contains the underlying observation data unchanged.
+  - `semantic` is optional, derived from `raw`, and intended for planning only.
 Response (example):
 ```json
 {
-  "timestamp": 1710000000,
-  "reason": "Crash after tapping checkout",
-  "activity": "CheckoutActivity",
-  "fingerprint": "abc123",
-  "screenshot": "<base64 PNG string>",
-  "ui_tree": { ... },
-  "logs": [ { "timestamp": 1710000000, "level": "ERROR", "message": "NullPointerException at CheckoutViewModel" } ]
+  "raw": {
+    "timestamp": 1710000000,
+    "reason": "Crash after tapping checkout",
+    "activity": "CheckoutActivity",
+    "fingerprint": "abc123",
+    "screenshot": "<base64 PNG string>",
+    "ui_tree": { ... },
+    "logs": [ { "timestamp": 1710000000, "level": "ERROR", "message": "NullPointerException at CheckoutViewModel" } ]
+  },
+  "semantic": {
+    "screen": "Checkout",
+    "signals": {
+      "has_error_logs": true,
+      "has_clickable_elements": false
+    },
+    "actions_available": null,
+    "confidence": 0.7,
+    "warnings": ["error signals present in logs"]
+  }
 }
 ```
 Notes:
 - Useful immediately after detecting crashes or unexpected UI behaviour.
 - Do not expect perfect data during a crash; tool is designed to return best-effort context and include errors for failed parts.
+- Treat `semantic` as planning guidance only; `raw` remains the source of truth.
 ---

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "mobile-debug-mcp",
-  "version": "0.24.3",
+  "version": "0.24.5",
   "description": "MCP server for mobile app debugging (Android + iOS), with focus on security and reliability",
   "type": "module",
   "bin": {

package/src/observe/index.ts CHANGED Viewed

@@ -1,10 +1,146 @@
 import { resolveTargetDevice } from '../utils/resolve-device.js'
 import { AndroidObserve } from './android.js'
 import { iOSObserve } from './ios.js'
+import type {
+  CaptureDebugSnapshotRawResponse,
+  SnapshotSemanticResponse
+} from '../types.js'
 export { AndroidObserve } from './android.js'
 export { iOSObserve } from './ios.js'
+interface SnapshotTreeElementLike {
+  text?: string | null
+  contentDescription?: string | null
+  contentDesc?: string | null
+  accessibilityLabel?: string | null
+  resourceId?: string | null
+  id?: string | null
+  type?: string | null
+  class?: string | null
+  clickable?: boolean
+  enabled?: boolean
+  visible?: boolean
+}
+interface SnapshotTreeLike {
+  screen?: string | null
+  elements?: SnapshotTreeElementLike[]
+}
+function normalizeHint(value: unknown): string {
+  if (value === null || value === undefined) return ''
+  return String(value).trim().replace(/\s+/g, ' ').toLowerCase()
+}
+function titleCase(value: string): string {
+  return value
+    .replace(/[_-]+/g, ' ')
+    .replace(/\s+/g, ' ')
+    .trim()
+    .replace(/\b\w/g, (match) => match.toUpperCase())
+}
+function shortActivityName(activity: string | null | undefined): string | null {
+  if (!activity) return null
+  const trimmed = String(activity).trim()
+  if (!trimmed) return null
+  const lastSegment = trimmed.split('.').pop() || trimmed
+  const withoutSuffix = lastSegment.replace(/Activity$/, '')
+  return withoutSuffix ? titleCase(withoutSuffix) : titleCase(lastSegment)
+}
+function collectSnapshotTexts(tree: SnapshotTreeLike | null | undefined) {
+  const elements = Array.isArray(tree?.elements) ? tree!.elements! : []
+  const texts: string[] = []
+  const actionables: string[] = []
+  for (const element of elements) {
+    const rawText = element?.text ?? element?.contentDescription ?? element?.contentDesc ?? element?.accessibilityLabel ?? element?.resourceId ?? element?.id ?? ''
+    const text = normalizeHint(rawText)
+    if (text) texts.push(text)
+    if (element?.clickable && element?.enabled !== false && text) {
+      actionables.push(text)
+    }
+  }
+  return {
+    texts: Array.from(new Set(texts)),
+    actionables: Array.from(new Set(actionables))
+  }
+}
+function inferSnapshotScreen(raw: CaptureDebugSnapshotRawResponse): string | null {
+  const tree = raw.ui_tree as SnapshotTreeLike | null | undefined
+  const treeScreen = normalizeHint(tree?.screen)
+  if (treeScreen) return titleCase(treeScreen)
+  const activity = shortActivityName(raw.activity)
+  if (activity) return activity
+  const { texts } = collectSnapshotTexts(tree)
+  if (texts.length > 0) return titleCase(texts[0])
+  return null
+}
+function deriveSnapshotSemantic(raw: CaptureDebugSnapshotRawResponse): SnapshotSemanticResponse | null {
+  const tree = raw.ui_tree as SnapshotTreeLike | null | undefined
+  const { texts, actionables } = collectSnapshotTexts(tree)
+  const screenFromTree = normalizeHint(tree?.screen)
+  const activityHint = normalizeHint(raw.activity)
+  const screen = inferSnapshotScreen(raw)
+  if (!screen && !activityHint && texts.length === 0 && !raw.logs.length) return null
+  const hasErrorLogs = raw.logs.some((entry) => /error|fatal exception|exception|failed/i.test(entry.message))
+  const hasLoadingSignals = texts.some((text) => /loading|please wait|spinner|progress/i.test(text))
+  const hasPrimaryText = texts.some((text) => /sign in|log in|log in|login|home|checkout|settings|menu|profile|search/i.test(text))
+  const hasScreenshot = typeof raw.screenshot === 'string' && raw.screenshot.length > 0
+  const hasUiTree = !!tree && Array.isArray(tree.elements)
+  const signals: Record<string, string | number | boolean> = {
+    has_activity: !!activityHint,
+    has_ui_tree: hasUiTree,
+    has_screenshot: hasScreenshot,
+    has_visible_text: texts.length > 0,
+    has_clickable_elements: actionables.length > 0,
+    has_error_logs: hasErrorLogs,
+    has_loading_signals: hasLoadingSignals,
+    has_primary_text: hasPrimaryText
+  }
+  const warnings: string[] = []
+  if (screenFromTree && activityHint && screenFromTree !== activityHint) {
+    warnings.push('ui_tree.screen and activity hints differ')
+  }
+  if (!hasUiTree) warnings.push('ui tree unavailable')
+  if (!activityHint) warnings.push('activity unavailable')
+  if (hasErrorLogs) warnings.push('error signals present in logs')
+  const evidenceScore =
+    (hasUiTree ? 0.35 : 0) +
+    (screen ? 0.2 : 0) +
+    (activityHint ? 0.15 : 0) +
+    (actionables.length > 0 ? 0.15 : 0) +
+    (texts.length > 0 ? 0.1 : 0) +
+    (hasScreenshot ? 0.05 : 0) +
+    (hasErrorLogs ? -0.15 : 0) +
+    (hasLoadingSignals ? -0.05 : 0)
+  const confidence = Math.max(0, Math.min(1, Number(evidenceScore.toFixed(2))))
+  if (!screen && confidence < 0.3) return null
+  return {
+    screen,
+    signals,
+    actions_available: actionables.length > 0 ? actionables.slice(0, 10) : null,
+    confidence,
+    warnings: confidence >= 0.7 && warnings.length === 0 ? [] : warnings
+  }
+}
 export class ToolsObserve {
   // Resolve a target device and return the appropriate observe instance and resolved info.
   private static async resolveObserve(platform?: 'android' | 'ios', deviceId?: string, appId?: string) {
@@ -103,7 +239,7 @@ export class ToolsObserve {
   static async captureDebugSnapshotHandler({ reason, includeLogs = true, logLines = 200, platform, appId, deviceId, sessionId }: { reason?: string; includeLogs?: boolean; logLines?: number; platform?: 'android' | 'ios'; appId?: string; deviceId?: string; sessionId?: string } = {}) {
     const timestamp = Date.now()
-    const out: any = { timestamp, reason: reason || '', activity: null, fingerprint: null, screenshot: null, ui_tree: null, logs: [] }
+    const raw: CaptureDebugSnapshotRawResponse = { timestamp, reason: reason || '', activity: null, fingerprint: null, screenshot: null, ui_tree: null, logs: [] }
     // Parallel fetches for performance: screenshot, current screen, fingerprint, ui tree, and log stream/get logs
     const sid = sessionId || 'default'
@@ -125,40 +261,40 @@ export class ToolsObserve {
       if (res.status === 'fulfilled') {
         const val = res.value
         if (key === 'screenshot') {
-          out.screenshot = val && val.screenshot ? val.screenshot : null
+          raw.screenshot = val && val.screenshot ? val.screenshot : null
         } else if (key === 'currentScreen') {
-          out.activity = val && ((val.activity || val.shortActivity)) ? (val.activity || val.shortActivity) : out.activity || ''
+          raw.activity = val && ((val.activity || val.shortActivity)) ? (val.activity || val.shortActivity) : raw.activity || ''
         } else if (key === 'fingerprint') {
-          if (val && val.fingerprint) out.fingerprint = val.fingerprint
-          if (val && val.activity) out.activity = out.activity || val.activity
-          if (val && val.error) out.fingerprint_error = val.error
+          if (val && val.fingerprint) raw.fingerprint = val.fingerprint
+          if (val && val.activity) raw.activity = raw.activity || val.activity
+          if (val && val.error) raw.fingerprint_error = val.error
         } else if (key === 'uiTree') {
-          out.ui_tree = val
-          if (val && val.error) out.ui_tree_error = val.error
+          raw.ui_tree = val
+          if (val && val.error) raw.ui_tree_error = val.error
         } else if (key === 'readLogStream') {
           // stream entries are stored on raw.logs for now;
           // the fallback to snapshot logs is evaluated further below
-          out._streamEntries = val && val.entries ? val.entries : []
+          raw.logs = Array.isArray(val?.entries) ? val.entries : []
         }
       } else {
         const errMsg = res.reason instanceof Error ? res.reason.message : String(res.reason)
-        if (key === 'screenshot') out.screenshot_error = errMsg
-        if (key === 'currentScreen') out.activity_error = errMsg
-        if (key === 'fingerprint') { out.fingerprint = null; out.fingerprint_error = errMsg }
-        if (key === 'uiTree') { out.ui_tree = null; out.ui_tree_error = errMsg }
-        if (key === 'readLogStream') { out._streamEntries = [] ; out.logs_error = errMsg }
+        if (key === 'screenshot') raw.screenshot_error = errMsg
+        if (key === 'currentScreen') raw.activity_error = errMsg
+        if (key === 'fingerprint') { raw.fingerprint = null; raw.fingerprint_error = errMsg }
+        if (key === 'uiTree') { raw.ui_tree = null; raw.ui_tree_error = errMsg }
+        if (key === 'readLogStream') { raw.logs = []; raw.logs_error = errMsg }
       }
     }
     // Logs: prefer stream entries, fallback to snapshot logs when empty
     if (includeLogs) {
       try {
-        let entries: any[] = Array.isArray(out._streamEntries) ? out._streamEntries : []
+        let entries: any[] = Array.isArray(raw.logs) ? raw.logs : []
         if (!entries || entries.length === 0) {
           const gl = await ToolsObserve.getLogsHandler({ platform, appId, deviceId, lines: logLines })
-          const raw: any[] = (gl && (gl as any).logs) ? (gl as any).logs : []
+          const snapshotLogs: any[] = (gl && (gl as any).logs) ? (gl as any).logs : []
           // snapshotLogs may be structured entries or strings
-          entries = raw.slice(-Math.max(0, logLines)).map(item => {
+          entries = snapshotLogs.slice(-Math.max(0, logLines)).map(item => {
             if (!item) return { timestamp: null, level: 'INFO', message: '' }
             if (typeof item === 'string') {
               const level = /\b(FATAL EXCEPTION|ERROR| E )\b/i.test(item) ? 'ERROR' : /\b(WARN| W )\b/i.test(item) ? 'WARN' : 'INFO'
@@ -186,16 +322,14 @@ export class ToolsObserve {
           })
         }
-        out.logs = entries
+        raw.logs = entries
       } catch (e) {
-        out.logs = []
-        out.logs_error = e instanceof Error ? e.message : String(e)
+        raw.logs = []
+        raw.logs_error = e instanceof Error ? e.message : String(e)
       }
     }
-    // Clean up internal temporary field
-    delete out._streamEntries
-    return out
+    const semantic = deriveSnapshotSemantic(raw)
+    return semantic ? { raw, semantic } : { raw }
   }
 }

package/src/server/tool-definitions.ts CHANGED Viewed

@@ -240,7 +240,7 @@ Failure Handling:
   },
   {
     name: 'capture_debug_snapshot',
-    description: 'Capture a complete debug snapshot (screenshot, ui tree, activity, fingerprint, logs). Returns structured JSON.',
+    description: 'Capture a complete debug snapshot (raw observation layer plus optional derived semantic layer). Returns structured JSON.',
     inputSchema: {
       type: 'object',
       properties: {
@@ -344,6 +344,7 @@ Capabilities:
 Constraints:
 - Does not verify correctness of the resulting state
 - Must not be used alone to confirm action success when an applicable expect_* tool exists
+- Use classify_action_outcome + get_network_activity when the expected outcome is backend/API activity without a visible UI change
 Recommended Usage:
 1. Capture or define the expected outcome
@@ -835,6 +836,8 @@ Failure Handling:
     description: `Classify the outcome of the most recent action into exactly one of: success, no_op, backend_failure, ui_failure, unknown.
 MUST be called after every action (tap, swipe, type_text, press_back, start_app, etc). Never skip.
+Use this with get_network_activity when the expected outcome is backend/API activity without a visible UI change.
+For backend/API activity, compare get_screen_fingerprint before and after the action and call get_network_activity immediately after the action instead of waiting for wait_for_screen_change.
 HOW TO GATHER INPUTS before calling:
 1. Call wait_for_screen_change or compare get_screen_fingerprint before/after — set uiChanged accordingly.
@@ -868,7 +871,7 @@ BEHAVIOUR after outcome:
         },
         networkRequests: {
           type: 'array',
-          description: 'Pass this only after calling get_network_activity as instructed by nextAction. Map each request to endpoint + status.',
+          description: 'Pass this only after calling get_network_activity as instructed by nextAction. Also use it when the expected outcome is backend/API activity without a visible UI change.',
           items: {
             type: 'object',
             properties: {
@@ -890,7 +893,7 @@ BEHAVIOUR after outcome:
     name: 'get_network_activity',
     description: `Returns structured network events captured from platform logs since the last action.
-Call this only when classify_action_outcome returns nextAction="call_get_network_activity".
+Call this when classify_action_outcome returns nextAction="call_get_network_activity" or immediately after an action whose expected outcome is backend/API activity without a visible UI change.
 Do not call more than once per action.
 Events are filtered to significant (non-background) requests only.

package/src/types.ts CHANGED Viewed

@@ -137,6 +137,35 @@ export interface GetCurrentScreenResponse {
   error?: string;
 }
+export interface SnapshotSemanticResponse {
+  screen: string | null;
+  signals: Record<string, string | number | boolean> | null;
+  actions_available: string[] | null;
+  confidence: number;
+  warnings: string[];
+}
+export interface CaptureDebugSnapshotRawResponse {
+  timestamp: number;
+  reason: string;
+  activity: string | null;
+  fingerprint: string | null;
+  screenshot: string | null;
+  ui_tree: unknown | null;
+  logs: StructuredLogEntry[];
+  device?: DeviceInfo;
+  screenshot_error?: string;
+  activity_error?: string;
+  fingerprint_error?: string;
+  ui_tree_error?: string;
+  logs_error?: string;
+}
+export interface CaptureDebugSnapshotResponse {
+  raw: CaptureDebugSnapshotRawResponse;
+  semantic?: SnapshotSemanticResponse | null;
+}
 export interface WaitForElementResponse {
   device: DeviceInfo;
   found: boolean;

package/test/unit/observe/capture_debug_snapshot.test.ts CHANGED Viewed

@@ -35,8 +35,11 @@ async function run() {
     const res1: any = await ToolsObserve.captureDebugSnapshotHandler({ platform: 'android', includeLogs: true, logLines: 50, sessionId: 's1' })
     console.log('res1:', JSON.stringify(res1, null, 2))
-    const pass1 = res1 && res1.screenshot === 'BASE64PNG' && res1.activity && res1.fingerprint === 'abc123' && Array.isArray(res1.logs) && res1.logs.length === 1
+    const pass1 = res1 && res1.raw && res1.raw.screenshot === 'BASE64PNG' && res1.raw.activity && res1.raw.fingerprint === 'abc123' && Array.isArray(res1.raw.logs) && res1.raw.logs.length === 1
     assert.ok(pass1, 'captureDebugSnapshot should aggregate successful handler results')
+    assert.strictEqual(res1.semantic.screen, 'Main')
+    assert.strictEqual(res1.semantic.confidence >= 0.7, true)
+    assert.deepStrictEqual(res1.semantic.actions_available, null)
     console.log('Test 1:', pass1 ? 'PASS' : 'FAIL')
     // Restore handlers before next test
@@ -55,7 +58,7 @@ async function run() {
     const res2: any = await ToolsObserve.captureDebugSnapshotHandler({ platform: 'android', includeLogs: true, logLines: 10, appId: 'com.example' })
     console.log('res2:', JSON.stringify(res2, null, 2))
-    const pass2 = res2 && res2.screenshot_error && res2.ui_tree_error && Array.isArray(res2.logs) && res2.logs.length === 2
+    const pass2 = res2 && res2.raw && res2.raw.screenshot_error && res2.raw.ui_tree_error && Array.isArray(res2.raw.logs) && res2.raw.logs.length === 2
     assert.ok(pass2, 'captureDebugSnapshot should surface partial failures and fallback logs')
     console.log('Test 2:', pass2 ? 'PASS' : 'FAIL')
@@ -76,7 +79,7 @@ async function run() {
     const res3: any = await ToolsObserve.captureDebugSnapshotHandler({ platform: 'android', includeLogs: false })
     console.log('res3:', JSON.stringify(res3, null, 2))
-    const pass3 = res3 && typeof res3.logs !== 'undefined' && res3.logs.length === 0
+    const pass3 = res3 && res3.raw && typeof res3.raw.logs !== 'undefined' && res3.raw.logs.length === 0
     assert.ok(pass3, 'captureDebugSnapshot should return an empty logs array when includeLogs is false')
     console.log('Test 3:', pass3 ? 'PASS' : 'FAIL')

package/test/unit/server/contract.test.ts CHANGED Viewed

@@ -26,11 +26,14 @@ async function run() {
   assert(waitForScreenChange, 'wait_for_screen_change should be registered')
   assert.match((waitForScreenChange as any).description, /does not verify correctness of the resulting state/i)
   assert.match((waitForScreenChange as any).description, /follow with expect_screen/i)
+  assert.match((waitForScreenChange as any).description, /backend\/API activity without a visible UI change/i)
   const captureDebugSnapshot = toolDefinitions.find((tool) => tool.name === 'capture_debug_snapshot')
   assert(captureDebugSnapshot, 'capture_debug_snapshot should be registered')
   assert.strictEqual((captureDebugSnapshot as any).inputSchema.properties.includeLogs.default, true)
   assert.strictEqual((captureDebugSnapshot as any).inputSchema.properties.logLines.default, 200)
+  assert.match((captureDebugSnapshot as any).description, /raw observation layer/i)
+  assert.match((captureDebugSnapshot as any).description, /optional derived semantic layer/i)
   const startLogStream = toolDefinitions.find((tool) => tool.name === 'start_log_stream')
   assert(startLogStream, 'start_log_stream should be registered')
@@ -60,6 +63,18 @@ async function run() {
   assert.match((expectElementVisible as any).description, /selector is the primary input/i)
   assert.match((expectElementVisible as any).description, /Returns structured binary success\/failure only/i)
+  const classifyActionOutcome = toolDefinitions.find((tool) => tool.name === 'classify_action_outcome')
+  assert(classifyActionOutcome, 'classify_action_outcome should be registered')
+  assert.match((classifyActionOutcome as any).description, /backend\/API activity without a visible UI change/i)
+  assert.match((classifyActionOutcome as any).description, /get_network_activity/i)
+  assert.match((classifyActionOutcome as any).description, /immediately after the action/i)
+  const getNetworkActivity = toolDefinitions.find((tool) => tool.name === 'get_network_activity')
+  assert(getNetworkActivity, 'get_network_activity should be registered')
+  assert.match((getNetworkActivity as any).description, /backend\/API activity without a visible UI change/i)
+  assert.doesNotMatch((getNetworkActivity as any).description, /Call this only when/i)
+  assert.match((getNetworkActivity as any).description, /immediately after an action/i)
   await assert.rejects(() => handleToolCall('unknown_tool'), /Unknown tool: unknown_tool/)
   console.log('server contract tests passed')

package/test/unit/server/response_shapes.test.ts CHANGED Viewed

@@ -16,6 +16,7 @@ async function run() {
   const originalCaptureScreenshotHandler = (ToolsObserve as any).captureScreenshotHandler
   const originalGetUITreeHandler = (ToolsObserve as any).getUITreeHandler
   const originalGetScreenFingerprintHandler = (ToolsObserve as any).getScreenFingerprintHandler
+  const originalCaptureDebugSnapshotHandler = (ToolsObserve as any).captureDebugSnapshotHandler
   try {
     ;(ToolsManage as any).installAppHandler = async () => ({
@@ -181,6 +182,32 @@ async function run() {
     assert.strictEqual(uiTreePayload.resolution.height, 2400)
     assert.strictEqual(uiTreePayload.elements[0].text, 'Login')
+    ;(ToolsObserve as any).captureDebugSnapshotHandler = async () => ({
+      raw: {
+        timestamp: 1710000000000,
+        reason: 'manual',
+        activity: 'com.example.MainActivity',
+        fingerprint: 'fp_raw',
+        screenshot: 'base64',
+        ui_tree: { screen: 'Home', elements: [] },
+        logs: [],
+        device: { platform: 'android', id: 'mock', osVersion: '14', model: 'Pixel', simulator: true }
+      },
+      semantic: {
+        screen: 'Home',
+        signals: { has_activity: true },
+        actions_available: ['open settings'],
+        confidence: 0.8,
+        warnings: []
+      }
+    })
+    const snapshotResponse = await handleToolCall('capture_debug_snapshot', { platform: 'android' })
+    const snapshotPayload = JSON.parse((snapshotResponse as any).content[0].text)
+    assert.strictEqual(snapshotPayload.raw.fingerprint, 'fp_raw')
+    assert.strictEqual(snapshotPayload.semantic.screen, 'Home')
+    assert.strictEqual(snapshotPayload.semantic.confidence, 0.8)
     console.log('server response-shape tests passed')
   } finally {
     ;(ToolsManage as any).installAppHandler = originalInstallAppHandler
@@ -193,6 +220,7 @@ async function run() {
     ;(ToolsObserve as any).captureScreenshotHandler = originalCaptureScreenshotHandler
     ;(ToolsObserve as any).getUITreeHandler = originalGetUITreeHandler
     ;(ToolsObserve as any).getScreenFingerprintHandler = originalGetScreenFingerprintHandler
+    ;(ToolsObserve as any).captureDebugSnapshotHandler = originalCaptureDebugSnapshotHandler
   }
 }