npm - mobile-debug-mcp - Versions diffs - 0.26.5 → 0.27.0 - Mend

mobile-debug-mcp 0.26.5 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

package/dist/interact/index.js +352 -185
package/dist/server/common.js +39 -0
package/dist/server-core.js +1 -1
package/docs/CHANGELOG.md +3 -0
package/docs/ROADMAP.md +109 -11
package/docs/rfcs/010-verification-stabilization-and-temporal-convergence.md +265 -0
package/docs/rfcs/011-recovery-and-replanning-for-failed-or-ambiguous-interaction-flows.md +321 -0
package/docs/rfcs/011.1-recovery-contract-types-and-runtime-wiring-spec.md +253 -0
package/docs/rfcs/012.md +203 -0
package/docs/specs/mcp-tooling-spec-v1.md +12 -0
package/docs/tools/interact.md +10 -0
package/package.json +1 -1
package/src/interact/index.ts +393 -186
package/src/server/common.ts +44 -1
package/src/server-core.ts +1 -1
package/src/types.ts +36 -0
package/test/unit/interact/adjust_control.test.ts +77 -1
package/test/unit/interact/verification_stabilization.test.ts +94 -0
package/test/unit/server/common.test.ts +36 -1

package/dist/server/common.js CHANGED Viewed

@@ -1,4 +1,6 @@
 import { ToolsObserve } from '../observe/index.js';
+export const DEFAULT_MAX_RECOVERY_ATTEMPTS = 3;
+export const DEFAULT_MAX_RETRY_DEPTH = 3;
 export function wrapResponse(data) {
     return {
         content: [{
@@ -86,6 +88,8 @@ export function normalizeResolvedTarget(value = null) {
 export function inferGenericFailure(message) {
     if (message && /timeout/i.test(message))
         return { failureCode: 'TIMEOUT', retryable: true };
+    if (message && /semantic mismatch/i.test(message))
+        return { failureCode: 'SEMANTIC_MISMATCH', retryable: false };
     return { failureCode: 'UNKNOWN', retryable: false };
 }
 export function inferScrollFailure(message) {
@@ -106,6 +110,40 @@ export function determineActionLifecycleState({ success, failure }) {
         return ACTION_LIFECYCLE_STATE_BY_OUTCOME.success;
     return ACTION_LIFECYCLE_STATE_BY_OUTCOME.success;
 }
+function mapFailureCodeToFailureClass(code) {
+    switch (code) {
+        case 'ELEMENT_NOT_FOUND':
+        case 'AMBIGUOUS_TARGET':
+        case 'STALE_REFERENCE':
+            return 'TargetResolutionFailure';
+        case 'ELEMENT_NOT_INTERACTABLE':
+            return 'ExecutionFailure';
+        case 'TIMEOUT':
+        case 'ACTION_REJECTED':
+        case 'NAVIGATION_NO_CHANGE':
+        case 'UNKNOWN':
+            return 'ExecutionFailure';
+        case 'VERIFICATION_FAILED':
+        case 'EXPECT_STATE_MISMATCH':
+            return 'VerificationFailure';
+        case 'CONTROL_CONVERGENCE_FAILED':
+            return 'ControlConvergenceFailure';
+        case 'SEMANTIC_MISMATCH':
+            return 'SemanticMismatchFailure';
+    }
+}
+function buildRecoveryState(failureCode, retryable) {
+    return {
+        failure_class: mapFailureCodeToFailureClass(failureCode),
+        runtime_code: failureCode,
+        recovery_attempts: 0,
+        max_recovery_attempts: DEFAULT_MAX_RECOVERY_ATTEMPTS,
+        retry_depth: 0,
+        max_retry_depth: DEFAULT_MAX_RETRY_DEPTH,
+        is_terminal: false,
+        retry_allowed: retryable
+    };
+}
 export function buildActionExecutionResult({ actionType, device, selector, resolved, success, uiFingerprintBefore, uiFingerprintAfter, failure, details, sourceModule }) {
     const timestampMs = Date.now();
     const timestamp = new Date(timestampMs).toISOString();
@@ -122,6 +160,7 @@ export function buildActionExecutionResult({ actionType, device, selector, resol
         },
         success,
         ...(failure ? { failure_code: failure.failureCode, retryable: failure.retryable } : {}),
+        ...(failure ? { recovery: buildRecoveryState(failure.failureCode, failure.retryable) } : {}),
         ui_fingerprint_before: uiFingerprintBefore,
         ui_fingerprint_after: uiFingerprintAfter,
         ...(details ? { details } : {})

package/dist/server-core.js CHANGED Viewed

@@ -6,7 +6,7 @@ import { handleToolCall } from './server/tool-handlers.js';
 export { wrapResponse, toolDefinitions, handleToolCall };
 export const serverInfo = {
     name: 'mobile-debug-mcp',
-    version: '0.26.5'
+    version: '0.27.0'
 };
 export function createServer() {
     const server = new Server(serverInfo, {

package/docs/CHANGELOG.md CHANGED Viewed

@@ -2,6 +2,9 @@
 All notable changes to the **Mobile Debug MCP** project will be documented in this file.
+## [0.27.0]
+- Defines a structured recovery and replanning model for UI interaction failures, enabling the system to respond to execution uncertainty with bounded, deterministic recovery strategies.
 ## [0.26.5]
 - Introduces a semantic control model to improve the identification and interaction with custom and composite UI controls.

package/docs/ROADMAP.md CHANGED Viewed

@@ -50,11 +50,14 @@ Higher task success with fewer retries.
 - Wait and Synchronization Reliability
 - Actionability Resolution
+- Verification Stabilization and Temporal Convergence
 ## Upcoming Work
+- Adjustable Control Precision Hardening
 - Environment Auto-Configuration and Toolchain Discovery
 - Adjustable Control Support
+- Verification Stabilization and Temporal Convergence
 - Signal-Oriented Diagnostic Filtering
 - Long Press Gesture
 # Stronger State Verification
@@ -243,6 +246,53 @@ Blocks or strengthens:
 ---
+# Verification Stabilization and Temporal Convergence
+## Rationale
+Real-world feedback exposed false-negative readiness failures caused by transient UI timing, even when the target state had actually converged.
+**Status:** Planned
+Addresses friction where agents:
+- fail readiness checks on transient timing races
+- act on stale snapshots
+- misclassify eventual success as timeout failure
+- encounter lag between UI convergence and verification success
+## Scope
+- Bounded recheck before readiness failure
+- Temporal debounce for transient state mismatches
+- Verify-until-stable semantics for readiness checks
+- Stability confirmation windows
+- Snapshot freshness and convergence heuristics
+## Expected Impact
+Very high.
+## Exit Criteria
+- False-negative readiness failures materially reduced
+- Stability confirmation logic implemented
+- Benchmark async flows validate improved convergence detection
+- Verification timing behavior documented in guardrails
+## Success Metrics
+- Higher first-pass verification success
+- Lower false timeout failures
+- Higher wait success rate
+- Fewer retries caused by premature failure classification
+## Dependencies
+Depends on:
+- Stronger State Verification
+- Wait and Synchronization Reliability
+Strengthens:
+- Actionability Resolution
+- Adjustable Control Support
+- Recovery and replanning readiness
+---
 # Actionability Resolution
 ## Rationale
@@ -338,6 +388,48 @@ Blocks or strengthens:
 ---
+# Adjustable Control Precision Hardening
+## Rationale
+Post-implementation feedback shows semantics exist, but fine-grained adjustable targeting and convergence still need hardening.
+**Status:** Planned
+Addresses friction around:
+- slider thumb targeting precision
+- tap vs drag adjustment strategy selection
+- snapping and quantized convergence behavior
+- repeated adjustment retries before landing on target value
+## Scope
+- Fine-grained slider targeting refinement
+- Drag vs tap adjustment strategy heuristics
+- Improved value snapping convergence
+- Control-specific adjustment fallback policies
+## Expected Impact
+High.
+## Exit Criteria
+- Benchmark slider flows reach target values with fewer retries
+- Adjustment strategy selection validated across representative controls
+- Reduced repeated-tap convergence failures
+## Success Metrics
+- Fewer retries for adjustable controls
+- Higher first-attempt target value success
+- Reduced control convergence failures
+## Dependencies
+Depends on:
+- Adjustable Control Support
+- Better Compose / Custom Control Semantics
+Strengthens:
+- Recovery readiness
+---
 # Signal-Oriented Diagnostic Filtering
 ## Rationale
@@ -574,11 +666,13 @@ Synchronization & Actionability
 Control Precision & Observability
 - Adjustable Control Support
+- Adjustable Control Precision Hardening
+- Better Compose / Custom Control Semantics
 - Signal-Oriented Diagnostic Filtering
+- Verification Stabilization and Temporal Convergence
 Interaction Expansion
 - Long Press Gesture
-- Better Compose / Custom Control Semantics
 - Pinch to Zoom
 Deep Observability
@@ -598,11 +692,13 @@ Make core loop reliable and reduce onboarding friction.
 ## Wave 2 (Control Precision + Diagnostics)
 - Adjustable Control Support
+- Adjustable Control Precision Hardening
 - Better Compose / Custom Control Semantics
 - Signal-Oriented Diagnostic Filtering
+- Verification Stabilization and Temporal Convergence
 Focus:
-Improve control precision, custom control semantics, and signal observability.
+Improve control precision, verification convergence, custom control reliability, and signal observability.
 ---
@@ -629,14 +725,16 @@ Roadmap Ordering:
 1. Stronger State Verification
 2. Richer Element Identity
 3. Wait and Synchronization Reliability
-4. Environment Auto-Configuration and Toolchain Discovery
-5. Actionability Resolution
-6. Adjustable Control Support
-7. Better Compose / Custom Control Semantics
-8. Signal-Oriented Diagnostic Filtering
-9. Long Press Gesture
-10. Pinch to Zoom
-11. Action Trace Correlation
+4. Verification Stabilization and Temporal Convergence
+5. Environment Auto-Configuration and Toolchain Discovery
+6. Actionability Resolution
+7. Adjustable Control Support
+8. Adjustable Control Precision Hardening
+9. Better Compose / Custom Control Semantics
+10. Signal-Oriented Diagnostic Filtering
+11. Long Press Gesture
+12. Pinch to Zoom
+13. Action Trace Correlation
 Rationale:
 - Early roadmap items harden state, targeting, synchronization, environment readiness, and action execution.
@@ -649,7 +747,7 @@ Rationale:
 ## Future Considerations
 Still out of scope:
-- Recovery planning logic
+- Full autonomous recovery planning (deferred until after verification stabilization)
 - Autonomous retry strategy
 - MCP-level agent orchestration
 - Autonomous recovery hinting (future consideration only)

package/docs/rfcs/010-verification-stabilization-and-temporal-convergence.md ADDED Viewed

@@ -0,0 +1,265 @@
+# RFC 010 — Verification Stabilization and Temporal Convergence
+## 1. Summary
+This RFC defines a verification stabilization layer that ensures UI state transitions are not misclassified due to timing instability, transient UI states, or stale snapshots.
+It introduces temporal semantics into verification so that readiness and state checks are based on convergence over time, not a single snapshot.
+---
+## 2. Problem Statement
+Current verification behavior is snapshot-based and may produce false-negative failures when UI state is in transition.
+Observed issues include:
+- readiness checks timing out even though UI converges shortly after
+- stale snapshots being treated as authoritative state
+- transient UI states causing premature failure classification
+- mismatch between UI convergence and verification success
+These issues lead to unnecessary retries, incorrect failure classification, and degraded automation reliability.
+---
+## 3. Goals
+This RFC introduces a temporal verification model that MUST:
+- reduce false-negative readiness failures
+- ensure verification reflects stable UI convergence
+- introduce bounded recheck before failure
+- debounce transient mismatches
+- maintain deterministic verification behavior
+---
+## 4. Non-Goals
+This RFC does NOT define:
+- recovery or replanning strategies (covered by a later RFC)
+- probabilistic verification
+- ML-based state inference
+- changes to action execution semantics
+Verification remains deterministic and grounded in observable UI state.
+---
+## 5. Runtime Ownership and Integration
+This RFC applies to existing verification surfaces:
+- expect_* handlers (e.g. expect_state)
+- readiness checks in wait_for_ui_element
+- post-action verification in src/interact
+It augments these surfaces with temporal semantics; it does not replace them.
+### 5.1 Ownership and Composition with Existing Logic
+This RFC refines existing behavior rather than introducing a parallel mechanism.
+- `wait_for_ui_element` (and underlying `waitForUICore`) owns **readiness stabilization**.
+- `expect_*` handlers (e.g. `expect_state`) own **state verification stabilization**.
+- `src/interact` owns **post-action verification application** of these rules.
+Composition rules:
+- `wait_for_ui_element` MUST apply stabilization for presence/readiness before returning success or failure.
+- `expect_*` MUST apply stabilization for state/value assertions.
+- If both are used in sequence, `wait_for_ui_element` completes first, then `expect_*` applies its own stabilization.
+- Stabilization MUST NOT be duplicated across layers for the same check.
+---
+## 6. Temporal Verification Model
+Verification MUST consider state over time, not a single observation.
+### 6.1 Stabilization Window
+Verification SHOULD use a bounded observation window before declaring failure.
+Within this window:
+- multiple UI reads MAY be performed
+- transient mismatches MUST NOT immediately trigger failure
+### 6.2 Verify-Until-Stable
+Verification SHOULD require state to be stable across consecutive observations before success is confirmed.
+Example:
+- state must match expected condition for N consecutive reads
+### 6.3 Debounce Semantics
+Transient mismatches SHOULD be debounced.
+Short-lived mismatches within the stabilization window MUST NOT be treated as terminal failure.
+### 6.4 Deterministic Defaults (Required)
+Implementations MUST use bounded defaults unless explicitly overridden:
+- `stabilization_window_ms`: 1000ms (range: 500–1500ms)
+- `stable_observation_count`: 2 consecutive matching reads
+- `max_recheck_attempts`: 3
+- `min_read_interval_ms`: 100–200ms between reads
+These values MUST be configurable but bounded to prevent unbounded waits.
+---
+## 6.5 Reference Stabilization Algorithm
+For a given verification predicate `P(snapshot)`:
+1. Start timer `t0`.
+2. Initialize `stable_count = 0`, `attempts = 0`.
+3. Loop until `now - t0 > stabilization_window_ms` OR `stable_count >= stable_observation_count`:
+   - Read fresh snapshot `S`.
+   - If `P(S)` is true:
+       - `stable_count += 1`
+     Else:
+       - `stable_count = 0`
+   - `attempts += 1`
+   - Sleep `min_read_interval_ms`.
+4. If `stable_count >= stable_observation_count`: SUCCESS
+5. Else if `attempts < max_recheck_attempts`:
+   - Perform one additional fresh read and re-evaluate once.
+6. Else: FAILURE
+Notes:
+- Implementations MUST ensure at least one fresh read occurs before failure.
+- Debounce is achieved via resetting `stable_count` on mismatch.
+---
+## 7. Snapshot Freshness
+Verification MUST account for snapshot freshness.
+### 7.1 Freshness Constraints
+- snapshots older than `snapshot_stale_threshold_ms` MUST be considered stale (default: 500ms)
+- stale snapshots MUST NOT be used as final verification evidence and MUST trigger a fresh read
+### 7.2 Re-read Requirement
+Before declaring failure, the system MUST attempt at least one fresh UI read within the stabilization window.
+### 7.3 Freshness Defaults
+- `snapshot_stale_threshold_ms`: 500ms (range: 300–800ms)
+---
+## 8. Runtime Failure Code Mapping
+Existing runtime failure signals MUST map into RFC 010 failure categories.
+| Runtime Code | RFC 010 Category |
+|--------------|------------------|
+| ELEMENT_NOT_FOUND | Target Resolution Failure |
+| STALE_REFERENCE | Target Resolution Failure |
+| AMBIGUOUS_TARGET | Target Resolution Failure |
+| TIMEOUT | Execution Failure |
+| ACTION_REJECTED | Execution Failure |
+| ELEMENT_NOT_INTERACTABLE | Execution Failure |
+| NAVIGATION_NO_CHANGE | Execution Failure |
+| VERIFICATION_FAILED | Verification Failure |
+| EXPECT_STATE_MISMATCH | Verification Failure |
+| CONTROL_CONVERGENCE_FAILED | Control Convergence Failure |
+| SEMANTIC_MISMATCH | Semantic Mismatch Failure |
+| UNKNOWN | Execution Failure (default fallback) |
+This mapping MUST be deterministic, exhaustive, and versioned with the runtime.
+### 8.1 Failure Gating Rules
+Failure MUST only be emitted when:
+- stabilization window is exhausted
+- fresh snapshot verification still fails
+Transient mismatches SHOULD NOT be classified as:
+- TIMEOUT
+- VERIFICATION_FAILED
+until stabilization logic has completed.
+- FAILURE MUST NOT be emitted if `stable_observation_count` has not been attempted within the stabilization window.
+- FAILURE MUST NOT be emitted without at least one fresh read within `snapshot_stale_threshold_ms`.
+- TIMEOUT MUST correspond to exhaustion of `stabilization_window_ms`, not a single read failure.
+---
+## 9. Integration with RFC 005 (Verification Correctness)
+RFC 005 defines what correctness means.
+RFC 010 defines when correctness can be confidently evaluated.
+RFC 010 augments RFC 005 by introducing temporal convergence requirements before asserting success or failure.
+---
+## 10. Integration with RFC 006 (Execution Layer)
+Post-action verification in src/interact MUST apply stabilization logic before returning failure.
+Execution MUST NOT prematurely surface verification failure without applying temporal checks defined in this RFC.
+`src/interact` MUST wrap post-action verification with the reference stabilization algorithm. It MUST pass through configuration (window, counts) and MUST NOT short-circuit on first mismatch.
+---
+## 11. Integration with RFC 011.1 (Recovery Contract)
+Verification stabilization reduces the false-negative failure signals that would otherwise trigger the downstream recovery mechanisms defined in RFC 011.1.
+---
+## 13. Output Behavior (Progressive Extension)
+Future implementations MAY expose additional metadata such as:
+```ts
+interface VerificationMetadata {
+  stabilization_attempts?: number;
+  stabilization_window_ms?: number;
+  stable_observation_count?: number;
+  snapshot_freshness_ms?: number;
+}
+```
+These fields are optional and for observability only.
+---
+## 14. Failure Modes
+Verification stabilization MAY fail due to:
+- UI never converging to expected state
+- repeated oscillation of UI state
+- persistent stale snapshot conditions
+In these cases, failure MUST be emitted after the stabilization window is exhausted.
+---
+## 15. Success Metrics
+- reduced false-negative readiness failures
+- higher first-pass verification success
+- lower premature timeout rates
+- improved reliability of wait and readiness checks
+---
+## 16. Summary
+This RFC introduces temporal stabilization into verification, ensuring that UI state is evaluated based on convergence over time rather than single snapshots. It improves reliability by eliminating transient mismatches and stale-state errors without introducing probabilistic behavior.