mobile-debug-mcp 0.26.5 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,6 @@
1
1
  import { ToolsObserve } from '../observe/index.js';
2
+ export const DEFAULT_MAX_RECOVERY_ATTEMPTS = 3;
3
+ export const DEFAULT_MAX_RETRY_DEPTH = 3;
2
4
  export function wrapResponse(data) {
3
5
  return {
4
6
  content: [{
@@ -86,6 +88,8 @@ export function normalizeResolvedTarget(value = null) {
86
88
  export function inferGenericFailure(message) {
87
89
  if (message && /timeout/i.test(message))
88
90
  return { failureCode: 'TIMEOUT', retryable: true };
91
+ if (message && /semantic mismatch/i.test(message))
92
+ return { failureCode: 'SEMANTIC_MISMATCH', retryable: false };
89
93
  return { failureCode: 'UNKNOWN', retryable: false };
90
94
  }
91
95
  export function inferScrollFailure(message) {
@@ -106,6 +110,40 @@ export function determineActionLifecycleState({ success, failure }) {
106
110
  return ACTION_LIFECYCLE_STATE_BY_OUTCOME.success;
107
111
  return ACTION_LIFECYCLE_STATE_BY_OUTCOME.success;
108
112
  }
113
+ function mapFailureCodeToFailureClass(code) {
114
+ switch (code) {
115
+ case 'ELEMENT_NOT_FOUND':
116
+ case 'AMBIGUOUS_TARGET':
117
+ case 'STALE_REFERENCE':
118
+ return 'TargetResolutionFailure';
119
+ case 'ELEMENT_NOT_INTERACTABLE':
120
+ return 'ExecutionFailure';
121
+ case 'TIMEOUT':
122
+ case 'ACTION_REJECTED':
123
+ case 'NAVIGATION_NO_CHANGE':
124
+ case 'UNKNOWN':
125
+ return 'ExecutionFailure';
126
+ case 'VERIFICATION_FAILED':
127
+ case 'EXPECT_STATE_MISMATCH':
128
+ return 'VerificationFailure';
129
+ case 'CONTROL_CONVERGENCE_FAILED':
130
+ return 'ControlConvergenceFailure';
131
+ case 'SEMANTIC_MISMATCH':
132
+ return 'SemanticMismatchFailure';
133
+ }
134
+ }
135
+ function buildRecoveryState(failureCode, retryable) {
136
+ return {
137
+ failure_class: mapFailureCodeToFailureClass(failureCode),
138
+ runtime_code: failureCode,
139
+ recovery_attempts: 0,
140
+ max_recovery_attempts: DEFAULT_MAX_RECOVERY_ATTEMPTS,
141
+ retry_depth: 0,
142
+ max_retry_depth: DEFAULT_MAX_RETRY_DEPTH,
143
+ is_terminal: false,
144
+ retry_allowed: retryable
145
+ };
146
+ }
109
147
  export function buildActionExecutionResult({ actionType, device, selector, resolved, success, uiFingerprintBefore, uiFingerprintAfter, failure, details, sourceModule }) {
110
148
  const timestampMs = Date.now();
111
149
  const timestamp = new Date(timestampMs).toISOString();
@@ -122,6 +160,7 @@ export function buildActionExecutionResult({ actionType, device, selector, resol
122
160
  },
123
161
  success,
124
162
  ...(failure ? { failure_code: failure.failureCode, retryable: failure.retryable } : {}),
163
+ ...(failure ? { recovery: buildRecoveryState(failure.failureCode, failure.retryable) } : {}),
125
164
  ui_fingerprint_before: uiFingerprintBefore,
126
165
  ui_fingerprint_after: uiFingerprintAfter,
127
166
  ...(details ? { details } : {})
@@ -6,7 +6,7 @@ import { handleToolCall } from './server/tool-handlers.js';
6
6
  export { wrapResponse, toolDefinitions, handleToolCall };
7
7
  export const serverInfo = {
8
8
  name: 'mobile-debug-mcp',
9
- version: '0.26.5'
9
+ version: '0.27.0'
10
10
  };
11
11
  export function createServer() {
12
12
  const server = new Server(serverInfo, {
package/docs/CHANGELOG.md CHANGED
@@ -2,6 +2,9 @@
2
2
 
3
3
  All notable changes to the **Mobile Debug MCP** project will be documented in this file.
4
4
 
5
+ ## [0.27.0]
6
+ - Defines a structured recovery and replanning model for UI interaction failures, enabling the system to respond to execution uncertainty with bounded, deterministic recovery strategies.
7
+
5
8
  ## [0.26.5]
6
9
  - Introduces a semantic control model to improve the identification and interaction with custom and composite UI controls.
7
10
 
package/docs/ROADMAP.md CHANGED
@@ -50,11 +50,14 @@ Higher task success with fewer retries.
50
50
 
51
51
  - Wait and Synchronization Reliability
52
52
  - Actionability Resolution
53
+ - Verification Stabilization and Temporal Convergence
53
54
 
54
55
  ## Upcoming Work
55
56
 
57
+ - Adjustable Control Precision Hardening
56
58
  - Environment Auto-Configuration and Toolchain Discovery
57
59
  - Adjustable Control Support
60
+ - Verification Stabilization and Temporal Convergence
58
61
  - Signal-Oriented Diagnostic Filtering
59
62
  - Long Press Gesture
60
63
  # Stronger State Verification
@@ -243,6 +246,53 @@ Blocks or strengthens:
243
246
 
244
247
  ---
245
248
 
249
+ # Verification Stabilization and Temporal Convergence
250
+
251
+ ## Rationale
252
+ Real-world feedback exposed false-negative readiness failures caused by transient UI timing, even when the target state had actually converged.
253
+
254
+ **Status:** Planned
255
+
256
+ Addresses friction where agents:
257
+ - fail readiness checks on transient timing races
258
+ - act on stale snapshots
259
+ - misclassify eventual success as timeout failure
260
+ - encounter lag between UI convergence and verification success
261
+
262
+ ## Scope
263
+ - Bounded recheck before readiness failure
264
+ - Temporal debounce for transient state mismatches
265
+ - Verify-until-stable semantics for readiness checks
266
+ - Stability confirmation windows
267
+ - Snapshot freshness and convergence heuristics
268
+
269
+ ## Expected Impact
270
+ Very high.
271
+
272
+ ## Exit Criteria
273
+ - False-negative readiness failures materially reduced
274
+ - Stability confirmation logic implemented
275
+ - Benchmark async flows validate improved convergence detection
276
+ - Verification timing behavior documented in guardrails
277
+
278
+ ## Success Metrics
279
+ - Higher first-pass verification success
280
+ - Lower false timeout failures
281
+ - Higher wait success rate
282
+ - Fewer retries caused by premature failure classification
283
+
284
+ ## Dependencies
285
+ Depends on:
286
+ - Stronger State Verification
287
+ - Wait and Synchronization Reliability
288
+
289
+ Strengthens:
290
+ - Actionability Resolution
291
+ - Adjustable Control Support
292
+ - Recovery and replanning readiness
293
+
294
+ ---
295
+
246
296
  # Actionability Resolution
247
297
 
248
298
  ## Rationale
@@ -338,6 +388,48 @@ Blocks or strengthens:
338
388
 
339
389
  ---
340
390
 
391
+ # Adjustable Control Precision Hardening
392
+
393
+ ## Rationale
394
+ Post-implementation feedback shows that control semantics now exist, but fine-grained targeting and value convergence for adjustable controls still need hardening.
395
+
396
+ **Status:** Planned
397
+
398
+ Addresses friction around:
399
+ - slider thumb targeting precision
400
+ - tap vs drag adjustment strategy selection
401
+ - snapping and quantized convergence behavior
402
+ - repeated adjustment retries before landing on target value
403
+
404
+ ## Scope
405
+ - Fine-grained slider targeting refinement
406
+ - Drag vs tap adjustment strategy heuristics
407
+ - Improved value snapping convergence
408
+ - Control-specific adjustment fallback policies
409
+
410
+ ## Expected Impact
411
+ High.
412
+
413
+ ## Exit Criteria
414
+ - Benchmark slider flows reach target values with fewer retries
415
+ - Adjustment strategy selection validated across representative controls
416
+ - Reduced repeated-tap convergence failures
417
+
418
+ ## Success Metrics
419
+ - Fewer retries for adjustable controls
420
+ - Higher first-attempt target value success
421
+ - Reduced control convergence failures
422
+
423
+ ## Dependencies
424
+ Depends on:
425
+ - Adjustable Control Support
426
+ - Better Compose / Custom Control Semantics
427
+
428
+ Strengthens:
429
+ - Recovery readiness
430
+
431
+ ---
432
+
341
433
  # Signal-Oriented Diagnostic Filtering
342
434
 
343
435
  ## Rationale
@@ -574,11 +666,13 @@ Synchronization & Actionability
574
666
 
575
667
  Control Precision & Observability
576
668
  - Adjustable Control Support
669
+ - Adjustable Control Precision Hardening
670
+ - Better Compose / Custom Control Semantics
577
671
  - Signal-Oriented Diagnostic Filtering
672
+ - Verification Stabilization and Temporal Convergence
578
673
 
579
674
  Interaction Expansion
580
675
  - Long Press Gesture
581
- - Better Compose / Custom Control Semantics
582
676
  - Pinch to Zoom
583
677
 
584
678
  Deep Observability
@@ -598,11 +692,13 @@ Make core loop reliable and reduce onboarding friction.
598
692
 
599
693
  ## Wave 2 (Control Precision + Diagnostics)
600
694
  - Adjustable Control Support
695
+ - Adjustable Control Precision Hardening
601
696
  - Better Compose / Custom Control Semantics
602
697
  - Signal-Oriented Diagnostic Filtering
698
+ - Verification Stabilization and Temporal Convergence
603
699
 
604
700
  Focus:
605
- Improve control precision, custom control semantics, and signal observability.
701
+ Improve control precision, verification convergence, custom control reliability, and signal observability.
606
702
 
607
703
  ---
608
704
 
@@ -629,14 +725,16 @@ Roadmap Ordering:
629
725
  1. Stronger State Verification
630
726
  2. Richer Element Identity
631
727
  3. Wait and Synchronization Reliability
632
- 4. Environment Auto-Configuration and Toolchain Discovery
633
- 5. Actionability Resolution
634
- 6. Adjustable Control Support
635
- 7. Better Compose / Custom Control Semantics
636
- 8. Signal-Oriented Diagnostic Filtering
637
- 9. Long Press Gesture
638
- 10. Pinch to Zoom
639
- 11. Action Trace Correlation
728
+ 4. Verification Stabilization and Temporal Convergence
729
+ 5. Environment Auto-Configuration and Toolchain Discovery
730
+ 6. Actionability Resolution
731
+ 7. Adjustable Control Support
732
+ 8. Adjustable Control Precision Hardening
733
+ 9. Better Compose / Custom Control Semantics
734
+ 10. Signal-Oriented Diagnostic Filtering
735
+ 11. Long Press Gesture
736
+ 12. Pinch to Zoom
737
+ 13. Action Trace Correlation
640
738
 
641
739
  Rationale:
642
740
  - Early roadmap items harden state, targeting, synchronization, environment readiness, and action execution.
@@ -649,7 +747,7 @@ Rationale:
649
747
  ## Future Considerations
650
748
  Still out of scope:
651
749
 
652
- - Recovery planning logic
750
+ - Full autonomous recovery planning (deferred until after verification stabilization)
653
751
  - Autonomous retry strategy
654
752
  - MCP-level agent orchestration
655
753
  - Autonomous recovery hinting (future consideration only)
@@ -0,0 +1,265 @@
1
+
2
+
3
+ # RFC 010 — Verification Stabilization and Temporal Convergence
4
+
5
+ ## 1. Summary
6
+
7
+ This RFC defines a verification stabilization layer that ensures UI state transitions are not misclassified due to timing instability, transient UI states, or stale snapshots.
8
+
9
+ It introduces temporal semantics into verification so that readiness and state checks are based on convergence over time, not a single snapshot.
10
+
11
+ ---
12
+
13
+ ## 2. Problem Statement
14
+
15
+ Current verification behavior is snapshot-based and may produce false-negative failures when UI state is in transition.
16
+
17
+ Observed issues include:
18
+
19
+ - readiness checks timing out even though UI converges shortly after
20
+ - stale snapshots being treated as authoritative state
21
+ - transient UI states causing premature failure classification
22
+ - mismatch between UI convergence and verification success
23
+
24
+ These issues lead to unnecessary retries, incorrect failure classification, and degraded automation reliability.
25
+
26
+ ---
27
+
28
+ ## 3. Goals
29
+
30
+ This RFC introduces a temporal verification model that MUST:
31
+
32
+ - reduce false-negative readiness failures
33
+ - ensure verification reflects stable UI convergence
34
+ - introduce bounded recheck before failure
35
+ - debounce transient mismatches
36
+ - maintain deterministic verification behavior
37
+
38
+ ---
39
+
40
+ ## 4. Non-Goals
41
+
42
+ This RFC does NOT define:
43
+
44
+ - recovery or replanning strategies (covered by a later RFC)
45
+ - probabilistic verification
46
+ - ML-based state inference
47
+ - changes to action execution semantics
48
+
49
+ Verification remains deterministic and grounded in observable UI state.
50
+
51
+ ---
52
+
53
+ ## 5. Runtime Ownership and Integration
54
+
55
+ This RFC applies to existing verification surfaces:
56
+
57
+ - expect_* handlers (e.g. expect_state)
58
+ - readiness checks in wait_for_ui_element
59
+ - post-action verification in src/interact
60
+
61
+ It augments these surfaces with temporal semantics; it does not replace them.
62
+
63
+ ### 5.1 Ownership and Composition with Existing Logic
64
+
65
+ This RFC refines existing behavior rather than introducing a parallel mechanism.
66
+
67
+ - `wait_for_ui_element` (and underlying `waitForUICore`) owns **readiness stabilization**.
68
+ - `expect_*` handlers (e.g. `expect_state`) own **state verification stabilization**.
69
+ - `src/interact` owns **post-action verification application** of these rules.
70
+
71
+ Composition rules:
72
+ - `wait_for_ui_element` MUST apply stabilization for presence/readiness before returning success or failure.
73
+ - `expect_*` MUST apply stabilization for state/value assertions.
74
+ - If both are used in sequence, `wait_for_ui_element` completes first, then `expect_*` applies its own stabilization.
75
+ - Stabilization MUST NOT be duplicated across layers for the same check.
76
+
77
+ ---
78
+
79
+ ## 6. Temporal Verification Model
80
+
81
+ Verification MUST consider state over time, not a single observation.
82
+
83
+ ### 6.1 Stabilization Window
84
+
85
+ Verification SHOULD use a bounded observation window before declaring failure.
86
+
87
+ Within this window:
88
+ - multiple UI reads MAY be performed
89
+ - transient mismatches MUST NOT immediately trigger failure
90
+
91
+ ### 6.2 Verify-Until-Stable
92
+
93
+ Verification SHOULD require state to be stable across consecutive observations before success is confirmed.
94
+
95
+ Example:
96
+ - state must match expected condition for N consecutive reads
97
+
98
+ ### 6.3 Debounce Semantics
99
+
100
+ Transient mismatches SHOULD be debounced.
101
+
102
+ Short-lived mismatches within the stabilization window MUST NOT be treated as terminal failure.
103
+
104
+ ### 6.4 Deterministic Defaults (Required)
105
+
106
+ Implementations MUST use bounded defaults unless explicitly overridden:
107
+
108
+ - `stabilization_window_ms`: 1000ms (range: 500–1500ms)
109
+ - `stable_observation_count`: 2 consecutive matching reads
110
+ - `max_recheck_attempts`: 3
111
+ - `min_read_interval_ms`: 100–200ms between reads
112
+
113
+ These values MUST be configurable but bounded to prevent unbounded waits.
114
+
115
+ ---
116
+
117
+ ### 6.5 Reference Stabilization Algorithm
118
+
119
+ For a given verification predicate `P(snapshot)`:
120
+
121
+ 1. Start timer `t0`.
122
+ 2. Initialize `stable_count = 0`, `attempts = 0`.
123
+ 3. Loop until `now - t0 > stabilization_window_ms` OR `stable_count >= stable_observation_count`:
124
+ - Read fresh snapshot `S`.
125
+ - If `P(S)` is true:
126
+ - `stable_count += 1`
127
+ Else:
128
+ - `stable_count = 0`
129
+ - `attempts += 1`
130
+ - Sleep `min_read_interval_ms`.
131
+ 4. If `stable_count >= stable_observation_count`: SUCCESS
132
+ 5. Else if `attempts < max_recheck_attempts`:
133
+ - Perform one additional fresh read and re-evaluate once.
134
+ 6. Else: FAILURE
135
+
136
+ Notes:
137
+ - Implementations MUST ensure at least one fresh read occurs before failure.
138
+ - Debounce is achieved via resetting `stable_count` on mismatch.
139
+
140
+ ---
141
+
142
+ ## 7. Snapshot Freshness
143
+
144
+ Verification MUST account for snapshot freshness.
145
+
146
+ ### 7.1 Freshness Constraints
147
+
148
+ - snapshots older than `snapshot_stale_threshold_ms` MUST be considered stale (default: 500ms)
149
+ - stale snapshots MUST NOT be used as final verification evidence and MUST trigger a fresh read
150
+
151
+ ### 7.2 Re-read Requirement
152
+
153
+ Before declaring failure, the system MUST attempt at least one fresh UI read within the stabilization window.
154
+
155
+ ### 7.3 Freshness Defaults
156
+
157
+ - `snapshot_stale_threshold_ms`: 500ms (range: 300–800ms)
158
+
159
+ ---
160
+
161
+ ## 8. Runtime Failure Code Mapping
162
+
163
+ Existing runtime failure signals MUST map into RFC 010 failure categories.
164
+
165
+ | Runtime Code | RFC 010 Category |
166
+ |--------------|------------------|
167
+ | ELEMENT_NOT_FOUND | Target Resolution Failure |
168
+ | STALE_REFERENCE | Target Resolution Failure |
169
+ | AMBIGUOUS_TARGET | Target Resolution Failure |
170
+ | TIMEOUT | Execution Failure |
171
+ | ACTION_REJECTED | Execution Failure |
172
+ | VERIFICATION_FAILED | Verification Failure |
173
+ | EXPECT_STATE_MISMATCH | Verification Failure |
174
+ | CONTROL_CONVERGENCE_FAILED | Control Convergence Failure |
175
+ | SEMANTIC_MISMATCH | Semantic Mismatch Failure |
176
+ | UNKNOWN | Execution Failure (default fallback) |
177
+
178
+ This mapping MUST be deterministic, exhaustive, and versioned with the runtime.
179
+
180
+ ### 8.1 Failure Gating Rules
181
+
182
+ Failure MUST only be emitted when:
183
+
184
+ - stabilization window is exhausted
185
+ - fresh snapshot verification still fails
186
+
187
+ Transient mismatches SHOULD NOT be classified as:
188
+ - TIMEOUT
189
+ - VERIFICATION_FAILED
190
+
191
+ until stabilization logic has completed.
192
+
193
+ - FAILURE MUST NOT be emitted unless confirmation of `stable_observation_count` consecutive matching reads has been attempted within the stabilization window.
194
+ - FAILURE MUST NOT be emitted without at least one fresh read within `snapshot_stale_threshold_ms`.
195
+ - TIMEOUT MUST correspond to exhaustion of `stabilization_window_ms`, not a single read failure.
196
+
197
+ ---
198
+
199
+ ## 9. Integration with RFC 005 (Verification Correctness)
200
+
201
+ RFC 005 defines what correctness means.
202
+
203
+ RFC 010 defines when correctness can be confidently evaluated.
204
+
205
+ RFC 010 augments RFC 005 by introducing temporal convergence requirements before asserting success or failure.
206
+
207
+ ---
208
+
209
+ ## 10. Integration with RFC 006 (Execution Layer)
210
+
211
+ Post-action verification in src/interact MUST apply stabilization logic before returning failure.
212
+
213
+ Execution MUST NOT prematurely surface verification failure without applying temporal checks defined in this RFC.
214
+
215
+ `src/interact` MUST wrap post-action verification with the reference stabilization algorithm. It MUST pass through configuration (window, counts) and MUST NOT short-circuit on first mismatch.
216
+
217
+ ---
218
+
219
+ ## 11. Integration with RFC 011.1 (Recovery Contract)
220
+
221
+ Verification stabilization reduces spurious failure signals (false-negative verification results) that would otherwise trigger downstream recovery mechanisms (defined in RFC 011.1).
222
+
223
+ ---
224
+
225
+ ## 13. Output Behavior (Progressive Extension)
226
+
227
+ Future implementations MAY expose additional metadata such as:
228
+
229
+ ```ts
230
+ interface VerificationMetadata {
231
+ stabilization_attempts?: number;
232
+ stabilization_window_ms?: number;
233
+ stable_observation_count?: number;
234
+ snapshot_freshness_ms?: number;
235
+ }
236
+ ```
237
+
238
+ These fields are optional and for observability only.
239
+
240
+ ---
241
+
242
+ ## 14. Failure Modes
243
+
244
+ Verification stabilization MAY fail due to:
245
+
246
+ - UI never converging to expected state
247
+ - repeated oscillation of UI state
248
+ - persistent stale snapshot conditions
249
+
250
+ In these cases, failure MUST be emitted after the stabilization window is exhausted.
251
+
252
+ ---
253
+
254
+ ## 15. Success Metrics
255
+
256
+ - reduced false-negative readiness failures
257
+ - higher first-pass verification success
258
+ - lower premature timeout rates
259
+ - improved reliability of wait and readiness checks
260
+
261
+ ---
262
+
263
+ ## 16. Summary
264
+
265
+ This RFC introduces temporal stabilization into verification, ensuring that UI state is evaluated based on convergence over time rather than single snapshots. It improves reliability by preventing transient mismatches and stale snapshots from being misclassified as failures, without introducing probabilistic behavior.