mobile-debug-mcp 0.24.8 → 0.25.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -341,6 +341,123 @@ export function getCenter(bounds) {
341
341
  const [x1, y1, x2, y2] = bounds;
342
342
  return [Math.floor((x1 + x2) / 2), Math.floor((y1 + y2) / 2)];
343
343
  }
344
/**
 * Interprets a raw attribute value as a tri-state boolean.
 *
 * @param {*} value - Raw attribute value; a boolean or the exact strings
 *   'true' / 'false' are recognised.
 * @returns {boolean|null} The parsed boolean, or null for anything else.
 */
function parseBooleanAttr(value) {
  // `switch` compares with strict equality, so only the four exact values match.
  switch (value) {
    case true:
    case 'true':
      return true;
    case false:
    case 'false':
      return false;
    default:
      return null;
  }
}
351
/**
 * Parses a raw attribute value into a finite number.
 *
 * @param {*} value - Raw attribute value; a finite number is returned as-is,
 *   a string is converted with `Number()`.
 * @returns {number|null} The finite number, or null when the value is not a
 *   number/string, is blank, or does not convert to a finite number.
 */
function parseNumberAttr(value) {
  if (typeof value === 'number') {
    return Number.isFinite(value) ? value : null;
  }
  if (typeof value !== 'string') {
    return null;
  }
  // Number('') and Number('   ') evaluate to 0, which would turn a blank
  // attribute into a real reading of zero. Treat blank strings as "absent".
  const trimmed = value.trim();
  if (trimmed === '') {
    return null;
  }
  const parsed = Number(trimmed);
  return Number.isFinite(parsed) ? parsed : null;
}
359
/**
 * Produces a trimmed, lower-cased class name for pattern matching.
 *
 * @param {*} value - Raw class name.
 * @returns {string} The normalised name, or '' when `value` is not a string.
 */
function normalizeClassName(value) {
  if (typeof value !== 'string') {
    return '';
  }
  const trimmed = value.trim();
  return trimmed.toLowerCase();
}
362
/**
 * Derives a coarse UI role from a class name by substring matching.
 *
 * The patterns contain only lower-case letters and are matched
 * case-sensitively, so the caller is expected to pass a lower-cased name.
 *
 * @param {string} className - Class name to classify.
 * @returns {string|null} One of 'slider', 'switch', 'checkbox', 'radio',
 *   'text_field', 'button', 'image', 'container', or null when nothing matches.
 */
function inferAndroidRole(className) {
  // Order matters: the first matching pattern wins (e.g. a name containing
  // both "image" and "button" is reported as 'button').
  const rolePatterns = [
    ['slider', /seekbar|slider|progress/],
    ['switch', /switch|toggle/],
    ['checkbox', /checkbox/],
    ['radio', /radiobutton|radio/],
    ['text_field', /edittext|textfield|search/],
    ['button', /button|fab/],
    ['image', /imageview|icon/],
    ['container', /recyclerview|scroll|layout|viewgroup|frame/],
  ];
  for (const [role, pattern] of rolePatterns) {
    if (pattern.test(className)) {
      return role;
    }
  }
  return null;
}
381
/**
 * Returns the confidence descriptor for a selector source.
 *
 * @param {string} source - 'resource_id', 'content_desc', 'text' or 'class'.
 * @returns {{score: number, reason: string}|null} A new descriptor object,
 *   or null when the source is not one of the known values.
 */
function buildAndroidSelectorConfidence(source) {
  // A Map (rather than a plain object) keeps inherited keys such as
  // 'constructor' from being mistaken for known sources.
  const descriptors = new Map([
    ['resource_id', [1, 'resource_id']],
    ['content_desc', [0.9, 'content_description']],
    ['text', [0.6, 'text_match']],
    ['class', [0.35, 'class_match']],
  ]);
  const entry = descriptors.get(source);
  if (entry === undefined) {
    return null;
  }
  const [score, reason] = entry;
  return { score, reason };
}
395
/**
 * Picks the preferred selector for an element together with its confidence.
 *
 * Candidates are tried in the order resource id, content description, text,
 * class name; the first truthy one is used.
 *
 * @param {*} text - Element text.
 * @param {*} contentDescription - Element content description.
 * @param {*} resourceId - Element resource id.
 * @param {*} className - Element class name.
 * @returns {{value: *, confidence: ({score: number, reason: string}|null)}|null}
 *   The chosen selector, or null when every candidate is falsy.
 */
function buildAndroidSelector(text, contentDescription, resourceId, className) {
  const candidates = [
    [resourceId, 'resource_id'],
    [contentDescription, 'content_desc'],
    [text, 'text'],
    [className, 'class'],
  ];
  const chosen = candidates.find(([candidate]) => Boolean(candidate));
  if (chosen === undefined) {
    return null;
  }
  const [value, source] = chosen;
  return { value, confidence: buildAndroidSelectorConfidence(source) };
}
406
/**
 * Builds the semantic hint object for an element.
 *
 * @param {*} clickable - Clickable flag, passed through unchanged.
 * @param {string} className - Class name; matched case-sensitively against
 *   lower-case container keywords.
 * @returns {{is_clickable: *, is_container: boolean}} The semantic hints.
 */
function buildAndroidSemantic(clickable, className) {
  const containerPattern = /recyclerview|scroll|layout|viewgroup|frame/;
  const isContainer = containerPattern.test(className);
  return { is_clickable: clickable, is_container: isContainer };
}
412
/**
 * Tells whether a node's class name looks like a slider-style control.
 *
 * @param {Object} node - Parsed hierarchy node; its '@_class' entry is read.
 * @returns {boolean} True when the lower-cased class name contains
 *   "seekbar", "slider", "range" or "progress".
 */
function isSliderLikeAndroid(node) {
  const rawClass = node['@_class'];
  // A missing or falsy class is treated as an empty name.
  const lowered = String(rawClass || '').toLowerCase();
  const sliderPattern = /seekbar|slider|range|progress/i;
  return sliderPattern.test(lowered);
}
416
/**
 * Collects the state fields of a parsed hierarchy node.
 *
 * Reads the '@_checked', '@_selected', '@_focused', '@_expanded',
 * '@_enabled', '@_text', '@_class', '@_content-desc', '@_progress', '@_max'
 * and '@_value' entries of `node`.
 *
 * @param {Object} node - Parsed hierarchy node.
 * @returns {Object|null} An object containing only the fields that could be
 *   determined (checked, selected, focused, expanded, enabled, text_value,
 *   value, raw_value, value_range), or null when none could.
 */
function extractAndroidState(node) {
  const checked = parseBooleanAttr(node['@_checked']);
  const selectedFlag = parseBooleanAttr(node['@_selected']);
  const focused = parseBooleanAttr(node['@_focused']);
  const expanded = parseBooleanAttr(node['@_expanded']);
  const enabled = parseBooleanAttr(node['@_enabled']);
  const rawText = node['@_text'];
  const textValue = typeof rawText === 'string' && rawText.trim().length > 0 ? rawText : null;
  const state = {};
  if (checked !== null) {
    state.checked = checked;
  }
  if (selectedFlag !== null) {
    // Only a node whose flag is true may report its label (or `true`) as the
    // selection. An explicit selected="false" must be reported as false;
    // otherwise every unselected node would look selected.
    state.selected = selectedFlag ? (textValue || node['@_content-desc'] || true) : false;
  }
  if (focused !== null) {
    state.focused = focused;
  }
  if (expanded !== null) {
    state.expanded = expanded;
  }
  if (enabled !== null) {
    state.enabled = enabled;
  }
  if (textValue && /edittext|textfield|search/i.test(String(node['@_class'] || ''))) {
    state.text_value = textValue;
  }
  if (isSliderLikeAndroid(node)) {
    const max = parseNumberAttr(node['@_max']);
    // Preference order for the reading: progress, then value, then a numeric
    // content description.
    const numericValue = parseNumberAttr(node['@_progress'])
      ?? parseNumberAttr(node['@_value'])
      ?? parseNumberAttr(node['@_content-desc']);
    if (numericValue !== null) {
      const hasRange = max !== null && max > 0;
      state.raw_value = numericValue;
      state.value_range = hasRange ? { min: 0, max } : null;
      // With a known positive max the value is a rounded percentage of max;
      // without one the raw reading is passed through.
      state.value = hasRange ? Math.round((numericValue / max) * 100) : numericValue;
    }
  }
  else {
    const numericValue = parseNumberAttr(node['@_value']);
    if (numericValue !== null) {
      state.value = numericValue;
      state.raw_value = numericValue;
    }
    else if (textValue) {
      state.value = textValue;
    }
  }
  return Object.keys(state).length > 0 ? state : null;
}
344
461
  export async function getScreenResolution(deviceId) {
345
462
  try {
346
463
  const output = await execAdb(['shell', 'wm', 'size'], deviceId);
@@ -362,20 +479,34 @@ export function traverseNode(node, elements, parentIndex = -1, depth = 0) {
362
479
  const text = node['@_text'] || null;
363
480
  const contentDescription = node['@_content-desc'] || null;
364
481
  const clickable = node['@_clickable'] === 'true';
482
+ const className = String(node['@_class'] || 'unknown');
365
483
  const bounds = parseBounds(node['@_bounds'] || '[0,0][0,0]');
484
+ const state = extractAndroidState(node);
485
+ const role = inferAndroidRole(normalizeClassName(className));
486
+ const resourceId = typeof node['@_resource-id'] === 'string' && node['@_resource-id'].trim().length > 0 ? node['@_resource-id'] : null;
487
+ const stableId = resourceId ?? (typeof contentDescription === 'string' && contentDescription.trim().length > 0 ? contentDescription : null);
488
+ const testTag = stableId;
489
+ const selector = buildAndroidSelector(text, contentDescription, resourceId, normalizeClassName(className));
490
+ const semantic = buildAndroidSemantic(clickable, normalizeClassName(className));
366
491
  const isUseful = clickable || (text && text.length > 0) || (contentDescription && contentDescription.length > 0);
367
492
  if (isUseful) {
368
493
  const element = {
369
494
  text,
370
495
  contentDescription,
371
- type: node['@_class'] || 'unknown',
372
- resourceId: node['@_resource-id'] || null,
496
+ type: className,
497
+ resourceId,
373
498
  clickable,
374
499
  enabled: node['@_enabled'] === 'true',
375
500
  visible: true,
376
501
  bounds,
377
502
  center: getCenter(bounds),
378
- depth
503
+ depth,
504
+ state,
505
+ stable_id: stableId,
506
+ role,
507
+ test_tag: testTag,
508
+ selector,
509
+ semantic
379
510
  };
380
511
  if (parentIndex !== -1) {
381
512
  element.parentId = parentIndex;
package/docs/CHANGELOG.md CHANGED
@@ -2,6 +2,15 @@
2
2
 
3
3
  All notable changes to the **Mobile Debug MCP** project will be documented in this file.
4
4
 
5
+ ## [0.25.1]
6
+ - Platform-native element identity metadata for UI targeting
7
+ - Hierarchy-independent element references
8
+ - Selector confidence metadata for reliability
9
+ - Structured fallback resolution strategy
10
+
11
+ ## [0.25.0]
12
+ - Introduces the `expect_state` tool and a standardized state object for UI elements across Android and iOS.
13
+
5
14
  ## [0.24.8]
6
15
  - Improved slider interaction
7
16
 
@@ -0,0 +1,406 @@
1
+ # Mobile Debug MCP Prioritized Roadmap
2
+
3
+ ## Prioritization Criteria
4
+
5
+ Ordered by:
6
+
7
+ 1. Impact on agent reliability
8
+ 2. Reduction in retries / brittleness
9
+ 3. Breadth of app coverage improved
10
+ 4. Implementation complexity vs payoff
11
+
12
+ ## Program-Level Success Metrics
13
+ Track roadmap impact across releases using:
14
+
15
+ - Retry reduction rate (% fewer action retries per task)
16
+ - Element match success rate (% successful element targeting)
17
+ - Verification success rate (% expect_* checks passing first attempt)
18
+ - Wait success rate for asynchronous UI flows
19
+ - Custom control interaction success rate
20
+ - Gesture success rate
21
+ - Mean time to root cause during debugging
22
+ - Overall agent task completion rate
23
+
24
+ Primary KPI:
25
+ Higher task success with fewer retries.
26
+
27
+ ---
28
+
29
+ # Completed
30
+
31
+ These priorities are done and kept here for history:
32
+
33
+ - Priority 1 — Stronger State Verification
34
+ - Priority 2 — Richer Element Identity
35
+
36
+ Completion notes:
37
+
38
+ - State-aware verification is now implemented and wired through the tool surface.
39
+ - Platform-native element metadata and selector-confidence hints are now part of the runtime contract.
40
+
41
+ ---
42
+
43
+ # Priority 1 — Stronger State Verification
44
+
45
+ ## Why first
46
+ Highest leverage improvement.
47
+
48
+ **Status:** Completed
49
+
50
+ Most failures are not “can’t act,” they’re:
51
+ - uncertain state
52
+ - weak verification
53
+ - retry loops caused by inference
54
+
55
+ ## Deliver
56
+ - Direct readable control values
57
+ - Expanded `expect_*` verification
58
+ - Move from inference to state introspection
59
+
60
+ ## Expected Impact
61
+ Very high.
62
+
63
+ ## Done Criteria
64
+ - Control state readable for core widgets (toggle, slider, input, dropdown)
65
+ - New expect_* state verifiers implemented
66
+ - Agents can verify state without visual inference in representative flows
67
+ - Documentation and snapshot response shape updated
68
+
69
+ ## Success Metrics
70
+ - 30%+ retry reduction on stateful tasks
71
+ - Higher first-pass verification success
72
+ - Reduced false positive verifications
73
+
74
+ ## Dependencies
75
+ Blocks or strengthens:
76
+ - Priority 5 — Better Compose / Custom Control Semantics
77
+ - Priority 6 — Pinch to Zoom verification
78
+ - Priority 7 — Action Trace Correlation
79
+
80
+ ---
81
+
82
+ # Priority 2 — Richer Element Identity
83
+
84
+ ## Why second
85
+ Directly reduces selector brittleness.
86
+
87
+ **Status:** Completed
88
+
89
+ Improves:
90
+ - targeting stability
91
+ - repeatability
92
+ - agent confidence
93
+
94
+ ## Deliver
95
+ - Stable IDs / test tags prioritization
96
+ - Selector confidence metadata
97
+ - Preferred selector hierarchy
98
+
99
+ ## Expected Impact
100
+ Very high.
101
+
102
+ ## Done Criteria
103
+ - Stable selector preference order implemented
104
+ - Test tags/resource IDs surfaced where available
105
+ - Selector confidence metadata available
106
+ - Structural fallback selectors defined
107
+
108
+ ## Success Metrics
109
+ - Higher element match rate
110
+ - Reduced selector drift failures
111
+ - Lower retargeting retries
112
+
113
+ ## Dependencies
114
+ Blocks or strengthens:
115
+ - Priority 4 — Long Press targeting reliability
116
+ - Priority 5 — Better Compose / Custom Control Semantics
117
+ - Priority 6 — Pinch to Zoom targeting
118
+
119
+ ---
120
+
121
+ # Priority 3 — Wait and Synchronization Reliability
122
+
123
+ ## Why third
124
+ Reliable async synchronization is foundational for agent success and should precede gesture expansion.
125
+
126
+ Addresses failures where agents:
127
+ - skip UI waits after actions
128
+ - rely on network/log signals too early
129
+ - struggle with in-place UI updates
130
+ - misread stale UI snapshots
131
+
132
+ ## Deliver
133
+ - UI-first synchronization policy guidance
134
+ - wait_for_ui_change (hierarchy diff based waiting)
135
+ - Structured loading state detection
136
+ - Snapshot revision / staleness metadata
137
+ - Compose-aware wait robustness improvements
138
+
139
+ ## Expected Impact
140
+ Very high.
141
+
142
+ ## Done Criteria
143
+ - wait_for_ui_change implemented
144
+ - Loading state detection available for representative controls
145
+ - Snapshot revision or staleness metadata exposed
146
+ - UI-first sync guidance added to spec guardrails
147
+ - In-place update waits validated on benchmark flows
148
+
149
+ ## Success Metrics
150
+ - Reduced missed async UI transitions
151
+ - Fewer retries caused by premature actions
152
+ - Higher wait success rate for dynamic UI flows
153
+ - Lower fallback usage to network/log checks
154
+
155
+ ## Dependencies
156
+ Depends on:
157
+ - Priority 1 — Stronger State Verification
158
+ - Priority 2 — Richer Element Identity
159
+
160
+ Blocks or strengthens:
161
+ - Priority 5 — Better Compose / Custom Control Semantics
162
+ - Priority 7 — Action Trace Correlation
163
+
164
+ ---
165
+
166
+ # Priority 4 — Long Press Gesture
167
+
168
+ ## Why fourth
169
+ High utility, relatively low complexity.
170
+
171
+ Unlocks many currently awkward interactions:
172
+
173
+ - context menus
174
+ - hidden actions
175
+ - reorder handles
176
+ - press-and-hold controls
177
+
178
+ Broad usefulness.
179
+
180
+ ## Deliver
181
+ New tool:
182
+
183
+ ```text
184
+ long_press(element_id, duration_ms?)
185
+ ```
186
+
187
+ Verification alignment:
188
+ - expect_context_menu
189
+ - expect_press_effect
190
+
191
+ ## Expected Impact
192
+ High.
193
+
194
+ ## Done Criteria
195
+ - long_press tool implemented across supported platforms
196
+ - Duration defaults and overrides supported
197
+ - Verification patterns for long press outcomes defined
198
+ - Included in action capability model
199
+
200
+ ## Success Metrics
201
+ - Increased hidden/control-surface interaction coverage
202
+ - Reduced dead-end interaction failures
203
+ - Long press task success rate tracked
204
+
205
+ ## Dependencies
206
+ Depends on:
207
+ - Priority 2 — Richer Element Identity
208
+
209
+ Strengthens:
210
+ - Priority 5 — Better Compose / Custom Control Semantics (interaction contracts)
211
+
212
+ ---
213
+
214
+ # Priority 5 — Better Compose / Custom Control Semantics
215
+
216
+ ## Why fifth
217
+ Important, but most effective once Priorities 1–4 are in place.
218
+
219
+ Semantics become more useful once:
220
+ - identity is stronger
221
+ - verification is stronger
222
+ - gestures are richer
223
+ - synchronization is more reliable
224
+
225
+ ## Deliver
226
+ - Composite control traits
227
+ - Control role enrichment (adjustable, expandable, selectable_group)
228
+ - Interaction contracts metadata
229
+ - Custom widget gesture affordance hints
230
+ - Semantic confidence annotations
231
+ - Compose-aware selectors for waits (merged semantics and element relationships)
232
+
233
+ ## Expected Impact
234
+ High.
235
+
236
+ ## Done Criteria
237
+ - Semantic traits implemented for major custom control classes
238
+ - Interaction contracts surfaced in snapshot model
239
+ - Confidence model defined for derived semantics
240
+ - Custom control manipulation success validated in benchmark flows
241
+
242
+ ## Success Metrics
243
+ - Higher custom control interaction success rate
244
+ - Fewer retries on non-standard widgets
245
+ - Reduced semantic ambiguity failures
246
+
247
+ ## Dependencies
248
+ Depends on:
249
+ - Priority 1 — Stronger State Verification
250
+ - Priority 2 — Richer Element Identity
251
+ - Priority 3 — Wait and Synchronization Reliability
252
+ - Priority 4 — Long Press
253
+
254
+ ---
255
+
256
+ # Priority 6 — Pinch to Zoom
257
+
258
+ ## Why sixth
259
+ Valuable, but narrower than long press.
260
+
261
+ Applies mainly to:
262
+ - maps
263
+ - images
264
+ - canvases
265
+ - zoomable custom surfaces
266
+
267
+ Useful, but less universal.
268
+
269
+ ## Deliver
270
+
271
+ ```text
272
+ pinch_to_zoom(target, scale, center?)
273
+ ```
274
+
275
+ Verification:
276
+ - expect_zoom_level
277
+ - expect_viewport_change
278
+
279
+ ## Expected Impact
280
+ Medium-high.
281
+
282
+ ## Done Criteria
283
+ - pinch_to_zoom implemented
284
+ - Zoom in/out flows supported
285
+ - Verification primitives for viewport or zoom state available
286
+ - Gesture integrated into action model
287
+
288
+ ## Success Metrics
289
+ - Successful execution across zoomable surfaces
290
+ - Reduced failures on map/image workflows
291
+ - Gesture success rate tracked
292
+
293
+ ## Dependencies
294
+ Depends on:
295
+ - Priority 1 — Stronger State Verification
296
+ - Priority 2 — Richer Element Identity
297
+
298
+ ---
299
+
300
+ # Priority 7 — Action Trace Correlation
301
+
302
+ ## Why seventh
303
+ Very valuable for debugging,
304
+ but less critical than improving control success first.
305
+
306
+ Improves diagnosis more than task completion.
307
+
308
+ ## Deliver
309
+ - Action correlation metadata
310
+ - UI/network/log linkage
311
+
312
+ ## Expected Impact
313
+ Medium-high.
314
+
315
+ ## Done Criteria
316
+ - Action correlation model defined
317
+ - UI/network/log linkage captured for representative actions
318
+ - Correlation metadata exposed to agents
319
+ - Debugging workflows validated with trace linkage
320
+
321
+ ## Success Metrics
322
+ - Lower time-to-root-cause
323
+ - Faster diagnosis of partial failures
324
+ - Improved action causality attribution
325
+
326
+ ## Dependencies
327
+ Depends on:
328
+ - Priority 1 — Stronger State Verification
329
+ - Priority 2 — Richer Element Identity
330
+ - Priority 3 — Wait and Synchronization Reliability
331
+
332
+ ---
333
+
334
+ # Delivery Waves
335
+
336
+ ## Dependency Summary
337
+ Foundational sequence:
338
+
339
+ Layer 1 (Foundations)
340
+ - Priority 1
341
+ - Priority 2
342
+
343
+ Layer 2 (Synchronization)
344
+ - Priority 3 depends on 1,2
345
+
346
+ Layer 3 (Interaction Expansion)
347
+ - Priority 4 depends on 2
348
+ - Priority 5 depends on 1,2,3,4
349
+ - Priority 6 depends on 1,2
350
+
351
+ Layer 4 (Observability)
352
+ - Priority 7 depends on 1,2,3
353
+
354
+ ## Wave 1 (Immediate)
355
+ - Stronger State Verification
356
+ - Richer Element Identity
357
+ - Wait and Synchronization Reliability
358
+
359
+ Focus:
360
+ Make core loop more reliable.
361
+
362
+ ---
363
+
364
+ ## Wave 2
365
+ - Long Press
366
+ - Better Compose Semantics
367
+
368
+ Focus:
369
+ Expand interaction capability.
370
+
371
+ ---
372
+
373
+ ## Wave 3
374
+ - Pinch to Zoom
375
+ - Action Trace Correlation
376
+
377
+ Focus:
378
+ Advanced gestures + observability.
379
+
380
+ ---
381
+
382
+ # Priority Stack Summary
383
+
384
+ Execution Order:
385
+ 1. Stronger State Verification
386
+ 2. Richer Element Identity
387
+ 3. Wait and Synchronization Reliability
388
+ 4. Long Press
389
+ 5. Better Compose / Custom Control Semantics
390
+ 6. Pinch to Zoom
391
+ 7. Action Trace Correlation
392
+
393
+ Rationale:
394
+ - Priorities 1–3 harden control, verification, and synchronization.
395
+ - Priorities 4–6 expand interaction capability.
396
+ - Priority 7 adds observability once control reliability matures.
397
+
398
+ ---
399
+
400
+ ## Explicitly Deferred
401
+ Still out of scope:
402
+
403
+ - Recovery planning logic
404
+ - Autonomous retry strategy
405
+ - MCP-level agent orchestration
406
+ - Autonomous recovery hinting (future consideration only)