mobile-debug-mcp 0.26.2 → 0.26.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/AGENTS.md CHANGED
@@ -41,11 +41,14 @@ Portable agent skills live under `skills/`.
41
41
  - `skills/README.md` — repo-wide skill convention
42
42
  - `skills/mcp-builder/` — build/install/toolchain guidance
43
43
  - `skills/test-authoring/` — test creation and placement guidance
44
+ - `skills/rfc-review/` — RFC review rubric and response template
44
45
 
45
46
  If the task is about **creating or updating tests**, load `skills/test-authoring/SKILL.md` first.
46
47
 
47
48
  If the task is about **building, installing, or diagnosing native tooling**, load `skills/mcp-builder/SKILL.md` first.
48
49
 
50
+ If the task is about **reviewing an RFC or spec draft**, load `skills/rfc-review/SKILL.md` first.
51
+
49
52
  ### Repository docs
50
53
 
51
54
  - `README.md` — high-level repo overview and commands
@@ -203,6 +203,22 @@ export class ToolsInteract {
203
203
  semantic: element.semantic ?? null
204
204
  };
205
205
  }
206
+ static _summarizeResolutionCandidate(candidate) {
207
+ const bounds = ToolsInteract._normalizeBounds(candidate.el.bounds);
208
+ return {
209
+ text: candidate.el.text ?? null,
210
+ resource_id: candidate.el.resourceId ?? candidate.el.resourceID ?? candidate.el.id ?? null,
211
+ accessibility_id: candidate.el.contentDescription ?? candidate.el.contentDesc ?? candidate.el.accessibilityLabel ?? candidate.el.label ?? null,
212
+ class: candidate.el.type ?? candidate.el.class ?? null,
213
+ bounds: bounds
214
+ ? { left: bounds[0], top: bounds[1], right: bounds[2], bottom: bounds[3] }
215
+ : null,
216
+ clickable: !!candidate.el.clickable,
217
+ enabled: !!candidate.el.enabled,
218
+ score: candidate.score,
219
+ reason: candidate.reason
220
+ };
221
+ }
206
222
  static _actionFailure(actionType, selector, resolved, failureCode, retryable, uiFingerprintBefore, uiFingerprintAfter, sourceModule = 'interact') {
207
223
  return buildActionExecutionResult({
208
224
  actionType,
@@ -419,17 +435,18 @@ export class ToolsInteract {
419
435
  if (!q)
420
436
  return { found: false, error: 'Empty query' };
421
437
  let best = null;
422
- let bestScore = 0;
423
- let lastTree = null;
424
- const scoreElement = (el) => {
438
+ let bestTree = null;
439
+ let bestIterationCandidates = [];
440
+ let shouldStop = false;
441
+ const scoreElement = (el, idx) => {
425
442
  if (!el || !el.visible)
426
- return 0;
443
+ return null;
427
444
  const bounds = el.bounds || [0, 0, 0, 0];
428
445
  if (!Array.isArray(bounds) || bounds.length < 4)
429
- return 0;
446
+ return null;
430
447
  const [l, t, r, b] = bounds;
431
448
  if (r <= l || b <= t)
432
- return 0;
449
+ return null;
433
450
  // Do not early-return on non-interactable elements — score them so we can locate their clickable ancestor later
434
451
  const interactable = !!(el.clickable || el.enabled || el.focusable);
435
452
  const text = normalize(el.text ?? el.label ?? el.value ?? '');
@@ -437,64 +454,98 @@ export class ToolsInteract {
437
454
  const resourceId = normalize(el.resourceId ?? el.resourceID ?? el.id ?? '');
438
455
  const className = normalize(el.type ?? el.class ?? '');
439
456
  let score = 0;
457
+ let reason = 'best_scoring_candidate';
440
458
  if (exact) {
441
- if (text && text === q)
459
+ if (text && text === q) {
442
460
  score = 1.0;
443
- else if (content && content === q)
461
+ reason = 'exact_text_match';
462
+ }
463
+ else if (content && content === q) {
444
464
  score = 0.95;
465
+ reason = 'exact_content_desc_match';
466
+ }
467
+ else if (resourceId && resourceId === q) {
468
+ score = 0.92;
469
+ reason = 'exact_resource_id_match';
470
+ }
471
+ else if (className && className === q) {
472
+ score = 0.3;
473
+ reason = 'exact_class_match';
474
+ }
445
475
  }
446
476
  else {
447
- if (text && text === q)
477
+ if (text && text === q) {
448
478
  score = 1.0;
449
- else if (content && content === q)
479
+ reason = 'exact_text_match';
480
+ }
481
+ else if (content && content === q) {
450
482
  score = 0.95;
451
- else if (text && text.includes(q))
483
+ reason = 'exact_content_desc_match';
484
+ }
485
+ else if (resourceId && resourceId === q) {
486
+ score = 0.92;
487
+ reason = 'exact_resource_id_match';
488
+ }
489
+ else if (text && text.includes(q)) {
452
490
  score = 0.6;
453
- else if (content && content.includes(q))
491
+ reason = 'partial_text_match';
492
+ }
493
+ else if (content && content.includes(q)) {
454
494
  score = 0.55;
455
- else if (resourceId && resourceId.includes(q))
495
+ reason = 'partial_content_desc_match';
496
+ }
497
+ else if (resourceId && resourceId.includes(q)) {
456
498
  score = 0.7;
457
- else if (className && className.includes(q))
499
+ reason = 'partial_resource_id_match';
500
+ }
501
+ else if (className && className.includes(q)) {
458
502
  score = 0.3;
503
+ reason = 'partial_class_match';
504
+ }
459
505
  }
460
506
  if (score > 0 && interactable)
461
507
  score += 0.05;
462
- return score;
508
+ if (score <= 0)
509
+ return null;
510
+ return { el, idx, score, reason, interactable };
463
511
  };
464
512
  while (Date.now() <= deadline) {
465
513
  try {
466
514
  const tree = await ToolsObserve.getUITreeHandler({ platform, deviceId });
467
- lastTree = tree;
468
515
  if (tree && Array.isArray(tree.elements)) {
469
516
  const elements = tree.elements;
517
+ const iterationCandidates = [];
518
+ let iterationImprovedBest = false;
470
519
  for (let i = 0; i < elements.length; i++) {
471
520
  const el = elements[i];
472
521
  try {
473
- const s = scoreElement(el);
474
- const interactable = !!(el.clickable || el.enabled || el.focusable);
475
- if (s > bestScore) {
476
- bestScore = s;
477
- best = el;
478
- if (best) {
479
- best._index = i;
480
- best._interactable = interactable;
522
+ const candidate = scoreElement(el, i);
523
+ if (!candidate)
524
+ continue;
525
+ iterationCandidates.push(candidate);
526
+ if (!best || candidate.score > best.score) {
527
+ best = candidate;
528
+ bestTree = tree;
529
+ iterationImprovedBest = true;
530
+ if (best.score >= 0.95) {
531
+ shouldStop = true;
532
+ break;
481
533
  }
482
534
  }
483
- if (bestScore >= 0.95)
484
- break;
485
535
  }
486
536
  catch (e) {
487
537
  console.error('Error scoring element:', e);
488
538
  }
489
539
  }
490
- if (bestScore >= 0.95)
491
- break;
540
+ if (iterationImprovedBest) {
541
+ bestIterationCandidates = iterationCandidates.slice();
542
+ }
492
543
  }
493
544
  }
494
545
  catch (e) {
495
546
  console.error('Error fetching UI tree:', e);
496
547
  }
497
- if (Date.now() > deadline)
548
+ if (shouldStop || Date.now() > deadline)
498
549
  break;
499
550
  await new Promise(r => setTimeout(r, 100));
500
551
  }
@@ -502,17 +553,17 @@ export class ToolsInteract {
502
553
  return { found: false, error: 'Element not found' };
503
554
  // If the best match is not interactable, try to resolve an actionable ancestor.
504
555
  try {
505
- const elements = (lastTree && Array.isArray(lastTree.elements)) ? lastTree.elements : [];
506
- const screen = lastTree?.resolution && typeof lastTree.resolution === 'object' ? lastTree.resolution : null;
556
+ const elements = (bestTree && Array.isArray(bestTree.elements)) ? bestTree.elements : [];
557
+ const screen = bestTree?.resolution && typeof bestTree.resolution === 'object' ? bestTree.resolution : null;
507
558
  let chosen = best;
508
- const childBounds = Array.isArray(chosen?.bounds) ? chosen.bounds : null;
559
+ const childBounds = Array.isArray(chosen?.el?.bounds) ? chosen.el.bounds : null;
509
560
  // Strategy 1: if parentId references an index, climb that chain
510
561
  let resolvedAncestor = null;
511
- if (childBounds && (chosen.parentId !== undefined && chosen.parentId !== null)) {
562
+ if (childBounds && (chosen.el.parentId !== undefined && chosen.el.parentId !== null)) {
512
563
  let cur = chosen;
513
564
  let safety = 0;
514
- while (cur && safety < 20 && !(cur.clickable || cur.focusable) && (cur.parentId !== undefined && cur.parentId !== null)) {
515
- let pid = cur.parentId;
565
+ while (cur && safety < 20 && !(cur.el.clickable || cur.el.focusable) && (cur.el.parentId !== undefined && cur.el.parentId !== null)) {
566
+ let pid = cur.el.parentId;
516
567
  let idx = null;
517
568
  if (typeof pid === 'number')
518
569
  idx = pid;
@@ -520,18 +571,19 @@ export class ToolsInteract {
520
571
  idx = Number(pid);
521
572
  // If parentId is not an index, try to find by matching resourceId or id field
522
573
  if (idx !== null && elements[idx]) {
523
- cur = elements[idx];
524
- if (cur && (cur.clickable || cur.enabled || cur.focusable)) {
574
+ cur = { el: elements[idx], idx };
575
+ if (cur && (cur.el.clickable || cur.el.enabled || cur.el.focusable)) {
525
576
  resolvedAncestor = cur;
526
577
  break;
527
578
  }
528
579
  }
529
580
  else if (typeof pid === 'string') {
530
581
  // fallback: search elements for matching resourceId or id
531
- const found = elements.find((el) => (el.resourceId === pid || el.id === pid));
582
+ const foundIndex = elements.findIndex((el) => (el.resourceId === pid || el.id === pid));
583
+ const found = foundIndex >= 0 ? elements[foundIndex] : null;
532
584
  if (found) {
533
- cur = found;
534
- if (cur && (cur.clickable || cur.enabled || cur.focusable)) {
585
+ cur = { el: found, idx: foundIndex };
586
+ if (cur && (cur.el.clickable || cur.el.enabled || cur.el.focusable)) {
535
587
  resolvedAncestor = cur;
536
588
  break;
537
589
  }
@@ -551,16 +603,19 @@ export class ToolsInteract {
551
603
  if (!resolvedAncestor && childBounds) {
552
604
  const [cl, ct, cr, cb] = childBounds;
553
605
  // find candidates that are clickable and contain the child bounds
554
- const candidates = elements.filter((el) => el && (el.clickable || el.focusable) && Array.isArray(el.bounds) && el.bounds.length >= 4).map((el) => ({ el, bounds: el.bounds }));
606
+ const candidates = elements
607
+ .map((el, idx) => ({ el, idx }))
608
+ .filter(({ el }) => el && (el.clickable || el.focusable) && Array.isArray(el.bounds) && el.bounds.length >= 4);
555
609
  let bestCandidate = null;
556
610
  let bestCandidateArea = Infinity;
557
611
  for (const c of candidates) {
558
- const [pl, pt, pr, pb] = c.bounds;
612
+ const bounds = c.el.bounds;
613
+ const [pl, pt, pr, pb] = bounds;
559
614
  if (pl <= cl && pt <= ct && pr >= cr && pb >= cb) {
560
615
  const area = (pr - pl) * (pb - pt);
561
616
  if (area < bestCandidateArea) {
562
617
  bestCandidateArea = area;
563
- bestCandidate = c.el;
618
+ bestCandidate = c;
564
619
  }
565
620
  }
566
621
  }
@@ -568,17 +623,24 @@ export class ToolsInteract {
568
623
  resolvedAncestor = bestCandidate;
569
624
  }
570
625
  if (resolvedAncestor) {
571
- best = resolvedAncestor;
572
- // small score bump to reflect actionability
573
- bestScore = Math.min(1, bestScore + 0.02);
626
+ best = {
627
+ el: resolvedAncestor.el,
628
+ idx: resolvedAncestor.idx,
629
+ score: Math.min(1, best.score + 0.02),
630
+ reason: 'clickable_parent_preferred',
631
+ interactable: true
632
+ };
574
633
  }
575
- if (best && !(best.clickable || best.focusable)) {
576
- const nearbyActionable = ToolsInteract._resolveNearbyActionableControl(elements, { el: best, idx: best._index ?? elements.indexOf(best) }, screen);
634
+ if (best && !(best.el.clickable || best.el.focusable)) {
635
+ const nearbyActionable = ToolsInteract._resolveNearbyActionableControl(elements, { el: best.el, idx: best.idx }, screen);
577
636
  if (nearbyActionable) {
578
- best = nearbyActionable.el;
579
- best._index = nearbyActionable.idx;
580
- best._interactable = true;
581
- best._sliderLike = nearbyActionable.sliderLike;
637
+ best = {
638
+ el: nearbyActionable.el,
639
+ idx: nearbyActionable.idx,
640
+ score: Math.min(1, best.score + 0.02),
641
+ reason: nearbyActionable.sliderLike ? 'slider_track_preferred' : 'nearby_actionable_control',
642
+ interactable: true
643
+ };
582
644
  }
583
645
  }
584
646
  }
@@ -587,29 +649,34 @@ export class ToolsInteract {
587
649
  }
588
650
  if (!best)
589
651
  return { found: false, error: 'Element not found' };
590
- const boundsObj = Array.isArray(best.bounds) ? { left: best.bounds[0], top: best.bounds[1], right: best.bounds[2], bottom: best.bounds[3] } : null;
652
+ const boundsObj = Array.isArray(best.el.bounds) ? { left: best.el.bounds[0], top: best.el.bounds[1], right: best.el.bounds[2], bottom: best.el.bounds[3] } : null;
591
653
  const tapCoordinates = boundsObj ? { x: Math.floor((boundsObj.left + boundsObj.right) / 2), y: Math.floor((boundsObj.top + boundsObj.bottom) / 2) } : null;
654
+ const uniqueRanked = bestIterationCandidates.filter((candidate, index, array) => index === array.findIndex((other) => other.idx === candidate.idx && other.el === candidate.el));
655
+ const alternateCandidates = uniqueRanked
656
+ .filter((candidate) => candidate.idx !== best.idx || candidate.el !== best.el)
657
+ .slice(0, 3)
658
+ .map((candidate) => ToolsInteract._summarizeResolutionCandidate(candidate));
592
659
  const outEl = {
593
- text: best.text ?? null,
594
- resourceId: best.resourceId ?? null,
595
- contentDesc: best.contentDescription ?? best.contentDesc ?? null,
596
- class: best.type ?? best.class ?? null,
660
+ text: best.el.text ?? null,
661
+ resourceId: best.el.resourceId ?? null,
662
+ contentDesc: best.el.contentDescription ?? best.el.contentDesc ?? null,
663
+ class: best.el.type ?? best.el.class ?? null,
597
664
  bounds: boundsObj,
598
- clickable: !!best.clickable,
599
- enabled: !!best.enabled,
600
- stable_id: best.stable_id ?? null,
601
- role: best.role ?? null,
602
- test_tag: best.test_tag ?? null,
603
- selector: best.selector ?? null,
604
- semantic: best.semantic ?? null,
665
+ clickable: !!best.el.clickable,
666
+ enabled: !!best.el.enabled,
667
+ stable_id: best.el.stable_id ?? null,
668
+ role: best.el.role ?? null,
669
+ test_tag: best.el.test_tag ?? null,
670
+ selector: best.el.selector ?? null,
671
+ semantic: best.el.semantic ?? null,
605
672
  tapCoordinates,
606
673
  telemetry: {
607
- matchedIndex: best?._index ?? null,
608
- matchedInteractable: !!best?._interactable,
609
- sliderLike: !!best?._sliderLike
674
+ matchedIndex: best.idx ?? null,
675
+ matchedInteractable: !!best.interactable,
676
+ sliderLike: best.reason === 'slider_track_preferred'
610
677
  }
611
678
  };
612
- if (best?._sliderLike) {
679
+ if (best.reason === 'slider_track_preferred') {
613
680
  const isVertical = !!boundsObj && (boundsObj.bottom - boundsObj.top) > (boundsObj.right - boundsObj.left);
614
681
  const interactionHint = {
615
682
  kind: 'slider',
@@ -618,8 +685,15 @@ export class ToolsInteract {
618
685
  };
619
686
  outEl.interactionHint = interactionHint;
620
687
  }
621
- const scoreVal = Math.min(1, Number(bestScore.toFixed(3)));
622
- return { found: true, element: outEl, score: scoreVal, confidence: scoreVal };
688
+ const scoreVal = Math.min(1, Number(best.score.toFixed(3)));
689
+ const resolution = {
690
+ confidence: scoreVal,
691
+ reason: best.reason,
692
+ fallback_available: alternateCandidates.length > 0,
693
+ matched_count: uniqueRanked.length,
694
+ alternates: alternateCandidates
695
+ };
696
+ return { found: true, element: outEl, score: scoreVal, confidence: scoreVal, resolution };
623
697
  }
624
698
  static async waitForUIHandler({ selector, condition = 'exists', timeout_ms = 60000, poll_interval_ms = 300, match, retry = { max_attempts: 1, backoff_ms: 0 }, platform, deviceId }) {
625
699
  const overallStart = Date.now();
@@ -596,7 +596,9 @@ Recommended Usage:
596
596
  },
597
597
  {
598
598
  name: 'find_element',
599
- description: 'Find a UI element by semantic query (text, content-desc, resource-id, class). Returns best match.',
599
+ description: `Find a UI element by semantic query (text, content-desc, resource-id, class).
600
+
601
+ Returns the best match plus resolution metadata when available, including confidence, selection reason, and fallback alternates.`,
600
602
  inputSchema: {
601
603
  type: 'object',
602
604
  properties: {
@@ -6,7 +6,7 @@ import { handleToolCall } from './server/tool-handlers.js';
6
6
  export { wrapResponse, toolDefinitions, handleToolCall };
7
7
  export const serverInfo = {
8
8
  name: 'mobile-debug-mcp',
9
- version: '0.26.2'
9
+ version: '0.26.3'
10
10
  };
11
11
  export function createServer() {
12
12
  const server = new Server(serverInfo, {
package/docs/CHANGELOG.md CHANGED
@@ -2,6 +2,9 @@
2
2
 
3
3
  All notable changes to the **Mobile Debug MCP** project will be documented in this file.
4
4
 
5
+ ## [0.26.3]
6
+ - updates the `find_element` tool to return detailed resolution metadata, including confidence scores, selection reasons, and fallback alternates
7
+
5
8
  ## [0.26.2]
6
9
  - unified action execution and verification model
7
10
 
package/docs/ROADMAP.md CHANGED
@@ -53,9 +53,9 @@ Higher task success with fewer retries.
53
53
  ## Upcoming Work
54
54
 
55
55
  - Adjustable Control Support
56
+ - Better Compose / Custom Control Semantics
56
57
  - Signal-Oriented Diagnostic Filtering
57
58
  - Long Press Gesture
58
- - Better Compose / Custom Control Semantics
59
59
 
60
60
  ## Later Horizon
61
61
 
@@ -160,6 +160,7 @@ Addresses failures where agents:
160
160
  - wait_for_ui_change (hierarchy diff based waiting)
161
161
  - Structured loading state detection
162
162
  - Snapshot revision / staleness metadata
163
+ - Focused snapshot views / incremental snapshot diffs
163
164
  - Compose-aware wait robustness improvements
164
165
 
165
166
  ## Expected Impact
@@ -169,6 +170,7 @@ Very high.
169
170
  - wait_for_ui_change implemented
170
171
  - Loading state detection available for representative controls
171
172
  - Snapshot revision or staleness metadata exposed
173
+ - Focused or diff-oriented snapshots validated in benchmark flows
172
174
  - UI-first sync guidance added to spec guardrails
173
175
  - In-place update waits validated on benchmark flows
174
176
 
@@ -379,9 +381,9 @@ Strengthens:
379
381
  # Better Compose / Custom Control Semantics
380
382
 
381
383
  ## Rationale
382
- Important, but strengthened by earlier capabilities first.
384
+ Higher priority after agent feedback exposed custom control semantics as a core reliability gap, not a later optimization.
383
385
 
384
- **Status:** Planned
386
+ **Status:** Spec Ready
385
387
 
386
388
  Semantics become more useful once:
387
389
  - identity is stronger
@@ -419,7 +421,6 @@ Depends on:
419
421
  - Wait and Synchronization Reliability
420
422
  - Actionability Resolution
421
423
  - Adjustable Control Support
422
- - Signal-Oriented Diagnostic Filtering
423
424
  - Long Press Gesture
424
425
 
425
426
  ---
@@ -543,19 +544,19 @@ Make core loop more reliable.
543
544
 
544
545
  ## Wave 2 (Control Precision + Diagnostics)
545
546
  - Adjustable Control Support
547
+ - Better Compose / Custom Control Semantics
546
548
  - Signal-Oriented Diagnostic Filtering
547
549
 
548
550
  Focus:
549
- Improve control precision and signal observability.
551
+ Improve control precision, custom control semantics, and signal observability.
550
552
 
551
553
  ---
552
554
 
553
555
  ## Wave 3 (Interaction Expansion)
554
556
  - Long Press Gesture
555
- - Better Compose / Custom Control Semantics
556
557
 
557
558
  Focus:
558
- Expand interaction capability.
559
+ Expand interaction capability after core control reliability is improved.
559
560
 
560
561
  ---
561
562
 
@@ -576,9 +577,9 @@ Roadmap Ordering:
576
577
  3. Wait and Synchronization Reliability
577
578
  4. Actionability Resolution
578
579
  5. Adjustable Control Support
579
- 6. Signal-Oriented Diagnostic Filtering
580
- 7. Long Press Gesture
581
- 8. Better Compose / Custom Control Semantics
580
+ 6. Better Compose / Custom Control Semantics
581
+ 7. Signal-Oriented Diagnostic Filtering
582
+ 8. Long Press Gesture
582
583
  9. Pinch to Zoom
583
584
  10. Action Trace Correlation
584
585
 
@@ -0,0 +1,277 @@
1
+ # RFC 007 — Actionability Resolution and Executable Target Selection
2
+
3
+ ## 1. Summary
4
+
5
+ This RFC defines how the system resolves which discovered UI element should receive an action before dispatch.
6
+
7
+ It addresses ambiguity between:
8
+ - visible elements vs actionable elements
9
+ - leaf nodes vs clickable containers
10
+ - semantic targets vs coordinate fallbacks
11
+ - multiple candidate targets with uncertain executability
12
+
13
+ Goal:
14
+ Improve first-attempt action correctness by resolving the best executable target prior to action dispatch.
15
+
16
+ This RFC defines the `Resolved` stage semantics referenced in RFC 005 and operationalized by RFC 006.
17
+ It is grounded in the existing element-resolution flow and extends current resolution behavior rather than assuming a wholly new resolver architecture.
18
+
19
+ ---
20
+
21
+ ## 2. Problem Statement
22
+
23
+ Current interaction failures often arise before execution.
24
+
25
+ The agent may discover the intended UI concept, but not the correct executable target.
26
+
27
+ Examples:
28
+ - tapping label text instead of clickable container
29
+ - sliders not surfacing semantic handles
30
+ - generic Compose containers hiding true affordances
31
+ - multiple matching targets without ranking logic
32
+
33
+ Observed failure modes:
34
+ - false taps
35
+ - submit ambiguity
36
+ - coordinate guessing
37
+ - retry loops
38
+ - brittle fallback behavior
39
+
40
+ This is a target-resolution problem, not an execution problem.
41
+
42
+ ---
43
+
44
+ ## 3. Design Goals
45
+
46
+ Resolution MUST:
47
+ - Prefer executable targets over merely visible matches
48
+ - Reduce ambiguous target selection
49
+ - Support confidence-based ranking
50
+ - Build on existing runtime resolution surfaces before introducing new resolution metadata
51
+ - Use structural and semantic resolution signals
52
+ - Minimize coordinate fallback usage
53
+ - Integrate with verification expectations from RFC 005
54
+
55
+ ---
56
+
57
+ ## 4. Actionability Model
58
+
59
+ Candidate targets are evaluated using actionability signals.
60
+
61
+ ### Structural signals
62
+ - clickable
63
+ - enabled
64
+ - focusable
65
+ - bounds
66
+ - parent action ownership
67
+
68
+ ### Semantic signals
69
+ - control role
70
+ - label association
71
+ - affordance hints
72
+ - selectable or adjustable semantics
73
+
74
+ ### Interaction signals
75
+ - reliable target patterns
76
+ - control-specific heuristics
77
+ - gesture compatibility
78
+
79
+ ---
80
+
81
+ ## 4.1 Current Runtime Resolution Surfaces
82
+
83
+ This RFC builds on current runtime resolution paths, including:
84
+ - `findElementHandler` for candidate discovery
85
+ - `_resolveActionableAncestor` for executable ancestor promotion
86
+ - `tapElementHandler` for resolved element dispatch
87
+ - `scrollToElementHandler` for scroll-mediated target acquisition
88
+
89
+ These existing handlers are the current implementation substrate for the Resolved stage.
90
+ This RFC extends and systematizes those behaviors; it does not assume replacement of those paths.
91
+
92
+ ---
93
+
94
+ ## 5. Target Candidate Ranking
95
+
96
+ When multiple targets match, candidates are ranked.
97
+
98
+ Illustrative confidence model:
99
+
100
+ resolution_confidence =
101
+ interactability_score
102
+ + semantic_match_score
103
+ + structural_reliability_score
104
+
105
+ Highest-confidence executable target is preferred.
106
+
107
+ The confidence model is illustrative; it is normative only at the rule-precedence level. Implementations may use simpler heuristics as long as they preserve the resolution ordering guarantees. Any scoring mechanism is implementation-defined and is not required to be externally surfaced.
108
+
109
+ ---
110
+
111
+ ## 6. Resolution Rules
112
+
113
+ ### Rule A — Prefer actionable containers over passive leaf nodes
114
+
115
+ Prefer:
116
+ - clickable container
117
+
118
+ Over:
119
+ - passive child text nodes
120
+
121
+ Example:
122
+ Prefer button container over "Generate Session" label node.
123
+
124
+ ---
125
+
126
+ ### Rule B — Prefer semantic controls over coordinate fallbacks
127
+
128
+ Use semantic control targets whenever possible.
129
+
130
+ Coordinate fallback only when:
131
+ - no semantic target exists
132
+ - adjustable control semantics absent
133
+ - fallback confidence acceptable
134
+
135
+ ---
136
+
137
+ ### Rule C — Prefer explicit affordance ownership
138
+
139
+ If child and parent differ:
140
+ prefer the node owning the action handler.
141
+
142
+ ---
143
+
144
+ ## 7. Ambiguity Handling
145
+
146
+ When multiple plausible targets remain:
147
+
148
+ System SHOULD:
149
+ - rank candidates
150
+ - expose confidence
151
+ - preserve alternates for fallback reasoning
152
+
153
+ Low-confidence targets may trigger:
154
+ - guarded execution
155
+ - alternate resolution attempt
156
+ - explicit recovery path
157
+
158
+ ---
159
+
160
+ ## 8. Adjustable Control Resolution
161
+
162
+ Special handling for:
163
+ - sliders
164
+ - steppers
165
+ - drag controls
166
+
167
+ Support:
168
+ - adjustable-role recognition
169
+ - control-bound discovery
170
+ - value-aware interaction targeting
171
+
172
+ This RFC defines target resolution.
173
+ Value-setting behavior remains governed by Adjustable Control Support.
174
+
175
+ ---
176
+
177
+ ## 9. Compose / Custom Control Resolution
178
+
179
+ Support derived actionability for:
180
+ - merged Compose semantics
181
+ - composite controls
182
+ - inferred interaction contracts
183
+
184
+ This RFC depends on and strengthens Better Compose / Custom Control Semantics.
185
+
186
+ ---
187
+
188
+ ## 10. Resolution Output Model (Current + Future Extension)
189
+
190
+ This model is non-normative and represents a progressive enrichment direction rather than a required runtime contract.
191
+
192
+ Resolution may evolve toward the following enriched output shape. Current runtime implementations may expose only resolved-target output plus limited supporting metadata.
193
+
194
+ At minimum, current implementations are expected to produce a resolved target. Confidence, alternates, fallback metadata, and reason codes may be introduced incrementally.
195
+
196
+ Illustrative future-complete shape:
197
+
198
+ {
199
+ "resolved_target": "...",
200
+ "confidence": 0.92,
201
+ "fallback_available": true,
202
+ "resolution_reason": "clickable_parent_preferred"
203
+ }
204
+
205
+ ---
206
+
207
+ ## 11. Verification Integration
208
+
209
+ Resolution is incomplete without verification expectations.
210
+
211
+ Resolved output should be derived directly from the existing element-resolution flow before adding richer metadata layers.
212
+
213
+ The resolved target should carry its expected post-action signal.
214
+
215
+ Examples:
216
+ - navigation transition expected
217
+ - menu expected
218
+ - control value change expected
219
+
220
+ This feeds RFC 005 verification.
221
+
222
+ ---
223
+
224
+ ## 12. Success Metrics
225
+
226
+ Track:
227
+ - reduced false-tap failures
228
+ - lower retarget retries
229
+ - higher first-attempt action success
230
+ - reduced coordinate fallback usage
231
+ - improved custom control interaction success
232
+
233
+ ---
234
+
235
+ ## 13. Dependencies
236
+
237
+ Depends on:
238
+ - Stronger State Verification
239
+ - Richer Element Identity
240
+ - Wait and Synchronization Reliability
241
+
242
+ Strengthens:
243
+ - Adjustable Control Support
244
+ - Better Compose / Custom Control Semantics
245
+
246
+ ---
247
+
248
+ ## 14. Relationship to Other RFCs
249
+
250
+ RFC 005
251
+ Defines what Resolved means in lifecycle semantics.
252
+
253
+ RFC 006
254
+ Defines how runtime interprets action execution.
255
+
256
+ RFC 007
257
+ Defines how a target becomes Resolved.
258
+ Specifically, it formalizes the current discovery → actionable ancestor resolution → dispatch preparation flow already present in runtime handlers.
259
+
260
+ Together:
261
+ - RFC 005 — action correctness
262
+ - RFC 006 — runtime execution binding
263
+ - RFC 007 — executable target resolution
264
+
265
+ ---
266
+
267
+ ## 15. Summary
268
+
269
+ This RFC reduces failures caused by acting on the wrong thing, even when the right thing was discovered.
270
+
271
+ It improves:
272
+ - action precision
273
+ - control reliability
274
+ - Compose interaction robustness
275
+ - agent success with fewer retries
276
+
277
+ It addresses one of the largest remaining sources of interaction brittleness.
@@ -199,7 +199,14 @@ Output:
199
199
  "telemetry": { "matchedIndex": 3, "matchedInteractable": true }
200
200
  },
201
201
  "score": 1.0,
202
- "confidence": 1.0
202
+ "confidence": 1.0,
203
+ "resolution": {
204
+ "confidence": 1.0,
205
+ "reason": "exact_text_match",
206
+ "fallback_available": false,
207
+ "matched_count": 1,
208
+ "alternates": []
209
+ }
203
210
  }
204
211
  ```
205
212
 
@@ -207,6 +214,7 @@ Notes:
207
214
 
208
215
  - Best used when no precise selector is available yet.
209
216
  - `tapCoordinates` are suitable for `tap` calls.
217
+ - `resolution` explains why the element was selected (for example, an exact text match, or a clickable parent or nearby control that the runtime promoted) and lists up to three alternate candidates when other elements also matched the query.
210
218
  - Prefer `wait_for_ui` when you already know a deterministic selector and want a stable `elementId`.
211
219
 
212
220
  ---
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mobile-debug-mcp",
3
- "version": "0.26.2",
3
+ "version": "0.26.3",
4
4
  "description": "MCP server for mobile app debugging (Android + iOS), with focus on security and reliability",
5
5
  "type": "module",
6
6
  "bin": {
@@ -10,6 +10,7 @@ import { buildActionExecutionResult } from '../server/common.js'
10
10
  import type {
11
11
  ActionFailureCode,
12
12
  ActionTargetResolved,
13
+ FindElementResponse,
13
14
  ExpectElementVisibleResponse,
14
15
  ExpectStateResponse,
15
16
  ExpectScreenResponse,
@@ -68,6 +69,32 @@ interface UiChangeSignatureSet {
68
69
  state: string | null
69
70
  }
70
71
 
72
+ interface RankedResolutionCandidate {
73
+ el: UiElement
74
+ idx: number
75
+ score: number
76
+ reason: string
77
+ interactable: boolean
78
+ }
79
+
80
+ interface FindElementResolutionSummary {
81
+ confidence: number
82
+ reason: string
83
+ fallback_available: boolean
84
+ matched_count: number
85
+ alternates: Array<{
86
+ text: string | null
87
+ resource_id: string | null
88
+ accessibility_id: string | null
89
+ class: string | null
90
+ bounds: { left: number; top: number; right: number; bottom: number } | null
91
+ clickable: boolean
92
+ enabled: boolean
93
+ score: number
94
+ reason: string
95
+ }>
96
+ }
97
+
71
98
 
72
99
  export class ToolsInteract {
73
100
  private static readonly _maxResolvedUiElements = 256
@@ -290,6 +317,23 @@ export class ToolsInteract {
290
317
  }
291
318
  }
292
319
 
320
+ private static _summarizeResolutionCandidate(candidate: RankedResolutionCandidate): FindElementResolutionSummary['alternates'][number] {
321
+ const bounds = ToolsInteract._normalizeBounds(candidate.el.bounds)
322
+ return {
323
+ text: candidate.el.text ?? null,
324
+ resource_id: candidate.el.resourceId ?? candidate.el.resourceID ?? candidate.el.id ?? null,
325
+ accessibility_id: candidate.el.contentDescription ?? candidate.el.contentDesc ?? candidate.el.accessibilityLabel ?? candidate.el.label ?? null,
326
+ class: candidate.el.type ?? candidate.el.class ?? null,
327
+ bounds: bounds
328
+ ? { left: bounds[0], top: bounds[1], right: bounds[2], bottom: bounds[3] }
329
+ : null,
330
+ clickable: !!candidate.el.clickable,
331
+ enabled: !!candidate.el.enabled,
332
+ score: candidate.score,
333
+ reason: candidate.reason
334
+ }
335
+ }
336
+
293
337
  private static _actionFailure(
294
338
  actionType: string,
295
339
  selector: Record<string, unknown> | null,
@@ -546,7 +590,7 @@ export class ToolsInteract {
546
590
  return await interact.scrollToElement(selector, direction, maxScrolls, scrollAmount, resolved.id)
547
591
  }
548
592
 
549
- static async findElementHandler({ query, exact = false, timeoutMs = 3000, platform, deviceId }: { query: string, exact?: boolean, timeoutMs?: number, platform?: 'android' | 'ios', deviceId?: string }) {
593
+ static async findElementHandler({ query, exact = false, timeoutMs = 3000, platform, deviceId }: { query: string, exact?: boolean, timeoutMs?: number, platform?: 'android' | 'ios', deviceId?: string }): Promise<FindElementResponse> {
550
594
  // Try to use observe layer to fetch the current UI tree and perform a fast semantic search
551
595
  const start = Date.now()
552
596
  const deadline = start + timeoutMs
@@ -555,16 +599,17 @@ export class ToolsInteract {
555
599
  const q = normalize(query)
556
600
  if (!q) return { found: false, error: 'Empty query' }
557
601
 
558
- let best: UiElement | null = null
559
- let bestScore = 0
560
- let lastTree: any = null
602
+ let best: RankedResolutionCandidate | null = null
603
+ let bestTree: any = null
604
+ let bestIterationCandidates: RankedResolutionCandidate[] = []
605
+ let shouldStop = false
561
606
 
562
- const scoreElement = (el: UiElement | null) => {
563
- if (!el || !el.visible) return 0
607
+ const scoreElement = (el: UiElement | null, idx: number): RankedResolutionCandidate | null => {
608
+ if (!el || !el.visible) return null
564
609
  const bounds = el.bounds || [0,0,0,0]
565
- if (!Array.isArray(bounds) || bounds.length < 4) return 0
610
+ if (!Array.isArray(bounds) || bounds.length < 4) return null
566
611
  const [l,t,r,b] = bounds
567
- if (r <= l || b <= t) return 0
612
+ if (r <= l || b <= t) return null
568
613
  // Do not early-return on non-interactable elements — score them so we can locate their clickable ancestor later
569
614
  const interactable = !!(el.clickable || el.enabled || el.focusable)
570
615
 
@@ -574,44 +619,80 @@ export class ToolsInteract {
574
619
  const className = normalize(el.type ?? el.class ?? '')
575
620
 
576
621
  let score = 0
622
+ let reason = 'best_scoring_candidate'
577
623
  if (exact) {
578
- if (text && text === q) score = 1.0
579
- else if (content && content === q) score = 0.95
624
+ if (text && text === q) {
625
+ score = 1.0
626
+ reason = 'exact_text_match'
627
+ } else if (content && content === q) {
628
+ score = 0.95
629
+ reason = 'exact_content_desc_match'
630
+ } else if (resourceId && resourceId === q) {
631
+ score = 0.92
632
+ reason = 'exact_resource_id_match'
633
+ } else if (className && className === q) {
634
+ score = 0.3
635
+ reason = 'exact_class_match'
636
+ }
580
637
  } else {
581
- if (text && text === q) score = 1.0
582
- else if (content && content === q) score = 0.95
583
- else if (text && text.includes(q)) score = 0.6
584
- else if (content && content.includes(q)) score = 0.55
585
- else if (resourceId && resourceId.includes(q)) score = 0.7
586
- else if (className && className.includes(q)) score = 0.3
638
+ if (text && text === q) {
639
+ score = 1.0
640
+ reason = 'exact_text_match'
641
+ } else if (content && content === q) {
642
+ score = 0.95
643
+ reason = 'exact_content_desc_match'
644
+ } else if (resourceId && resourceId === q) {
645
+ score = 0.92
646
+ reason = 'exact_resource_id_match'
647
+ } else if (text && text.includes(q)) {
648
+ score = 0.6
649
+ reason = 'partial_text_match'
650
+ } else if (content && content.includes(q)) {
651
+ score = 0.55
652
+ reason = 'partial_content_desc_match'
653
+ } else if (resourceId && resourceId.includes(q)) {
654
+ score = 0.7
655
+ reason = 'partial_resource_id_match'
656
+ } else if (className && className.includes(q)) {
657
+ score = 0.3
658
+ reason = 'partial_class_match'
659
+ }
587
660
  }
588
661
  if (score > 0 && interactable) score += 0.05
589
- return score
662
+ if (score <= 0) return null
663
+ return { el, idx, score, reason, interactable }
590
664
  }
591
665
 
592
666
  while (Date.now() <= deadline) {
593
667
  try {
594
- const tree = await ToolsObserve.getUITreeHandler({ platform, deviceId })
595
- lastTree = tree
668
+ const tree = await ToolsObserve.getUITreeHandler({ platform, deviceId })
596
669
  if (tree && Array.isArray((tree as any).elements)) {
597
670
  const elements = ((tree as any).elements as UiElement[])
671
+ const iterationCandidates: RankedResolutionCandidate[] = []
672
+ let iterationImprovedBest = false
598
673
  for (let i = 0; i < elements.length; i++) {
599
674
  const el = elements[i]
600
675
  try {
601
- const s = scoreElement(el)
602
- const interactable = !!(el.clickable || el.enabled || (el as any).focusable)
603
- if (s > bestScore) {
604
- bestScore = s
605
- best = el as UiElement
606
- if (best) { best._index = i; best._interactable = interactable }
676
+ const candidate = scoreElement(el, i)
677
+ if (!candidate) continue
678
+ iterationCandidates.push(candidate)
679
+ if (!best || candidate.score > best.score) {
680
+ best = candidate
681
+ bestTree = tree
682
+ iterationImprovedBest = true
683
+ if (best.score >= 0.95) {
684
+ shouldStop = true
685
+ break
686
+ }
607
687
  }
608
- if (bestScore >= 0.95) break
609
688
  } catch (e) { console.error('Error scoring element:', e) }
610
689
  }
611
- if (bestScore >= 0.95) break
690
+ if (iterationImprovedBest) {
691
+ bestIterationCandidates = iterationCandidates.slice()
692
+ }
612
693
  }
613
694
  } catch (e) { console.error('Error fetching UI tree:', e) }
614
- if (Date.now() > deadline) break
695
+ if (shouldStop || Date.now() > deadline) break
615
696
  await new Promise(r => setTimeout(r, 100))
616
697
  }
617
698
 
@@ -619,31 +700,32 @@ export class ToolsInteract {
619
700
 
620
701
  // If the best match is not interactable, try to resolve an actionable ancestor.
621
702
  try {
622
- const elements = (lastTree && Array.isArray(lastTree.elements)) ? (lastTree.elements as UiElement[]) : []
623
- const screen = lastTree?.resolution && typeof lastTree.resolution === 'object' ? lastTree.resolution as UiResolution : null
624
- let chosen = best as any
625
- const childBounds = Array.isArray(chosen?.bounds) ? chosen.bounds : null
703
+ const elements = (bestTree && Array.isArray(bestTree.elements)) ? (bestTree.elements as UiElement[]) : []
704
+ const screen = bestTree?.resolution && typeof bestTree.resolution === 'object' ? bestTree.resolution as UiResolution : null
705
+ let chosen = best as { el: UiElement, idx: number }
706
+ const childBounds = Array.isArray(chosen?.el?.bounds) ? chosen.el.bounds : null
626
707
 
627
708
  // Strategy 1: if parentId references an index, climb that chain
628
- let resolvedAncestor: any = null
629
- if (childBounds && (chosen.parentId !== undefined && chosen.parentId !== null)) {
709
+ let resolvedAncestor: { el: UiElement, idx: number } | null = null
710
+ if (childBounds && (chosen.el.parentId !== undefined && chosen.el.parentId !== null)) {
630
711
  let cur = chosen
631
712
  let safety = 0
632
- while (cur && safety < 20 && !(cur.clickable || cur.focusable) && (cur.parentId !== undefined && cur.parentId !== null)) {
633
- let pid = cur.parentId
713
+ while (cur && safety < 20 && !(cur.el.clickable || cur.el.focusable) && (cur.el.parentId !== undefined && cur.el.parentId !== null)) {
714
+ let pid = cur.el.parentId
634
715
  let idx: number | null = null
635
716
  if (typeof pid === 'number') idx = pid
636
717
  else if (typeof pid === 'string' && /^\d+$/.test(pid)) idx = Number(pid)
637
718
  // If parentId is not an index, try to find by matching resourceId or id field
638
719
  if (idx !== null && elements[idx]) {
639
- cur = elements[idx]
640
- if (cur && (cur.clickable || cur.enabled || cur.focusable)) { resolvedAncestor = cur; break }
720
+ cur = { el: elements[idx], idx }
721
+ if (cur && (cur.el.clickable || cur.el.enabled || cur.el.focusable)) { resolvedAncestor = cur; break }
641
722
  } else if (typeof pid === 'string') {
642
723
  // fallback: search elements for matching resourceId or id
643
- const found = elements.find((el: UiElement)=> (el.resourceId === pid || el.id === pid))
724
+ const foundIndex = elements.findIndex((el: UiElement)=> (el.resourceId === pid || el.id === pid))
725
+ const found = foundIndex >= 0 ? elements[foundIndex] : null
644
726
  if (found) {
645
- cur = found
646
- if (cur && (cur.clickable || cur.enabled || cur.focusable)) { resolvedAncestor = cur; break }
727
+ cur = { el: found, idx: foundIndex }
728
+ if (cur && (cur.el.clickable || cur.el.enabled || cur.el.focusable)) { resolvedAncestor = cur; break }
647
729
  // otherwise continue climbing if this found element has its own parentId
648
730
  } else {
649
731
  break
@@ -659,62 +741,77 @@ export class ToolsInteract {
659
741
  if (!resolvedAncestor && childBounds) {
660
742
  const [cl,ct,cr,cb] = childBounds
661
743
  // find candidates that are clickable and contain the child bounds
662
- const candidates = elements.filter((el: UiElement)=> el && (el.clickable || el.focusable) && Array.isArray(el.bounds) && el.bounds!.length>=4).map((el: UiElement)=>({el, bounds: el.bounds! as number[]}))
663
- let bestCandidate: any = null
744
+ const candidates = elements
745
+ .map((el: UiElement, idx: number) => ({ el, idx }))
746
+ .filter(({ el }) => el && (el.clickable || el.focusable) && Array.isArray(el.bounds) && el.bounds!.length >= 4)
747
+ let bestCandidate: { el: UiElement, idx: number } | null = null
664
748
  let bestCandidateArea = Infinity
665
749
  for (const c of candidates) {
666
- const [pl,pt,pr,pb] = c.bounds
750
+ const bounds = c.el.bounds as number[]
751
+ const [pl,pt,pr,pb] = bounds
667
752
  if (pl <= cl && pt <= ct && pr >= cr && pb >= cb) {
668
753
  const area = (pr-pl) * (pb-pt)
669
- if (area < bestCandidateArea) { bestCandidateArea = area; bestCandidate = c.el }
754
+ if (area < bestCandidateArea) { bestCandidateArea = area; bestCandidate = c }
670
755
  }
671
756
  }
672
757
  if (bestCandidate) resolvedAncestor = bestCandidate
673
758
  }
674
759
 
675
760
  if (resolvedAncestor) {
676
- best = resolvedAncestor
677
- // small score bump to reflect actionability
678
- bestScore = Math.min(1, bestScore + 0.02)
761
+ best = {
762
+ el: resolvedAncestor.el,
763
+ idx: resolvedAncestor.idx,
764
+ score: Math.min(1, best.score + 0.02),
765
+ reason: 'clickable_parent_preferred',
766
+ interactable: true
767
+ }
679
768
  }
680
769
 
681
- if (best && !(best.clickable || best.focusable)) {
682
- const nearbyActionable = ToolsInteract._resolveNearbyActionableControl(elements, { el: best, idx: best._index ?? elements.indexOf(best) }, screen)
770
+ if (best && !(best.el.clickable || best.el.focusable)) {
771
+ const nearbyActionable = ToolsInteract._resolveNearbyActionableControl(elements, { el: best.el, idx: best.idx }, screen)
683
772
  if (nearbyActionable) {
684
- best = nearbyActionable.el
685
- best._index = nearbyActionable.idx
686
- best._interactable = true
687
- best._sliderLike = nearbyActionable.sliderLike
773
+ best = {
774
+ el: nearbyActionable.el,
775
+ idx: nearbyActionable.idx,
776
+ score: Math.min(1, best.score + 0.02),
777
+ reason: nearbyActionable.sliderLike ? 'slider_track_preferred' : 'nearby_actionable_control',
778
+ interactable: true
779
+ }
688
780
  }
689
781
  }
690
782
  } catch (e) { console.error('Error resolving ancestor:', e) }
691
783
 
692
784
  if (!best) return { found: false, error: 'Element not found' }
693
785
 
694
- const boundsObj = Array.isArray(best.bounds) ? { left: best.bounds[0], top: best.bounds[1], right: best.bounds[2], bottom: best.bounds[3] } : null
786
+ const boundsObj = Array.isArray(best.el.bounds) ? { left: best.el.bounds[0], top: best.el.bounds[1], right: best.el.bounds[2], bottom: best.el.bounds[3] } : null
695
787
  const tapCoordinates = boundsObj ? { x: Math.floor((boundsObj.left + boundsObj.right) / 2), y: Math.floor((boundsObj.top + boundsObj.bottom) / 2) } : null
788
+ const uniqueRanked = bestIterationCandidates.filter((candidate, index, array) => index === array.findIndex((other) => other.idx === candidate.idx && other.el === candidate.el))
789
+ const alternateCandidates = uniqueRanked
790
+ .filter((candidate) => candidate.idx !== best.idx || candidate.el !== best.el)
791
+ .slice(0, 3)
792
+ .map((candidate) => ToolsInteract._summarizeResolutionCandidate(candidate))
696
793
 
697
794
  const outEl = {
698
- text: best.text ?? null,
699
- resourceId: best.resourceId ?? null,
700
- contentDesc: best.contentDescription ?? best.contentDesc ?? null,
701
- class: best.type ?? best.class ?? null,
795
+ text: best.el.text ?? null,
796
+ resourceId: best.el.resourceId ?? null,
797
+ contentDesc: best.el.contentDescription ?? best.el.contentDesc ?? null,
798
+ class: best.el.type ?? best.el.class ?? null,
702
799
  bounds: boundsObj,
703
- clickable: !!best.clickable,
704
- enabled: !!best.enabled,
705
- stable_id: best.stable_id ?? null,
706
- role: best.role ?? null,
707
- test_tag: best.test_tag ?? null,
708
- selector: best.selector ?? null,
709
- semantic: best.semantic ?? null,
800
+ clickable: !!best.el.clickable,
801
+ enabled: !!best.el.enabled,
802
+ stable_id: best.el.stable_id ?? null,
803
+ role: best.el.role ?? null,
804
+ test_tag: best.el.test_tag ?? null,
805
+ selector: best.el.selector ?? null,
806
+ semantic: best.el.semantic ?? null,
710
807
  tapCoordinates,
711
808
  telemetry: {
712
- matchedIndex: best?._index ?? null,
713
- matchedInteractable: !!best?._interactable,
714
- sliderLike: !!best?._sliderLike
809
+ matchedIndex: best.idx ?? null,
810
+ matchedInteractable: !!best.interactable,
811
+ sliderLike: best.reason === 'slider_track_preferred'
715
812
  }
716
813
  }
717
- if (best?._sliderLike) {
814
+ if (best.reason === 'slider_track_preferred') {
718
815
  const isVertical = !!boundsObj && (boundsObj.bottom - boundsObj.top) > (boundsObj.right - boundsObj.left)
719
816
  const interactionHint = {
720
817
  kind: 'slider',
@@ -723,8 +820,15 @@ export class ToolsInteract {
723
820
  }
724
821
  ;(outEl as any).interactionHint = interactionHint
725
822
  }
726
- const scoreVal = Math.min(1, Number(bestScore.toFixed(3)))
727
- return { found: true, element: outEl, score: scoreVal, confidence: scoreVal }
823
+ const scoreVal = Math.min(1, Number(best.score.toFixed(3)))
824
+ const resolution: FindElementResolutionSummary = {
825
+ confidence: scoreVal,
826
+ reason: best.reason,
827
+ fallback_available: alternateCandidates.length > 0,
828
+ matched_count: uniqueRanked.length,
829
+ alternates: alternateCandidates
830
+ }
831
+ return { found: true, element: outEl, score: scoreVal, confidence: scoreVal, resolution }
728
832
  }
729
833
 
730
834
  static async waitForUIHandler({ selector, condition = 'exists', timeout_ms = 60000, poll_interval_ms = 300, match, retry = { max_attempts: 1, backoff_ms: 0 }, platform, deviceId }: { selector?: { text?: string, resource_id?: string, accessibility_id?: string, contains?: boolean }, condition?: 'exists'|'not_exists'|'visible'|'clickable', timeout_ms?: number, poll_interval_ms?: number, match?: { index?: number }, retry?: { max_attempts?: number, backoff_ms?: number }, platform?: 'android'|'ios', deviceId?: string }) {
@@ -596,7 +596,9 @@ Recommended Usage:
596
596
  },
597
597
  {
598
598
  name: 'find_element',
599
- description: 'Find a UI element by semantic query (text, content-desc, resource-id, class). Returns best match.',
599
+ description: `Find a UI element by semantic query (text, content-desc, resource-id, class).
600
+
601
+ Returns the best match plus resolution metadata when available, including confidence, selection reason, and fallback alternates.`,
600
602
  inputSchema: {
601
603
  type: 'object',
602
604
  properties: {
@@ -13,7 +13,7 @@ export { wrapResponse, toolDefinitions, handleToolCall }
13
13
 
14
14
  // Name and version string for this server package.
  // NOTE(review): the version here appears to be bumped in step with the package release (0.26.2 -> 0.26.3) — confirm it is kept in sync with package.json.
  export const serverInfo = {
15
15
  name: 'mobile-debug-mcp',
16
- version: '0.26.2'
16
+ version: '0.26.3'
17
17
  }
18
18
 
19
19
  export function createServer() {
package/src/types.ts CHANGED
@@ -254,6 +254,79 @@ export interface ActionTargetResolved {
254
254
  semantic?: UIElementSemanticMetadata | null;
255
255
  }
256
256
 
257
/**
 * Summary of one candidate element that matched a query but was not the
 * one selected. Field names are snake_case.
 */
export interface ResolutionAlternate {
  /** Visible text of the element, or null when it has none. */
  text: string | null;
  /** Resource identifier of the element, or null when unavailable. */
  resource_id: string | null;
  /** Accessibility identifier or label of the element, or null when unavailable. */
  accessibility_id: string | null;
  /** Class or type name of the element, or null when unavailable. */
  class: string | null;
  /** Bounding rectangle of the element, or null when it has no usable bounds. */
  bounds: { left: number; top: number; right: number; bottom: number } | null;
  /** Whether the element is clickable. */
  clickable: boolean;
  /** Whether the element is enabled. */
  enabled: boolean;
  /** Match score assigned to this candidate. */
  score: number;
  /** Label naming why this candidate matched. */
  reason: string;
}
273
+
274
/**
 * Describes how a target element was resolved: the confidence in the
 * selection, the reason it was chosen, and the candidates left over.
 */
export interface ResolutionSummary {
  /** Confidence score of the selected element. */
  confidence: number;
  /** Label naming why the selected element was chosen. */
  reason: string;
  /** Whether any alternate candidate is available as a fallback. */
  fallback_available: boolean;
  /** Number of candidates that matched. */
  matched_count: number;
  /** Candidates that matched but were not selected. */
  alternates: Array<ResolutionAlternate>;
}
281
+
282
/**
 * The element payload of a successful find_element result: identifying
 * fields, geometry, optional resolution metadata, and match telemetry.
 */
export interface FindElementElement {
  /** Visible text of the element, or null when it has none. */
  text: string | null;
  /** Resource identifier of the element, or null when unavailable. */
  resourceId: string | null;
  /** Content description of the element, or null when unavailable. */
  contentDesc: string | null;
  /** Class or type name of the element, or null when unavailable. */
  class: string | null;
  /** Bounding rectangle of the element, or null when it has no bounds array. */
  bounds: { left: number; top: number; right: number; bottom: number } | null;
  /** Whether the element is clickable. */
  clickable: boolean;
  /** Whether the element is enabled. */
  enabled: boolean;
  stable_id?: string | null;
  role?: string | null;
  test_tag?: string | null;
  selector?: UIResolutionSelector | null;
  semantic?: UIElementSemanticMetadata | null;
  /** Center point of `bounds`, or null when `bounds` is null. */
  tapCoordinates: { x: number; y: number } | null;
  /** Details about how the element was matched. */
  telemetry: {
    /** Index of the matched element in the UI tree, or null when unknown. */
    matchedIndex: number | null;
    /** Whether the matched element was considered interactable. */
    matchedInteractable: boolean;
    /** Whether the match was resolved to a slider-like control. */
    sliderLike: boolean;
  };
  /** Present only for slider-like matches. */
  interactionHint?: {
    kind: 'slider';
    axis: 'horizontal' | 'vertical';
    trackBounds: { left: number; top: number; right: number; bottom: number } | null;
  };
}
320
+
321
/**
 * Result of a find_element lookup. On success `found` is true and the
 * element, score, confidence, and resolution fields are populated; on
 * failure `found` is false and `error` carries the message.
 */
export interface FindElementResponse {
  /** Whether a matching element was found. */
  found: boolean;
  /** The resolved element, when one was found. */
  element?: FindElementElement | null;
  /** Score of the resolved element. */
  score?: number;
  /** Confidence in the resolved element. */
  confidence?: number;
  /** Resolution metadata: selection reason and fallback alternates. */
  resolution?: ResolutionSummary | null;
  /** Failure description, e.g. when the query is empty or nothing matched. */
  error?: string;
}
329
+
257
330
  export interface ActionExecutionResult {
258
331
  action_id: string;
259
332
  timestamp: string;
@@ -73,6 +73,9 @@ async function run() {
73
73
  process.stdout.write('res4 ' + JSON.stringify(res4, null, 2) + '\n');
74
74
  const pass4 = res4.found === true && res4.element && res4.element.clickable === true && res4.element.resourceId === 'btn_generate' && res4.element.tapCoordinates && typeof res4.element.tapCoordinates.x === 'number' && typeof res4.element.tapCoordinates.y === 'number' && typeof res4.confidence === 'number'
75
75
  assert.ok(pass4, 'Child text should resolve to a clickable parent ancestor')
76
+ assert.strictEqual(res4.resolution?.reason, 'clickable_parent_preferred')
77
+ assert.strictEqual(res4.resolution?.fallback_available, true)
78
+ assert.ok((res4.resolution?.alternates || []).length >= 1, 'Parent promotion should preserve alternates')
76
79
  process.stdout.write('Test 4: ' + (pass4 ? 'PASS' : 'FAIL') + '\n');
77
80
 
78
81
  // Test 5: duration label should resolve to the nearby slider control
@@ -111,6 +114,8 @@ async function run() {
111
114
  process.stdout.write('Test 6: ' + (pass6 ? 'PASS' : 'FAIL') + '\n');
112
115
  const pass6b = res6.element && res6.element.telemetry && res6.element.telemetry.sliderLike === true && res6.element.interactionHint && res6.element.interactionHint.kind === 'slider'
113
116
  assert.ok(pass6b, 'Duration lookup should include slider-specific telemetry')
117
+ assert.strictEqual(res6.resolution?.reason, 'slider_track_preferred')
118
+ assert.strictEqual(res6.resolution?.fallback_available, true)
114
119
  process.stdout.write('Test 6b: ' + (pass6b ? 'PASS' : 'FAIL') + '\n');
115
120
 
116
121
  // Test 7: prefer vertical track-like control over a closer text button