mobile-debug-mcp 0.26.2 → 0.26.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +3 -0
- package/dist/interact/index.js +144 -70
- package/dist/server/tool-definitions.js +3 -1
- package/dist/server-core.js +1 -1
- package/docs/CHANGELOG.md +3 -0
- package/docs/ROADMAP.md +11 -10
- package/docs/rfcs/007-actionability-resolution-and-executable-target-selection.md +277 -0
- package/docs/tools/interact.md +9 -1
- package/package.json +1 -1
- package/src/interact/index.ts +176 -72
- package/src/server/tool-definitions.ts +3 -1
- package/src/server-core.ts +1 -1
- package/src/types.ts +73 -0
- package/test/unit/observe/find_element.test.ts +5 -0
package/AGENTS.md
CHANGED
|
@@ -41,11 +41,14 @@ Portable agent skills live under `skills/`.
|
|
|
41
41
|
- `skills/README.md` — repo-wide skill convention
|
|
42
42
|
- `skills/mcp-builder/` — build/install/toolchain guidance
|
|
43
43
|
- `skills/test-authoring/` — test creation and placement guidance
|
|
44
|
+
- `skills/rfc-review/` — RFC review rubric and response template
|
|
44
45
|
|
|
45
46
|
If the task is about **creating or updating tests**, load `skills/test-authoring/SKILL.md` first.
|
|
46
47
|
|
|
47
48
|
If the task is about **building, installing, or diagnosing native tooling**, load `skills/mcp-builder/SKILL.md` first.
|
|
48
49
|
|
|
50
|
+
If the task is about **reviewing an RFC or spec draft**, load `skills/rfc-review/SKILL.md` first.
|
|
51
|
+
|
|
49
52
|
### Repository docs
|
|
50
53
|
|
|
51
54
|
- `README.md` — high-level repo overview and commands
|
package/dist/interact/index.js
CHANGED
|
@@ -203,6 +203,22 @@ export class ToolsInteract {
|
|
|
203
203
|
semantic: element.semantic ?? null
|
|
204
204
|
};
|
|
205
205
|
}
|
|
206
|
+
static _summarizeResolutionCandidate(candidate) {
|
|
207
|
+
const bounds = ToolsInteract._normalizeBounds(candidate.el.bounds);
|
|
208
|
+
return {
|
|
209
|
+
text: candidate.el.text ?? null,
|
|
210
|
+
resource_id: candidate.el.resourceId ?? candidate.el.resourceID ?? candidate.el.id ?? null,
|
|
211
|
+
accessibility_id: candidate.el.contentDescription ?? candidate.el.contentDesc ?? candidate.el.accessibilityLabel ?? candidate.el.label ?? null,
|
|
212
|
+
class: candidate.el.type ?? candidate.el.class ?? null,
|
|
213
|
+
bounds: bounds
|
|
214
|
+
? { left: bounds[0], top: bounds[1], right: bounds[2], bottom: bounds[3] }
|
|
215
|
+
: null,
|
|
216
|
+
clickable: !!candidate.el.clickable,
|
|
217
|
+
enabled: !!candidate.el.enabled,
|
|
218
|
+
score: candidate.score,
|
|
219
|
+
reason: candidate.reason
|
|
220
|
+
};
|
|
221
|
+
}
|
|
206
222
|
static _actionFailure(actionType, selector, resolved, failureCode, retryable, uiFingerprintBefore, uiFingerprintAfter, sourceModule = 'interact') {
|
|
207
223
|
return buildActionExecutionResult({
|
|
208
224
|
actionType,
|
|
@@ -419,17 +435,18 @@ export class ToolsInteract {
|
|
|
419
435
|
if (!q)
|
|
420
436
|
return { found: false, error: 'Empty query' };
|
|
421
437
|
let best = null;
|
|
422
|
-
let
|
|
423
|
-
let
|
|
424
|
-
|
|
438
|
+
let bestTree = null;
|
|
439
|
+
let bestIterationCandidates = [];
|
|
440
|
+
let shouldStop = false;
|
|
441
|
+
const scoreElement = (el, idx) => {
|
|
425
442
|
if (!el || !el.visible)
|
|
426
|
-
return
|
|
443
|
+
return null;
|
|
427
444
|
const bounds = el.bounds || [0, 0, 0, 0];
|
|
428
445
|
if (!Array.isArray(bounds) || bounds.length < 4)
|
|
429
|
-
return
|
|
446
|
+
return null;
|
|
430
447
|
const [l, t, r, b] = bounds;
|
|
431
448
|
if (r <= l || b <= t)
|
|
432
|
-
return
|
|
449
|
+
return null;
|
|
433
450
|
// Do not early-return on non-interactable elements — score them so we can locate their clickable ancestor later
|
|
434
451
|
const interactable = !!(el.clickable || el.enabled || el.focusable);
|
|
435
452
|
const text = normalize(el.text ?? el.label ?? el.value ?? '');
|
|
@@ -437,64 +454,98 @@ export class ToolsInteract {
|
|
|
437
454
|
const resourceId = normalize(el.resourceId ?? el.resourceID ?? el.id ?? '');
|
|
438
455
|
const className = normalize(el.type ?? el.class ?? '');
|
|
439
456
|
let score = 0;
|
|
457
|
+
let reason = 'best_scoring_candidate';
|
|
440
458
|
if (exact) {
|
|
441
|
-
if (text && text === q)
|
|
459
|
+
if (text && text === q) {
|
|
442
460
|
score = 1.0;
|
|
443
|
-
|
|
461
|
+
reason = 'exact_text_match';
|
|
462
|
+
}
|
|
463
|
+
else if (content && content === q) {
|
|
444
464
|
score = 0.95;
|
|
465
|
+
reason = 'exact_content_desc_match';
|
|
466
|
+
}
|
|
467
|
+
else if (resourceId && resourceId === q) {
|
|
468
|
+
score = 0.92;
|
|
469
|
+
reason = 'exact_resource_id_match';
|
|
470
|
+
}
|
|
471
|
+
else if (className && className === q) {
|
|
472
|
+
score = 0.3;
|
|
473
|
+
reason = 'exact_class_match';
|
|
474
|
+
}
|
|
445
475
|
}
|
|
446
476
|
else {
|
|
447
|
-
if (text && text === q)
|
|
477
|
+
if (text && text === q) {
|
|
448
478
|
score = 1.0;
|
|
449
|
-
|
|
479
|
+
reason = 'exact_text_match';
|
|
480
|
+
}
|
|
481
|
+
else if (content && content === q) {
|
|
450
482
|
score = 0.95;
|
|
451
|
-
|
|
483
|
+
reason = 'exact_content_desc_match';
|
|
484
|
+
}
|
|
485
|
+
else if (resourceId && resourceId === q) {
|
|
486
|
+
score = 0.92;
|
|
487
|
+
reason = 'exact_resource_id_match';
|
|
488
|
+
}
|
|
489
|
+
else if (text && text.includes(q)) {
|
|
452
490
|
score = 0.6;
|
|
453
|
-
|
|
491
|
+
reason = 'partial_text_match';
|
|
492
|
+
}
|
|
493
|
+
else if (content && content.includes(q)) {
|
|
454
494
|
score = 0.55;
|
|
455
|
-
|
|
495
|
+
reason = 'partial_content_desc_match';
|
|
496
|
+
}
|
|
497
|
+
else if (resourceId && resourceId.includes(q)) {
|
|
456
498
|
score = 0.7;
|
|
457
|
-
|
|
499
|
+
reason = 'partial_resource_id_match';
|
|
500
|
+
}
|
|
501
|
+
else if (className && className.includes(q)) {
|
|
458
502
|
score = 0.3;
|
|
503
|
+
reason = 'partial_class_match';
|
|
504
|
+
}
|
|
459
505
|
}
|
|
460
506
|
if (score > 0 && interactable)
|
|
461
507
|
score += 0.05;
|
|
462
|
-
|
|
508
|
+
if (score <= 0)
|
|
509
|
+
return null;
|
|
510
|
+
return { el, idx, score, reason, interactable };
|
|
463
511
|
};
|
|
464
512
|
while (Date.now() <= deadline) {
|
|
465
513
|
try {
|
|
466
514
|
const tree = await ToolsObserve.getUITreeHandler({ platform, deviceId });
|
|
467
|
-
lastTree = tree;
|
|
468
515
|
if (tree && Array.isArray(tree.elements)) {
|
|
469
516
|
const elements = tree.elements;
|
|
517
|
+
const iterationCandidates = [];
|
|
518
|
+
let iterationImprovedBest = false;
|
|
470
519
|
for (let i = 0; i < elements.length; i++) {
|
|
471
520
|
const el = elements[i];
|
|
472
521
|
try {
|
|
473
|
-
const
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
522
|
+
const candidate = scoreElement(el, i);
|
|
523
|
+
if (!candidate)
|
|
524
|
+
continue;
|
|
525
|
+
iterationCandidates.push(candidate);
|
|
526
|
+
if (!best || candidate.score > best.score) {
|
|
527
|
+
best = candidate;
|
|
528
|
+
bestTree = tree;
|
|
529
|
+
iterationImprovedBest = true;
|
|
530
|
+
if (best.score >= 0.95) {
|
|
531
|
+
shouldStop = true;
|
|
532
|
+
break;
|
|
481
533
|
}
|
|
482
534
|
}
|
|
483
|
-
if (bestScore >= 0.95)
|
|
484
|
-
break;
|
|
485
535
|
}
|
|
486
536
|
catch (e) {
|
|
487
537
|
console.error('Error scoring element:', e);
|
|
488
538
|
}
|
|
489
539
|
}
|
|
490
|
-
if (
|
|
491
|
-
|
|
540
|
+
if (iterationImprovedBest) {
|
|
541
|
+
bestIterationCandidates = iterationCandidates.slice();
|
|
542
|
+
}
|
|
492
543
|
}
|
|
493
544
|
}
|
|
494
545
|
catch (e) {
|
|
495
546
|
console.error('Error fetching UI tree:', e);
|
|
496
547
|
}
|
|
497
|
-
if (Date.now() > deadline)
|
|
548
|
+
if (shouldStop || Date.now() > deadline)
|
|
498
549
|
break;
|
|
499
550
|
await new Promise(r => setTimeout(r, 100));
|
|
500
551
|
}
|
|
@@ -502,17 +553,17 @@ export class ToolsInteract {
|
|
|
502
553
|
return { found: false, error: 'Element not found' };
|
|
503
554
|
// If the best match is not interactable, try to resolve an actionable ancestor.
|
|
504
555
|
try {
|
|
505
|
-
const elements = (
|
|
506
|
-
const screen =
|
|
556
|
+
const elements = (bestTree && Array.isArray(bestTree.elements)) ? bestTree.elements : [];
|
|
557
|
+
const screen = bestTree?.resolution && typeof bestTree.resolution === 'object' ? bestTree.resolution : null;
|
|
507
558
|
let chosen = best;
|
|
508
|
-
const childBounds = Array.isArray(chosen?.bounds) ? chosen.bounds : null;
|
|
559
|
+
const childBounds = Array.isArray(chosen?.el?.bounds) ? chosen.el.bounds : null;
|
|
509
560
|
// Strategy 1: if parentId references an index, climb that chain
|
|
510
561
|
let resolvedAncestor = null;
|
|
511
|
-
if (childBounds && (chosen.parentId !== undefined && chosen.parentId !== null)) {
|
|
562
|
+
if (childBounds && (chosen.el.parentId !== undefined && chosen.el.parentId !== null)) {
|
|
512
563
|
let cur = chosen;
|
|
513
564
|
let safety = 0;
|
|
514
|
-
while (cur && safety < 20 && !(cur.clickable || cur.focusable) && (cur.parentId !== undefined && cur.parentId !== null)) {
|
|
515
|
-
let pid = cur.parentId;
|
|
565
|
+
while (cur && safety < 20 && !(cur.el.clickable || cur.el.focusable) && (cur.el.parentId !== undefined && cur.el.parentId !== null)) {
|
|
566
|
+
let pid = cur.el.parentId;
|
|
516
567
|
let idx = null;
|
|
517
568
|
if (typeof pid === 'number')
|
|
518
569
|
idx = pid;
|
|
@@ -520,18 +571,19 @@ export class ToolsInteract {
|
|
|
520
571
|
idx = Number(pid);
|
|
521
572
|
// If parentId is not an index, try to find by matching resourceId or id field
|
|
522
573
|
if (idx !== null && elements[idx]) {
|
|
523
|
-
cur = elements[idx];
|
|
524
|
-
if (cur && (cur.clickable || cur.enabled || cur.focusable)) {
|
|
574
|
+
cur = { el: elements[idx], idx };
|
|
575
|
+
if (cur && (cur.el.clickable || cur.el.enabled || cur.el.focusable)) {
|
|
525
576
|
resolvedAncestor = cur;
|
|
526
577
|
break;
|
|
527
578
|
}
|
|
528
579
|
}
|
|
529
580
|
else if (typeof pid === 'string') {
|
|
530
581
|
// fallback: search elements for matching resourceId or id
|
|
531
|
-
const
|
|
582
|
+
const foundIndex = elements.findIndex((el) => (el.resourceId === pid || el.id === pid));
|
|
583
|
+
const found = foundIndex >= 0 ? elements[foundIndex] : null;
|
|
532
584
|
if (found) {
|
|
533
|
-
cur = found;
|
|
534
|
-
if (cur && (cur.clickable || cur.enabled || cur.focusable)) {
|
|
585
|
+
cur = { el: found, idx: foundIndex };
|
|
586
|
+
if (cur && (cur.el.clickable || cur.el.enabled || cur.el.focusable)) {
|
|
535
587
|
resolvedAncestor = cur;
|
|
536
588
|
break;
|
|
537
589
|
}
|
|
@@ -551,16 +603,19 @@ export class ToolsInteract {
|
|
|
551
603
|
if (!resolvedAncestor && childBounds) {
|
|
552
604
|
const [cl, ct, cr, cb] = childBounds;
|
|
553
605
|
// find candidates that are clickable and contain the child bounds
|
|
554
|
-
const candidates = elements
|
|
606
|
+
const candidates = elements
|
|
607
|
+
.map((el, idx) => ({ el, idx }))
|
|
608
|
+
.filter(({ el }) => el && (el.clickable || el.focusable) && Array.isArray(el.bounds) && el.bounds.length >= 4);
|
|
555
609
|
let bestCandidate = null;
|
|
556
610
|
let bestCandidateArea = Infinity;
|
|
557
611
|
for (const c of candidates) {
|
|
558
|
-
const
|
|
612
|
+
const bounds = c.el.bounds;
|
|
613
|
+
const [pl, pt, pr, pb] = bounds;
|
|
559
614
|
if (pl <= cl && pt <= ct && pr >= cr && pb >= cb) {
|
|
560
615
|
const area = (pr - pl) * (pb - pt);
|
|
561
616
|
if (area < bestCandidateArea) {
|
|
562
617
|
bestCandidateArea = area;
|
|
563
|
-
bestCandidate = c
|
|
618
|
+
bestCandidate = c;
|
|
564
619
|
}
|
|
565
620
|
}
|
|
566
621
|
}
|
|
@@ -568,17 +623,24 @@ export class ToolsInteract {
|
|
|
568
623
|
resolvedAncestor = bestCandidate;
|
|
569
624
|
}
|
|
570
625
|
if (resolvedAncestor) {
|
|
571
|
-
best =
|
|
572
|
-
|
|
573
|
-
|
|
626
|
+
best = {
|
|
627
|
+
el: resolvedAncestor.el,
|
|
628
|
+
idx: resolvedAncestor.idx,
|
|
629
|
+
score: Math.min(1, best.score + 0.02),
|
|
630
|
+
reason: 'clickable_parent_preferred',
|
|
631
|
+
interactable: true
|
|
632
|
+
};
|
|
574
633
|
}
|
|
575
|
-
if (best && !(best.clickable || best.focusable)) {
|
|
576
|
-
const nearbyActionable = ToolsInteract._resolveNearbyActionableControl(elements, { el: best, idx: best.
|
|
634
|
+
if (best && !(best.el.clickable || best.el.focusable)) {
|
|
635
|
+
const nearbyActionable = ToolsInteract._resolveNearbyActionableControl(elements, { el: best.el, idx: best.idx }, screen);
|
|
577
636
|
if (nearbyActionable) {
|
|
578
|
-
best =
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
637
|
+
best = {
|
|
638
|
+
el: nearbyActionable.el,
|
|
639
|
+
idx: nearbyActionable.idx,
|
|
640
|
+
score: Math.min(1, best.score + 0.02),
|
|
641
|
+
reason: nearbyActionable.sliderLike ? 'slider_track_preferred' : 'nearby_actionable_control',
|
|
642
|
+
interactable: true
|
|
643
|
+
};
|
|
582
644
|
}
|
|
583
645
|
}
|
|
584
646
|
}
|
|
@@ -587,29 +649,34 @@ export class ToolsInteract {
|
|
|
587
649
|
}
|
|
588
650
|
if (!best)
|
|
589
651
|
return { found: false, error: 'Element not found' };
|
|
590
|
-
const boundsObj = Array.isArray(best.bounds) ? { left: best.bounds[0], top: best.bounds[1], right: best.bounds[2], bottom: best.bounds[3] } : null;
|
|
652
|
+
const boundsObj = Array.isArray(best.el.bounds) ? { left: best.el.bounds[0], top: best.el.bounds[1], right: best.el.bounds[2], bottom: best.el.bounds[3] } : null;
|
|
591
653
|
const tapCoordinates = boundsObj ? { x: Math.floor((boundsObj.left + boundsObj.right) / 2), y: Math.floor((boundsObj.top + boundsObj.bottom) / 2) } : null;
|
|
654
|
+
const uniqueRanked = bestIterationCandidates.filter((candidate, index, array) => index === array.findIndex((other) => other.idx === candidate.idx && other.el === candidate.el));
|
|
655
|
+
const alternateCandidates = uniqueRanked
|
|
656
|
+
.filter((candidate) => candidate.idx !== best.idx || candidate.el !== best.el)
|
|
657
|
+
.slice(0, 3)
|
|
658
|
+
.map((candidate) => ToolsInteract._summarizeResolutionCandidate(candidate));
|
|
592
659
|
const outEl = {
|
|
593
|
-
text: best.text ?? null,
|
|
594
|
-
resourceId: best.resourceId ?? null,
|
|
595
|
-
contentDesc: best.contentDescription ?? best.contentDesc ?? null,
|
|
596
|
-
class: best.type ?? best.class ?? null,
|
|
660
|
+
text: best.el.text ?? null,
|
|
661
|
+
resourceId: best.el.resourceId ?? null,
|
|
662
|
+
contentDesc: best.el.contentDescription ?? best.el.contentDesc ?? null,
|
|
663
|
+
class: best.el.type ?? best.el.class ?? null,
|
|
597
664
|
bounds: boundsObj,
|
|
598
|
-
clickable: !!best.clickable,
|
|
599
|
-
enabled: !!best.enabled,
|
|
600
|
-
stable_id: best.stable_id ?? null,
|
|
601
|
-
role: best.role ?? null,
|
|
602
|
-
test_tag: best.test_tag ?? null,
|
|
603
|
-
selector: best.selector ?? null,
|
|
604
|
-
semantic: best.semantic ?? null,
|
|
665
|
+
clickable: !!best.el.clickable,
|
|
666
|
+
enabled: !!best.el.enabled,
|
|
667
|
+
stable_id: best.el.stable_id ?? null,
|
|
668
|
+
role: best.el.role ?? null,
|
|
669
|
+
test_tag: best.el.test_tag ?? null,
|
|
670
|
+
selector: best.el.selector ?? null,
|
|
671
|
+
semantic: best.el.semantic ?? null,
|
|
605
672
|
tapCoordinates,
|
|
606
673
|
telemetry: {
|
|
607
|
-
matchedIndex: best
|
|
608
|
-
matchedInteractable: !!best
|
|
609
|
-
sliderLike:
|
|
674
|
+
matchedIndex: best.idx ?? null,
|
|
675
|
+
matchedInteractable: !!best.interactable,
|
|
676
|
+
sliderLike: best.reason === 'slider_track_preferred'
|
|
610
677
|
}
|
|
611
678
|
};
|
|
612
|
-
if (best
|
|
679
|
+
if (best.reason === 'slider_track_preferred') {
|
|
613
680
|
const isVertical = !!boundsObj && (boundsObj.bottom - boundsObj.top) > (boundsObj.right - boundsObj.left);
|
|
614
681
|
const interactionHint = {
|
|
615
682
|
kind: 'slider',
|
|
@@ -618,8 +685,15 @@ export class ToolsInteract {
|
|
|
618
685
|
};
|
|
619
686
|
outEl.interactionHint = interactionHint;
|
|
620
687
|
}
|
|
621
|
-
const scoreVal = Math.min(1, Number(
|
|
622
|
-
|
|
688
|
+
const scoreVal = Math.min(1, Number(best.score.toFixed(3)));
|
|
689
|
+
const resolution = {
|
|
690
|
+
confidence: scoreVal,
|
|
691
|
+
reason: best.reason,
|
|
692
|
+
fallback_available: alternateCandidates.length > 0,
|
|
693
|
+
matched_count: uniqueRanked.length,
|
|
694
|
+
alternates: alternateCandidates
|
|
695
|
+
};
|
|
696
|
+
return { found: true, element: outEl, score: scoreVal, confidence: scoreVal, resolution };
|
|
623
697
|
}
|
|
624
698
|
static async waitForUIHandler({ selector, condition = 'exists', timeout_ms = 60000, poll_interval_ms = 300, match, retry = { max_attempts: 1, backoff_ms: 0 }, platform, deviceId }) {
|
|
625
699
|
const overallStart = Date.now();
|
|
@@ -596,7 +596,9 @@ Recommended Usage:
|
|
|
596
596
|
},
|
|
597
597
|
{
|
|
598
598
|
name: 'find_element',
|
|
599
|
-
description:
|
|
599
|
+
description: `Find a UI element by semantic query (text, content-desc, resource-id, class).
|
|
600
|
+
|
|
601
|
+
Returns the best match plus resolution metadata when available, including confidence, selection reason, and fallback alternates.`,
|
|
600
602
|
inputSchema: {
|
|
601
603
|
type: 'object',
|
|
602
604
|
properties: {
|
package/dist/server-core.js
CHANGED
|
@@ -6,7 +6,7 @@ import { handleToolCall } from './server/tool-handlers.js';
|
|
|
6
6
|
export { wrapResponse, toolDefinitions, handleToolCall };
|
|
7
7
|
export const serverInfo = {
|
|
8
8
|
name: 'mobile-debug-mcp',
|
|
9
|
-
version: '0.26.
|
|
9
|
+
version: '0.26.3'
|
|
10
10
|
};
|
|
11
11
|
export function createServer() {
|
|
12
12
|
const server = new Server(serverInfo, {
|
package/docs/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to the **Mobile Debug MCP** project will be documented in this file.
|
|
4
4
|
|
|
5
|
+
## [0.26.3]
|
|
6
|
+
- updates the `find_element` tool to return detailed resolution metadata, including confidence scores, selection reasons, and fallback alternates
|
|
7
|
+
|
|
5
8
|
## [0.26.2]
|
|
6
9
|
- unified action execution and verification model
|
|
7
10
|
|
package/docs/ROADMAP.md
CHANGED
|
@@ -53,9 +53,9 @@ Higher task success with fewer retries.
|
|
|
53
53
|
## Upcoming Work
|
|
54
54
|
|
|
55
55
|
- Adjustable Control Support
|
|
56
|
+
- Better Compose / Custom Control Semantics
|
|
56
57
|
- Signal-Oriented Diagnostic Filtering
|
|
57
58
|
- Long Press Gesture
|
|
58
|
-
- Better Compose / Custom Control Semantics
|
|
59
59
|
|
|
60
60
|
## Later Horizon
|
|
61
61
|
|
|
@@ -160,6 +160,7 @@ Addresses failures where agents:
|
|
|
160
160
|
- wait_for_ui_change (hierarchy diff based waiting)
|
|
161
161
|
- Structured loading state detection
|
|
162
162
|
- Snapshot revision / staleness metadata
|
|
163
|
+
- Focused snapshot views / incremental snapshot diffs
|
|
163
164
|
- Compose-aware wait robustness improvements
|
|
164
165
|
|
|
165
166
|
## Expected Impact
|
|
@@ -169,6 +170,7 @@ Very high.
|
|
|
169
170
|
- wait_for_ui_change implemented
|
|
170
171
|
- Loading state detection available for representative controls
|
|
171
172
|
- Snapshot revision or staleness metadata exposed
|
|
173
|
+
- Focused or diff-oriented snapshots validated in benchmark flows
|
|
172
174
|
- UI-first sync guidance added to spec guardrails
|
|
173
175
|
- In-place update waits validated on benchmark flows
|
|
174
176
|
|
|
@@ -379,9 +381,9 @@ Strengthens:
|
|
|
379
381
|
# Better Compose / Custom Control Semantics
|
|
380
382
|
|
|
381
383
|
## Rationale
|
|
382
|
-
|
|
384
|
+
This item was raised in priority after agent feedback exposed custom control semantics as a core reliability gap rather than a later optimization.
|
|
383
385
|
|
|
384
|
-
**Status:**
|
|
386
|
+
**Status:** Spec Ready
|
|
385
387
|
|
|
386
388
|
Semantics become more useful once:
|
|
387
389
|
- identity is stronger
|
|
@@ -419,7 +421,6 @@ Depends on:
|
|
|
419
421
|
- Wait and Synchronization Reliability
|
|
420
422
|
- Actionability Resolution
|
|
421
423
|
- Adjustable Control Support
|
|
422
|
-
- Signal-Oriented Diagnostic Filtering
|
|
423
424
|
- Long Press Gesture
|
|
424
425
|
|
|
425
426
|
---
|
|
@@ -543,19 +544,19 @@ Make core loop more reliable.
|
|
|
543
544
|
|
|
544
545
|
## Wave 2 (Control Precision + Diagnostics)
|
|
545
546
|
- Adjustable Control Support
|
|
547
|
+
- Better Compose / Custom Control Semantics
|
|
546
548
|
- Signal-Oriented Diagnostic Filtering
|
|
547
549
|
|
|
548
550
|
Focus:
|
|
549
|
-
Improve control precision and signal observability.
|
|
551
|
+
Improve control precision, custom control semantics, and signal observability.
|
|
550
552
|
|
|
551
553
|
---
|
|
552
554
|
|
|
553
555
|
## Wave 3 (Interaction Expansion)
|
|
554
556
|
- Long Press Gesture
|
|
555
|
-
- Better Compose / Custom Control Semantics
|
|
556
557
|
|
|
557
558
|
Focus:
|
|
558
|
-
Expand interaction capability.
|
|
559
|
+
Expand interaction capability after core control reliability is improved.
|
|
559
560
|
|
|
560
561
|
---
|
|
561
562
|
|
|
@@ -576,9 +577,9 @@ Roadmap Ordering:
|
|
|
576
577
|
3. Wait and Synchronization Reliability
|
|
577
578
|
4. Actionability Resolution
|
|
578
579
|
5. Adjustable Control Support
|
|
579
|
-
6.
|
|
580
|
-
7.
|
|
581
|
-
8.
|
|
580
|
+
6. Better Compose / Custom Control Semantics
|
|
581
|
+
7. Signal-Oriented Diagnostic Filtering
|
|
582
|
+
8. Long Press Gesture
|
|
582
583
|
9. Pinch to Zoom
|
|
583
584
|
10. Action Trace Correlation
|
|
584
585
|
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
# RFC 007 — Actionability Resolution and Executable Target Selection
|
|
2
|
+
|
|
3
|
+
## 1. Summary
|
|
4
|
+
|
|
5
|
+
This RFC defines how the system resolves which discovered UI element should receive an action before dispatch.
|
|
6
|
+
|
|
7
|
+
It addresses ambiguity between:
|
|
8
|
+
- visible elements vs actionable elements
|
|
9
|
+
- leaf nodes vs clickable containers
|
|
10
|
+
- semantic targets vs coordinate fallbacks
|
|
11
|
+
- multiple candidate targets with uncertain executability
|
|
12
|
+
|
|
13
|
+
Goal:
|
|
14
|
+
Improve first-attempt action correctness by resolving the best executable target prior to action dispatch.
|
|
15
|
+
|
|
16
|
+
This RFC defines the `Resolved` stage semantics referenced in RFC 005 and operationalized by RFC 006.
|
|
17
|
+
It is grounded in the existing element-resolution flow and extends current resolution behavior rather than assuming a wholly new resolver architecture.
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## 2. Problem Statement
|
|
22
|
+
|
|
23
|
+
Current interaction failures often arise before execution.
|
|
24
|
+
|
|
25
|
+
The agent may discover the intended UI concept, but not the correct executable target.
|
|
26
|
+
|
|
27
|
+
Examples:
|
|
28
|
+
- tapping label text instead of clickable container
|
|
29
|
+
- sliders not surfacing semantic handles
|
|
30
|
+
- generic Compose containers hiding true affordances
|
|
31
|
+
- multiple matching targets without ranking logic
|
|
32
|
+
|
|
33
|
+
Observed failure modes:
|
|
34
|
+
- false taps
|
|
35
|
+
- submit ambiguity
|
|
36
|
+
- coordinate guessing
|
|
37
|
+
- retry loops
|
|
38
|
+
- brittle fallback behavior
|
|
39
|
+
|
|
40
|
+
This is a target-resolution problem, not an execution problem.
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## 3. Design Goals
|
|
45
|
+
|
|
46
|
+
Resolution MUST:
|
|
47
|
+
- Prefer executable targets over merely visible matches
|
|
48
|
+
- Reduce ambiguous target selection
|
|
49
|
+
- Support confidence-based ranking
|
|
50
|
+
- Build on existing runtime resolution surfaces before introducing new resolution metadata
|
|
51
|
+
- Use structural and semantic resolution signals
|
|
52
|
+
- Minimize coordinate fallback usage
|
|
53
|
+
- Integrate with verification expectations from RFC 005
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## 4. Actionability Model
|
|
58
|
+
|
|
59
|
+
Candidate targets are evaluated using actionability signals.
|
|
60
|
+
|
|
61
|
+
### Structural signals
|
|
62
|
+
- clickable
|
|
63
|
+
- enabled
|
|
64
|
+
- focusable
|
|
65
|
+
- bounds
|
|
66
|
+
- parent action ownership
|
|
67
|
+
|
|
68
|
+
### Semantic signals
|
|
69
|
+
- control role
|
|
70
|
+
- label association
|
|
71
|
+
- affordance hints
|
|
72
|
+
- selectable or adjustable semantics
|
|
73
|
+
|
|
74
|
+
### Interaction signals
|
|
75
|
+
- reliable target patterns
|
|
76
|
+
- control-specific heuristics
|
|
77
|
+
- gesture compatibility
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## 4.1 Current Runtime Resolution Surfaces
|
|
82
|
+
|
|
83
|
+
This RFC builds on current runtime resolution paths, including:
|
|
84
|
+
- `findElementHandler` for candidate discovery
|
|
85
|
+
- `_resolveActionableAncestor` for executable ancestor promotion
|
|
86
|
+
- `tapElementHandler` for resolved element dispatch
|
|
87
|
+
- `scrollToElementHandler` for scroll-mediated target acquisition
|
|
88
|
+
|
|
89
|
+
These existing handlers are the current implementation substrate for the Resolved stage.
|
|
90
|
+
This RFC extends and systematizes those behaviors; it does not assume replacement of those paths.
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
## 5. Target Candidate Ranking
|
|
95
|
+
|
|
96
|
+
When multiple targets match, candidates are ranked.
|
|
97
|
+
|
|
98
|
+
Illustrative confidence model:
|
|
99
|
+
|
|
100
|
+
resolution_confidence =
|
|
101
|
+
interactability_score
|
|
102
|
+
+ semantic_match_score
|
|
103
|
+
+ structural_reliability_score
|
|
104
|
+
|
|
105
|
+
Highest-confidence executable target is preferred.
|
|
106
|
+
|
|
107
|
+
The confidence model is illustrative; it is normative only at the rule-precedence level. Implementations may use simpler heuristics as long as they preserve the resolution ordering guarantees. Any scoring mechanism is implementation-defined and need not be externally surfaced.
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## 6. Resolution Rules
|
|
112
|
+
|
|
113
|
+
### Rule A — Prefer actionable containers over passive leaf nodes
|
|
114
|
+
|
|
115
|
+
Prefer:
|
|
116
|
+
- clickable container
|
|
117
|
+
|
|
118
|
+
Over:
|
|
119
|
+
- passive child text nodes
|
|
120
|
+
|
|
121
|
+
Example:
|
|
122
|
+
Prefer button container over "Generate Session" label node.
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
### Rule B — Prefer semantic controls over coordinate fallbacks
|
|
127
|
+
|
|
128
|
+
Use semantic control targets whenever possible.
|
|
129
|
+
|
|
130
|
+
Use coordinate fallback only when:
|
|
131
|
+
- no semantic target exists
|
|
132
|
+
- adjustable control semantics absent
|
|
133
|
+
- fallback confidence acceptable
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
### Rule C — Prefer explicit affordance ownership
|
|
138
|
+
|
|
139
|
+
If child and parent differ:
|
|
140
|
+
prefer the node owning the action handler.
|
|
141
|
+
|
|
142
|
+
---
|
|
143
|
+
|
|
144
|
+
## 7. Ambiguity Handling
|
|
145
|
+
|
|
146
|
+
When multiple plausible targets remain:
|
|
147
|
+
|
|
148
|
+
System SHOULD:
|
|
149
|
+
- rank candidates
|
|
150
|
+
- expose confidence
|
|
151
|
+
- preserve alternates for fallback reasoning
|
|
152
|
+
|
|
153
|
+
Low-confidence targets may trigger:
|
|
154
|
+
- guarded execution
|
|
155
|
+
- alternate resolution attempt
|
|
156
|
+
- explicit recovery path
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## 8. Adjustable Control Resolution
|
|
161
|
+
|
|
162
|
+
Special handling for:
|
|
163
|
+
- sliders
|
|
164
|
+
- steppers
|
|
165
|
+
- drag controls
|
|
166
|
+
|
|
167
|
+
Support:
|
|
168
|
+
- adjustable-role recognition
|
|
169
|
+
- control-bound discovery
|
|
170
|
+
- value-aware interaction targeting
|
|
171
|
+
|
|
172
|
+
This RFC defines target resolution.
|
|
173
|
+
Value-setting behavior remains governed by Adjustable Control Support.
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## 9. Compose / Custom Control Resolution
|
|
178
|
+
|
|
179
|
+
Support derived actionability for:
|
|
180
|
+
- merged Compose semantics
|
|
181
|
+
- composite controls
|
|
182
|
+
- inferred interaction contracts
|
|
183
|
+
|
|
184
|
+
This RFC depends on and strengthens Better Compose / Custom Control Semantics.
|
|
185
|
+
|
|
186
|
+
---
|
|
187
|
+
|
|
188
|
+
## 10. Resolution Output Model (Current + Future Extension)
|
|
189
|
+
|
|
190
|
+
This model is non-normative and represents a progressive enrichment direction rather than a required runtime contract.
|
|
191
|
+
|
|
192
|
+
Resolution may evolve toward the following enriched output shape. Current runtime implementations may expose only resolved-target output plus limited supporting metadata.
|
|
193
|
+
|
|
194
|
+
At minimum, current implementations are expected to produce a resolved target. Confidence, alternates, fallback metadata, and reason codes may be introduced incrementally.
|
|
195
|
+
|
|
196
|
+
Illustrative future-complete shape:
|
|
197
|
+
|
|
198
|
+
{
|
|
199
|
+
"resolved_target": "...",
|
|
200
|
+
"confidence": 0.92,
|
|
201
|
+
"fallback_available": true,
|
|
202
|
+
"resolution_reason": "clickable_parent_preferred"
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
---
|
|
206
|
+
|
|
207
|
+
## 11. Verification Integration
|
|
208
|
+
|
|
209
|
+
Resolution is incomplete without verification expectations.
|
|
210
|
+
|
|
211
|
+
Resolved output should be derived directly from the existing element-resolution flow before adding richer metadata layers.
|
|
212
|
+
|
|
213
|
+
Resolved target should carry expected post-action signal.
|
|
214
|
+
|
|
215
|
+
Examples:
|
|
216
|
+
- navigation transition expected
|
|
217
|
+
- menu expected
|
|
218
|
+
- control value change expected
|
|
219
|
+
|
|
220
|
+
This feeds RFC 005 verification.
|
|
221
|
+
|
|
222
|
+
---
|
|
223
|
+
|
|
224
|
+
## 12. Success Metrics
|
|
225
|
+
|
|
226
|
+
Track:
|
|
227
|
+
- reduced false-tap failures
|
|
228
|
+
- lower retarget retries
|
|
229
|
+
- higher first-attempt action success
|
|
230
|
+
- reduced coordinate fallback usage
|
|
231
|
+
- improved custom control interaction success
|
|
232
|
+
|
|
233
|
+
---
|
|
234
|
+
|
|
235
|
+
## 13. Dependencies
|
|
236
|
+
|
|
237
|
+
Depends on:
|
|
238
|
+
- Stronger State Verification
|
|
239
|
+
- Richer Element Identity
|
|
240
|
+
- Wait and Synchronization Reliability
|
|
241
|
+
|
|
242
|
+
Strengthens:
|
|
243
|
+
- Adjustable Control Support
|
|
244
|
+
- Better Compose / Custom Control Semantics
|
|
245
|
+
|
|
246
|
+
---
|
|
247
|
+
|
|
248
|
+
## 14. Relationship to Other RFCs
|
|
249
|
+
|
|
250
|
+
RFC 005
|
|
251
|
+
Defines what Resolved means in lifecycle semantics.
|
|
252
|
+
|
|
253
|
+
RFC 006
|
|
254
|
+
Defines how runtime interprets action execution.
|
|
255
|
+
|
|
256
|
+
RFC 007
|
|
257
|
+
Defines how a target becomes Resolved.
|
|
258
|
+
Specifically, it formalizes the current discovery → actionable ancestor resolution → dispatch preparation flow already present in runtime handlers.
|
|
259
|
+
|
|
260
|
+
Together:
|
|
261
|
+
- RFC 005 — action correctness
|
|
262
|
+
- RFC 006 — runtime execution binding
|
|
263
|
+
- RFC 007 — executable target resolution
|
|
264
|
+
|
|
265
|
+
---
|
|
266
|
+
|
|
267
|
+
## 15. Summary
|
|
268
|
+
|
|
269
|
+
This RFC reduces failures caused by acting on the wrong thing, even when the right thing was discovered.
|
|
270
|
+
|
|
271
|
+
It improves:
|
|
272
|
+
- action precision
|
|
273
|
+
- control reliability
|
|
274
|
+
- Compose interaction robustness
|
|
275
|
+
- agent success with fewer retries
|
|
276
|
+
|
|
277
|
+
It addresses one of the largest remaining sources of interaction brittleness.
|
package/docs/tools/interact.md
CHANGED
|
@@ -199,7 +199,14 @@ Output:
|
|
|
199
199
|
"telemetry": { "matchedIndex": 3, "matchedInteractable": true }
|
|
200
200
|
},
|
|
201
201
|
"score": 1.0,
|
|
202
|
-
"confidence": 1.0
|
|
202
|
+
"confidence": 1.0,
|
|
203
|
+
"resolution": {
|
|
204
|
+
"confidence": 1.0,
|
|
205
|
+
"reason": "exact_text_match",
|
|
206
|
+
"fallback_available": false,
|
|
207
|
+
"matched_count": 1,
|
|
208
|
+
"alternates": []
|
|
209
|
+
}
|
|
203
210
|
}
|
|
204
211
|
```
|
|
205
212
|
|
|
@@ -207,6 +214,7 @@ Notes:
|
|
|
207
214
|
|
|
208
215
|
- Best used when no precise selector is available yet.
|
|
209
216
|
- `tapCoordinates` are suitable for `tap` calls.
|
|
217
|
+
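- `resolution` explains why the element was selected (`reason`, `confidence`, `matched_count`) and lists up to three other matching candidates in `alternates`; these serve as fallbacks, for example when the runtime had to promote a parent or nearby control.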
|
|
210
218
|
- Prefer `wait_for_ui` when you already know a deterministic selector and want a stable `elementId`.
|
|
211
219
|
|
|
212
220
|
---
|
package/package.json
CHANGED
package/src/interact/index.ts
CHANGED
|
@@ -10,6 +10,7 @@ import { buildActionExecutionResult } from '../server/common.js'
|
|
|
10
10
|
import type {
|
|
11
11
|
ActionFailureCode,
|
|
12
12
|
ActionTargetResolved,
|
|
13
|
+
FindElementResponse,
|
|
13
14
|
ExpectElementVisibleResponse,
|
|
14
15
|
ExpectStateResponse,
|
|
15
16
|
ExpectScreenResponse,
|
|
@@ -68,6 +69,32 @@ interface UiChangeSignatureSet {
|
|
|
68
69
|
state: string | null
|
|
69
70
|
}
|
|
70
71
|
|
|
72
|
+
interface RankedResolutionCandidate {
|
|
73
|
+
el: UiElement
|
|
74
|
+
idx: number
|
|
75
|
+
score: number
|
|
76
|
+
reason: string
|
|
77
|
+
interactable: boolean
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
interface FindElementResolutionSummary {
|
|
81
|
+
confidence: number
|
|
82
|
+
reason: string
|
|
83
|
+
fallback_available: boolean
|
|
84
|
+
matched_count: number
|
|
85
|
+
alternates: Array<{
|
|
86
|
+
text: string | null
|
|
87
|
+
resource_id: string | null
|
|
88
|
+
accessibility_id: string | null
|
|
89
|
+
class: string | null
|
|
90
|
+
bounds: { left: number; top: number; right: number; bottom: number } | null
|
|
91
|
+
clickable: boolean
|
|
92
|
+
enabled: boolean
|
|
93
|
+
score: number
|
|
94
|
+
reason: string
|
|
95
|
+
}>
|
|
96
|
+
}
|
|
97
|
+
|
|
71
98
|
|
|
72
99
|
export class ToolsInteract {
|
|
73
100
|
private static readonly _maxResolvedUiElements = 256
|
|
@@ -290,6 +317,23 @@ export class ToolsInteract {
|
|
|
290
317
|
}
|
|
291
318
|
}
|
|
292
319
|
|
|
320
|
+
private static _summarizeResolutionCandidate(candidate: RankedResolutionCandidate): FindElementResolutionSummary['alternates'][number] {
|
|
321
|
+
const bounds = ToolsInteract._normalizeBounds(candidate.el.bounds)
|
|
322
|
+
return {
|
|
323
|
+
text: candidate.el.text ?? null,
|
|
324
|
+
resource_id: candidate.el.resourceId ?? candidate.el.resourceID ?? candidate.el.id ?? null,
|
|
325
|
+
accessibility_id: candidate.el.contentDescription ?? candidate.el.contentDesc ?? candidate.el.accessibilityLabel ?? candidate.el.label ?? null,
|
|
326
|
+
class: candidate.el.type ?? candidate.el.class ?? null,
|
|
327
|
+
bounds: bounds
|
|
328
|
+
? { left: bounds[0], top: bounds[1], right: bounds[2], bottom: bounds[3] }
|
|
329
|
+
: null,
|
|
330
|
+
clickable: !!candidate.el.clickable,
|
|
331
|
+
enabled: !!candidate.el.enabled,
|
|
332
|
+
score: candidate.score,
|
|
333
|
+
reason: candidate.reason
|
|
334
|
+
}
|
|
335
|
+
}
|
|
336
|
+
|
|
293
337
|
private static _actionFailure(
|
|
294
338
|
actionType: string,
|
|
295
339
|
selector: Record<string, unknown> | null,
|
|
@@ -546,7 +590,7 @@ export class ToolsInteract {
|
|
|
546
590
|
return await interact.scrollToElement(selector, direction, maxScrolls, scrollAmount, resolved.id)
|
|
547
591
|
}
|
|
548
592
|
|
|
549
|
-
static async findElementHandler({ query, exact = false, timeoutMs = 3000, platform, deviceId }: { query: string, exact?: boolean, timeoutMs?: number, platform?: 'android' | 'ios', deviceId?: string }) {
|
|
593
|
+
static async findElementHandler({ query, exact = false, timeoutMs = 3000, platform, deviceId }: { query: string, exact?: boolean, timeoutMs?: number, platform?: 'android' | 'ios', deviceId?: string }): Promise<FindElementResponse> {
|
|
550
594
|
// Try to use observe layer to fetch the current UI tree and perform a fast semantic search
|
|
551
595
|
const start = Date.now()
|
|
552
596
|
const deadline = start + timeoutMs
|
|
@@ -555,16 +599,17 @@ export class ToolsInteract {
|
|
|
555
599
|
const q = normalize(query)
|
|
556
600
|
if (!q) return { found: false, error: 'Empty query' }
|
|
557
601
|
|
|
558
|
-
let best:
|
|
559
|
-
let
|
|
560
|
-
let
|
|
602
|
+
let best: RankedResolutionCandidate | null = null
|
|
603
|
+
let bestTree: any = null
|
|
604
|
+
let bestIterationCandidates: RankedResolutionCandidate[] = []
|
|
605
|
+
let shouldStop = false
|
|
561
606
|
|
|
562
|
-
const scoreElement = (el: UiElement | null) => {
|
|
563
|
-
if (!el || !el.visible) return
|
|
607
|
+
const scoreElement = (el: UiElement | null, idx: number): RankedResolutionCandidate | null => {
|
|
608
|
+
if (!el || !el.visible) return null
|
|
564
609
|
const bounds = el.bounds || [0,0,0,0]
|
|
565
|
-
if (!Array.isArray(bounds) || bounds.length < 4) return
|
|
610
|
+
if (!Array.isArray(bounds) || bounds.length < 4) return null
|
|
566
611
|
const [l,t,r,b] = bounds
|
|
567
|
-
if (r <= l || b <= t) return
|
|
612
|
+
if (r <= l || b <= t) return null
|
|
568
613
|
// Do not early-return on non-interactable elements — score them so we can locate their clickable ancestor later
|
|
569
614
|
const interactable = !!(el.clickable || el.enabled || el.focusable)
|
|
570
615
|
|
|
@@ -574,44 +619,80 @@ export class ToolsInteract {
|
|
|
574
619
|
const className = normalize(el.type ?? el.class ?? '')
|
|
575
620
|
|
|
576
621
|
let score = 0
|
|
622
|
+
let reason = 'best_scoring_candidate'
|
|
577
623
|
if (exact) {
|
|
578
|
-
if (text && text === q)
|
|
579
|
-
|
|
624
|
+
if (text && text === q) {
|
|
625
|
+
score = 1.0
|
|
626
|
+
reason = 'exact_text_match'
|
|
627
|
+
} else if (content && content === q) {
|
|
628
|
+
score = 0.95
|
|
629
|
+
reason = 'exact_content_desc_match'
|
|
630
|
+
} else if (resourceId && resourceId === q) {
|
|
631
|
+
score = 0.92
|
|
632
|
+
reason = 'exact_resource_id_match'
|
|
633
|
+
} else if (className && className === q) {
|
|
634
|
+
score = 0.3
|
|
635
|
+
reason = 'exact_class_match'
|
|
636
|
+
}
|
|
580
637
|
} else {
|
|
581
|
-
if (text && text === q)
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
else if (content && content
|
|
585
|
-
|
|
586
|
-
|
|
638
|
+
if (text && text === q) {
|
|
639
|
+
score = 1.0
|
|
640
|
+
reason = 'exact_text_match'
|
|
641
|
+
} else if (content && content === q) {
|
|
642
|
+
score = 0.95
|
|
643
|
+
reason = 'exact_content_desc_match'
|
|
644
|
+
} else if (resourceId && resourceId === q) {
|
|
645
|
+
score = 0.92
|
|
646
|
+
reason = 'exact_resource_id_match'
|
|
647
|
+
} else if (text && text.includes(q)) {
|
|
648
|
+
score = 0.6
|
|
649
|
+
reason = 'partial_text_match'
|
|
650
|
+
} else if (content && content.includes(q)) {
|
|
651
|
+
score = 0.55
|
|
652
|
+
reason = 'partial_content_desc_match'
|
|
653
|
+
} else if (resourceId && resourceId.includes(q)) {
|
|
654
|
+
score = 0.7
|
|
655
|
+
reason = 'partial_resource_id_match'
|
|
656
|
+
} else if (className && className.includes(q)) {
|
|
657
|
+
score = 0.3
|
|
658
|
+
reason = 'partial_class_match'
|
|
659
|
+
}
|
|
587
660
|
}
|
|
588
661
|
if (score > 0 && interactable) score += 0.05
|
|
589
|
-
return
|
|
662
|
+
if (score <= 0) return null
|
|
663
|
+
return { el, idx, score, reason, interactable }
|
|
590
664
|
}
|
|
591
665
|
|
|
592
666
|
while (Date.now() <= deadline) {
|
|
593
667
|
try {
|
|
594
|
-
|
|
595
|
-
lastTree = tree
|
|
668
|
+
const tree = await ToolsObserve.getUITreeHandler({ platform, deviceId })
|
|
596
669
|
if (tree && Array.isArray((tree as any).elements)) {
|
|
597
670
|
const elements = ((tree as any).elements as UiElement[])
|
|
671
|
+
const iterationCandidates: RankedResolutionCandidate[] = []
|
|
672
|
+
let iterationImprovedBest = false
|
|
598
673
|
for (let i = 0; i < elements.length; i++) {
|
|
599
674
|
const el = elements[i]
|
|
600
675
|
try {
|
|
601
|
-
const
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
best =
|
|
606
|
-
|
|
676
|
+
const candidate = scoreElement(el, i)
|
|
677
|
+
if (!candidate) continue
|
|
678
|
+
iterationCandidates.push(candidate)
|
|
679
|
+
if (!best || candidate.score > best.score) {
|
|
680
|
+
best = candidate
|
|
681
|
+
bestTree = tree
|
|
682
|
+
iterationImprovedBest = true
|
|
683
|
+
if (best.score >= 0.95) {
|
|
684
|
+
shouldStop = true
|
|
685
|
+
break
|
|
686
|
+
}
|
|
607
687
|
}
|
|
608
|
-
if (bestScore >= 0.95) break
|
|
609
688
|
} catch (e) { console.error('Error scoring element:', e) }
|
|
610
689
|
}
|
|
611
|
-
if (
|
|
690
|
+
if (iterationImprovedBest) {
|
|
691
|
+
bestIterationCandidates = iterationCandidates.slice()
|
|
692
|
+
}
|
|
612
693
|
}
|
|
613
694
|
} catch (e) { console.error('Error fetching UI tree:', e) }
|
|
614
|
-
if (Date.now() > deadline) break
|
|
695
|
+
if (shouldStop || Date.now() > deadline) break
|
|
615
696
|
await new Promise(r => setTimeout(r, 100))
|
|
616
697
|
}
|
|
617
698
|
|
|
@@ -619,31 +700,32 @@ export class ToolsInteract {
|
|
|
619
700
|
|
|
620
701
|
// If the best match is not interactable, try to resolve an actionable ancestor.
|
|
621
702
|
try {
|
|
622
|
-
const elements = (
|
|
623
|
-
const screen =
|
|
624
|
-
let chosen = best as
|
|
625
|
-
const childBounds = Array.isArray(chosen?.bounds) ? chosen.bounds : null
|
|
703
|
+
const elements = (bestTree && Array.isArray(bestTree.elements)) ? (bestTree.elements as UiElement[]) : []
|
|
704
|
+
const screen = bestTree?.resolution && typeof bestTree.resolution === 'object' ? bestTree.resolution as UiResolution : null
|
|
705
|
+
let chosen = best as { el: UiElement, idx: number }
|
|
706
|
+
const childBounds = Array.isArray(chosen?.el?.bounds) ? chosen.el.bounds : null
|
|
626
707
|
|
|
627
708
|
// Strategy 1: if parentId references an index, climb that chain
|
|
628
|
-
let resolvedAncestor:
|
|
629
|
-
if (childBounds && (chosen.parentId !== undefined && chosen.parentId !== null)) {
|
|
709
|
+
let resolvedAncestor: { el: UiElement, idx: number } | null = null
|
|
710
|
+
if (childBounds && (chosen.el.parentId !== undefined && chosen.el.parentId !== null)) {
|
|
630
711
|
let cur = chosen
|
|
631
712
|
let safety = 0
|
|
632
|
-
while (cur && safety < 20 && !(cur.clickable || cur.focusable) && (cur.parentId !== undefined && cur.parentId !== null)) {
|
|
633
|
-
let pid = cur.parentId
|
|
713
|
+
while (cur && safety < 20 && !(cur.el.clickable || cur.el.focusable) && (cur.el.parentId !== undefined && cur.el.parentId !== null)) {
|
|
714
|
+
let pid = cur.el.parentId
|
|
634
715
|
let idx: number | null = null
|
|
635
716
|
if (typeof pid === 'number') idx = pid
|
|
636
717
|
else if (typeof pid === 'string' && /^\d+$/.test(pid)) idx = Number(pid)
|
|
637
718
|
// If parentId is not an index, try to find by matching resourceId or id field
|
|
638
719
|
if (idx !== null && elements[idx]) {
|
|
639
|
-
cur = elements[idx]
|
|
640
|
-
if (cur && (cur.clickable || cur.enabled || cur.focusable)) { resolvedAncestor = cur; break }
|
|
720
|
+
cur = { el: elements[idx], idx }
|
|
721
|
+
if (cur && (cur.el.clickable || cur.el.enabled || cur.el.focusable)) { resolvedAncestor = cur; break }
|
|
641
722
|
} else if (typeof pid === 'string') {
|
|
642
723
|
// fallback: search elements for matching resourceId or id
|
|
643
|
-
const
|
|
724
|
+
const foundIndex = elements.findIndex((el: UiElement)=> (el.resourceId === pid || el.id === pid))
|
|
725
|
+
const found = foundIndex >= 0 ? elements[foundIndex] : null
|
|
644
726
|
if (found) {
|
|
645
|
-
cur = found
|
|
646
|
-
if (cur && (cur.clickable || cur.enabled || cur.focusable)) { resolvedAncestor = cur; break }
|
|
727
|
+
cur = { el: found, idx: foundIndex }
|
|
728
|
+
if (cur && (cur.el.clickable || cur.el.enabled || cur.el.focusable)) { resolvedAncestor = cur; break }
|
|
647
729
|
// otherwise continue climbing if this found element has its own parentId
|
|
648
730
|
} else {
|
|
649
731
|
break
|
|
@@ -659,62 +741,77 @@ export class ToolsInteract {
|
|
|
659
741
|
if (!resolvedAncestor && childBounds) {
|
|
660
742
|
const [cl,ct,cr,cb] = childBounds
|
|
661
743
|
// find candidates that are clickable and contain the child bounds
|
|
662
|
-
const candidates = elements
|
|
663
|
-
|
|
744
|
+
const candidates = elements
|
|
745
|
+
.map((el: UiElement, idx: number) => ({ el, idx }))
|
|
746
|
+
.filter(({ el }) => el && (el.clickable || el.focusable) && Array.isArray(el.bounds) && el.bounds!.length >= 4)
|
|
747
|
+
let bestCandidate: { el: UiElement, idx: number } | null = null
|
|
664
748
|
let bestCandidateArea = Infinity
|
|
665
749
|
for (const c of candidates) {
|
|
666
|
-
const
|
|
750
|
+
const bounds = c.el.bounds as number[]
|
|
751
|
+
const [pl,pt,pr,pb] = bounds
|
|
667
752
|
if (pl <= cl && pt <= ct && pr >= cr && pb >= cb) {
|
|
668
753
|
const area = (pr-pl) * (pb-pt)
|
|
669
|
-
if (area < bestCandidateArea) { bestCandidateArea = area; bestCandidate = c
|
|
754
|
+
if (area < bestCandidateArea) { bestCandidateArea = area; bestCandidate = c }
|
|
670
755
|
}
|
|
671
756
|
}
|
|
672
757
|
if (bestCandidate) resolvedAncestor = bestCandidate
|
|
673
758
|
}
|
|
674
759
|
|
|
675
760
|
if (resolvedAncestor) {
|
|
676
|
-
best =
|
|
677
|
-
|
|
678
|
-
|
|
761
|
+
best = {
|
|
762
|
+
el: resolvedAncestor.el,
|
|
763
|
+
idx: resolvedAncestor.idx,
|
|
764
|
+
score: Math.min(1, best.score + 0.02),
|
|
765
|
+
reason: 'clickable_parent_preferred',
|
|
766
|
+
interactable: true
|
|
767
|
+
}
|
|
679
768
|
}
|
|
680
769
|
|
|
681
|
-
if (best && !(best.clickable || best.focusable)) {
|
|
682
|
-
const nearbyActionable = ToolsInteract._resolveNearbyActionableControl(elements, { el: best, idx: best.
|
|
770
|
+
if (best && !(best.el.clickable || best.el.focusable)) {
|
|
771
|
+
const nearbyActionable = ToolsInteract._resolveNearbyActionableControl(elements, { el: best.el, idx: best.idx }, screen)
|
|
683
772
|
if (nearbyActionable) {
|
|
684
|
-
best =
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
773
|
+
best = {
|
|
774
|
+
el: nearbyActionable.el,
|
|
775
|
+
idx: nearbyActionable.idx,
|
|
776
|
+
score: Math.min(1, best.score + 0.02),
|
|
777
|
+
reason: nearbyActionable.sliderLike ? 'slider_track_preferred' : 'nearby_actionable_control',
|
|
778
|
+
interactable: true
|
|
779
|
+
}
|
|
688
780
|
}
|
|
689
781
|
}
|
|
690
782
|
} catch (e) { console.error('Error resolving ancestor:', e) }
|
|
691
783
|
|
|
692
784
|
if (!best) return { found: false, error: 'Element not found' }
|
|
693
785
|
|
|
694
|
-
const boundsObj = Array.isArray(best.bounds) ? { left: best.bounds[0], top: best.bounds[1], right: best.bounds[2], bottom: best.bounds[3] } : null
|
|
786
|
+
const boundsObj = Array.isArray(best.el.bounds) ? { left: best.el.bounds[0], top: best.el.bounds[1], right: best.el.bounds[2], bottom: best.el.bounds[3] } : null
|
|
695
787
|
const tapCoordinates = boundsObj ? { x: Math.floor((boundsObj.left + boundsObj.right) / 2), y: Math.floor((boundsObj.top + boundsObj.bottom) / 2) } : null
|
|
788
|
+
const uniqueRanked = bestIterationCandidates.filter((candidate, index, array) => index === array.findIndex((other) => other.idx === candidate.idx && other.el === candidate.el))
|
|
789
|
+
const alternateCandidates = uniqueRanked
|
|
790
|
+
.filter((candidate) => candidate.idx !== best.idx || candidate.el !== best.el)
|
|
791
|
+
.slice(0, 3)
|
|
792
|
+
.map((candidate) => ToolsInteract._summarizeResolutionCandidate(candidate))
|
|
696
793
|
|
|
697
794
|
const outEl = {
|
|
698
|
-
text: best.text ?? null,
|
|
699
|
-
resourceId: best.resourceId ?? null,
|
|
700
|
-
contentDesc: best.contentDescription ?? best.contentDesc ?? null,
|
|
701
|
-
class: best.type ?? best.class ?? null,
|
|
795
|
+
text: best.el.text ?? null,
|
|
796
|
+
resourceId: best.el.resourceId ?? null,
|
|
797
|
+
contentDesc: best.el.contentDescription ?? best.el.contentDesc ?? null,
|
|
798
|
+
class: best.el.type ?? best.el.class ?? null,
|
|
702
799
|
bounds: boundsObj,
|
|
703
|
-
clickable: !!best.clickable,
|
|
704
|
-
enabled: !!best.enabled,
|
|
705
|
-
stable_id: best.stable_id ?? null,
|
|
706
|
-
role: best.role ?? null,
|
|
707
|
-
test_tag: best.test_tag ?? null,
|
|
708
|
-
selector: best.selector ?? null,
|
|
709
|
-
semantic: best.semantic ?? null,
|
|
800
|
+
clickable: !!best.el.clickable,
|
|
801
|
+
enabled: !!best.el.enabled,
|
|
802
|
+
stable_id: best.el.stable_id ?? null,
|
|
803
|
+
role: best.el.role ?? null,
|
|
804
|
+
test_tag: best.el.test_tag ?? null,
|
|
805
|
+
selector: best.el.selector ?? null,
|
|
806
|
+
semantic: best.el.semantic ?? null,
|
|
710
807
|
tapCoordinates,
|
|
711
808
|
telemetry: {
|
|
712
|
-
matchedIndex: best
|
|
713
|
-
matchedInteractable: !!best
|
|
714
|
-
sliderLike:
|
|
809
|
+
matchedIndex: best.idx ?? null,
|
|
810
|
+
matchedInteractable: !!best.interactable,
|
|
811
|
+
sliderLike: best.reason === 'slider_track_preferred'
|
|
715
812
|
}
|
|
716
813
|
}
|
|
717
|
-
if (best
|
|
814
|
+
if (best.reason === 'slider_track_preferred') {
|
|
718
815
|
const isVertical = !!boundsObj && (boundsObj.bottom - boundsObj.top) > (boundsObj.right - boundsObj.left)
|
|
719
816
|
const interactionHint = {
|
|
720
817
|
kind: 'slider',
|
|
@@ -723,8 +820,15 @@ export class ToolsInteract {
|
|
|
723
820
|
}
|
|
724
821
|
;(outEl as any).interactionHint = interactionHint
|
|
725
822
|
}
|
|
726
|
-
const scoreVal = Math.min(1, Number(
|
|
727
|
-
|
|
823
|
+
const scoreVal = Math.min(1, Number(best.score.toFixed(3)))
|
|
824
|
+
const resolution: FindElementResolutionSummary = {
|
|
825
|
+
confidence: scoreVal,
|
|
826
|
+
reason: best.reason,
|
|
827
|
+
fallback_available: alternateCandidates.length > 0,
|
|
828
|
+
matched_count: uniqueRanked.length,
|
|
829
|
+
alternates: alternateCandidates
|
|
830
|
+
}
|
|
831
|
+
return { found: true, element: outEl, score: scoreVal, confidence: scoreVal, resolution }
|
|
728
832
|
}
|
|
729
833
|
|
|
730
834
|
static async waitForUIHandler({ selector, condition = 'exists', timeout_ms = 60000, poll_interval_ms = 300, match, retry = { max_attempts: 1, backoff_ms: 0 }, platform, deviceId }: { selector?: { text?: string, resource_id?: string, accessibility_id?: string, contains?: boolean }, condition?: 'exists'|'not_exists'|'visible'|'clickable', timeout_ms?: number, poll_interval_ms?: number, match?: { index?: number }, retry?: { max_attempts?: number, backoff_ms?: number }, platform?: 'android'|'ios', deviceId?: string }) {
|
|
@@ -596,7 +596,9 @@ Recommended Usage:
|
|
|
596
596
|
},
|
|
597
597
|
{
|
|
598
598
|
name: 'find_element',
|
|
599
|
-
description:
|
|
599
|
+
description: `Find a UI element by semantic query (text, content-desc, resource-id, class).
|
|
600
|
+
|
|
601
|
+
Returns the best match plus resolution metadata when available, including confidence, selection reason, and fallback alternates.`,
|
|
600
602
|
inputSchema: {
|
|
601
603
|
type: 'object',
|
|
602
604
|
properties: {
|
package/src/server-core.ts
CHANGED
package/src/types.ts
CHANGED
|
@@ -254,6 +254,79 @@ export interface ActionTargetResolved {
|
|
|
254
254
|
semantic?: UIElementSemanticMetadata | null;
|
|
255
255
|
}
|
|
256
256
|
|
|
257
|
+
export interface ResolutionAlternate {
|
|
258
|
+
text: string | null;
|
|
259
|
+
resource_id: string | null;
|
|
260
|
+
accessibility_id: string | null;
|
|
261
|
+
class: string | null;
|
|
262
|
+
bounds: {
|
|
263
|
+
left: number;
|
|
264
|
+
top: number;
|
|
265
|
+
right: number;
|
|
266
|
+
bottom: number;
|
|
267
|
+
} | null;
|
|
268
|
+
clickable: boolean;
|
|
269
|
+
enabled: boolean;
|
|
270
|
+
score: number;
|
|
271
|
+
reason: string;
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
export interface ResolutionSummary {
|
|
275
|
+
confidence: number;
|
|
276
|
+
reason: string;
|
|
277
|
+
fallback_available: boolean;
|
|
278
|
+
matched_count: number;
|
|
279
|
+
alternates: ResolutionAlternate[];
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
export interface FindElementElement {
|
|
283
|
+
text: string | null;
|
|
284
|
+
resourceId: string | null;
|
|
285
|
+
contentDesc: string | null;
|
|
286
|
+
class: string | null;
|
|
287
|
+
bounds: {
|
|
288
|
+
left: number;
|
|
289
|
+
top: number;
|
|
290
|
+
right: number;
|
|
291
|
+
bottom: number;
|
|
292
|
+
} | null;
|
|
293
|
+
clickable: boolean;
|
|
294
|
+
enabled: boolean;
|
|
295
|
+
stable_id?: string | null;
|
|
296
|
+
role?: string | null;
|
|
297
|
+
test_tag?: string | null;
|
|
298
|
+
selector?: UIResolutionSelector | null;
|
|
299
|
+
semantic?: UIElementSemanticMetadata | null;
|
|
300
|
+
tapCoordinates: {
|
|
301
|
+
x: number;
|
|
302
|
+
y: number;
|
|
303
|
+
} | null;
|
|
304
|
+
telemetry: {
|
|
305
|
+
matchedIndex: number | null;
|
|
306
|
+
matchedInteractable: boolean;
|
|
307
|
+
sliderLike: boolean;
|
|
308
|
+
};
|
|
309
|
+
interactionHint?: {
|
|
310
|
+
kind: 'slider';
|
|
311
|
+
axis: 'horizontal' | 'vertical';
|
|
312
|
+
trackBounds: {
|
|
313
|
+
left: number;
|
|
314
|
+
top: number;
|
|
315
|
+
right: number;
|
|
316
|
+
bottom: number;
|
|
317
|
+
} | null;
|
|
318
|
+
};
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
export interface FindElementResponse {
|
|
322
|
+
found: boolean;
|
|
323
|
+
element?: FindElementElement | null;
|
|
324
|
+
score?: number;
|
|
325
|
+
confidence?: number;
|
|
326
|
+
resolution?: ResolutionSummary | null;
|
|
327
|
+
error?: string;
|
|
328
|
+
}
|
|
329
|
+
|
|
257
330
|
export interface ActionExecutionResult {
|
|
258
331
|
action_id: string;
|
|
259
332
|
timestamp: string;
|
|
@@ -73,6 +73,9 @@ async function run() {
|
|
|
73
73
|
process.stdout.write('res4 ' + JSON.stringify(res4, null, 2) + '\n');
|
|
74
74
|
const pass4 = res4.found === true && res4.element && res4.element.clickable === true && res4.element.resourceId === 'btn_generate' && res4.element.tapCoordinates && typeof res4.element.tapCoordinates.x === 'number' && typeof res4.element.tapCoordinates.y === 'number' && typeof res4.confidence === 'number'
|
|
75
75
|
assert.ok(pass4, 'Child text should resolve to a clickable parent ancestor')
|
|
76
|
+
assert.strictEqual(res4.resolution?.reason, 'clickable_parent_preferred')
|
|
77
|
+
assert.strictEqual(res4.resolution?.fallback_available, true)
|
|
78
|
+
assert.ok((res4.resolution?.alternates || []).length >= 1, 'Parent promotion should preserve alternates')
|
|
76
79
|
process.stdout.write('Test 4: ' + (pass4 ? 'PASS' : 'FAIL') + '\n');
|
|
77
80
|
|
|
78
81
|
// Test 5: duration label should resolve to the nearby slider control
|
|
@@ -111,6 +114,8 @@ async function run() {
|
|
|
111
114
|
process.stdout.write('Test 6: ' + (pass6 ? 'PASS' : 'FAIL') + '\n');
|
|
112
115
|
const pass6b = res6.element && res6.element.telemetry && res6.element.telemetry.sliderLike === true && res6.element.interactionHint && res6.element.interactionHint.kind === 'slider'
|
|
113
116
|
assert.ok(pass6b, 'Duration lookup should include slider-specific telemetry')
|
|
117
|
+
assert.strictEqual(res6.resolution?.reason, 'slider_track_preferred')
|
|
118
|
+
assert.strictEqual(res6.resolution?.fallback_available, true)
|
|
114
119
|
process.stdout.write('Test 6b: ' + (pass6b ? 'PASS' : 'FAIL') + '\n');
|
|
115
120
|
|
|
116
121
|
// Test 7: prefer vertical track-like control over a closer text button
|