mobile-debug-mcp 0.26.0 → 0.26.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/interact/classify.js +48 -11
- package/dist/server/tool-definitions.js +19 -12
- package/dist/server/tool-handlers.js +2 -0
- package/dist/server-core.js +1 -1
- package/docs/CHANGELOG.md +3 -0
- package/docs/ROADMAP.md +66 -38
- package/docs/rfcs/004-action-verification-routing.md +342 -0
- package/docs/specs/mcp-tooling-spec-v1.md +3 -3
- package/docs/tools/interact.md +10 -8
- package/package.json +1 -1
- package/src/interact/classify.ts +53 -13
- package/src/server/tool-definitions.ts +19 -12
- package/src/server/tool-handlers.ts +2 -0
- package/src/server-core.ts +1 -1
- package/test/unit/interact/classify_action_outcome.test.ts +44 -25
- package/test/unit/server/contract.test.ts +8 -6
|
@@ -1,35 +1,72 @@
|
|
|
1
|
+
/**
 * Static mapping from a runtime action type to its verification category.
 *
 * - 'local_state': the action is routed to state-based verification.
 * - 'side_effect': the action may be routed to network/log inspection.
 *
 * Action types that are not listed here are treated as 'side_effect'
 * by inferActionCategory.
 */
const ACTION_CATEGORY_BY_TYPE = {
    tap: 'local_state',
    tap_element: 'local_state',
    swipe: 'local_state',
    scroll_to_element: 'local_state',
    type_text: 'local_state',
    press_back: 'local_state',
    start_app: 'side_effect',
    restart_app: 'side_effect',
    terminate_app: 'side_effect',
    reset_app_data: 'side_effect',
    install_app: 'side_effect',
    build_app: 'side_effect',
    build_and_install: 'side_effect'
};

/**
 * Resolve the verification category for an action type.
 *
 * The input is trimmed and lower-cased before lookup, so ' Tap ' and 'tap'
 * resolve identically.
 *
 * @param {unknown} actionType - Runtime action type (for example 'tap' or 'start_app').
 * @returns {'local_state' | 'side_effect' | null} The mapped category,
 *   'side_effect' for any non-empty string that is not in the table, or null
 *   when actionType is not a string or is empty/whitespace-only.
 */
function inferActionCategory(actionType) {
    if (typeof actionType !== 'string') {
        return null;
    }
    const normalized = actionType.trim().toLowerCase();
    if (!normalized) {
        return null;
    }
    // Own-property check: a plain `table[key]` read also resolves inherited
    // names such as 'constructor' or '__proto__' to Object.prototype members,
    // which would leak a non-string value instead of the 'side_effect' default.
    if (Object.prototype.hasOwnProperty.call(ACTION_CATEGORY_BY_TYPE, normalized)) {
        return ACTION_CATEGORY_BY_TYPE[normalized];
    }
    return 'side_effect';
}
|
|
1
24
|
/**
|
|
2
25
|
* Pure deterministic classifier. Applies rules in fixed order.
|
|
3
26
|
* Same inputs always produce the same output.
|
|
4
27
|
*/
|
|
5
28
|
export function classifyActionOutcome(input) {
|
|
6
|
-
const { uiChanged, expectedElementVisible, networkRequests, hasLogErrors } = input;
|
|
29
|
+
const { uiChanged, expectedElementVisible, actionType, networkRequests, hasLogErrors } = input;
|
|
30
|
+
const actionCategory = inferActionCategory(actionType);
|
|
7
31
|
// Step 1 — UI signal is positive
|
|
8
32
|
if (uiChanged || expectedElementVisible === true) {
|
|
9
33
|
return { outcome: 'success', reasoning: expectedElementVisible === true ? 'expected element is visible' : 'UI changed after action' };
|
|
10
34
|
}
|
|
11
|
-
// Step 2 —
|
|
12
|
-
if (
|
|
35
|
+
// Step 2 — no action type means we cannot choose a safe routing path
|
|
36
|
+
if (actionCategory === null) {
|
|
13
37
|
return {
|
|
14
38
|
outcome: 'unknown',
|
|
15
|
-
reasoning: '
|
|
16
|
-
nextAction: 'call_get_network_activity'
|
|
39
|
+
reasoning: 'actionType was not supplied; pass the runtime action_type so the classifier can distinguish local-state and side-effect routing'
|
|
17
40
|
};
|
|
18
41
|
}
|
|
19
|
-
|
|
20
|
-
const failedRequest = networkRequests.find((r) => r.status === 'failure' || r.status === 'retryable');
|
|
42
|
+
const failedRequest = networkRequests?.find((r) => r.status === 'failure' || r.status === 'retryable');
|
|
21
43
|
if (failedRequest) {
|
|
22
44
|
return { outcome: 'backend_failure', reasoning: `network request ${failedRequest.endpoint} returned ${failedRequest.status}` };
|
|
23
45
|
}
|
|
24
|
-
// Step
|
|
46
|
+
// Step 3 — local-state actions should be verified with state-specific signals first
|
|
47
|
+
if (actionCategory === 'local_state') {
|
|
48
|
+
const logNote = hasLogErrors ? ' (log errors present)' : '';
|
|
49
|
+
return {
|
|
50
|
+
outcome: 'no_op',
|
|
51
|
+
reasoning: `local-state action${logNote}; use expect_state, refreshed snapshot comparison, or expect_element_visible instead of defaulting to network inspection`
|
|
52
|
+
};
|
|
53
|
+
}
|
|
54
|
+
// Step 4 — side-effect actions may legitimately need network or log inspection
|
|
55
|
+
if (networkRequests === null || networkRequests === undefined) {
|
|
56
|
+
return {
|
|
57
|
+
outcome: 'unknown',
|
|
58
|
+
reasoning: 'side-effect action without network data; inspect network or log signals only if the outcome is still ambiguous'
|
|
59
|
+
};
|
|
60
|
+
}
|
|
61
|
+
// Step 5 — no network requests at all
|
|
25
62
|
if (networkRequests.length === 0) {
|
|
26
63
|
const logNote = hasLogErrors ? ' (log errors present)' : '';
|
|
27
|
-
return { outcome: 'no_op', reasoning: `
|
|
64
|
+
return { outcome: 'no_op', reasoning: `side-effect action and no network activity${logNote}` };
|
|
28
65
|
}
|
|
29
|
-
// Step
|
|
66
|
+
// Step 6 — network requests exist and all succeeded
|
|
30
67
|
if (networkRequests.every((r) => r.status === 'success')) {
|
|
31
68
|
return { outcome: 'ui_failure', reasoning: 'network requests succeeded but UI did not change' };
|
|
32
69
|
}
|
|
33
|
-
// Step
|
|
70
|
+
// Step 7 — fallback
|
|
34
71
|
return { outcome: 'unknown', reasoning: 'signals are inconclusive' };
|
|
35
72
|
}
|
|
@@ -344,7 +344,7 @@ Capabilities:
|
|
|
344
344
|
Constraints:
|
|
345
345
|
- Does not verify correctness of the resulting state
|
|
346
346
|
- Must not be used alone to confirm action success when an applicable expect_* tool exists
|
|
347
|
-
-
|
|
347
|
+
- For backend/API activity without a visible UI change, pass the runtime action_type into classify_action_outcome and collect network evidence only if the result remains ambiguous
|
|
348
348
|
|
|
349
349
|
Recommended Usage:
|
|
350
350
|
1. Capture or define the expected outcome
|
|
@@ -918,26 +918,29 @@ Failure Handling:
|
|
|
918
918
|
name: 'classify_action_outcome',
|
|
919
919
|
description: `Classify the outcome of the most recent action into exactly one of: success, no_op, backend_failure, ui_failure, unknown.
|
|
920
920
|
|
|
921
|
-
|
|
922
|
-
Use this
|
|
923
|
-
For backend/API activity, compare get_screen_fingerprint before and after the action and call get_network_activity immediately after the action
|
|
921
|
+
Use the runtime action result's \`action_type\` as \`actionType\` so the classifier can distinguish local-state actions from side-effect actions.
|
|
922
|
+
Use this when the intended outcome is not already fully verified by the UI signal alone.
|
|
923
|
+
For backend/API activity, compare get_screen_fingerprint before and after the action and call get_network_activity immediately after the action if the outcome is still ambiguous.
|
|
924
924
|
|
|
925
925
|
HOW TO GATHER INPUTS before calling:
|
|
926
926
|
1. Call wait_for_screen_change or compare get_screen_fingerprint before/after — set uiChanged accordingly.
|
|
927
927
|
2. If you checked for a specific element with wait_for_ui, set expectedElementVisible.
|
|
928
|
-
3.
|
|
928
|
+
3. Pass actionType from the action response when available.
|
|
929
|
+
4. Only provide networkRequests if you already collected them or want to classify a side-effect action with backend evidence.
|
|
929
930
|
|
|
930
931
|
RULES (applied in order — stop at first match):
|
|
931
932
|
1. If uiChanged=true OR expectedElementVisible=true → outcome=success
|
|
932
|
-
2.
|
|
933
|
+
2. If actionType is missing → outcome=unknown
|
|
933
934
|
3. If any request has status=failure or retryable → outcome=backend_failure
|
|
934
|
-
4. If
|
|
935
|
-
5. If
|
|
936
|
-
6.
|
|
935
|
+
4. If actionType maps to a local-state action → outcome=no_op; prefer state-based verification and avoid default network fallback
|
|
936
|
+
5. If actionType maps to a side-effect action and no networkRequests were supplied → outcome=unknown
|
|
937
|
+
6. If no requests returned → outcome=no_op
|
|
938
|
+
7. If all requests succeeded → outcome=ui_failure
|
|
939
|
+
8. Otherwise → outcome=unknown
|
|
937
940
|
|
|
938
941
|
BEHAVIOUR after outcome:
|
|
939
942
|
- success → continue
|
|
940
|
-
- no_op → retry
|
|
943
|
+
- no_op → retry with richer state verification or re-resolve the element
|
|
941
944
|
- backend_failure → stop and report the failing endpoint
|
|
942
945
|
- ui_failure → stop and report failure
|
|
943
946
|
- unknown → take one recovery step (e.g. capture_debug_snapshot), then stop`,
|
|
@@ -952,9 +955,13 @@ BEHAVIOUR after outcome:
|
|
|
952
955
|
type: 'boolean',
|
|
953
956
|
description: 'true if the element you expected to appear is now visible (from wait_for_ui). Omit if you did not check for a specific element.'
|
|
954
957
|
},
|
|
958
|
+
actionType: {
|
|
959
|
+
type: 'string',
|
|
960
|
+
description: 'The runtime action_type from the action response (for example tap, tap_element, swipe, type_text, press_back, start_app).'
|
|
961
|
+
},
|
|
955
962
|
networkRequests: {
|
|
956
963
|
type: 'array',
|
|
957
|
-
description: '
|
|
964
|
+
description: 'Optional network evidence collected after the action. Use it when the expected outcome is backend/API activity or when the UI signal is ambiguous.',
|
|
958
965
|
items: {
|
|
959
966
|
type: 'object',
|
|
960
967
|
properties: {
|
|
@@ -976,7 +983,7 @@ BEHAVIOUR after outcome:
|
|
|
976
983
|
name: 'get_network_activity',
|
|
977
984
|
description: `Returns structured network events captured from platform logs since the last action.
|
|
978
985
|
|
|
979
|
-
Call this
|
|
986
|
+
Call this immediately after an action when you want backend evidence for a side-effect flow, only if the result is still ambiguous.
|
|
980
987
|
Do not call more than once per action.
|
|
981
988
|
|
|
982
989
|
Events are filtered to significant (non-background) requests only.
|
|
@@ -385,11 +385,13 @@ async function handleStopLogStream(args) {
|
|
|
385
385
|
function handleClassifyActionOutcome(args) {
|
|
386
386
|
const uiChanged = requireBooleanArg(args, 'uiChanged');
|
|
387
387
|
const expectedElementVisible = getBooleanArg(args, 'expectedElementVisible');
|
|
388
|
+
const actionType = getStringArg(args, 'actionType');
|
|
388
389
|
const networkRequests = getArrayArg(args, 'networkRequests');
|
|
389
390
|
const hasLogErrors = getBooleanArg(args, 'hasLogErrors');
|
|
390
391
|
const result = classifyActionOutcome({
|
|
391
392
|
uiChanged,
|
|
392
393
|
expectedElementVisible: expectedElementVisible ?? null,
|
|
394
|
+
actionType: actionType ?? null,
|
|
393
395
|
networkRequests: networkRequests ?? null,
|
|
394
396
|
hasLogErrors: hasLogErrors ?? null
|
|
395
397
|
});
|
package/dist/server-core.js
CHANGED
|
@@ -6,7 +6,7 @@ import { handleToolCall } from './server/tool-handlers.js';
|
|
|
6
6
|
export { wrapResponse, toolDefinitions, handleToolCall };
|
|
7
7
|
export const serverInfo = {
|
|
8
8
|
name: 'mobile-debug-mcp',
|
|
9
|
-
version: '0.26.
|
|
9
|
+
version: '0.26.1'
|
|
10
10
|
};
|
|
11
11
|
export function createServer() {
|
|
12
12
|
const server = new Server(serverInfo, {
|
package/docs/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to the **Mobile Debug MCP** project will be documented in this file.
|
|
4
4
|
|
|
5
|
+
## [0.26.1]
|
|
6
|
+
- Fixed overuse of `get_network_activity`
|
|
7
|
+
|
|
5
8
|
## [0.26.0]
|
|
6
9
|
- RFC-003 wait/synchronization contract with `snapshot_revision`, `captured_at_ms`, and `loading_state`
|
|
7
10
|
- Added `wait_for_ui_change` for stable in-place UI mutations
|
package/docs/ROADMAP.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
# Mobile Debug MCP
|
|
1
|
+
# Mobile Debug MCP Roadmap
|
|
2
2
|
|
|
3
|
-
##
|
|
3
|
+
## Planning Principles
|
|
4
4
|
|
|
5
5
|
Ordered by:
|
|
6
6
|
|
|
@@ -26,33 +26,45 @@ Higher task success with fewer retries.
|
|
|
26
26
|
|
|
27
27
|
---
|
|
28
28
|
|
|
29
|
-
#
|
|
29
|
+
# Roadmap Status Overview
|
|
30
30
|
|
|
31
|
-
|
|
31
|
+
## Completed Foundations
|
|
32
32
|
|
|
33
|
-
|
|
34
|
-
|
|
33
|
+
| Capability | Status | Notes |
|
|
34
|
+
|-----------|--------|-------|
|
|
35
|
+
| Stronger State Verification | Complete | Foundational verification layer shipped |
|
|
36
|
+
| Richer Element Identity | Complete | Identity and selector confidence foundations shipped |
|
|
37
|
+
|
|
38
|
+
## Current Focus
|
|
39
|
+
|
|
40
|
+
- Wait and Synchronization Reliability
|
|
41
|
+
|
|
42
|
+
## Upcoming Work
|
|
43
|
+
|
|
44
|
+
- Long Press Gesture
|
|
45
|
+
- Better Compose / Custom Control Semantics
|
|
35
46
|
|
|
36
|
-
|
|
47
|
+
## Later Horizon
|
|
37
48
|
|
|
38
|
-
-
|
|
39
|
-
-
|
|
49
|
+
- Pinch to Zoom
|
|
50
|
+
- Action Trace Correlation
|
|
40
51
|
|
|
41
52
|
---
|
|
42
53
|
|
|
43
|
-
#
|
|
54
|
+
# Stronger State Verification
|
|
44
55
|
|
|
45
56
|
## Why first
|
|
46
57
|
Highest leverage improvement.
|
|
47
58
|
|
|
48
|
-
**Status:** Completed
|
|
59
|
+
**Status:** Completed
|
|
60
|
+
**Priority:** P1
|
|
49
61
|
|
|
50
62
|
Most failures are not “can’t act,” they’re:
|
|
51
63
|
- uncertain state
|
|
52
64
|
- weak verification
|
|
53
65
|
- retry loops caused by inference
|
|
54
66
|
|
|
55
|
-
##
|
|
67
|
+
## Scope
|
|
56
68
|
- Direct readable control values
|
|
57
69
|
- Expanded `expect_*` verification
|
|
58
70
|
- Move from inference to state introspection
|
|
@@ -60,7 +72,7 @@ Most failures are not “can’t act,” they’re:
|
|
|
60
72
|
## Expected Impact
|
|
61
73
|
Very high.
|
|
62
74
|
|
|
63
|
-
##
|
|
75
|
+
## Exit Criteria
|
|
64
76
|
- Control state readable for core widgets (toggle, slider, input, dropdown)
|
|
65
77
|
- New expect_* state verifiers implemented
|
|
66
78
|
- Agents can verify state without visual inference in representative flows
|
|
@@ -79,19 +91,20 @@ Blocks or strengthens:
|
|
|
79
91
|
|
|
80
92
|
---
|
|
81
93
|
|
|
82
|
-
#
|
|
94
|
+
# Richer Element Identity
|
|
83
95
|
|
|
84
96
|
## Why second
|
|
85
97
|
Directly reduces selector brittleness.
|
|
86
98
|
|
|
87
|
-
**Status:** Completed
|
|
99
|
+
**Status:** Completed
|
|
100
|
+
**Priority:** P2
|
|
88
101
|
|
|
89
102
|
Improves:
|
|
90
103
|
- targeting stability
|
|
91
104
|
- repeatability
|
|
92
105
|
- agent confidence
|
|
93
106
|
|
|
94
|
-
##
|
|
107
|
+
## Scope
|
|
95
108
|
- Stable IDs / test tags prioritization
|
|
96
109
|
- Selector confidence metadata
|
|
97
110
|
- Preferred selector hierarchy
|
|
@@ -99,7 +112,7 @@ Improves:
|
|
|
99
112
|
## Expected Impact
|
|
100
113
|
Very high.
|
|
101
114
|
|
|
102
|
-
##
|
|
115
|
+
## Exit Criteria
|
|
103
116
|
- Stable selector preference order implemented
|
|
104
117
|
- Test tags/resource IDs surfaced where available
|
|
105
118
|
- Selector confidence metadata available
|
|
@@ -118,18 +131,21 @@ Blocks or strengthens:
|
|
|
118
131
|
|
|
119
132
|
---
|
|
120
133
|
|
|
121
|
-
#
|
|
134
|
+
# Wait and Synchronization Reliability
|
|
122
135
|
|
|
123
136
|
## Why third
|
|
124
137
|
Reliable async synchronization is foundational for agent success and should precede gesture expansion.
|
|
125
138
|
|
|
139
|
+
**Status:** Spec Ready
|
|
140
|
+
**Priority:** P3
|
|
141
|
+
|
|
126
142
|
Addresses failures where agents:
|
|
127
143
|
- skip UI waits after actions
|
|
128
144
|
- rely on network/log signals too early
|
|
129
145
|
- struggle with in-place UI updates
|
|
130
146
|
- misread stale UI snapshots
|
|
131
147
|
|
|
132
|
-
##
|
|
148
|
+
## Scope
|
|
133
149
|
- UI-first synchronization policy guidance
|
|
134
150
|
- wait_for_ui_change (hierarchy diff based waiting)
|
|
135
151
|
- Structured loading state detection
|
|
@@ -139,7 +155,7 @@ Addresses failures where agents:
|
|
|
139
155
|
## Expected Impact
|
|
140
156
|
Very high.
|
|
141
157
|
|
|
142
|
-
##
|
|
158
|
+
## Exit Criteria
|
|
143
159
|
- wait_for_ui_change implemented
|
|
144
160
|
- Loading state detection available for representative controls
|
|
145
161
|
- Snapshot revision or staleness metadata exposed
|
|
@@ -163,11 +179,14 @@ Blocks or strengthens:
|
|
|
163
179
|
|
|
164
180
|
---
|
|
165
181
|
|
|
166
|
-
#
|
|
182
|
+
# Long Press Gesture
|
|
167
183
|
|
|
168
184
|
## Why fourth
|
|
169
185
|
High utility, relatively low complexity.
|
|
170
186
|
|
|
187
|
+
**Status:** Planned
|
|
188
|
+
**Priority:** P4
|
|
189
|
+
|
|
171
190
|
Unlocks many currently awkward interactions:
|
|
172
191
|
|
|
173
192
|
- context menus
|
|
@@ -177,7 +196,7 @@ Unlocks many currently awkward interactions:
|
|
|
177
196
|
|
|
178
197
|
Broad usefulness.
|
|
179
198
|
|
|
180
|
-
##
|
|
199
|
+
## Scope
|
|
181
200
|
New tool:
|
|
182
201
|
|
|
183
202
|
```json
|
|
@@ -191,7 +210,7 @@ Verification alignment:
|
|
|
191
210
|
## Expected Impact
|
|
192
211
|
High.
|
|
193
212
|
|
|
194
|
-
##
|
|
213
|
+
## Exit Criteria
|
|
195
214
|
- long_press tool implemented across supported platforms
|
|
196
215
|
- Duration defaults and overrides supported
|
|
197
216
|
- Verification patterns for long press outcomes defined
|
|
@@ -211,18 +230,21 @@ Strengthens:
|
|
|
211
230
|
|
|
212
231
|
---
|
|
213
232
|
|
|
214
|
-
#
|
|
233
|
+
# Better Compose / Custom Control Semantics
|
|
215
234
|
|
|
216
235
|
## Why fifth
|
|
217
236
|
Important, but strengthened by priorities 1–4 first.
|
|
218
237
|
|
|
238
|
+
**Status:** Planned
|
|
239
|
+
**Priority:** P5
|
|
240
|
+
|
|
219
241
|
Semantics become more useful once:
|
|
220
242
|
- identity is stronger
|
|
221
243
|
- verification is stronger
|
|
222
244
|
- gestures are richer
|
|
223
245
|
- synchronization is more reliable
|
|
224
246
|
|
|
225
|
-
##
|
|
247
|
+
## Scope
|
|
226
248
|
- Composite control traits
|
|
227
249
|
- Control role enrichment (adjustable, expandable, selectable_group)
|
|
228
250
|
- Interaction contracts metadata
|
|
@@ -233,7 +255,7 @@ Semantics become more useful once:
|
|
|
233
255
|
## Expected Impact
|
|
234
256
|
High.
|
|
235
257
|
|
|
236
|
-
##
|
|
258
|
+
## Exit Criteria
|
|
237
259
|
- Semantic traits implemented for major custom control classes
|
|
238
260
|
- Interaction contracts surfaced in snapshot model
|
|
239
261
|
- Confidence model defined for derived semantics
|
|
@@ -253,11 +275,14 @@ Depends on:
|
|
|
253
275
|
|
|
254
276
|
---
|
|
255
277
|
|
|
256
|
-
#
|
|
278
|
+
# Pinch to Zoom
|
|
257
279
|
|
|
258
280
|
## Why sixth
|
|
259
281
|
Valuable, but narrower than long press.
|
|
260
282
|
|
|
283
|
+
**Status:** Planned
|
|
284
|
+
**Priority:** P6
|
|
285
|
+
|
|
261
286
|
Applies mainly to:
|
|
262
287
|
- maps
|
|
263
288
|
- images
|
|
@@ -266,7 +291,7 @@ Applies mainly to:
|
|
|
266
291
|
|
|
267
292
|
Useful, but less universal.
|
|
268
293
|
|
|
269
|
-
##
|
|
294
|
+
## Scope
|
|
270
295
|
|
|
271
296
|
```json
|
|
272
297
|
pinch_to_zoom(target, scale, center?)
|
|
@@ -279,7 +304,7 @@ Verification:
|
|
|
279
304
|
## Expected Impact
|
|
280
305
|
Medium-high.
|
|
281
306
|
|
|
282
|
-
##
|
|
307
|
+
## Exit Criteria
|
|
283
308
|
- pinch_to_zoom implemented
|
|
284
309
|
- Zoom in/out flows supported
|
|
285
310
|
- Verification primitives for viewport or zoom state available
|
|
@@ -297,22 +322,25 @@ Depends on:
|
|
|
297
322
|
|
|
298
323
|
---
|
|
299
324
|
|
|
300
|
-
#
|
|
325
|
+
# Action Trace Correlation
|
|
301
326
|
|
|
302
327
|
## Why seventh
|
|
303
328
|
Very valuable for debugging,
|
|
304
329
|
but less critical than improving control success first.
|
|
305
330
|
|
|
331
|
+
**Status:** Planned
|
|
332
|
+
**Priority:** P7
|
|
333
|
+
|
|
306
334
|
Improves diagnosis more than task completion.
|
|
307
335
|
|
|
308
|
-
##
|
|
336
|
+
## Scope
|
|
309
337
|
- Action correlation metadata
|
|
310
338
|
- UI/network/log linkage
|
|
311
339
|
|
|
312
340
|
## Expected Impact
|
|
313
341
|
Medium-high.
|
|
314
342
|
|
|
315
|
-
##
|
|
343
|
+
## Exit Criteria
|
|
316
344
|
- Action correlation model defined
|
|
317
345
|
- UI/network/log linkage captured for representative actions
|
|
318
346
|
- Correlation metadata exposed to agents
|
|
@@ -331,7 +359,7 @@ Depends on:
|
|
|
331
359
|
|
|
332
360
|
---
|
|
333
361
|
|
|
334
|
-
#
|
|
362
|
+
# Roadmap Sequence
|
|
335
363
|
|
|
336
364
|
## Dependency Summary
|
|
337
365
|
Foundational sequence:
|
|
@@ -351,7 +379,7 @@ Layer 3 (Interaction Expansion)
|
|
|
351
379
|
Layer 4 (Observability)
|
|
352
380
|
- Priority 7 depends on 1,2,3
|
|
353
381
|
|
|
354
|
-
## Wave 1 (
|
|
382
|
+
## Wave 1 (Current Focus)
|
|
355
383
|
- Stronger State Verification
|
|
356
384
|
- Richer Element Identity
|
|
357
385
|
- Wait and Synchronization Reliability
|
|
@@ -361,7 +389,7 @@ Make core loop more reliable.
|
|
|
361
389
|
|
|
362
390
|
---
|
|
363
391
|
|
|
364
|
-
## Wave 2
|
|
392
|
+
## Wave 2 (Expansion)
|
|
365
393
|
- Long Press
|
|
366
394
|
- Better Compose Semantics
|
|
367
395
|
|
|
@@ -370,7 +398,7 @@ Expand interaction capability.
|
|
|
370
398
|
|
|
371
399
|
---
|
|
372
400
|
|
|
373
|
-
## Wave 3
|
|
401
|
+
## Wave 3 (Advanced)
|
|
374
402
|
- Pinch to Zoom
|
|
375
403
|
- Action Trace Correlation
|
|
376
404
|
|
|
@@ -379,7 +407,7 @@ Advanced gestures + observability.
|
|
|
379
407
|
|
|
380
408
|
---
|
|
381
409
|
|
|
382
|
-
#
|
|
410
|
+
# Capability Sequence
|
|
383
411
|
|
|
384
412
|
Execution Order:
|
|
385
413
|
1. Stronger State Verification
|
|
@@ -397,7 +425,7 @@ Rationale:
|
|
|
397
425
|
|
|
398
426
|
---
|
|
399
427
|
|
|
400
|
-
##
|
|
428
|
+
## Future Considerations
|
|
401
429
|
Still out of scope:
|
|
402
430
|
|
|
403
431
|
- Recovery planning logic
|
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
# RFC 004: Verification Routing for Local and Side-Effect Actions
|
|
4
|
+
|
|
5
|
+
## Status
|
|
6
|
+
Draft
|
|
7
|
+
|
|
8
|
+
## Summary
|
|
9
|
+
|
|
10
|
+
This RFC corrects a specification flaw in action verification routing where agents may treat lack of obvious UI change as a trigger to inspect network activity by default.
|
|
11
|
+
|
|
12
|
+
The current fallback can cause unnecessary network calls during purely local UI interactions (for example sliders, pickers, toggles, text entry), creating noise and reinforcing incorrect agent behavior.
|
|
13
|
+
|
|
14
|
+
This RFC separates:
|
|
15
|
+
- action verification
|
|
16
|
+
- failure diagnosis
|
|
17
|
+
- backend signal inspection
|
|
18
|
+
|
|
19
|
+
It also introduces context-aware routing based on action type.
|
|
20
|
+
|
|
21
|
+
## Motivation
|
|
22
|
+
|
|
23
|
+
Observed agent sessions showed `get_network_activity` being invoked during local UI manipulation solely because an action produced no coarse-grained UI diff.
|
|
24
|
+
|
|
25
|
+
Current implicit reasoning resembles:
|
|
26
|
+
|
|
27
|
+
```text
|
|
28
|
+
if uiChanged == false:
|
|
29
|
+
inspect network activity
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
This is overly broad.
|
|
33
|
+
|
|
34
|
+
For many interactions, absence of obvious snapshot change does not imply backend ambiguity. It often means verification used the wrong signals.
|
|
35
|
+
|
|
36
|
+
Examples:
|
|
37
|
+
- Slider value changed but tree structure did not.
|
|
38
|
+
- Picker selection updated in-place.
|
|
39
|
+
- Toggle changed checked state only.
|
|
40
|
+
- Text field value changed without large snapshot delta.
|
|
41
|
+
- Tab or accordion state changed through selection metadata.
|
|
42
|
+
|
|
43
|
+
In these cases network inspection is diagnostic noise, not evidence.
|
|
44
|
+
|
|
45
|
+
## Problem Statement
|
|
46
|
+
|
|
47
|
+
The current model conflates:
|
|
48
|
+
|
|
49
|
+
1. Verifying whether an action succeeded.
|
|
50
|
+
2. Diagnosing why an action may have failed.
|
|
51
|
+
|
|
52
|
+
These are distinct phases.
|
|
53
|
+
|
|
54
|
+
As a result:
|
|
55
|
+
- agents overuse network inspection
|
|
56
|
+
- verification costs increase
|
|
57
|
+
- local-state actions are treated as ambiguous too often
|
|
58
|
+
- network hints can be elevated beyond their intended role
|
|
59
|
+
|
|
60
|
+
## Goals
|
|
61
|
+
|
|
62
|
+
This RFC:
|
|
63
|
+
- Prevents default network fallbacks for local-state actions.
|
|
64
|
+
- Makes verification primarily state-driven.
|
|
65
|
+
- Restricts network activity inspection to side-effect actions where ambiguity remains.
|
|
66
|
+
- Refines `classify_action_outcome` decision routing.
|
|
67
|
+
|
|
68
|
+
## Non-Goals
|
|
69
|
+
|
|
70
|
+
This RFC does not:
|
|
71
|
+
- change raw snapshot precedence (raw remains authoritative)
|
|
72
|
+
- redefine expect_* ownership of verification
|
|
73
|
+
- make network activity mandatory evidence
|
|
74
|
+
- expand semantic hints into executable truth
|
|
75
|
+
|
|
76
|
+
## Action Categories
|
|
77
|
+
|
|
78
|
+
### Category A: Local-State Actions
|
|
79
|
+
|
|
80
|
+
Actions expected to modify client-side UI state.
|
|
81
|
+
|
|
82
|
+
Examples:
|
|
83
|
+
- tap toggle
|
|
84
|
+
- drag slider
|
|
85
|
+
- picker selection
|
|
86
|
+
- text entry
|
|
87
|
+
- scrolling
|
|
88
|
+
- tab switching
|
|
89
|
+
- expand/collapse
|
|
90
|
+
- local navigation controls
|
|
91
|
+
|
|
92
|
+
### Category B: Side-Effect Actions
|
|
93
|
+
|
|
94
|
+
Actions that may trigger backend or asynchronous side effects.
|
|
95
|
+
|
|
96
|
+
Examples:
|
|
97
|
+
- submit
|
|
98
|
+
- save
|
|
99
|
+
- sync
|
|
100
|
+
- search
|
|
101
|
+
- refresh
|
|
102
|
+
- login
|
|
103
|
+
- purchase flows
|
|
104
|
+
|
|
105
|
+
## Action Classification Source of Truth
|
|
106
|
+
|
|
107
|
+
## Action Type Emission (Runtime Contract)
|
|
108
|
+
|
|
109
|
+
`action_type` MUST be emitted by the runtime layer that produces or executes actions. It is not inferred by the agent.
|
|
110
|
+
|
|
111
|
+
There are three valid sources of truth, in order of precedence:
|
|
112
|
+
|
|
113
|
+
### 1. Tool Schema Annotation (preferred)
|
|
114
|
+
|
|
115
|
+
If the action originates from a tool invocation, `action_type` MUST be defined in the tool’s schema definition.
|
|
116
|
+
|
|
117
|
+
Example:
|
|
118
|
+
|
|
119
|
+
```json
|
|
120
|
+
{
|
|
121
|
+
"name": "toggle_switch",
|
|
122
|
+
"action_type": "local_state"
|
|
123
|
+
}
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
or
|
|
127
|
+
|
|
128
|
+
```json
|
|
129
|
+
{
|
|
130
|
+
"name": "submit_form",
|
|
131
|
+
"action_type": "side_effect"
|
|
132
|
+
}
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
This is the canonical source.
|
|
136
|
+
|
|
137
|
+
### 2. Handler Output (runtime execution layer)
|
|
138
|
+
|
|
139
|
+
If tool schema does not define `action_type`, the runtime handler that executes the action MUST attach it before returning the action result.
|
|
140
|
+
|
|
141
|
+
Example:
|
|
142
|
+
|
|
143
|
+
```json
|
|
144
|
+
{
|
|
145
|
+
"action": "click",
|
|
146
|
+
"target": "save_button",
|
|
147
|
+
"action_type": "side_effect"
|
|
148
|
+
}
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
This is valid only when schema-level annotation is absent.
|
|
152
|
+
|
|
153
|
+
### 3. Fallback Mapping Table (last resort, deterministic only)
|
|
154
|
+
|
|
155
|
+
If neither schema nor handler provides `action_type`, the system MUST use a deterministic mapping table maintained by the runtime.
|
|
156
|
+
|
|
157
|
+
This table MUST be:
|
|
158
|
+
- static (no runtime inference)
|
|
159
|
+
- versioned
|
|
160
|
+
- explicitly defined in implementation
|
|
161
|
+
|
|
162
|
+
Example mapping:
|
|
163
|
+
|
|
164
|
+
| action | action_type |
|
|
165
|
+
|--------|------------|
|
|
166
|
+
| tap_toggle | local_state |
|
|
167
|
+
| enter_text | local_state |
|
|
168
|
+
| submit | side_effect |
|
|
169
|
+
| refresh | side_effect |
|
|
170
|
+
|
|
171
|
+
If an action is not in the table, it MUST default to:
|
|
172
|
+
|
|
173
|
+
```
|
|
174
|
+
side_effect
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### Hard Constraint
|
|
178
|
+
|
|
179
|
+
Agents MUST NOT infer or override `action_type` based on UI state changes, snapshot diffs, or network activity.
|
|
180
|
+
|
|
181
|
+
### Normative Interpretation
|
|
182
|
+
|
|
183
|
+
`action_type` is part of the execution contract, not the reasoning layer.
|
|
184
|
+
|
|
185
|
+
Action type MUST be explicitly defined by the action schema or tool output.
|
|
186
|
+
|
|
187
|
+
Valid values:
|
|
188
|
+
- local_state
|
|
189
|
+
- side_effect
|
|
190
|
+
|
|
191
|
+
Agents MUST NOT infer action type from UI changes.
|
|
192
|
+
|
|
193
|
+
If action type is missing, agents MUST treat it as side_effect only if backend interaction is plausible; otherwise classify as local_state.
|
|
194
|
+
|
|
195
|
+
## Revised Verification Routing
|
|
196
|
+
|
|
197
|
+
### For Local-State Actions
|
|
198
|
+
|
|
199
|
+
Verification priority:
|
|
200
|
+
|
|
201
|
+
1. Expected state assertions
|
|
202
|
+
2. Refreshed snapshot comparison
|
|
203
|
+
3. Element property checks
|
|
204
|
+
4. Targeted expect_* verification
|
|
205
|
+
|
|
206
|
+
Signals may include:
|
|
207
|
+
- value changes
|
|
208
|
+
- selected state
|
|
209
|
+
- checked state
|
|
210
|
+
- focus changes
|
|
211
|
+
- labels/text
|
|
212
|
+
- enabled/disabled transitions
|
|
213
|
+
- position/state metadata
|
|
214
|
+
|
|
215
|
+
Network activity should not be used as default fallback.
|
|
216
|
+
|
|
217
|
+
### For Side-Effect Actions
|
|
218
|
+
|
|
219
|
+
Verification priority:
|
|
220
|
+
|
|
221
|
+
1. Expected UI/state verification first
|
|
222
|
+
2. Retry richer local verification if ambiguous
|
|
223
|
+
3. Only then optionally inspect network or log signals
|
|
224
|
+
|
|
225
|
+
Network signals are supporting hints, not primary proof of success.
|
|
226
|
+
|
|
227
|
+
## Decision Logic Update
|
|
228
|
+
|
|
229
|
+
Replace implied logic:
|
|
230
|
+
|
|
231
|
+
```text
|
|
232
|
+
if uiChanged == false:
|
|
233
|
+
get_network_activity()
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
With:
|
|
237
|
+
|
|
238
|
+
```text
|
|
239
|
+
if expected_state_verified:
|
|
240
|
+
success
|
|
241
|
+
|
|
242
|
+
elif action_type == local_state:
|
|
243
|
+
retry using richer state verification
|
|
244
|
+
|
|
245
|
+
elif action_type == side_effect and ambiguity_remains:
|
|
246
|
+
optionally inspect network activity
|
|
247
|
+
|
|
248
|
+
else:
|
|
249
|
+
inconclusive
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
## Definition of Ambiguity
|
|
253
|
+
|
|
254
|
+
Ambiguity exists only when:
|
|
255
|
+
|
|
256
|
+
- expected state cannot be evaluated from UI snapshot, AND
|
|
257
|
+
- no single deterministic state predicate can be computed from UI fields
|
|
258
|
+
|
|
259
|
+
Ambiguity does NOT include:
|
|
260
|
+
- absence of visual diff
|
|
261
|
+
- absence of network activity
|
|
262
|
+
- lack of large UI tree changes
|
|
263
|
+
|
|
264
|
+
## Normative Rules
|
|
265
|
+
|
|
266
|
+
### Rule 1
|
|
267
|
+
|
|
268
|
+
Agents MUST NOT use network activity inspection as a default fallback for local-state actions solely because coarse UI diffs are absent.
|
|
269
|
+
|
|
270
|
+
### Rule 2
|
|
271
|
+
|
|
272
|
+
Agents MUST prefer explicit state verification over backend diagnostics whenever the action is expected to be locally observable.
|
|
273
|
+
|
|
274
|
+
### Rule 3
|
|
275
|
+
|
|
276
|
+
Network activity MAY be consulted only when:
|
|
277
|
+
- the action plausibly triggers backend work, and
|
|
278
|
+
- local verification remains ambiguous under the defined ambiguity criteria.
|
|
279
|
+
|
|
280
|
+
### Rule 4
|
|
281
|
+
|
|
282
|
+
Network activity evidence MUST be treated as auxiliary signal, not authoritative proof of action success.
|
|
283
|
+
|
|
284
|
+
## Unified Diagnostic Signals
|
|
285
|
+
|
|
286
|
+
Network activity and log inspection are equivalent diagnostic signals.
|
|
287
|
+
|
|
288
|
+
Both:
|
|
289
|
+
- are secondary to UI state verification
|
|
290
|
+
- MUST NOT be used as default fallback for local-state actions
|
|
291
|
+
- follow the same escalation rules defined in this RFC
|
|
292
|
+
|
|
293
|
+
## Impact on classify_action_outcome
|
|
294
|
+
|
|
295
|
+
`classify_action_outcome` should be interpreted as routing logic, not a mandatory network escalation path.
|
|
296
|
+
|
|
297
|
+
For `uiChanged=false`, action category determines next step.
|
|
298
|
+
|
|
299
|
+
No automatic implication:
|
|
300
|
+
|
|
301
|
+
```text
|
|
302
|
+
uiChanged=false => inspect network
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
## Expected Benefits
|
|
306
|
+
|
|
307
|
+
- Fewer unnecessary tool calls
|
|
308
|
+
- Cleaner verification traces
|
|
309
|
+
- Reduced cargo-cult network probing
|
|
310
|
+
- Better behavior for local UI interactions
|
|
311
|
+
- Stronger separation between verification and diagnosis
|
|
312
|
+
- More reliable agent reasoning
|
|
313
|
+
|
|
314
|
+
## Compatibility
|
|
315
|
+
|
|
316
|
+
This is a patch-level specification correction.
|
|
317
|
+
|
|
318
|
+
It refines routing semantics but does not break:
|
|
319
|
+
- existing expect_* semantics
|
|
320
|
+
- snapshot response shape
|
|
321
|
+
- raw-over-semantic precedence
|
|
322
|
+
- action execution model
|
|
323
|
+
|
|
324
|
+
## Implementation Notes
|
|
325
|
+
|
|
326
|
+
Follow-up work may include:
|
|
327
|
+
- prompt updates
|
|
328
|
+
- regression examples for sliders/toggles/pickers
|
|
329
|
+
- protocol examples showing correct routing
|
|
330
|
+
- telemetry on reduced unnecessary network inspections
|
|
331
|
+
|
|
332
|
+
## Open Questions
|
|
333
|
+
|
|
334
|
+
Questions for review:
|
|
335
|
+
|
|
336
|
+
1. Should action category be explicitly emitted as runtime metadata, or is heuristic inference acceptable only within the fallback mapping layer defined in the Action Type Emission contract?
|
|
337
|
+
2. Should side-effect actions permit optional log inspection alongside network hints?
|
|
338
|
+
3. Should local-state verification examples be added to core spec or examples appendix?
|
|
339
|
+
|
|
340
|
+
## Decision Requested
|
|
341
|
+
|
|
342
|
+
Adopt verification routing based on action type and remove implicit default escalation from missing UI diffs to network inspection.
|
|
@@ -41,7 +41,7 @@ Outcome-specific guidance:
|
|
|
41
41
|
- visible navigation expected -> `wait_for_screen_change` (optional) -> `expect_screen`
|
|
42
42
|
- local UI change expected -> `wait_for_ui` (optional) -> `expect_element_visible`
|
|
43
43
|
- readable element state expected -> `wait_for_ui` (optional) -> `expect_state`
|
|
44
|
-
- backend/API activity expected without a visible UI change -> compare `get_screen_fingerprint` before/after, then call `
|
|
44
|
+
- backend/API activity expected without a visible UI change -> compare `get_screen_fingerprint` before/after, then call `classify_action_outcome` with the runtime `action_type`; collect `get_network_activity` only if the result remains ambiguous
|
|
45
45
|
|
|
46
46
|
For backend/API activity, `wait_for_screen_change` is not the right verification tool unless a visible transition is also expected.
|
|
47
47
|
|
|
@@ -294,11 +294,11 @@ Tool: `classify_action_outcome`
|
|
|
294
294
|
|
|
295
295
|
Rules:
|
|
296
296
|
|
|
297
|
-
- MAY use UI, network, and log signals
|
|
297
|
+
- MAY use UI, action, network, and log signals
|
|
298
298
|
- MUST be deterministic
|
|
299
299
|
- MUST NOT replace `expect_*` tools
|
|
300
300
|
- MUST be treated as a supplementary signal only
|
|
301
|
-
- SHOULD be used with `get_network_activity` when the
|
|
301
|
+
- SHOULD be used with `get_network_activity` only when the outcome is still ambiguous after routing by `action_type`
|
|
302
302
|
|
|
303
303
|
It is not a verification mechanism.
|
|
304
304
|
|
package/docs/tools/interact.md
CHANGED
|
@@ -17,6 +17,7 @@ Important:
|
|
|
17
17
|
|
|
18
18
|
- `wait_for_*` tools must not be used as the final verification of action success when an applicable `expect_*` tool exists.
|
|
19
19
|
- action tools report execution success, not outcome correctness.
|
|
20
|
+
- `classify_action_outcome` should receive the runtime `action_type` so that routing can distinguish local-state and side-effect actions; when it is omitted and no UI signal is positive, the classifier returns `unknown`.
|
|
20
21
|
|
|
21
22
|
## tap / swipe / type_text / press_back
|
|
22
23
|
|
|
@@ -54,10 +55,10 @@ Preferred verification:
|
|
|
54
55
|
- navigation outcome known -> `expect_screen`
|
|
55
56
|
- local UI change known -> `expect_element_visible`
|
|
56
57
|
- readable element state known -> `expect_state`
|
|
57
|
-
- backend/API activity expected -> `classify_action_outcome` + `get_network_activity`
|
|
58
|
+
- backend/API activity expected -> `classify_action_outcome` + optional `get_network_activity` if the UI signal remains ambiguous
|
|
58
59
|
|
|
59
|
-
Use `wait_for_screen_change` only when a visible transition is the expected outcome. If a button should trigger an API request but the screen should stay the same, rely on
|
|
60
|
-
For backend-only actions, prefer comparing `get_screen_fingerprint` before/after and
|
|
60
|
+
Use `wait_for_screen_change` only when a visible transition is the expected outcome. If a button should trigger an API request but the screen should stay the same, rely first on `classify_action_outcome` called with the runtime `action_type`.
|
|
61
|
+
For backend-only actions, prefer comparing `get_screen_fingerprint` before/after. Collect `get_network_activity` only if the result is still ambiguous, and when you do, collect it right after the action. Do not wait on `wait_for_screen_change` if no visible transition is expected.
|
|
61
62
|
Use `wait_for_ui_change` when the screen stays in place but visible text or element state should change.
|
|
62
63
|
|
|
63
64
|
---
|
|
@@ -507,17 +508,18 @@ Notes:
|
|
|
507
508
|
|
|
508
509
|
## classify_action_outcome + get_network_activity
|
|
509
510
|
|
|
510
|
-
Use this pair when the action
|
|
511
|
+
Use this pair when the action may trigger network/backend work and the screen may not visibly change.
|
|
511
512
|
|
|
512
513
|
Pattern:
|
|
513
514
|
|
|
514
515
|
1. perform the action
|
|
515
516
|
2. call `classify_action_outcome` with `uiChanged` from `wait_for_screen_change` or a screen fingerprint comparison
|
|
516
|
-
3.
|
|
517
|
-
4.
|
|
517
|
+
3. in that same call, pass the runtime `action_type` value as `actionType`
|
|
518
|
+
4. collect `get_network_activity` only if the action is side-effect oriented and the UI signal remains ambiguous
|
|
519
|
+
5. call `classify_action_outcome` again with `networkRequests` if you collected them
|
|
518
520
|
|
|
519
521
|
Guidance:
|
|
520
522
|
|
|
521
523
|
- `uiChanged=true` or `expectedElementVisible=true` means the action outcome is already verified
|
|
522
|
-
-
|
|
523
|
-
-
|
|
524
|
+
- local-state actions should prefer refreshed snapshots, `expect_state`, or `expect_element_visible` over default network inspection
|
|
525
|
+
- network activity is auxiliary evidence, not mandatory proof
|
package/package.json
CHANGED
package/src/interact/classify.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
export type ActionOutcome = 'success' | 'no_op' | 'backend_failure' | 'ui_failure' | 'unknown'
|
|
2
2
|
export type NetworkRequestStatus = 'success' | 'failure' | 'retryable'
|
|
3
|
+
export type ActionCategory = 'local_state' | 'side_effect'
|
|
3
4
|
|
|
4
5
|
export interface NetworkRequest {
|
|
5
6
|
endpoint: string
|
|
@@ -9,6 +10,8 @@ export interface NetworkRequest {
|
|
|
9
10
|
export interface ClassifyActionOutcomeInput {
|
|
10
11
|
uiChanged: boolean
|
|
11
12
|
expectedElementVisible?: boolean | null
|
|
13
|
+
/** Concrete action_type from the runtime action result (for example: tap, type_text, start_app). */
|
|
14
|
+
actionType?: string | null
|
|
12
15
|
/** null = get_network_activity has not been called yet */
|
|
13
16
|
networkRequests?: NetworkRequest[] | null
|
|
14
17
|
hasLogErrors?: boolean | null
|
|
@@ -17,8 +20,29 @@ export interface ClassifyActionOutcomeInput {
|
|
|
17
20
|
export interface ClassifyActionOutcomeResult {
|
|
18
21
|
outcome: ActionOutcome
|
|
19
22
|
reasoning: string
|
|
20
|
-
|
|
21
|
-
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
const ACTION_CATEGORY_BY_TYPE: Record<string, ActionCategory> = {
|
|
26
|
+
tap: 'local_state',
|
|
27
|
+
tap_element: 'local_state',
|
|
28
|
+
swipe: 'local_state',
|
|
29
|
+
scroll_to_element: 'local_state',
|
|
30
|
+
type_text: 'local_state',
|
|
31
|
+
press_back: 'local_state',
|
|
32
|
+
start_app: 'side_effect',
|
|
33
|
+
restart_app: 'side_effect',
|
|
34
|
+
terminate_app: 'side_effect',
|
|
35
|
+
reset_app_data: 'side_effect',
|
|
36
|
+
install_app: 'side_effect',
|
|
37
|
+
build_app: 'side_effect',
|
|
38
|
+
build_and_install: 'side_effect'
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
function inferActionCategory(actionType?: string | null): ActionCategory | null {
|
|
42
|
+
if (typeof actionType !== 'string') return null
|
|
43
|
+
const normalized = actionType.trim().toLowerCase()
|
|
44
|
+
if (!normalized) return null
|
|
45
|
+
return ACTION_CATEGORY_BY_TYPE[normalized] ?? 'side_effect'
|
|
22
46
|
}
|
|
23
47
|
|
|
24
48
|
/**
|
|
@@ -26,39 +50,55 @@ export interface ClassifyActionOutcomeResult {
|
|
|
26
50
|
* Same inputs always produce the same output.
|
|
27
51
|
*/
|
|
28
52
|
export function classifyActionOutcome(input: ClassifyActionOutcomeInput): ClassifyActionOutcomeResult {
|
|
29
|
-
const { uiChanged, expectedElementVisible, networkRequests, hasLogErrors } = input
|
|
53
|
+
const { uiChanged, expectedElementVisible, actionType, networkRequests, hasLogErrors } = input
|
|
54
|
+
const actionCategory = inferActionCategory(actionType)
|
|
30
55
|
|
|
31
56
|
// Step 1 — UI signal is positive
|
|
32
57
|
if (uiChanged || expectedElementVisible === true) {
|
|
33
58
|
return { outcome: 'success', reasoning: expectedElementVisible === true ? 'expected element is visible' : 'UI changed after action' }
|
|
34
59
|
}
|
|
35
60
|
|
|
36
|
-
// Step 2 —
|
|
37
|
-
if (
|
|
61
|
+
// Step 2 — no action type means we cannot choose a safe routing path
|
|
62
|
+
if (actionCategory === null) {
|
|
38
63
|
return {
|
|
39
64
|
outcome: 'unknown',
|
|
40
|
-
reasoning: '
|
|
41
|
-
nextAction: 'call_get_network_activity'
|
|
65
|
+
reasoning: 'actionType was not supplied; pass the runtime action_type so the classifier can distinguish local-state and side-effect routing'
|
|
42
66
|
}
|
|
43
67
|
}
|
|
44
68
|
|
|
45
|
-
|
|
46
|
-
const failedRequest = networkRequests.find((r) => r.status === 'failure' || r.status === 'retryable')
|
|
69
|
+
const failedRequest = networkRequests?.find((r) => r.status === 'failure' || r.status === 'retryable')
|
|
47
70
|
if (failedRequest) {
|
|
48
71
|
return { outcome: 'backend_failure', reasoning: `network request ${failedRequest.endpoint} returned ${failedRequest.status}` }
|
|
49
72
|
}
|
|
50
73
|
|
|
51
|
-
// Step
|
|
74
|
+
// Step 3 — local-state actions should be verified with state-specific signals first
|
|
75
|
+
if (actionCategory === 'local_state') {
|
|
76
|
+
const logNote = hasLogErrors ? ' (log errors present)' : ''
|
|
77
|
+
return {
|
|
78
|
+
outcome: 'no_op',
|
|
79
|
+
reasoning: `local-state action${logNote}; use expect_state, refreshed snapshot comparison, or expect_element_visible instead of defaulting to network inspection`
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
// Step 4 — side-effect actions may legitimately need network or log inspection
|
|
84
|
+
if (networkRequests === null || networkRequests === undefined) {
|
|
85
|
+
return {
|
|
86
|
+
outcome: 'unknown',
|
|
87
|
+
reasoning: 'side-effect action without network data; inspect network or log signals only if the outcome is still ambiguous'
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
// Step 5 — no network requests at all
|
|
52
92
|
if (networkRequests.length === 0) {
|
|
53
93
|
const logNote = hasLogErrors ? ' (log errors present)' : ''
|
|
54
|
-
return { outcome: 'no_op', reasoning: `
|
|
94
|
+
return { outcome: 'no_op', reasoning: `side-effect action and no network activity${logNote}` }
|
|
55
95
|
}
|
|
56
96
|
|
|
57
|
-
// Step
|
|
97
|
+
// Step 6 — network requests exist and all succeeded
|
|
58
98
|
if (networkRequests.every((r) => r.status === 'success')) {
|
|
59
99
|
return { outcome: 'ui_failure', reasoning: 'network requests succeeded but UI did not change' }
|
|
60
100
|
}
|
|
61
101
|
|
|
62
|
-
// Step
|
|
102
|
+
// Step 7 — fallback
|
|
63
103
|
return { outcome: 'unknown', reasoning: 'signals are inconclusive' }
|
|
64
104
|
}
|
|
@@ -344,7 +344,7 @@ Capabilities:
|
|
|
344
344
|
Constraints:
|
|
345
345
|
- Does not verify correctness of the resulting state
|
|
346
346
|
- Must not be used alone to confirm action success when an applicable expect_* tool exists
|
|
347
|
-
-
|
|
347
|
+
- For backend/API activity without a visible UI change, pass the runtime action_type into classify_action_outcome and collect network evidence only if the result remains ambiguous
|
|
348
348
|
|
|
349
349
|
Recommended Usage:
|
|
350
350
|
1. Capture or define the expected outcome
|
|
@@ -918,26 +918,29 @@ Failure Handling:
|
|
|
918
918
|
name: 'classify_action_outcome',
|
|
919
919
|
description: `Classify the outcome of the most recent action into exactly one of: success, no_op, backend_failure, ui_failure, unknown.
|
|
920
920
|
|
|
921
|
-
|
|
922
|
-
Use this
|
|
923
|
-
For backend/API activity, compare get_screen_fingerprint before and after the action and call get_network_activity immediately after the action
|
|
921
|
+
Use the runtime action result's \`action_type\` as \`actionType\` so the classifier can distinguish local-state actions from side-effect actions.
|
|
922
|
+
Use this when the intended outcome is not already fully verified by the UI signal alone.
|
|
923
|
+
For backend/API activity, compare get_screen_fingerprint before and after the action and call get_network_activity immediately after the action if the outcome is still ambiguous.
|
|
924
924
|
|
|
925
925
|
HOW TO GATHER INPUTS before calling:
|
|
926
926
|
1. Call wait_for_screen_change or compare get_screen_fingerprint before/after — set uiChanged accordingly.
|
|
927
927
|
2. If you checked for a specific element with wait_for_ui, set expectedElementVisible.
|
|
928
|
-
3.
|
|
928
|
+
3. Pass actionType from the action response when available.
|
|
929
|
+
4. Only provide networkRequests if you already collected them or want to classify a side-effect action with backend evidence.
|
|
929
930
|
|
|
930
931
|
RULES (applied in order — stop at first match):
|
|
931
932
|
1. If uiChanged=true OR expectedElementVisible=true → outcome=success
|
|
932
|
-
2.
|
|
933
|
+
2. If actionType is missing → outcome=unknown
|
|
933
934
|
3. If any request has status=failure or retryable → outcome=backend_failure
|
|
934
|
-
4. If
|
|
935
|
-
5. If
|
|
936
|
-
6.
|
|
935
|
+
4. If actionType maps to a local-state action → outcome=no_op; prefer state-based verification and avoid default network fallback
|
|
936
|
+
5. If actionType maps to a side-effect action and no networkRequests were supplied → outcome=unknown
|
|
937
|
+
6. If no requests returned → outcome=no_op
|
|
938
|
+
7. If all requests succeeded → outcome=ui_failure
|
|
939
|
+
8. Otherwise → outcome=unknown
|
|
937
940
|
|
|
938
941
|
BEHAVIOUR after outcome:
|
|
939
942
|
- success → continue
|
|
940
|
-
- no_op → retry
|
|
943
|
+
- no_op → retry with richer state verification or re-resolve the element
|
|
941
944
|
- backend_failure → stop and report the failing endpoint
|
|
942
945
|
- ui_failure → stop and report failure
|
|
943
946
|
- unknown → take one recovery step (e.g. capture_debug_snapshot), then stop`,
|
|
@@ -952,9 +955,13 @@ BEHAVIOUR after outcome:
|
|
|
952
955
|
type: 'boolean',
|
|
953
956
|
description: 'true if the element you expected to appear is now visible (from wait_for_ui). Omit if you did not check for a specific element.'
|
|
954
957
|
},
|
|
958
|
+
actionType: {
|
|
959
|
+
type: 'string',
|
|
960
|
+
description: 'The runtime action_type from the action response (for example tap, tap_element, swipe, type_text, press_back, start_app).'
|
|
961
|
+
},
|
|
955
962
|
networkRequests: {
|
|
956
963
|
type: 'array',
|
|
957
|
-
description: '
|
|
964
|
+
description: 'Optional network evidence collected after the action. Use it when the expected outcome is backend/API activity or when the UI signal is ambiguous.',
|
|
958
965
|
items: {
|
|
959
966
|
type: 'object',
|
|
960
967
|
properties: {
|
|
@@ -976,7 +983,7 @@ BEHAVIOUR after outcome:
|
|
|
976
983
|
name: 'get_network_activity',
|
|
977
984
|
description: `Returns structured network events captured from platform logs since the last action.
|
|
978
985
|
|
|
979
|
-
Call this
|
|
986
|
+
Call this immediately after an action when you want backend evidence for a side-effect flow, only if the result is still ambiguous.
|
|
980
987
|
Do not call more than once per action.
|
|
981
988
|
|
|
982
989
|
Events are filtered to significant (non-background) requests only.
|
|
@@ -448,11 +448,13 @@ async function handleStopLogStream(args: ToolCallArgs) {
|
|
|
448
448
|
function handleClassifyActionOutcome(args: ToolCallArgs) {
|
|
449
449
|
const uiChanged = requireBooleanArg(args, 'uiChanged')
|
|
450
450
|
const expectedElementVisible = getBooleanArg(args, 'expectedElementVisible')
|
|
451
|
+
const actionType = getStringArg(args, 'actionType')
|
|
451
452
|
const networkRequests = getArrayArg<ClassifyNetworkRequestArg>(args, 'networkRequests')
|
|
452
453
|
const hasLogErrors = getBooleanArg(args, 'hasLogErrors')
|
|
453
454
|
const result = classifyActionOutcome({
|
|
454
455
|
uiChanged,
|
|
455
456
|
expectedElementVisible: expectedElementVisible ?? null,
|
|
457
|
+
actionType: actionType ?? null,
|
|
456
458
|
networkRequests: networkRequests ?? null,
|
|
457
459
|
hasLogErrors: hasLogErrors ?? null
|
|
458
460
|
})
|
package/src/server-core.ts
CHANGED
|
@@ -7,7 +7,6 @@ function run() {
|
|
|
7
7
|
const result = classifyActionOutcome({ uiChanged: true })
|
|
8
8
|
assert.strictEqual(result.outcome, 'success')
|
|
9
9
|
assert.ok(result.reasoning.length > 0)
|
|
10
|
-
assert.strictEqual(result.nextAction, undefined)
|
|
11
10
|
}
|
|
12
11
|
|
|
13
12
|
// Step 1 — expectedElementVisible → success
|
|
@@ -15,7 +14,6 @@ function run() {
|
|
|
15
14
|
const result = classifyActionOutcome({ uiChanged: false, expectedElementVisible: true })
|
|
16
15
|
assert.strictEqual(result.outcome, 'success')
|
|
17
16
|
assert.strictEqual(result.reasoning, 'expected element is visible')
|
|
18
|
-
assert.strictEqual(result.nextAction, undefined)
|
|
19
17
|
}
|
|
20
18
|
|
|
21
19
|
// Step 1 — both uiChanged and expectedElementVisible → success
|
|
@@ -24,24 +22,50 @@ function run() {
|
|
|
24
22
|
assert.strictEqual(result.outcome, 'success')
|
|
25
23
|
}
|
|
26
24
|
|
|
27
|
-
//
|
|
25
|
+
// No actionType supplied → unknown
|
|
28
26
|
{
|
|
29
27
|
const result = classifyActionOutcome({ uiChanged: false })
|
|
30
28
|
assert.strictEqual(result.outcome, 'unknown')
|
|
31
|
-
assert.
|
|
29
|
+
assert.ok(result.reasoning.includes('actionType was not supplied'))
|
|
32
30
|
}
|
|
33
31
|
|
|
34
|
-
//
|
|
32
|
+
// Local-state action routes to state verification rather than forced network probing
|
|
35
33
|
{
|
|
36
|
-
const result = classifyActionOutcome({ uiChanged: false,
|
|
34
|
+
const result = classifyActionOutcome({ uiChanged: false, actionType: 'tap' })
|
|
35
|
+
assert.strictEqual(result.outcome, 'no_op')
|
|
36
|
+
assert.ok(result.reasoning.includes('local-state action'))
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
// Local-state action with network data still prefers local-state semantics
|
|
40
|
+
{
|
|
41
|
+
const result = classifyActionOutcome({
|
|
42
|
+
uiChanged: false,
|
|
43
|
+
actionType: 'type_text',
|
|
44
|
+
networkRequests: []
|
|
45
|
+
})
|
|
46
|
+
assert.strictEqual(result.outcome, 'no_op')
|
|
47
|
+
assert.ok(result.reasoning.includes('local-state action'))
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
// Explicit side-effect action without networkRequests supplied → unknown
|
|
51
|
+
{
|
|
52
|
+
const result = classifyActionOutcome({ uiChanged: false, actionType: 'start_app' })
|
|
37
53
|
assert.strictEqual(result.outcome, 'unknown')
|
|
38
|
-
assert.
|
|
54
|
+
assert.ok(result.reasoning.includes('side-effect action'))
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
// Side-effect action with empty networkRequests → no_op
|
|
58
|
+
{
|
|
59
|
+
const result = classifyActionOutcome({ uiChanged: false, actionType: 'start_app', networkRequests: [] })
|
|
60
|
+
assert.strictEqual(result.outcome, 'no_op')
|
|
61
|
+
assert.ok(result.reasoning.includes('side-effect action'))
|
|
39
62
|
}
|
|
40
63
|
|
|
41
|
-
//
|
|
64
|
+
// Network failure → backend_failure
|
|
42
65
|
{
|
|
43
66
|
const result = classifyActionOutcome({
|
|
44
67
|
uiChanged: false,
|
|
68
|
+
actionType: 'start_app',
|
|
45
69
|
networkRequests: [{ endpoint: '/login', status: 'failure' }]
|
|
46
70
|
})
|
|
47
71
|
assert.strictEqual(result.outcome, 'backend_failure')
|
|
@@ -49,10 +73,11 @@ function run() {
|
|
|
49
73
|
assert.ok(result.reasoning.includes('failure'))
|
|
50
74
|
}
|
|
51
75
|
|
|
52
|
-
//
|
|
76
|
+
// Retryable status → backend_failure
|
|
53
77
|
{
|
|
54
78
|
const result = classifyActionOutcome({
|
|
55
79
|
uiChanged: false,
|
|
80
|
+
actionType: 'start_app',
|
|
56
81
|
networkRequests: [
|
|
57
82
|
{ endpoint: '/api/submit', status: 'retryable' },
|
|
58
83
|
{ endpoint: '/api/other', status: 'success' }
|
|
@@ -62,25 +87,11 @@ function run() {
|
|
|
62
87
|
assert.ok(result.reasoning.includes('/api/submit'))
|
|
63
88
|
}
|
|
64
89
|
|
|
65
|
-
//
|
|
66
|
-
{
|
|
67
|
-
const result = classifyActionOutcome({ uiChanged: false, networkRequests: [] })
|
|
68
|
-
assert.strictEqual(result.outcome, 'no_op')
|
|
69
|
-
assert.ok(result.reasoning.includes('no UI change'))
|
|
70
|
-
assert.ok(result.reasoning.includes('no network activity'))
|
|
71
|
-
}
|
|
72
|
-
|
|
73
|
-
// Step 4 — empty network requests with log errors → no_op with note
|
|
74
|
-
{
|
|
75
|
-
const result = classifyActionOutcome({ uiChanged: false, networkRequests: [], hasLogErrors: true })
|
|
76
|
-
assert.strictEqual(result.outcome, 'no_op')
|
|
77
|
-
assert.ok(result.reasoning.includes('log errors'))
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
// Step 5 — all requests succeeded but UI unchanged → ui_failure
|
|
90
|
+
// All requests succeeded and UI stayed unchanged → ui_failure
|
|
81
91
|
{
|
|
82
92
|
const result = classifyActionOutcome({
|
|
83
93
|
uiChanged: false,
|
|
94
|
+
actionType: 'start_app',
|
|
84
95
|
networkRequests: [
|
|
85
96
|
{ endpoint: '/api/save', status: 'success' },
|
|
86
97
|
{ endpoint: '/api/refresh', status: 'success' }
|
|
@@ -90,10 +101,18 @@ function run() {
|
|
|
90
101
|
assert.ok(result.reasoning.includes('network requests succeeded'))
|
|
91
102
|
}
|
|
92
103
|
|
|
104
|
+
// Empty network requests with log errors → no_op with note
|
|
105
|
+
{
|
|
106
|
+
const result = classifyActionOutcome({ uiChanged: false, actionType: 'start_app', networkRequests: [], hasLogErrors: true })
|
|
107
|
+
assert.strictEqual(result.outcome, 'no_op')
|
|
108
|
+
assert.ok(result.reasoning.includes('log errors'))
|
|
109
|
+
}
|
|
110
|
+
|
|
93
111
|
// Step 1 takes priority over network signals — success even when failures present
|
|
94
112
|
{
|
|
95
113
|
const result = classifyActionOutcome({
|
|
96
114
|
uiChanged: true,
|
|
115
|
+
actionType: 'start_app',
|
|
97
116
|
networkRequests: [{ endpoint: '/api/log', status: 'failure' }]
|
|
98
117
|
})
|
|
99
118
|
assert.strictEqual(result.outcome, 'success')
|
|
@@ -68,15 +68,17 @@ async function run() {
|
|
|
68
68
|
|
|
69
69
|
const classifyActionOutcome = toolDefinitions.find((tool) => tool.name === 'classify_action_outcome')
|
|
70
70
|
assert(classifyActionOutcome, 'classify_action_outcome should be registered')
|
|
71
|
-
assert.match((classifyActionOutcome as any).description, /
|
|
72
|
-
assert.match((classifyActionOutcome as any).description, /
|
|
73
|
-
assert.match((classifyActionOutcome as any).description, /
|
|
71
|
+
assert.match((classifyActionOutcome as any).description, /action_type/i)
|
|
72
|
+
assert.match((classifyActionOutcome as any).description, /local-state/i)
|
|
73
|
+
assert.match((classifyActionOutcome as any).description, /side-effect/i)
|
|
74
|
+
assert.strictEqual((classifyActionOutcome as any).inputSchema.properties.actionType.type, 'string')
|
|
75
|
+
assert.match((classifyActionOutcome as any).inputSchema.properties.networkRequests.description, /optional network evidence/i)
|
|
74
76
|
|
|
75
77
|
const getNetworkActivity = toolDefinitions.find((tool) => tool.name === 'get_network_activity')
|
|
76
78
|
assert(getNetworkActivity, 'get_network_activity should be registered')
|
|
77
|
-
assert.match((getNetworkActivity as any).description, /
|
|
78
|
-
assert.doesNotMatch((getNetworkActivity as any).description, /
|
|
79
|
-
assert.match((getNetworkActivity as any).description, /
|
|
79
|
+
assert.match((getNetworkActivity as any).description, /side-effect/i)
|
|
80
|
+
assert.doesNotMatch((getNetworkActivity as any).description, /nextAction/i)
|
|
81
|
+
assert.match((getNetworkActivity as any).description, /only if the result is still ambiguous/i)
|
|
80
82
|
|
|
81
83
|
await assert.rejects(() => handleToolCall('unknown_tool'), /Unknown tool: unknown_tool/)
|
|
82
84
|
|