screenforge 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +0 -0
- cli/_version.py +1 -0
- cli/dispatch.py +266 -0
- cli/doctor.py +487 -0
- cli/modes/__init__.py +0 -0
- cli/modes/action.py +262 -0
- cli/modes/default.py +248 -0
- cli/modes/demo.py +162 -0
- cli/modes/dry_run.py +237 -0
- cli/modes/init.py +133 -0
- cli/modes/plan.py +148 -0
- cli/modes/workflow.py +354 -0
- cli/parser.py +305 -0
- cli/reporter.py +207 -0
- cli/session.py +146 -0
- cli/shared.py +427 -0
- cli/shorthand.py +90 -0
- cli/tool_protocol_handlers.py +446 -0
- common/__init__.py +0 -0
- common/adapters/__init__.py +21 -0
- common/adapters/android_adapter.py +273 -0
- common/adapters/base_adapter.py +24 -0
- common/adapters/ios_adapter.py +278 -0
- common/adapters/web_adapter.py +271 -0
- common/ai.py +277 -0
- common/ai_autonomous.py +273 -0
- common/ai_heal.py +222 -0
- common/cache/__init__.py +15 -0
- common/cache/cache_hash.py +57 -0
- common/cache/cache_manager.py +300 -0
- common/cache/cache_stats.py +133 -0
- common/cache/cache_storage.py +79 -0
- common/cache/embedding_loader.py +150 -0
- common/capabilities.py +121 -0
- common/case_memory.py +327 -0
- common/error_codes.py +61 -0
- common/exceptions.py +18 -0
- common/executor.py +1504 -0
- common/failure_diagnosis.py +138 -0
- common/history_manager.py +75 -0
- common/logs.py +168 -0
- common/mcp_server.py +467 -0
- common/preflight.py +496 -0
- common/progress.py +37 -0
- common/run_reporter.py +415 -0
- common/run_resume.py +149 -0
- common/runtime_modes.py +35 -0
- common/tool_protocol.py +196 -0
- common/visual_fallback.py +71 -0
- common/workflow_schema.py +150 -0
- config/__init__.py +0 -0
- config/config.py +167 -0
- config/env_loader.py +76 -0
- screenforge-0.4.0.dist-info/METADATA +43 -0
- screenforge-0.4.0.dist-info/RECORD +64 -0
- screenforge-0.4.0.dist-info/WHEEL +5 -0
- screenforge-0.4.0.dist-info/entry_points.txt +2 -0
- screenforge-0.4.0.dist-info/licenses/LICENSE +21 -0
- screenforge-0.4.0.dist-info/top_level.txt +4 -0
- utils/__init__.py +0 -0
- utils/screenshot_annotator.py +60 -0
- utils/utils_ios.py +195 -0
- utils/utils_web.py +304 -0
- utils/utils_xml.py +218 -0
utils/utils_web.py
ADDED
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
from urllib.parse import urlsplit, urlunsplit
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def normalize_loopback_url(url: str) -> str:
    """Rewrite a ``localhost`` URL so that its host is ``127.0.0.1``.

    Only the host component is touched: userinfo, port, path, query and
    fragment are carried over unchanged. URLs whose host is anything other
    than ``localhost`` are returned as ``str(url)`` without modification.

    Args:
        url: The URL to normalize; it is coerced with ``str()`` first.

    Returns:
        The URL with ``localhost`` replaced by ``127.0.0.1``, or the original
        string when the host is not ``localhost``.
    """
    text = str(url)
    parsed = urlsplit(text)
    # ``hostname`` is lowercased by urlsplit, so this also matches
    # "LocalHost" / "LOCALHOST".
    if parsed.hostname != "localhost":
        return text

    # Rebuild the netloc from its parts instead of doing a textual replace:
    # a replace is case-sensitive (it misses "LocalHost") and rewrites the
    # first occurrence, which may be the userinfo ("localhost:pw@localhost").
    userinfo, at_sign, host_port = parsed.netloc.rpartition("@")
    _host, colon, port = host_port.partition(":")
    normalized_netloc = f"{userinfo}{at_sign}127.0.0.1{colon}{port}"

    return urlunsplit(
        (
            parsed.scheme,
            normalized_netloc,
            parsed.path,
            parsed.query,
            parsed.fragment,
        )
    )
|
|
19
|
+
|
|
20
|
+
def compress_web_dom(page) -> str:
    """
    Extract the visible, interaction-relevant elements of the current page.

    A JavaScript function is passed to ``page.evaluate`` (the embedded
    comments indicate a Playwright page). The script walks the document,
    open shadow roots and accessible iframe documents, drops zero-sized or
    hidden nodes and label-less layout wrappers, de-duplicates the remaining
    records, and annotates clickable controls that share the same
    (role, accessible name) with ``dup_index`` and, where a unique row label
    exists, ``scope``.

    Returns:
        Whatever ``page.evaluate`` returns for the script; the script itself
        returns ``JSON.stringify({"ui_elements": [...]})``. If evaluation
        raises, a warning is printed and ``'{"ui_elements": []}'`` is
        returned instead.
    """
    # NOTE(review): the script below is a runtime string, so its embedded
    # comments are left untouched. Two things worth confirming with the author:
    #   * its comment says iframe is NOT in ignoreTags, yet the set literal
    #     lists 'iframe'. walk() handles iframes before calling processEl, so
    #     the entry only matters if processEl is ever called another way.
    #   * for <input>/<textarea> the element's current `value` is emitted as
    #     `text`, with no exclusion for type="password".
    js_script = """
    () => {
        const elements = [];
        let refIndex = 0;
        // nodeData -> 原始 DOM el,供「重复同名控件消歧」后处理沿 DOM 上爬找所在行。
        // 只在内存里用,绝不参与序列化(DOM 节点有环,JSON.stringify 会炸)。
        const rawElOf = new Map();

        // Web 端具有明确交互语义的 role 集合
        const interactiveRoles = new Set(['button', 'link', 'menuitem', 'option', 'tab', 'switch', 'checkbox', 'radio', 'combobox']);
        // 纯结构或绝对无用的标签。注意 iframe 不在此列:我们要递归进入其内容
        // 文档(见 walk)而不是把 iframe 框本身当作元素。
        const ignoreTags = new Set(['script', 'style', 'noscript', 'head', 'meta', 'title', 'br', 'hr', 'svg', 'path', 'g', 'img', 'video', 'audio', 'iframe']);

        // 判定元素是否处于 inert 子树。closest('[inert]') 只在同一棵树内向上找,
        // 看不到 shadow host / iframe 之外的祖先 —— 故由 walk 把跨边界继承来的
        // inert 状态(inherited)一并算入,与 offX/offY 跨边界传递的方式一致。
        function isInertEl(el, inherited) {
            if (inherited) return true;
            try { return el.closest('[inert]') !== null; } catch (e) { return false; }
        }

        // 处理单个元素:offX/offY 是从内层文档(iframe)坐标系到顶层视口坐标系
        // 的偏移量,保证 shadow/iframe 内元素的 bbox 仍然是顶层坐标,ref 点击不会错位。
        // inheritedInert:宿主/iframe 跨边界继承来的 inert 状态(见 walk)。
        function processEl(el, offX, offY, inheritedInert) {
            const tag = el.tagName.toLowerCase();
            if (ignoreTags.has(tag)) return;

            // 1. 物理可见性校验 (过滤幽灵节点)
            let rect;
            try { rect = el.getBoundingClientRect(); } catch (e) { return; }
            if (rect.width === 0 || rect.height === 0) return;
            let style;
            try { style = (el.ownerDocument.defaultView || window).getComputedStyle(el); } catch (e) { return; }
            if (style.visibility === 'hidden' || style.opacity === '0' || style.display === 'none') return;

            // 2. 交互意图判定。disabled / aria-disabled 的控件不可点:标 clickable=false
            // (仍然收录,便于断言其存在/禁用),否则 LLM 会去点禁用按钮并卡超时。
            // 用 :disabled 伪类而非 el.disabled —— 后者只反映元素自身的 disabled
            // 属性,看不到「<fieldset disabled> 传播给后代控件」这一规范行为
            // (含首个 <legend> 内控件豁免、嵌套 fieldset 由外层继续禁用)。
            // :disabled 正是浏览器对「actually disabled」的实现,一次到位且权威。
            const ariaDisabled = el.getAttribute('aria-disabled') === 'true';
            let nativeDisabled;
            try { nativeDisabled = el.matches(':disabled'); }
            catch (e) { nativeDisabled = el.disabled === true; }
            const isDisabled = nativeDisabled || ariaDisabled;
            // inert 子树(开 <dialog> 时背景标 inert 的标准模式)会吞掉点击 ——
            // 这类控件仍可见会被收录,但若仍标 clickable,LLM 会去点模态背后的死
            // 按钮然后 no-op/超时,故同样判为不可点。但 inert ≠ disabled(前者是"被
            // 遮挡/暂不可交互",后者是"控件本身被禁用"),分开上报:让 LLM 能推断
            // "有模态需先关掉"而非"表单被禁用",也避免 assert disabled 误判通过。
            const isInert = isInertEl(el, inheritedInert);
            const isInteractive = !isDisabled && !isInert && (
                ['a', 'button', 'input', 'select', 'textarea'].includes(tag) ||
                el.hasAttribute('onclick') ||
                interactiveRoles.has(el.getAttribute('role')) ||
                style.cursor === 'pointer');

            // 3. 智能文本提取 (防止父容器吞噬子节点文本造成大量重复)
            const directText = Array.from(el.childNodes)
                .filter(node => node.nodeType === Node.TEXT_NODE)
                .map(node => node.nodeValue.trim())
                .join(' ').trim();

            let fullText = el.innerText ? el.innerText.trim() : '';
            if (tag === 'input' || tag === 'textarea') fullText = el.value || '';
            if (fullText.length > 100) fullText = fullText.substring(0, 100) + '...';

            const ariaLabel = el.getAttribute('aria-label') || el.getAttribute('title') || el.getAttribute('alt') || '';

            // 4. 噪音与垃圾数据剔除策略
            // 收录条件用「语义可交互」(标签/role 本身可交互,忽略 disabled),
            // 这样禁用按钮也会被收录(clickable=false),而纯排版 wrapper 仍被丢弃。
            const isSemanticControl = ['a', 'button', 'input', 'select', 'textarea'].includes(tag) ||
                el.hasAttribute('onclick') ||
                interactiveRoles.has(el.getAttribute('role'));
            const keepForLayout = isInteractive || isSemanticControl;
            if (!keepForLayout && !directText && !ariaLabel) return;

            // For a kept (interactive/semantic) element, prefer innerText but
            // fall back to its own directText — otherwise a clickable shadow
            // HOST whose light text isn't slotted (innerText==='') would be
            // dropped by the empty-shell guard below, leaving it invisible to
            // the LLM (the exact blind spot this change targets).
            const displayText = keepForLayout ? (fullText || directText) : (directText.length > 0 ? fullText : directText);

            const placeholder = el.getAttribute('placeholder') || '';
            const type = el.getAttribute('type') || '';
            const name = el.getAttribute('name') || '';

            if (!displayText && !ariaLabel && !placeholder && !['input', 'select', 'textarea'].includes(tag)) return;

            // 5. 构建低 Token 结构体
            refIndex++;
            const nodeData = { "ref": "@" + refIndex, "class": tag, "clickable": isInteractive };
            if (el.id) nodeData.id = el.id;
            if (name) nodeData.name = name;
            if (type) nodeData.type = type;
            if (placeholder) nodeData.placeholder = placeholder;
            if (ariaLabel) nodeData.desc = ariaLabel;
            if (displayText) nodeData.text = displayText;
            if (isDisabled) nodeData.disabled = true;
            if (isInert) nodeData.inert = true;
            nodeData.x = Math.round(rect.x + offX);
            nodeData.y = Math.round(rect.y + offY);
            nodeData.w = Math.round(rect.width);
            nodeData.h = Math.round(rect.height);

            elements.push(nodeData);
            rawElOf.set(nodeData, el);
        }

        // 递归遍历:普通子树 + shadow DOM + 同源 iframe。这是修复"压缩器对
        // shadow DOM / iframe 失明"的核心:querySelectorAll('*') 不穿透 shadow
        // root,也不进入 iframe 文档,导致整类应用对 LLM 不可见。
        function walk(root, offX, offY, depth, inheritedInert) {
            // Depth cap: static DOM can't form true cycles (an iframe's
            // contentDocument is always a fresh document; a shadow root can't
            // contain its own host), so this is insurance against pathologically
            // deep generated pages approaching the JS recursion limit.
            if (depth > 50) return;
            let nodes;
            try { nodes = root.querySelectorAll('*'); } catch (e) { return; }
            nodes.forEach(el => {
                const tag = el.tagName ? el.tagName.toLowerCase() : '';

                // 同源 iframe:进入其内容文档,并按 iframe 在顶层的位置做坐标偏移。
                // 跨域 iframe 访问 contentDocument 会抛异常 —— 那是浏览器安全边界,
                // 无法穿透,静默跳过(诚实:我们不假装能看到跨域内容)。
                if (tag === 'iframe') {
                    // inert 跨 iframe 边界继承:frame 内文档看不到父文档的 inert 祖先,
                    // 故在此判定 iframe 自身或其祖先是否 inert,向内传递。
                    const frameInert = inheritedInert || isInertEl(el, false);
                    let doc = null, frameRect = null, insetX = 0, insetY = 0;
                    try {
                        frameRect = el.getBoundingClientRect();
                        // getBoundingClientRect gives the iframe's BORDER-box origin,
                        // but the content document starts inside the border+padding.
                        // Without this inset every child is reported too far up-left
                        // (Chromium's default 2px iframe border alone shifts a ref
                        // click off-target; thick-bordered embed/payment frames more).
                        const cs = (el.ownerDocument.defaultView || window).getComputedStyle(el);
                        insetX = (parseFloat(cs.borderLeftWidth) || 0) + (parseFloat(cs.paddingLeft) || 0);
                        insetY = (parseFloat(cs.borderTopWidth) || 0) + (parseFloat(cs.paddingTop) || 0);
                        doc = el.contentDocument;
                    } catch (e) { doc = null; }
                    if (doc && doc.documentElement) {
                        walk(doc.documentElement, offX + frameRect.x + insetX, offY + frameRect.y + insetY, depth + 1, frameInert);
                    }
                    return;
                }

                processEl(el, offX, offY, inheritedInert);

                // shadow root(open 模式):递归进入。坐标系与宿主一致,偏移不变。
                // inert 同样跨 shadow 边界继承:closest 不穿透 shadow root,故把
                // 宿主自身/继承来的 inert 状态算好后传入。
                if (el.shadowRoot) {
                    walk(el.shadowRoot, offX, offY, depth + 1, isInertEl(el, inheritedInert));
                }
            });
        }

        walk(document.documentElement, 0, 0, 0, false);

        // 6. 最终去重 (防止某些前端库生成多个不可见的克隆 DOM)
        const uniqueElements = [];
        const seen = new Set();
        elements.forEach(el => {
            const dedupKeys = Object.keys(el).filter(k => k !== 'ref').sort();
            const key = JSON.stringify(el, dedupKeys);
            if (!seen.has(key)) {
                seen.add(key);
                uniqueElements.push(el);
            }
        });

        // 7. 重复同名控件消歧。N 行同名按钮(每行一个 "Delete")压缩后 text 全相同,
        // codegen 只能 get_by_text('Delete').first —— 永远点第一行,持久化测试说谎。
        // 这里给「有歧义」的控件补两样东西:scope(所在行的标识文本,如 "Bob Jones")
        // 与 dup_index(DOM 序里的第几个),让 codegen 生成作用域定位器。仅对真正
        // 碰撞(≥2)的控件补,非歧义页 0 额外 token。
        // 碰撞键 = (role, accessible-name),与 codegen 的 _fallback_strategy 同口径。
        // 角色推断必须与 codegen 的 _infer_web_role 同口径,否则分组会漏判:
        // 比如 <input type=submit value=X> 与 <button>X</button> 都渲染成
        // get_by_role('button', name='X'),分组键若只看 tag 会把它们分到两组而漏掉碰撞。
        function roleOf(nd) {
            const tag = nd.class;
            const t = (nd.type || '').toLowerCase();
            if (tag === 'a') return 'link';
            if (tag === 'button' || t === 'submit' || t === 'button' || t === 'reset') return 'button';
            if (t === 'checkbox') return 'checkbox';
            if (t === 'radio') return 'radio';
            if (tag === 'select') return 'combobox';
            if (tag === 'textarea' || (tag === 'input' &&
                ['', 'text', 'email', 'search', 'url', 'tel', 'password'].includes(t))) return 'textbox';
            return tag;
        }
        function nameKeyOf(nd) {
            const accName = (nd.desc || nd.text || '').trim();
            if (!accName) return null;
            return roleOf(nd) + '\\u0000' + accName;
        }
        // 收集每个碰撞键的成员(仅可点击控件——不可点文本不会被 LLM 当点击目标)。
        const groups = new Map();
        uniqueElements.forEach(nd => {
            if (!nd.clickable) return;
            const k = nameKeyOf(nd);
            if (!k) return;
            if (!groups.has(k)) groups.set(k, []);
            groups.get(k).push(nd);
        });
        groups.forEach(members => {
            if (members.length < 2) return; // 唯一 → 不补,省 token
            const groupEls = members.map(m => rawElOf.get(m)).filter(Boolean);
            // 先各自算候选 scope(所在行的唯一标识 label)。
            members.forEach((nd, i) => {
                nd.dup_index = i; // DOM 序(uniqueElements 保持遍历顺序)
                const el = rawElOf.get(nd);
                nd._scopeCand = el ? computeScope(el, groupEls, nd) : '';
            });
            // 组内唯一性校验:若某 scope 被多行共用(行标识相同)或为空,则它无法消歧,
            // 不写 scope(该成员只保留 dup_index → codegen 走诚实 skip,绝不持久化会必然
            // 失败的定位器)。只有「组内唯一且非空」的 scope 才采纳。
            const counts = {};
            members.forEach(nd => { const s = nd._scopeCand; if (s) counts[s] = (counts[s] || 0) + 1; });
            members.forEach(nd => {
                if (nd._scopeCand && counts[nd._scopeCand] === 1) nd.scope = nd._scopeCand;
                delete nd._scopeCand;
            });
        });

        // 找「行根」(子树只含本组这一个成员的最高祖先),再在行内取一个干净的叶子 label
        // 作为 scope —— 叶子的 textContent 可被 get_by_text(exact=True) 精确命中,避免
        // 子串误匹配("Bob" 命中 "Bob Jones")。取最长的叶子文本(最具体的行标识)。
        function computeScope(el, groupEls, nd) {
            const ownName = (nd.desc || nd.text || '').trim();
            let rowRoot = null;
            let cur = el.parentElement;
            while (cur && cur.tagName && cur.tagName.toLowerCase() !== 'body') {
                let containsOther = false;
                for (const other of groupEls) {
                    if (other !== el && cur.contains(other)) { containsOther = true; break; }
                }
                if (containsOther) break; // 再往上就跨行了
                rowRoot = cur;
                cur = cur.parentElement;
            }
            if (!rowRoot) return '';
            let best = '';
            let nodes;
            try { nodes = rowRoot.querySelectorAll('*'); } catch (e) { return ''; }
            nodes.forEach(d => {
                if (el.contains(d)) return; // 跳过控件自身及其内部(它的 label)
                if (d.childElementCount > 0) return; // 只取叶子 → 文本可被 exact 命中
                let t = '';
                try { t = (d.textContent || '').trim().replace(/\\s+/g, ' '); } catch (e) { return; }
                if (!t || t === ownName) return;
                // 上限 80:scope 要用 get_by_text(exact=True) 精确命中,绝不能截断
                //(截断后 exact 永远匹配不上)。超长叶子直接不作为候选 —— 该行宁可
                // 走诚实 skip,也不持久化一个脆弱/必失败的定位器。
                if (t.length > 80) return;
                if (t.length > best.length) best = t; // 最长叶子 = 最具体的行标识
            });
            return best;
        }

        return JSON.stringify({"ui_elements": uniqueElements});
    }
    """
    try:
        # Run the script in the page's browser context and hand back its result.
        ui_json_str = page.evaluate(js_script)
        return ui_json_str
    except Exception as e:
        # Best-effort: any evaluation failure degrades to an empty element list
        # (reported on stdout) rather than propagating to the caller.
        print(f"[Warning] Failed to extract Web DOM: {e}")
        return '{"ui_elements": []}'
|
utils/utils_xml.py
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
try:
|
|
2
|
+
import defusedxml.ElementTree as ET
|
|
3
|
+
except ModuleNotFoundError:
|
|
4
|
+
import xml.etree.ElementTree as ET
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
# Whole-string match for text made up only of currency symbols ($ ¥ € £),
# digits and the punctuation . , + - % (e.g. "12", "$9.99", "-5%").
# Used by _should_filter_by_text to recognise short numeric noise.
_PATTERN_NOISE = re.compile(r'^[\$\¥\€\£\d\.\,\+\-\%]+$')
# Trailing "_" followed by exactly eight lowercase-hex characters; stripped
# from the bare id name by _short_resource_id.
_PATTERN_HASH_SUFFIX = re.compile(r'_[a-f0-9]{8}$')
|
|
10
|
+
|
|
11
|
+
def _should_filter_by_text(text: str, clickable: bool) -> bool:
|
|
12
|
+
if clickable:
|
|
13
|
+
return False
|
|
14
|
+
if len(text) <= 5 and _PATTERN_NOISE.match(text):
|
|
15
|
+
return True
|
|
16
|
+
return False
|
|
17
|
+
|
|
18
|
+
def _should_filter_by_id(res_id: str) -> bool:
|
|
19
|
+
if not res_id:
|
|
20
|
+
return False
|
|
21
|
+
return "com.android.systemui" in res_id
|
|
22
|
+
|
|
23
|
+
def _should_filter_by_desc(desc: str) -> bool:
|
|
24
|
+
if not desc:
|
|
25
|
+
return False
|
|
26
|
+
if "OpenVPN" in desc or "VoLTE" in desc:
|
|
27
|
+
return True
|
|
28
|
+
if len(desc) > 30 and "0, 1, 2" in desc:
|
|
29
|
+
return True
|
|
30
|
+
return False
|
|
31
|
+
|
|
32
|
+
def _short_resource_id(res_id: str) -> str:
    """Return the bare id name: the text after the last ``/``, hash suffix removed.

    Intended for display / token economy only (the optional ``id_short``
    hint). It is not a locator value: ``compress_android_xml`` emits the full
    resource id under ``id`` alongside this shortened form.
    """
    # rpartition leaves the whole string in the last slot when no "/" exists.
    _prefix, _slash, bare_name = res_id.rpartition("/")
    return _PATTERN_HASH_SUFFIX.sub('', bare_name)
|
|
43
|
+
|
|
44
|
+
def _node_label(node) -> str:
|
|
45
|
+
"""A node's own label (text, falling back to content-desc), stripped."""
|
|
46
|
+
return node.attrib.get("text", "").strip() or node.attrib.get("content-desc", "").strip()
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _is_filtered_node(node) -> bool:
    """Tell whether the id / content-desc filters reject this node.

    Only ``_should_filter_by_id`` and ``_should_filter_by_desc`` are
    consulted; the numeric-noise text filter is deliberately not part of this
    check. Row promotion uses the result so that it never relies on a label
    node that the emit loop in ``compress_android_xml`` would skip.
    """
    attrs = node.attrib
    if _should_filter_by_id(attrs.get("resource-id", "").strip()):
        return True
    return _should_filter_by_desc(attrs.get("content-desc", "").strip())
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _emittable_own_label(node) -> bool:
    """Return True when the node has a label of its own and is not filtered.

    Such a node is already a labeled control and needs no promotion. A node
    whose own label exists but is rejected by the id/desc filters counts as
    label-less here, so a child label can still be promoted for it.
    """
    if not _node_label(node):
        return False
    return not _is_filtered_node(node)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _scope_label_descendants(container) -> list:
    """Collect the labeled, unfiltered descendants of ``container`` in document order.

    The walk never enters a descendant marked ``clickable="true"``: that node
    and its whole subtree are skipped, so a nested clickable keeps its own
    labels. Descendants rejected by the id/desc filters are not collected,
    but their children are still visited. ``container`` itself is never part
    of the result.
    """
    found: list = []
    # Explicit-stack pre-order traversal; children are pushed reversed so
    # they are popped (visited) in document order.
    pending = list(reversed(list(container)))
    while pending:
        current = pending.pop()
        if current.attrib.get("clickable") == "true":
            continue  # nested clickable owns its own subtree's labels
        if _node_label(current) and not _is_filtered_node(current):
            found.append(current)
        pending.extend(reversed(list(current)))
    return found
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _promotable_label(container):
    """Pick the label node to promote for a label-less clickable container.

    Among the candidates from ``_scope_label_descendants``, the first one
    whose resource id ends with ``/title`` wins; otherwise the first candidate
    in document order is used. This keeps a status line rendered ahead of the
    title from becoming the row's label.

    Returns:
        The chosen label node, or None when the container has no usable
        label (nothing labeled, or every labeled descendant is filtered).
    """
    candidates = _scope_label_descendants(container)
    titled = next(
        (
            cand
            for cand in candidates
            if cand.attrib.get("resource-id", "").strip().endswith("/title")
        ),
        None,
    )
    # Compare against None explicitly: a childless Element is falsy.
    if titled is not None:
        return titled
    return candidates[0] if candidates else None
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _compute_row_promotions(root):
    """Work out which row labels to promote and which containers to suppress.

    Targets the list-row shape where a clickable container has no label of
    its own and the text sits in a non-clickable descendant. For each such
    container the label chosen by ``_promotable_label`` is recorded for
    promotion to clickable and the container is recorded for suppression, so
    the emitted row is a single element that is both labeled and clickable.
    (The original author notes that tapping the label reaches the clickable
    ancestor.)

    Containers that are left alone:
      - not ``clickable="true"``, or explicitly ``enabled="false"``;
      - already carrying an emittable label of their own;
      - without any promotable label (icon-only, or only filtered labels) —
        these stay in the output as unlabeled clickables.
    Label lookup does not cross into nested clickables (see
    ``_scope_label_descendants``).

    Returns:
        ``(promote_ids, suppress_ids)``: two sets of ``id(node)`` values. The
        keys are only meaningful while ``root`` and its elements stay alive,
        so the tree must not be re-parsed between computing and applying them.
    """
    to_promote = set()
    to_suppress = set()

    for node in root.iter():
        attrs = node.attrib
        effectively_clickable = (
            attrs.get("clickable") == "true" and attrs.get("enabled") != "false"
        )
        if not effectively_clickable or _emittable_own_label(node):
            continue

        label_node = _promotable_label(node)
        if label_node is not None:
            to_promote.add(id(label_node))
            to_suppress.add(id(node))

    # Defensive: a node selected for promotion is never suppressed as well.
    return to_promote, to_suppress - to_promote
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def compress_android_xml(raw_xml: str) -> str:
    """Compress an Android UI hierarchy XML dump into a compact JSON string.

    Every node of the parsed tree is visited once. A node is emitted when it
    survives the id / content-desc / numeric-noise filters and has text, a
    content description, or is clickable or disabled. Row labels chosen by
    ``_compute_row_promotions`` are reported as clickable, and the containers
    they replace are skipped.

    Args:
        raw_xml: The XML document as a string.

    Returns:
        A JSON object string ``{"ui_elements": [...]}``. Each element has
        ``class`` and, when applicable, ``text``, ``desc``, ``clickable``,
        ``disabled``, ``id`` (the full resource id) and ``id_short``. If the
        XML cannot be parsed, a warning is printed and
        ``'{"ui_elements": []}'`` is returned.
    """
    try:
        root = ET.fromstring(raw_xml)
    except ET.ParseError as e:
        raw_preview = raw_xml[:200] if raw_xml else "(empty)"
        print(f"[Warning] XML parse failed: {e}, first 200 chars: {raw_preview}")
        return '{"ui_elements": []}'

    promote_ids, suppress_ids = _compute_row_promotions(root)
    elements = []

    for node in root.iter():
        node_key = id(node)
        if node_key in suppress_ids:
            # The promoted label child represents this row instead.
            continue

        attrs = node.attrib
        res_id = attrs.get("resource-id", "").strip()
        desc = attrs.get("content-desc", "").strip()
        if _should_filter_by_id(res_id) or _should_filter_by_desc(desc):
            continue

        text = attrs.get("text", "").strip()
        # Only an explicit enabled="false" marks a node as disabled.
        disabled = attrs.get("enabled") == "false"
        tappable = attrs.get("clickable") == "true" or node_key in promote_ids
        # A disabled node is never reported clickable, but it is still emitted
        # (with disabled=True) so its state remains visible.
        clickable = tappable and not disabled

        # Passing `clickable or disabled` keeps the numeric-noise filter from
        # dropping a disabled control whose text happens to be a short number.
        if _should_filter_by_text(text, clickable or disabled):
            continue
        if not (text or desc or clickable or disabled):
            continue

        el_info = {"class": attrs.get("class", "").split(".")[-1]}
        for key, value in (("text", text), ("desc", desc)):
            if value:
                el_info[key] = value
        for key, flag in (("clickable", clickable), ("disabled", disabled)):
            if flag:
                el_info[key] = True

        if res_id:
            # The full resource id is emitted under "id"; the shortened name
            # is only an extra hint and is added when it actually differs.
            el_info["id"] = res_id
            short = _short_resource_id(res_id)
            if short and short != res_id:
                el_info["id_short"] = short

        elements.append(el_info)

    return json.dumps({"ui_elements": elements}, ensure_ascii=False)
|