dp-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,671 @@
1
+ # -*- coding:utf-8 -*-
2
+ """
3
+ a11y tree — 浏览器原生无障碍树获取与渲染
4
+
5
+ 通过 Chrome DevTools Protocol Accessibility API 获取完整的 a11y tree,
6
+ 为 AI 提供全面的页面结构理解。
7
+
8
+ 核心流程:
9
+ 1. CDP getFullAXTree 获取扁平节点列表
10
+ 2. _normalize_node() 解包 AXValue 对象
11
+ 3. _build_tree() 构建嵌套树
12
+ 4. 为交互节点生成 DrissionPage 定位器
13
+ 5. render_a11y_text() 渲染为可读文本
14
+ 6. CDP 失败时降级到 JS fallback
15
+ """
16
+ from .utils import suggest_locator
17
+
18
# Interactive roles: nodes with one of these roles get a locator generated.
_INTERACTIVE_ROLES = frozenset((
    'button', 'link', 'textbox', 'combobox',
    'checkbox', 'radio', 'slider', 'spinbutton',
    'tab', 'menuitem', 'searchbox', 'switch',
    'option', 'menuitemcheckbox', 'menuitemradio', 'treeitem',
))

# Semantic roles: non-interactive roles that are kept when rendering.
_SEMANTIC_ROLES = frozenset((
    'heading', 'list', 'listitem', 'article', 'navigation', 'main',
    'banner', 'contentinfo', 'complementary', 'form', 'search',
    'region', 'table', 'row', 'cell', 'columnheader', 'rowheader',
    'img', 'figure', 'alert', 'dialog', 'status', 'progressbar',
    'separator', 'toolbar', 'tablist', 'tabpanel', 'tree', 'treegrid',
    'grid', 'gridcell', 'group', 'document', 'application',
))

# Pure-text roles: the parent's name usually already contains their content,
# so the renderers treat them specially instead of printing the role.
_TEXT_ROLES = frozenset(('StaticText', 'InlineTextBox'))

# Leaf-level content roles: they hold text directly, so when they have no
# name of their own the text is gathered from their StaticText children.
_CONTENT_ROLES = frozenset((
    'paragraph', 'code', 'heading', 'listitem', 'cell', 'columnheader',
    'rowheader', 'definition', 'term', 'caption', 'blockquote', 'LabelText',
    'legend', 'LineBreak',
))

# Content roles that receive a ref number (meaningful content blocks that
# can be referenced by number for extraction).
_REF_CONTENT_ROLES = frozenset((
    'heading', 'paragraph', 'code', 'blockquote', 'article', 'figure',
))
51
+
52
+
53
def take_a11y_snapshot(page, selector=None, max_depth=None) -> dict:
    """
    Capture the page's accessibility (a11y) tree.

    The CDP path is tried first. If anything in it raises, the JS fallback
    script ``_JS_A11Y_FALLBACK`` is run instead. If that also raises, or does
    not return a dict, a result with ``method == 'failed'`` is returned.

    :param page: page object exposing ``wait.doc_loaded()``, ``url``,
        ``title``, ``run_cdp`` and ``run_js`` (presumably a DrissionPage
        ChromiumPage — confirm against callers)
    :param selector: optional CSS selector restricting the result to a subtree
    :param max_depth: optional maximum depth, forwarded to the CDP call
    :return: dict with keys ``page``, ``mode``, ``method``, ``tree`` and
        ``stats``, plus ``warning`` and/or ``error`` when applicable
    """
    page.wait.doc_loaded()
    page_info = {'url': page.url, 'title': page.title}

    # ── Primary path: CDP ──
    try:
        flat_nodes = _get_full_tree_cdp(page, max_depth)
        normalized = [_normalize_node(n) for n in flat_nodes]
        tree = _build_tree(normalized)

        # When a selector is given, narrow the result to the matching subtree.
        selector_warning = None
        if selector:
            tree, selector_warning = _find_subtree_by_selector(
                page, tree, normalized, selector)

        # Stats are computed over every normalized node, not just the subtree.
        stats = _compute_stats(normalized)

        # Generate locators for interactive nodes and referencable content
        # nodes in one batch; the nodes are mutated in place.
        need_locator = [n for n in normalized
                        if n.get('backendNodeId') and (
                            n['role'] in _INTERACTIVE_ROLES or
                            n['role'] in _REF_CONTENT_ROLES)]
        _generate_locators_batch(page, need_locator)

        result = {
            'page': page_info,
            'mode': 'a11y',
            'method': 'cdp',
            'tree': tree,
            'stats': stats,
        }
        if selector_warning:
            result['warning'] = selector_warning
        return result
    except Exception as cdp_err:
        # The name bound by ``except ... as`` is deleted when this clause
        # ends, so keep the message in a variable that outlives it.
        cdp_error_msg = str(cdp_err)

    # ── CDP failed: degrade to the JS fallback ──
    try:
        from .js_scripts import _JS_A11Y_FALLBACK
        raw = page.run_js(_JS_A11Y_FALLBACK)
        if isinstance(raw, dict):
            tree = raw.get('tree', {})
            stats = raw.get('stats', {})
            return {
                'page': page_info,
                'mode': 'a11y',
                'method': 'js_fallback',
                'tree': tree,
                'stats': stats,
                'warning': f'CDP 不可用,已降级到 JS fallback (CDP: {cdp_error_msg})',
            }
    except Exception:
        # Deliberately best-effort: a failing fallback falls through to the
        # uniform "failed" result below, which reports the CDP error.
        pass

    # ── Everything failed ──
    return {
        'page': page_info,
        'mode': 'a11y',
        'method': 'failed',
        'tree': {},
        'stats': {'total': 0, 'ignored': 0, 'interactive': 0},
        'error': f'a11y tree 获取失败 (CDP: {cdp_error_msg})',
    }
125
+
126
+
127
def render_a11y_text(snapshot: dict, verbose: bool = False,
                     brief: bool = False, refs: dict = None) -> str:
    """
    Render a11y tree data as text readable by humans and AI.

    :param snapshot: data returned by take_a11y_snapshot
    :param verbose: when True, ignored nodes and descriptions are shown
    :param brief: when True, content text is truncated to save tokens
    :param refs: optional dict that is filled in place with
        {ref_id: {locator, role, name, backendNodeId}}
    :return: the formatted text
    """
    page_info = snapshot.get('page', {})
    stats = snapshot.get('stats', {})

    # Rendering context: running ref counter plus the ref mapping. When the
    # caller supplied a dict, it is used directly so it is filled in place.
    ctx = {'counter': 0, 'refs': refs if refs is not None else {}}

    mode_label = 'full'
    if brief:
        mode_label = 'brief'

    lines = [
        f'### Page Snapshot ({mode_label})',
        f"- URL: {page_info.get('url', '')}",
        f"- Title: {page_info.get('title', '')}",
        '',  # stats line, filled in once the ref count is known
    ]
    stats_idx = len(lines) - 1
    node_counts = (f"- Nodes: {stats.get('total', 0)} total, "
                   f"{stats.get('interactive', 0)} interactive")

    if brief:
        lines.append('- Note: 内容已精简,如需完整文本请用 --mode full 或 --selector')
    lines.append('')

    warning = snapshot.get('warning')
    if warning:
        lines.append(f"⚠ {snapshot['warning']}")

    if snapshot.get('error'):
        # On error there is no tree to render: emit the header and stop.
        lines.append(f"⚠ {snapshot['error']}")
        lines[stats_idx] = node_counts
        return '\n'.join(lines)

    tree = snapshot.get('tree', {})
    if not tree:
        lines.append('(a11y tree 为空)')
    else:
        _render_node(tree, lines, depth=0, verbose=verbose, brief=brief,
                     ctx=ctx)

    # Back-fill the stats line now that the number of refs is known.
    ref_count = ctx['counter']
    summary = f"{node_counts}, {ref_count} refs"
    if ref_count > 0:
        summary += ' — 使用 ref:N 引用元素,如 dp click "ref:1"'
    lines[stats_idx] = summary

    # Keep the caller's dict in sync with the collected refs.
    if refs is not None:
        refs.update(ctx['refs'])

    return '\n'.join(lines)
187
+
188
+
189
def render_a11y_plain_text(snapshot: dict, refs: dict = None) -> str:
    """
    Flatten the a11y tree into plain text, in reading order.

    :param snapshot: data returned by take_a11y_snapshot
    :param refs: optional dict that is filled in place with the ref mapping,
        so a separate render_a11y_text call is not needed to obtain it
    :return: the plain-text string ('' when the snapshot has no tree)
    """
    root = snapshot.get('tree', {})
    if not root:
        return ''

    # Collect refs in a separate lightweight pass when the caller wants them.
    if refs is not None:
        ref_ctx = {'counter': 0, 'refs': refs}
        _collect_refs_only(root, ref_ctx)
        refs.update(ref_ctx['refs'])

    chunks = []
    _collect_plain_text(root, chunks)
    return '\n'.join(chunks)
210
+
211
+
212
+ # ── CDP 获取函数 ──────────────────────────────────────────────────────────────
213
+
214
+
215
def _get_full_tree_cdp(page, max_depth=None) -> list:
    """Fetch the flat a11y node list via CDP ``Accessibility.getFullAXTree``.

    :param page: object exposing ``run_cdp(method, **params)``
    :param max_depth: when not None, sent to CDP as the ``depth`` parameter
    :return: the ``nodes`` list of the response, or [] when it is absent
    """
    params = {} if max_depth is None else {'depth': max_depth}
    response = page.run_cdp('Accessibility.getFullAXTree', **params)
    return response.get('nodes', [])
222
+
223
+
224
def _find_subtree_by_selector(page, tree: dict, all_nodes: list,
                              selector: str) -> tuple:
    """Locate the subtree of an already-built a11y tree matching ``selector``.

    The selector is resolved to a backendNodeId through CDP
    (DOM.getDocument, DOM.querySelector, DOM.describeNode), and the tree is
    then searched depth-first for a node carrying that id.

    :param page: object exposing ``run_cdp(method, **params)``
    :param tree: root node of the nested a11y tree
    :param all_nodes: flat node list (not used by this function)
    :param selector: CSS selector to resolve
    :return: (subtree, warning) — the matching subtree with warning None, or
        the full tree together with a message explaining the failure
    """
    # Step 1: resolve the selector to a backendNodeId.
    try:
        root_node_id = page.run_cdp('DOM.getDocument')['root']['nodeId']
        matched = page.run_cdp('DOM.querySelector',
                               nodeId=root_node_id, selector=selector)
        matched_id = matched.get('nodeId')
        if not matched_id:
            return tree, f'--selector "{selector}" 未匹配到元素,已返回完整页面快照'
        described = page.run_cdp('DOM.describeNode', nodeId=matched_id)
        target_bid = described['node']['backendNodeId']
    except Exception:
        return tree, f'--selector "{selector}" 查询失败,已返回完整页面快照'

    # Step 2: pre-order search of the a11y tree for that backendNodeId.
    def _search(current):
        if current.get('backendNodeId') == target_bid:
            return current
        for kid in current.get('children', []):
            hit = _search(kid)
            if hit:
                return hit
        return None

    located = _search(tree)
    if not located:
        return tree, f'--selector "{selector}" 在 a11y tree 中未找到对应节点,已返回完整页面快照'
    return located, None
258
+
259
+
260
+ # ── 数据标准化 ────────────────────────────────────────────────────────────────
261
+
262
+
263
def _normalize_node(raw: dict) -> dict:
    """Unwrap a raw CDP AX node into a dict of plain values.

    AXValue wrappers (role, name, description, value, ignored reasons and
    property values) are reduced through ``_ax_value``. ``locator`` and
    ``children`` are placeholders filled in by later steps.

    :param raw: one node as returned in the CDP ``nodes`` list
    :return: the normalized node dict
    """
    normalized = {'nodeId': raw.get('nodeId', '')}
    for field in ('role', 'name', 'description', 'value'):
        normalized[field] = _ax_value(raw.get(field))

    normalized['ignored'] = raw.get('ignored', False)
    # A missing or empty reasons list both normalize to [].
    normalized['ignoredReasons'] = [
        _ax_value(reason) for reason in (raw.get('ignoredReasons') or [])
    ]

    props = {}
    for prop in raw.get('properties', []):
        props[prop['name']] = _ax_value(prop.get('value'))
    normalized['properties'] = props

    normalized['childIds'] = raw.get('childIds', [])
    normalized['parentId'] = raw.get('parentId')
    # CDP calls this field backendDOMNodeId; it is stored under a shorter key.
    normalized['backendNodeId'] = raw.get('backendDOMNodeId')
    normalized['frameId'] = raw.get('frameId')

    # Filled in later (locator generation / tree building).
    normalized['locator'] = None
    normalized['children'] = []
    return normalized
287
+
288
+
289
def _ax_value(v) -> object:
    """Extract the plain value from a CDP AXValue object.

    :param v: an AXValue dict, an already-plain value, or None
    :return: '' for None; the dict's ``value`` entry ('' when absent) for a
        dict; otherwise ``v`` unchanged
    """
    # The return annotation is ``object`` because the previous ``any`` named
    # the builtin function, not a type.
    if v is None:
        return ''
    if isinstance(v, dict):
        return v.get('value', '')
    return v
296
+
297
+
298
+ # ── 构建树 ────────────────────────────────────────────────────────────────────
299
+
300
+
301
def _build_tree(flat_nodes: list) -> dict:
    """Assemble the flat node list into a nested tree via childIds/parentId.

    Each node's ``children`` list is replaced in place with the node dicts
    named by its ``childIds``; ids that are not in the list are dropped.

    :param flat_nodes: normalized nodes, each carrying a ``nodeId``
    :return: the first root node (no parentId, or a parentId that is not in
        the list); the first node when no root exists; {} for empty input
    """
    if not flat_nodes:
        return {}

    by_id = {entry['nodeId']: entry for entry in flat_nodes}

    for entry in flat_nodes:
        linked = (by_id.get(child_id) for child_id in entry.get('childIds', []))
        entry['children'] = [kid for kid in linked if kid]

    # The first node without a resolvable parent is the root.
    for entry in flat_nodes:
        parent_id = entry.get('parentId')
        if not parent_id or parent_id not in by_id:
            return entry

    return flat_nodes[0]
321
+
322
+
323
def _compute_stats(nodes: list) -> dict:
    """Compute summary counts over the normalized node list.

    :param nodes: normalized nodes; each must carry a ``role`` key
    :return: dict with ``total``, ``ignored`` and ``interactive`` counts
    """
    ignored_count = len([entry for entry in nodes if entry.get('ignored')])
    interactive_count = len(
        [entry for entry in nodes if entry['role'] in _INTERACTIVE_ROLES])
    return {
        'total': len(nodes),
        'ignored': ignored_count,
        'interactive': interactive_count,
    }
333
+
334
+
335
+ # ── 定位器生成 ────────────────────────────────────────────────────────────────
336
+
337
+
338
def _generate_locators_batch(page, interactive_nodes: list) -> None:
    """Generate locators for a batch of nodes, writing ``node['locator']``.

    The whole DOM is indexed once through ``_build_dom_bid_map`` and looked
    up in memory by backendNodeId. Only when that index is empty is each id
    queried individually through ``_get_dom_attrs``.

    :param page: page object handed to the DOM lookup helpers
    :param interactive_nodes: nodes to annotate; those without a
        backendNodeId are skipped
    """
    if not interactive_nodes:
        return

    # Group the nodes by backendNodeId so each id is resolved once.
    grouped = {}
    for entry in interactive_nodes:
        backend_id = entry.get('backendNodeId')
        if backend_id:
            grouped.setdefault(backend_id, []).append(entry)
    if not grouped:
        return

    dom_index = _build_dom_bid_map(page)

    for backend_id, group in grouped.items():
        if dom_index:
            # Fast path: in-memory lookup (no per-node fallback on a miss).
            dom_info = dom_index.get(backend_id)
        else:
            # Fallback: the bulk index is unavailable, query node by node.
            dom_info = _get_dom_attrs(page, backend_id)
        if not dom_info:
            continue
        label = (group[0].get('name') or '')[:50]
        locator = suggest_locator(dom_info['tag'], dom_info['attrs'], label)
        for entry in group:
            entry['locator'] = locator
377
+
378
+
379
def _build_dom_bid_map(page) -> dict:
    """Fetch the DOM tree in one call and index it by backendNodeId.

    :param page: object exposing ``run_cdp(method, **params)``
    :return: {backendNodeId: {'tag': ..., 'attrs': ...}}, or {} when
        fetching or walking the document raises
    """
    index = {}
    try:
        document = page.run_cdp('DOM.getDocument', depth=-1)
        _walk_dom_node(document.get('root', {}), index)
    except Exception:
        # Any failure yields an empty map so the caller can fall back.
        return {}
    return index
388
+
389
+
390
def _walk_dom_node(node: dict, bid_map: dict) -> None:
    """Recursively index DOM nodes as backendNodeId → {'tag', 'attrs'}.

    :param node: a DOM node dict; ``attributes`` is a flat
        [name, value, name, value, ...] list
    :param bid_map: mapping that is filled in place
    """
    backend_id = node.get('backendNodeId')
    if backend_id:
        flat = node.get('attributes', [])
        # Pair up names and values; a dangling trailing name is dropped.
        paired = {flat[i]: flat[i + 1] for i in range(0, len(flat) - 1, 2)}
        bid_map[backend_id] = {
            'tag': node.get('nodeName', '').lower(),
            'attrs': paired,
        }

    # Visit regular children first, then shadow roots, then an embedded
    # content document when the node has one.
    descendants = list(node.get('children', []))
    descendants.extend(node.get('shadowRoots', []))
    content_doc = node.get('contentDocument')
    if content_doc:
        descendants.append(content_doc)

    for sub in descendants:
        _walk_dom_node(sub, bid_map)
408
+
409
+
410
def _get_dom_attrs(page, backend_node_id: int) -> dict:
    """Fetch one DOM node's tag and attributes via CDP ``DOM.describeNode``.

    This is the per-node fallback used when the bulk DOM index is empty.

    :param page: object exposing ``run_cdp(method, **params)``
    :param backend_node_id: backendNodeId of the node to describe
    :return: {'tag': ..., 'attrs': {...}}, or {} when anything raises
    """
    try:
        described = page.run_cdp(
            'DOM.describeNode', backendNodeId=backend_node_id).get('node', {})
        # ``attributes`` is a flat [name, value, ...] list. Zipping a single
        # iterator with itself pairs consecutive items and drops a dangling
        # trailing name.
        flat = iter(described.get('attributes', []))
        pairs = dict(zip(flat, flat))
        tag = described.get('nodeName', '').lower()
    except Exception:
        return {}
    return {'tag': tag, 'attrs': pairs}
423
+
424
+
425
+ # ── 文本渲染 ──────────────────────────────────────────────────────────────────
426
+
427
+
428
def _render_node(node: dict, lines: list, depth: int = 0,
                 verbose: bool = False, parent_text: str = '',
                 brief: bool = False, ctx: dict = None) -> None:
    """Recursively render one a11y node, and its subtree, as text lines.

    :param node: normalized a11y node
    :param lines: output list; rendered lines are appended to it
    :param depth: indentation level of this node
    :param verbose: when True, ignored nodes and descriptions are rendered
    :param parent_text: text already shown by an ancestor, used to drop
        children that would only repeat it
    :param brief: when True, standalone text nodes are skipped and long
        content-role text is truncated
    :param ctx: rendering context {'counter': int, 'refs': dict} used to
        assign [N] ref numbers; None disables ref assignment
    """
    role = node.get('role', '')
    name = node.get('name', '')
    is_ignored = node.get('ignored', False)
    kids = node.get('children', [])

    def descend(level, inherited_text):
        # Render every child with the same flags and the given text context.
        for kid in kids:
            _render_node(kid, lines, level, verbose=verbose,
                         parent_text=inherited_text, brief=brief, ctx=ctx)

    # Ignored nodes are transparent unless verbose: their children are
    # rendered in their place at the same depth.
    if is_ignored and not verbose:
        descend(depth, parent_text)
        return

    # InlineTextBox is never rendered.
    if role == 'InlineTextBox':
        return

    # StaticText prints as a bare quoted string, unless it is empty, already
    # covered by the parent's text, or brief mode is on.
    if role == 'StaticText':
        stripped = name.strip()
        if stripped and not (parent_text and stripped in parent_text) and not brief:
            pad = '  ' * depth
            lines.append(f'{pad}- "{stripped}"')
        return

    # Unnamed leaf-level content roles borrow their text from their children.
    label = name
    if not label and role in _CONTENT_ROLES:
        label = _collect_text(node)

    # Unnamed generic/none containers are transparent.
    if not name and role in ('generic', 'none', ''):
        descend(depth, parent_text)
        return

    # A non-interactive wrapper whose text repeats the parent's and which
    # holds nothing but text nodes adds no information: drop it entirely.
    if (label and parent_text and label in parent_text
            and role not in _INTERACTIVE_ROLES):
        if all(kid.get('role', '') in _TEXT_ROLES for kid in kids):
            return

    # ── Decide whether this node receives a ref number ──
    locator = node.get('locator')
    ref_prefix = ''
    if ctx is not None:
        wants_ref = ((role in _INTERACTIVE_ROLES and locator)
                     or (role in _REF_CONTENT_ROLES and label))
        if wants_ref:
            ctx['counter'] += 1
            number = ctx['counter']
            ref_prefix = f'[{number}] '
            ctx['refs'][str(number)] = {
                'locator': locator,
                'role': role,
                'name': (label or name or '')[:100],
                'backendNodeId': node.get('backendNodeId'),
            }

    # ── Assemble the line ──
    segments = []
    if role:
        segments.append(role)

    # Name/text; brief mode truncates long content-role text.
    shown = label
    if brief and shown and role in _CONTENT_ROLES and len(shown) > 80:
        shown = shown[:80] + '...'
    if shown:
        segments.append(f'"{shown}"')

    # Key state properties: True prints as a bare flag, other non-empty
    # values as key=value; None, '' and False are omitted.
    props = node.get('properties', {})
    flags = []
    for key in ('checked', 'expanded', 'selected', 'disabled', 'required',
                'level', 'pressed', 'valuetext'):
        val = props.get(key)
        if val is None or val == '' or val is False:
            continue
        flags.append(key if val is True else f'{key}={val}')
    if flags:
        segments.append(f"[{', '.join(flags)}]")

    # Current value, for input-like roles only.
    current = node.get('value', '')
    if current and role in ('textbox', 'combobox', 'slider', 'spinbutton', 'searchbox'):
        segments.append(f'value="{current}"')

    if locator:
        segments.append(f'→ {locator}')

    if is_ignored:
        segments.append('[ignored]')

    description = node.get('description', '')
    if description and verbose:
        segments.append(f'desc="{description}"')

    if segments:
        pad = '  ' * depth
        lines.append(f"{pad}- {ref_prefix}{' '.join(segments)}")

    # Children inherit this node's text (or the parent's when it has none).
    descend(depth + 1, label or parent_text)
562
+
563
+
564
def _collect_text(node: dict, _depth: int = 0) -> str:
    """Collect the visible text found directly beneath a node.

    Named children (text nodes as well as named elements such as links)
    contribute their name. Unnamed generic/none containers are descended
    into. Recursion stops once ``_depth`` exceeds 10; any truncation for
    display is left to the rendering layer.

    :param node: node whose ``children`` are inspected
    :param _depth: current recursion depth (internal)
    :return: the pieces concatenated without a separator and stripped, or ''
    """
    if _depth > 10:
        return ''

    pieces = []
    for kid in node.get('children', []):
        kid_role = kid.get('role', '')
        kid_name = kid.get('name', '')
        if kid_name:
            # Any named child contributes its name, text node or not.
            pieces.append(kid_name)
        elif kid_role in _TEXT_ROLES:
            # Unnamed text node: nothing to add and nothing to descend into.
            continue
        elif kid_role in ('generic', 'none', ''):
            # Unnamed container: look through it for nested text.
            nested = _collect_text(kid, _depth + 1)
            if nested:
                pieces.append(nested)

    return ''.join(pieces).strip()
590
+
591
+
592
+ # ── 纯文本渲染 ──────────────────────────────────────────────────────────────────────
593
+
594
+
595
# Block-level roles: the plain-text renderer inserts a blank separator entry
# after the text collected from a node with one of these roles.
_BLOCK_ROLES = frozenset((
    'paragraph', 'heading', 'listitem', 'code',
    'blockquote', 'figure', 'separator', 'article',
    'main', 'banner', 'contentinfo', 'navigation',
    'complementary', 'search', 'region', 'form',
))
601
+
602
+
603
def _collect_plain_text(node: dict, parts: list) -> None:
    """Recursively collect a node's visible text, in reading order.

    :param node: a11y node to walk
    :param parts: output list; text fragments are appended to it, and an
        empty string is appended after a block-level node as a separator
    """
    role = node.get('role', '')
    children = node.get('children', [])

    # Ignored nodes are transparent: only their children are visited.
    if node.get('ignored', False):
        for child in children:
            _collect_plain_text(child, parts)
        return

    # InlineTextBox is skipped. This check must come before the _TEXT_ROLES
    # check below, because that set contains InlineTextBox and would
    # otherwise emit its text.
    if role == 'InlineTextBox':
        return

    if role in _TEXT_ROLES:
        text = node.get('name', '').strip()
        if text:
            parts.append(text)
        return

    for child in children:
        _collect_plain_text(child, parts)

    # Block-level node: add a blank separator, avoiding consecutive blanks.
    if role in _BLOCK_ROLES and parts and parts[-1] != '':
        parts.append('')
631
+
632
+
633
def _collect_refs_only(node: dict, ctx: dict, parent_text: str = '') -> None:
    """Walk the tree and assign ref numbers without rendering any output.

    The walk applies the same skip rules as ``_render_node`` in its default
    (non-verbose) mode, so the resulting numbering matches the text
    rendering: ignored nodes and unnamed generic/none containers are
    transparent, text nodes are skipped, and a non-interactive wrapper whose
    text merely repeats its parent's is dropped together with its subtree.

    :param node: a11y node to walk
    :param ctx: {'counter': int, 'refs': dict}; updated in place
    :param parent_text: text already carried by an ancestor, used for the
        duplicate-wrapper check (defaults to '' for the root call)
    """
    role = node.get('role', '')
    name = node.get('name', '')
    children = node.get('children', [])
    loc = node.get('locator')

    # Ignored nodes are transparent.
    if node.get('ignored', False):
        for child in children:
            _collect_refs_only(child, ctx, parent_text)
        return

    if role in ('InlineTextBox', 'StaticText'):
        return

    # Unnamed leaf-level content roles borrow their text from their children.
    display_name = name
    if not display_name and role in _CONTENT_ROLES:
        display_name = _collect_text(node)

    # Unnamed generic/none containers are transparent.
    if role in ('generic', 'none', '') and not name:
        for child in children:
            _collect_refs_only(child, ctx, parent_text)
        return

    # A non-interactive wrapper that only repeats the parent's text is not
    # rendered, so it must not consume a ref number either.
    if (display_name and parent_text and display_name in parent_text
            and role not in _INTERACTIVE_ROLES):
        if all(c.get('role', '') in _TEXT_ROLES for c in children):
            return

    if ((role in _INTERACTIVE_ROLES and loc)
            or (role in _REF_CONTENT_ROLES and display_name)):
        ctx['counter'] += 1
        ref_id = ctx['counter']
        ctx['refs'][str(ref_id)] = {
            'locator': loc,
            'role': role,
            'name': (display_name or name or '')[:100],
            'backendNodeId': node.get('backendNodeId'),
        }

    # Children inherit this node's text (or the parent's when it has none).
    text_ctx = display_name or parent_text
    for child in children:
        _collect_refs_only(child, ctx, text_ctx)