dp-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,158 @@
1
+ # -*- coding:utf-8 -*-
2
+ """
3
+ extract_structured / query_elements — 数据提取函数
4
+ """
5
+ from .utils import suggest_locator
6
+
7
+ _JS_CSS_PATH = """
8
+ var el = this;
9
+ var parts = [];
10
+ while (el && el !== document.body && el.nodeType === 1) {
11
+ var seg = el.tagName.toLowerCase();
12
+ if (el.id && /^[a-zA-Z][\\w-]*$/.test(el.id)) {
13
+ parts.unshift('#' + el.id);
14
+ break;
15
+ }
16
+ var classes = Array.from(el.classList)
17
+ .filter(function(c) { return c.length >= 3; });
18
+ if (classes.length > 0) {
19
+ seg = '.' + classes[0];
20
+ var siblings = el.parentElement
21
+ ? Array.from(el.parentElement.querySelectorAll(':scope > ' + seg))
22
+ : [];
23
+ if (siblings.length > 1) {
24
+ var idx = Array.from(el.parentElement.children).indexOf(el) + 1;
25
+ seg = seg + ':nth-child(' + idx + ')';
26
+ }
27
+ } else {
28
+ var allSiblings = el.parentElement
29
+ ? Array.from(el.parentElement.children).filter(function(c) { return c.tagName === el.tagName; })
30
+ : [];
31
+ if (allSiblings.length > 1) {
32
+ var idx2 = Array.from(el.parentElement.children).indexOf(el) + 1;
33
+ seg = seg + ':nth-child(' + idx2 + ')';
34
+ }
35
+ }
36
+ parts.unshift(seg);
37
+ el = el.parentElement;
38
+ }
39
+ return parts.join(' > ');
40
+ """
41
+
42
+ _JS_XPATH = """
43
+ var el = this;
44
+ var parts = [];
45
+ while (el && el.nodeType === 1) {
46
+ var seg = el.tagName.toLowerCase();
47
+ if (el.id && /^[a-zA-Z][\\w-]*$/.test(el.id)) {
48
+ parts.unshift('//' + seg + '[@id="' + el.id + '"]');
49
+ return parts.join('/');
50
+ }
51
+ var siblings = el.parentElement
52
+ ? Array.from(el.parentElement.children).filter(function(c) { return c.tagName === el.tagName; })
53
+ : [];
54
+ if (siblings.length > 1) {
55
+ var idx = siblings.indexOf(el) + 1;
56
+ seg = seg + '[' + idx + ']';
57
+ }
58
+ parts.unshift(seg);
59
+ el = el.parentElement;
60
+ }
61
+ return '/' + parts.join('/');
62
+ """
63
+
64
+
65
def extract_structured(page, container: str, fields: dict,
                       limit: int = 100) -> list:
    """
    Extract one record per container element.

    Containers are looked up with ``page.eles(container, timeout=5)``; if that
    call raises, ``page.s_eles(container)`` is used instead. An empty lookup
    yields an empty list.

    :param page: object exposing ``eles`` / ``s_eles`` element lookups
    :param container: locator of the repeating container, e.g. 'css:.job-card'
    :param fields: mapping of output key -> field spec. A spec is either a
        selector string or a dict with the keys ``selector``, ``multi``
        (collect every match into a list), ``attr`` (read this attribute
        instead of the stripped text) and ``default`` (value used when the
        element is missing, empty, or the lookup raises; '' if not given)
    :param limit: maximum number of containers to process
    :return: list of dicts, one per processed container
    """
    try:
        matched = page.eles(container, timeout=5)
    except Exception:
        matched = page.s_eles(container)
    if not matched:
        return []

    def read_value(node, attr_name):
        # Attribute value when an attribute is requested, stripped text otherwise.
        if attr_name:
            return node.attr(attr_name)
        return (node.raw_text or '').strip()

    rows = []
    for card in list(matched)[:limit]:
        row = {}
        for key, raw_spec in fields.items():
            # A bare string is shorthand for {'selector': <string>}.
            cfg = {'selector': raw_spec} if isinstance(raw_spec, str) else raw_spec
            selector = cfg.get('selector', '')
            want_many = cfg.get('multi', False)
            attr_name = cfg.get('attr', None)
            fallback = cfg.get('default', '')
            try:
                if want_many:
                    value = [read_value(hit, attr_name)
                             for hit in card.eles(selector)]
                else:
                    hit = card.ele(selector)
                    if hit and hit.__class__.__name__ != 'NoneElement':
                        value = read_value(hit, attr_name) or fallback
                    else:
                        value = fallback
            except Exception:
                # Any per-field failure degrades to the field's default.
                value = fallback
            row[key] = value
        rows.append(row)
    return rows
111
+
112
+
113
def query_elements(page, selector: str, fields: list,
                   limit: int = 200) -> list:
    """
    Query mode: collect the requested fields from every element matching
    ``selector``.

    Elements are looked up with ``page.eles(selector, timeout=5)``; if that
    call raises, ``page.s_eles(selector)`` is used instead.

    :param page: object exposing ``eles`` / ``s_eles`` element lookups
    :param selector: locator of the elements to inspect
    :param fields: names to collect. Recognised names are 'text', 'tag',
        'loc', 'css' / 'css_path' (both stored under the key 'css'), 'xpath',
        'html' and 'outer_html'; any other name is read from ``ele.attrs``
    :param limit: maximum number of elements to process
    :return: list of dicts, one per processed element; a field that cannot be
        read is stored as ''
    """
    try:
        matched = page.eles(selector, timeout=5)
    except Exception:
        matched = page.s_eles(selector)

    def js_locator(node, scheme):
        # Run the matching path script on the element; '' on any failure.
        try:
            script = _JS_CSS_PATH if scheme == 'css' else _JS_XPATH
            path = node.run_js(script)
            return f'{scheme}:{path}' if path else ''
        except Exception:
            return ''

    rows = []
    for node in list(matched)[:limit]:
        row = {}
        for f in fields:
            out_key = f
            try:
                if f == 'text':
                    value = (node.raw_text or '').strip()
                elif f == 'tag':
                    value = node.tag
                elif f == 'loc':
                    short_text = (node.raw_text or '').strip()[:50]
                    value = suggest_locator(node.tag, node.attrs, short_text)
                elif f in ('css', 'css_path'):
                    out_key = 'css'
                    value = js_locator(node, 'css')
                elif f == 'xpath':
                    value = js_locator(node, 'xpath')
                elif f == 'html':
                    value = node.inner_html or ''
                elif f == 'outer_html':
                    value = node.html or ''
                else:
                    # Unknown names are treated as attribute lookups.
                    raw = node.attrs.get(f, '') if hasattr(node, 'attrs') else ''
                    value = raw or ''
            except Exception:
                out_key, value = f, ''
            row[out_key] = value
        rows.append(row)
    return rows
@@ -0,0 +1,155 @@
1
+ # -*- coding:utf-8 -*-
2
+ """
3
+ 浏览器注入 JS 脚本。
4
+
5
+ _JS_A11Y_FALLBACK — 当 CDP Accessibility API 不可用时,通过 DOM 遍历模拟 a11y tree。
6
+ """
7
+
8
+ # ── A11y Tree JS 降级脚本 ─────────────────────────────────────────────────────
9
+ # 当 CDP Accessibility API 不可用时,通过 DOM 遍历模拟 a11y tree
10
+ _JS_A11Y_FALLBACK = """
11
+ (() => {
12
+ const ROLE_MAP = {
13
+ a: 'link', button: 'button', input: 'textbox', textarea: 'textbox',
14
+ select: 'combobox', option: 'option', img: 'img', nav: 'navigation',
15
+ main: 'main', header: 'banner', footer: 'contentinfo', aside: 'complementary',
16
+ form: 'form', table: 'table', tr: 'row', td: 'cell', th: 'columnheader',
17
+ ul: 'list', ol: 'list', li: 'listitem', h1: 'heading', h2: 'heading',
18
+ h3: 'heading', h4: 'heading', h5: 'heading', h6: 'heading',
19
+ article: 'article', section: 'region', dialog: 'dialog',
20
+ details: 'group', summary: 'button', progress: 'progressbar',
21
+ meter: 'meter', output: 'status',
22
+ };
23
+ const INPUT_ROLE = { checkbox: 'checkbox', radio: 'radio', range: 'slider',
24
+ search: 'searchbox', number: 'spinbutton', submit: 'button', reset: 'button',
25
+ button: 'button', file: 'button' };
26
+
27
+ let nodeCounter = 0;
28
+ let stats = { total: 0, ignored: 0, interactive: 0 };
29
+ const INTERACTIVE = new Set([
30
+ 'button','link','textbox','combobox','checkbox','radio',
31
+ 'slider','spinbutton','tab','menuitem','searchbox','switch',
32
+ 'option','menuitemcheckbox','menuitemradio','treeitem',
33
+ ]);
34
+
35
+ function getRole(el) {
36
+ const explicit = el.getAttribute('role');
37
+ if (explicit) return explicit;
38
+ const tag = el.tagName.toLowerCase();
39
+ if (tag === 'input') return INPUT_ROLE[el.type] || 'textbox';
40
+ return ROLE_MAP[tag] || '';
41
+ }
42
+
43
+ function getName(el) {
44
+ const label = el.getAttribute('aria-label');
45
+ if (label) return label;
46
+ const labelledBy = el.getAttribute('aria-labelledby');
47
+ if (labelledBy) {
48
+ const ref = document.getElementById(labelledBy);
49
+ if (ref) return (ref.textContent || '').trim().slice(0, 100);
50
+ }
51
+ const alt = el.getAttribute('alt');
52
+ if (alt) return alt;
53
+ const title = el.getAttribute('title');
54
+ if (title) return title;
55
+ const placeholder = el.getAttribute('placeholder');
56
+ if (placeholder) return placeholder;
57
+ // 直接子文本
58
+ let text = '';
59
+ for (const child of el.childNodes) {
60
+ if (child.nodeType === 3) text += child.textContent;
61
+ }
62
+ text = text.trim().slice(0, 100);
63
+ return text;
64
+ }
65
+
66
+ function isHidden(el) {
67
+ if (el.hidden || el.getAttribute('aria-hidden') === 'true') return true;
68
+ const st = getComputedStyle(el);
69
+ return st.display === 'none' || st.visibility === 'hidden';
70
+ }
71
+
72
+ function getProps(el) {
73
+ const props = {};
74
+ const tag = el.tagName.toLowerCase();
75
+ if (el.hasAttribute('aria-expanded'))
76
+ props.expanded = el.getAttribute('aria-expanded') === 'true';
77
+ if (el.hasAttribute('aria-checked'))
78
+ props.checked = el.getAttribute('aria-checked') === 'true';
79
+ if (el.hasAttribute('aria-selected'))
80
+ props.selected = el.getAttribute('aria-selected') === 'true';
81
+ if (el.hasAttribute('aria-disabled') || el.disabled)
82
+ props.disabled = true;
83
+ if (el.hasAttribute('aria-required') || el.required)
84
+ props.required = true;
85
+ if (el.hasAttribute('aria-pressed'))
86
+ props.pressed = el.getAttribute('aria-pressed') === 'true';
87
+ if (/^h[1-6]$/.test(tag))
88
+ props.level = parseInt(tag[1]);
89
+ return props;
90
+ }
91
+
92
+ function buildNode(el, depth) {
93
+ if (depth > 20) return null;
94
+ if (el.nodeType !== 1) return null;
95
+ if (isHidden(el)) { stats.ignored++; return null; }
96
+
97
+ const role = getRole(el);
98
+ const name = getName(el);
99
+ const id = String(++nodeCounter);
100
+ stats.total++;
101
+ if (INTERACTIVE.has(role)) stats.interactive++;
102
+
103
+ const children = [];
104
+ for (const child of el.children) {
105
+ const cn = buildNode(child, depth + 1);
106
+ if (cn) children.push(cn);
107
+ }
108
+
109
+ // 跳过无意义容器(无 role、无 name、只有一个子节点)
110
+ if (!role && !name && children.length === 1) {
111
+ return children[0];
112
+ }
113
+ // 跳过完全空的无 role 节点
114
+ if (!role && !name && children.length === 0) {
115
+ return null;
116
+ }
117
+
118
+ const node = { nodeId: id, role: role || 'generic', name: name };
119
+ const props = getProps(el);
120
+ if (Object.keys(props).length) node.properties = props;
121
+
122
+ const value = el.value;
123
+ if (value !== undefined && value !== '' && role &&
124
+ ['textbox','combobox','slider','spinbutton','searchbox'].includes(role)) {
125
+ node.value = String(value).slice(0, 200);
126
+ }
127
+
128
+ // 定位器
129
+ const loc = suggestLoc(el);
130
+ if (loc) node.locator = loc;
131
+
132
+ if (children.length) node.children = children;
133
+ return node;
134
+ }
135
+
136
+ function suggestLoc(el) {
137
+ const id = el.id;
138
+ if (id) return '#' + id;
139
+ for (const attr of ['data-testid','data-qa','aria-label','name','placeholder']) {
140
+ const v = el.getAttribute(attr);
141
+ if (v) return '@' + attr + '=' + v;
142
+ }
143
+ const cls = el.className;
144
+ if (typeof cls === 'string' && cls.trim()) {
145
+ return '.' + cls.trim().split(/\\s+/)[0];
146
+ }
147
+ const text = (el.textContent || '').trim();
148
+ if (text && text.length <= 30) return 'text:' + text;
149
+ return 't:' + el.tagName.toLowerCase();
150
+ }
151
+
152
+ const tree = buildNode(document.body, 0) || { nodeId: '0', role: 'WebArea', name: document.title };
153
+ return { tree: tree, stats: stats };
154
+ })()
155
+ """
@@ -0,0 +1,43 @@
1
+ # -*- coding:utf-8 -*-
2
+ """共享工具函数"""
3
+ import re
4
+
5
+
6
+ def _is_meaningful_class(cls: str) -> bool:
7
+ """判断 CSS 类名是否有语义(过滤混淆/哈希类名)"""
8
+ if not cls or len(cls) < 2:
9
+ return False
10
+ # CSS module 风格:prefix-hash,后缀含数字(如 btn-abc1234、css-1d2e3f)
11
+ if re.match(r'^[a-z]+-(?=\w*\d)\w{4,}$', cls):
12
+ return False
13
+ # 纯随机字符串:6+ 字符且无分隔符(-_),大小写混杂或全小写无元音
14
+ if len(cls) >= 6 and not re.search(r'[-_]', cls):
15
+ # 大小写混杂无分隔符(如 hkJMPzDNh、BAyykwGBSi)
16
+ if re.search(r'[a-z]', cls) and re.search(r'[A-Z]', cls):
17
+ return False
18
+ # 全小写但无元音(如 bcdfgh)→ 大概率是哈希
19
+ if cls.islower() and not re.search(r'[aeiou]', cls):
20
+ return False
21
+ return True
22
+
23
+
24
def suggest_locator(tag: str, attrs: dict, text: str) -> str:
    """Build a DrissionPage locator string for a static element.

    Candidates are tried in this order and the first hit wins:
    ``#id``; ``@attr=value`` for the first non-empty attribute among
    data-testid, data-qa, data-cy, aria-label, name, placeholder; ``.class``
    for the first class accepted by ``_is_meaningful_class``; ``text:...``
    when the text is non-empty and at most 30 characters; otherwise ``t:tag``.

    :param tag: element tag name, used for the last-resort locator
    :param attrs: attribute mapping of the element
    :param text: element text used for the ``text:`` locator
    :return: locator string
    """
    element_id = attrs.get('id')
    if element_id:
        return f'#{element_id}'

    for name in ('data-testid', 'data-qa', 'data-cy', 'aria-label', 'name', 'placeholder'):
        value = attrs.get(name)
        if value:
            return f'@{name}={value}'

    class_attr = attrs.get('class', '')
    if class_attr:
        # First class token that does not look like a generated hash.
        usable = next(
            (token for token in class_attr.strip().split()
             if _is_meaningful_class(token)),
            None,
        )
        if usable is not None:
            return f'.{usable}'

    if text and len(text) <= 30:
        return f'text:{text}'

    return f't:{tag}'
@@ -0,0 +1,103 @@
1
+ Metadata-Version: 2.4
2
+ Name: dp-cli
3
+ Version: 0.1.0
4
+ Summary: A powerful CLI for DrissionPage — browser automation, structured data extraction, network listening and more.
5
+ License: BSD-3-Clause
6
+ Project-URL: Homepage, https://github.com/mofanx/dp-cli
7
+ Project-URL: Repository, https://github.com/mofanx/dp-cli
8
+ Keywords: drissionpage,browser,automation,cli,web-scraping
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Environment :: Console
12
+ Classifier: Topic :: Utilities
13
+ Classifier: Topic :: Internet :: WWW/HTTP :: Browsers
14
+ Requires-Python: >=3.8
15
+ Description-Content-Type: text/markdown
16
+ Requires-Dist: DrissionPage>=4.0
17
+ Requires-Dist: click>=8.0
18
+
19
+ # dp-cli
20
+
21
+ A powerful CLI for [DrissionPage](https://github.com/g1879/DrissionPage) — browser automation, structured data extraction, network listening and more.
22
+
23
+ ## Features
24
+
25
+ - **Anti-detection by default** — not based on webdriver, `navigator.webdriver` is `false`
26
+ - **Reuse your own browser** — connect to a running Chrome via `--port`, keeping login state and cookies
27
+ - **Powerful locator syntax** — descriptive strings stable across navigation (no ephemeral refs)
28
+ - **Structured data extraction** — `extract` + `query` + `snapshot --mode content` for scraping list pages
29
+ - **Network listening** — capture XHR/Fetch requests and response bodies
30
+ - **Dual mode** — browser control + pure HTTP requests
31
+ - **Shadow-root / iframe** — traverse directly without switching context
32
+ - **JSON output** — all commands output JSON, AI-friendly
33
+
34
+ ## Installation
35
+
36
+ ```bash
37
+ pip install dp-cli
38
+ dp --help
39
+ ```
40
+
41
+ ## Quick Start
42
+
43
+ ```bash
44
+ # Auto-managed browser
45
+ dp open https://example.com
46
+ dp snapshot
47
+ dp click "text:Login"
48
+ dp fill "@name=username" admin
49
+ dp press Enter
50
+ dp close
51
+
52
+ # Connect to your own logged-in browser
53
+ google-chrome --remote-debugging-port=9222
54
+ dp open https://example.com --port 9222
55
+ dp snapshot
56
+ ```
57
+
58
+ ## Data Extraction (3-step workflow)
59
+
60
+ ```bash
61
+ # 1. Discover CSS class names via noise-filtered content tree
62
+ dp snapshot --mode content --max-text 40
63
+
64
+ # 2. Verify field selectors
65
+ dp query "css:.item-title" --fields "text,loc"
66
+
67
+ # 3. Batch extract to CSV
68
+ dp extract "css:.item-card" \
69
+ '{"title":"css:.item-title",
70
+ "price":"css:.item-price",
71
+ "tags":{"selector":"css:.tag","multi":true},
72
+ "url":{"selector":"css:a","attr":"href"}}' \
73
+ --limit 100 --output csv --filename result.csv
74
+ ```
75
+
76
+ ## Project Structure
77
+
78
+ ```
79
+ dp_cli/
80
+ ├── main.py # CLI entry point (~47 lines)
81
+ ├── session.py # Browser session management
82
+ ├── snapshot/ # Page snapshot & data extraction engine (a11y.py, extract.py, js_scripts.py, utils.py)
83
+ ├── output.py # JSON output helpers
84
+ └── commands/
85
+ ├── _utils.py # Shared decorators & helpers
86
+ ├── browser.py # open / goto / reload / close / list
87
+ ├── snapshot_cmd.py # snapshot / extract / query / find / inspect
88
+ ├── element.py # click / fill / select / hover / drag / check / upload
89
+ ├── keyboard.py # press / type / scroll / scroll-to
90
+ ├── page.py # screenshot / pdf / eval / wait / dialog
91
+ ├── tab.py # tab-list / tab-new / tab-select / tab-close
92
+ ├── storage.py # cookie-* / localstorage-* / sessionstorage-*
93
+ ├── network.py # listen / listen-stop / http-get / http-post
94
+ └── misc.py # resize / maximize / state-save / state-load / config-set
95
+ ```
96
+
97
+ ## Documentation
98
+
99
+ See [`skills/SKILL.md`](skills/SKILL.md) for the full workflow guide and [`skills/references/commands.md`](skills/references/commands.md) for the complete command reference.
100
+
101
+ ## License
102
+
103
+ BSD-3-Clause
@@ -0,0 +1,25 @@
1
+ dp_cli/__init__.py,sha256=-yYBDkgMDSHYf_F_uX23wZsDSjrudhppJvm34su8_I4,23
2
+ dp_cli/main.py,sha256=kzQlpsBt42vi43KlDJbnrmCEaX_K0PFoifrK95YoqVc,1110
3
+ dp_cli/output.py,sha256=MDs3U64iiDfP7ONWEY3HDK02LHmWhHE6ojuyfuTv4R0,2356
4
+ dp_cli/session.py,sha256=2qHuaSiYn3-6BAn4FifyUDrjdBwMT8CzUAm9aF5VEkI,6023
5
+ dp_cli/commands/__init__.py,sha256=fU5LGWUmhYYrZXYBELfUnmUgzhSEuzsIWqQl7Jn1Afk,306
6
+ dp_cli/commands/_utils.py,sha256=KAkUOj_teoJmfH6LKmXWZNGLE9LeJnrguDHy6xqou7I,3466
7
+ dp_cli/commands/browser.py,sha256=FOY0nT0pugztNkRNNj4_PfcX0IVqkaixCrb3ZEtmV4c,6193
8
+ dp_cli/commands/element.py,sha256=AkkQIlmMfmakXAwQ1i57Td0T6kntGMhVgcJvxYBb7a8,11159
9
+ dp_cli/commands/keyboard.py,sha256=Ojr8-twK3tYAumO__H_Y0nBeBUqxVJ3zV5UhpsJoT1w,4328
10
+ dp_cli/commands/misc.py,sha256=QdnO9mWtve6K59nBGXOQEPkB3SryQ9j2Lh_vysfBtP8,4994
11
+ dp_cli/commands/network.py,sha256=aWJqj0iZXPhvVHeKeDXHaC75bvBgBIH3637I_iFPc3w,6735
12
+ dp_cli/commands/page.py,sha256=lswoaO-r3tBx25YRQ22iDgvAeImbj0Fp6-8xEEoRmn8,8268
13
+ dp_cli/commands/snapshot_cmd.py,sha256=YJT8DzuUJJCFXdSCRlX1IzXIu8D0Ooc7LuqrZpkPi10,16888
14
+ dp_cli/commands/storage.py,sha256=J0HN1YBDGm6JwwoPTs5-RN0Sr6oidN1vLA08VBh0DJw,8556
15
+ dp_cli/commands/tab.py,sha256=WOuF2YP_1eQSjcnAmA_aVOza3Bd5NgbkbI2ZTRPVjvs,7277
16
+ dp_cli/snapshot/__init__.py,sha256=08i54F9n7p_1jZVhOPQLzXn3IFsvFXMiqgYJgLtamqg,786
17
+ dp_cli/snapshot/a11y.py,sha256=tvvOkQz3AHfTZ6cj-vf5UqOLsPFF2UDPnFuEAegBreY,23991
18
+ dp_cli/snapshot/extract.py,sha256=rHzdGCEp292El3DTe1EhT6xx4a91NxMv1seJIB5Ho2E,5413
19
+ dp_cli/snapshot/js_scripts.py,sha256=Zybph9IbVlUCARfLQxGeD6BpjLjSKG6zqjIMEoyt_9k,6104
20
+ dp_cli/snapshot/utils.py,sha256=_nYaqhvZjvbWzZQW1ZtM3tzmMoEHqmh9YWZax3WRzvU,1537
21
+ dp_cli-0.1.0.dist-info/METADATA,sha256=9JWR6XS2p2l6-iZG3WecSADGkOQ_uKgTml6NFCINX6w,3685
22
+ dp_cli-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
23
+ dp_cli-0.1.0.dist-info/entry_points.txt,sha256=EgN4pBb_UDeB8bfysCeZuhdCH6EiRz91cJMH9Q1PNBo,40
24
+ dp_cli-0.1.0.dist-info/top_level.txt,sha256=6o7k3gaYATX5gnc7N5WbROP6kfynSgVUX2gjx87_E_o,7
25
+ dp_cli-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ dp = dp_cli.main:main
@@ -0,0 +1 @@
1
+ dp_cli