dp-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dp_cli/__init__.py +1 -0
- dp_cli/commands/__init__.py +12 -0
- dp_cli/commands/_utils.py +107 -0
- dp_cli/commands/browser.py +159 -0
- dp_cli/commands/element.py +259 -0
- dp_cli/commands/keyboard.py +126 -0
- dp_cli/commands/misc.py +136 -0
- dp_cli/commands/network.py +169 -0
- dp_cli/commands/page.py +204 -0
- dp_cli/commands/snapshot_cmd.py +391 -0
- dp_cli/commands/storage.py +222 -0
- dp_cli/commands/tab.py +203 -0
- dp_cli/main.py +47 -0
- dp_cli/output.py +97 -0
- dp_cli/session.py +201 -0
- dp_cli/snapshot/__init__.py +23 -0
- dp_cli/snapshot/a11y.py +671 -0
- dp_cli/snapshot/extract.py +158 -0
- dp_cli/snapshot/js_scripts.py +155 -0
- dp_cli/snapshot/utils.py +43 -0
- dp_cli-0.1.0.dist-info/METADATA +103 -0
- dp_cli-0.1.0.dist-info/RECORD +25 -0
- dp_cli-0.1.0.dist-info/WHEEL +5 -0
- dp_cli-0.1.0.dist-info/entry_points.txt +2 -0
- dp_cli-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
# -*- coding:utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
extract_structured / query_elements — 数据提取函数
|
|
4
|
+
"""
|
|
5
|
+
from .utils import suggest_locator
|
|
6
|
+
|
|
7
|
+
_JS_CSS_PATH = """
|
|
8
|
+
var el = this;
|
|
9
|
+
var parts = [];
|
|
10
|
+
while (el && el !== document.body && el.nodeType === 1) {
|
|
11
|
+
var seg = el.tagName.toLowerCase();
|
|
12
|
+
if (el.id && /^[a-zA-Z][\\w-]*$/.test(el.id)) {
|
|
13
|
+
parts.unshift('#' + el.id);
|
|
14
|
+
break;
|
|
15
|
+
}
|
|
16
|
+
var classes = Array.from(el.classList)
|
|
17
|
+
.filter(function(c) { return c.length >= 3; });
|
|
18
|
+
if (classes.length > 0) {
|
|
19
|
+
seg = '.' + classes[0];
|
|
20
|
+
var siblings = el.parentElement
|
|
21
|
+
? Array.from(el.parentElement.querySelectorAll(':scope > ' + seg))
|
|
22
|
+
: [];
|
|
23
|
+
if (siblings.length > 1) {
|
|
24
|
+
var idx = Array.from(el.parentElement.children).indexOf(el) + 1;
|
|
25
|
+
seg = seg + ':nth-child(' + idx + ')';
|
|
26
|
+
}
|
|
27
|
+
} else {
|
|
28
|
+
var allSiblings = el.parentElement
|
|
29
|
+
? Array.from(el.parentElement.children).filter(function(c) { return c.tagName === el.tagName; })
|
|
30
|
+
: [];
|
|
31
|
+
if (allSiblings.length > 1) {
|
|
32
|
+
var idx2 = Array.from(el.parentElement.children).indexOf(el) + 1;
|
|
33
|
+
seg = seg + ':nth-child(' + idx2 + ')';
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
parts.unshift(seg);
|
|
37
|
+
el = el.parentElement;
|
|
38
|
+
}
|
|
39
|
+
return parts.join(' > ');
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
_JS_XPATH = """
|
|
43
|
+
var el = this;
|
|
44
|
+
var parts = [];
|
|
45
|
+
while (el && el.nodeType === 1) {
|
|
46
|
+
var seg = el.tagName.toLowerCase();
|
|
47
|
+
if (el.id && /^[a-zA-Z][\\w-]*$/.test(el.id)) {
|
|
48
|
+
parts.unshift('//' + seg + '[@id="' + el.id + '"]');
|
|
49
|
+
return parts.join('/');
|
|
50
|
+
}
|
|
51
|
+
var siblings = el.parentElement
|
|
52
|
+
? Array.from(el.parentElement.children).filter(function(c) { return c.tagName === el.tagName; })
|
|
53
|
+
: [];
|
|
54
|
+
if (siblings.length > 1) {
|
|
55
|
+
var idx = siblings.indexOf(el) + 1;
|
|
56
|
+
seg = seg + '[' + idx + ']';
|
|
57
|
+
}
|
|
58
|
+
parts.unshift(seg);
|
|
59
|
+
el = el.parentElement;
|
|
60
|
+
}
|
|
61
|
+
return '/' + parts.join('/');
|
|
62
|
+
"""
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _field_value(ele, attr, default):
    """Return *attr* of *ele* (or its stripped text when *attr* is falsy), or *default* if empty."""
    if attr:
        return ele.attr(attr) or default
    return (ele.raw_text or '').strip() or default


def extract_structured(page, container: str, fields: dict,
                       limit: int = 100) -> list:
    """
    Extract one record (dict) per container element found on the page.

    :param page: object providing ``eles(locator, timeout=...)`` and, as a
                 fallback when that call raises, ``s_eles(locator)``
    :param container: locator of the repeated container, e.g. 'css:.job-card'
    :param fields: mapping of field name -> spec. A spec is either a locator
                   string or a dict with the keys ``selector``, ``multi``
                   (collect every match into a list), ``attr`` (read this
                   attribute instead of the text) and ``default``
    :param limit: maximum number of containers to process
    :return: list of dicts, one per container, keyed by field name; an empty
             list when no container matches
    """
    try:
        containers = page.eles(container, timeout=5)
    except Exception:
        containers = page.s_eles(container)
    if not containers:
        return []

    results = []
    for item in list(containers)[:limit]:
        record = {}
        for field_name, spec in fields.items():
            if isinstance(spec, str):
                spec = {'selector': spec}
            elif not isinstance(spec, dict):
                # A malformed spec (None, list, ...) used to raise
                # AttributeError and abort the whole extraction.
                record[field_name] = ''
                continue
            sel = spec.get('selector', '')
            multi = spec.get('multi', False)
            attr = spec.get('attr', None)
            default = spec.get('default', '')
            try:
                if multi:
                    # Same fallback as the single-value branch, so a missing
                    # attribute yields ``default`` instead of None.
                    record[field_name] = [
                        _field_value(e, attr, default) for e in item.eles(sel)
                    ]
                else:
                    ele = item.ele(sel)
                    # Checked by class name so a 'NoneElement' placeholder is
                    # treated as "not found" whatever its truth value.
                    if ele and ele.__class__.__name__ != 'NoneElement':
                        record[field_name] = _field_value(ele, attr, default)
                    else:
                        record[field_name] = default
            except Exception:
                # Best effort: a failing lookup must not lose the record.
                record[field_name] = default
        results.append(record)
    return results
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _read_field(ele, name):
    """Return the ``(key, value)`` pair to store in the record for one requested field."""
    if name == 'text':
        return 'text', (ele.raw_text or '').strip()
    if name == 'tag':
        return 'tag', ele.tag
    if name == 'loc':
        return 'loc', suggest_locator(
            ele.tag, ele.attrs, (ele.raw_text or '').strip()[:50]
        )
    if name in ('css', 'css_path'):
        # Both spellings are stored under the 'css' key.
        try:
            css_path = ele.run_js(_JS_CSS_PATH)
            return 'css', (f'css:{css_path}' if css_path else '')
        except Exception:
            return 'css', ''
    if name == 'xpath':
        try:
            xpath = ele.run_js(_JS_XPATH)
            return 'xpath', (f'xpath:{xpath}' if xpath else '')
        except Exception:
            return 'xpath', ''
    if name == 'html':
        return 'html', ele.inner_html or ''
    if name == 'outer_html':
        return 'outer_html', ele.html or ''
    # Anything else is looked up as a plain attribute name.
    attr_value = ele.attrs.get(name, '') if hasattr(ele, 'attrs') else ''
    return name, attr_value or ''


def query_elements(page, selector: str, fields: list,
                   limit: int = 200) -> list:
    """
    Query mode: find every element matching ``selector`` and read the
    requested fields from each of them.

    :param page: object providing ``eles(locator, timeout=...)`` and, as a
                 fallback when that call raises, ``s_eles(locator)``
    :param selector: locator of the elements to inspect
    :param fields: field names; 'text', 'tag', 'loc', 'css' / 'css_path',
                   'xpath', 'html' and 'outer_html' are handled specially,
                   any other name is read from the element's attributes
    :param limit: maximum number of elements to process
    :return: list of dicts, one per element; a field that fails to resolve
             is stored as an empty string
    """
    try:
        matched = page.eles(selector, timeout=5)
    except Exception:
        matched = page.s_eles(selector)

    rows = []
    for ele in list(matched)[:limit]:
        row = {}
        for name in fields:
            try:
                key, value = _read_field(ele, name)
            except Exception:
                key, value = name, ''
            row[key] = value
        rows.append(row)
    return rows
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# -*- coding:utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
浏览器注入 JS 脚本。
|
|
4
|
+
|
|
5
|
+
_JS_A11Y_FALLBACK — 当 CDP Accessibility API 不可用时,通过 DOM 遍历模拟 a11y tree。
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
# ── A11y Tree JS 降级脚本 ─────────────────────────────────────────────────────
|
|
9
|
+
# 当 CDP Accessibility API 不可用时,通过 DOM 遍历模拟 a11y tree
|
|
10
|
+
_JS_A11Y_FALLBACK = """
|
|
11
|
+
(() => {
|
|
12
|
+
const ROLE_MAP = {
|
|
13
|
+
a: 'link', button: 'button', input: 'textbox', textarea: 'textbox',
|
|
14
|
+
select: 'combobox', option: 'option', img: 'img', nav: 'navigation',
|
|
15
|
+
main: 'main', header: 'banner', footer: 'contentinfo', aside: 'complementary',
|
|
16
|
+
form: 'form', table: 'table', tr: 'row', td: 'cell', th: 'columnheader',
|
|
17
|
+
ul: 'list', ol: 'list', li: 'listitem', h1: 'heading', h2: 'heading',
|
|
18
|
+
h3: 'heading', h4: 'heading', h5: 'heading', h6: 'heading',
|
|
19
|
+
article: 'article', section: 'region', dialog: 'dialog',
|
|
20
|
+
details: 'group', summary: 'button', progress: 'progressbar',
|
|
21
|
+
meter: 'meter', output: 'status',
|
|
22
|
+
};
|
|
23
|
+
const INPUT_ROLE = { checkbox: 'checkbox', radio: 'radio', range: 'slider',
|
|
24
|
+
search: 'searchbox', number: 'spinbutton', submit: 'button', reset: 'button',
|
|
25
|
+
button: 'button', file: 'button' };
|
|
26
|
+
|
|
27
|
+
let nodeCounter = 0;
|
|
28
|
+
let stats = { total: 0, ignored: 0, interactive: 0 };
|
|
29
|
+
const INTERACTIVE = new Set([
|
|
30
|
+
'button','link','textbox','combobox','checkbox','radio',
|
|
31
|
+
'slider','spinbutton','tab','menuitem','searchbox','switch',
|
|
32
|
+
'option','menuitemcheckbox','menuitemradio','treeitem',
|
|
33
|
+
]);
|
|
34
|
+
|
|
35
|
+
function getRole(el) {
|
|
36
|
+
const explicit = el.getAttribute('role');
|
|
37
|
+
if (explicit) return explicit;
|
|
38
|
+
const tag = el.tagName.toLowerCase();
|
|
39
|
+
if (tag === 'input') return INPUT_ROLE[el.type] || 'textbox';
|
|
40
|
+
return ROLE_MAP[tag] || '';
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
function getName(el) {
|
|
44
|
+
const label = el.getAttribute('aria-label');
|
|
45
|
+
if (label) return label;
|
|
46
|
+
const labelledBy = el.getAttribute('aria-labelledby');
|
|
47
|
+
if (labelledBy) {
|
|
48
|
+
const ref = document.getElementById(labelledBy);
|
|
49
|
+
if (ref) return (ref.textContent || '').trim().slice(0, 100);
|
|
50
|
+
}
|
|
51
|
+
const alt = el.getAttribute('alt');
|
|
52
|
+
if (alt) return alt;
|
|
53
|
+
const title = el.getAttribute('title');
|
|
54
|
+
if (title) return title;
|
|
55
|
+
const placeholder = el.getAttribute('placeholder');
|
|
56
|
+
if (placeholder) return placeholder;
|
|
57
|
+
// 直接子文本
|
|
58
|
+
let text = '';
|
|
59
|
+
for (const child of el.childNodes) {
|
|
60
|
+
if (child.nodeType === 3) text += child.textContent;
|
|
61
|
+
}
|
|
62
|
+
text = text.trim().slice(0, 100);
|
|
63
|
+
return text;
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
function isHidden(el) {
|
|
67
|
+
if (el.hidden || el.getAttribute('aria-hidden') === 'true') return true;
|
|
68
|
+
const st = getComputedStyle(el);
|
|
69
|
+
return st.display === 'none' || st.visibility === 'hidden';
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
function getProps(el) {
|
|
73
|
+
const props = {};
|
|
74
|
+
const tag = el.tagName.toLowerCase();
|
|
75
|
+
if (el.hasAttribute('aria-expanded'))
|
|
76
|
+
props.expanded = el.getAttribute('aria-expanded') === 'true';
|
|
77
|
+
if (el.hasAttribute('aria-checked'))
|
|
78
|
+
props.checked = el.getAttribute('aria-checked') === 'true';
|
|
79
|
+
if (el.hasAttribute('aria-selected'))
|
|
80
|
+
props.selected = el.getAttribute('aria-selected') === 'true';
|
|
81
|
+
if (el.hasAttribute('aria-disabled') || el.disabled)
|
|
82
|
+
props.disabled = true;
|
|
83
|
+
if (el.hasAttribute('aria-required') || el.required)
|
|
84
|
+
props.required = true;
|
|
85
|
+
if (el.hasAttribute('aria-pressed'))
|
|
86
|
+
props.pressed = el.getAttribute('aria-pressed') === 'true';
|
|
87
|
+
if (/^h[1-6]$/.test(tag))
|
|
88
|
+
props.level = parseInt(tag[1]);
|
|
89
|
+
return props;
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
function buildNode(el, depth) {
|
|
93
|
+
if (depth > 20) return null;
|
|
94
|
+
if (el.nodeType !== 1) return null;
|
|
95
|
+
if (isHidden(el)) { stats.ignored++; return null; }
|
|
96
|
+
|
|
97
|
+
const role = getRole(el);
|
|
98
|
+
const name = getName(el);
|
|
99
|
+
const id = String(++nodeCounter);
|
|
100
|
+
stats.total++;
|
|
101
|
+
if (INTERACTIVE.has(role)) stats.interactive++;
|
|
102
|
+
|
|
103
|
+
const children = [];
|
|
104
|
+
for (const child of el.children) {
|
|
105
|
+
const cn = buildNode(child, depth + 1);
|
|
106
|
+
if (cn) children.push(cn);
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
// 跳过无意义容器(无 role、无 name、只有一个子节点)
|
|
110
|
+
if (!role && !name && children.length === 1) {
|
|
111
|
+
return children[0];
|
|
112
|
+
}
|
|
113
|
+
// 跳过完全空的无 role 节点
|
|
114
|
+
if (!role && !name && children.length === 0) {
|
|
115
|
+
return null;
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
const node = { nodeId: id, role: role || 'generic', name: name };
|
|
119
|
+
const props = getProps(el);
|
|
120
|
+
if (Object.keys(props).length) node.properties = props;
|
|
121
|
+
|
|
122
|
+
const value = el.value;
|
|
123
|
+
if (value !== undefined && value !== '' && role &&
|
|
124
|
+
['textbox','combobox','slider','spinbutton','searchbox'].includes(role)) {
|
|
125
|
+
node.value = String(value).slice(0, 200);
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
// 定位器
|
|
129
|
+
const loc = suggestLoc(el);
|
|
130
|
+
if (loc) node.locator = loc;
|
|
131
|
+
|
|
132
|
+
if (children.length) node.children = children;
|
|
133
|
+
return node;
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
function suggestLoc(el) {
|
|
137
|
+
const id = el.id;
|
|
138
|
+
if (id) return '#' + id;
|
|
139
|
+
for (const attr of ['data-testid','data-qa','aria-label','name','placeholder']) {
|
|
140
|
+
const v = el.getAttribute(attr);
|
|
141
|
+
if (v) return '@' + attr + '=' + v;
|
|
142
|
+
}
|
|
143
|
+
const cls = el.className;
|
|
144
|
+
if (typeof cls === 'string' && cls.trim()) {
|
|
145
|
+
return '.' + cls.trim().split(/\\s+/)[0];
|
|
146
|
+
}
|
|
147
|
+
const text = (el.textContent || '').trim();
|
|
148
|
+
if (text && text.length <= 30) return 'text:' + text;
|
|
149
|
+
return 't:' + el.tagName.toLowerCase();
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
const tree = buildNode(document.body, 0) || { nodeId: '0', role: 'WebArea', name: document.title };
|
|
153
|
+
return { tree: tree, stats: stats };
|
|
154
|
+
})()
|
|
155
|
+
"""
|
dp_cli/snapshot/utils.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# -*- coding:utf-8 -*-
|
|
2
|
+
"""共享工具函数"""
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _is_meaningful_class(cls: str) -> bool:
|
|
7
|
+
"""判断 CSS 类名是否有语义(过滤混淆/哈希类名)"""
|
|
8
|
+
if not cls or len(cls) < 2:
|
|
9
|
+
return False
|
|
10
|
+
# CSS module 风格:prefix-hash,后缀含数字(如 btn-abc1234、css-1d2e3f)
|
|
11
|
+
if re.match(r'^[a-z]+-(?=\w*\d)\w{4,}$', cls):
|
|
12
|
+
return False
|
|
13
|
+
# 纯随机字符串:6+ 字符且无分隔符(-_),大小写混杂或全小写无元音
|
|
14
|
+
if len(cls) >= 6 and not re.search(r'[-_]', cls):
|
|
15
|
+
# 大小写混杂无分隔符(如 hkJMPzDNh、BAyykwGBSi)
|
|
16
|
+
if re.search(r'[a-z]', cls) and re.search(r'[A-Z]', cls):
|
|
17
|
+
return False
|
|
18
|
+
# 全小写但无元音(如 bcdfgh)→ 大概率是哈希
|
|
19
|
+
if cls.islower() and not re.search(r'[aeiou]', cls):
|
|
20
|
+
return False
|
|
21
|
+
return True
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def suggest_locator(tag: str, attrs: dict, text: str) -> str:
    """
    Build a DrissionPage locator string for a static element.

    Candidates are tried in order: id ('#id'), the first non-empty semantic
    attribute ('@name=value'), the first class accepted by
    ``_is_meaningful_class`` ('.cls'), short text of at most 30 characters
    ('text:...'), and finally the tag name ('t:tag').
    """
    element_id = attrs.get('id')
    if element_id:
        return f'#{element_id}'

    semantic_keys = ('data-testid', 'data-qa', 'data-cy', 'aria-label', 'name', 'placeholder')
    hit = next((key for key in semantic_keys if attrs.get(key)), None)
    if hit is not None:
        return f'@{hit}={attrs[hit]}'

    class_attr = attrs.get('class', '')
    if class_attr:
        for candidate in class_attr.strip().split():
            if _is_meaningful_class(candidate):
                return f'.{candidate}'

    if text and len(text) <= 30:
        return f'text:{text}'

    return f't:{tag}'
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dp-cli
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A powerful CLI for DrissionPage — browser automation, structured data extraction, network listening and more.
|
|
5
|
+
License: BSD-3-Clause
|
|
6
|
+
Project-URL: Homepage, https://github.com/mofanx/dp-cli
|
|
7
|
+
Project-URL: Repository, https://github.com/mofanx/dp-cli
|
|
8
|
+
Keywords: drissionpage,browser,automation,cli,web-scraping
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Topic :: Utilities
|
|
13
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Browsers
|
|
14
|
+
Requires-Python: >=3.8
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
Requires-Dist: DrissionPage>=4.0
|
|
17
|
+
Requires-Dist: click>=8.0
|
|
18
|
+
|
|
19
|
+
# dp-cli
|
|
20
|
+
|
|
21
|
+
A powerful CLI for [DrissionPage](https://github.com/g1879/DrissionPage) — browser automation, structured data extraction, network listening and more.
|
|
22
|
+
|
|
23
|
+
## Features
|
|
24
|
+
|
|
25
|
+
- **Anti-detection by default** — not based on webdriver, `navigator.webdriver` is `false`
|
|
26
|
+
- **Reuse your own browser** — connect to a running Chrome via `--port`, keeping login state and cookies
|
|
27
|
+
- **Powerful locator syntax** — descriptive strings stable across navigation (no ephemeral refs)
|
|
28
|
+
- **Structured data extraction** — `extract` + `query` + `snapshot --mode content` for scraping list pages
|
|
29
|
+
- **Network listening** — capture XHR/Fetch requests and response bodies
|
|
30
|
+
- **Dual mode** — browser control + pure HTTP requests
|
|
31
|
+
- **Shadow-root / iframe** — traverse directly without switching context
|
|
32
|
+
- **JSON output** — all commands output JSON, AI-friendly
|
|
33
|
+
|
|
34
|
+
## Installation
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install dp-cli
|
|
38
|
+
dp --help
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Quick Start
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
# Auto-managed browser
|
|
45
|
+
dp open https://example.com
|
|
46
|
+
dp snapshot
|
|
47
|
+
dp click "text:Login"
|
|
48
|
+
dp fill "@name=username" admin
|
|
49
|
+
dp press Enter
|
|
50
|
+
dp close
|
|
51
|
+
|
|
52
|
+
# Connect to your own logged-in browser
|
|
53
|
+
google-chrome --remote-debugging-port=9222
|
|
54
|
+
dp open https://example.com --port 9222
|
|
55
|
+
dp snapshot
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Data Extraction (3-step workflow)
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
# 1. Discover CSS class names via noise-filtered content tree
|
|
62
|
+
dp snapshot --mode content --max-text 40
|
|
63
|
+
|
|
64
|
+
# 2. Verify field selectors
|
|
65
|
+
dp query "css:.item-title" --fields "text,loc"
|
|
66
|
+
|
|
67
|
+
# 3. Batch extract to CSV
|
|
68
|
+
dp extract "css:.item-card" \
|
|
69
|
+
'{"title":"css:.item-title",
|
|
70
|
+
"price":"css:.item-price",
|
|
71
|
+
"tags":{"selector":"css:.tag","multi":true},
|
|
72
|
+
"url":{"selector":"css:a","attr":"href"}}' \
|
|
73
|
+
--limit 100 --output csv --filename result.csv
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Project Structure
|
|
77
|
+
|
|
78
|
+
```
|
|
79
|
+
dp_cli/
|
|
80
|
+
├── main.py # CLI entry point (~47 lines)
|
|
81
|
+
├── session.py # Browser session management
|
|
82
|
+
├── snapshot/ # Page snapshot & data extraction engine (a11y.py, extract.py, js_scripts.py, utils.py)
|
|
83
|
+
├── output.py # JSON output helpers
|
|
84
|
+
└── commands/
|
|
85
|
+
├── _utils.py # Shared decorators & helpers
|
|
86
|
+
├── browser.py # open / goto / reload / close / list
|
|
87
|
+
├── snapshot_cmd.py # snapshot / extract / query / find / inspect
|
|
88
|
+
├── element.py # click / fill / select / hover / drag / check / upload
|
|
89
|
+
├── keyboard.py # press / type / scroll / scroll-to
|
|
90
|
+
├── page.py # screenshot / pdf / eval / wait / dialog
|
|
91
|
+
├── tab.py # tab-list / tab-new / tab-select / tab-close
|
|
92
|
+
├── storage.py # cookie-* / localstorage-* / sessionstorage-*
|
|
93
|
+
├── network.py # listen / listen-stop / http-get / http-post
|
|
94
|
+
└── misc.py # resize / maximize / state-save / state-load / config-set
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Documentation
|
|
98
|
+
|
|
99
|
+
See [`skills/SKILL.md`](skills/SKILL.md) for full workflow guide and [`skills/references/commands.md`](skills/references/commands.md) for complete command reference.
|
|
100
|
+
|
|
101
|
+
## License
|
|
102
|
+
|
|
103
|
+
BSD-3-Clause
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
dp_cli/__init__.py,sha256=-yYBDkgMDSHYf_F_uX23wZsDSjrudhppJvm34su8_I4,23
|
|
2
|
+
dp_cli/main.py,sha256=kzQlpsBt42vi43KlDJbnrmCEaX_K0PFoifrK95YoqVc,1110
|
|
3
|
+
dp_cli/output.py,sha256=MDs3U64iiDfP7ONWEY3HDK02LHmWhHE6ojuyfuTv4R0,2356
|
|
4
|
+
dp_cli/session.py,sha256=2qHuaSiYn3-6BAn4FifyUDrjdBwMT8CzUAm9aF5VEkI,6023
|
|
5
|
+
dp_cli/commands/__init__.py,sha256=fU5LGWUmhYYrZXYBELfUnmUgzhSEuzsIWqQl7Jn1Afk,306
|
|
6
|
+
dp_cli/commands/_utils.py,sha256=KAkUOj_teoJmfH6LKmXWZNGLE9LeJnrguDHy6xqou7I,3466
|
|
7
|
+
dp_cli/commands/browser.py,sha256=FOY0nT0pugztNkRNNj4_PfcX0IVqkaixCrb3ZEtmV4c,6193
|
|
8
|
+
dp_cli/commands/element.py,sha256=AkkQIlmMfmakXAwQ1i57Td0T6kntGMhVgcJvxYBb7a8,11159
|
|
9
|
+
dp_cli/commands/keyboard.py,sha256=Ojr8-twK3tYAumO__H_Y0nBeBUqxVJ3zV5UhpsJoT1w,4328
|
|
10
|
+
dp_cli/commands/misc.py,sha256=QdnO9mWtve6K59nBGXOQEPkB3SryQ9j2Lh_vysfBtP8,4994
|
|
11
|
+
dp_cli/commands/network.py,sha256=aWJqj0iZXPhvVHeKeDXHaC75bvBgBIH3637I_iFPc3w,6735
|
|
12
|
+
dp_cli/commands/page.py,sha256=lswoaO-r3tBx25YRQ22iDgvAeImbj0Fp6-8xEEoRmn8,8268
|
|
13
|
+
dp_cli/commands/snapshot_cmd.py,sha256=YJT8DzuUJJCFXdSCRlX1IzXIu8D0Ooc7LuqrZpkPi10,16888
|
|
14
|
+
dp_cli/commands/storage.py,sha256=J0HN1YBDGm6JwwoPTs5-RN0Sr6oidN1vLA08VBh0DJw,8556
|
|
15
|
+
dp_cli/commands/tab.py,sha256=WOuF2YP_1eQSjcnAmA_aVOza3Bd5NgbkbI2ZTRPVjvs,7277
|
|
16
|
+
dp_cli/snapshot/__init__.py,sha256=08i54F9n7p_1jZVhOPQLzXn3IFsvFXMiqgYJgLtamqg,786
|
|
17
|
+
dp_cli/snapshot/a11y.py,sha256=tvvOkQz3AHfTZ6cj-vf5UqOLsPFF2UDPnFuEAegBreY,23991
|
|
18
|
+
dp_cli/snapshot/extract.py,sha256=rHzdGCEp292El3DTe1EhT6xx4a91NxMv1seJIB5Ho2E,5413
|
|
19
|
+
dp_cli/snapshot/js_scripts.py,sha256=Zybph9IbVlUCARfLQxGeD6BpjLjSKG6zqjIMEoyt_9k,6104
|
|
20
|
+
dp_cli/snapshot/utils.py,sha256=_nYaqhvZjvbWzZQW1ZtM3tzmMoEHqmh9YWZax3WRzvU,1537
|
|
21
|
+
dp_cli-0.1.0.dist-info/METADATA,sha256=9JWR6XS2p2l6-iZG3WecSADGkOQ_uKgTml6NFCINX6w,3685
|
|
22
|
+
dp_cli-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
23
|
+
dp_cli-0.1.0.dist-info/entry_points.txt,sha256=EgN4pBb_UDeB8bfysCeZuhdCH6EiRz91cJMH9Q1PNBo,40
|
|
24
|
+
dp_cli-0.1.0.dist-info/top_level.txt,sha256=6o7k3gaYATX5gnc7N5WbROP6kfynSgVUX2gjx87_E_o,7
|
|
25
|
+
dp_cli-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
dp_cli
|