qd-browser 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- qd_browser-0.1.0/.gitignore +154 -0
- qd_browser-0.1.0/LICENSE +21 -0
- qd_browser-0.1.0/PKG-INFO +416 -0
- qd_browser-0.1.0/README.md +380 -0
- qd_browser-0.1.0/pyproject.toml +64 -0
- qd_browser-0.1.0/src/qd_browser/__init__.py +3 -0
- qd_browser-0.1.0/src/qd_browser/browser.py +49 -0
- qd_browser-0.1.0/src/qd_browser/cli.py +1018 -0
- qd_browser-0.1.0/src/qd_browser/config.py +67 -0
- qd_browser-0.1.0/src/qd_browser/crawler.py +215 -0
- qd_browser-0.1.0/src/qd_browser/downloader.py +219 -0
- qd_browser-0.1.0/src/qd_browser/history.py +285 -0
- qd_browser-0.1.0/src/qd_browser/llm.py +458 -0
- qd_browser-0.1.0/src/qd_browser/parser.py +111 -0
- qd_browser-0.1.0/src/qd_browser/search.py +299 -0
- qd_browser-0.1.0/src/qd_browser/utils.py +101 -0
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py,cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
#Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
|
98
|
+
__pypackages__/
|
|
99
|
+
|
|
100
|
+
# Celery stuff
|
|
101
|
+
celerybeat-schedule
|
|
102
|
+
celerybeat.pid
|
|
103
|
+
|
|
104
|
+
# SageMath parsed files
|
|
105
|
+
*.sage.py
|
|
106
|
+
|
|
107
|
+
# Environments
|
|
108
|
+
.env
|
|
109
|
+
.venv
|
|
110
|
+
env/
|
|
111
|
+
venv/
|
|
112
|
+
env.bak/
|
|
113
|
+
venv.bak/
|
|
114
|
+
|
|
115
|
+
# Spyder project settings
|
|
116
|
+
.spyderproject
|
|
117
|
+
.spyproject
|
|
118
|
+
|
|
119
|
+
# Rope project settings
|
|
120
|
+
.ropeproject
|
|
121
|
+
|
|
122
|
+
# mkdocs documentation
|
|
123
|
+
/site
|
|
124
|
+
|
|
125
|
+
# mypy
|
|
126
|
+
.mypy_cache/
|
|
127
|
+
.dmypy.json
|
|
128
|
+
dmypy.json
|
|
129
|
+
|
|
130
|
+
# Pyre type checker
|
|
131
|
+
.pyre/
|
|
132
|
+
|
|
133
|
+
# pytype static type analyzer
|
|
134
|
+
.pytype/
|
|
135
|
+
|
|
136
|
+
# Cython debug symbols
|
|
137
|
+
cython_debug/
|
|
138
|
+
|
|
139
|
+
# PyCharm / JetBrains IDE project settings
|
|
140
|
+
/.idea/
|
|
141
|
+
|
|
142
|
+
# SAgents temp files
|
|
143
|
+
.sagents_temp.py
|
|
144
|
+
*.tmp
|
|
145
|
+
.last_session
|
|
146
|
+
|
|
147
|
+
# JAgents project registry
|
|
148
|
+
.jagents_projects.json
|
|
149
|
+
|
|
150
|
+
# Claude Code settings
|
|
151
|
+
.claude/
|
|
152
|
+
|
|
153
|
+
# Output directory
|
|
154
|
+
output/
|
qd_browser-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 juzcn
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,416 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: qd-browser
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: 基于 Playwright 的综合爬虫 CLI 工具
|
|
5
|
+
Project-URL: Homepage, https://github.com/juzcn/qd-browser
|
|
6
|
+
Project-URL: Repository, https://github.com/juzcn/qd-browser.git
|
|
7
|
+
Project-URL: Issues, https://github.com/juzcn/qd-browser/issues
|
|
8
|
+
Author: juzcn
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: browser,cli,crawler,playwright,scraper
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Requires-Python: >=3.13
|
|
22
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
23
|
+
Requires-Dist: html2text>=2024.2.26
|
|
24
|
+
Requires-Dist: httpx>=0.28.0
|
|
25
|
+
Requires-Dist: lxml>=5.3.0
|
|
26
|
+
Requires-Dist: openai>=1.50.0
|
|
27
|
+
Requires-Dist: playwright>=1.50.0
|
|
28
|
+
Requires-Dist: pydantic-settings>=2.8.0
|
|
29
|
+
Requires-Dist: python-dotenv>=1.2.2
|
|
30
|
+
Requires-Dist: requests>=2.33.1
|
|
31
|
+
Requires-Dist: rich>=13.9.0
|
|
32
|
+
Requires-Dist: tldextract>=5.3.1
|
|
33
|
+
Requires-Dist: trafilatura>=1.13.0
|
|
34
|
+
Requires-Dist: typer>=0.15.0
|
|
35
|
+
Description-Content-Type: text/markdown
|
|
36
|
+
|
|
37
|
+
# qd-browser
|
|
38
|
+
|
|
39
|
+
基于 Playwright 的综合爬虫 CLI 工具。
|
|
40
|
+
|
|
41
|
+
## 功能特性
|
|
42
|
+
|
|
43
|
+
- **浏览器自动化**: 基于 Playwright,支持动态网页爬取
|
|
44
|
+
- **内容解析**: 自动提取网页正文、元数据、附件链接
|
|
45
|
+
- **Markdown 导出**: 将网页内容转换为 Markdown 格式保存
|
|
46
|
+
- **附件下载**: 批量下载页面中的 PDF、Word、Excel 等附件
|
|
47
|
+
- **搜索引擎集成**: Serper + 百度搜索,API 失败自动 fallback 到浏览器搜索
|
|
48
|
+
- **URL 去重**: 全局访问历史记录,支持跳过已访问 URL
|
|
49
|
+
- **自动域名子目录**: 自动从 URL 解析域名作为输出子目录
|
|
50
|
+
- **LLM 内容生成**: 使用 NVIDIA 免费模型(Llama 3.1 等)生成内容,支持自动 fallback
|
|
51
|
+
|
|
52
|
+
## 安装
|
|
53
|
+
|
|
54
|
+
### 方式一:从 PyPI 安装(推荐)
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
# 使用 pip 安装
|
|
58
|
+
pip install qd-browser
|
|
59
|
+
|
|
60
|
+
# 使用 uv 安装
|
|
61
|
+
uv pip install qd-browser
|
|
62
|
+
|
|
63
|
+
# 安装后需要安装 Playwright 浏览器
|
|
64
|
+
playwright install chromium
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### 方式二:从 Git 安装
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
# 使用 pip 从 Git 安装
|
|
71
|
+
pip install git+https://github.com/juzcn/qd-browser.git
|
|
72
|
+
|
|
73
|
+
# 安装特定分支
|
|
74
|
+
pip install git+https://github.com/juzcn/qd-browser.git@main
|
|
75
|
+
|
|
76
|
+
# 安装特定 tag
|
|
77
|
+
pip install git+https://github.com/juzcn/qd-browser.git@v0.1.0
|
|
78
|
+
|
|
79
|
+
# 使用 uv 从 Git 安装
|
|
80
|
+
uv pip install git+https://github.com/juzcn/qd-browser.git
|
|
81
|
+
|
|
82
|
+
# 安装后需要安装 Playwright 浏览器
|
|
83
|
+
playwright install chromium
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### 方式三:从源码安装(开发者)
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
# 克隆仓库
|
|
90
|
+
git clone https://github.com/juzcn/qd-browser.git
|
|
91
|
+
cd qd-browser
|
|
92
|
+
|
|
93
|
+
# 使用 uv 安装依赖
|
|
94
|
+
uv sync
|
|
95
|
+
|
|
96
|
+
# 安装 Playwright Chromium 浏览器
|
|
97
|
+
uv run playwright install chromium
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### 方式四:从本地分发包安装
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
# 安装 wheel 包
|
|
104
|
+
pip install qd_browser-0.1.0-py3-none-any.whl
|
|
105
|
+
|
|
106
|
+
# 或安装源码包
|
|
107
|
+
pip install qd_browser-0.1.0.tar.gz
|
|
108
|
+
|
|
109
|
+
# 使用 uv 安装
|
|
110
|
+
uv pip install ./dist/qd_browser-0.1.0-py3-none-any.whl
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## 开发者:构建分发包
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
# 安装构建工具
|
|
117
|
+
uv sync --dev
|
|
118
|
+
|
|
119
|
+
# 构建 sdist 和 wheel
|
|
120
|
+
uv run python -m build
|
|
121
|
+
|
|
122
|
+
# 生成的包在 dist/ 目录下
|
|
123
|
+
ls dist/
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## 使用方法
|
|
127
|
+
|
|
128
|
+
### 基本用法
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
# 查看帮助
|
|
132
|
+
uv run qd-browser --help
|
|
133
|
+
|
|
134
|
+
# 爬取单个网页(自动下载附件,默认中文文件夹)
|
|
135
|
+
# 自动从 URL 解析域名作为子目录
|
|
136
|
+
uv run qd-browser url-download https://www.example.com
|
|
137
|
+
|
|
138
|
+
# 指定输出目录
|
|
139
|
+
uv run qd-browser url-download --output-dir ./my-output https://example.com
|
|
140
|
+
|
|
141
|
+
# 使用英文文件夹命名
|
|
142
|
+
uv run qd-browser url-download --language en https://example.com
|
|
143
|
+
|
|
144
|
+
# 使用稳定的 URL 哈希作为文件名后缀(跨进程一致)
|
|
145
|
+
uv run qd-browser url-download --hash-url https://example.com
|
|
146
|
+
|
|
147
|
+
# 调试模式
|
|
148
|
+
uv run qd-browser url-download --debug https://example.com
|
|
149
|
+
|
|
150
|
+
# 不跳过已访问的 URL(强制重新处理)
|
|
151
|
+
uv run qd-browser url-download --not-skip https://example.com
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### 命令说明
|
|
155
|
+
|
|
156
|
+
#### `url-download` - 爬取单个网页并下载内容(包含附件)
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
uv run qd-browser url-download [OPTIONS] URL
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
自动从 URL 解析域名作为输出子目录(例如 `output/example.com/`)。
|
|
163
|
+
|
|
164
|
+
选项:
|
|
165
|
+
- `--output-dir TEXT`: 输出目录(默认: ./output)
|
|
166
|
+
- `--language TEXT`: 语言:zh(中文)或 en(英文),影响文件夹命名(默认: zh)
|
|
167
|
+
- `--hash-url`: 使用稳定的 URL 哈希(MD5)作为文件名后缀
|
|
168
|
+
- `--debug`: 调试模式,保存原始 HTML
|
|
169
|
+
- `--url-title TEXT`: URL 的标题/描述,用于文件命名
|
|
170
|
+
- `--not-skip`: 不跳过已访问的 URL(强制重新处理)
|
|
171
|
+
|
|
172
|
+
#### `domain-download` - 通过搜索引擎批量爬取(包含附件)
|
|
173
|
+
|
|
174
|
+
```bash
|
|
175
|
+
uv run qd-browser domain-download <域名或URL> <搜索关键词>
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
通过 Serper 和百度搜索发现目标域名下的链接,然后批量爬取。
|
|
179
|
+
所有结果保存到以域名为名的子目录下。
|
|
180
|
+
|
|
181
|
+
选项:
|
|
182
|
+
- `--output-dir TEXT`: 输出目录(默认: ./output)
|
|
183
|
+
- `--language TEXT`: 语言:zh(中文)或 en(英文),影响文件夹命名(默认: zh)
|
|
184
|
+
- `--hash-url`: 使用稳定的 URL 哈希(MD5)作为文件名后缀
|
|
185
|
+
- `--debug`: 调试模式,保存原始 HTML
|
|
186
|
+
- `--not-skip`: 不跳过已访问的 URL(强制重新处理)
|
|
187
|
+
|
|
188
|
+
#### `web-download` - 全网搜索并批量爬取(包含附件)
|
|
189
|
+
|
|
190
|
+
```bash
|
|
191
|
+
uv run qd-browser web-download <搜索关键词>
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
全网搜索(不限制域名),每个 URL 自动保存到其对应域名的子目录下。
|
|
195
|
+
|
|
196
|
+
选项:
|
|
197
|
+
- `--output-dir TEXT`: 输出目录(默认: ./output)
|
|
198
|
+
- `--language TEXT`: 语言:zh(中文)或 en(英文),影响文件夹命名(默认: zh)
|
|
199
|
+
- `--hash-url`: 使用稳定的 URL 哈希(MD5)作为文件名后缀
|
|
200
|
+
- `--debug`: 调试模式,保存原始 HTML
|
|
201
|
+
- `--not-skip`: 不跳过已访问的 URL(强制重新处理)
|
|
202
|
+
|
|
203
|
+
#### `llm-download` - 使用 NVIDIA 免费模型生成内容
|
|
204
|
+
|
|
205
|
+
```bash
|
|
206
|
+
# 基本使用
|
|
207
|
+
uv run qd-browser llm-download "写一篇关于人工智能的文章"
|
|
208
|
+
|
|
209
|
+
# 带系统提示词
|
|
210
|
+
uv run qd-browser llm-download "解释量子计算" -s "你是一个科普作家"
|
|
211
|
+
|
|
212
|
+
# 自定义文件名前缀
|
|
213
|
+
uv run qd-browser llm-download "写代码" -f "python_tutorial"
|
|
214
|
+
|
|
215
|
+
# 调整生成参数
|
|
216
|
+
uv run qd-browser llm-download "创作诗歌" -m 8192 -t 0.9 -p 0.95
|
|
217
|
+
|
|
218
|
+
# 调试模式
|
|
219
|
+
uv run qd-browser llm-download "测试" --debug
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
使用 NVIDIA API Catalog 提供的免费模型,支持自动 fallback。
|
|
223
|
+
默认模型池(按优先级):
|
|
224
|
+
1. meta/llama-3.1-405b-instruct
|
|
225
|
+
2. meta/llama-3.1-70b-instruct
|
|
226
|
+
3. meta/llama-3.1-8b-instruct
|
|
227
|
+
4. meta/llama-3-70b-instruct
|
|
228
|
+
5. meta/llama-3-8b-instruct
|
|
229
|
+
|
|
230
|
+
选项:
|
|
231
|
+
- `--output-dir TEXT`: 输出目录(默认: ./output)
|
|
232
|
+
- `--system-prompt, -s TEXT`: 系统提示词(可选)
|
|
233
|
+
- `--filename-prefix, -f TEXT`: 文件名前缀(可选)
|
|
234
|
+
- `--max-tokens, -m INTEGER`: 最大生成 token 数(默认: 4096)
|
|
235
|
+
- `--temperature, -t FLOAT`: 温度参数 0.0-2.0(默认: 0.7)
|
|
236
|
+
- `--top-p, -p FLOAT`: top_p 参数 0.0-1.0(默认: 0.9)
|
|
237
|
+
- `--language TEXT`: 语言:zh(中文)或 en(英文)(默认: zh)
|
|
238
|
+
- `--debug`: 调试模式,显示详细错误信息
|
|
239
|
+
|
|
240
|
+
#### `config` - 配置和历史记录管理
|
|
241
|
+
|
|
242
|
+
```bash
|
|
243
|
+
# 显示当前配置
|
|
244
|
+
uv run qd-browser config --settings
|
|
245
|
+
uv run qd-browser config -s
|
|
246
|
+
|
|
247
|
+
# 历史记录统计
|
|
248
|
+
uv run qd-browser config --stats
|
|
249
|
+
uv run qd-browser config -t
|
|
250
|
+
|
|
251
|
+
# 列出历史记录
|
|
252
|
+
uv run qd-browser config --history
|
|
253
|
+
uv run qd-browser config -l
|
|
254
|
+
|
|
255
|
+
# 按状态过滤
|
|
256
|
+
uv run qd-browser config --history --by-status success
|
|
257
|
+
|
|
258
|
+
# 按域名过滤
|
|
259
|
+
uv run qd-browser config --history --by-site example.com
|
|
260
|
+
|
|
261
|
+
# 按输出目录过滤
|
|
262
|
+
uv run qd-browser config --history --by-output-dir ./output
|
|
263
|
+
|
|
264
|
+
# 按 URL 过滤
|
|
265
|
+
uv run qd-browser config --history --by-url https://example.com
|
|
266
|
+
uv run qd-browser config --history -c example.com
|
|
267
|
+
|
|
268
|
+
# 从历史记录中移除 URL
|
|
269
|
+
uv run qd-browser config --remove-url https://example.com
|
|
270
|
+
uv run qd-browser config -r https://example.com
|
|
271
|
+
|
|
272
|
+
# 移除指定域名的所有记录
|
|
273
|
+
uv run qd-browser config --remove-site example.com
|
|
274
|
+
|
|
275
|
+
# 移除指定输出目录的所有记录
|
|
276
|
+
uv run qd-browser config --remove-output-dir ./output
|
|
277
|
+
|
|
278
|
+
# 移除指定状态的所有记录
|
|
279
|
+
uv run qd-browser config --remove-status failed
|
|
280
|
+
|
|
281
|
+
# 清空所有历史记录
|
|
282
|
+
uv run qd-browser config --init
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
## 配置
|
|
286
|
+
|
|
287
|
+
可以通过环境变量配置,前缀为 `QD_`:
|
|
288
|
+
|
|
289
|
+
```bash
|
|
290
|
+
QD_OUTPUT_DIR=./my-output
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
也可以在项目根目录创建 `.env` 文件进行配置。
|
|
294
|
+
|
|
295
|
+
### 搜索 API 配置
|
|
296
|
+
|
|
297
|
+
支持多个 API key 用于 fallback:
|
|
298
|
+
|
|
299
|
+
```bash
|
|
300
|
+
SERPER_API_KEY=xxx
|
|
301
|
+
SERPER_API_KEY_1=xxx
|
|
302
|
+
SERPER_API_KEY_2=xxx
|
|
303
|
+
|
|
304
|
+
BAIDU_API_KEY=xxx
|
|
305
|
+
BAIDU_API_KEY_1=xxx
|
|
306
|
+
BAIDU_API_KEY_2=xxx
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
当所有 API key 都失败时,会自动使用浏览器进行搜索作为备用。
|
|
310
|
+
|
|
311
|
+
### NVIDIA LLM API 配置
|
|
312
|
+
|
|
313
|
+
使用 `llm-download` 命令需要配置:
|
|
314
|
+
|
|
315
|
+
```bash
|
|
316
|
+
NVAPI_KEY=nvapi-xxx
|
|
317
|
+
NVIDIA_BASE_URL=https://integrate.api.nvidia.com/v1
|
|
318
|
+
```
|
|
319
|
+
|
|
320
|
+
获取 API Key: https://build.nvidia.com/
|
|
321
|
+
|
|
322
|
+
### URL 历史记录
|
|
323
|
+
|
|
324
|
+
全局 URL 访问历史保存在:
|
|
325
|
+
- Windows: `C:\Users\用户名\.qd_browser\visited.json`
|
|
326
|
+
- Linux/Mac: `~/.qd_browser/visited.json`
|
|
327
|
+
|
|
328
|
+
## 项目结构
|
|
329
|
+
|
|
330
|
+
```
|
|
331
|
+
qd-browser/
|
|
332
|
+
├── src/qd_browser/
|
|
333
|
+
│ ├── __init__.py # 包初始化
|
|
334
|
+
│ ├── cli.py # CLI 入口
|
|
335
|
+
│ ├── config.py # 配置管理
|
|
336
|
+
│ ├── browser.py # 浏览器管理
|
|
337
|
+
│ ├── crawler.py # 爬虫核心
|
|
338
|
+
│ ├── parser.py # 内容解析
|
|
339
|
+
│ ├── downloader.py # 文件下载
|
|
340
|
+
│ ├── utils.py # 工具函数
|
|
341
|
+
│ ├── search.py # 搜索功能
|
|
342
|
+
│ ├── history.py # URL 历史记录
|
|
343
|
+
│ └── llm.py # LLM 生成(NVIDIA API)
|
|
344
|
+
├── output/ # 输出目录
|
|
345
|
+
│ └── attachments/ # 附件下载目录
|
|
346
|
+
├── pyproject.toml # 项目配置
|
|
347
|
+
└── README.md
|
|
348
|
+
```
|
|
349
|
+
|
|
350
|
+
## 技术栈
|
|
351
|
+
|
|
352
|
+
- **playwright**: 浏览器自动化
|
|
353
|
+
- **typer**: CLI 框架
|
|
354
|
+
- **beautifulsoup4**: HTML 解析
|
|
355
|
+
- **trafilatura**: 网页正文提取 + HTML 转 Markdown
|
|
356
|
+
- **rich**: 终端美化
|
|
357
|
+
- **pydantic-settings**: 配置管理
|
|
358
|
+
- **httpx**: HTTP 客户端
|
|
359
|
+
- **tldextract**: 域名解析
|
|
360
|
+
- **openai**: OpenAI SDK(用于 NVIDIA API)
|
|
361
|
+
- **hatchling**: 构建后端
|
|
362
|
+
|
|
363
|
+
## 开发者:发布到 PyPI
|
|
364
|
+
|
|
365
|
+
### 1. 构建分发包
|
|
366
|
+
|
|
367
|
+
```bash
|
|
368
|
+
# 清理旧的构建
|
|
369
|
+
rm -rf dist/ build/ *.egg-info/
|
|
370
|
+
|
|
371
|
+
# 构建 sdist 和 wheel
|
|
372
|
+
uv run python -m build
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
### 2. 上传到 TestPyPI(测试)
|
|
376
|
+
|
|
377
|
+
```bash
|
|
378
|
+
# 安装 twine
|
|
379
|
+
uv pip install twine
|
|
380
|
+
|
|
381
|
+
# 上传到 TestPyPI
|
|
382
|
+
uv run twine upload --repository testpypi dist/*
|
|
383
|
+
```
|
|
384
|
+
|
|
385
|
+
### 3. 从 TestPyPI 测试安装
|
|
386
|
+
|
|
387
|
+
```bash
|
|
388
|
+
pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ qd-browser
|
|
389
|
+
```
|
|
390
|
+
|
|
391
|
+
### 4. 上传到 PyPI(正式发布)
|
|
392
|
+
|
|
393
|
+
```bash
|
|
394
|
+
# 上传到 PyPI
|
|
395
|
+
uv run twine upload dist/*
|
|
396
|
+
```
|
|
397
|
+
|
|
398
|
+
### PyPI 账号配置
|
|
399
|
+
|
|
400
|
+
在 `~/.pypirc` 中配置:
|
|
401
|
+
|
|
402
|
+
```ini
|
|
403
|
+
[distutils]
|
|
404
|
+
index-servers =
|
|
405
|
+
    pypi
|
|
406
|
+
    testpypi
|
|
407
|
+
|
|
408
|
+
[pypi]
|
|
409
|
+
username = __token__
|
|
410
|
+
password = pypi-你的token
|
|
411
|
+
|
|
412
|
+
[testpypi]
|
|
413
|
+
repository = https://test.pypi.org/legacy/
|
|
414
|
+
username = __token__
|
|
415
|
+
password = pypi-你的testtoken
|
|
416
|
+
```
|