parsehub 2.0.17__tar.gz → 2.0.19__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsehub-2.0.19/PKG-INFO +381 -0
- parsehub-2.0.19/README.md +338 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/pyproject.toml +12 -1
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/__init__.py +23 -8
- parsehub-2.0.19/src/parsehub/cli.py +661 -0
- parsehub-2.0.19/src/parsehub/cli_config.py +292 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/provider_api/weibo.py +2 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/provider_api/xhs.py +4 -1
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/types/result.py +2 -2
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/utils/utils.py +8 -5
- parsehub-2.0.19/src/parsehub.egg-info/PKG-INFO +381 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub.egg-info/SOURCES.txt +7 -1
- parsehub-2.0.19/src/parsehub.egg-info/entry_points.txt +3 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub.egg-info/requires.txt +5 -0
- parsehub-2.0.19/test/test_cli.py +459 -0
- parsehub-2.0.19/test/test_cli_config.py +79 -0
- parsehub-2.0.19/test/test_core_offline.py +357 -0
- parsehub-2.0.17/PKG-INFO +0 -206
- parsehub-2.0.17/README.md +0 -167
- parsehub-2.0.17/src/parsehub.egg-info/PKG-INFO +0 -206
- {parsehub-2.0.17 → parsehub-2.0.19}/LICENSE +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/setup.cfg +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/__init__.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/config/__init__.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/config/config.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/errors.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/parsers/__init__.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/parsers/base/__init__.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/parsers/base/base.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/parsers/base/ytdlp.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/parsers/parser/__init__.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/parsers/parser/bilibili.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/parsers/parser/coolapk.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/parsers/parser/douyin.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/parsers/parser/facebook.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/parsers/parser/instagram.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/parsers/parser/kuaishou.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/parsers/parser/pipix.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/parsers/parser/threads.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/parsers/parser/tieba.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/parsers/parser/tiktok.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/parsers/parser/twitter.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/parsers/parser/weibo.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/parsers/parser/weixin.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/parsers/parser/xhs.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/parsers/parser/xiaoheihe.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/parsers/parser/youtube.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/parsers/parser/zuiyou.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/provider_api/__init__.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/provider_api/bilibili.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/provider_api/coolapk.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/provider_api/douyin.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/provider_api/instagram.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/provider_api/kuaishou.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/provider_api/pipix.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/provider_api/threads.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/provider_api/tieba.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/provider_api/tiktok.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/provider_api/twitter.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/provider_api/weixin.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/provider_api/xiaoheihe.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/provider_api/zuiyou.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/types/__init__.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/types/callback.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/types/media_file.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/types/media_ref.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/types/platform.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/types/post.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/utils/downloader.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub/utils/media_info.py +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub.egg-info/dependency_links.txt +0 -0
- {parsehub-2.0.17 → parsehub-2.0.19}/src/parsehub.egg-info/top_level.txt +0 -0
parsehub-2.0.19/PKG-INFO
ADDED
|
@@ -0,0 +1,381 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: parsehub
|
|
3
|
+
Version: 2.0.19
|
|
4
|
+
Summary: 轻量、异步、开箱即用的社交媒体聚合解析库
|
|
5
|
+
Author-email: 梓澪 <zilingmio@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Repository, https://github.com/z-mio/parsehub
|
|
8
|
+
Project-URL: Issues, https://github.com/z-mio/parsehub/issues
|
|
9
|
+
Keywords: parser,video-downloader,social-media,crawler,parsehub
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Topic :: Multimedia :: Video
|
|
14
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
15
|
+
Requires-Python: >=3.12.0
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Requires-Dist: aiofiles>=23.2
|
|
19
|
+
Requires-Dist: beautifulsoup4>=4.12.3
|
|
20
|
+
Requires-Dist: loguru>=0.6.0
|
|
21
|
+
Requires-Dist: pydub>=0.25.1
|
|
22
|
+
Requires-Dist: python-dotenv>=1.0.1
|
|
23
|
+
Requires-Dist: tenacity>=8.5.0
|
|
24
|
+
Requires-Dist: urlextract>=1.9.0
|
|
25
|
+
Requires-Dist: yt-dlp[default]
|
|
26
|
+
Requires-Dist: lxml>=5.3.0
|
|
27
|
+
Requires-Dist: instaloader>=4.14
|
|
28
|
+
Requires-Dist: pydantic>=1.10.19
|
|
29
|
+
Requires-Dist: markdownify>=1.1.0
|
|
30
|
+
Requires-Dist: markdown>=3.7
|
|
31
|
+
Requires-Dist: requests
|
|
32
|
+
Requires-Dist: httpx>=0.24.1
|
|
33
|
+
Requires-Dist: pillow>=12.1.0
|
|
34
|
+
Requires-Dist: python-slugify[unidecode]>=8.0.4
|
|
35
|
+
Requires-Dist: opencv-python-headless>=4.13.0.92
|
|
36
|
+
Requires-Dist: cryptography>=46.0.6
|
|
37
|
+
Requires-Dist: gmssl>=3.2.2
|
|
38
|
+
Provides-Extra: cli
|
|
39
|
+
Requires-Dist: argcomplete>=3.6.3; extra == "cli"
|
|
40
|
+
Requires-Dist: keyring>=25.6.0; extra == "cli"
|
|
41
|
+
Requires-Dist: platformdirs>=4.5.1; extra == "cli"
|
|
42
|
+
Dynamic: license-file
|
|
43
|
+
|
|
44
|
+
<div align="center">
|
|
45
|
+
|
|
46
|
+
# 🔗 ParseHub
|
|
47
|
+
|
|
48
|
+
**社交媒体聚合解析器**
|
|
49
|
+
|
|
50
|
+
[PyPI](https://pypi.org/project/parsehub/)
|
|
51
|
+
[Python](https://www.python.org/)
|
|
52
|
+
[License](LICENSE)
|
|
53
|
+
[GitHub](https://github.com/z-mio/parsehub)
|
|
54
|
+
|
|
55
|
+
轻量、异步、开箱即用的社交媒体解析与媒体下载库,支持 17+ 平台。
|
|
56
|
+
|
|
57
|
+
[安装](#-安装) · [CLI 快速使用](#-cli-快速使用) · [Python API](#-python-api-快速使用) · [支持平台](#-支持平台) · [高级用法](#-高级用法) · [TG Bot](https://github.com/z-mio/parse_hub_bot)
|
|
58
|
+
|
|
59
|
+
</div>
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## ✨ 特性
|
|
64
|
+
|
|
65
|
+
- 🌍 **广泛的平台支持** — 覆盖国内外 17+ 主流社交媒体平台
|
|
66
|
+
- 🧹 **链接清理** — 自动提取分享文案中的链接,并清除可移除的跟踪参数
|
|
67
|
+
- 🎬 **多媒体解析** — 支持视频、图文、动图、实况照片和富文本文章
|
|
68
|
+
- 📦 **同步 / 异步 API** — 同时提供 `async/await` 与 `*_sync` 调用方式
|
|
69
|
+
- 🤖 **Telegram Bot** — 基于本项目的 Bot 已上线 → [@ParseHuBot](https://t.me/ParsehuBot)
|
|
70
|
+
|
|
71
|
+
## 📦 安装
|
|
72
|
+
|
|
73
|
+
> Python ≥ 3.12
|
|
74
|
+
|
|
75
|
+
### 安装为命令行工具
|
|
76
|
+
|
|
77
|
+
如果主要把 ParseHub 当作 CLI 使用,推荐用 `pipx` 安装隔离的命令行环境:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
pipx install "parsehub[cli]"
|
|
81
|
+
ph --help
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### 安装为 Python 库
|
|
85
|
+
|
|
86
|
+
如果要在项目代码中调用 Python API:
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
# uv
|
|
90
|
+
uv add parsehub
|
|
91
|
+
|
|
92
|
+
# pip
|
|
93
|
+
pip install parsehub
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
项目内也需要完整 CLI 配置能力时,可安装 `cli` 扩展:
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
# uv
|
|
100
|
+
uv add "parsehub[cli]"
|
|
101
|
+
|
|
102
|
+
# pip
|
|
103
|
+
pip install "parsehub[cli]"
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## 🚀 CLI 快速使用
|
|
107
|
+
|
|
108
|
+
解析链接或分享文案:
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
parsehub "https://example.com/post/1"
|
|
112
|
+
|
|
113
|
+
# 短命令等价写法
|
|
114
|
+
ph "https://example.com/post/1"
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
下载媒体:
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
parsehub download "https://example.com/post/1" -o ./downloads
|
|
121
|
+
|
|
122
|
+
# 短命令等价写法
|
|
123
|
+
ph d "https://example.com/post/1" -o ./downloads
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
查看支持的平台:
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
ph platforms
|
|
130
|
+
# 或
|
|
131
|
+
ph ls
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
配置某个平台的代理和 Cookie:
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
# 同时设置解析代理和下载代理
|
|
138
|
+
ph set proxy xhs http://127.0.0.1:7890
|
|
139
|
+
|
|
140
|
+
# 只设置下载代理
|
|
141
|
+
ph set proxy xhs http://127.0.0.1:7891 --for download
|
|
142
|
+
|
|
143
|
+
# 保存 Cookie,输入时不会显示在终端里
|
|
144
|
+
ph set cookie xhs
|
|
145
|
+
|
|
146
|
+
# 查看配置状态
|
|
147
|
+
ph set list
|
|
148
|
+
ph set show xhs
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
配置会自动按平台应用到后续解析和下载;临时覆盖时仍可直接传参数:
|
|
152
|
+
|
|
153
|
+
```bash
|
|
154
|
+
ph "https://example.com/post/1" --proxy http://127.0.0.1:7890
|
|
155
|
+
ph d "https://example.com/post/1" --parse-proxy http://127.0.0.1:7890 --cookie "key=value"
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## 🐍 Python API 快速使用
|
|
159
|
+
|
|
160
|
+
### 同步解析
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
from parsehub import ParseHub
|
|
164
|
+
|
|
165
|
+
ph = ParseHub()
|
|
166
|
+
result = ph.parse_sync("https://www.xiaoheihe.cn/app/bbs/link/174972336")
|
|
167
|
+
|
|
168
|
+
print(result.title)
|
|
169
|
+
print(result.raw_url)
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
### 异步解析
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
import asyncio
|
|
176
|
+
from parsehub import ParseHub
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
async def main():
|
|
180
|
+
    ph = ParseHub()
|
|
181
|
+
    result = await ph.parse("https://tieba.baidu.com/p/9939510114")
|
|
182
|
+
    print(result)
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
asyncio.run(main())
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
### 下载媒体
|
|
189
|
+
|
|
190
|
+
```python
|
|
191
|
+
from parsehub import ParseHub
|
|
192
|
+
|
|
193
|
+
ph = ParseHub()
|
|
194
|
+
result = ph.download_sync(
|
|
195
|
+
"https://www.xiaoheihe.cn/app/bbs/link/174972336",
|
|
196
|
+
path="./downloads",
|
|
197
|
+
save_metadata=True,
|
|
198
|
+
)
|
|
199
|
+
|
|
200
|
+
print(result.output_dir)
|
|
201
|
+
print(result.media)
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
需要 Cookie 登录或解析代理时,可以直接在下载时传入解析参数:
|
|
205
|
+
|
|
206
|
+
```python
|
|
207
|
+
from parsehub import ParseHub
|
|
208
|
+
|
|
209
|
+
ph = ParseHub()
|
|
210
|
+
downloaded = ph.download_sync(
|
|
211
|
+
"https://example.com",
|
|
212
|
+
path="./downloads",
|
|
213
|
+
parse_cookie="key1=value1; key2=value2",
|
|
214
|
+
parse_proxy="http://127.0.0.1:7890",
|
|
215
|
+
save_metadata=True,
|
|
216
|
+
)
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
## 🌐 支持平台
|
|
220
|
+
|
|
221
|
+
| 平台 | 视频 | 图文 | 其他 |
|
|
222
|
+
|:----------------|:--:|:--:|:------|
|
|
223
|
+
| **Twitter / X** | ✅ | ✅ | |
|
|
224
|
+
| **Instagram** | ✅ | ✅ | |
|
|
225
|
+
| **YouTube** | ✅ | | 🎵 音乐 |
|
|
226
|
+
| **Facebook** | ✅ | | |
|
|
227
|
+
| **Threads** | ✅ | ✅ | |
|
|
228
|
+
| **Bilibili** | ✅ | | 📝 动态 |
|
|
229
|
+
| **抖音** | ✅ | ✅ | |
|
|
230
|
+
| **TikTok** | ✅ | ✅ | |
|
|
231
|
+
| **微博** | ✅ | ✅ | |
|
|
232
|
+
| **小红书** | ✅ | ✅ | |
|
|
233
|
+
| **贴吧** | ✅ | ✅ | |
|
|
234
|
+
| **微信公众号** | | ✅ | |
|
|
235
|
+
| **快手** | ✅ | | |
|
|
236
|
+
| **酷安** | | ✅ | |
|
|
237
|
+
| **皮皮虾** | ✅ | ✅ | |
|
|
238
|
+
| **最右** | ✅ | ✅ | |
|
|
239
|
+
| **小黑盒** | ✅ | ✅ | |
|
|
240
|
+
|
|
241
|
+
> 可通过 `ph ls` 或 `ParseHub().get_platforms()` 获取当前版本实际注册的平台列表。
|
|
242
|
+
|
|
243
|
+
## 🔑 高级用法
|
|
244
|
+
|
|
245
|
+
### 分享文案与平台识别
|
|
246
|
+
|
|
247
|
+
`url` 参数可以直接传分享文案,ParseHub 会自动提取其中的第一个链接:
|
|
248
|
+
|
|
249
|
+
```python
|
|
250
|
+
from parsehub import ParseHub
|
|
251
|
+
|
|
252
|
+
ph = ParseHub()
|
|
253
|
+
text = "复制这条分享 https://tieba.baidu.com/p/9939510114 后打开"
|
|
254
|
+
|
|
255
|
+
print(ph.get_platform(text))
|
|
256
|
+
print(ph.parse_sync(text).raw_url)
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
### Cookie 登录与代理
|
|
260
|
+
|
|
261
|
+
需要登录态的平台可传 Cookie;解析入口使用 `cookie` / `proxy`,下载入口使用 `parse_cookie` / `parse_proxy` 作为解析阶段参数。
|
|
262
|
+
|
|
263
|
+
```python
|
|
264
|
+
from parsehub import ParseHub
|
|
265
|
+
|
|
266
|
+
ph = ParseHub()
|
|
267
|
+
result = ph.parse_sync(
|
|
268
|
+
"https://example.com",
|
|
269
|
+
cookie="key1=value1; key2=value2",
|
|
270
|
+
proxy="http://127.0.0.1:7890",
|
|
271
|
+
)
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
Cookie 支持多种格式:
|
|
275
|
+
|
|
276
|
+
```python
|
|
277
|
+
from parsehub import ParseHub
|
|
278
|
+
|
|
279
|
+
ph = ParseHub()
|
|
280
|
+
|
|
281
|
+
# Cookie header 字符串
|
|
282
|
+
ph.parse_sync("https://example.com", cookie="key1=value1; key2=value2")
|
|
283
|
+
|
|
284
|
+
# JSON 字符串
|
|
285
|
+
ph.parse_sync("https://example.com", cookie='{"key1": "value1", "key2": "value2"}')
|
|
286
|
+
|
|
287
|
+
# 字典
|
|
288
|
+
ph.parse_sync("https://example.com", cookie={"key1": "value1", "key2": "value2"})
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
当前支持 Cookie 的平台包括:
|
|
292
|
+
|
|
293
|
+
- `Twitter / X`
|
|
294
|
+
- `Instagram`
|
|
295
|
+
- `YouTube`
|
|
296
|
+
- `Bilibili`
|
|
297
|
+
- `抖音`
|
|
298
|
+
- `TikTok`
|
|
299
|
+
- `快手`
|
|
300
|
+
|
|
301
|
+
### 下载进度回调
|
|
302
|
+
|
|
303
|
+
```python
|
|
304
|
+
from parsehub import ParseHub
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
class ProgressTracker:
|
|
308
|
+
    async def __call__(self, current: int, total: int, unit: str, *args, task_name: str = "", **kwargs):
|
|
309
|
+
        print(f"[{task_name}] {current}/{total} ({unit})")
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
result = ParseHub().download_sync(
|
|
313
|
+
"https://example.com",
|
|
314
|
+
path="./downloads",
|
|
315
|
+
callback=ProgressTracker(),
|
|
316
|
+
callback_args=("extra_arg",),
|
|
317
|
+
callback_kwargs={"task_name": "demo"},
|
|
318
|
+
)
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
`unit` 可能为:
|
|
322
|
+
|
|
323
|
+
- `bytes`:单文件下载时的字节进度
|
|
324
|
+
- `count`:多文件下载时的文件数量进度
|
|
325
|
+
|
|
326
|
+
### 保存 metadata.json
|
|
327
|
+
|
|
328
|
+
```python
|
|
329
|
+
from parsehub import ParseHub
|
|
330
|
+
|
|
331
|
+
result = ParseHub().download_sync(
|
|
332
|
+
"https://example.com",
|
|
333
|
+
path="./downloads",
|
|
334
|
+
save_metadata=True,
|
|
335
|
+
)
|
|
336
|
+
|
|
337
|
+
print(result.output_dir / "metadata.json")
|
|
338
|
+
```
|
|
339
|
+
|
|
340
|
+
### 全局配置
|
|
341
|
+
|
|
342
|
+
```python
|
|
343
|
+
from pathlib import Path
|
|
344
|
+
from parsehub.config import GlobalConfig
|
|
345
|
+
|
|
346
|
+
GlobalConfig.default_save_dir = Path("./downloads")
|
|
347
|
+
```
|
|
348
|
+
|
|
349
|
+
### 错误处理
|
|
350
|
+
|
|
351
|
+
```python
|
|
352
|
+
from parsehub import ParseHub
|
|
353
|
+
from parsehub.errors import ParseError, UnknownPlatform
|
|
354
|
+
|
|
355
|
+
try:
|
|
356
|
+
    result = ParseHub().parse_sync("https://example.com")
|
|
357
|
+
except UnknownPlatform:
|
|
358
|
+
    print("暂不支持该平台")
|
|
359
|
+
except ParseError as exc:
|
|
360
|
+
    print(f"解析失败: {exc}")
|
|
361
|
+
```
|
|
362
|
+
|
|
363
|
+
## 🤝 参考项目
|
|
364
|
+
|
|
365
|
+
- [Evil0ctal/Douyin_TikTok_Download_API](https://github.com/Evil0ctal/Douyin_TikTok_Download_API)
|
|
366
|
+
- [yt-dlp/yt-dlp](https://github.com/yt-dlp/yt-dlp)
|
|
367
|
+
- [instaloader/instaloader](https://github.com/instaloader/instaloader)
|
|
368
|
+
- [SocialSisterYi/bilibili-API-collect](https://github.com/SocialSisterYi/bilibili-API-collect)
|
|
369
|
+
- [Nemo2011/bilibili-api](https://github.com/Nemo2011/bilibili-api)
|
|
370
|
+
|
|
371
|
+
## 📜 开源协议
|
|
372
|
+
|
|
373
|
+
本项目基于 [MIT License](LICENSE) 开源。
|
|
374
|
+
|
|
375
|
+
---
|
|
376
|
+
|
|
377
|
+
<div align="center">
|
|
378
|
+
|
|
379
|
+
**如果这个项目对你有帮助,欢迎点个 ⭐ Star!**
|
|
380
|
+
|
|
381
|
+
</div>
|
|
parsehub-2.0.19/README.md
ADDED
|
@@ -0,0 +1,338 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
# 🔗 ParseHub
|
|
4
|
+
|
|
5
|
+
**社交媒体聚合解析器**
|
|
6
|
+
|
|
7
|
+
[PyPI](https://pypi.org/project/parsehub/)
|
|
8
|
+
[Python](https://www.python.org/)
|
|
9
|
+
[License](LICENSE)
|
|
10
|
+
[GitHub](https://github.com/z-mio/parsehub)
|
|
11
|
+
|
|
12
|
+
轻量、异步、开箱即用的社交媒体解析与媒体下载库,支持 17+ 平台。
|
|
13
|
+
|
|
14
|
+
[安装](#-安装) · [CLI 快速使用](#-cli-快速使用) · [Python API](#-python-api-快速使用) · [支持平台](#-支持平台) · [高级用法](#-高级用法) · [TG Bot](https://github.com/z-mio/parse_hub_bot)
|
|
15
|
+
|
|
16
|
+
</div>
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## ✨ 特性
|
|
21
|
+
|
|
22
|
+
- 🌍 **广泛的平台支持** — 覆盖国内外 17+ 主流社交媒体平台
|
|
23
|
+
- 🧹 **链接清理** — 自动提取分享文案中的链接,并清除可移除的跟踪参数
|
|
24
|
+
- 🎬 **多媒体解析** — 支持视频、图文、动图、实况照片和富文本文章
|
|
25
|
+
- 📦 **同步 / 异步 API** — 同时提供 `async/await` 与 `*_sync` 调用方式
|
|
26
|
+
- 🤖 **Telegram Bot** — 基于本项目的 Bot 已上线 → [@ParseHuBot](https://t.me/ParsehuBot)
|
|
27
|
+
|
|
28
|
+
## 📦 安装
|
|
29
|
+
|
|
30
|
+
> Python ≥ 3.12
|
|
31
|
+
|
|
32
|
+
### 安装为命令行工具
|
|
33
|
+
|
|
34
|
+
如果主要把 ParseHub 当作 CLI 使用,推荐用 `pipx` 安装隔离的命令行环境:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pipx install "parsehub[cli]"
|
|
38
|
+
ph --help
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
### 安装为 Python 库
|
|
42
|
+
|
|
43
|
+
如果要在项目代码中调用 Python API:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
# uv
|
|
47
|
+
uv add parsehub
|
|
48
|
+
|
|
49
|
+
# pip
|
|
50
|
+
pip install parsehub
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
项目内也需要完整 CLI 配置能力时,可安装 `cli` 扩展:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
# uv
|
|
57
|
+
uv add "parsehub[cli]"
|
|
58
|
+
|
|
59
|
+
# pip
|
|
60
|
+
pip install "parsehub[cli]"
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## 🚀 CLI 快速使用
|
|
64
|
+
|
|
65
|
+
解析链接或分享文案:
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
parsehub "https://example.com/post/1"
|
|
69
|
+
|
|
70
|
+
# 短命令等价写法
|
|
71
|
+
ph "https://example.com/post/1"
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
下载媒体:
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
parsehub download "https://example.com/post/1" -o ./downloads
|
|
78
|
+
|
|
79
|
+
# 短命令等价写法
|
|
80
|
+
ph d "https://example.com/post/1" -o ./downloads
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
查看支持的平台:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
ph platforms
|
|
87
|
+
# 或
|
|
88
|
+
ph ls
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
配置某个平台的代理和 Cookie:
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
# 同时设置解析代理和下载代理
|
|
95
|
+
ph set proxy xhs http://127.0.0.1:7890
|
|
96
|
+
|
|
97
|
+
# 只设置下载代理
|
|
98
|
+
ph set proxy xhs http://127.0.0.1:7891 --for download
|
|
99
|
+
|
|
100
|
+
# 保存 Cookie,输入时不会显示在终端里
|
|
101
|
+
ph set cookie xhs
|
|
102
|
+
|
|
103
|
+
# 查看配置状态
|
|
104
|
+
ph set list
|
|
105
|
+
ph set show xhs
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
配置会自动按平台应用到后续解析和下载;临时覆盖时仍可直接传参数:
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
ph "https://example.com/post/1" --proxy http://127.0.0.1:7890
|
|
112
|
+
ph d "https://example.com/post/1" --parse-proxy http://127.0.0.1:7890 --cookie "key=value"
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
## 🐍 Python API 快速使用
|
|
116
|
+
|
|
117
|
+
### 同步解析
|
|
118
|
+
|
|
119
|
+
```python
|
|
120
|
+
from parsehub import ParseHub
|
|
121
|
+
|
|
122
|
+
ph = ParseHub()
|
|
123
|
+
result = ph.parse_sync("https://www.xiaoheihe.cn/app/bbs/link/174972336")
|
|
124
|
+
|
|
125
|
+
print(result.title)
|
|
126
|
+
print(result.raw_url)
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### 异步解析
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
import asyncio
|
|
133
|
+
from parsehub import ParseHub
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
async def main():
|
|
137
|
+
    ph = ParseHub()
|
|
138
|
+
    result = await ph.parse("https://tieba.baidu.com/p/9939510114")
|
|
139
|
+
    print(result)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
asyncio.run(main())
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### 下载媒体
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
from parsehub import ParseHub
|
|
149
|
+
|
|
150
|
+
ph = ParseHub()
|
|
151
|
+
result = ph.download_sync(
|
|
152
|
+
"https://www.xiaoheihe.cn/app/bbs/link/174972336",
|
|
153
|
+
path="./downloads",
|
|
154
|
+
save_metadata=True,
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
print(result.output_dir)
|
|
158
|
+
print(result.media)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
需要 Cookie 登录或解析代理时,可以直接在下载时传入解析参数:
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
from parsehub import ParseHub
|
|
165
|
+
|
|
166
|
+
ph = ParseHub()
|
|
167
|
+
downloaded = ph.download_sync(
|
|
168
|
+
"https://example.com",
|
|
169
|
+
path="./downloads",
|
|
170
|
+
parse_cookie="key1=value1; key2=value2",
|
|
171
|
+
parse_proxy="http://127.0.0.1:7890",
|
|
172
|
+
save_metadata=True,
|
|
173
|
+
)
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
## 🌐 支持平台
|
|
177
|
+
|
|
178
|
+
| 平台 | 视频 | 图文 | 其他 |
|
|
179
|
+
|:----------------|:--:|:--:|:------|
|
|
180
|
+
| **Twitter / X** | ✅ | ✅ | |
|
|
181
|
+
| **Instagram** | ✅ | ✅ | |
|
|
182
|
+
| **YouTube** | ✅ | | 🎵 音乐 |
|
|
183
|
+
| **Facebook** | ✅ | | |
|
|
184
|
+
| **Threads** | ✅ | ✅ | |
|
|
185
|
+
| **Bilibili** | ✅ | | 📝 动态 |
|
|
186
|
+
| **抖音** | ✅ | ✅ | |
|
|
187
|
+
| **TikTok** | ✅ | ✅ | |
|
|
188
|
+
| **微博** | ✅ | ✅ | |
|
|
189
|
+
| **小红书** | ✅ | ✅ | |
|
|
190
|
+
| **贴吧** | ✅ | ✅ | |
|
|
191
|
+
| **微信公众号** | | ✅ | |
|
|
192
|
+
| **快手** | ✅ | | |
|
|
193
|
+
| **酷安** | | ✅ | |
|
|
194
|
+
| **皮皮虾** | ✅ | ✅ | |
|
|
195
|
+
| **最右** | ✅ | ✅ | |
|
|
196
|
+
| **小黑盒** | ✅ | ✅ | |
|
|
197
|
+
|
|
198
|
+
> 可通过 `ph ls` 或 `ParseHub().get_platforms()` 获取当前版本实际注册的平台列表。
|
|
199
|
+
|
|
200
|
+
## 🔑 高级用法
|
|
201
|
+
|
|
202
|
+
### 分享文案与平台识别
|
|
203
|
+
|
|
204
|
+
`url` 参数可以直接传分享文案,ParseHub 会自动提取其中的第一个链接:
|
|
205
|
+
|
|
206
|
+
```python
|
|
207
|
+
from parsehub import ParseHub
|
|
208
|
+
|
|
209
|
+
ph = ParseHub()
|
|
210
|
+
text = "复制这条分享 https://tieba.baidu.com/p/9939510114 后打开"
|
|
211
|
+
|
|
212
|
+
print(ph.get_platform(text))
|
|
213
|
+
print(ph.parse_sync(text).raw_url)
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
### Cookie 登录与代理
|
|
217
|
+
|
|
218
|
+
需要登录态的平台可传 Cookie;解析入口使用 `cookie` / `proxy`,下载入口使用 `parse_cookie` / `parse_proxy` 作为解析阶段参数。
|
|
219
|
+
|
|
220
|
+
```python
|
|
221
|
+
from parsehub import ParseHub
|
|
222
|
+
|
|
223
|
+
ph = ParseHub()
|
|
224
|
+
result = ph.parse_sync(
|
|
225
|
+
"https://example.com",
|
|
226
|
+
cookie="key1=value1; key2=value2",
|
|
227
|
+
proxy="http://127.0.0.1:7890",
|
|
228
|
+
)
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
Cookie 支持多种格式:
|
|
232
|
+
|
|
233
|
+
```python
|
|
234
|
+
from parsehub import ParseHub
|
|
235
|
+
|
|
236
|
+
ph = ParseHub()
|
|
237
|
+
|
|
238
|
+
# Cookie header 字符串
|
|
239
|
+
ph.parse_sync("https://example.com", cookie="key1=value1; key2=value2")
|
|
240
|
+
|
|
241
|
+
# JSON 字符串
|
|
242
|
+
ph.parse_sync("https://example.com", cookie='{"key1": "value1", "key2": "value2"}')
|
|
243
|
+
|
|
244
|
+
# 字典
|
|
245
|
+
ph.parse_sync("https://example.com", cookie={"key1": "value1", "key2": "value2"})
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
当前支持 Cookie 的平台包括:
|
|
249
|
+
|
|
250
|
+
- `Twitter / X`
|
|
251
|
+
- `Instagram`
|
|
252
|
+
- `YouTube`
|
|
253
|
+
- `Bilibili`
|
|
254
|
+
- `抖音`
|
|
255
|
+
- `TikTok`
|
|
256
|
+
- `快手`
|
|
257
|
+
|
|
258
|
+
### 下载进度回调
|
|
259
|
+
|
|
260
|
+
```python
|
|
261
|
+
from parsehub import ParseHub
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
class ProgressTracker:
|
|
265
|
+
    async def __call__(self, current: int, total: int, unit: str, *args, task_name: str = "", **kwargs):
|
|
266
|
+
        print(f"[{task_name}] {current}/{total} ({unit})")
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
result = ParseHub().download_sync(
|
|
270
|
+
"https://example.com",
|
|
271
|
+
path="./downloads",
|
|
272
|
+
callback=ProgressTracker(),
|
|
273
|
+
callback_args=("extra_arg",),
|
|
274
|
+
callback_kwargs={"task_name": "demo"},
|
|
275
|
+
)
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
`unit` 可能为:
|
|
279
|
+
|
|
280
|
+
- `bytes`:单文件下载时的字节进度
|
|
281
|
+
- `count`:多文件下载时的文件数量进度
|
|
282
|
+
|
|
283
|
+
### 保存 metadata.json
|
|
284
|
+
|
|
285
|
+
```python
|
|
286
|
+
from parsehub import ParseHub
|
|
287
|
+
|
|
288
|
+
result = ParseHub().download_sync(
|
|
289
|
+
"https://example.com",
|
|
290
|
+
path="./downloads",
|
|
291
|
+
save_metadata=True,
|
|
292
|
+
)
|
|
293
|
+
|
|
294
|
+
print(result.output_dir / "metadata.json")
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
### 全局配置
|
|
298
|
+
|
|
299
|
+
```python
|
|
300
|
+
from pathlib import Path
|
|
301
|
+
from parsehub.config import GlobalConfig
|
|
302
|
+
|
|
303
|
+
GlobalConfig.default_save_dir = Path("./downloads")
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
### 错误处理
|
|
307
|
+
|
|
308
|
+
```python
|
|
309
|
+
from parsehub import ParseHub
|
|
310
|
+
from parsehub.errors import ParseError, UnknownPlatform
|
|
311
|
+
|
|
312
|
+
try:
|
|
313
|
+
    result = ParseHub().parse_sync("https://example.com")
|
|
314
|
+
except UnknownPlatform:
|
|
315
|
+
    print("暂不支持该平台")
|
|
316
|
+
except ParseError as exc:
|
|
317
|
+
    print(f"解析失败: {exc}")
|
|
318
|
+
```
|
|
319
|
+
|
|
320
|
+
## 🤝 参考项目
|
|
321
|
+
|
|
322
|
+
- [Evil0ctal/Douyin_TikTok_Download_API](https://github.com/Evil0ctal/Douyin_TikTok_Download_API)
|
|
323
|
+
- [yt-dlp/yt-dlp](https://github.com/yt-dlp/yt-dlp)
|
|
324
|
+
- [instaloader/instaloader](https://github.com/instaloader/instaloader)
|
|
325
|
+
- [SocialSisterYi/bilibili-API-collect](https://github.com/SocialSisterYi/bilibili-API-collect)
|
|
326
|
+
- [Nemo2011/bilibili-api](https://github.com/Nemo2011/bilibili-api)
|
|
327
|
+
|
|
328
|
+
## 📜 开源协议
|
|
329
|
+
|
|
330
|
+
本项目基于 [MIT License](LICENSE) 开源。
|
|
331
|
+
|
|
332
|
+
---
|
|
333
|
+
|
|
334
|
+
<div align="center">
|
|
335
|
+
|
|
336
|
+
**如果这个项目对你有帮助,欢迎点个 ⭐ Star!**
|
|
337
|
+
|
|
338
|
+
</div>
|