parsehub 2.0.3__tar.gz → 2.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {parsehub-2.0.3/src/parsehub.egg-info → parsehub-2.0.5}/PKG-INFO +21 -2
- {parsehub-2.0.3 → parsehub-2.0.5}/README.md +20 -1
- {parsehub-2.0.3 → parsehub-2.0.5}/pyproject.toml +1 -1
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/__init__.py +26 -3
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/base/base.py +26 -4
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/base/ytdlp.py +7 -5
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/bilibili.py +13 -14
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/coolapk.py +8 -7
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/douyin.py +4 -6
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/kuaishou.py +0 -1
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/pipix.py +1 -2
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/threads.py +1 -1
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/tieba.py +2 -2
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/twitter.py +3 -3
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/weibo.py +3 -4
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/weixin.py +0 -1
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/xhs.py +2 -4
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/xiaoheihe.py +1 -1
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/zuiyou.py +0 -1
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/weibo.py +1 -1
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/types/callback.py +2 -1
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/types/result.py +27 -18
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/utils/downloader.py +8 -3
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/utils/utils.py +2 -20
- {parsehub-2.0.3 → parsehub-2.0.5/src/parsehub.egg-info}/PKG-INFO +21 -2
- {parsehub-2.0.3 → parsehub-2.0.5}/LICENSE +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/setup.cfg +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/__init__.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/config/__init__.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/config/config.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/errors.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/__init__.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/base/__init__.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/__init__.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/facebook.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/instagram.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/youtube.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/__init__.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/bilibili.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/coolapk.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/instagram.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/kuaishou.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/pipix.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/threads.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/tieba.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/twitter.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/weixin.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/xhs.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/xiaoheihe.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/zuiyou.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/types/__init__.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/types/media_file.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/types/media_ref.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/types/platform.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/types/post.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/utils/media_info.py +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub.egg-info/SOURCES.txt +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub.egg-info/dependency_links.txt +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub.egg-info/requires.txt +0 -0
- {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: parsehub
|
|
3
|
-
Version: 2.0.
|
|
3
|
+
Version: 2.0.5
|
|
4
4
|
Summary: 轻量、异步、开箱即用的社交媒体聚合解析库
|
|
5
5
|
Author-email: 梓澪 <zilingmio@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -119,6 +119,25 @@ print(result)
|
|
|
119
119
|
|
|
120
120
|
## 🔑 高级用法
|
|
121
121
|
|
|
122
|
+
### 下载进度回调
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from parsehub import ParseHub
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
class ProgressTracker:
|
|
129
|
+
async def __call__(self, current: int, total: int, unit: str, *args, task_name: str = "", **kwargs):
|
|
130
|
+
print(f"[{task_name}] {current}/{total} ({unit})")
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
result = ParseHub().download_sync(
|
|
134
|
+
"https://example.com",
|
|
135
|
+
callback=ProgressTracker(),
|
|
136
|
+
callback_args=("extra_arg",),
|
|
137
|
+
callback_kwargs={"task_name": "demo"},
|
|
138
|
+
)
|
|
139
|
+
```
|
|
140
|
+
|
|
122
141
|
### Cookie 登录 & 代理
|
|
123
142
|
|
|
124
143
|
部分平台的内容需要登录才能访问,通过 Cookie 即可解锁:
|
|
@@ -126,7 +145,7 @@ print(result)
|
|
|
126
145
|
```python
|
|
127
146
|
from parsehub import ParseHub
|
|
128
147
|
|
|
129
|
-
ph = ParseHub(cookie="key1=value1; key2=value2", proxy="http://127.0.0.1:7890",)
|
|
148
|
+
ph = ParseHub(cookie="key1=value1; key2=value2", proxy="http://127.0.0.1:7890", )
|
|
130
149
|
```
|
|
131
150
|
|
|
132
151
|
Cookie 支持多种格式传入:
|
|
@@ -82,6 +82,25 @@ print(result)
|
|
|
82
82
|
|
|
83
83
|
## 🔑 高级用法
|
|
84
84
|
|
|
85
|
+
### 下载进度回调
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
from parsehub import ParseHub
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class ProgressTracker:
|
|
92
|
+
async def __call__(self, current: int, total: int, unit: str, *args, task_name: str = "", **kwargs):
|
|
93
|
+
print(f"[{task_name}] {current}/{total} ({unit})")
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
result = ParseHub().download_sync(
|
|
97
|
+
"https://example.com",
|
|
98
|
+
callback=ProgressTracker(),
|
|
99
|
+
callback_args=("extra_arg",),
|
|
100
|
+
callback_kwargs={"task_name": "demo"},
|
|
101
|
+
)
|
|
102
|
+
```
|
|
103
|
+
|
|
85
104
|
### Cookie 登录 & 代理
|
|
86
105
|
|
|
87
106
|
部分平台的内容需要登录才能访问,通过 Cookie 即可解锁:
|
|
@@ -89,7 +108,7 @@ print(result)
|
|
|
89
108
|
```python
|
|
90
109
|
from parsehub import ParseHub
|
|
91
110
|
|
|
92
|
-
ph = ParseHub(cookie="key1=value1; key2=value2", proxy="http://127.0.0.1:7890",)
|
|
111
|
+
ph = ParseHub(cookie="key1=value1; key2=value2", proxy="http://127.0.0.1:7890", )
|
|
93
112
|
```
|
|
94
113
|
|
|
95
114
|
Cookie 支持多种格式传入:
|
|
@@ -46,14 +46,18 @@ class ParseHub:
|
|
|
46
46
|
*,
|
|
47
47
|
callback: ProgressCallback = None,
|
|
48
48
|
callback_args: tuple = (),
|
|
49
|
+
callback_kwargs: dict | None = None,
|
|
49
50
|
proxy: str | None = None,
|
|
51
|
+
save_metadata: bool = False,
|
|
50
52
|
) -> DownloadResult:
|
|
51
53
|
"""下载
|
|
52
54
|
:param url: 分享文案 / 分享链接
|
|
53
55
|
:param path: 保存路径
|
|
54
56
|
:param callback: 下载进度回调函数
|
|
55
57
|
:param callback_args: 下载进度回调函数参数
|
|
58
|
+
:param callback_kwargs: 回调函数的关键字参数
|
|
56
59
|
:param proxy: 代理
|
|
60
|
+
:param save_metadata: 保存解析结果为 metadata.json, 默认为 False
|
|
57
61
|
:return: DownloadResult
|
|
58
62
|
|
|
59
63
|
Note:
|
|
@@ -68,7 +72,14 @@ class ParseHub:
|
|
|
68
72
|
- ``count``: 计数进度,用于多文件下载时报告已完成/总文件数
|
|
69
73
|
"""
|
|
70
74
|
result = await self.parse(url)
|
|
71
|
-
return await result.download(
|
|
75
|
+
return await result.download(
|
|
76
|
+
path,
|
|
77
|
+
callback=callback,
|
|
78
|
+
callback_args=callback_args,
|
|
79
|
+
callback_kwargs=callback_kwargs,
|
|
80
|
+
proxy=proxy,
|
|
81
|
+
save_metadata=save_metadata,
|
|
82
|
+
)
|
|
72
83
|
|
|
73
84
|
def download_sync(
|
|
74
85
|
self,
|
|
@@ -76,7 +87,9 @@ class ParseHub:
|
|
|
76
87
|
path: str | Path | None = None,
|
|
77
88
|
callback: ProgressCallback | None = None,
|
|
78
89
|
callback_args: tuple = (),
|
|
90
|
+
callback_kwargs: dict | None = None,
|
|
79
91
|
proxy: str | None = None,
|
|
92
|
+
save_metadata: bool = False,
|
|
80
93
|
) -> DownloadResult:
|
|
81
94
|
"""
|
|
82
95
|
同步下载
|
|
@@ -84,7 +97,9 @@ class ParseHub:
|
|
|
84
97
|
:param path: 下载路径
|
|
85
98
|
:param callback: 进度回调函数
|
|
86
99
|
:param callback_args: 进度回调函数参数
|
|
100
|
+
:param callback_kwargs: 回调函数的关键字参数
|
|
87
101
|
:param proxy: 代理
|
|
102
|
+
:param save_metadata: 保存解析结果为 metadata.json, 默认为 False
|
|
88
103
|
:return: DownloadResult
|
|
89
104
|
|
|
90
105
|
Note:
|
|
@@ -99,7 +114,15 @@ class ParseHub:
|
|
|
99
114
|
- ``count``: 计数进度,用于多文件下载时报告已完成/总文件数
|
|
100
115
|
"""
|
|
101
116
|
return get_event_loop().run_until_complete(
|
|
102
|
-
self.download(
|
|
117
|
+
self.download(
|
|
118
|
+
url,
|
|
119
|
+
path,
|
|
120
|
+
callback=callback,
|
|
121
|
+
callback_args=callback_args,
|
|
122
|
+
callback_kwargs=callback_kwargs,
|
|
123
|
+
proxy=proxy,
|
|
124
|
+
save_metadata=save_metadata,
|
|
125
|
+
)
|
|
103
126
|
)
|
|
104
127
|
|
|
105
128
|
async def get_raw_url(self, url: str, proxy: str | None = None) -> str:
|
|
@@ -110,7 +133,7 @@ class ParseHub:
|
|
|
110
133
|
"""
|
|
111
134
|
parser = self.get_parser(url)
|
|
112
135
|
try:
|
|
113
|
-
return await parser(proxy=proxy).get_raw_url(url)
|
|
136
|
+
return await parser(proxy=proxy).get_raw_url(url, after_clean_parameters=True)
|
|
114
137
|
except Exception as e:
|
|
115
138
|
raise ParseError from e
|
|
116
139
|
|
|
@@ -25,6 +25,8 @@ class BaseParser(ABC):
|
|
|
25
25
|
"""匹配规则"""
|
|
26
26
|
__reserved_parameters__: list[str] = []
|
|
27
27
|
"""要保留的参数, 例如翻页. 默认清除全部参数"""
|
|
28
|
+
__after_clean_parameters__: list[str] = []
|
|
29
|
+
"""解析完成后需要清理的参数, 在解析完成前会保留这些参数, 优先级高于 __reserved_parameters__"""
|
|
28
30
|
__redirect_keywords__: list[str] = []
|
|
29
31
|
"""如果链接包含其中之一, 则遵循重定向规则"""
|
|
30
32
|
|
|
@@ -61,9 +63,10 @@ class BaseParser(ABC):
|
|
|
61
63
|
:param url: 分享文案 / 分享链接
|
|
62
64
|
:return: 解析结果
|
|
63
65
|
"""
|
|
64
|
-
raw_url = await self.get_raw_url(url)
|
|
66
|
+
raw_url = await self.get_raw_url(url, after_clean_parameters=False)
|
|
65
67
|
result = await self._do_parse(raw_url)
|
|
66
68
|
result.platform = self.__platform__
|
|
69
|
+
result.raw_url = self._clean_params(raw_url, self.__after_clean_parameters__)
|
|
67
70
|
return result
|
|
68
71
|
|
|
69
72
|
@abstractmethod
|
|
@@ -73,10 +76,11 @@ class BaseParser(ABC):
|
|
|
73
76
|
"""
|
|
74
77
|
raise NotImplementedError
|
|
75
78
|
|
|
76
|
-
async def get_raw_url(self, url: str) -> str:
|
|
79
|
+
async def get_raw_url(self, url: str, after_clean_parameters: bool = False) -> str:
|
|
77
80
|
"""
|
|
78
81
|
清除链接中的参数
|
|
79
82
|
:param url: 链接
|
|
83
|
+
:param after_clean_parameters: 是否执行后清理参数
|
|
80
84
|
:return:
|
|
81
85
|
"""
|
|
82
86
|
url = match_url(url)
|
|
@@ -101,7 +105,25 @@ class BaseParser(ABC):
|
|
|
101
105
|
query_params = parse_qs(parsed_url.query)
|
|
102
106
|
|
|
103
107
|
for i in query_params.copy().keys():
|
|
104
|
-
|
|
105
|
-
|
|
108
|
+
is_reserved = i in self.__reserved_parameters__
|
|
109
|
+
is_after_clean = i in self.__after_clean_parameters__
|
|
110
|
+
keep = (is_reserved and not (after_clean_parameters and is_after_clean)) or (
|
|
111
|
+
is_after_clean and not after_clean_parameters
|
|
112
|
+
)
|
|
113
|
+
if not keep:
|
|
114
|
+
query_params.pop(i, None)
|
|
115
|
+
|
|
116
|
+
new_query = urlencode(query_params, doseq=True)
|
|
117
|
+
return parsed_url._replace(query=new_query).geturl()
|
|
118
|
+
|
|
119
|
+
@staticmethod
|
|
120
|
+
def _clean_params(url: str, params: list[str]) -> str:
|
|
121
|
+
"""清除链接中的指定参数"""
|
|
122
|
+
if not params:
|
|
123
|
+
return url
|
|
124
|
+
parsed_url = urlparse(url)
|
|
125
|
+
query_params = parse_qs(parsed_url.query)
|
|
126
|
+
for p in params:
|
|
127
|
+
query_params.pop(p, None)
|
|
106
128
|
new_query = urlencode(query_params, doseq=True)
|
|
107
129
|
return parsed_url._replace(query=new_query).geturl()
|
|
@@ -39,7 +39,6 @@ class YtParser(BaseParser, register=False):
|
|
|
39
39
|
dl=video_info,
|
|
40
40
|
title=video_info.title,
|
|
41
41
|
content=video_info.description,
|
|
42
|
-
raw_url=raw_url,
|
|
43
42
|
video=VideoRef(
|
|
44
43
|
url=raw_url,
|
|
45
44
|
thumb_url=video_info.thumbnail,
|
|
@@ -115,11 +114,10 @@ class YtVideoParseResult(VideoParseResult):
|
|
|
115
114
|
title,
|
|
116
115
|
video=None,
|
|
117
116
|
content=None,
|
|
118
|
-
raw_url=None,
|
|
119
117
|
):
|
|
120
118
|
"""dl: yt-dlp解析结果"""
|
|
121
119
|
self.dl = dl
|
|
122
|
-
super().__init__(title=title, video=video, content=content
|
|
120
|
+
super().__init__(title=title, video=video, content=content)
|
|
123
121
|
|
|
124
122
|
async def _do_download(
|
|
125
123
|
self,
|
|
@@ -127,9 +125,13 @@ class YtVideoParseResult(VideoParseResult):
|
|
|
127
125
|
output_dir: str | Path,
|
|
128
126
|
callback: ProgressCallback | None = None,
|
|
129
127
|
callback_args: tuple = (),
|
|
128
|
+
callback_kwargs: dict | None = None,
|
|
130
129
|
proxy: str | None = None,
|
|
131
130
|
headers: dict | None = None,
|
|
132
131
|
) -> "DownloadResult":
|
|
132
|
+
if callback_kwargs is None:
|
|
133
|
+
callback_kwargs = {}
|
|
134
|
+
|
|
133
135
|
paramss = self.dl.paramss.copy()
|
|
134
136
|
if proxy:
|
|
135
137
|
paramss["proxy"] = proxy
|
|
@@ -141,7 +143,7 @@ class YtVideoParseResult(VideoParseResult):
|
|
|
141
143
|
# paramss["format"] = "worstvideo* + worstaudio / worst"
|
|
142
144
|
|
|
143
145
|
if callback:
|
|
144
|
-
await callback(0, 1, "count", *callback_args)
|
|
146
|
+
await callback(0, 1, "count", *callback_args, **callback_kwargs)
|
|
145
147
|
|
|
146
148
|
await self.__download(paramss)
|
|
147
149
|
|
|
@@ -150,7 +152,7 @@ class YtVideoParseResult(VideoParseResult):
|
|
|
150
152
|
raise DownloadError("下载失败 -1")
|
|
151
153
|
|
|
152
154
|
if callback:
|
|
153
|
-
await callback(1, 1, "count", *callback_args)
|
|
155
|
+
await callback(1, 1, "count", *callback_args, **callback_kwargs)
|
|
154
156
|
|
|
155
157
|
video_path = v[0]
|
|
156
158
|
return DownloadResult(
|
|
@@ -3,8 +3,6 @@ from pathlib import Path
|
|
|
3
3
|
from typing import Union
|
|
4
4
|
from urllib.parse import parse_qs, urlparse
|
|
5
5
|
|
|
6
|
-
import httpx
|
|
7
|
-
|
|
8
6
|
from ...config.config import GlobalConfig
|
|
9
7
|
from ...provider_api.bilibili import BiliAPI, BiliDynamic
|
|
10
8
|
from ...types import (
|
|
@@ -30,8 +28,8 @@ class BiliParse(YtParser):
|
|
|
30
28
|
__redirect_keywords__ = ["b23.tv", "bili2233.cn"]
|
|
31
29
|
|
|
32
30
|
async def _do_parse(self, raw_url: str) -> Union["YtVideoParseResult", "BiliVideoParseResult", ImageParseResult]:
|
|
33
|
-
if
|
|
34
|
-
dynamic = await self.get_dynamic_info(
|
|
31
|
+
if await self.is_dynamic(raw_url):
|
|
32
|
+
dynamic = await self.get_dynamic_info(raw_url)
|
|
35
33
|
content = self.hashtag_handler(dynamic.content)
|
|
36
34
|
photos = []
|
|
37
35
|
if dynamic.images:
|
|
@@ -44,7 +42,6 @@ class BiliParse(YtParser):
|
|
|
44
42
|
title=dynamic.title,
|
|
45
43
|
content=content,
|
|
46
44
|
photo=photos,
|
|
47
|
-
raw_url=ourl,
|
|
48
45
|
)
|
|
49
46
|
else:
|
|
50
47
|
try:
|
|
@@ -69,18 +66,16 @@ class BiliParse(YtParser):
|
|
|
69
66
|
else:
|
|
70
67
|
return super().match(url)
|
|
71
68
|
|
|
72
|
-
async def get_raw_url(self, url: str) -> str:
|
|
69
|
+
async def get_raw_url(self, url: str, after_clean_parameters: bool = False) -> str:
|
|
73
70
|
"""获取原始链接"""
|
|
74
71
|
if self._is_bvid(url):
|
|
75
72
|
return f"https://www.bilibili.com/video/{url}"
|
|
76
73
|
else:
|
|
77
|
-
return await super().get_raw_url(url)
|
|
74
|
+
return await super().get_raw_url(url, after_clean_parameters=after_clean_parameters)
|
|
78
75
|
|
|
79
|
-
|
|
76
|
+
@staticmethod
|
|
77
|
+
async def is_dynamic(url) -> str | None:
|
|
80
78
|
"""是动态"""
|
|
81
|
-
async with httpx.AsyncClient(proxy=self.proxy) as cli:
|
|
82
|
-
url = str((await cli.get(url, follow_redirects=True, timeout=30)).url)
|
|
83
|
-
|
|
84
79
|
if re.search(r"\b\d{18,19}\b", url):
|
|
85
80
|
return url
|
|
86
81
|
return None
|
|
@@ -128,7 +123,6 @@ class BiliParse(YtParser):
|
|
|
128
123
|
video_url = self.change_source(durl["backup_url"][0]) if durl.get("backup_url") else durl["url"]
|
|
129
124
|
return BiliVideoParseResult(
|
|
130
125
|
title=data["View"]["title"],
|
|
131
|
-
raw_url=url,
|
|
132
126
|
content=f"P{p}: {part}" if part else "",
|
|
133
127
|
video=VideoRef(
|
|
134
128
|
url=video_url,
|
|
@@ -143,7 +137,6 @@ class BiliParse(YtParser):
|
|
|
143
137
|
result = await super()._do_parse(url)
|
|
144
138
|
return YtVideoParseResult(
|
|
145
139
|
title=result.title,
|
|
146
|
-
raw_url=result.raw_url,
|
|
147
140
|
dl=result.dl,
|
|
148
141
|
video=result.media,
|
|
149
142
|
)
|
|
@@ -173,12 +166,18 @@ class BiliVideoParseResult(VideoParseResult):
|
|
|
173
166
|
output_dir: str | Path,
|
|
174
167
|
callback: ProgressCallback | None = None,
|
|
175
168
|
callback_args: tuple = (),
|
|
169
|
+
callback_kwargs: dict | None = None,
|
|
176
170
|
proxy: str | None = None,
|
|
177
171
|
headers: dict | None = None,
|
|
178
172
|
) -> "DownloadResult":
|
|
179
173
|
headers = {"referer": "https://www.bilibili.com", "User-Agent": GlobalConfig.ua}
|
|
180
174
|
return await super()._do_download(
|
|
181
|
-
output_dir=output_dir,
|
|
175
|
+
output_dir=output_dir,
|
|
176
|
+
callback=callback,
|
|
177
|
+
callback_args=callback_args,
|
|
178
|
+
callback_kwargs=callback_kwargs,
|
|
179
|
+
proxy=proxy,
|
|
180
|
+
headers=headers,
|
|
182
181
|
)
|
|
183
182
|
|
|
184
183
|
|
|
@@ -14,7 +14,6 @@ from ...types import (
|
|
|
14
14
|
ProgressCallback,
|
|
15
15
|
RichTextParseResult,
|
|
16
16
|
)
|
|
17
|
-
from ...utils.utils import clear_params
|
|
18
17
|
from ..base.base import BaseParser
|
|
19
18
|
|
|
20
19
|
|
|
@@ -22,12 +21,11 @@ class CoolapkParser(BaseParser):
|
|
|
22
21
|
__platform__ = Platform.COOLAPK
|
|
23
22
|
__supported_type__ = ["图文"]
|
|
24
23
|
__match__ = r"^(http(s)?://)www.coolapk.com/(feed|picture)/.*"
|
|
25
|
-
|
|
24
|
+
__after_clean_parameters__ = ["shareKey", "s"]
|
|
26
25
|
|
|
27
26
|
async def _do_parse(
|
|
28
27
|
self, raw_url: str
|
|
29
28
|
) -> Union["CoolapkImageParseResult", "CoolapkRichTextParseResult", "CoolapkMultimediaParseResult"]:
|
|
30
|
-
raw_url_ = clear_params(raw_url, ["s", "shareKey"])
|
|
31
29
|
try:
|
|
32
30
|
coolapk = await Coolapk.parse(raw_url, proxy=self.proxy)
|
|
33
31
|
except Exception as e:
|
|
@@ -38,20 +36,17 @@ class CoolapkParser(BaseParser):
|
|
|
38
36
|
title=coolapk.title,
|
|
39
37
|
media=media,
|
|
40
38
|
markdown_content=coolapk.markdown_content,
|
|
41
|
-
raw_url=raw_url_,
|
|
42
39
|
)
|
|
43
40
|
if any(isinstance(m, AniRef) for m in media):
|
|
44
41
|
return CoolapkMultimediaParseResult(
|
|
45
42
|
title=coolapk.title,
|
|
46
43
|
media=media,
|
|
47
44
|
content=coolapk.text_content,
|
|
48
|
-
raw_url=raw_url_,
|
|
49
45
|
)
|
|
50
46
|
return CoolapkImageParseResult(
|
|
51
47
|
title=coolapk.title,
|
|
52
48
|
photo=media,
|
|
53
49
|
content=coolapk.text_content,
|
|
54
|
-
raw_url=raw_url_,
|
|
55
50
|
)
|
|
56
51
|
|
|
57
52
|
|
|
@@ -62,6 +57,7 @@ class CoolapkParseResult(ParseResult):
|
|
|
62
57
|
output_dir: str | Path,
|
|
63
58
|
callback: ProgressCallback = None,
|
|
64
59
|
callback_args: tuple = (),
|
|
60
|
+
callback_kwargs: dict | None = None,
|
|
65
61
|
proxy: str | None = None,
|
|
66
62
|
headers: dict = None,
|
|
67
63
|
) -> "DownloadResult":
|
|
@@ -72,7 +68,12 @@ class CoolapkParseResult(ParseResult):
|
|
|
72
68
|
)
|
|
73
69
|
}
|
|
74
70
|
return await super()._do_download(
|
|
75
|
-
output_dir=output_dir,
|
|
71
|
+
output_dir=output_dir,
|
|
72
|
+
callback=callback,
|
|
73
|
+
callback_args=callback_args,
|
|
74
|
+
callback_kwargs=callback_kwargs,
|
|
75
|
+
proxy=proxy,
|
|
76
|
+
headers=headers,
|
|
76
77
|
)
|
|
77
78
|
|
|
78
79
|
|
|
@@ -30,9 +30,9 @@ class DouyinParser(BaseParser):
|
|
|
30
30
|
|
|
31
31
|
match data.type:
|
|
32
32
|
case DYType.VIDEO:
|
|
33
|
-
return await self.video_parse(
|
|
33
|
+
return await self.video_parse(data)
|
|
34
34
|
case DYType.IMAGE:
|
|
35
|
-
return await self.image_parse(
|
|
35
|
+
return await self.image_parse(data)
|
|
36
36
|
|
|
37
37
|
@staticmethod
|
|
38
38
|
async def parse_api(url) -> "DYResult":
|
|
@@ -52,17 +52,15 @@ class DouyinParser(BaseParser):
|
|
|
52
52
|
return DYResult.parse(url, response.json())
|
|
53
53
|
|
|
54
54
|
@staticmethod
|
|
55
|
-
async def video_parse(
|
|
55
|
+
async def video_parse(result: "DYResult"):
|
|
56
56
|
return VideoParseResult(
|
|
57
|
-
raw_url=url,
|
|
58
57
|
title=result.desc,
|
|
59
58
|
video=result.video,
|
|
60
59
|
)
|
|
61
60
|
|
|
62
61
|
@staticmethod
|
|
63
|
-
async def image_parse(
|
|
62
|
+
async def image_parse(result: "DYResult"):
|
|
64
63
|
return ImageParseResult(
|
|
65
|
-
raw_url=url,
|
|
66
64
|
title=result.desc,
|
|
67
65
|
photo=result.image_list,
|
|
68
66
|
)
|
|
@@ -27,10 +27,9 @@ class PipixParser(BaseParser):
|
|
|
27
27
|
height=ppx.video_height,
|
|
28
28
|
width=ppx.video_width,
|
|
29
29
|
),
|
|
30
|
-
raw_url=raw_url,
|
|
31
30
|
)
|
|
32
31
|
else:
|
|
33
|
-
return ImageParseResult(title=ppx.content, photo=ppx.img_url
|
|
32
|
+
return ImageParseResult(title=ppx.content, photo=ppx.img_url)
|
|
34
33
|
|
|
35
34
|
|
|
36
35
|
__all__ = ["PipixParser"]
|
|
@@ -19,7 +19,7 @@ class ThreadsParser(BaseParser):
|
|
|
19
19
|
media.append(VideoRef(url=m.url, thumb_url=m.thumb_url, width=m.width, height=m.height))
|
|
20
20
|
case ThreadsMediaType.IMAGE:
|
|
21
21
|
media.append(ImageRef(url=m.url, thumb_url=m.url, width=m.width, height=m.height))
|
|
22
|
-
return MultimediaParseResult(content=post.content, media=media
|
|
22
|
+
return MultimediaParseResult(content=post.content, media=media)
|
|
23
23
|
|
|
24
24
|
|
|
25
25
|
__all__ = ["ThreadsParser"]
|
|
@@ -17,9 +17,9 @@ class TieBaParser(BaseParser):
|
|
|
17
17
|
raise ParseError("贴吧解析失败") from e
|
|
18
18
|
|
|
19
19
|
if tb.video_url:
|
|
20
|
-
return VideoParseResult(title=tb.title, video=tb.video_url,
|
|
20
|
+
return VideoParseResult(title=tb.title, video=tb.video_url, content=tb.content)
|
|
21
21
|
else:
|
|
22
|
-
return ImageParseResult(title=tb.title, photo=tb.img_url,
|
|
22
|
+
return ImageParseResult(title=tb.title, photo=tb.img_url, content=tb.content)
|
|
23
23
|
|
|
24
24
|
|
|
25
25
|
__all__ = ["TieBaParser"]
|
|
@@ -21,8 +21,8 @@ class TwitterParser(BaseParser):
|
|
|
21
21
|
tweet = await self._parse(raw_url)
|
|
22
22
|
return await self.media_parse(raw_url, tweet)
|
|
23
23
|
|
|
24
|
-
async def get_raw_url(self, url: str) -> str:
|
|
25
|
-
url = await super().get_raw_url(url)
|
|
24
|
+
async def get_raw_url(self, url: str, after_clean_parameters: bool = False) -> str:
|
|
25
|
+
url = await super().get_raw_url(url, after_clean_parameters=after_clean_parameters)
|
|
26
26
|
return str(urlunparse(urlparse(url)._replace(netloc="x.com")))
|
|
27
27
|
|
|
28
28
|
async def _parse(self, url: str):
|
|
@@ -63,7 +63,7 @@ class TwitterParser(BaseParser):
|
|
|
63
63
|
case TwitterAni():
|
|
64
64
|
path = AniRef(url=m.url, ext="mp4", height=m.height, width=m.width, thumb_url=m.thumb_url)
|
|
65
65
|
media.append(path)
|
|
66
|
-
return MultimediaParseResult(content=tweet.full_text, media=media
|
|
66
|
+
return MultimediaParseResult(content=tweet.full_text, media=media)
|
|
67
67
|
|
|
68
68
|
|
|
69
69
|
__all__ = ["TwitterParser"]
|
|
@@ -29,7 +29,6 @@ class WeiboParser(BaseParser):
|
|
|
29
29
|
if data.page_info.object_type == MediaType.VIDEO:
|
|
30
30
|
return VideoParseResult(
|
|
31
31
|
content=text,
|
|
32
|
-
raw_url=raw_url,
|
|
33
32
|
video=VideoRef(
|
|
34
33
|
url=data.page_info.media_info.playback.url,
|
|
35
34
|
thumb_url=data.page_info.page_pic,
|
|
@@ -45,7 +44,7 @@ class WeiboParser(BaseParser):
|
|
|
45
44
|
or (data.mix_media_info and data.mix_media_info.items)
|
|
46
45
|
)
|
|
47
46
|
if not media_info:
|
|
48
|
-
return MultimediaParseResult(content=text,
|
|
47
|
+
return MultimediaParseResult(content=text, media=[])
|
|
49
48
|
|
|
50
49
|
for i in media_info:
|
|
51
50
|
match i.type:
|
|
@@ -70,8 +69,8 @@ class WeiboParser(BaseParser):
|
|
|
70
69
|
case _:
|
|
71
70
|
media.append(ImageRef(url=i.media_url, thumb_url=i.thumb_url, width=i.width, height=i.height))
|
|
72
71
|
if all((isinstance(m, ImageRef) or isinstance(m, LivePhotoRef)) for m in media):
|
|
73
|
-
return ImageParseResult(content=text,
|
|
74
|
-
return MultimediaParseResult(content=text,
|
|
72
|
+
return ImageParseResult(content=text, photo=media)
|
|
73
|
+
return MultimediaParseResult(content=text, media=media)
|
|
75
74
|
|
|
76
75
|
def f_text(self, text: str) -> str:
|
|
77
76
|
# text = re.sub(r'<a href="https://video.weibo.com.*?>.*的微博视频.*</a>', "", text)
|
|
@@ -14,7 +14,6 @@ from ...types import (
|
|
|
14
14
|
VideoParseResult,
|
|
15
15
|
VideoRef,
|
|
16
16
|
)
|
|
17
|
-
from ...utils.utils import clear_params
|
|
18
17
|
from ..base import BaseParser
|
|
19
18
|
|
|
20
19
|
|
|
@@ -23,15 +22,14 @@ class XHSParser(BaseParser):
|
|
|
23
22
|
__supported_type__ = ["视频", "图文"]
|
|
24
23
|
__match__ = r"^(http(s)?://)?.+(xiaohongshu|xhslink).com/.+"
|
|
25
24
|
__redirect_keywords__ = ["xhslink", "item"]
|
|
26
|
-
|
|
25
|
+
__after_clean_parameters__ = ["xsec_token"]
|
|
27
26
|
|
|
28
27
|
async def _do_parse(self, raw_url: str) -> Union["VideoParseResult", "ImageParseResult", "MultimediaParseResult"]:
|
|
29
|
-
raw_url_ = clear_params(raw_url, "xsec_token")
|
|
30
28
|
xhs = XHSAPI(proxy=self.proxy)
|
|
31
29
|
result = await xhs.extract(raw_url)
|
|
32
30
|
|
|
33
31
|
desc = self.hashtag_handler(result.desc)
|
|
34
|
-
k = {"title": result.title, "content": desc, "raw_url":
|
|
32
|
+
k = {"title": result.title, "content": desc, "raw_url": raw_url}
|
|
35
33
|
match result.type:
|
|
36
34
|
case XHSPostType.VIDEO:
|
|
37
35
|
v: XHSMedia = result.media[0]
|
|
@@ -31,7 +31,7 @@ class XiaoHeiHeParser(BaseParser):
|
|
|
31
31
|
return ImageParseResult(photo=media, **v)
|
|
32
32
|
return MultimediaParseResult(media=media, **v)
|
|
33
33
|
case XiaoHeiHePostType.ARTICLE:
|
|
34
|
-
return RichTextParseResult(title=xhh.title, media=media, markdown_content=xhh.content
|
|
34
|
+
return RichTextParseResult(title=xhh.title, media=media, markdown_content=xhh.content)
|
|
35
35
|
|
|
36
36
|
@staticmethod
|
|
37
37
|
def __parse_media(xhh: XiaoHeiHePost):
|
|
@@ -6,7 +6,7 @@ ProgressUnit = Literal["bytes", "count"]
|
|
|
6
6
|
class ProgressCallback(Protocol):
|
|
7
7
|
"""下载进度回调: (current, total, unit, *args) -> None"""
|
|
8
8
|
|
|
9
|
-
async def __call__(self, current: int, total: int, unit: ProgressUnit, *args) -> None:
|
|
9
|
+
async def __call__(self, current: int, total: int, unit: ProgressUnit, *args, **kwargs) -> None:
|
|
10
10
|
"""
|
|
11
11
|
下载进度回调
|
|
12
12
|
Args:
|
|
@@ -16,6 +16,7 @@ class ProgressCallback(Protocol):
|
|
|
16
16
|
- ``bytes``: 字节进度,用于单文件下载时报告已下载/总字节数
|
|
17
17
|
- ``count``: 计数进度,用于多文件下载时报告已完成/总文件数
|
|
18
18
|
*args: 自定义参数
|
|
19
|
+
**kwargs: 自定义关键字参数
|
|
19
20
|
|
|
20
21
|
Returns:
|
|
21
22
|
None
|
|
@@ -29,7 +29,6 @@ class ParseResult(ABC): # noqa: B024
|
|
|
29
29
|
|
|
30
30
|
def __init__(
|
|
31
31
|
self,
|
|
32
|
-
raw_url: str,
|
|
33
32
|
title: str = "",
|
|
34
33
|
content: str = "",
|
|
35
34
|
media: list[AnyMediaRef] | AnyMediaRef | None = None,
|
|
@@ -39,13 +38,12 @@ class ParseResult(ABC): # noqa: B024
|
|
|
39
38
|
:param title: 标题
|
|
40
39
|
:param media: 媒体下载链接
|
|
41
40
|
:param content: 正文 (纯文本)
|
|
42
|
-
:param raw_url: 原始帖子链接
|
|
43
41
|
:param platform: 平台
|
|
44
42
|
"""
|
|
43
|
+
self.raw_url = None
|
|
45
44
|
self.title = (title or "").strip()
|
|
46
45
|
self.content = (content or "").strip()
|
|
47
46
|
self.media = media
|
|
48
|
-
self.raw_url = raw_url
|
|
49
47
|
self.platform = platform
|
|
50
48
|
|
|
51
49
|
def __repr__(self):
|
|
@@ -79,6 +77,7 @@ class ParseResult(ABC): # noqa: B024
|
|
|
79
77
|
output_dir: str | Path,
|
|
80
78
|
callback: ProgressCallback | None = None,
|
|
81
79
|
callback_args: tuple = (),
|
|
80
|
+
callback_kwargs: dict | None = None,
|
|
82
81
|
proxy: str | None = None,
|
|
83
82
|
headers: dict | None = None,
|
|
84
83
|
) -> "DownloadResult":
|
|
@@ -87,6 +86,7 @@ class ParseResult(ABC): # noqa: B024
|
|
|
87
86
|
:param output_dir: 输出的子目录
|
|
88
87
|
:param callback: 下载进度回调函数
|
|
89
88
|
:param callback_args: 回调函数的参数
|
|
89
|
+
:param callback_kwargs: 回调函数的关键字参数
|
|
90
90
|
:param proxy: 代理
|
|
91
91
|
:param headers: 请求头
|
|
92
92
|
:return: DownloadResult
|
|
@@ -99,13 +99,15 @@ class ParseResult(ABC): # noqa: B024
|
|
|
99
99
|
for i, media in enumerate(media_list):
|
|
100
100
|
dl_progress = None
|
|
101
101
|
dl_progress_args = ()
|
|
102
|
+
dl_progress_kwargs = {}
|
|
102
103
|
if callback and is_single:
|
|
103
104
|
|
|
104
|
-
async def _byte_callback(current, total, *args):
|
|
105
|
-
await callback(current, total, "bytes", *args)
|
|
105
|
+
async def _byte_callback(current, total, *args, **kwargs):
|
|
106
|
+
await callback(current, total, "bytes", *args, **kwargs)
|
|
106
107
|
|
|
107
108
|
dl_progress = _byte_callback
|
|
108
109
|
dl_progress_args = callback_args
|
|
110
|
+
dl_progress_kwargs = callback_kwargs
|
|
109
111
|
|
|
110
112
|
try:
|
|
111
113
|
f = await download(
|
|
@@ -115,6 +117,7 @@ class ParseResult(ABC): # noqa: B024
|
|
|
115
117
|
proxies=proxy,
|
|
116
118
|
progress=dl_progress,
|
|
117
119
|
progress_args=dl_progress_args,
|
|
120
|
+
progress_kwargs=dl_progress_kwargs,
|
|
118
121
|
)
|
|
119
122
|
except Exception as e:
|
|
120
123
|
shutil.rmtree(output_dir, ignore_errors=True)
|
|
@@ -162,6 +165,7 @@ class ParseResult(ABC): # noqa: B024
|
|
|
162
165
|
*,
|
|
163
166
|
callback: ProgressCallback | None = None,
|
|
164
167
|
callback_args: tuple = (),
|
|
168
|
+
callback_kwargs: dict | None = None,
|
|
165
169
|
proxy: str | None = None,
|
|
166
170
|
save_metadata: bool = False,
|
|
167
171
|
) -> "DownloadResult":
|
|
@@ -169,6 +173,7 @@ class ParseResult(ABC): # noqa: B024
|
|
|
169
173
|
:param path: 保存路径
|
|
170
174
|
:param callback: 下载进度回调函数
|
|
171
175
|
:param callback_args: 下载进度回调函数参数
|
|
176
|
+
:param callback_kwargs: 回调函数的关键字参数
|
|
172
177
|
:param proxy: 代理
|
|
173
178
|
:param save_metadata: 保存解析结果为 metadata.json, 默认为 False
|
|
174
179
|
:return: DownloadResult
|
|
@@ -176,7 +181,7 @@ class ParseResult(ABC): # noqa: B024
|
|
|
176
181
|
Note:
|
|
177
182
|
下载进度回调函数签名::
|
|
178
183
|
|
|
179
|
-
async def callback(current: int, total: int, unit: Literal['bytes', 'count'], *args) -> None
|
|
184
|
+
async def callback(current: int, total: int, unit: Literal['bytes', 'count'], *args, **kwargs) -> None
|
|
180
185
|
|
|
181
186
|
- current: 当前进度值
|
|
182
187
|
- total: 总进度值
|
|
@@ -201,7 +206,11 @@ class ParseResult(ABC): # noqa: B024
|
|
|
201
206
|
|
|
202
207
|
try:
|
|
203
208
|
return await self._do_download(
|
|
204
|
-
output_dir=output_dir,
|
|
209
|
+
output_dir=output_dir,
|
|
210
|
+
callback=callback,
|
|
211
|
+
callback_args=callback_args,
|
|
212
|
+
callback_kwargs=callback_kwargs,
|
|
213
|
+
proxy=proxy,
|
|
205
214
|
)
|
|
206
215
|
except Exception as e:
|
|
207
216
|
shutil.rmtree(output_dir, ignore_errors=True)
|
|
@@ -213,6 +222,7 @@ class ParseResult(ABC): # noqa: B024
|
|
|
213
222
|
*,
|
|
214
223
|
callback: ProgressCallback | None = None,
|
|
215
224
|
callback_args: tuple = (),
|
|
225
|
+
callback_kwargs: dict | None = None,
|
|
216
226
|
proxy: str | None = None,
|
|
217
227
|
save_metadata: bool = False,
|
|
218
228
|
) -> "DownloadResult":
|
|
@@ -220,6 +230,7 @@ class ParseResult(ABC): # noqa: B024
|
|
|
220
230
|
:param path: 保存路径
|
|
221
231
|
:param callback: 下载进度回调函数
|
|
222
232
|
:param callback_args: 下载进度回调函数参数
|
|
233
|
+
:param callback_kwargs: 回调函数的关键字参数
|
|
223
234
|
:param proxy: 代理
|
|
224
235
|
:param save_metadata: 保存解析结果为 metadata.json, 默认为 False
|
|
225
236
|
:return: DownloadResult
|
|
@@ -237,7 +248,12 @@ class ParseResult(ABC): # noqa: B024
|
|
|
237
248
|
"""
|
|
238
249
|
return get_event_loop().run_until_complete(
|
|
239
250
|
self.download(
|
|
240
|
-
path,
|
|
251
|
+
path,
|
|
252
|
+
callback=callback,
|
|
253
|
+
callback_args=callback_args,
|
|
254
|
+
callback_kwargs=callback_kwargs,
|
|
255
|
+
proxy=proxy,
|
|
256
|
+
save_metadata=save_metadata,
|
|
241
257
|
)
|
|
242
258
|
)
|
|
243
259
|
|
|
@@ -249,7 +265,6 @@ class VideoParseResult(ParseResult):
|
|
|
249
265
|
|
|
250
266
|
def __init__(
|
|
251
267
|
self,
|
|
252
|
-
raw_url: str,
|
|
253
268
|
title: str = "",
|
|
254
269
|
video: str | VideoRef | None = None,
|
|
255
270
|
content: str = "",
|
|
@@ -259,7 +274,6 @@ class VideoParseResult(ParseResult):
|
|
|
259
274
|
title=title,
|
|
260
275
|
media=video,
|
|
261
276
|
content=content,
|
|
262
|
-
raw_url=raw_url,
|
|
263
277
|
)
|
|
264
278
|
|
|
265
279
|
|
|
@@ -270,14 +284,13 @@ class ImageParseResult(ParseResult):
|
|
|
270
284
|
|
|
271
285
|
def __init__(
|
|
272
286
|
self,
|
|
273
|
-
raw_url: str,
|
|
274
287
|
title: str = "",
|
|
275
288
|
photo: list[str | ImageRef | LivePhotoRef] | None = None,
|
|
276
289
|
content: str = "",
|
|
277
290
|
):
|
|
278
291
|
if photo:
|
|
279
292
|
photo = [ImageRef(url=p) if isinstance(p, str) else p for p in photo]
|
|
280
|
-
super().__init__(title=title, media=photo, content=content, raw_url=raw_url)
|
|
293
|
+
super().__init__(title=title, media=photo, content=content)
|
|
281
294
|
|
|
282
295
|
|
|
283
296
|
class MultimediaParseResult(ParseResult):
|
|
@@ -287,12 +300,11 @@ class MultimediaParseResult(ParseResult):
|
|
|
287
300
|
|
|
288
301
|
def __init__(
|
|
289
302
|
self,
|
|
290
|
-
raw_url: str,
|
|
291
303
|
title: str = "",
|
|
292
304
|
media: list[AnyMediaRef] | None = None,
|
|
293
305
|
content: str = "",
|
|
294
306
|
):
|
|
295
|
-
super().__init__(title=title, media=media, content=content, raw_url=raw_url)
|
|
307
|
+
super().__init__(title=title, media=media, content=content)
|
|
296
308
|
|
|
297
309
|
|
|
298
310
|
class RichTextParseResult(ParseResult):
|
|
@@ -302,20 +314,17 @@ class RichTextParseResult(ParseResult):
|
|
|
302
314
|
|
|
303
315
|
def __init__(
|
|
304
316
|
self,
|
|
305
|
-
raw_url: str,
|
|
306
317
|
title: str = "",
|
|
307
318
|
media: list[AnyMediaRef] | None = None,
|
|
308
319
|
markdown_content: str = "",
|
|
309
320
|
):
|
|
310
321
|
"""
|
|
311
|
-
|
|
312
322
|
:param title: 标题
|
|
313
323
|
:param media: 文章中的媒体
|
|
314
324
|
:param markdown_content: markdown 格式正文
|
|
315
|
-
:param raw_url: 原始 URL
|
|
316
325
|
"""
|
|
317
326
|
self.markdown_content = markdown_content
|
|
318
|
-
super().__init__(title=title, media=media, content=self.plaintext_content, raw_url=raw_url)
|
|
327
|
+
super().__init__(title=title, media=media, content=self.plaintext_content)
|
|
319
328
|
|
|
320
329
|
def __repr__(self):
|
|
321
330
|
media_count = f"[{len(self.media if isinstance(self.media, list) else [self.media])}]" if self.media else None
|
|
@@ -3,6 +3,7 @@ import os
|
|
|
3
3
|
import re
|
|
4
4
|
from collections.abc import Callable
|
|
5
5
|
from pathlib import Path
|
|
6
|
+
from typing import Literal
|
|
6
7
|
from urllib.parse import unquote, urlparse
|
|
7
8
|
|
|
8
9
|
import aiofiles
|
|
@@ -17,6 +18,7 @@ async def download(
|
|
|
17
18
|
proxies: httpx.Proxy | None = None,
|
|
18
19
|
progress: Callable | None = None,
|
|
19
20
|
progress_args: tuple = (),
|
|
21
|
+
progress_kwargs: dict | None = None,
|
|
20
22
|
max_retries: int = 3,
|
|
21
23
|
chunk_size: int = 8192,
|
|
22
24
|
) -> str:
|
|
@@ -26,7 +28,8 @@ async def download(
|
|
|
26
28
|
:param headers: 请求头
|
|
27
29
|
:param proxies: 代理
|
|
28
30
|
:param progress: 下载进度回调函数
|
|
29
|
-
:param progress_args:
|
|
31
|
+
:param progress_args: 下载进度回调函数的参数
|
|
32
|
+
:param progress_kwargs: 下载进度回调函数的关键字参数
|
|
30
33
|
:param max_retries: 最大重试次数
|
|
31
34
|
:param chunk_size: 分块大小
|
|
32
35
|
:return: 文件路径
|
|
@@ -78,7 +81,7 @@ async def download(
|
|
|
78
81
|
|
|
79
82
|
current = resume_pos if is_resumed else 0
|
|
80
83
|
|
|
81
|
-
file_mode = "ab" if is_resumed else "wb"
|
|
84
|
+
file_mode: Literal["ab", "wb"] = "ab" if is_resumed else "wb"
|
|
82
85
|
|
|
83
86
|
async with aiofiles.open(file=resolved_path, mode=file_mode) as f:
|
|
84
87
|
async for chunk in r.aiter_bytes(chunk_size=chunk_size):
|
|
@@ -86,7 +89,9 @@ async def download(
|
|
|
86
89
|
await f.write(chunk)
|
|
87
90
|
current += len(chunk)
|
|
88
91
|
if progress:
|
|
89
|
-
|
|
92
|
+
if progress_kwargs is None:
|
|
93
|
+
progress_kwargs = {}
|
|
94
|
+
await progress(current, total_size, *progress_args, **progress_kwargs)
|
|
90
95
|
|
|
91
96
|
# 完整性校验
|
|
92
97
|
if 0 < total_size != current:
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import json
|
|
3
3
|
import re
|
|
4
|
-
from urllib.parse import parse_qs, urlparse
|
|
5
4
|
|
|
6
5
|
from urlextract import URLExtract
|
|
7
6
|
|
|
@@ -15,7 +14,7 @@ def get_event_loop():
|
|
|
15
14
|
return event_loop
|
|
16
15
|
|
|
17
16
|
|
|
18
|
-
|
|
17
|
+
_url_extractor = URLExtract()
|
|
19
18
|
|
|
20
19
|
|
|
21
20
|
def match_url(text: str) -> str:
|
|
@@ -23,7 +22,7 @@ def match_url(text: str) -> str:
|
|
|
23
22
|
if not text:
|
|
24
23
|
return ""
|
|
25
24
|
text = re.sub(r"(https?://)", r" \1", text) # 协议前面增加空格, 方便提取
|
|
26
|
-
url = URLExtract().find_urls(text, only_unique=True)
|
|
25
|
+
url = _url_extractor.find_urls(text, only_unique=True)
|
|
27
26
|
return url[0] if url else ""
|
|
28
27
|
|
|
29
28
|
|
|
@@ -35,23 +34,6 @@ def cookie_ellipsis(cookie: dict) -> str:
|
|
|
35
34
|
return f"{text[:c]}......{text[-c:]}"
|
|
36
35
|
|
|
37
36
|
|
|
38
|
-
def clear_params(url: str, param: str | list[str]) -> str:
|
|
39
|
-
"""
|
|
40
|
-
删除链接指定参数
|
|
41
|
-
:param url: 链接
|
|
42
|
-
:param param: 参数
|
|
43
|
-
:return:
|
|
44
|
-
"""
|
|
45
|
-
params = param if isinstance(param, list) else [param]
|
|
46
|
-
parsed_url = urlparse(url)
|
|
47
|
-
query_params = parse_qs(parsed_url.query)
|
|
48
|
-
for i in params.copy():
|
|
49
|
-
if i in query_params:
|
|
50
|
-
del query_params[i]
|
|
51
|
-
new_query = "&".join([f"{k}={v[0]}" for k, v in query_params.items()])
|
|
52
|
-
return parsed_url._replace(query=new_query).geturl()
|
|
53
|
-
|
|
54
|
-
|
|
55
37
|
def normalize_cookie(v):
|
|
56
38
|
if v is None or isinstance(v, dict):
|
|
57
39
|
return v
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: parsehub
|
|
3
|
-
Version: 2.0.3
|
|
3
|
+
Version: 2.0.5
|
|
4
4
|
Summary: 轻量、异步、开箱即用的社交媒体聚合解析库
|
|
5
5
|
Author-email: 梓澪 <zilingmio@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -119,6 +119,25 @@ print(result)
|
|
|
119
119
|
|
|
120
120
|
## 🔑 高级用法
|
|
121
121
|
|
|
122
|
+
### 下载进度回调
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from parsehub import ParseHub
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
class ProgressTracker:
|
|
129
|
+
async def __call__(self, current: int, total: int, unit: str, *args, task_name: str = "", **kwargs):
|
|
130
|
+
print(f"[{task_name}] {current}/{total} ({unit})")
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
result = ParseHub().download_sync(
|
|
134
|
+
"https://example.com",
|
|
135
|
+
callback=ProgressTracker(),
|
|
136
|
+
callback_args=("extra_arg",),
|
|
137
|
+
callback_kwargs={"task_name": "demo"},
|
|
138
|
+
)
|
|
139
|
+
```
|
|
140
|
+
|
|
122
141
|
### Cookie 登录 & 代理
|
|
123
142
|
|
|
124
143
|
部分平台的内容需要登录才能访问,通过 Cookie 即可解锁:
|
|
@@ -126,7 +145,7 @@ print(result)
|
|
|
126
145
|
```python
|
|
127
146
|
from parsehub import ParseHub
|
|
128
147
|
|
|
129
|
-
ph = ParseHub(cookie="key1=value1; key2=value2", proxy="http://127.0.0.1:7890",)
|
|
148
|
+
ph = ParseHub(cookie="key1=value1; key2=value2", proxy="http://127.0.0.1:7890", )
|
|
130
149
|
```
|
|
131
150
|
|
|
132
151
|
Cookie 支持多种格式传入:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|