parsehub 2.0.5__tar.gz → 2.0.7__tar.gz
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- {parsehub-2.0.5/src/parsehub.egg-info → parsehub-2.0.7}/PKG-INFO +1 -1
- {parsehub-2.0.5 → parsehub-2.0.7}/pyproject.toml +1 -1
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/__init__.py +24 -2
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/base/base.py +25 -6
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/bilibili.py +2 -2
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/instagram.py +6 -4
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/twitter.py +4 -4
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/xhs.py +4 -3
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/xiaoheihe.py +3 -4
- {parsehub-2.0.5 → parsehub-2.0.7/src/parsehub.egg-info}/PKG-INFO +1 -1
- {parsehub-2.0.5 → parsehub-2.0.7}/LICENSE +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/README.md +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/setup.cfg +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/__init__.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/config/__init__.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/config/config.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/errors.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/__init__.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/base/__init__.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/base/ytdlp.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/__init__.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/coolapk.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/douyin.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/facebook.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/kuaishou.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/pipix.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/threads.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/tieba.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/weibo.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/weixin.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/youtube.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/zuiyou.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/__init__.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/bilibili.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/coolapk.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/instagram.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/kuaishou.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/pipix.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/threads.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/tieba.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/twitter.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/weibo.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/weixin.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/xhs.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/xiaoheihe.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/zuiyou.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/types/__init__.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/types/callback.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/types/media_file.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/types/media_ref.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/types/platform.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/types/post.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/types/result.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/utils/downloader.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/utils/media_info.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/utils/utils.py +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub.egg-info/SOURCES.txt +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub.egg-info/dependency_links.txt +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub.egg-info/requires.txt +0 -0
- {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub.egg-info/top_level.txt +0 -0
|
@@ -125,15 +125,37 @@ class ParseHub:
|
|
|
125
125
|
)
|
|
126
126
|
)
|
|
127
127
|
|
|
128
|
-
async def get_raw_url(self, url: str, proxy: str | None = None) -> str:
|
|
128
|
+
async def get_raw_url(self, url: str, proxy: str | None = None, clean_all: bool = True) -> str:
|
|
129
129
|
"""获取原始链接
|
|
130
130
|
:param url: 分享文案 / 分享链接
|
|
131
131
|
:param proxy: 代理
|
|
132
|
+
:param clean_all: 是否清除全部可清除的参数 (包括解析后才需清除的参数)
|
|
133
|
+
|
|
134
|
+
Example:
|
|
135
|
+
以小红书为例,其解析器配置如下::
|
|
136
|
+
|
|
137
|
+
__reserved_parameters__ = []
|
|
138
|
+
__after_clean_parameters__ = ["xsec_token"]
|
|
139
|
+
|
|
140
|
+
原始链接::
|
|
141
|
+
|
|
142
|
+
https://www.xiaohongshu.com/explore/abc123?xsec_token=xxx&tracking=yyy
|
|
143
|
+
|
|
144
|
+
``clean_all=False`` (解析阶段,保留解析所需的参数)::
|
|
145
|
+
|
|
146
|
+
https://www.xiaohongshu.com/explore/abc123?xsec_token=xxx
|
|
147
|
+
# tracking 被清除,xsec_token 保留(解析时需要它)
|
|
148
|
+
|
|
149
|
+
``clean_all=True`` (最终输出,清除所有非必要参数)::
|
|
150
|
+
|
|
151
|
+
https://www.xiaohongshu.com/explore/abc123
|
|
152
|
+
# xsec_token 也被清除,返回干净的链接
|
|
153
|
+
|
|
132
154
|
:return: 原始链接
|
|
133
155
|
"""
|
|
134
156
|
parser = self.get_parser(url)
|
|
135
157
|
try:
|
|
136
|
-
return await parser(proxy=proxy).get_raw_url(url,
|
|
158
|
+
return await parser(proxy=proxy).get_raw_url(url, clean_all=clean_all)
|
|
137
159
|
except Exception as e:
|
|
138
160
|
raise ParseError from e
|
|
139
161
|
|
|
@@ -63,7 +63,7 @@ class BaseParser(ABC):
|
|
|
63
63
|
:param url: 分享文案 / 分享链接
|
|
64
64
|
:return: 解析结果
|
|
65
65
|
"""
|
|
66
|
-
raw_url = await self.get_raw_url(url,
|
|
66
|
+
raw_url = await self.get_raw_url(url, clean_all=False)
|
|
67
67
|
result = await self._do_parse(raw_url)
|
|
68
68
|
result.platform = self.__platform__
|
|
69
69
|
result.raw_url = self._clean_params(raw_url, self.__after_clean_parameters__)
|
|
@@ -76,11 +76,32 @@ class BaseParser(ABC):
|
|
|
76
76
|
"""
|
|
77
77
|
raise NotImplementedError
|
|
78
78
|
|
|
79
|
-
async def get_raw_url(self, url: str,
|
|
79
|
+
async def get_raw_url(self, url: str, clean_all: bool = False) -> str:
|
|
80
80
|
"""
|
|
81
81
|
清除链接中的参数
|
|
82
82
|
:param url: 链接
|
|
83
|
-
:param
|
|
83
|
+
:param clean_all: 是否清除全部可清除的参数 (包括解析后才需清除的参数)
|
|
84
|
+
|
|
85
|
+
Example:
|
|
86
|
+
以小红书为例,其解析器配置如下::
|
|
87
|
+
|
|
88
|
+
__reserved_parameters__ = []
|
|
89
|
+
__after_clean_parameters__ = ["xsec_token"]
|
|
90
|
+
|
|
91
|
+
原始链接::
|
|
92
|
+
|
|
93
|
+
https://www.xiaohongshu.com/explore/abc123?xsec_token=xxx&tracking=yyy
|
|
94
|
+
|
|
95
|
+
``clean_all=False`` (解析阶段,保留解析所需的参数)::
|
|
96
|
+
|
|
97
|
+
https://www.xiaohongshu.com/explore/abc123?xsec_token=xxx
|
|
98
|
+
# tracking 被清除,xsec_token 保留(解析时需要它)
|
|
99
|
+
|
|
100
|
+
``clean_all=True`` (最终输出,清除所有非必要参数)::
|
|
101
|
+
|
|
102
|
+
https://www.xiaohongshu.com/explore/abc123
|
|
103
|
+
# xsec_token 也被清除,返回干净的链接
|
|
104
|
+
|
|
84
105
|
:return:
|
|
85
106
|
"""
|
|
86
107
|
url = match_url(url)
|
|
@@ -107,9 +128,7 @@ class BaseParser(ABC):
|
|
|
107
128
|
for i in query_params.copy().keys():
|
|
108
129
|
is_reserved = i in self.__reserved_parameters__
|
|
109
130
|
is_after_clean = i in self.__after_clean_parameters__
|
|
110
|
-
keep = (is_reserved and not (
|
|
111
|
-
is_after_clean and not after_clean_parameters
|
|
112
|
-
)
|
|
131
|
+
keep = (is_reserved and not (clean_all and is_after_clean)) or (is_after_clean and not clean_all)
|
|
113
132
|
if not keep:
|
|
114
133
|
query_params.pop(i, None)
|
|
115
134
|
|
|
@@ -66,12 +66,12 @@ class BiliParse(YtParser):
|
|
|
66
66
|
else:
|
|
67
67
|
return super().match(url)
|
|
68
68
|
|
|
69
|
-
async def get_raw_url(self, url: str,
|
|
69
|
+
async def get_raw_url(self, url: str, clean_all: bool = False) -> str:
|
|
70
70
|
"""获取原始链接"""
|
|
71
71
|
if self._is_bvid(url):
|
|
72
72
|
return f"https://www.bilibili.com/video/{url}"
|
|
73
73
|
else:
|
|
74
|
-
return await super().get_raw_url(url,
|
|
74
|
+
return await super().get_raw_url(url, clean_all=clean_all)
|
|
75
75
|
|
|
76
76
|
@staticmethod
|
|
77
77
|
async def is_dynamic(url) -> str | None:
|
|
@@ -28,7 +28,6 @@ class InstagramParser(BaseParser):
|
|
|
28
28
|
dimensions = {}
|
|
29
29
|
width, height = dimensions.get("width", 0) or 0, dimensions.get("height", 0) or 0
|
|
30
30
|
|
|
31
|
-
k = {"title": post.title, "content": post.caption, "raw_url": raw_url}
|
|
32
31
|
match post.typename:
|
|
33
32
|
case "GraphSidecar":
|
|
34
33
|
media = [
|
|
@@ -37,9 +36,11 @@ class InstagramParser(BaseParser):
|
|
|
37
36
|
else ImageRef(url=i.display_url, width=i.width, height=i.height)
|
|
38
37
|
for i in post.get_sidecar_nodes()
|
|
39
38
|
]
|
|
40
|
-
return MultimediaParseResult(media=media,
|
|
39
|
+
return MultimediaParseResult(media=media, title=post.title, content=post.caption)
|
|
41
40
|
case "GraphImage":
|
|
42
|
-
return ImageParseResult(
|
|
41
|
+
return ImageParseResult(
|
|
42
|
+
photo=[ImageRef(url=post.url, width=width, height=height)], title=post.title, content=post.caption
|
|
43
|
+
)
|
|
43
44
|
case "GraphVideo":
|
|
44
45
|
return VideoParseResult(
|
|
45
46
|
video=VideoRef(
|
|
@@ -49,7 +50,8 @@ class InstagramParser(BaseParser):
|
|
|
49
50
|
width=width,
|
|
50
51
|
height=height,
|
|
51
52
|
),
|
|
52
|
-
|
|
53
|
+
title=post.title,
|
|
54
|
+
content=post.caption,
|
|
53
55
|
)
|
|
54
56
|
case _:
|
|
55
57
|
raise ParseError("不支持的类型")
|
|
@@ -19,10 +19,10 @@ class TwitterParser(BaseParser):
|
|
|
19
19
|
|
|
20
20
|
async def _do_parse(self, raw_url: str) -> "MultimediaParseResult":
|
|
21
21
|
tweet = await self._parse(raw_url)
|
|
22
|
-
return await self.media_parse(
|
|
22
|
+
return await self.media_parse(tweet)
|
|
23
23
|
|
|
24
|
-
async def get_raw_url(self, url: str,
|
|
25
|
-
url = await super().get_raw_url(url,
|
|
24
|
+
async def get_raw_url(self, url: str, clean_all: bool = False) -> str:
|
|
25
|
+
url = await super().get_raw_url(url, clean_all=clean_all)
|
|
26
26
|
return str(urlunparse(urlparse(url)._replace(netloc="x.com")))
|
|
27
27
|
|
|
28
28
|
async def _parse(self, url: str):
|
|
@@ -46,7 +46,7 @@ class TwitterParser(BaseParser):
|
|
|
46
46
|
return tweet
|
|
47
47
|
|
|
48
48
|
@staticmethod
|
|
49
|
-
async def media_parse(
|
|
49
|
+
async def media_parse(tweet: TwitterTweet):
|
|
50
50
|
media = []
|
|
51
51
|
for m in tweet.media:
|
|
52
52
|
match m:
|
|
@@ -29,7 +29,6 @@ class XHSParser(BaseParser):
|
|
|
29
29
|
result = await xhs.extract(raw_url)
|
|
30
30
|
|
|
31
31
|
desc = self.hashtag_handler(result.desc)
|
|
32
|
-
k = {"title": result.title, "content": desc, "raw_url": raw_url}
|
|
33
32
|
match result.type:
|
|
34
33
|
case XHSPostType.VIDEO:
|
|
35
34
|
v: XHSMedia = result.media[0]
|
|
@@ -37,7 +36,8 @@ class XHSParser(BaseParser):
|
|
|
37
36
|
video=VideoRef(
|
|
38
37
|
url=v.url, thumb_url=v.thumb_url, duration=v.duration, height=v.height, width=v.width
|
|
39
38
|
),
|
|
40
|
-
|
|
39
|
+
title=result.title,
|
|
40
|
+
content=desc,
|
|
41
41
|
)
|
|
42
42
|
case XHSPostType.IMAGE:
|
|
43
43
|
photos: list[ImageRef | LivePhotoRef] = []
|
|
@@ -55,7 +55,8 @@ class XHSParser(BaseParser):
|
|
|
55
55
|
|
|
56
56
|
return ImageParseResult(
|
|
57
57
|
photo=photos,
|
|
58
|
-
|
|
58
|
+
title=result.title,
|
|
59
|
+
content=desc,
|
|
59
60
|
)
|
|
60
61
|
case _:
|
|
61
62
|
raise ParseError("不支持的类型")
|
|
@@ -22,14 +22,13 @@ class XiaoHeiHeParser(BaseParser):
|
|
|
22
22
|
async def _do_parse(self, raw_url: str) -> AnyParseResult:
|
|
23
23
|
xhh: XiaoHeiHePost = await XiaoHeiHeAPI(proxy=self.proxy).parse(raw_url)
|
|
24
24
|
media = self.__parse_media(xhh)
|
|
25
|
-
v = {"title": xhh.title, "content": xhh.content, "raw_url": raw_url}
|
|
26
25
|
match xhh.type:
|
|
27
26
|
case XiaoHeiHePostType.VIDEO:
|
|
28
|
-
return VideoParseResult(video=media,
|
|
27
|
+
return VideoParseResult(video=media, title=xhh.title, content=xhh.content)
|
|
29
28
|
case XiaoHeiHePostType.IMAGE:
|
|
30
29
|
if not media or all(isinstance(m, ImageRef) for m in media):
|
|
31
|
-
return ImageParseResult(photo=media,
|
|
32
|
-
return MultimediaParseResult(media=media,
|
|
30
|
+
return ImageParseResult(photo=media, title=xhh.title, content=xhh.content)
|
|
31
|
+
return MultimediaParseResult(media=media, title=xhh.title, content=xhh.content)
|
|
33
32
|
case XiaoHeiHePostType.ARTICLE:
|
|
34
33
|
return RichTextParseResult(title=xhh.title, media=media, markdown_content=xhh.content)
|
|
35
34
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|