parsehub 2.0.5__tar.gz → 2.0.7__tar.gz

This diff shows the changes between two publicly released versions of a package, as they appear in the public registry they were published to. It is provided for informational purposes only.
Files changed (60)
  1. {parsehub-2.0.5/src/parsehub.egg-info → parsehub-2.0.7}/PKG-INFO +1 -1
  2. {parsehub-2.0.5 → parsehub-2.0.7}/pyproject.toml +1 -1
  3. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/__init__.py +24 -2
  4. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/base/base.py +25 -6
  5. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/bilibili.py +2 -2
  6. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/instagram.py +6 -4
  7. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/twitter.py +4 -4
  8. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/xhs.py +4 -3
  9. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/xiaoheihe.py +3 -4
  10. {parsehub-2.0.5 → parsehub-2.0.7/src/parsehub.egg-info}/PKG-INFO +1 -1
  11. {parsehub-2.0.5 → parsehub-2.0.7}/LICENSE +0 -0
  12. {parsehub-2.0.5 → parsehub-2.0.7}/README.md +0 -0
  13. {parsehub-2.0.5 → parsehub-2.0.7}/setup.cfg +0 -0
  14. {parsehub-2.0.5 → parsehub-2.0.7}/src/__init__.py +0 -0
  15. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/config/__init__.py +0 -0
  16. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/config/config.py +0 -0
  17. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/errors.py +0 -0
  18. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/__init__.py +0 -0
  19. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/base/__init__.py +0 -0
  20. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/base/ytdlp.py +0 -0
  21. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/__init__.py +0 -0
  22. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/coolapk.py +0 -0
  23. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/douyin.py +0 -0
  24. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/facebook.py +0 -0
  25. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/kuaishou.py +0 -0
  26. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/pipix.py +0 -0
  27. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/threads.py +0 -0
  28. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/tieba.py +0 -0
  29. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/weibo.py +0 -0
  30. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/weixin.py +0 -0
  31. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/youtube.py +0 -0
  32. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/parsers/parser/zuiyou.py +0 -0
  33. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/__init__.py +0 -0
  34. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/bilibili.py +0 -0
  35. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/coolapk.py +0 -0
  36. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/instagram.py +0 -0
  37. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/kuaishou.py +0 -0
  38. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/pipix.py +0 -0
  39. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/threads.py +0 -0
  40. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/tieba.py +0 -0
  41. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/twitter.py +0 -0
  42. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/weibo.py +0 -0
  43. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/weixin.py +0 -0
  44. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/xhs.py +0 -0
  45. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/xiaoheihe.py +0 -0
  46. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/provider_api/zuiyou.py +0 -0
  47. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/types/__init__.py +0 -0
  48. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/types/callback.py +0 -0
  49. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/types/media_file.py +0 -0
  50. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/types/media_ref.py +0 -0
  51. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/types/platform.py +0 -0
  52. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/types/post.py +0 -0
  53. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/types/result.py +0 -0
  54. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/utils/downloader.py +0 -0
  55. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/utils/media_info.py +0 -0
  56. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub/utils/utils.py +0 -0
  57. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub.egg-info/SOURCES.txt +0 -0
  58. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub.egg-info/dependency_links.txt +0 -0
  59. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub.egg-info/requires.txt +0 -0
  60. {parsehub-2.0.5 → parsehub-2.0.7}/src/parsehub.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: parsehub
3
- Version: 2.0.5
3
+ Version: 2.0.7
4
4
  Summary: 轻量、异步、开箱即用的社交媒体聚合解析库
5
5
  Author-email: 梓澪 <zilingmio@gmail.com>
6
6
  License: MIT
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "parsehub"
3
- version = "2.0.5"
3
+ version = "2.0.7"
4
4
  description = "轻量、异步、开箱即用的社交媒体聚合解析库"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12.0"
@@ -125,15 +125,37 @@ class ParseHub:
125
125
  )
126
126
  )
127
127
 
128
- async def get_raw_url(self, url: str, proxy: str | None = None) -> str:
128
+ async def get_raw_url(self, url: str, proxy: str | None = None, clean_all: bool = True) -> str:
129
129
  """获取原始链接
130
130
  :param url: 分享文案 / 分享链接
131
131
  :param proxy: 代理
132
+ :param clean_all: 是否清除全部可清除的参数 (包括解析后才需清除的参数)
133
+
134
+ Example:
135
+ 以小红书为例,其解析器配置如下::
136
+
137
+ __reserved_parameters__ = []
138
+ __after_clean_parameters__ = ["xsec_token"]
139
+
140
+ 原始链接::
141
+
142
+ https://www.xiaohongshu.com/explore/abc123?xsec_token=xxx&tracking=yyy
143
+
144
+ ``clean_all=False`` (解析阶段,保留解析所需的参数)::
145
+
146
+ https://www.xiaohongshu.com/explore/abc123?xsec_token=xxx
147
+ # tracking 被清除,xsec_token 保留(解析时需要它)
148
+
149
+ ``clean_all=True`` (最终输出,清除所有非必要参数)::
150
+
151
+ https://www.xiaohongshu.com/explore/abc123
152
+ # xsec_token 也被清除,返回干净的链接
153
+
132
154
  :return: 原始链接
133
155
  """
134
156
  parser = self.get_parser(url)
135
157
  try:
136
- return await parser(proxy=proxy).get_raw_url(url, after_clean_parameters=True)
158
+ return await parser(proxy=proxy).get_raw_url(url, clean_all=clean_all)
137
159
  except Exception as e:
138
160
  raise ParseError from e
139
161
 
@@ -63,7 +63,7 @@ class BaseParser(ABC):
63
63
  :param url: 分享文案 / 分享链接
64
64
  :return: 解析结果
65
65
  """
66
- raw_url = await self.get_raw_url(url, after_clean_parameters=False)
66
+ raw_url = await self.get_raw_url(url, clean_all=False)
67
67
  result = await self._do_parse(raw_url)
68
68
  result.platform = self.__platform__
69
69
  result.raw_url = self._clean_params(raw_url, self.__after_clean_parameters__)
@@ -76,11 +76,32 @@ class BaseParser(ABC):
76
76
  """
77
77
  raise NotImplementedError
78
78
 
79
- async def get_raw_url(self, url: str, after_clean_parameters: bool = False) -> str:
79
+ async def get_raw_url(self, url: str, clean_all: bool = False) -> str:
80
80
  """
81
81
  清除链接中的参数
82
82
  :param url: 链接
83
- :param after_clean_parameters: 是否执行后清理参数
83
+ :param clean_all: 是否清除全部可清除的参数 (包括解析后才需清除的参数)
84
+
85
+ Example:
86
+ 以小红书为例,其解析器配置如下::
87
+
88
+ __reserved_parameters__ = []
89
+ __after_clean_parameters__ = ["xsec_token"]
90
+
91
+ 原始链接::
92
+
93
+ https://www.xiaohongshu.com/explore/abc123?xsec_token=xxx&tracking=yyy
94
+
95
+ ``clean_all=False`` (解析阶段,保留解析所需的参数)::
96
+
97
+ https://www.xiaohongshu.com/explore/abc123?xsec_token=xxx
98
+ # tracking 被清除,xsec_token 保留(解析时需要它)
99
+
100
+ ``clean_all=True`` (最终输出,清除所有非必要参数)::
101
+
102
+ https://www.xiaohongshu.com/explore/abc123
103
+ # xsec_token 也被清除,返回干净的链接
104
+
84
105
  :return:
85
106
  """
86
107
  url = match_url(url)
@@ -107,9 +128,7 @@ class BaseParser(ABC):
107
128
  for i in query_params.copy().keys():
108
129
  is_reserved = i in self.__reserved_parameters__
109
130
  is_after_clean = i in self.__after_clean_parameters__
110
- keep = (is_reserved and not (after_clean_parameters and is_after_clean)) or (
111
- is_after_clean and not after_clean_parameters
112
- )
131
+ keep = (is_reserved and not (clean_all and is_after_clean)) or (is_after_clean and not clean_all)
113
132
  if not keep:
114
133
  query_params.pop(i, None)
115
134
 
@@ -66,12 +66,12 @@ class BiliParse(YtParser):
66
66
  else:
67
67
  return super().match(url)
68
68
 
69
- async def get_raw_url(self, url: str, after_clean_parameters: bool = False) -> str:
69
+ async def get_raw_url(self, url: str, clean_all: bool = False) -> str:
70
70
  """获取原始链接"""
71
71
  if self._is_bvid(url):
72
72
  return f"https://www.bilibili.com/video/{url}"
73
73
  else:
74
- return await super().get_raw_url(url, after_clean_parameters=after_clean_parameters)
74
+ return await super().get_raw_url(url, clean_all=clean_all)
75
75
 
76
76
  @staticmethod
77
77
  async def is_dynamic(url) -> str | None:
@@ -28,7 +28,6 @@ class InstagramParser(BaseParser):
28
28
  dimensions = {}
29
29
  width, height = dimensions.get("width", 0) or 0, dimensions.get("height", 0) or 0
30
30
 
31
- k = {"title": post.title, "content": post.caption, "raw_url": raw_url}
32
31
  match post.typename:
33
32
  case "GraphSidecar":
34
33
  media = [
@@ -37,9 +36,11 @@ class InstagramParser(BaseParser):
37
36
  else ImageRef(url=i.display_url, width=i.width, height=i.height)
38
37
  for i in post.get_sidecar_nodes()
39
38
  ]
40
- return MultimediaParseResult(media=media, **k)
39
+ return MultimediaParseResult(media=media, title=post.title, content=post.caption)
41
40
  case "GraphImage":
42
- return ImageParseResult(photo=[ImageRef(url=post.url, width=width, height=height)], **k)
41
+ return ImageParseResult(
42
+ photo=[ImageRef(url=post.url, width=width, height=height)], title=post.title, content=post.caption
43
+ )
43
44
  case "GraphVideo":
44
45
  return VideoParseResult(
45
46
  video=VideoRef(
@@ -49,7 +50,8 @@ class InstagramParser(BaseParser):
49
50
  width=width,
50
51
  height=height,
51
52
  ),
52
- **k,
53
+ title=post.title,
54
+ content=post.caption,
53
55
  )
54
56
  case _:
55
57
  raise ParseError("不支持的类型")
@@ -19,10 +19,10 @@ class TwitterParser(BaseParser):
19
19
 
20
20
  async def _do_parse(self, raw_url: str) -> "MultimediaParseResult":
21
21
  tweet = await self._parse(raw_url)
22
- return await self.media_parse(raw_url, tweet)
22
+ return await self.media_parse(tweet)
23
23
 
24
- async def get_raw_url(self, url: str, after_clean_parameters: bool = False) -> str:
25
- url = await super().get_raw_url(url, after_clean_parameters=after_clean_parameters)
24
+ async def get_raw_url(self, url: str, clean_all: bool = False) -> str:
25
+ url = await super().get_raw_url(url, clean_all=clean_all)
26
26
  return str(urlunparse(urlparse(url)._replace(netloc="x.com")))
27
27
 
28
28
  async def _parse(self, url: str):
@@ -46,7 +46,7 @@ class TwitterParser(BaseParser):
46
46
  return tweet
47
47
 
48
48
  @staticmethod
49
- async def media_parse(url, tweet: TwitterTweet):
49
+ async def media_parse(tweet: TwitterTweet):
50
50
  media = []
51
51
  for m in tweet.media:
52
52
  match m:
@@ -29,7 +29,6 @@ class XHSParser(BaseParser):
29
29
  result = await xhs.extract(raw_url)
30
30
 
31
31
  desc = self.hashtag_handler(result.desc)
32
- k = {"title": result.title, "content": desc, "raw_url": raw_url}
33
32
  match result.type:
34
33
  case XHSPostType.VIDEO:
35
34
  v: XHSMedia = result.media[0]
@@ -37,7 +36,8 @@ class XHSParser(BaseParser):
37
36
  video=VideoRef(
38
37
  url=v.url, thumb_url=v.thumb_url, duration=v.duration, height=v.height, width=v.width
39
38
  ),
40
- **k,
39
+ title=result.title,
40
+ content=desc,
41
41
  )
42
42
  case XHSPostType.IMAGE:
43
43
  photos: list[ImageRef | LivePhotoRef] = []
@@ -55,7 +55,8 @@ class XHSParser(BaseParser):
55
55
 
56
56
  return ImageParseResult(
57
57
  photo=photos,
58
- **k,
58
+ title=result.title,
59
+ content=desc,
59
60
  )
60
61
  case _:
61
62
  raise ParseError("不支持的类型")
@@ -22,14 +22,13 @@ class XiaoHeiHeParser(BaseParser):
22
22
  async def _do_parse(self, raw_url: str) -> AnyParseResult:
23
23
  xhh: XiaoHeiHePost = await XiaoHeiHeAPI(proxy=self.proxy).parse(raw_url)
24
24
  media = self.__parse_media(xhh)
25
- v = {"title": xhh.title, "content": xhh.content, "raw_url": raw_url}
26
25
  match xhh.type:
27
26
  case XiaoHeiHePostType.VIDEO:
28
- return VideoParseResult(video=media, **v)
27
+ return VideoParseResult(video=media, title=xhh.title, content=xhh.content)
29
28
  case XiaoHeiHePostType.IMAGE:
30
29
  if not media or all(isinstance(m, ImageRef) for m in media):
31
- return ImageParseResult(photo=media, **v)
32
- return MultimediaParseResult(media=media, **v)
30
+ return ImageParseResult(photo=media, title=xhh.title, content=xhh.content)
31
+ return MultimediaParseResult(media=media, title=xhh.title, content=xhh.content)
33
32
  case XiaoHeiHePostType.ARTICLE:
34
33
  return RichTextParseResult(title=xhh.title, media=media, markdown_content=xhh.content)
35
34
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: parsehub
3
- Version: 2.0.5
3
+ Version: 2.0.7
4
4
  Summary: 轻量、异步、开箱即用的社交媒体聚合解析库
5
5
  Author-email: 梓澪 <zilingmio@gmail.com>
6
6
  License: MIT
File without changes
File without changes
File without changes
File without changes