parsehub 2.0.3__tar.gz → 2.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. {parsehub-2.0.3/src/parsehub.egg-info → parsehub-2.0.5}/PKG-INFO +21 -2
  2. {parsehub-2.0.3 → parsehub-2.0.5}/README.md +20 -1
  3. {parsehub-2.0.3 → parsehub-2.0.5}/pyproject.toml +1 -1
  4. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/__init__.py +26 -3
  5. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/base/base.py +26 -4
  6. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/base/ytdlp.py +7 -5
  7. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/bilibili.py +13 -14
  8. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/coolapk.py +8 -7
  9. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/douyin.py +4 -6
  10. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/kuaishou.py +0 -1
  11. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/pipix.py +1 -2
  12. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/threads.py +1 -1
  13. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/tieba.py +2 -2
  14. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/twitter.py +3 -3
  15. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/weibo.py +3 -4
  16. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/weixin.py +0 -1
  17. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/xhs.py +2 -4
  18. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/xiaoheihe.py +1 -1
  19. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/zuiyou.py +0 -1
  20. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/weibo.py +1 -1
  21. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/types/callback.py +2 -1
  22. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/types/result.py +27 -18
  23. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/utils/downloader.py +8 -3
  24. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/utils/utils.py +2 -20
  25. {parsehub-2.0.3 → parsehub-2.0.5/src/parsehub.egg-info}/PKG-INFO +21 -2
  26. {parsehub-2.0.3 → parsehub-2.0.5}/LICENSE +0 -0
  27. {parsehub-2.0.3 → parsehub-2.0.5}/setup.cfg +0 -0
  28. {parsehub-2.0.3 → parsehub-2.0.5}/src/__init__.py +0 -0
  29. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/config/__init__.py +0 -0
  30. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/config/config.py +0 -0
  31. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/errors.py +0 -0
  32. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/__init__.py +0 -0
  33. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/base/__init__.py +0 -0
  34. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/__init__.py +0 -0
  35. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/facebook.py +0 -0
  36. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/instagram.py +0 -0
  37. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/parsers/parser/youtube.py +0 -0
  38. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/__init__.py +0 -0
  39. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/bilibili.py +0 -0
  40. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/coolapk.py +0 -0
  41. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/instagram.py +0 -0
  42. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/kuaishou.py +0 -0
  43. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/pipix.py +0 -0
  44. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/threads.py +0 -0
  45. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/tieba.py +0 -0
  46. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/twitter.py +0 -0
  47. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/weixin.py +0 -0
  48. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/xhs.py +0 -0
  49. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/xiaoheihe.py +0 -0
  50. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/provider_api/zuiyou.py +0 -0
  51. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/types/__init__.py +0 -0
  52. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/types/media_file.py +0 -0
  53. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/types/media_ref.py +0 -0
  54. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/types/platform.py +0 -0
  55. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/types/post.py +0 -0
  56. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub/utils/media_info.py +0 -0
  57. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub.egg-info/SOURCES.txt +0 -0
  58. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub.egg-info/dependency_links.txt +0 -0
  59. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub.egg-info/requires.txt +0 -0
  60. {parsehub-2.0.3 → parsehub-2.0.5}/src/parsehub.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: parsehub
3
- Version: 2.0.3
3
+ Version: 2.0.5
4
4
  Summary: 轻量、异步、开箱即用的社交媒体聚合解析库
5
5
  Author-email: 梓澪 <zilingmio@gmail.com>
6
6
  License: MIT
@@ -119,6 +119,25 @@ print(result)
119
119
 
120
120
  ## 🔑 高级用法
121
121
 
122
+ ### 下载进度回调
123
+
124
+ ```python
125
+ from parsehub import ParseHub
126
+
127
+
128
+ class ProgressTracker:
129
+ async def __call__(self, current: int, total: int, unit: str, *args, task_name: str = "", **kwargs):
130
+ print(f"[{task_name}] {current}/{total} ({unit})")
131
+
132
+
133
+ result = ParseHub().download_sync(
134
+ "https://example.com",
135
+ callback=ProgressTracker(),
136
+ callback_args=("extra_arg",),
137
+ callback_kwargs={"task_name": "demo"},
138
+ )
139
+ ```
140
+
122
141
  ### Cookie 登录 & 代理
123
142
 
124
143
  部分平台的内容需要登录才能访问,通过 Cookie 即可解锁:
@@ -126,7 +145,7 @@ print(result)
126
145
  ```python
127
146
  from parsehub import ParseHub
128
147
 
129
- ph = ParseHub(cookie="key1=value1; key2=value2", proxy="http://127.0.0.1:7890",)
148
+ ph = ParseHub(cookie="key1=value1; key2=value2", proxy="http://127.0.0.1:7890", )
130
149
  ```
131
150
 
132
151
  Cookie 支持多种格式传入:
@@ -82,6 +82,25 @@ print(result)
82
82
 
83
83
  ## 🔑 高级用法
84
84
 
85
+ ### 下载进度回调
86
+
87
+ ```python
88
+ from parsehub import ParseHub
89
+
90
+
91
+ class ProgressTracker:
92
+ async def __call__(self, current: int, total: int, unit: str, *args, task_name: str = "", **kwargs):
93
+ print(f"[{task_name}] {current}/{total} ({unit})")
94
+
95
+
96
+ result = ParseHub().download_sync(
97
+ "https://example.com",
98
+ callback=ProgressTracker(),
99
+ callback_args=("extra_arg",),
100
+ callback_kwargs={"task_name": "demo"},
101
+ )
102
+ ```
103
+
85
104
  ### Cookie 登录 & 代理
86
105
 
87
106
  部分平台的内容需要登录才能访问,通过 Cookie 即可解锁:
@@ -89,7 +108,7 @@ print(result)
89
108
  ```python
90
109
  from parsehub import ParseHub
91
110
 
92
- ph = ParseHub(cookie="key1=value1; key2=value2", proxy="http://127.0.0.1:7890",)
111
+ ph = ParseHub(cookie="key1=value1; key2=value2", proxy="http://127.0.0.1:7890", )
93
112
  ```
94
113
 
95
114
  Cookie 支持多种格式传入:
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "parsehub"
3
- version = "2.0.3"
3
+ version = "2.0.5"
4
4
  description = "轻量、异步、开箱即用的社交媒体聚合解析库"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12.0"
@@ -46,14 +46,18 @@ class ParseHub:
46
46
  *,
47
47
  callback: ProgressCallback = None,
48
48
  callback_args: tuple = (),
49
+ callback_kwargs: dict | None = None,
49
50
  proxy: str | None = None,
51
+ save_metadata: bool = False,
50
52
  ) -> DownloadResult:
51
53
  """下载
52
54
  :param url: 分享文案 / 分享链接
53
55
  :param path: 保存路径
54
56
  :param callback: 下载进度回调函数
55
57
  :param callback_args: 下载进度回调函数参数
58
+ :param callback_kwargs: 回调函数的关键字参数
56
59
  :param proxy: 代理
60
+ :param save_metadata: 保存解析结果为 metadata.json, 默认为 False
57
61
  :return: DownloadResult
58
62
 
59
63
  Note:
@@ -68,7 +72,14 @@ class ParseHub:
68
72
  - ``count``: 计数进度,用于多文件下载时报告已完成/总文件数
69
73
  """
70
74
  result = await self.parse(url)
71
- return await result.download(path, callback=callback, callback_args=callback_args, proxy=proxy)
75
+ return await result.download(
76
+ path,
77
+ callback=callback,
78
+ callback_args=callback_args,
79
+ callback_kwargs=callback_kwargs,
80
+ proxy=proxy,
81
+ save_metadata=save_metadata,
82
+ )
72
83
 
73
84
  def download_sync(
74
85
  self,
@@ -76,7 +87,9 @@ class ParseHub:
76
87
  path: str | Path | None = None,
77
88
  callback: ProgressCallback | None = None,
78
89
  callback_args: tuple = (),
90
+ callback_kwargs: dict | None = None,
79
91
  proxy: str | None = None,
92
+ save_metadata: bool = False,
80
93
  ) -> DownloadResult:
81
94
  """
82
95
  同步下载
@@ -84,7 +97,9 @@ class ParseHub:
84
97
  :param path: 下载路径
85
98
  :param callback: 进度回调函数
86
99
  :param callback_args: 进度回调函数参数
100
+ :param callback_kwargs: 回调函数的关键字参数
87
101
  :param proxy: 代理
102
+ :param save_metadata: 保存解析结果为 metadata.json, 默认为 False
88
103
  :return: DownloadResult
89
104
 
90
105
  Note:
@@ -99,7 +114,15 @@ class ParseHub:
99
114
  - ``count``: 计数进度,用于多文件下载时报告已完成/总文件数
100
115
  """
101
116
  return get_event_loop().run_until_complete(
102
- self.download(url, path, callback=callback, callback_args=callback_args, proxy=proxy)
117
+ self.download(
118
+ url,
119
+ path,
120
+ callback=callback,
121
+ callback_args=callback_args,
122
+ callback_kwargs=callback_kwargs,
123
+ proxy=proxy,
124
+ save_metadata=save_metadata,
125
+ )
103
126
  )
104
127
 
105
128
  async def get_raw_url(self, url: str, proxy: str | None = None) -> str:
@@ -110,7 +133,7 @@ class ParseHub:
110
133
  """
111
134
  parser = self.get_parser(url)
112
135
  try:
113
- return await parser(proxy=proxy).get_raw_url(url)
136
+ return await parser(proxy=proxy).get_raw_url(url, after_clean_parameters=True)
114
137
  except Exception as e:
115
138
  raise ParseError from e
116
139
 
@@ -25,6 +25,8 @@ class BaseParser(ABC):
25
25
  """匹配规则"""
26
26
  __reserved_parameters__: list[str] = []
27
27
  """要保留的参数, 例如翻页. 默认清除全部参数"""
28
+ __after_clean_parameters__: list[str] = []
29
+ """解析完成后需要清理的参数, 在解析完成前会保留这些参数, 优先级高于 __reserved_parameters__"""
28
30
  __redirect_keywords__: list[str] = []
29
31
  """如果链接包含其中之一, 则遵循重定向规则"""
30
32
 
@@ -61,9 +63,10 @@ class BaseParser(ABC):
61
63
  :param url: 分享文案 / 分享链接
62
64
  :return: 解析结果
63
65
  """
64
- raw_url = await self.get_raw_url(url)
66
+ raw_url = await self.get_raw_url(url, after_clean_parameters=False)
65
67
  result = await self._do_parse(raw_url)
66
68
  result.platform = self.__platform__
69
+ result.raw_url = self._clean_params(raw_url, self.__after_clean_parameters__)
67
70
  return result
68
71
 
69
72
  @abstractmethod
@@ -73,10 +76,11 @@ class BaseParser(ABC):
73
76
  """
74
77
  raise NotImplementedError
75
78
 
76
- async def get_raw_url(self, url: str) -> str:
79
+ async def get_raw_url(self, url: str, after_clean_parameters: bool = False) -> str:
77
80
  """
78
81
  清除链接中的参数
79
82
  :param url: 链接
83
+ :param after_clean_parameters: 是否执行后清理参数
80
84
  :return:
81
85
  """
82
86
  url = match_url(url)
@@ -101,7 +105,25 @@ class BaseParser(ABC):
101
105
  query_params = parse_qs(parsed_url.query)
102
106
 
103
107
  for i in query_params.copy().keys():
104
- if i not in self.__reserved_parameters__:
105
- del query_params[i]
108
+ is_reserved = i in self.__reserved_parameters__
109
+ is_after_clean = i in self.__after_clean_parameters__
110
+ keep = (is_reserved and not (after_clean_parameters and is_after_clean)) or (
111
+ is_after_clean and not after_clean_parameters
112
+ )
113
+ if not keep:
114
+ query_params.pop(i, None)
115
+
116
+ new_query = urlencode(query_params, doseq=True)
117
+ return parsed_url._replace(query=new_query).geturl()
118
+
119
+ @staticmethod
120
+ def _clean_params(url: str, params: list[str]) -> str:
121
+ """清除链接中的指定参数"""
122
+ if not params:
123
+ return url
124
+ parsed_url = urlparse(url)
125
+ query_params = parse_qs(parsed_url.query)
126
+ for p in params:
127
+ query_params.pop(p, None)
106
128
  new_query = urlencode(query_params, doseq=True)
107
129
  return parsed_url._replace(query=new_query).geturl()
@@ -39,7 +39,6 @@ class YtParser(BaseParser, register=False):
39
39
  dl=video_info,
40
40
  title=video_info.title,
41
41
  content=video_info.description,
42
- raw_url=raw_url,
43
42
  video=VideoRef(
44
43
  url=raw_url,
45
44
  thumb_url=video_info.thumbnail,
@@ -115,11 +114,10 @@ class YtVideoParseResult(VideoParseResult):
115
114
  title,
116
115
  video=None,
117
116
  content=None,
118
- raw_url=None,
119
117
  ):
120
118
  """dl: yt-dlp解析结果"""
121
119
  self.dl = dl
122
- super().__init__(title=title, video=video, content=content, raw_url=raw_url)
120
+ super().__init__(title=title, video=video, content=content)
123
121
 
124
122
  async def _do_download(
125
123
  self,
@@ -127,9 +125,13 @@ class YtVideoParseResult(VideoParseResult):
127
125
  output_dir: str | Path,
128
126
  callback: ProgressCallback | None = None,
129
127
  callback_args: tuple = (),
128
+ callback_kwargs: dict | None = None,
130
129
  proxy: str | None = None,
131
130
  headers: dict | None = None,
132
131
  ) -> "DownloadResult":
132
+ if callback_kwargs is None:
133
+ callback_kwargs = {}
134
+
133
135
  paramss = self.dl.paramss.copy()
134
136
  if proxy:
135
137
  paramss["proxy"] = proxy
@@ -141,7 +143,7 @@ class YtVideoParseResult(VideoParseResult):
141
143
  # paramss["format"] = "worstvideo* + worstaudio / worst"
142
144
 
143
145
  if callback:
144
- await callback(0, 1, "count", *callback_args)
146
+ await callback(0, 1, "count", *callback_args, **callback_kwargs)
145
147
 
146
148
  await self.__download(paramss)
147
149
 
@@ -150,7 +152,7 @@ class YtVideoParseResult(VideoParseResult):
150
152
  raise DownloadError("下载失败 -1")
151
153
 
152
154
  if callback:
153
- await callback(1, 1, "count", *callback_args)
155
+ await callback(1, 1, "count", *callback_args, **callback_kwargs)
154
156
 
155
157
  video_path = v[0]
156
158
  return DownloadResult(
@@ -3,8 +3,6 @@ from pathlib import Path
3
3
  from typing import Union
4
4
  from urllib.parse import parse_qs, urlparse
5
5
 
6
- import httpx
7
-
8
6
  from ...config.config import GlobalConfig
9
7
  from ...provider_api.bilibili import BiliAPI, BiliDynamic
10
8
  from ...types import (
@@ -30,8 +28,8 @@ class BiliParse(YtParser):
30
28
  __redirect_keywords__ = ["b23.tv", "bili2233.cn"]
31
29
 
32
30
  async def _do_parse(self, raw_url: str) -> Union["YtVideoParseResult", "BiliVideoParseResult", ImageParseResult]:
33
- if ourl := await self.is_dynamic(raw_url):
34
- dynamic = await self.get_dynamic_info(ourl)
31
+ if await self.is_dynamic(raw_url):
32
+ dynamic = await self.get_dynamic_info(raw_url)
35
33
  content = self.hashtag_handler(dynamic.content)
36
34
  photos = []
37
35
  if dynamic.images:
@@ -44,7 +42,6 @@ class BiliParse(YtParser):
44
42
  title=dynamic.title,
45
43
  content=content,
46
44
  photo=photos,
47
- raw_url=ourl,
48
45
  )
49
46
  else:
50
47
  try:
@@ -69,18 +66,16 @@ class BiliParse(YtParser):
69
66
  else:
70
67
  return super().match(url)
71
68
 
72
- async def get_raw_url(self, url: str) -> str:
69
+ async def get_raw_url(self, url: str, after_clean_parameters: bool = False) -> str:
73
70
  """获取原始链接"""
74
71
  if self._is_bvid(url):
75
72
  return f"https://www.bilibili.com/video/{url}"
76
73
  else:
77
- return await super().get_raw_url(url)
74
+ return await super().get_raw_url(url, after_clean_parameters=after_clean_parameters)
78
75
 
79
- async def is_dynamic(self, url) -> str | None:
76
+ @staticmethod
77
+ async def is_dynamic(url) -> str | None:
80
78
  """是动态"""
81
- async with httpx.AsyncClient(proxy=self.proxy) as cli:
82
- url = str((await cli.get(url, follow_redirects=True, timeout=30)).url)
83
-
84
79
  if re.search(r"\b\d{18,19}\b", url):
85
80
  return url
86
81
  return None
@@ -128,7 +123,6 @@ class BiliParse(YtParser):
128
123
  video_url = self.change_source(durl["backup_url"][0]) if durl.get("backup_url") else durl["url"]
129
124
  return BiliVideoParseResult(
130
125
  title=data["View"]["title"],
131
- raw_url=url,
132
126
  content=f"P{p}: {part}" if part else "",
133
127
  video=VideoRef(
134
128
  url=video_url,
@@ -143,7 +137,6 @@ class BiliParse(YtParser):
143
137
  result = await super()._do_parse(url)
144
138
  return YtVideoParseResult(
145
139
  title=result.title,
146
- raw_url=result.raw_url,
147
140
  dl=result.dl,
148
141
  video=result.media,
149
142
  )
@@ -173,12 +166,18 @@ class BiliVideoParseResult(VideoParseResult):
173
166
  output_dir: str | Path,
174
167
  callback: ProgressCallback | None = None,
175
168
  callback_args: tuple = (),
169
+ callback_kwargs: dict | None = None,
176
170
  proxy: str | None = None,
177
171
  headers: dict | None = None,
178
172
  ) -> "DownloadResult":
179
173
  headers = {"referer": "https://www.bilibili.com", "User-Agent": GlobalConfig.ua}
180
174
  return await super()._do_download(
181
- output_dir=output_dir, callback=callback, callback_args=callback_args, proxy=proxy, headers=headers
175
+ output_dir=output_dir,
176
+ callback=callback,
177
+ callback_args=callback_args,
178
+ callback_kwargs=callback_kwargs,
179
+ proxy=proxy,
180
+ headers=headers,
182
181
  )
183
182
 
184
183
 
@@ -14,7 +14,6 @@ from ...types import (
14
14
  ProgressCallback,
15
15
  RichTextParseResult,
16
16
  )
17
- from ...utils.utils import clear_params
18
17
  from ..base.base import BaseParser
19
18
 
20
19
 
@@ -22,12 +21,11 @@ class CoolapkParser(BaseParser):
22
21
  __platform__ = Platform.COOLAPK
23
22
  __supported_type__ = ["图文"]
24
23
  __match__ = r"^(http(s)?://)www.coolapk.com/(feed|picture)/.*"
25
- __reserved_parameters__ = ["shareKey", "s"]
24
+ __after_clean_parameters__ = ["shareKey", "s"]
26
25
 
27
26
  async def _do_parse(
28
27
  self, raw_url: str
29
28
  ) -> Union["CoolapkImageParseResult", "CoolapkRichTextParseResult", "CoolapkMultimediaParseResult"]:
30
- raw_url_ = clear_params(raw_url, ["s", "shareKey"])
31
29
  try:
32
30
  coolapk = await Coolapk.parse(raw_url, proxy=self.proxy)
33
31
  except Exception as e:
@@ -38,20 +36,17 @@ class CoolapkParser(BaseParser):
38
36
  title=coolapk.title,
39
37
  media=media,
40
38
  markdown_content=coolapk.markdown_content,
41
- raw_url=raw_url_,
42
39
  )
43
40
  if any(isinstance(m, AniRef) for m in media):
44
41
  return CoolapkMultimediaParseResult(
45
42
  title=coolapk.title,
46
43
  media=media,
47
44
  content=coolapk.text_content,
48
- raw_url=raw_url_,
49
45
  )
50
46
  return CoolapkImageParseResult(
51
47
  title=coolapk.title,
52
48
  photo=media,
53
49
  content=coolapk.text_content,
54
- raw_url=raw_url_,
55
50
  )
56
51
 
57
52
 
@@ -62,6 +57,7 @@ class CoolapkParseResult(ParseResult):
62
57
  output_dir: str | Path,
63
58
  callback: ProgressCallback = None,
64
59
  callback_args: tuple = (),
60
+ callback_kwargs: dict | None = None,
65
61
  proxy: str | None = None,
66
62
  headers: dict = None,
67
63
  ) -> "DownloadResult":
@@ -72,7 +68,12 @@ class CoolapkParseResult(ParseResult):
72
68
  )
73
69
  }
74
70
  return await super()._do_download(
75
- output_dir=output_dir, callback=callback, callback_args=callback_args, proxy=proxy, headers=headers
71
+ output_dir=output_dir,
72
+ callback=callback,
73
+ callback_args=callback_args,
74
+ callback_kwargs=callback_kwargs,
75
+ proxy=proxy,
76
+ headers=headers,
76
77
  )
77
78
 
78
79
 
@@ -30,9 +30,9 @@ class DouyinParser(BaseParser):
30
30
 
31
31
  match data.type:
32
32
  case DYType.VIDEO:
33
- return await self.video_parse(raw_url, data)
33
+ return await self.video_parse(data)
34
34
  case DYType.IMAGE:
35
- return await self.image_parse(raw_url, data)
35
+ return await self.image_parse(data)
36
36
 
37
37
  @staticmethod
38
38
  async def parse_api(url) -> "DYResult":
@@ -52,17 +52,15 @@ class DouyinParser(BaseParser):
52
52
  return DYResult.parse(url, response.json())
53
53
 
54
54
  @staticmethod
55
- async def video_parse(url, result: "DYResult"):
55
+ async def video_parse(result: "DYResult"):
56
56
  return VideoParseResult(
57
- raw_url=url,
58
57
  title=result.desc,
59
58
  video=result.video,
60
59
  )
61
60
 
62
61
  @staticmethod
63
- async def image_parse(url, result: "DYResult"):
62
+ async def image_parse(result: "DYResult"):
64
63
  return ImageParseResult(
65
- raw_url=url,
66
64
  title=result.desc,
67
65
  photo=result.image_list,
68
66
  )
@@ -25,7 +25,6 @@ class KuaiShouParser(BaseParser):
25
25
  height=result.height,
26
26
  width=result.width,
27
27
  ),
28
- raw_url=raw_url,
29
28
  )
30
29
 
31
30
 
@@ -27,10 +27,9 @@ class PipixParser(BaseParser):
27
27
  height=ppx.video_height,
28
28
  width=ppx.video_width,
29
29
  ),
30
- raw_url=raw_url,
31
30
  )
32
31
  else:
33
- return ImageParseResult(title=ppx.content, photo=ppx.img_url, raw_url=raw_url)
32
+ return ImageParseResult(title=ppx.content, photo=ppx.img_url)
34
33
 
35
34
 
36
35
  __all__ = ["PipixParser"]
@@ -19,7 +19,7 @@ class ThreadsParser(BaseParser):
19
19
  media.append(VideoRef(url=m.url, thumb_url=m.thumb_url, width=m.width, height=m.height))
20
20
  case ThreadsMediaType.IMAGE:
21
21
  media.append(ImageRef(url=m.url, thumb_url=m.url, width=m.width, height=m.height))
22
- return MultimediaParseResult(content=post.content, media=media, raw_url=raw_url)
22
+ return MultimediaParseResult(content=post.content, media=media)
23
23
 
24
24
 
25
25
  __all__ = ["ThreadsParser"]
@@ -17,9 +17,9 @@ class TieBaParser(BaseParser):
17
17
  raise ParseError("贴吧解析失败") from e
18
18
 
19
19
  if tb.video_url:
20
- return VideoParseResult(title=tb.title, video=tb.video_url, raw_url=raw_url, content=tb.content)
20
+ return VideoParseResult(title=tb.title, video=tb.video_url, content=tb.content)
21
21
  else:
22
- return ImageParseResult(title=tb.title, photo=tb.img_url, raw_url=raw_url, content=tb.content)
22
+ return ImageParseResult(title=tb.title, photo=tb.img_url, content=tb.content)
23
23
 
24
24
 
25
25
  __all__ = ["TieBaParser"]
@@ -21,8 +21,8 @@ class TwitterParser(BaseParser):
21
21
  tweet = await self._parse(raw_url)
22
22
  return await self.media_parse(raw_url, tweet)
23
23
 
24
- async def get_raw_url(self, url: str) -> str:
25
- url = await super().get_raw_url(url)
24
+ async def get_raw_url(self, url: str, after_clean_parameters: bool = False) -> str:
25
+ url = await super().get_raw_url(url, after_clean_parameters=after_clean_parameters)
26
26
  return str(urlunparse(urlparse(url)._replace(netloc="x.com")))
27
27
 
28
28
  async def _parse(self, url: str):
@@ -63,7 +63,7 @@ class TwitterParser(BaseParser):
63
63
  case TwitterAni():
64
64
  path = AniRef(url=m.url, ext="mp4", height=m.height, width=m.width, thumb_url=m.thumb_url)
65
65
  media.append(path)
66
- return MultimediaParseResult(content=tweet.full_text, media=media, raw_url=url)
66
+ return MultimediaParseResult(content=tweet.full_text, media=media)
67
67
 
68
68
 
69
69
  __all__ = ["TwitterParser"]
@@ -29,7 +29,6 @@ class WeiboParser(BaseParser):
29
29
  if data.page_info.object_type == MediaType.VIDEO:
30
30
  return VideoParseResult(
31
31
  content=text,
32
- raw_url=raw_url,
33
32
  video=VideoRef(
34
33
  url=data.page_info.media_info.playback.url,
35
34
  thumb_url=data.page_info.page_pic,
@@ -45,7 +44,7 @@ class WeiboParser(BaseParser):
45
44
  or (data.mix_media_info and data.mix_media_info.items)
46
45
  )
47
46
  if not media_info:
48
- return MultimediaParseResult(content=text, raw_url=raw_url, media=[])
47
+ return MultimediaParseResult(content=text, media=[])
49
48
 
50
49
  for i in media_info:
51
50
  match i.type:
@@ -70,8 +69,8 @@ class WeiboParser(BaseParser):
70
69
  case _:
71
70
  media.append(ImageRef(url=i.media_url, thumb_url=i.thumb_url, width=i.width, height=i.height))
72
71
  if all((isinstance(m, ImageRef) or isinstance(m, LivePhotoRef)) for m in media):
73
- return ImageParseResult(content=text, raw_url=raw_url, photo=media)
74
- return MultimediaParseResult(content=text, raw_url=raw_url, media=media)
72
+ return ImageParseResult(content=text, photo=media)
73
+ return MultimediaParseResult(content=text, media=media)
75
74
 
76
75
  def f_text(self, text: str) -> str:
77
76
  # text = re.sub(r'<a href="https://video.weibo.com.*?>.*的微博视频.*</a>', "", text)
@@ -14,7 +14,6 @@ class WXParser(BaseParser):
14
14
  title=wx.title,
15
15
  media=[ImageRef(url=i) for i in wx.imgs],
16
16
  markdown_content=wx.markdown_content,
17
- raw_url=raw_url,
18
17
  )
19
18
 
20
19
 
@@ -14,7 +14,6 @@ from ...types import (
14
14
  VideoParseResult,
15
15
  VideoRef,
16
16
  )
17
- from ...utils.utils import clear_params
18
17
  from ..base import BaseParser
19
18
 
20
19
 
@@ -23,15 +22,14 @@ class XHSParser(BaseParser):
23
22
  __supported_type__ = ["视频", "图文"]
24
23
  __match__ = r"^(http(s)?://)?.+(xiaohongshu|xhslink).com/.+"
25
24
  __redirect_keywords__ = ["xhslink", "item"]
26
- __reserved_parameters__ = ["xsec_token"]
25
+ __after_clean_parameters__ = ["xsec_token"]
27
26
 
28
27
  async def _do_parse(self, raw_url: str) -> Union["VideoParseResult", "ImageParseResult", "MultimediaParseResult"]:
29
- raw_url_ = clear_params(raw_url, "xsec_token")
30
28
  xhs = XHSAPI(proxy=self.proxy)
31
29
  result = await xhs.extract(raw_url)
32
30
 
33
31
  desc = self.hashtag_handler(result.desc)
34
- k = {"title": result.title, "content": desc, "raw_url": raw_url_}
32
+ k = {"title": result.title, "content": desc, "raw_url": raw_url}
35
33
  match result.type:
36
34
  case XHSPostType.VIDEO:
37
35
  v: XHSMedia = result.media[0]
@@ -31,7 +31,7 @@ class XiaoHeiHeParser(BaseParser):
31
31
  return ImageParseResult(photo=media, **v)
32
32
  return MultimediaParseResult(media=media, **v)
33
33
  case XiaoHeiHePostType.ARTICLE:
34
- return RichTextParseResult(title=xhh.title, media=media, markdown_content=xhh.content, raw_url=raw_url)
34
+ return RichTextParseResult(title=xhh.title, media=media, markdown_content=xhh.content)
35
35
 
36
36
  @staticmethod
37
37
  def __parse_media(xhh: XiaoHeiHePost):
@@ -19,7 +19,6 @@ class ZuiYouParser(BaseParser):
19
19
  else ImageRef(url=i.url, thumb_url=i.thumb_url)
20
20
  for i in zy.media
21
21
  ],
22
- raw_url=raw_url,
23
22
  )
24
23
 
25
24
 
@@ -294,4 +294,4 @@ class WeiboContent:
294
294
 
295
295
 
296
296
  if __name__ == "__main__":
297
- print(asyncio.run(WeiboAPI().parse("https://weibo.com/3208333150/Ow0iEbEX0")))
297
+ print(asyncio.run(WeiboAPI().parse("https://weibo.com/6576374129/Qv0n8sXum")))
@@ -6,7 +6,7 @@ ProgressUnit = Literal["bytes", "count"]
6
6
  class ProgressCallback(Protocol):
7
7
  """下载进度回调: (current, total, unit, *args) -> None"""
8
8
 
9
- async def __call__(self, current: int, total: int, unit: ProgressUnit, *args) -> None:
9
+ async def __call__(self, current: int, total: int, unit: ProgressUnit, *args, **kwargs) -> None:
10
10
  """
11
11
  下载进度回调
12
12
  Args:
@@ -16,6 +16,7 @@ class ProgressCallback(Protocol):
16
16
  - ``bytes``: 字节进度,用于单文件下载时报告已下载/总字节数
17
17
  - ``count``: 计数进度,用于多文件下载时报告已完成/总文件数
18
18
  *args: 自定义参数
19
+ **kwargs: 自定义关键字参数
19
20
 
20
21
  Returns:
21
22
  None
@@ -29,7 +29,6 @@ class ParseResult(ABC): # noqa: B024
29
29
 
30
30
  def __init__(
31
31
  self,
32
- raw_url: str,
33
32
  title: str = "",
34
33
  content: str = "",
35
34
  media: list[AnyMediaRef] | AnyMediaRef | None = None,
@@ -39,13 +38,12 @@ class ParseResult(ABC): # noqa: B024
39
38
  :param title: 标题
40
39
  :param media: 媒体下载链接
41
40
  :param content: 正文 (纯文本)
42
- :param raw_url: 原始帖子链接
43
41
  :param platform: 平台
44
42
  """
43
+ self.raw_url = None
45
44
  self.title = (title or "").strip()
46
45
  self.content = (content or "").strip()
47
46
  self.media = media
48
- self.raw_url = raw_url
49
47
  self.platform = platform
50
48
 
51
49
  def __repr__(self):
@@ -79,6 +77,7 @@ class ParseResult(ABC): # noqa: B024
79
77
  output_dir: str | Path,
80
78
  callback: ProgressCallback | None = None,
81
79
  callback_args: tuple = (),
80
+ callback_kwargs: dict | None = None,
82
81
  proxy: str | None = None,
83
82
  headers: dict | None = None,
84
83
  ) -> "DownloadResult":
@@ -87,6 +86,7 @@ class ParseResult(ABC): # noqa: B024
87
86
  :param output_dir: 输出的子目录
88
87
  :param callback: 下载进度回调函数
89
88
  :param callback_args: 回调函数的参数
89
+ :param callback_kwargs: 回调函数的关键字参数
90
90
  :param proxy: 代理
91
91
  :param headers: 请求头
92
92
  :return: DownloadResult
@@ -99,13 +99,15 @@ class ParseResult(ABC): # noqa: B024
99
99
  for i, media in enumerate(media_list):
100
100
  dl_progress = None
101
101
  dl_progress_args = ()
102
+ dl_progress_kwargs = {}
102
103
  if callback and is_single:
103
104
 
104
- async def _byte_callback(current, total, *args):
105
- await callback(current, total, "bytes", *args)
105
+ async def _byte_callback(current, total, *args, **kwargs):
106
+ await callback(current, total, "bytes", *args, **kwargs)
106
107
 
107
108
  dl_progress = _byte_callback
108
109
  dl_progress_args = callback_args
110
+ dl_progress_kwargs = callback_kwargs
109
111
 
110
112
  try:
111
113
  f = await download(
@@ -115,6 +117,7 @@ class ParseResult(ABC): # noqa: B024
115
117
  proxies=proxy,
116
118
  progress=dl_progress,
117
119
  progress_args=dl_progress_args,
120
+ progress_kwargs=dl_progress_kwargs,
118
121
  )
119
122
  except Exception as e:
120
123
  shutil.rmtree(output_dir, ignore_errors=True)
@@ -162,6 +165,7 @@ class ParseResult(ABC): # noqa: B024
162
165
  *,
163
166
  callback: ProgressCallback | None = None,
164
167
  callback_args: tuple = (),
168
+ callback_kwargs: dict | None = None,
165
169
  proxy: str | None = None,
166
170
  save_metadata: bool = False,
167
171
  ) -> "DownloadResult":
@@ -169,6 +173,7 @@ class ParseResult(ABC): # noqa: B024
169
173
  :param path: 保存路径
170
174
  :param callback: 下载进度回调函数
171
175
  :param callback_args: 下载进度回调函数参数
176
+ :param callback_kwargs: 回调函数的关键字参数
172
177
  :param proxy: 代理
173
178
  :param save_metadata: 保存解析结果为 metadata.json, 默认为 False
174
179
  :return: DownloadResult
@@ -176,7 +181,7 @@ class ParseResult(ABC): # noqa: B024
176
181
  Note:
177
182
  下载进度回调函数签名::
178
183
 
179
- async def callback(current: int, total: int, unit: Literal['bytes', 'count'], *args) -> None
184
+ async def callback(current: int, total: int, unit: Literal['bytes', 'count'], *args, **kwargs) -> None
180
185
 
181
186
  - current: 当前进度值
182
187
  - total: 总进度值
@@ -201,7 +206,11 @@ class ParseResult(ABC): # noqa: B024
201
206
 
202
207
  try:
203
208
  return await self._do_download(
204
- output_dir=output_dir, callback=callback, callback_args=callback_args, proxy=proxy
209
+ output_dir=output_dir,
210
+ callback=callback,
211
+ callback_args=callback_args,
212
+ callback_kwargs=callback_kwargs,
213
+ proxy=proxy,
205
214
  )
206
215
  except Exception as e:
207
216
  shutil.rmtree(output_dir, ignore_errors=True)
@@ -213,6 +222,7 @@ class ParseResult(ABC): # noqa: B024
213
222
  *,
214
223
  callback: ProgressCallback | None = None,
215
224
  callback_args: tuple = (),
225
+ callback_kwargs: dict | None = None,
216
226
  proxy: str | None = None,
217
227
  save_metadata: bool = False,
218
228
  ) -> "DownloadResult":
@@ -220,6 +230,7 @@ class ParseResult(ABC): # noqa: B024
220
230
  :param path: 保存路径
221
231
  :param callback: 下载进度回调函数
222
232
  :param callback_args: 下载进度回调函数参数
233
+ :param callback_kwargs: 回调函数的关键字参数
223
234
  :param proxy: 代理
224
235
  :param save_metadata: 保存解析结果为 metadata.json, 默认为 False
225
236
  :return: DownloadResult
@@ -237,7 +248,12 @@ class ParseResult(ABC): # noqa: B024
237
248
  """
238
249
  return get_event_loop().run_until_complete(
239
250
  self.download(
240
- path, callback=callback, callback_args=callback_args, proxy=proxy, save_metadata=save_metadata
251
+ path,
252
+ callback=callback,
253
+ callback_args=callback_args,
254
+ callback_kwargs=callback_kwargs,
255
+ proxy=proxy,
256
+ save_metadata=save_metadata,
241
257
  )
242
258
  )
243
259
 
@@ -249,7 +265,6 @@ class VideoParseResult(ParseResult):
249
265
 
250
266
  def __init__(
251
267
  self,
252
- raw_url: str,
253
268
  title: str = "",
254
269
  video: str | VideoRef | None = None,
255
270
  content: str = "",
@@ -259,7 +274,6 @@ class VideoParseResult(ParseResult):
259
274
  title=title,
260
275
  media=video,
261
276
  content=content,
262
- raw_url=raw_url,
263
277
  )
264
278
 
265
279
 
@@ -270,14 +284,13 @@ class ImageParseResult(ParseResult):
270
284
 
271
285
  def __init__(
272
286
  self,
273
- raw_url: str,
274
287
  title: str = "",
275
288
  photo: list[str | ImageRef | LivePhotoRef] | None = None,
276
289
  content: str = "",
277
290
  ):
278
291
  if photo:
279
292
  photo = [ImageRef(url=p) if isinstance(p, str) else p for p in photo]
280
- super().__init__(title=title, media=photo, content=content, raw_url=raw_url)
293
+ super().__init__(title=title, media=photo, content=content)
281
294
 
282
295
 
283
296
  class MultimediaParseResult(ParseResult):
@@ -287,12 +300,11 @@ class MultimediaParseResult(ParseResult):
287
300
 
288
301
  def __init__(
289
302
  self,
290
- raw_url: str,
291
303
  title: str = "",
292
304
  media: list[AnyMediaRef] | None = None,
293
305
  content: str = "",
294
306
  ):
295
- super().__init__(title=title, media=media, content=content, raw_url=raw_url)
307
+ super().__init__(title=title, media=media, content=content)
296
308
 
297
309
 
298
310
  class RichTextParseResult(ParseResult):
@@ -302,20 +314,17 @@ class RichTextParseResult(ParseResult):
302
314
 
303
315
  def __init__(
304
316
  self,
305
- raw_url: str,
306
317
  title: str = "",
307
318
  media: list[AnyMediaRef] | None = None,
308
319
  markdown_content: str = "",
309
320
  ):
310
321
  """
311
-
312
322
  :param title: 标题
313
323
  :param media: 文章中的媒体
314
324
  :param markdown_content: markdown 格式正文
315
- :param raw_url: 原始 URL
316
325
  """
317
326
  self.markdown_content = markdown_content
318
- super().__init__(title=title, media=media, content=self.plaintext_content, raw_url=raw_url)
327
+ super().__init__(title=title, media=media, content=self.plaintext_content)
319
328
 
320
329
  def __repr__(self):
321
330
  media_count = f"[{len(self.media if isinstance(self.media, list) else [self.media])}]" if self.media else None
@@ -3,6 +3,7 @@ import os
3
3
  import re
4
4
  from collections.abc import Callable
5
5
  from pathlib import Path
6
+ from typing import Literal
6
7
  from urllib.parse import unquote, urlparse
7
8
 
8
9
  import aiofiles
@@ -17,6 +18,7 @@ async def download(
17
18
  proxies: httpx.Proxy | None = None,
18
19
  progress: Callable | None = None,
19
20
  progress_args: tuple = (),
21
+ progress_kwargs: dict | None = None,
20
22
  max_retries: int = 3,
21
23
  chunk_size: int = 8192,
22
24
  ) -> str:
@@ -26,7 +28,8 @@ async def download(
26
28
  :param headers: 请求头
27
29
  :param proxies: 代理
28
30
  :param progress: 下载进度回调函数
29
- :param progress_args: 下载进度回调函数参数
31
+ :param progress_args: 下载进度回调函数的参数
32
+ :param progress_kwargs: 下载进度回调函数的关键字参数
30
33
  :param max_retries: 最大重试次数
31
34
  :param chunk_size: 分块大小
32
35
  :return: 文件路径
@@ -78,7 +81,7 @@ async def download(
78
81
 
79
82
  current = resume_pos if is_resumed else 0
80
83
 
81
- file_mode = "ab" if is_resumed else "wb"
84
+ file_mode: Literal["ab", "wb"] = "ab" if is_resumed else "wb"
82
85
 
83
86
  async with aiofiles.open(file=resolved_path, mode=file_mode) as f:
84
87
  async for chunk in r.aiter_bytes(chunk_size=chunk_size):
@@ -86,7 +89,9 @@ async def download(
86
89
  await f.write(chunk)
87
90
  current += len(chunk)
88
91
  if progress:
89
- await progress(current, total_size, *progress_args)
92
+ if progress_kwargs is None:
93
+ progress_kwargs = {}
94
+ await progress(current, total_size, *progress_args, **progress_kwargs)
90
95
 
91
96
  # 完整性校验
92
97
  if 0 < total_size != current:
@@ -1,7 +1,6 @@
1
1
  import asyncio
2
2
  import json
3
3
  import re
4
- from urllib.parse import parse_qs, urlparse
5
4
 
6
5
  from urlextract import URLExtract
7
6
 
@@ -15,7 +14,7 @@ def get_event_loop():
15
14
  return event_loop
16
15
 
17
16
 
18
- url_extractor = URLExtract()
17
+ _url_extractor = URLExtract()
19
18
 
20
19
 
21
20
  def match_url(text: str) -> str:
@@ -23,7 +22,7 @@ def match_url(text: str) -> str:
23
22
  if not text:
24
23
  return ""
25
24
  text = re.sub(r"(https?://)", r" \1", text) # 协议前面增加空格, 方便提取
26
- url = url_extractor.find_urls(text, only_unique=True)
25
+ url = _url_extractor.find_urls(text, only_unique=True)
27
26
  return url[0] if url else ""
28
27
 
29
28
 
@@ -35,23 +34,6 @@ def cookie_ellipsis(cookie: dict) -> str:
35
34
  return f"{text[:c]}......{text[-c:]}"
36
35
 
37
36
 
38
- def clear_params(url: str, param: str | list[str]) -> str:
39
- """
40
- 删除链接指定参数
41
- :param url: 链接
42
- :param param: 参数
43
- :return:
44
- """
45
- params = param if isinstance(param, list) else [param]
46
- parsed_url = urlparse(url)
47
- query_params = parse_qs(parsed_url.query)
48
- for i in params.copy():
49
- if i in query_params:
50
- del query_params[i]
51
- new_query = "&".join([f"{k}={v[0]}" for k, v in query_params.items()])
52
- return parsed_url._replace(query=new_query).geturl()
53
-
54
-
55
37
  def normalize_cookie(v):
56
38
  if v is None or isinstance(v, dict):
57
39
  return v
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: parsehub
3
- Version: 2.0.3
3
+ Version: 2.0.5
4
4
  Summary: 轻量、异步、开箱即用的社交媒体聚合解析库
5
5
  Author-email: 梓澪 <zilingmio@gmail.com>
6
6
  License: MIT
@@ -119,6 +119,25 @@ print(result)
119
119
 
120
120
  ## 🔑 高级用法
121
121
 
122
+ ### 下载进度回调
123
+
124
+ ```python
125
+ from parsehub import ParseHub
126
+
127
+
128
+ class ProgressTracker:
129
+ async def __call__(self, current: int, total: int, unit: str, *args, task_name: str = "", **kwargs):
130
+ print(f"[{task_name}] {current}/{total} ({unit})")
131
+
132
+
133
+ result = ParseHub().download_sync(
134
+ "https://example.com",
135
+ callback=ProgressTracker(),
136
+ callback_args=("extra_arg",),
137
+ callback_kwargs={"task_name": "demo"},
138
+ )
139
+ ```
140
+
122
141
  ### Cookie 登录 & 代理
123
142
 
124
143
  部分平台的内容需要登录才能访问,通过 Cookie 即可解锁:
@@ -126,7 +145,7 @@ print(result)
126
145
  ```python
127
146
  from parsehub import ParseHub
128
147
 
129
- ph = ParseHub(cookie="key1=value1; key2=value2", proxy="http://127.0.0.1:7890",)
148
+ ph = ParseHub(cookie="key1=value1; key2=value2", proxy="http://127.0.0.1:7890", )
130
149
  ```
131
150
 
132
151
  Cookie 支持多种格式传入:
File without changes
File without changes
File without changes