parsehub 2.0.23__tar.gz → 2.0.24__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {parsehub-2.0.23/src/parsehub.egg-info → parsehub-2.0.24}/PKG-INFO +1 -1
- {parsehub-2.0.23 → parsehub-2.0.24}/pyproject.toml +7 -6
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/__init__.py +3 -3
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/cli.py +5 -4
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/parsers/base/base.py +2 -1
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/parsers/base/ytdlp.py +18 -14
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/parsers/parser/bilibili.py +7 -7
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/parsers/parser/coolapk.py +3 -3
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/parsers/parser/instagram.py +4 -3
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/parsers/parser/tiktok.py +2 -2
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/parsers/parser/twitter.py +3 -3
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/parsers/parser/weibo.py +2 -2
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/parsers/parser/xhs.py +10 -5
- parsehub-2.0.24/src/parsehub/parsers/parser/xiaoheihe.py +60 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/parsers/parser/youtube.py +1 -1
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/provider_api/bilibili.py +11 -10
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/provider_api/coolapk.py +1 -1
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/provider_api/douyin.py +13 -7
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/provider_api/instagram.py +11 -10
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/provider_api/kuaishou.py +6 -5
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/provider_api/pipix.py +1 -1
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/provider_api/threads.py +11 -9
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/provider_api/tieba.py +4 -3
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/provider_api/tiktok.py +2 -2
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/provider_api/twitter.py +15 -10
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/provider_api/weibo.py +26 -2
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/provider_api/weixin.py +5 -3
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/provider_api/xhs.py +16 -13
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/provider_api/xiaoheihe.py +12 -8
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/provider_api/zuiyou.py +6 -6
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/types/callback.py +2 -2
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/types/media_file.py +5 -5
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/types/platform.py +2 -2
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/types/result.py +6 -6
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/utils/utils.py +2 -2
- {parsehub-2.0.23 → parsehub-2.0.24/src/parsehub.egg-info}/PKG-INFO +1 -1
- {parsehub-2.0.23 → parsehub-2.0.24}/test/test_core_offline.py +8 -3
- parsehub-2.0.23/src/parsehub/parsers/parser/xiaoheihe.py +0 -50
- {parsehub-2.0.23 → parsehub-2.0.24}/LICENSE +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/README.md +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/setup.cfg +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/__init__.py +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/cli_config.py +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/config/__init__.py +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/config/config.py +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/errors.py +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/parsers/__init__.py +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/parsers/base/__init__.py +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/parsers/parser/__init__.py +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/parsers/parser/douyin.py +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/parsers/parser/facebook.py +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/parsers/parser/kuaishou.py +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/parsers/parser/pipix.py +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/parsers/parser/threads.py +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/parsers/parser/tieba.py +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/parsers/parser/weixin.py +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/parsers/parser/zuiyou.py +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/provider_api/__init__.py +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/types/__init__.py +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/types/media_ref.py +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/types/post.py +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/utils/downloader.py +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub/utils/media_info.py +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub.egg-info/SOURCES.txt +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub.egg-info/dependency_links.txt +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub.egg-info/entry_points.txt +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub.egg-info/requires.txt +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/src/parsehub.egg-info/top_level.txt +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/test/test_cli.py +0 -0
- {parsehub-2.0.23 → parsehub-2.0.24}/test/test_cli_config.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "parsehub"
|
|
3
|
-
version = "2.0.23"
|
|
3
|
+
version = "2.0.24"
|
|
4
4
|
description = "轻量、异步、开箱即用的社交媒体聚合解析库"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.12.0"
|
|
@@ -85,14 +85,15 @@ ignore = [
|
|
|
85
85
|
[tool.mypy]
|
|
86
86
|
python_version = "3.12"
|
|
87
87
|
files = ["./"]
|
|
88
|
+
exclude = ["test"]
|
|
88
89
|
ignore_missing_imports = true
|
|
89
|
-
warn_return_any =
|
|
90
|
+
warn_return_any = true
|
|
90
91
|
warn_unused_ignores = true
|
|
91
|
-
check_untyped_defs =
|
|
92
|
-
disallow_untyped_defs =
|
|
93
|
-
no_implicit_optional =
|
|
92
|
+
check_untyped_defs = true
|
|
93
|
+
disallow_untyped_defs = true
|
|
94
|
+
no_implicit_optional = true
|
|
94
95
|
|
|
95
96
|
[tool.pytest.ini_options]
|
|
96
97
|
testpaths = ["test"]
|
|
97
98
|
pythonpath = ["src"]
|
|
98
|
-
python_files = ["test_*.py"]
|
|
99
|
+
python_files = ["test_*.py"]
|
|
@@ -13,7 +13,7 @@ logger.disable(__name__)
|
|
|
13
13
|
|
|
14
14
|
|
|
15
15
|
class ParseHub:
|
|
16
|
-
def __init__(self):
|
|
16
|
+
def __init__(self) -> None:
|
|
17
17
|
self.parsers: list[type[BaseParser]] = BaseParser.get_registry()
|
|
18
18
|
|
|
19
19
|
async def parse(self, url: str, *, proxy: str | None = None, cookie: str | dict | None = None) -> AnyParseResult:
|
|
@@ -185,7 +185,7 @@ class ParseHub:
|
|
|
185
185
|
return parser
|
|
186
186
|
return None
|
|
187
187
|
|
|
188
|
-
def get_parser(self, url) -> type[BaseParser] | None:
|
|
188
|
+
def get_parser(self, url: str) -> type[BaseParser] | None:
|
|
189
189
|
"""获取解析器
|
|
190
190
|
:param url: 分享文案 / 分享链接
|
|
191
191
|
"""
|
|
@@ -193,7 +193,7 @@ class ParseHub:
|
|
|
193
193
|
return parser
|
|
194
194
|
return None
|
|
195
195
|
|
|
196
|
-
def get_platform(self, url) -> Platform | None:
|
|
196
|
+
def get_platform(self, url: str) -> Platform | None:
|
|
197
197
|
"""获取平台
|
|
198
198
|
:param url: 分享文案 / 分享链接
|
|
199
199
|
"""
|
|
@@ -46,7 +46,7 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
46
46
|
try:
|
|
47
47
|
args = parser.parse_args(_normalize_argv(raw_argv))
|
|
48
48
|
_finalize_output_args(args)
|
|
49
|
-
return args.func(args)
|
|
49
|
+
return int(args.func(args))
|
|
50
50
|
except SystemExit as e:
|
|
51
51
|
return _normalize_exit_code(e.code)
|
|
52
52
|
except ValueError as e:
|
|
@@ -365,14 +365,15 @@ def _cookie_prompt() -> Any:
|
|
|
365
365
|
|
|
366
366
|
def _load_platform_config(platform_id: str | None) -> PlatformConfig:
|
|
367
367
|
if not platform_id:
|
|
368
|
-
return _platform_config_type()()
|
|
369
|
-
return _config_store().get_platform(platform_id)
|
|
368
|
+
return cast("PlatformConfig", _platform_config_type()())
|
|
369
|
+
return cast("PlatformConfig", _config_store().get_platform(platform_id))
|
|
370
370
|
|
|
371
371
|
|
|
372
372
|
def _load_cookie(platform_id: str | None) -> str | None:
|
|
373
373
|
if not platform_id:
|
|
374
374
|
return None
|
|
375
|
-
|
|
375
|
+
value = _cookie_store().get(platform_id)
|
|
376
|
+
return value if isinstance(value, str) or value is None else str(value)
|
|
376
377
|
|
|
377
378
|
|
|
378
379
|
def _detect_platform_id(hub: Any, url_or_text: str) -> str | None:
|
|
@@ -2,6 +2,7 @@ import importlib
|
|
|
2
2
|
import pkgutil
|
|
3
3
|
import re
|
|
4
4
|
from abc import ABC, abstractmethod
|
|
5
|
+
from typing import Any
|
|
5
6
|
from urllib.parse import parse_qs, urlencode, urlparse
|
|
6
7
|
|
|
7
8
|
import httpx
|
|
@@ -34,7 +35,7 @@ class BaseParser(ABC):
|
|
|
34
35
|
self.proxy = proxy
|
|
35
36
|
self.cookie = normalize_cookie(cookie)
|
|
36
37
|
|
|
37
|
-
def __init_subclass__(cls, /, register: bool = True, **kwargs):
|
|
38
|
+
def __init_subclass__(cls, /, register: bool = True, **kwargs: Any) -> None:
|
|
38
39
|
super().__init_subclass__(**kwargs)
|
|
39
40
|
if register:
|
|
40
41
|
if not cls.__platform__:
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
+
from collections.abc import Callable
|
|
2
3
|
from dataclasses import dataclass
|
|
3
4
|
from pathlib import Path
|
|
5
|
+
from typing import Any, cast
|
|
4
6
|
|
|
5
7
|
from yt_dlp import YoutubeDL
|
|
6
8
|
|
|
@@ -30,7 +32,7 @@ def switch_ytdlp_proxy(ydl: YoutubeDL, proxy: str | None) -> None:
|
|
|
30
32
|
director.close()
|
|
31
33
|
|
|
32
34
|
|
|
33
|
-
def download_video(yto_params: dict, url: str, proxy: str | None = None) -> None:
|
|
35
|
+
def download_video(yto_params: dict[str, Any], url: str, proxy: str | None = None) -> None:
|
|
34
36
|
"""在独立线程中下载视频"""
|
|
35
37
|
try:
|
|
36
38
|
with YoutubeDL(yto_params) as ydl:
|
|
@@ -43,14 +45,16 @@ def download_video(yto_params: dict, url: str, proxy: str | None = None) -> None
|
|
|
43
45
|
|
|
44
46
|
|
|
45
47
|
class MonotonicDownloadProgress:
|
|
46
|
-
def __init__(
|
|
48
|
+
def __init__(
|
|
49
|
+
self, emit: Callable[[float], None], *, start: float = 0.0, end: float = 100.0, min_step: float = 0.1
|
|
50
|
+
) -> None:
|
|
47
51
|
self.emit = emit
|
|
48
52
|
self.start = start
|
|
49
53
|
self.end = end
|
|
50
54
|
self.min_step = min_step
|
|
51
55
|
self.current = start
|
|
52
56
|
|
|
53
|
-
def __call__(self, d: dict):
|
|
57
|
+
def __call__(self, d: dict[str, Any]) -> None:
|
|
54
58
|
status = d.get("status")
|
|
55
59
|
|
|
56
60
|
if status == "downloading":
|
|
@@ -82,8 +86,8 @@ class MonotonicDownloadProgress:
|
|
|
82
86
|
# 分片下载有时没有稳定总大小,但有 frag 进度;作为兜底
|
|
83
87
|
frag_index = d.get("fragment_index")
|
|
84
88
|
frag_count = d.get("fragment_count")
|
|
85
|
-
if frag_index
|
|
86
|
-
return min(frag_index / frag_count * 100, 100)
|
|
89
|
+
if isinstance(frag_index, int | float) and isinstance(frag_count, int | float) and frag_count:
|
|
90
|
+
return min(float(frag_index) / float(frag_count) * 100, 100.0)
|
|
87
91
|
|
|
88
92
|
return None
|
|
89
93
|
|
|
@@ -106,7 +110,7 @@ class YtParser(BaseParser, register=False):
|
|
|
106
110
|
),
|
|
107
111
|
)
|
|
108
112
|
|
|
109
|
-
async def _parse(self, url) -> "YtVideoInfo":
|
|
113
|
+
async def _parse(self, url: str) -> "YtVideoInfo":
|
|
110
114
|
try:
|
|
111
115
|
dl = await asyncio.wait_for(asyncio.to_thread(self._extract_info, url), timeout=30)
|
|
112
116
|
except TimeoutError as e:
|
|
@@ -136,20 +140,20 @@ class YtParser(BaseParser, register=False):
|
|
|
136
140
|
proxy=self.proxy,
|
|
137
141
|
)
|
|
138
142
|
|
|
139
|
-
def _extract_info(self, url):
|
|
143
|
+
def _extract_info(self, url: str) -> dict[str, Any]:
|
|
140
144
|
params = self.params.copy()
|
|
141
145
|
if self.proxy:
|
|
142
146
|
params["proxy"] = self.proxy
|
|
143
147
|
|
|
144
148
|
try:
|
|
145
149
|
with YoutubeDL(params) as ydl:
|
|
146
|
-
return ydl.extract_info(url, download=False)
|
|
150
|
+
return cast(dict[str, Any], ydl.extract_info(url, download=False))
|
|
147
151
|
except Exception as e:
|
|
148
152
|
error_msg = f"{type(e).__name__}: {str(e)}"
|
|
149
153
|
raise RuntimeError(error_msg) from None
|
|
150
154
|
|
|
151
155
|
@property
|
|
152
|
-
def params(self) -> dict:
|
|
156
|
+
def params(self) -> dict[str, Any]:
|
|
153
157
|
params = {
|
|
154
158
|
"format": "mp4+bestvideo[height<=1080]+bestaudio",
|
|
155
159
|
"quiet": True, # 不输出日志
|
|
@@ -170,9 +174,9 @@ class YtVideoParseResult(VideoParseResult):
|
|
|
170
174
|
def __init__(
|
|
171
175
|
self,
|
|
172
176
|
dl: "YtVideoInfo",
|
|
173
|
-
title,
|
|
174
|
-
video=None,
|
|
175
|
-
content=None,
|
|
177
|
+
title: str | None,
|
|
178
|
+
video: VideoRef | None = None,
|
|
179
|
+
content: str | None = None,
|
|
176
180
|
):
|
|
177
181
|
"""dl: yt-dlp解析结果"""
|
|
178
182
|
self.dl = dl
|
|
@@ -201,7 +205,7 @@ class YtVideoParseResult(VideoParseResult):
|
|
|
201
205
|
if callback:
|
|
202
206
|
loop = asyncio.get_running_loop()
|
|
203
207
|
|
|
204
|
-
def _callback(count: float):
|
|
208
|
+
def _callback(count: float) -> None:
|
|
205
209
|
asyncio.run_coroutine_threadsafe(
|
|
206
210
|
callback(int(count), 100, "bytes", *callback_args, **callback_kwargs), loop
|
|
207
211
|
)
|
|
@@ -237,7 +241,7 @@ class YtVideoParseResult(VideoParseResult):
|
|
|
237
241
|
output_dir,
|
|
238
242
|
)
|
|
239
243
|
|
|
240
|
-
async def _run_download(self, paramss: dict, count: int = 0, *, proxy: str | None = None) -> None:
|
|
244
|
+
async def _run_download(self, paramss: dict[str, Any], count: int = 0, *, proxy: str | None = None) -> None:
|
|
241
245
|
if count > 2:
|
|
242
246
|
raise DownloadError("下载失败 -2")
|
|
243
247
|
|
|
@@ -58,7 +58,7 @@ class BiliParse(YtParser):
|
|
|
58
58
|
raise ParseError("Bilibili 解析失败") from e
|
|
59
59
|
|
|
60
60
|
@staticmethod
|
|
61
|
-
def _is_bvid(url: str):
|
|
61
|
+
def _is_bvid(url: str) -> bool:
|
|
62
62
|
if url.lower().startswith("bv"):
|
|
63
63
|
return True
|
|
64
64
|
else:
|
|
@@ -79,7 +79,7 @@ class BiliParse(YtParser):
|
|
|
79
79
|
return await super().get_raw_url(url, clean_all=clean_all)
|
|
80
80
|
|
|
81
81
|
@staticmethod
|
|
82
|
-
async def is_dynamic(url) -> str | None:
|
|
82
|
+
async def is_dynamic(url: str) -> str | None:
|
|
83
83
|
"""是动态"""
|
|
84
84
|
if re.search(r"\b\d{18,19}\b", url):
|
|
85
85
|
return url
|
|
@@ -93,9 +93,9 @@ class BiliParse(YtParser):
|
|
|
93
93
|
if "风控" in str(e):
|
|
94
94
|
raise ParseError(f"账号风控\n使用的cookie: {cookie_ellipsis(self.cookie)}") from e
|
|
95
95
|
raise ParseError(str(e)) from e
|
|
96
|
-
return dynamic_info
|
|
96
|
+
return cast(BiliDynamic, dynamic_info)
|
|
97
97
|
|
|
98
|
-
async def bili_api_parse(self, url) -> BiliVideoParseResult | ImageParseResult:
|
|
98
|
+
async def bili_api_parse(self, url: str) -> BiliVideoParseResult | ImageParseResult:
|
|
99
99
|
async with BiliAPI(proxy=self.proxy) as bili:
|
|
100
100
|
video_info = await bili.get_video_info(url)
|
|
101
101
|
|
|
@@ -138,16 +138,16 @@ class BiliParse(YtParser):
|
|
|
138
138
|
),
|
|
139
139
|
)
|
|
140
140
|
|
|
141
|
-
async def ytp_parse(self, url) -> YtVideoParseResult:
|
|
141
|
+
async def ytp_parse(self, url: str) -> YtVideoParseResult:
|
|
142
142
|
result = cast(YtVideoParseResult, await super()._do_parse(url))
|
|
143
143
|
return YtVideoParseResult(
|
|
144
144
|
title=result.title,
|
|
145
145
|
dl=result.dl,
|
|
146
|
-
video=result.media,
|
|
146
|
+
video=cast(VideoRef | None, result.media),
|
|
147
147
|
)
|
|
148
148
|
|
|
149
149
|
@staticmethod
|
|
150
|
-
def change_source(url: str):
|
|
150
|
+
def change_source(url: str) -> str:
|
|
151
151
|
return re.sub(
|
|
152
152
|
r"upos-.*.(bilivideo.com|mirrorakam.akamaized.net)",
|
|
153
153
|
"upos-sz-upcdnbda2.bilivideo.com",
|
|
@@ -52,7 +52,7 @@ class CoolapkParser(BaseParser):
|
|
|
52
52
|
)
|
|
53
53
|
|
|
54
54
|
@staticmethod
|
|
55
|
-
def hashtag_handler(desc: str):
|
|
55
|
+
def hashtag_handler(desc: str) -> str:
|
|
56
56
|
hashtags = re.findall(r" ?#[^#]+# ?", desc)
|
|
57
57
|
for hashtag in hashtags:
|
|
58
58
|
desc = desc.replace(hashtag, f" {hashtag.strip().removesuffix('#')} ")
|
|
@@ -64,11 +64,11 @@ class CoolapkParseResult(ParseResult):
|
|
|
64
64
|
self,
|
|
65
65
|
*,
|
|
66
66
|
output_dir: str | Path,
|
|
67
|
-
callback: ProgressCallback = None,
|
|
67
|
+
callback: ProgressCallback | None = None,
|
|
68
68
|
callback_args: tuple = (),
|
|
69
69
|
callback_kwargs: dict | None = None,
|
|
70
70
|
proxy: str | None = None,
|
|
71
|
-
headers: dict = None,
|
|
71
|
+
headers: dict | None = None,
|
|
72
72
|
) -> "DownloadResult":
|
|
73
73
|
headers = {
|
|
74
74
|
"Accept": (
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import re
|
|
3
|
+
from typing import Any, cast
|
|
3
4
|
|
|
4
5
|
from instaloader import BadResponseException
|
|
5
6
|
|
|
@@ -56,7 +57,7 @@ class InstagramParser(BaseParser):
|
|
|
56
57
|
case _:
|
|
57
58
|
raise ParseError("不支持的类型")
|
|
58
59
|
|
|
59
|
-
async def _parse(self, url, shortcode, cookie=None) -> MyPost:
|
|
60
|
+
async def _parse(self, url: str, shortcode: str, cookie: dict[str, Any] | None = None) -> MyPost:
|
|
60
61
|
try:
|
|
61
62
|
post = await asyncio.wait_for(
|
|
62
63
|
asyncio.to_thread(
|
|
@@ -84,10 +85,10 @@ class InstagramParser(BaseParser):
|
|
|
84
85
|
text = str(e)
|
|
85
86
|
raise ParseError(f"无法获取帖子内容: {text}") from e
|
|
86
87
|
else:
|
|
87
|
-
return post
|
|
88
|
+
return cast(MyPost, post)
|
|
88
89
|
|
|
89
90
|
@staticmethod
|
|
90
|
-
def get_short_code(url: str):
|
|
91
|
+
def get_short_code(url: str) -> str | None:
|
|
91
92
|
url = url.removesuffix("/")
|
|
92
93
|
shortcode = re.search(r"/(share|p|reel|.*/p|.*/reel)/(.*)", url)
|
|
93
94
|
return shortcode.group(2).split("/")[0] if shortcode else None
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from dataclasses import dataclass, field
|
|
2
2
|
from enum import Enum
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import Self, Union
|
|
4
|
+
from typing import Any, Self, Union
|
|
5
5
|
|
|
6
6
|
from ... import ProgressCallback
|
|
7
7
|
from ...provider_api.tiktok import TikTokWebCrawler
|
|
@@ -110,7 +110,7 @@ def preferred_video_url(data: dict | str | list | None) -> str | None:
|
|
|
110
110
|
return next((url for url in urls if "aweme" in url), None) or (urls[0] if urls else None)
|
|
111
111
|
|
|
112
112
|
|
|
113
|
-
def as_int(value) -> int:
|
|
113
|
+
def as_int(value: Any) -> int:
|
|
114
114
|
try:
|
|
115
115
|
return int(value or 0)
|
|
116
116
|
except (TypeError, ValueError):
|
|
@@ -26,7 +26,7 @@ class TwitterParser(BaseParser):
|
|
|
26
26
|
__supported_type__ = ["视频", "图文"]
|
|
27
27
|
__match__ = r"^(http(s)?://)?.+(twitter|fixupx|x).com/.*/status/\d+"
|
|
28
28
|
|
|
29
|
-
async def _do_parse(self, raw_url: str) ->
|
|
29
|
+
async def _do_parse(self, raw_url: str) -> MultimediaParseResult | RichTextParseResult:
|
|
30
30
|
tweet = await self._parse(raw_url)
|
|
31
31
|
return await self.media_parse(tweet)
|
|
32
32
|
|
|
@@ -34,7 +34,7 @@ class TwitterParser(BaseParser):
|
|
|
34
34
|
url = await super().get_raw_url(url, clean_all=clean_all)
|
|
35
35
|
return str(urlunparse(urlparse(url)._replace(netloc="x.com")))
|
|
36
36
|
|
|
37
|
-
async def _parse(self, url: str):
|
|
37
|
+
async def _parse(self, url: str) -> TwitterTweet:
|
|
38
38
|
x = Twitter(self.proxy, cookie=None)
|
|
39
39
|
try:
|
|
40
40
|
tweet = await x.fetch_tweet(url)
|
|
@@ -55,7 +55,7 @@ class TwitterParser(BaseParser):
|
|
|
55
55
|
return tweet
|
|
56
56
|
|
|
57
57
|
@staticmethod
|
|
58
|
-
async def media_parse(tweet: TwitterTweet):
|
|
58
|
+
async def media_parse(tweet: TwitterTweet) -> MultimediaParseResult | RichTextParseResult:
|
|
59
59
|
media: list[AnyMediaRef] = []
|
|
60
60
|
if tweet.media:
|
|
61
61
|
for m in tweet.media:
|
|
@@ -17,7 +17,7 @@ from ..base.base import BaseParser
|
|
|
17
17
|
class WeiboParser(BaseParser):
|
|
18
18
|
__platform__ = Platform.WEIBO
|
|
19
19
|
__supported_type__ = ["视频", "图文"]
|
|
20
|
-
__match__ = r"^(http(s)?://)(m\.|)weibo
|
|
20
|
+
__match__ = r"^(http(s)?://)((m\.|)weibo\.(com|cn)/(?!(u/)).+|mapp\.api\.weibo\.cn/fx/.+)"
|
|
21
21
|
|
|
22
22
|
async def _do_parse(self, raw_url: str) -> MultimediaParseResult | VideoParseResult | ImageParseResult:
|
|
23
23
|
weibo = await WeiboAPI(self.proxy).parse(raw_url)
|
|
@@ -91,7 +91,7 @@ class WeiboParser(BaseParser):
|
|
|
91
91
|
return text.strip()
|
|
92
92
|
|
|
93
93
|
@staticmethod
|
|
94
|
-
def hashtag_handler(desc: str):
|
|
94
|
+
def hashtag_handler(desc: str) -> str:
|
|
95
95
|
hashtags = re.findall(r" ?#[^#]+# ?", desc)
|
|
96
96
|
for hashtag in hashtags:
|
|
97
97
|
desc = desc.replace(hashtag, f" {hashtag.strip().removesuffix('#')} ")
|
|
@@ -31,6 +31,8 @@ class XHSParser(BaseParser):
|
|
|
31
31
|
desc = self.hashtag_handler(result.desc)
|
|
32
32
|
match result.type:
|
|
33
33
|
case XHSPostType.VIDEO:
|
|
34
|
+
if not result.media:
|
|
35
|
+
raise ParseError("未获取到视频")
|
|
34
36
|
v: XHSMedia = result.media[0]
|
|
35
37
|
return VideoParseResult(
|
|
36
38
|
video=VideoRef(
|
|
@@ -40,10 +42,13 @@ class XHSParser(BaseParser):
|
|
|
40
42
|
content=desc,
|
|
41
43
|
)
|
|
42
44
|
case XHSPostType.IMAGE:
|
|
45
|
+
media_list = result.media or []
|
|
43
46
|
photos: list[ImageRef | LivePhotoRef] = []
|
|
44
|
-
for i in
|
|
47
|
+
for i in media_list:
|
|
45
48
|
if i.type == XHSMediaType.LIVE_PHOTO:
|
|
46
|
-
photos.append(
|
|
49
|
+
photos.append(
|
|
50
|
+
LivePhotoRef(url=i.thumb_url or "", video_url=i.url, width=i.width, height=i.height)
|
|
51
|
+
)
|
|
47
52
|
else:
|
|
48
53
|
# 小红书图片格式: "png" | "webp" | "jpeg" | "heic" | "avif"
|
|
49
54
|
ext = await self.get_ext_by_url(i.url)
|
|
@@ -61,7 +66,7 @@ class XHSParser(BaseParser):
|
|
|
61
66
|
case _:
|
|
62
67
|
raise ParseError("不支持的类型")
|
|
63
68
|
|
|
64
|
-
async def get_ext_by_url(self, url: str):
|
|
69
|
+
async def get_ext_by_url(self, url: str) -> str:
|
|
65
70
|
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
|
66
71
|
try:
|
|
67
72
|
response = await client.head(url, follow_redirects=True)
|
|
@@ -72,12 +77,12 @@ class XHSParser(BaseParser):
|
|
|
72
77
|
media_type = content_type.split(";")[0].strip()
|
|
73
78
|
if "/" in media_type:
|
|
74
79
|
extension = media_type.split("/")[-1]
|
|
75
|
-
return extension
|
|
80
|
+
return str(extension)
|
|
76
81
|
|
|
77
82
|
return ""
|
|
78
83
|
|
|
79
84
|
@staticmethod
|
|
80
|
-
def hashtag_handler(desc: str | None):
|
|
85
|
+
def hashtag_handler(desc: str | None) -> str:
|
|
81
86
|
if not desc:
|
|
82
87
|
return ""
|
|
83
88
|
hashtags = re.findall(r" ?#[^#\[\]]+\[话题]# ?", desc)
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from ...parsers.base import BaseParser
|
|
2
|
+
from ...provider_api.xiaoheihe import XiaoHeiHeAPI, XiaoHeiHeMediaType, XiaoHeiHePost, XiaoHeiHePostType
|
|
3
|
+
from ...types import (
|
|
4
|
+
AniRef,
|
|
5
|
+
AnyParseResult,
|
|
6
|
+
ImageParseResult,
|
|
7
|
+
ImageRef,
|
|
8
|
+
MultimediaParseResult,
|
|
9
|
+
ParseError,
|
|
10
|
+
Platform,
|
|
11
|
+
RichTextParseResult,
|
|
12
|
+
VideoParseResult,
|
|
13
|
+
VideoRef,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class XiaoHeiHeParser(BaseParser):
|
|
18
|
+
__platform__ = Platform.XIAOHEIHE
|
|
19
|
+
__supported_type__ = ["视频", "图文"]
|
|
20
|
+
__match__ = r"^(http(s)?://)?.+xiaoheihe.cn/(v3|app)/bbs/(app|link).+"
|
|
21
|
+
__redirect_keywords__ = ["api.xiaoheihe"]
|
|
22
|
+
|
|
23
|
+
async def _do_parse(self, raw_url: str) -> AnyParseResult:
|
|
24
|
+
xhh = await XiaoHeiHeAPI(proxy=self.proxy).parse(raw_url)
|
|
25
|
+
match xhh.type:
|
|
26
|
+
case XiaoHeiHePostType.VIDEO:
|
|
27
|
+
return VideoParseResult(
|
|
28
|
+
video=self.__parse_video(xhh),
|
|
29
|
+
title=xhh.title,
|
|
30
|
+
content=xhh.content,
|
|
31
|
+
)
|
|
32
|
+
case XiaoHeiHePostType.IMAGE:
|
|
33
|
+
media = self.__parse_images(xhh)
|
|
34
|
+
if not media or all(isinstance(m, ImageRef) for m in media):
|
|
35
|
+
return ImageParseResult(photo=media, title=xhh.title, content=xhh.content)
|
|
36
|
+
return MultimediaParseResult(media=media, title=xhh.title, content=xhh.content)
|
|
37
|
+
case XiaoHeiHePostType.ARTICLE:
|
|
38
|
+
return RichTextParseResult(
|
|
39
|
+
title=xhh.title,
|
|
40
|
+
media=self.__parse_images(xhh),
|
|
41
|
+
markdown_content=xhh.content,
|
|
42
|
+
)
|
|
43
|
+
raise ParseError("不支持的类型")
|
|
44
|
+
|
|
45
|
+
@staticmethod
|
|
46
|
+
def __parse_video(xhh: XiaoHeiHePost) -> VideoRef:
|
|
47
|
+
if not xhh.media:
|
|
48
|
+
raise ParseError("未获取到视频")
|
|
49
|
+
media = xhh.media[0]
|
|
50
|
+
return VideoRef(url=media.url, thumb_url=media.thumb_url)
|
|
51
|
+
|
|
52
|
+
@staticmethod
|
|
53
|
+
def __parse_images(xhh: XiaoHeiHePost) -> list[ImageRef | AniRef]:
|
|
54
|
+
images: list[ImageRef | AniRef] = []
|
|
55
|
+
for media in xhh.media or []:
|
|
56
|
+
if media.type == XiaoHeiHeMediaType.IMAGE:
|
|
57
|
+
images.append(ImageRef(url=media.url, width=media.width or 0, height=media.height or 0))
|
|
58
|
+
else:
|
|
59
|
+
images.append(AniRef(url=media.url, width=media.width or 0, height=media.height or 0))
|
|
60
|
+
return images
|
|
@@ -13,7 +13,7 @@ class YtbParse(YtParser):
|
|
|
13
13
|
__reserved_parameters__ = ["v", "list", "index"]
|
|
14
14
|
|
|
15
15
|
@property
|
|
16
|
-
def params(self):
|
|
16
|
+
def params(self) -> dict[str, Any]:
|
|
17
17
|
sub: dict[str, Any] = {
|
|
18
18
|
# "writesubtitles": True, # 下载字幕
|
|
19
19
|
# "writeautomaticsub": True, # 下载自动生成的字幕
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
# mypy: disable-error-code=no-untyped-def
|
|
1
2
|
import asyncio
|
|
2
3
|
import re
|
|
3
4
|
import time
|
|
@@ -7,7 +8,7 @@ from dataclasses import dataclass
|
|
|
7
8
|
from enum import Enum
|
|
8
9
|
from functools import reduce
|
|
9
10
|
from hashlib import md5
|
|
10
|
-
from typing import Any
|
|
11
|
+
from typing import Any, cast
|
|
11
12
|
|
|
12
13
|
import httpx
|
|
13
14
|
|
|
@@ -70,7 +71,7 @@ class BiliAPI:
|
|
|
70
71
|
raise Exception("动态不可见")
|
|
71
72
|
case _:
|
|
72
73
|
raise Exception(f"获取动态信息失败: {mj}")
|
|
73
|
-
return BiliDynamic.parse(data)
|
|
74
|
+
return BiliDynamic.parse(cast(dict[str, Any], data))
|
|
74
75
|
|
|
75
76
|
async def get_video_info(self, url: str):
|
|
76
77
|
"""获取视频详细信息"""
|
|
@@ -117,7 +118,7 @@ class BiliAPI:
|
|
|
117
118
|
params=params,
|
|
118
119
|
cookies=cookies,
|
|
119
120
|
)
|
|
120
|
-
return response.json()
|
|
121
|
+
return cast(dict[str, Any], response.json())
|
|
121
122
|
|
|
122
123
|
async def get_buvid(self):
|
|
123
124
|
"""获取 buvid"""
|
|
@@ -135,7 +136,7 @@ class BiliAPI:
|
|
|
135
136
|
wbi = await BiliWbiSigner().wbi(bvid=bvid, cid=cid, up_mid=up_mid)
|
|
136
137
|
return await self.get_ai_summary(bvid, cid, up_mid, wbi["w_rid"], wbi["wts"])
|
|
137
138
|
|
|
138
|
-
async def get_ai_summary(self, bvid: str, cid: int, up_mid: int, w_rid: str, wts: int):
|
|
139
|
+
async def get_ai_summary(self, bvid: str, cid: int, up_mid: int, w_rid: str, wts: int) -> "AISummaryResult":
|
|
139
140
|
url = "https://api.bilibili.com/x/web-interface/view/conclusion/get"
|
|
140
141
|
result = await self._get_client().get(
|
|
141
142
|
url,
|
|
@@ -249,7 +250,7 @@ class BiliDynamic:
|
|
|
249
250
|
images: list[BiliImage] | None = None
|
|
250
251
|
|
|
251
252
|
@classmethod
|
|
252
|
-
def parse(cls, data: dict):
|
|
253
|
+
def parse(cls, data: dict) -> "BiliDynamic":
|
|
253
254
|
module_dynamic: dict = data["item"]["modules"]["module_dynamic"]
|
|
254
255
|
major: dict | None = module_dynamic.get("major", None)
|
|
255
256
|
if not major:
|
|
@@ -258,7 +259,7 @@ class BiliDynamic:
|
|
|
258
259
|
return cls._parse_major(module_dynamic, major)
|
|
259
260
|
|
|
260
261
|
@classmethod
|
|
261
|
-
def _parse_major(cls, module_dynamic: dict, major: dict):
|
|
262
|
+
def _parse_major(cls, module_dynamic: dict, major: dict) -> "BiliDynamic":
|
|
262
263
|
major_type = major["type"]
|
|
263
264
|
major_parsers: dict[MajorType, Callable[[dict, dict], BiliDynamic]] = {
|
|
264
265
|
MajorType.MAJOR_TYPE_MEDIALIST: cls._parse_medialist,
|
|
@@ -278,12 +279,12 @@ class BiliDynamic:
|
|
|
278
279
|
return major_parser(module_dynamic, major)
|
|
279
280
|
|
|
280
281
|
@classmethod
|
|
281
|
-
def _parse_pgc_union(cls, _, major: dict):
|
|
282
|
+
def _parse_pgc_union(cls, _: dict, major: dict) -> "BiliDynamic":
|
|
282
283
|
pgc = major["pgc"]
|
|
283
284
|
return cls(title=pgc["title"], images=[BiliImage(url=pgc["cover"])])
|
|
284
285
|
|
|
285
286
|
@classmethod
|
|
286
|
-
def _parse_forward(cls, module_dynamic: dict):
|
|
287
|
+
def _parse_forward(cls, module_dynamic: dict) -> "BiliDynamic":
|
|
287
288
|
return cls(content=cls._get_desc_text(module_dynamic))
|
|
288
289
|
|
|
289
290
|
@classmethod
|
|
@@ -301,7 +302,7 @@ class BiliDynamic:
|
|
|
301
302
|
return cls(title=music["title"], images=cls._get_major_cover(music))
|
|
302
303
|
|
|
303
304
|
@classmethod
|
|
304
|
-
def _parse_opus(cls, _, major: dict):
|
|
305
|
+
def _parse_opus(cls, _: dict, major: dict) -> "BiliDynamic":
|
|
305
306
|
opus = major["opus"]
|
|
306
307
|
images = None
|
|
307
308
|
if pics := opus["pics"]:
|
|
@@ -362,7 +363,7 @@ class BiliDynamic:
|
|
|
362
363
|
@staticmethod
|
|
363
364
|
def _get_desc_text(module_dynamic: dict) -> str | None:
|
|
364
365
|
if desc := module_dynamic["desc"]:
|
|
365
|
-
return desc["text"].strip()
|
|
366
|
+
return str(desc["text"]).strip()
|
|
366
367
|
return None
|
|
367
368
|
|
|
368
369
|
@staticmethod
|
|
@@ -16,7 +16,7 @@ class Coolapk:
|
|
|
16
16
|
imgs: list[str] | None = None
|
|
17
17
|
|
|
18
18
|
@classmethod
|
|
19
|
-
async def parse(cls, url: str, proxy: str = None) -> "Coolapk":
|
|
19
|
+
async def parse(cls, url: str, proxy: str | None = None) -> "Coolapk":
|
|
20
20
|
async with httpx.AsyncClient(headers={"User-Agent": GlobalConfig.ua}, proxy=proxy) as client:
|
|
21
21
|
result = await client.get(url)
|
|
22
22
|
soup = BeautifulSoup(result.text, "lxml")
|