parsehub 2.0.22__tar.gz → 2.0.23__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {parsehub-2.0.22/src/parsehub.egg-info → parsehub-2.0.23}/PKG-INFO +1 -1
- {parsehub-2.0.22 → parsehub-2.0.23}/pyproject.toml +18 -1
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/__init__.py +7 -4
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/cli.py +9 -8
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/parsers/base/base.py +6 -4
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/parsers/base/ytdlp.py +11 -6
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/parsers/parser/bilibili.py +9 -7
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/parsers/parser/coolapk.py +2 -2
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/parsers/parser/douyin.py +7 -5
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/parsers/parser/instagram.py +5 -5
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/parsers/parser/pipix.py +3 -3
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/parsers/parser/threads.py +2 -2
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/parsers/parser/tieba.py +5 -3
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/parsers/parser/tiktok.py +6 -4
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/parsers/parser/twitter.py +12 -3
- parsehub-2.0.23/src/parsehub/parsers/parser/weibo.py +101 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/parsers/parser/xiaoheihe.py +5 -3
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/provider_api/bilibili.py +9 -7
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/provider_api/coolapk.py +4 -2
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/provider_api/douyin.py +2 -1
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/provider_api/instagram.py +4 -4
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/provider_api/pipix.py +6 -3
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/provider_api/threads.py +2 -2
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/provider_api/tieba.py +7 -4
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/provider_api/twitter.py +10 -6
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/provider_api/weibo.py +68 -64
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/provider_api/weixin.py +11 -7
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/provider_api/xhs.py +10 -4
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/provider_api/xiaoheihe.py +4 -3
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/provider_api/zuiyou.py +4 -1
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/types/result.py +34 -29
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/utils/downloader.py +8 -8
- {parsehub-2.0.22 → parsehub-2.0.23/src/parsehub.egg-info}/PKG-INFO +1 -1
- {parsehub-2.0.22 → parsehub-2.0.23}/test/test_cli.py +4 -4
- {parsehub-2.0.22 → parsehub-2.0.23}/test/test_cli_config.py +1 -1
- {parsehub-2.0.22 → parsehub-2.0.23}/test/test_core_offline.py +5 -5
- parsehub-2.0.22/src/parsehub/parsers/parser/weibo.py +0 -89
- {parsehub-2.0.22 → parsehub-2.0.23}/LICENSE +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/README.md +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/setup.cfg +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/__init__.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/cli_config.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/config/__init__.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/config/config.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/errors.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/parsers/__init__.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/parsers/base/__init__.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/parsers/parser/__init__.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/parsers/parser/facebook.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/parsers/parser/kuaishou.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/parsers/parser/weixin.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/parsers/parser/xhs.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/parsers/parser/youtube.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/parsers/parser/zuiyou.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/provider_api/__init__.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/provider_api/kuaishou.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/provider_api/tiktok.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/types/__init__.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/types/callback.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/types/media_file.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/types/media_ref.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/types/platform.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/types/post.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/utils/media_info.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub/utils/utils.py +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub.egg-info/SOURCES.txt +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub.egg-info/dependency_links.txt +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub.egg-info/entry_points.txt +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub.egg-info/requires.txt +0 -0
- {parsehub-2.0.22 → parsehub-2.0.23}/src/parsehub.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "parsehub"
|
|
3
|
-
version = "2.0.22"
|
|
3
|
+
version = "2.0.23"
|
|
4
4
|
description = "轻量、异步、开箱即用的社交媒体聚合解析库"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.12.0"
|
|
@@ -52,6 +52,8 @@ cli = [
|
|
|
52
52
|
|
|
53
53
|
[dependency-groups]
|
|
54
54
|
dev = [
|
|
55
|
+
"mypy>=2.1.0",
|
|
56
|
+
"pytest>=9.0.3",
|
|
55
57
|
"ruff>=0.14.14",
|
|
56
58
|
]
|
|
57
59
|
|
|
@@ -79,3 +81,18 @@ ignore = [
|
|
|
79
81
|
"B008", # 不在参数默认值中执行函数调用
|
|
80
82
|
"C901", # 函数复杂度过高
|
|
81
83
|
]
|
|
84
|
+
|
|
85
|
+
[tool.mypy]
|
|
86
|
+
python_version = "3.12"
|
|
87
|
+
files = ["./"]
|
|
88
|
+
ignore_missing_imports = true
|
|
89
|
+
warn_return_any = false
|
|
90
|
+
warn_unused_ignores = true
|
|
91
|
+
check_untyped_defs = false
|
|
92
|
+
disallow_untyped_defs = false
|
|
93
|
+
no_implicit_optional = false
|
|
94
|
+
|
|
95
|
+
[tool.pytest.ini_options]
|
|
96
|
+
testpaths = ["test"]
|
|
97
|
+
pythonpath = ["src"]
|
|
98
|
+
python_files = ["test_*.py"]
|
|
@@ -47,9 +47,9 @@ class ParseHub:
|
|
|
47
47
|
async def download(
|
|
48
48
|
self,
|
|
49
49
|
url: str,
|
|
50
|
-
path: str | Path = None,
|
|
50
|
+
path: str | Path | None = None,
|
|
51
51
|
*,
|
|
52
|
-
callback: ProgressCallback = None,
|
|
52
|
+
callback: ProgressCallback | None = None,
|
|
53
53
|
callback_args: tuple = (),
|
|
54
54
|
callback_kwargs: dict | None = None,
|
|
55
55
|
proxy: str | None = None,
|
|
@@ -169,6 +169,8 @@ class ParseHub:
|
|
|
169
169
|
:return: 原始链接
|
|
170
170
|
"""
|
|
171
171
|
parser = self.get_parser(url)
|
|
172
|
+
if not parser:
|
|
173
|
+
raise UnknownPlatform(url)
|
|
172
174
|
try:
|
|
173
175
|
return await parser(proxy=proxy).get_raw_url(url, clean_all=clean_all)
|
|
174
176
|
except Exception as e:
|
|
@@ -210,9 +212,10 @@ class ParseHub:
|
|
|
210
212
|
"""
|
|
211
213
|
return [
|
|
212
214
|
{
|
|
213
|
-
"id":
|
|
214
|
-
"name":
|
|
215
|
+
"id": platform.id,
|
|
216
|
+
"name": platform.display_name,
|
|
215
217
|
"supported_types": parser.__supported_type__,
|
|
216
218
|
}
|
|
217
219
|
for parser in self.parsers
|
|
220
|
+
if (platform := parser.__platform__) is not None
|
|
218
221
|
]
|
|
@@ -8,7 +8,7 @@ import unicodedata
|
|
|
8
8
|
from dataclasses import asdict, is_dataclass
|
|
9
9
|
from importlib.metadata import PackageNotFoundError, version
|
|
10
10
|
from pathlib import Path
|
|
11
|
-
from typing import TYPE_CHECKING, Any
|
|
11
|
+
from typing import TYPE_CHECKING, Any, NoReturn, cast
|
|
12
12
|
|
|
13
13
|
if TYPE_CHECKING:
|
|
14
14
|
from .cli_config import AutoCookieStore, PlatformConfig
|
|
@@ -20,12 +20,13 @@ _CLI_EXTRA_MODULES = ("argcomplete", "platformdirs")
|
|
|
20
20
|
class _ChineseArgumentParser(argparse.ArgumentParser):
|
|
21
21
|
def __init__(self, *args: Any, **kwargs: Any):
|
|
22
22
|
kwargs.setdefault("formatter_class", argparse.RawDescriptionHelpFormatter)
|
|
23
|
-
add_help = kwargs.pop("add_help", True)
|
|
24
|
-
|
|
23
|
+
add_help = bool(kwargs.pop("add_help", True))
|
|
24
|
+
kwargs["add_help"] = False
|
|
25
|
+
super().__init__(*args, **kwargs)
|
|
25
26
|
if add_help:
|
|
26
27
|
self.add_argument("-h", "--help", action="help", default=argparse.SUPPRESS, help="显示帮助信息")
|
|
27
28
|
|
|
28
|
-
def error(self, message: str) ->
|
|
29
|
+
def error(self, message: str) -> NoReturn:
|
|
29
30
|
self.print_usage(sys.stderr)
|
|
30
31
|
translated = _translate_argparse_error(message)
|
|
31
32
|
hint = _usage_hint(self.prog)
|
|
@@ -212,7 +213,7 @@ def _add_set_commands(subparsers: argparse._SubParsersAction) -> None:
|
|
|
212
213
|
|
|
213
214
|
def _add_platform_argument(parser: argparse.ArgumentParser) -> None:
|
|
214
215
|
action = parser.add_argument("platform", help="平台 ID,如 xhs")
|
|
215
|
-
action.completer = _complete_platforms
|
|
216
|
+
action.completer = _complete_platforms # type: ignore[attr-defined]
|
|
216
217
|
|
|
217
218
|
|
|
218
219
|
def _add_json_options(parser: argparse.ArgumentParser) -> None:
|
|
@@ -436,7 +437,7 @@ def _platform_config_row(
|
|
|
436
437
|
|
|
437
438
|
|
|
438
439
|
def _print_json(data: Any, *, pretty: bool) -> None:
|
|
439
|
-
kwargs = {"ensure_ascii": False}
|
|
440
|
+
kwargs: dict[str, Any] = {"ensure_ascii": False}
|
|
440
441
|
if pretty:
|
|
441
442
|
kwargs["indent"] = 2
|
|
442
443
|
else:
|
|
@@ -547,8 +548,8 @@ def _download_result_to_dict(result: Any) -> dict[str, Any]:
|
|
|
547
548
|
def _jsonable(value: Any) -> Any:
|
|
548
549
|
if isinstance(value, Path):
|
|
549
550
|
return str(value)
|
|
550
|
-
if is_dataclass(value):
|
|
551
|
-
return _jsonable(asdict(value))
|
|
551
|
+
if is_dataclass(value) and not isinstance(value, type):
|
|
552
|
+
return _jsonable(asdict(cast(Any, value)))
|
|
552
553
|
if isinstance(value, dict):
|
|
553
554
|
return {str(k): _jsonable(v) for k, v in value.items()}
|
|
554
555
|
if isinstance(value, (list, tuple)):
|
|
@@ -34,7 +34,7 @@ class BaseParser(ABC):
|
|
|
34
34
|
self.proxy = proxy
|
|
35
35
|
self.cookie = normalize_cookie(cookie)
|
|
36
36
|
|
|
37
|
-
def __init_subclass__(cls, /, register=True, **kwargs):
|
|
37
|
+
def __init_subclass__(cls, /, register: bool = True, **kwargs):
|
|
38
38
|
super().__init_subclass__(**kwargs)
|
|
39
39
|
if register:
|
|
40
40
|
if not cls.__platform__:
|
|
@@ -56,7 +56,7 @@ class BaseParser(ABC):
|
|
|
56
56
|
def match(cls, text: str) -> bool:
|
|
57
57
|
"""判断是否匹配该解析器"""
|
|
58
58
|
url = match_url(text)
|
|
59
|
-
return bool(re.match(cls.__match__, url))
|
|
59
|
+
return bool(cls.__match__ and re.match(cls.__match__, url))
|
|
60
60
|
|
|
61
61
|
async def parse(self, url: str) -> AnyParseResult:
|
|
62
62
|
"""解析
|
|
@@ -66,7 +66,8 @@ class BaseParser(ABC):
|
|
|
66
66
|
raw_url = await self.get_raw_url(url, clean_all=False)
|
|
67
67
|
result = await self._do_parse(raw_url)
|
|
68
68
|
result.platform = self.__platform__
|
|
69
|
-
|
|
69
|
+
raw_url_clean = self._clean_params(raw_url, self.__after_clean_parameters__)
|
|
70
|
+
result.raw_url = raw_url_clean
|
|
70
71
|
return result
|
|
71
72
|
|
|
72
73
|
@abstractmethod
|
|
@@ -104,7 +105,8 @@ class BaseParser(ABC):
|
|
|
104
105
|
|
|
105
106
|
:return:
|
|
106
107
|
"""
|
|
107
|
-
|
|
108
|
+
matched_url = match_url(url)
|
|
109
|
+
url = matched_url or url
|
|
108
110
|
if not url.startswith("http"):
|
|
109
111
|
url = f"https://{url}"
|
|
110
112
|
if any(x in url for x in self.__redirect_keywords__):
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
from dataclasses import dataclass
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import Union
|
|
5
4
|
|
|
6
5
|
from yt_dlp import YoutubeDL
|
|
7
6
|
|
|
8
7
|
from ...types import (
|
|
8
|
+
AnyParseResult,
|
|
9
9
|
DownloadError,
|
|
10
10
|
DownloadResult,
|
|
11
11
|
ParseError,
|
|
@@ -91,7 +91,7 @@ class MonotonicDownloadProgress:
|
|
|
91
91
|
class YtParser(BaseParser, register=False):
|
|
92
92
|
"""yt-dlp解析器"""
|
|
93
93
|
|
|
94
|
-
async def _do_parse(self, raw_url: str) ->
|
|
94
|
+
async def _do_parse(self, raw_url: str) -> AnyParseResult:
|
|
95
95
|
video_info = await self._parse(raw_url)
|
|
96
96
|
return YtVideoParseResult(
|
|
97
97
|
dl=video_info,
|
|
@@ -114,8 +114,8 @@ class YtParser(BaseParser, register=False):
|
|
|
114
114
|
except Exception as e:
|
|
115
115
|
raise ParseError(f"解析视频信息失败: {str(e)}") from e
|
|
116
116
|
|
|
117
|
-
if dl.get("_type") and dl["_type"] == "playlist":
|
|
118
|
-
dl = dl["entries"][0]
|
|
117
|
+
if dl.get("_type") and dl["_type"] == "playlist":
|
|
118
|
+
dl = dl["entries"][0]
|
|
119
119
|
url = dl["webpage_url"]
|
|
120
120
|
title = dl["title"]
|
|
121
121
|
duration = dl.get("duration", 0)
|
|
@@ -190,12 +190,13 @@ class YtVideoParseResult(VideoParseResult):
|
|
|
190
190
|
) -> "DownloadResult":
|
|
191
191
|
if callback_kwargs is None:
|
|
192
192
|
callback_kwargs = {}
|
|
193
|
+
output_dir_path = Path(output_dir)
|
|
193
194
|
|
|
194
195
|
paramss = self.dl.paramss.copy()
|
|
195
196
|
if self.dl.proxy:
|
|
196
197
|
paramss["proxy"] = self.dl.proxy
|
|
197
198
|
|
|
198
|
-
paramss["outtmpl"] = f"{
|
|
199
|
+
paramss["outtmpl"] = f"{output_dir_path.joinpath('ytdlp_%(id)s')}.%(ext)s"
|
|
199
200
|
|
|
200
201
|
if callback:
|
|
201
202
|
loop = asyncio.get_running_loop()
|
|
@@ -214,7 +215,11 @@ class YtVideoParseResult(VideoParseResult):
|
|
|
214
215
|
|
|
215
216
|
await self._run_download(paramss, proxy=proxy)
|
|
216
217
|
|
|
217
|
-
v =
|
|
218
|
+
v = (
|
|
219
|
+
list(output_dir_path.glob("*.mp4"))
|
|
220
|
+
or list(output_dir_path.glob("*.mkv"))
|
|
221
|
+
or list(output_dir_path.glob("*.webm"))
|
|
222
|
+
)
|
|
218
223
|
if not v:
|
|
219
224
|
raise DownloadError("下载失败 -1")
|
|
220
225
|
|
|
@@ -1,6 +1,8 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import re
|
|
2
4
|
from pathlib import Path
|
|
3
|
-
from typing import
|
|
5
|
+
from typing import cast
|
|
4
6
|
from urllib.parse import parse_qs, urlparse
|
|
5
7
|
|
|
6
8
|
from loguru import logger
|
|
@@ -29,11 +31,11 @@ class BiliParse(YtParser):
|
|
|
29
31
|
__reserved_parameters__ = ["p"]
|
|
30
32
|
__redirect_keywords__ = ["b23.tv", "bili2233.cn"]
|
|
31
33
|
|
|
32
|
-
async def _do_parse(self, raw_url: str) ->
|
|
34
|
+
async def _do_parse(self, raw_url: str) -> YtVideoParseResult | BiliVideoParseResult | ImageParseResult:
|
|
33
35
|
if await self.is_dynamic(raw_url):
|
|
34
36
|
dynamic = await self.get_dynamic_info(raw_url)
|
|
35
37
|
content = self.hashtag_handler(dynamic.content)
|
|
36
|
-
photos = []
|
|
38
|
+
photos: list[LivePhotoRef | ImageRef] = []
|
|
37
39
|
if dynamic.images:
|
|
38
40
|
for i in dynamic.images:
|
|
39
41
|
if i.live_url:
|
|
@@ -93,7 +95,7 @@ class BiliParse(YtParser):
|
|
|
93
95
|
raise ParseError(str(e)) from e
|
|
94
96
|
return dynamic_info
|
|
95
97
|
|
|
96
|
-
async def bili_api_parse(self, url) ->
|
|
98
|
+
async def bili_api_parse(self, url) -> BiliVideoParseResult | ImageParseResult:
|
|
97
99
|
async with BiliAPI(proxy=self.proxy) as bili:
|
|
98
100
|
video_info = await bili.get_video_info(url)
|
|
99
101
|
|
|
@@ -136,8 +138,8 @@ class BiliParse(YtParser):
|
|
|
136
138
|
),
|
|
137
139
|
)
|
|
138
140
|
|
|
139
|
-
async def ytp_parse(self, url) ->
|
|
140
|
-
result = await super()._do_parse(url)
|
|
141
|
+
async def ytp_parse(self, url) -> YtVideoParseResult:
|
|
142
|
+
result = cast(YtVideoParseResult, await super()._do_parse(url))
|
|
141
143
|
return YtVideoParseResult(
|
|
142
144
|
title=result.title,
|
|
143
145
|
dl=result.dl,
|
|
@@ -172,7 +174,7 @@ class BiliVideoParseResult(VideoParseResult):
|
|
|
172
174
|
callback_kwargs: dict | None = None,
|
|
173
175
|
proxy: str | None = None,
|
|
174
176
|
headers: dict | None = None,
|
|
175
|
-
) ->
|
|
177
|
+
) -> DownloadResult:
|
|
176
178
|
headers = {"referer": "https://www.bilibili.com", "User-Agent": GlobalConfig.ua}
|
|
177
179
|
return await super()._do_download(
|
|
178
180
|
output_dir=output_dir,
|
|
@@ -31,14 +31,14 @@ class CoolapkParser(BaseParser):
|
|
|
31
31
|
coolapk = await Coolapk.parse(raw_url, proxy=self.proxy)
|
|
32
32
|
except Exception as e:
|
|
33
33
|
raise ParseError(str(e)) from e
|
|
34
|
-
media = [AniRef(url=i) if ".gif" in i else ImageRef(url=i) for i in coolapk.imgs]
|
|
34
|
+
media = [AniRef(url=i) if ".gif" in i else ImageRef(url=i) for i in coolapk.imgs or []]
|
|
35
35
|
if coolapk.markdown_content:
|
|
36
36
|
return CoolapkRichTextParseResult(
|
|
37
37
|
title=coolapk.title,
|
|
38
38
|
media=media,
|
|
39
39
|
markdown_content=coolapk.markdown_content,
|
|
40
40
|
)
|
|
41
|
-
content = self.hashtag_handler(coolapk.text_content)
|
|
41
|
+
content = self.hashtag_handler(coolapk.text_content or "")
|
|
42
42
|
if any(isinstance(m, AniRef) for m in media):
|
|
43
43
|
return CoolapkMultimediaParseResult(
|
|
44
44
|
title=coolapk.title,
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
2
|
from enum import Enum
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from typing import Self, Union
|
|
@@ -47,6 +47,8 @@ class DouyinParser(BaseParser):
|
|
|
47
47
|
@staticmethod
|
|
48
48
|
def _build_video_result(result: "DouyinApiResult") -> VideoParseResult:
|
|
49
49
|
"""构建视频解析结果"""
|
|
50
|
+
if result.video is None:
|
|
51
|
+
raise ParseError("抖音解析失败: 未获取到视频")
|
|
50
52
|
return DouyinVideoParseResult(
|
|
51
53
|
title=result.desc,
|
|
52
54
|
video=result.video,
|
|
@@ -134,9 +136,9 @@ class DouyinApiResult:
|
|
|
134
136
|
"""抖音 API 解析结果"""
|
|
135
137
|
|
|
136
138
|
type: DouyinMediaType
|
|
137
|
-
video: VideoRef = None
|
|
139
|
+
video: VideoRef | None = None
|
|
138
140
|
desc: str = ""
|
|
139
|
-
image_list: list[ImageRef | LivePhotoRef] =
|
|
141
|
+
image_list: list[ImageRef | LivePhotoRef] = field(default_factory=list)
|
|
140
142
|
|
|
141
143
|
@classmethod
|
|
142
144
|
def parse(cls, json_dict: dict) -> Self:
|
|
@@ -162,7 +164,7 @@ class DouyinApiResult:
|
|
|
162
164
|
has_live_photos = any(img.get("video") for img in images)
|
|
163
165
|
|
|
164
166
|
if has_live_photos:
|
|
165
|
-
image_list = []
|
|
167
|
+
image_list: list[ImageRef | LivePhotoRef] = []
|
|
166
168
|
for image in images:
|
|
167
169
|
if video := image.get("video"):
|
|
168
170
|
video_info = parse_video_info(video)
|
|
@@ -206,7 +208,7 @@ class DouyinApiResult:
|
|
|
206
208
|
def _parse_image_post_info(cls, image_post_info: dict, desc: str) -> Self:
|
|
207
209
|
"""解析新版图片格式 (image_post_info 字段)"""
|
|
208
210
|
images = image_post_info.get("images", [])
|
|
209
|
-
image_list = []
|
|
211
|
+
image_list: list[ImageRef | LivePhotoRef] = []
|
|
210
212
|
|
|
211
213
|
for image in images:
|
|
212
214
|
display_image = image.get("display_image", {})
|
|
@@ -15,7 +15,7 @@ class InstagramParser(BaseParser):
|
|
|
15
15
|
__match__ = r"^(http(s)?://)(www\.|)instagram\.com/(p|reel|share|.*/p|.*/reel)/.*"
|
|
16
16
|
__redirect_keywords__ = ["share"]
|
|
17
17
|
|
|
18
|
-
async def _do_parse(self, raw_url: str) -> VideoParseResult | ImageParseResult | MultimediaParseResult
|
|
18
|
+
async def _do_parse(self, raw_url: str) -> VideoParseResult | ImageParseResult | MultimediaParseResult:
|
|
19
19
|
shortcode = self.get_short_code(raw_url)
|
|
20
20
|
if not shortcode:
|
|
21
21
|
raise ValueError("Instagram帖子链接无效")
|
|
@@ -32,7 +32,7 @@ class InstagramParser(BaseParser):
|
|
|
32
32
|
case "GraphSidecar":
|
|
33
33
|
media = [
|
|
34
34
|
VideoRef(url=i.video_url, thumb_url=i.display_url, width=i.width, height=i.height)
|
|
35
|
-
if i.is_video
|
|
35
|
+
if i.is_video and i.video_url
|
|
36
36
|
else ImageRef(url=i.display_url, width=i.width, height=i.height)
|
|
37
37
|
for i in post.get_sidecar_nodes()
|
|
38
38
|
]
|
|
@@ -44,9 +44,9 @@ class InstagramParser(BaseParser):
|
|
|
44
44
|
case "GraphVideo":
|
|
45
45
|
return VideoParseResult(
|
|
46
46
|
video=VideoRef(
|
|
47
|
-
url=post.video_url,
|
|
47
|
+
url=post.video_url or post.url,
|
|
48
48
|
thumb_url=post.url,
|
|
49
|
-
duration=int(post.video_duration),
|
|
49
|
+
duration=int(post.video_duration or 0),
|
|
50
50
|
width=width,
|
|
51
51
|
height=height,
|
|
52
52
|
),
|
|
@@ -81,7 +81,7 @@ class InstagramParser(BaseParser):
|
|
|
81
81
|
if cookie:
|
|
82
82
|
text = f"Instagram 账号可能已被封禁\n\n使用的Cookie: {cookie_ellipsis(cookie)}"
|
|
83
83
|
else:
|
|
84
|
-
text = e
|
|
84
|
+
text = str(e)
|
|
85
85
|
raise ParseError(f"无法获取帖子内容: {text}") from e
|
|
86
86
|
else:
|
|
87
87
|
return post
|
|
@@ -23,9 +23,9 @@ class PipixParser(BaseParser):
|
|
|
23
23
|
video=VideoRef(
|
|
24
24
|
url=ppx.video_url,
|
|
25
25
|
thumb_url=ppx.video_thumb,
|
|
26
|
-
duration=ppx.video_duration,
|
|
27
|
-
height=ppx.video_height,
|
|
28
|
-
width=ppx.video_width,
|
|
26
|
+
duration=ppx.video_duration or 0,
|
|
27
|
+
height=ppx.video_height or 0,
|
|
28
|
+
width=ppx.video_width or 0,
|
|
29
29
|
),
|
|
30
30
|
)
|
|
31
31
|
else:
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from ...provider_api.threads import ThreadsAPI, ThreadsMedia, ThreadsMediaType
|
|
2
|
-
from ...types import ImageRef, MultimediaParseResult, Platform, VideoRef
|
|
2
|
+
from ...types import AnyMediaRef, ImageRef, MultimediaParseResult, Platform, VideoRef
|
|
3
3
|
from ..base.base import BaseParser
|
|
4
4
|
|
|
5
5
|
|
|
@@ -10,7 +10,7 @@ class ThreadsParser(BaseParser):
|
|
|
10
10
|
|
|
11
11
|
async def _do_parse(self, raw_url: str) -> "MultimediaParseResult":
|
|
12
12
|
post = await ThreadsAPI(proxy=self.proxy).parse(raw_url)
|
|
13
|
-
media = []
|
|
13
|
+
media: list[AnyMediaRef] = []
|
|
14
14
|
if post.media:
|
|
15
15
|
pm: list[ThreadsMedia] = post.media if isinstance(post.media, list) else [post.media]
|
|
16
16
|
for m in pm:
|
|
@@ -2,7 +2,7 @@ from typing import Union
|
|
|
2
2
|
|
|
3
3
|
import httpx
|
|
4
4
|
|
|
5
|
-
from ...provider_api.tieba import TieBa, TieBaError, TieBaPostType
|
|
5
|
+
from ...provider_api.tieba import TieBa, TieBaError, TieBaPostType, TieBaVideo
|
|
6
6
|
from ...types import AniRef, ImageParseResult, ImageRef, ParseError, Platform, VideoParseResult, VideoRef
|
|
7
7
|
from ..base.base import BaseParser
|
|
8
8
|
|
|
@@ -22,6 +22,8 @@ class TieBaParser(BaseParser):
|
|
|
22
22
|
|
|
23
23
|
match tb.type:
|
|
24
24
|
case TieBaPostType.VIDEO:
|
|
25
|
+
if not isinstance(tb.media, TieBaVideo):
|
|
26
|
+
raise ParseError("贴吧解析失败: 未获取到视频")
|
|
25
27
|
return VideoParseResult(
|
|
26
28
|
title=tb.title,
|
|
27
29
|
video=VideoRef(
|
|
@@ -35,8 +37,8 @@ class TieBaParser(BaseParser):
|
|
|
35
37
|
)
|
|
36
38
|
|
|
37
39
|
case TieBaPostType.PHOTO:
|
|
38
|
-
images = []
|
|
39
|
-
if tb.media:
|
|
40
|
+
images: list[ImageRef | AniRef] = []
|
|
41
|
+
if isinstance(tb.media, list):
|
|
40
42
|
for i in tb.media:
|
|
41
43
|
async with httpx.AsyncClient(proxy=self.proxy) as cli:
|
|
42
44
|
try:
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
2
|
from enum import Enum
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from typing import Self, Union
|
|
@@ -44,6 +44,8 @@ class TikTokParser(BaseParser):
|
|
|
44
44
|
|
|
45
45
|
@staticmethod
|
|
46
46
|
def _build_video_result(result: "TikTokApiResult") -> VideoParseResult:
|
|
47
|
+
if result.video is None:
|
|
48
|
+
raise ParseError("TikTok 解析失败: 未获取到视频")
|
|
47
49
|
return TikTokVideoParseResult(
|
|
48
50
|
title=result.desc,
|
|
49
51
|
video=result.video,
|
|
@@ -199,9 +201,9 @@ class TikTokMediaType(Enum):
|
|
|
199
201
|
@dataclass
|
|
200
202
|
class TikTokApiResult:
|
|
201
203
|
type: TikTokMediaType
|
|
202
|
-
video: VideoRef = None
|
|
204
|
+
video: VideoRef | None = None
|
|
203
205
|
desc: str = ""
|
|
204
|
-
image_list: list[ImageRef] =
|
|
206
|
+
image_list: list[ImageRef] = field(default_factory=list)
|
|
205
207
|
|
|
206
208
|
@classmethod
|
|
207
209
|
def parse(cls, json_dict: dict) -> Self:
|
|
@@ -216,7 +218,7 @@ class TikTokApiResult:
|
|
|
216
218
|
|
|
217
219
|
@classmethod
|
|
218
220
|
def _parse_image_post(cls, image_post_info: dict, desc: str) -> Self:
|
|
219
|
-
image_list = []
|
|
221
|
+
image_list: list[ImageRef] = []
|
|
220
222
|
|
|
221
223
|
for image in image_post_info.get("images", []):
|
|
222
224
|
display_image = (
|
|
@@ -7,7 +7,16 @@ from ...provider_api.twitter import (
|
|
|
7
7
|
TwitterTweet,
|
|
8
8
|
TwitterVideo,
|
|
9
9
|
)
|
|
10
|
-
from ...types import
|
|
10
|
+
from ...types import (
|
|
11
|
+
AniRef,
|
|
12
|
+
AnyMediaRef,
|
|
13
|
+
ImageRef,
|
|
14
|
+
MultimediaParseResult,
|
|
15
|
+
ParseError,
|
|
16
|
+
Platform,
|
|
17
|
+
RichTextParseResult,
|
|
18
|
+
VideoRef,
|
|
19
|
+
)
|
|
11
20
|
from ...utils.utils import cookie_ellipsis
|
|
12
21
|
from ..base.base import BaseParser
|
|
13
22
|
|
|
@@ -47,12 +56,12 @@ class TwitterParser(BaseParser):
|
|
|
47
56
|
|
|
48
57
|
@staticmethod
|
|
49
58
|
async def media_parse(tweet: TwitterTweet):
|
|
50
|
-
media = []
|
|
59
|
+
media: list[AnyMediaRef] = []
|
|
51
60
|
if tweet.media:
|
|
52
61
|
for m in tweet.media:
|
|
53
62
|
match m:
|
|
54
63
|
case TwitterPhoto():
|
|
55
|
-
path = ImageRef(url=m.url, height=m.height, width=m.width, thumb_url=m.thumb_url)
|
|
64
|
+
path: AnyMediaRef = ImageRef(url=m.url, height=m.height, width=m.width, thumb_url=m.thumb_url)
|
|
56
65
|
case TwitterVideo():
|
|
57
66
|
path = VideoRef(
|
|
58
67
|
url=m.url,
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
from ...provider_api.weibo import MediaType, MixMediaInfoItem, PicInfo, WeiboAPI
|
|
4
|
+
from ...types import (
|
|
5
|
+
AniRef,
|
|
6
|
+
ImageParseResult,
|
|
7
|
+
ImageRef,
|
|
8
|
+
LivePhotoRef,
|
|
9
|
+
MultimediaParseResult,
|
|
10
|
+
Platform,
|
|
11
|
+
VideoParseResult,
|
|
12
|
+
VideoRef,
|
|
13
|
+
)
|
|
14
|
+
from ..base.base import BaseParser
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class WeiboParser(BaseParser):
|
|
18
|
+
__platform__ = Platform.WEIBO
|
|
19
|
+
__supported_type__ = ["视频", "图文"]
|
|
20
|
+
__match__ = r"^(http(s)?://)(m\.|)weibo.(com|cn)/(?!(u/)).+"
|
|
21
|
+
|
|
22
|
+
async def _do_parse(self, raw_url: str) -> MultimediaParseResult | VideoParseResult | ImageParseResult:
|
|
23
|
+
weibo = await WeiboAPI(self.proxy).parse(raw_url)
|
|
24
|
+
data = weibo.data
|
|
25
|
+
text = self.f_text(data.content)
|
|
26
|
+
media: list[VideoRef | ImageRef | LivePhotoRef | AniRef] = []
|
|
27
|
+
|
|
28
|
+
if not data.pic_infos and data.page_info and data.page_info.object_type == MediaType.VIDEO:
|
|
29
|
+
playback = data.page_info.media_info and data.page_info.media_info.playback
|
|
30
|
+
if playback:
|
|
31
|
+
return VideoParseResult(
|
|
32
|
+
content=text,
|
|
33
|
+
video=VideoRef(
|
|
34
|
+
url=playback.url,
|
|
35
|
+
thumb_url=data.page_info.page_pic,
|
|
36
|
+
width=playback.width,
|
|
37
|
+
height=playback.height,
|
|
38
|
+
duration=int(playback.duration),
|
|
39
|
+
),
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
media_info: list[PicInfo | MixMediaInfoItem] | None = None
|
|
43
|
+
if data.retweeted_status and data.retweeted_status.pic_infos:
|
|
44
|
+
media_info = list(data.retweeted_status.pic_infos)
|
|
45
|
+
elif data.pic_infos:
|
|
46
|
+
media_info = list(data.pic_infos)
|
|
47
|
+
elif data.mix_media_info and data.mix_media_info.items:
|
|
48
|
+
media_info = list(data.mix_media_info.items)
|
|
49
|
+
if not media_info:
|
|
50
|
+
return MultimediaParseResult(content=text, media=[])
|
|
51
|
+
|
|
52
|
+
for i in media_info:
|
|
53
|
+
match i.type:
|
|
54
|
+
case MediaType.VIDEO:
|
|
55
|
+
if i.media_url:
|
|
56
|
+
media.append(
|
|
57
|
+
VideoRef(
|
|
58
|
+
url=i.media_url,
|
|
59
|
+
thumb_url=i.thumb_url,
|
|
60
|
+
width=i.width,
|
|
61
|
+
height=i.height,
|
|
62
|
+
duration=i.duration,
|
|
63
|
+
)
|
|
64
|
+
)
|
|
65
|
+
case MediaType.LIVE_PHOTO:
|
|
66
|
+
if i.thumb_url:
|
|
67
|
+
media.append(
|
|
68
|
+
LivePhotoRef(
|
|
69
|
+
url=i.thumb_url,
|
|
70
|
+
ext="mov",
|
|
71
|
+
video_url=i.media_url,
|
|
72
|
+
width=i.width,
|
|
73
|
+
height=i.height,
|
|
74
|
+
)
|
|
75
|
+
)
|
|
76
|
+
case MediaType.GIF:
|
|
77
|
+
if i.media_url:
|
|
78
|
+
media.append(AniRef(url=i.media_url, thumb_url=i.thumb_url))
|
|
79
|
+
case _:
|
|
80
|
+
if i.media_url:
|
|
81
|
+
media.append(ImageRef(url=i.media_url, thumb_url=i.thumb_url, width=i.width, height=i.height))
|
|
82
|
+
if all((isinstance(m, ImageRef) or isinstance(m, LivePhotoRef)) for m in media):
|
|
83
|
+
photos = [m for m in media if isinstance(m, ImageRef | LivePhotoRef)]
|
|
84
|
+
return ImageParseResult(content=text, photo=photos)
|
|
85
|
+
return MultimediaParseResult(content=text, media=media)
|
|
86
|
+
|
|
87
|
+
def f_text(self, text: str | None) -> str:
|
|
88
|
+
# text = re.sub(r'<a href="https://video.weibo.com.*?>.*的微博视频.*</a>', "", text)
|
|
89
|
+
# text = re.sub(r"<[^>]+>", " ", text)
|
|
90
|
+
text = self.hashtag_handler(text or "")
|
|
91
|
+
return text.strip()
|
|
92
|
+
|
|
93
|
+
@staticmethod
|
|
94
|
+
def hashtag_handler(desc: str):
|
|
95
|
+
hashtags = re.findall(r" ?#[^#]+# ?", desc)
|
|
96
|
+
for hashtag in hashtags:
|
|
97
|
+
desc = desc.replace(hashtag, f" {hashtag.strip().removesuffix('#')} ")
|
|
98
|
+
return desc
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
__all__ = ["WeiboParser"]
|
|
@@ -36,13 +36,15 @@ class XiaoHeiHeParser(BaseParser):
|
|
|
36
36
|
def __parse_media(xhh: XiaoHeiHePost):
|
|
37
37
|
match xhh.type:
|
|
38
38
|
case XiaoHeiHePostType.VIDEO:
|
|
39
|
+
if not xhh.media:
|
|
40
|
+
return None
|
|
39
41
|
return VideoRef(url=xhh.media[0].url, thumb_url=xhh.media[0].thumb_url)
|
|
40
42
|
case XiaoHeiHePostType.IMAGE | XiaoHeiHePostType.ARTICLE:
|
|
41
43
|
images: list[ImageRef | AniRef] = []
|
|
42
|
-
for i in xhh.media:
|
|
44
|
+
for i in xhh.media or []:
|
|
43
45
|
if i.type == XiaoHeiHeMediaType.IMAGE:
|
|
44
|
-
images.append(ImageRef(url=i.url, width=i.width, height=i.height))
|
|
46
|
+
images.append(ImageRef(url=i.url, width=i.width or 0, height=i.height or 0))
|
|
45
47
|
else:
|
|
46
|
-
images.append(AniRef(url=i.url, width=i.width, height=i.height))
|
|
48
|
+
images.append(AniRef(url=i.url, width=i.width or 0, height=i.height or 0))
|
|
47
49
|
|
|
48
50
|
return images
|