PyPI - parsehub - Versions diffs - 2.0.15__tar.gz → 2.0.16__tar.gz - Mend

parsehub 2.0.15tar.gz → 2.0.16tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

{parsehub-2.0.15/src/parsehub.egg-info → parsehub-2.0.16}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: parsehub
-Version: 2.0.15
+Version: 2.0.16
 Summary: 轻量、异步、开箱即用的社交媒体聚合解析库
 Author-email: 梓澪 <zilingmio@gmail.com>
 License: MIT
@@ -34,6 +34,7 @@ Requires-Dist: pillow>=12.1.0
 Requires-Dist: python-slugify[unidecode]>=8.0.4
 Requires-Dist: opencv-python-headless>=4.13.0.92
 Requires-Dist: cryptography>=46.0.6
+Requires-Dist: gmssl>=3.2.2
 Dynamic: license-file
 <div align="center">
@@ -105,7 +106,8 @@ print(result)
 | **Facebook**    | ✅  |    |       |
 | **Threads**     | ✅  | ✅  |       |
 | **Bilibili**    | ✅  |    | 📝 动态 |
-| **抖音 / TikTok** | ✅  | ✅  |       |
+| **抖音**          | ✅  | ✅  |       |
+| **TikTok**      | ✅  | ✅  |       |
 | **微博**          | ✅  | ✅  |       |
 | **小红书**         | ✅  | ✅  |       |
 | **贴吧**          | ✅  | ✅  |       |
@@ -164,7 +166,13 @@ ParseHub(cookie={"key1": "value1", "key2": "value2"})
 目前支持 Cookie 登录的平台:
-`Twitter` · `Instagram` · `Kuaishou` · `Bilibili` · `YouTube`
+- `Twitter`
+- `Instagram`
+- `Kuaishou`
+- `Bilibili`
+- `YouTube`
+- `抖音`
+- `TikTok`
 ### 全局配置

{parsehub-2.0.15 → parsehub-2.0.16}/README.md RENAMED Viewed

@@ -67,7 +67,8 @@ print(result)
 | **Facebook**    | ✅  |    |       |
 | **Threads**     | ✅  | ✅  |       |
 | **Bilibili**    | ✅  |    | 📝 动态 |
-| **抖音 / TikTok** | ✅  | ✅  |       |
+| **抖音**          | ✅  | ✅  |       |
+| **TikTok**      | ✅  | ✅  |       |
 | **微博**          | ✅  | ✅  |       |
 | **小红书**         | ✅  | ✅  |       |
 | **贴吧**          | ✅  | ✅  |       |
@@ -126,7 +127,13 @@ ParseHub(cookie={"key1": "value1", "key2": "value2"})
 目前支持 Cookie 登录的平台:
-`Twitter` · `Instagram` · `Kuaishou` · `Bilibili` · `YouTube`
+- `Twitter`
+- `Instagram`
+- `Kuaishou`
+- `Bilibili`
+- `YouTube`
+- `抖音`
+- `TikTok`
 ### 全局配置

{parsehub-2.0.15 → parsehub-2.0.16}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "parsehub"
-version = "2.0.15"
+version = "2.0.16"
 description = "轻量、异步、开箱即用的社交媒体聚合解析库"
 readme = "README.md"
 requires-python = ">=3.12.0"
@@ -37,6 +37,7 @@ dependencies = [
     "python-slugify[unidecode]>=8.0.4",
     "opencv-python-headless>=4.13.0.92",
     "cryptography>=46.0.6",
+    "gmssl>=3.2.2",
 ]
 [dependency-groups]

{parsehub-2.0.15 → parsehub-2.0.16}/src/parsehub/config/config.py RENAMED Viewed

@@ -1,7 +1,7 @@
 import sys
 from pathlib import Path
-from pydantic import BaseModel, ConfigDict, HttpUrl
+from pydantic import BaseModel, ConfigDict
 class _GlobalConfig(BaseModel):
@@ -11,8 +11,6 @@ class _GlobalConfig(BaseModel):
         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
         "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36"
     )
-    douyin_api: HttpUrl = "https://douyin.wtf/"
-    """抖音解析API, 建议自行部署: https://github.com/Evil0ctal/Douyin_TikTok_Download_API"""
     default_save_dir: Path = Path(sys.argv[0]).parent / "downloads"
     """默认下载目录"""

parsehub-2.0.16/src/parsehub/parsers/parser/douyin.py ADDED Viewed

@@ -0,0 +1,251 @@
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+from typing import Self, Union
+from ... import ProgressCallback
+from ...provider_api.douyin import DouyinWebCrawler
+from ...types import (
+    DownloadResult,
+    ImageParseResult,
+    ImageRef,
+    LivePhotoRef,
+    MultimediaParseResult,
+    ParseError,
+    Platform,
+    VideoParseResult,
+    VideoRef,
+)
+from ..base.base import BaseParser
+class DouyinParser(BaseParser):
+    """Parser for Douyin share links (video and image posts).
+
+    The post detail is fetched through ``DouyinWebCrawler`` and converted into
+    one of the library's parse-result objects. A cookie must be configured,
+    otherwise parsing fails with ``ParseError``.
+    """
+    __platform__ = Platform.DOUYIN
+    __supported_type__ = ["视频", "图文"]
+    __match__ = r"^(http(s)?://)?.+douyin.com/(?!share/user|qishui).+"
+    __redirect_keywords__ = ["v.douyin", "iesdouyin"]
+    __reserved_parameters__ = ["modal_id"]
+    async def _do_parse(self, raw_url: str) -> Union["VideoParseResult", "ImageParseResult", "MultimediaParseResult"]:
+        """Parse *raw_url* and build the result matching the post's media type.
+
+        Raises:
+            ParseError: if the cookie is missing, the API payload is unusable,
+                or the media type is not one this parser can build.
+        """
+        result = await self._fetch_api_result(raw_url)
+        match result.type:
+            case DouyinMediaType.VIDEO:
+                return self._build_video_result(result)
+            case DouyinMediaType.IMAGE:
+                return self._build_image_result(result)
+            case _:
+                # Without this arm an unknown type would fall through and
+                # silently return None, contradicting the return annotation.
+                raise ParseError(f"抖音解析失败: 不支持的媒体类型 {result.type!r}")
+    async def _fetch_api_result(self, url: str) -> "DouyinApiResult":
+        """Fetch the post detail for *url* and normalize it.
+
+        Raises:
+            ParseError: if no cookie is configured on this parser.
+        """
+        if not self.cookie:
+            raise ParseError("抖音 Cookie 未配置")
+        crawler = DouyinWebCrawler(proxy=self.proxy, cookie=self.cookie)
+        response = await crawler.parse(url)
+        return DouyinApiResult.parse(response)
+    @staticmethod
+    def _build_video_result(result: "DouyinApiResult") -> VideoParseResult:
+        """Wrap a video API result in a ``DouyinVideoParseResult``."""
+        return DouyinVideoParseResult(
+            title=result.desc,
+            video=result.video,
+        )
+    @staticmethod
+    def _build_image_result(result: "DouyinApiResult") -> ImageParseResult:
+        """Wrap an image API result in an ``ImageParseResult``."""
+        return ImageParseResult(
+            title=result.desc,
+            photo=result.image_list,
+        )
+class DouyinVideoParseResult(VideoParseResult):
+    """Video parse result whose download always sends a Douyin ``Referer``."""
+    async def _do_download(
+        self,
+        *,
+        output_dir: str | Path,
+        callback: ProgressCallback | None = None,
+        callback_args: tuple = (),
+        callback_kwargs: dict | None = None,
+        proxy: str | None = None,
+        headers: dict | None = None,
+    ) -> "DownloadResult":
+        """Delegate to the base download with the Douyin ``Referer`` applied.
+
+        Caller-supplied *headers* are preserved instead of being discarded;
+        only the ``Referer`` entry is forced to the Douyin origin. The
+        caller's dict is not mutated.
+        """
+        merged_headers = {
+            **(headers or {}),
+            "Referer": "https://www.douyin.com/",
+        }
+        return await super()._do_download(
+            output_dir=output_dir,
+            callback=callback,
+            callback_args=callback_args,
+            callback_kwargs=callback_kwargs,
+            proxy=proxy,
+            headers=merged_headers,
+        )
+def remove_video_watermark(url: str) -> str:
+    """Return *url* with every ``playwm`` occurrence rewritten to ``play``.
+
+    The substitution is purely textual and applies to the whole string.
+    """
+    watermarked, clean = "playwm", "play"
+    # split/join on a non-empty separator is equivalent to str.replace.
+    return clean.join(url.split(watermarked))
+def parse_video_info(video_data: dict) -> dict:
+    """Select the highest-resolution stream from a Douyin ``video`` payload.
+
+    Args:
+        video_data: the ``video`` object of a post; its ``bit_rate`` list is
+            read for candidate streams and ``cover`` for the thumbnail.
+
+    Returns:
+        A dict with ``video_url`` (watermark marker removed), ``thumb_url``
+        (last cover URL, or ``None``), ``duration``, ``width`` and ``height``.
+
+    Raises:
+        ParseError: if there are no streams, or the chosen stream has no URL.
+    """
+    bit_rates = video_data.get("bit_rate")
+    if not bit_rates:
+        raise ParseError("抖音解析失败: 未获取到视频下载地址")
+    def _pixel_count(entry: dict) -> int:
+        # ``or`` guards against keys that are present but null in the JSON.
+        addr = entry.get("play_addr") or {}
+        return (addr.get("width") or 0) * (addr.get("height") or 0)
+    # max() yields the first maximal entry, i.e. the same pick as a stable
+    # descending sort, but without reordering the caller's list in place.
+    best_quality = max(bit_rates, key=_pixel_count)
+    play_addr = best_quality.get("play_addr") or {}
+    video_url_list = play_addr.get("url_list") or []
+    if not video_url_list:
+        raise ParseError("抖音解析失败: 视频下载地址为空")
+    video_url = remove_video_watermark(video_url_list[0])
+    cover = video_data.get("cover") or {}
+    thumb_url_list = cover.get("url_list") or []
+    thumb_url = thumb_url_list[-1] if thumb_url_list else None
+    return {
+        "video_url": video_url,
+        "thumb_url": thumb_url,
+        "duration": best_quality.get("duration", 0),
+        "width": play_addr.get("width") or 0,
+        "height": play_addr.get("height") or 0,
+    }
+class DouyinMediaType(Enum):
+    """Kind of media carried by a Douyin post."""
+    VIDEO = "video"
+    IMAGE = "image"  # live photos + still images
+@dataclass
+class DouyinApiResult:
+    """Normalized view of a Douyin post-detail API response.
+
+    Build instances with :meth:`parse`; exactly one of ``video`` or
+    ``image_list`` is populated, depending on ``type``.
+    """
+    type: DouyinMediaType
+    # Populated only for video posts.
+    video: VideoRef | None = None
+    desc: str = ""
+    # Populated only for image posts (still images and/or live photos).
+    image_list: list[ImageRef | LivePhotoRef] | None = None
+    @classmethod
+    def parse(cls, json_dict: dict) -> Self:
+        """Build a result from the raw API response.
+
+        The ``images`` field is tried first, then ``image_post_info``;
+        anything else is treated as a video post.
+
+        Raises:
+            ParseError: if ``aweme_detail`` is missing or empty, or the
+                selected media payload cannot be parsed.
+        """
+        data = json_dict.get("aweme_detail")
+        if not data:
+            raise ParseError("抖音解析失败: 未获取到作品详情")
+        desc = data.get("desc", "")
+        if images := data.get("images"):
+            return cls._parse_images(images, desc)
+        if image_post_info := data.get("image_post_info"):
+            return cls._parse_image_post_info(image_post_info, desc)
+        return cls._parse_video(data, desc)
+    @classmethod
+    def _parse_images(cls, images: list[dict], desc: str) -> Self:
+        """Parse the legacy image format (``images`` field).
+
+        An entry carrying a ``video`` becomes a ``LivePhotoRef``; any other
+        entry with a non-empty ``url_list`` becomes an ``ImageRef`` using the
+        last URL. Entries with neither are skipped.
+        """
+        image_list: list[ImageRef | LivePhotoRef] = []
+        # One pass covers both plain and live-photo posts: for an entry
+        # without ``video`` the handling is the same in either case.
+        for image in images:
+            if video := image.get("video"):
+                video_info = parse_video_info(video)
+                image_list.append(
+                    LivePhotoRef(
+                        url=video_info["thumb_url"],
+                        video_url=video_info["video_url"],
+                        width=int(video_info["width"] or 0),
+                        height=int(video_info["height"] or 0),
+                        # Fall back to 3 when the duration is missing or zero.
+                        duration=int(video_info["duration"] or 0) or 3,
+                    )
+                )
+            elif url_list := image.get("url_list"):
+                image_list.append(
+                    ImageRef(
+                        url=url_list[-1],
+                        height=image.get("height", 0),
+                        width=image.get("width", 0),
+                    )
+                )
+        return cls(
+            type=DouyinMediaType.IMAGE,
+            desc=desc,
+            image_list=image_list,
+        )
+    @classmethod
+    def _parse_image_post_info(cls, image_post_info: dict, desc: str) -> Self:
+        """Parse the newer image format (``image_post_info`` field).
+
+        Each entry's ``display_image`` supplies the URL list and dimensions;
+        entries without URLs are skipped.
+        """
+        image_list: list[ImageRef | LivePhotoRef] = []
+        # ``or`` keeps null JSON values from breaking iteration / lookups.
+        for image in image_post_info.get("images") or []:
+            display_image = image.get("display_image") or {}
+            url_list = display_image.get("url_list") or []
+            if url_list:
+                image_list.append(
+                    ImageRef(
+                        url=url_list[-1],
+                        height=display_image.get("height", 0),
+                        width=display_image.get("width", 0),
+                    )
+                )
+        return cls(
+            type=DouyinMediaType.IMAGE,
+            image_list=image_list,
+            desc=desc,
+        )
+    @classmethod
+    def _parse_video(cls, data: dict, desc: str) -> Self:
+        """Parse a video post from the ``video`` field of *data*.
+
+        Raises:
+            ParseError: if ``video`` is missing or no stream can be selected.
+        """
+        video_data = data.get("video")
+        if not video_data:
+            raise ParseError("抖音解析失败: 未获取到视频数据")
+        video_info = parse_video_info(video_data)
+        return cls(
+            type=DouyinMediaType.VIDEO,
+            video=VideoRef(
+                url=video_info["video_url"],
+                thumb_url=video_info["thumb_url"],
+                width=video_info["width"],
+                height=video_info["height"],
+                duration=video_info["duration"],
+            ),
+            desc=desc,
+        )
+__all__ = ["DouyinParser"]

parsehub-2.0.16/src/parsehub/parsers/parser/tiktok.py ADDED Viewed

@@ -0,0 +1,228 @@
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+from typing import Self, Union
+from ... import ProgressCallback
+from ...config import GlobalConfig
+from ...provider_api.tiktok import TikTokWebCrawler
+from ...types import (
+    DownloadResult,
+    ImageParseResult,
+    ImageRef,
+    LivePhotoRef,
+    MultimediaParseResult,
+    ParseError,
+    Platform,
+    VideoParseResult,
+    VideoRef,
+)
+from ..base.base import BaseParser
+class TikTokParser(BaseParser):
+    """Parser for TikTok share links (video and image posts).
+
+    The post detail is fetched through ``TikTokWebCrawler`` and converted
+    into one of the library's parse-result objects.
+    """
+    __platform__ = Platform.TIKTOK
+    __supported_type__ = ["视频", "图文"]
+    # NOTE(review): the ``share/user|qishui`` exclusion is the same as the
+    # Douyin parser's pattern - confirm it is intended for TikTok URLs.
+    __match__ = r"^(http(s)?://)?.+tiktok.com/(?!share/user|qishui).+"
+    __redirect_keywords__ = ["vt.tiktok"]
+    async def _do_parse(self, raw_url: str) -> Union["VideoParseResult", "ImageParseResult", "MultimediaParseResult"]:
+        """Parse *raw_url* and build the result matching the post's media type.
+
+        Raises:
+            ParseError: if fetching/parsing fails or the media type is not
+                one this parser can build.
+        """
+        result = await self._fetch_api_result(raw_url)
+        match result.type:
+            case TikTokMediaType.VIDEO:
+                return self._build_video_result(result)
+            case TikTokMediaType.IMAGE:
+                return self._build_image_result(result)
+            case _:
+                # Without this arm an unknown type would fall through and
+                # silently return None, contradicting the return annotation.
+                raise ParseError(f"TikTok 解析失败: 不支持的媒体类型 {result.type!r}")
+    async def _fetch_api_result(self, url: str) -> "TikTokApiResult":
+        """Fetch the post detail for *url* and normalize it.
+
+        ``ParseError`` is propagated unchanged; any other exception is
+        wrapped in a ``ParseError`` that chains the original cause.
+        """
+        crawler = TikTokWebCrawler(proxy=self.proxy, cookie=self.cookie)
+        try:
+            response = await crawler.parse(url)
+            return TikTokApiResult.parse(response)
+        except ParseError:
+            raise
+        except Exception as e:
+            raise ParseError(f"TikTok 解析失败: {e}") from e
+    @staticmethod
+    def _build_video_result(result: "TikTokApiResult") -> VideoParseResult:
+        """Wrap a video API result in a ``TikTokVideoParseResult``."""
+        return TikTokVideoParseResult(
+            title=result.desc,
+            video=result.video,
+        )
+    @staticmethod
+    def _build_image_result(result: "TikTokApiResult") -> ImageParseResult:
+        """Wrap an image API result in an ``ImageParseResult``."""
+        return ImageParseResult(
+            title=result.desc,
+            photo=result.image_list,
+        )
+class TikTokVideoParseResult(VideoParseResult):
+    """Video parse result whose download sends TikTok-specific headers."""
+    async def _do_download(
+        self,
+        *,
+        output_dir: str | Path,
+        callback: ProgressCallback | None = None,
+        callback_args: tuple = (),
+        callback_kwargs: dict | None = None,
+        proxy: str | None = None,
+        headers: dict | None = None,
+    ) -> "DownloadResult":
+        """Delegate to the base download with TikTok headers applied.
+
+        Caller-supplied *headers* are preserved instead of being discarded;
+        ``User-Agent`` (from ``GlobalConfig.ua``) and ``Referer`` are always
+        forced. The caller's dict is not mutated.
+        """
+        merged_headers = {
+            **(headers or {}),
+            "User-Agent": GlobalConfig.ua,
+            "Referer": "https://www.tiktok.com/",
+        }
+        return await super()._do_download(
+            output_dir=output_dir,
+            callback=callback,
+            callback_args=callback_args,
+            callback_kwargs=callback_kwargs,
+            proxy=proxy,
+            headers=merged_headers,
+        )
+def first_url(data: dict | None) -> str | None:
+    """Return the first non-empty URL from an address-like mapping.
+
+    The list is looked up under ``url_list`` and, failing that, ``UrlList``.
+    Anything that is not a dict (``None``, a bare URL string, ...) yields
+    ``None`` instead of raising ``AttributeError``.
+    """
+    if not isinstance(data, dict):
+        return None
+    url_list = data.get("url_list") or data.get("UrlList") or []
+    return next((url for url in url_list if url), None)
+def as_int(value) -> int:
+    """Convert *value* to ``int``, returning 0 when that is not possible.
+
+    Falsy values (``None``, ``""``, ``0``) map to 0. Unparseable strings,
+    unsupported types and non-finite floats also yield 0 rather than raising.
+    """
+    try:
+        return int(value or 0)
+    except (TypeError, ValueError, OverflowError):
+        # OverflowError: int(float("inf")) - not covered by the other two.
+        return 0
+def pick_cover(video_data: dict) -> str | None:
+    """Return the first usable cover URL of a video payload, or ``None``.
+
+    Keys are tried in priority order. A value may be either a plain URL
+    string or an address mapping with a URL list. Strings are checked before
+    calling ``first_url`` so that a string cover is actually returned rather
+    than being passed to a mapping lookup. Empty strings count as missing.
+    """
+    for key in ("origin_cover", "cover", "dynamic_cover", "originCover", "dynamicCover"):
+        value = video_data.get(key)
+        if isinstance(value, str):
+            if value:
+                return value
+            continue
+        if isinstance(value, dict):
+            cover_url = first_url(value)
+            if cover_url:
+                return cover_url
+    return None
+def parse_video_info(video_data: dict) -> dict:
+    """Choose the best downloadable stream from a TikTok ``video`` payload.
+
+    Streams listed under ``bit_rate`` / ``bitrateInfo`` are collected first;
+    only when none of them has a URL is the top-level ``play_addr`` /
+    ``playAddr`` used. The winner is the candidate with the greatest
+    ``(pixel count, bitrate, data size)`` tuple.
+
+    Returns:
+        The winning candidate dict: ``video_url``, ``thumb_url``,
+        ``duration``, ``width``, ``height`` and ``quality``.
+
+    Raises:
+        ParseError: if no candidate with a URL exists.
+    """
+    streams = video_data.get("bit_rate") or video_data.get("bitrateInfo") or []
+    options = []
+    for stream in streams:
+        addr = stream.get("play_addr") or stream.get("PlayAddr") or {}
+        stream_url = first_url(addr)
+        if stream_url:
+            w = as_int(addr.get("width") or addr.get("Width") or video_data.get("width"))
+            h = as_int(addr.get("height") or addr.get("Height") or video_data.get("height"))
+            rate = as_int(stream.get("bit_rate") or stream.get("Bitrate") or stream.get("bitrate"))
+            size = as_int(addr.get("data_size") or addr.get("DataSize") or stream.get("data_size"))
+            length = as_int(addr.get("duration") or addr.get("Duration") or video_data.get("duration"))
+            option = {
+                "video_url": stream_url,
+                "thumb_url": pick_cover(video_data),
+                "duration": length,
+                "width": w,
+                "height": h,
+                "quality": (w * h, rate, size),
+            }
+            options.append(option)
+    if not options:
+        # No per-bitrate stream was usable: fall back to the top-level address.
+        addr = video_data.get("play_addr") or video_data.get("playAddr") or {}
+        stream_url = first_url(addr)
+        if stream_url:
+            w = as_int(addr.get("width") or video_data.get("width"))
+            h = as_int(addr.get("height") or video_data.get("height"))
+            option = {
+                "video_url": stream_url,
+                "thumb_url": pick_cover(video_data),
+                "duration": as_int(addr.get("duration") or video_data.get("duration")),
+                "width": w,
+                "height": h,
+                "quality": (w * h, 0, 0),
+            }
+            options.append(option)
+    if not options:
+        raise ParseError("TikTok 解析失败: 未获取到无水印视频下载地址")
+    return max(options, key=lambda option: option["quality"])
+class TikTokMediaType(Enum):
+    """Kind of media carried by a TikTok post."""
+    VIDEO = "video"
+    IMAGE = "image"
+@dataclass
+class TikTokApiResult:
+    """Normalized view of a TikTok post-detail payload.
+
+    Build instances with :meth:`parse`; exactly one of ``video`` or
+    ``image_list`` is populated, depending on ``type``.
+    """
+    type: TikTokMediaType
+    # Populated only for video posts.
+    video: VideoRef | None = None
+    desc: str = ""
+    # Populated only for image posts.
+    image_list: list[ImageRef | LivePhotoRef] | None = None
+    @classmethod
+    def parse(cls, json_dict: dict) -> Self:
+        """Build a result from the crawler's post dict.
+
+        A non-empty ``image_post_info`` / ``imagePost`` selects the image
+        path; otherwise the post is treated as a video.
+
+        Raises:
+            ParseError: if *json_dict* is empty or no media can be extracted.
+        """
+        if not json_dict:
+            raise ParseError("TikTok 解析失败: 未获取到作品详情")
+        # ``or`` also covers a key that is present but null.
+        desc = json_dict.get("desc") or ""
+        image_post_info = json_dict.get("image_post_info") or json_dict.get("imagePost")
+        if image_post_info:
+            return cls._parse_image_post(image_post_info, desc)
+        return cls._parse_video(json_dict, desc)
+    @classmethod
+    def _parse_image_post(cls, image_post_info: dict, desc: str) -> Self:
+        """Collect one ``ImageRef`` per image entry that has a usable URL.
+
+        Raises:
+            ParseError: if no entry yields a URL.
+        """
+        image_list: list[ImageRef | LivePhotoRef] = []
+        # ``or []`` keeps a null ``images`` value from breaking iteration.
+        for image in image_post_info.get("images") or []:
+            display_image = image.get("display_image") or image.get("displayImage") or image.get("image") or {}
+            url = first_url(display_image)
+            if url:
+                image_list.append(
+                    ImageRef(
+                        url=url,
+                        height=as_int(display_image.get("height") or display_image.get("Height")),
+                        width=as_int(display_image.get("width") or display_image.get("Width")),
+                    )
+                )
+        if not image_list:
+            raise ParseError("TikTok 解析失败: 未获取到无水印图文下载地址")
+        return cls(
+            type=TikTokMediaType.IMAGE,
+            desc=desc,
+            image_list=image_list,
+        )
+    @classmethod
+    def _parse_video(cls, data: dict, desc: str) -> Self:
+        """Parse a video post from the ``video`` field of *data*.
+
+        Raises:
+            ParseError: if ``video`` is missing or no stream can be selected.
+        """
+        video_data = data.get("video")
+        if not video_data:
+            raise ParseError("TikTok 解析失败: 未获取到视频数据")
+        video_info = parse_video_info(video_data)
+        return cls(
+            type=TikTokMediaType.VIDEO,
+            video=VideoRef(
+                url=video_info["video_url"],
+                thumb_url=video_info["thumb_url"],
+                width=video_info["width"],
+                height=video_info["height"],
+                duration=video_info["duration"],
+            ),
+            desc=desc,
+        )
+__all__ = ["TikTokParser"]

parsehub 2.0.15__tar.gz → 2.0.16__tar.gz

parsehub 2.0.15tar.gz → 2.0.16tar.gz