parsehub 2.0.16__tar.gz → 2.0.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {parsehub-2.0.16/src/parsehub.egg-info → parsehub-2.0.17}/PKG-INFO +1 -1
- {parsehub-2.0.16 → parsehub-2.0.17}/pyproject.toml +1 -1
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/tiktok.py +58 -17
- parsehub-2.0.17/src/parsehub/provider_api/tiktok.py +245 -0
- {parsehub-2.0.16 → parsehub-2.0.17/src/parsehub.egg-info}/PKG-INFO +1 -1
- parsehub-2.0.16/src/parsehub/provider_api/tiktok.py +0 -124
- {parsehub-2.0.16 → parsehub-2.0.17}/LICENSE +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/README.md +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/setup.cfg +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/__init__.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/__init__.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/config/__init__.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/config/config.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/errors.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/__init__.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/base/__init__.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/base/base.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/base/ytdlp.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/__init__.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/bilibili.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/coolapk.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/douyin.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/facebook.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/instagram.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/kuaishou.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/pipix.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/threads.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/tieba.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/twitter.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/weibo.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/weixin.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/xhs.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/xiaoheihe.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/youtube.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/zuiyou.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/__init__.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/bilibili.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/coolapk.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/douyin.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/instagram.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/kuaishou.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/pipix.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/threads.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/tieba.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/twitter.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/weibo.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/weixin.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/xhs.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/xiaoheihe.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/zuiyou.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/types/__init__.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/types/callback.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/types/media_file.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/types/media_ref.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/types/platform.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/types/post.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/types/result.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/utils/downloader.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/utils/media_info.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/utils/utils.py +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub.egg-info/SOURCES.txt +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub.egg-info/dependency_links.txt +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub.egg-info/requires.txt +0 -0
- {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub.egg-info/top_level.txt +0 -0
|
@@ -4,14 +4,11 @@ from pathlib import Path
|
|
|
4
4
|
from typing import Self, Union
|
|
5
5
|
|
|
6
6
|
from ... import ProgressCallback
|
|
7
|
-
from ...config import GlobalConfig
|
|
8
7
|
from ...provider_api.tiktok import TikTokWebCrawler
|
|
9
8
|
from ...types import (
|
|
10
9
|
DownloadResult,
|
|
11
10
|
ImageParseResult,
|
|
12
11
|
ImageRef,
|
|
13
|
-
LivePhotoRef,
|
|
14
|
-
MultimediaParseResult,
|
|
15
12
|
ParseError,
|
|
16
13
|
Platform,
|
|
17
14
|
VideoParseResult,
|
|
@@ -26,7 +23,7 @@ class TikTokParser(BaseParser):
|
|
|
26
23
|
__match__ = r"^(http(s)?://)?.+tiktok.com/(?!share/user|qishui).+"
|
|
27
24
|
__redirect_keywords__ = ["vt.tiktok"]
|
|
28
25
|
|
|
29
|
-
async def _do_parse(self, raw_url: str) -> Union["VideoParseResult", "ImageParseResult"
|
|
26
|
+
async def _do_parse(self, raw_url: str) -> Union["VideoParseResult", "ImageParseResult"]:
|
|
30
27
|
result = await self._fetch_api_result(raw_url)
|
|
31
28
|
|
|
32
29
|
match result.type:
|
|
@@ -72,7 +69,6 @@ class TikTokVideoParseResult(VideoParseResult):
|
|
|
72
69
|
headers: dict | None = None,
|
|
73
70
|
) -> "DownloadResult":
|
|
74
71
|
headers = {
|
|
75
|
-
"User-Agent": GlobalConfig.ua,
|
|
76
72
|
"Referer": "https://www.tiktok.com/",
|
|
77
73
|
}
|
|
78
74
|
return await super()._do_download(
|
|
@@ -85,9 +81,31 @@ class TikTokVideoParseResult(VideoParseResult):
|
|
|
85
81
|
)
|
|
86
82
|
|
|
87
83
|
|
|
88
|
-
def
|
|
89
|
-
|
|
90
|
-
|
|
84
|
+
def media_urls(data: dict | str | list | None) -> list[str]:
|
|
85
|
+
if isinstance(data, str):
|
|
86
|
+
if data.startswith("//"):
|
|
87
|
+
return [f"https:{data}"]
|
|
88
|
+
return [data] if data.startswith(("http://", "https://")) else []
|
|
89
|
+
if isinstance(data, list):
|
|
90
|
+
result = []
|
|
91
|
+
for item in data:
|
|
92
|
+
result.extend(media_urls(item))
|
|
93
|
+
return result
|
|
94
|
+
if not isinstance(data, dict):
|
|
95
|
+
return []
|
|
96
|
+
for key in ("url_list", "UrlList", "urlList"):
|
|
97
|
+
if key in data:
|
|
98
|
+
return media_urls(data[key])
|
|
99
|
+
return []
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def first_url(data: dict | str | list | None) -> str | None:
|
|
103
|
+
return next(iter(media_urls(data)), None)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def preferred_video_url(data: dict | str | list | None) -> str | None:
|
|
107
|
+
urls = media_urls(data)
|
|
108
|
+
return next((url for url in urls if "aweme" in url), None) or (urls[0] if urls else None)
|
|
91
109
|
|
|
92
110
|
|
|
93
111
|
def as_int(value) -> int:
|
|
@@ -102,8 +120,7 @@ def pick_cover(video_data: dict) -> str | None:
|
|
|
102
120
|
cover_url = first_url(video_data.get(key))
|
|
103
121
|
if cover_url:
|
|
104
122
|
return cover_url
|
|
105
|
-
|
|
106
|
-
return cover if isinstance(cover, str) else None
|
|
123
|
+
return None
|
|
107
124
|
|
|
108
125
|
|
|
109
126
|
def parse_video_info(video_data: dict) -> dict:
|
|
@@ -112,7 +129,7 @@ def parse_video_info(video_data: dict) -> dict:
|
|
|
112
129
|
|
|
113
130
|
for bit_rate in bit_rates:
|
|
114
131
|
play_addr = bit_rate.get("play_addr") or bit_rate.get("PlayAddr") or {}
|
|
115
|
-
video_url =
|
|
132
|
+
video_url = preferred_video_url(play_addr)
|
|
116
133
|
if not video_url:
|
|
117
134
|
continue
|
|
118
135
|
|
|
@@ -135,21 +152,39 @@ def parse_video_info(video_data: dict) -> dict:
|
|
|
135
152
|
|
|
136
153
|
if not candidates:
|
|
137
154
|
play_addr = video_data.get("play_addr") or video_data.get("playAddr") or {}
|
|
138
|
-
video_url =
|
|
155
|
+
video_url = preferred_video_url(play_addr)
|
|
139
156
|
if video_url:
|
|
140
|
-
|
|
141
|
-
|
|
157
|
+
play_addr_data = play_addr if isinstance(play_addr, dict) else {}
|
|
158
|
+
width = as_int(play_addr_data.get("width") or video_data.get("width"))
|
|
159
|
+
height = as_int(play_addr_data.get("height") or video_data.get("height"))
|
|
142
160
|
candidates.append(
|
|
143
161
|
{
|
|
144
162
|
"video_url": video_url,
|
|
145
163
|
"thumb_url": pick_cover(video_data),
|
|
146
|
-
"duration": as_int(
|
|
164
|
+
"duration": as_int(play_addr_data.get("duration") or video_data.get("duration")),
|
|
147
165
|
"width": width,
|
|
148
166
|
"height": height,
|
|
149
167
|
"quality": (width * height, 0, 0),
|
|
150
168
|
}
|
|
151
169
|
)
|
|
152
170
|
|
|
171
|
+
if not candidates:
|
|
172
|
+
download_addr = video_data.get("download_addr") or video_data.get("downloadAddr") or {}
|
|
173
|
+
video_url = preferred_video_url(download_addr)
|
|
174
|
+
if video_url:
|
|
175
|
+
width = as_int(video_data.get("width"))
|
|
176
|
+
height = as_int(video_data.get("height"))
|
|
177
|
+
candidates.append(
|
|
178
|
+
{
|
|
179
|
+
"video_url": video_url,
|
|
180
|
+
"thumb_url": pick_cover(video_data),
|
|
181
|
+
"duration": as_int(video_data.get("duration")),
|
|
182
|
+
"width": width,
|
|
183
|
+
"height": height,
|
|
184
|
+
"quality": (width * height, -1, 0),
|
|
185
|
+
}
|
|
186
|
+
)
|
|
187
|
+
|
|
153
188
|
if not candidates:
|
|
154
189
|
raise ParseError("TikTok 解析失败: 未获取到无水印视频下载地址")
|
|
155
190
|
|
|
@@ -166,7 +201,7 @@ class TikTokApiResult:
|
|
|
166
201
|
type: TikTokMediaType
|
|
167
202
|
video: VideoRef = None
|
|
168
203
|
desc: str = ""
|
|
169
|
-
image_list: list[ImageRef
|
|
204
|
+
image_list: list[ImageRef] = None
|
|
170
205
|
|
|
171
206
|
@classmethod
|
|
172
207
|
def parse(cls, json_dict: dict) -> Self:
|
|
@@ -184,7 +219,13 @@ class TikTokApiResult:
|
|
|
184
219
|
image_list = []
|
|
185
220
|
|
|
186
221
|
for image in image_post_info.get("images", []):
|
|
187
|
-
display_image =
|
|
222
|
+
display_image = (
|
|
223
|
+
image.get("display_image")
|
|
224
|
+
or image.get("displayImage")
|
|
225
|
+
or image.get("imageURL")
|
|
226
|
+
or image.get("image")
|
|
227
|
+
or {}
|
|
228
|
+
)
|
|
188
229
|
url = first_url(display_image)
|
|
189
230
|
if url:
|
|
190
231
|
image_list.append(
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import html
|
|
3
|
+
import json
|
|
4
|
+
import re
|
|
5
|
+
from typing import Any, NamedTuple
|
|
6
|
+
from urllib.parse import urlencode, urlparse
|
|
7
|
+
|
|
8
|
+
import httpx
|
|
9
|
+
|
|
10
|
+
from ..config import GlobalConfig
|
|
11
|
+
|
|
12
|
+
TIKTOK_APP_FEED = "https://api22-normal-c-alisg.tiktokv.com/aweme/v1/feed/"
|
|
13
|
+
|
|
14
|
+
FACEBOOK_EXTERNAL_HIT_UA = "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)"
|
|
15
|
+
UNIVERSAL_DATA_RE = re.compile(
|
|
16
|
+
r'<script[^>]+id=["\']__UNIVERSAL_DATA_FOR_REHYDRATION__["\'][^>]*>(?P<json>.*?)</script>',
|
|
17
|
+
re.DOTALL,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
TIKTOK_HEADERS = {
|
|
21
|
+
"User-Agent": GlobalConfig.ua,
|
|
22
|
+
"Referer": "https://www.tiktok.com/",
|
|
23
|
+
"x-ladon": "Hello From Evil0ctal!",
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
TIKTOK_WEB_HEADERS = {
|
|
27
|
+
"User-Agent": GlobalConfig.ua,
|
|
28
|
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
29
|
+
"Accept-Language": "en-US,en;q=0.9",
|
|
30
|
+
"Referer": "https://www.tiktok.com/",
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class TikTokItemRef(NamedTuple):
|
|
35
|
+
media_type: str
|
|
36
|
+
aweme_id: str
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class TikTokWebCrawler:
|
|
40
|
+
_ITEM = re.compile(r"/(?P<media_type>video|photo)/(?P<aweme_id>\d+)")
|
|
41
|
+
_URL = re.compile(r"https?://\S+")
|
|
42
|
+
|
|
43
|
+
def __init__(
|
|
44
|
+
self,
|
|
45
|
+
cookie: dict | None = None,
|
|
46
|
+
proxy: str | None = None,
|
|
47
|
+
user_agent: str | None = None,
|
|
48
|
+
max_retries: int = 3,
|
|
49
|
+
timeout: int = 15,
|
|
50
|
+
):
|
|
51
|
+
self.headers = dict(TIKTOK_HEADERS)
|
|
52
|
+
if user_agent:
|
|
53
|
+
self.headers["User-Agent"] = user_agent
|
|
54
|
+
self.cookies = httpx.Cookies()
|
|
55
|
+
for key, value in (cookie or {}).items():
|
|
56
|
+
self.cookies.set(str(key), "" if value is None else str(value))
|
|
57
|
+
self.proxy = proxy
|
|
58
|
+
self.max_retries = max_retries
|
|
59
|
+
self.timeout = timeout
|
|
60
|
+
|
|
61
|
+
async def parse(self, url: str) -> dict:
|
|
62
|
+
item_ref = None
|
|
63
|
+
primary_error: Exception | None = None
|
|
64
|
+
|
|
65
|
+
try:
|
|
66
|
+
resolved_url = await self.resolve_url(url)
|
|
67
|
+
item_ref = self.extract_item_ref_from_url(resolved_url)
|
|
68
|
+
if not item_ref:
|
|
69
|
+
raise ValueError(f"无法从链接中提取作品 ID: {resolved_url}")
|
|
70
|
+
return await self.fetch_one_video(item_ref.aweme_id)
|
|
71
|
+
except Exception as exc:
|
|
72
|
+
primary_error = exc
|
|
73
|
+
|
|
74
|
+
if item_ref and item_ref.media_type == "photo":
|
|
75
|
+
raise RuntimeError(f"获取 TikTok 图文作品失败: {primary_error}") from primary_error
|
|
76
|
+
|
|
77
|
+
try:
|
|
78
|
+
return await self.fetch_video_from_web(url, expected_aweme_id=item_ref.aweme_id if item_ref else None)
|
|
79
|
+
except Exception as web_error:
|
|
80
|
+
raise RuntimeError(f"获取 TikTok 作品失败: feed={primary_error}; web={web_error}") from web_error
|
|
81
|
+
|
|
82
|
+
def _client(self, *, headers: dict[str, str] | None = None) -> httpx.AsyncClient:
|
|
83
|
+
return httpx.AsyncClient(
|
|
84
|
+
headers=headers or self.headers,
|
|
85
|
+
timeout=self.timeout,
|
|
86
|
+
follow_redirects=True,
|
|
87
|
+
proxy=self.proxy,
|
|
88
|
+
cookies=self.cookies,
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
@classmethod
|
|
92
|
+
def extract_url(cls, text: str) -> str:
|
|
93
|
+
text = html.unescape(text.strip())
|
|
94
|
+
markdown_match = re.search(r"]\((https?://[^)]+)\)", text)
|
|
95
|
+
if markdown_match:
|
|
96
|
+
return markdown_match.group(1).rstrip(".,;,。;'\")]})】>」』")
|
|
97
|
+
match = cls._URL.search(text)
|
|
98
|
+
if not match:
|
|
99
|
+
raise ValueError("未找到 TikTok URL")
|
|
100
|
+
return match.group(0).rstrip(".,;,。;'\")]})】>」』")
|
|
101
|
+
|
|
102
|
+
@classmethod
|
|
103
|
+
def extract_item_ref_from_url(cls, url: str) -> TikTokItemRef | None:
|
|
104
|
+
match = cls._ITEM.search(url)
|
|
105
|
+
if not match:
|
|
106
|
+
return None
|
|
107
|
+
return TikTokItemRef(match.group("media_type"), match.group("aweme_id"))
|
|
108
|
+
|
|
109
|
+
async def resolve_url(self, url_or_text: str) -> str:
|
|
110
|
+
url = self.extract_url(url_or_text)
|
|
111
|
+
if self.extract_item_ref_from_url(url):
|
|
112
|
+
return url
|
|
113
|
+
|
|
114
|
+
async with self._client() as client:
|
|
115
|
+
response = await client.get(url)
|
|
116
|
+
response.raise_for_status()
|
|
117
|
+
resolved = str(response.url)
|
|
118
|
+
|
|
119
|
+
if "notfound" in resolved.lower():
|
|
120
|
+
raise ValueError("TikTok 页面不可用,可能是地区、代理或链接问题")
|
|
121
|
+
return resolved
|
|
122
|
+
|
|
123
|
+
async def resolve_web_url(self, url_or_text: str) -> str:
|
|
124
|
+
url = self.extract_url(url_or_text)
|
|
125
|
+
if self.extract_item_ref_from_url(url):
|
|
126
|
+
return url
|
|
127
|
+
|
|
128
|
+
headers = dict(TIKTOK_WEB_HEADERS)
|
|
129
|
+
headers["User-Agent"] = FACEBOOK_EXTERNAL_HIT_UA
|
|
130
|
+
async with self._client(headers=headers) as client:
|
|
131
|
+
response = await client.head(url)
|
|
132
|
+
if response.status_code >= 400:
|
|
133
|
+
response = await client.get(url)
|
|
134
|
+
response.raise_for_status()
|
|
135
|
+
return str(response.url)
|
|
136
|
+
|
|
137
|
+
async def fetch_one_video(self, aweme_id: str) -> dict[str, Any]:
|
|
138
|
+
params = {
|
|
139
|
+
"iid": 7318518857994389254,
|
|
140
|
+
"device_id": 7318517321748022790,
|
|
141
|
+
"channel": "googleplay",
|
|
142
|
+
"app_name": "musical_ly",
|
|
143
|
+
"version_code": "300904",
|
|
144
|
+
"device_platform": "android",
|
|
145
|
+
"device_type": "SM-ASUS_Z01QD",
|
|
146
|
+
"os_version": "9",
|
|
147
|
+
"aweme_id": aweme_id,
|
|
148
|
+
}
|
|
149
|
+
endpoint = f"{TIKTOK_APP_FEED}?{urlencode(params)}"
|
|
150
|
+
last_error: Exception | None = None
|
|
151
|
+
|
|
152
|
+
for attempt in range(self.max_retries):
|
|
153
|
+
try:
|
|
154
|
+
async with self._client() as client:
|
|
155
|
+
response = await client.get(endpoint)
|
|
156
|
+
response.raise_for_status()
|
|
157
|
+
payload = response.json()
|
|
158
|
+
|
|
159
|
+
aweme_list = payload.get("aweme_list") or []
|
|
160
|
+
for item in aweme_list:
|
|
161
|
+
if str(item.get("aweme_id")) == str(aweme_id):
|
|
162
|
+
return item
|
|
163
|
+
|
|
164
|
+
if aweme_list:
|
|
165
|
+
first_id = aweme_list[0].get("aweme_id")
|
|
166
|
+
raise RuntimeError(f"返回作品 ID 不匹配: expected={aweme_id}, got={first_id}")
|
|
167
|
+
|
|
168
|
+
status_msg = payload.get("status_msg") or payload.get("statusMessage") or payload
|
|
169
|
+
raise RuntimeError(f"接口未返回 aweme_list: {status_msg}")
|
|
170
|
+
except Exception as exc:
|
|
171
|
+
last_error = exc
|
|
172
|
+
if attempt + 1 < self.max_retries:
|
|
173
|
+
await asyncio.sleep(1)
|
|
174
|
+
|
|
175
|
+
raise RuntimeError(f"获取 TikTok 作品失败: {last_error}")
|
|
176
|
+
|
|
177
|
+
async def fetch_video_from_web(self, url_or_text: str, expected_aweme_id: str | None = None) -> dict[str, Any]:
|
|
178
|
+
url = await self.resolve_web_url(url_or_text)
|
|
179
|
+
item_ref = self.extract_item_ref_from_url(url)
|
|
180
|
+
if not item_ref:
|
|
181
|
+
raise ValueError(f"无法从链接中提取作品 ID: {url}")
|
|
182
|
+
if item_ref.media_type == "photo":
|
|
183
|
+
raise ValueError("TikTok 图文作品不支持 Web hydration fallback")
|
|
184
|
+
|
|
185
|
+
webpage = await self.download_webpage(url)
|
|
186
|
+
universal_data = self._search_universal_data(webpage)
|
|
187
|
+
if not universal_data:
|
|
188
|
+
raise RuntimeError("无法从页面提取 __UNIVERSAL_DATA_FOR_REHYDRATION__")
|
|
189
|
+
|
|
190
|
+
item = self._extract_web_item(universal_data)
|
|
191
|
+
item_id = str(item.get("aweme_id") or item.get("id") or "")
|
|
192
|
+
expected_id = str(expected_aweme_id or item_ref.aweme_id)
|
|
193
|
+
if item_id and item_id != expected_id:
|
|
194
|
+
raise RuntimeError(f"返回作品 ID 不匹配: expected={expected_id}, got={item_id}")
|
|
195
|
+
if item_id and not item.get("aweme_id"):
|
|
196
|
+
item["aweme_id"] = item_id
|
|
197
|
+
return item
|
|
198
|
+
|
|
199
|
+
async def download_webpage(self, url: str) -> str:
|
|
200
|
+
async with self._client(headers=TIKTOK_WEB_HEADERS) as client:
|
|
201
|
+
last_webpage = ""
|
|
202
|
+
for attempt in range(self.max_retries):
|
|
203
|
+
response = await client.get(url)
|
|
204
|
+
if urlparse(str(response.url)).path == "/login":
|
|
205
|
+
raise RuntimeError("TikTok 要求登录才能访问这个内容")
|
|
206
|
+
response.raise_for_status()
|
|
207
|
+
webpage = response.text
|
|
208
|
+
if self._search_universal_data(webpage):
|
|
209
|
+
return webpage
|
|
210
|
+
last_webpage = webpage
|
|
211
|
+
if attempt + 1 < self.max_retries:
|
|
212
|
+
await asyncio.sleep(1)
|
|
213
|
+
return last_webpage
|
|
214
|
+
|
|
215
|
+
@staticmethod
|
|
216
|
+
def _search_universal_data(webpage: str) -> dict[str, Any]:
|
|
217
|
+
match = UNIVERSAL_DATA_RE.search(webpage)
|
|
218
|
+
if not match:
|
|
219
|
+
return {}
|
|
220
|
+
raw = html.unescape(match.group("json")).strip()
|
|
221
|
+
if not raw:
|
|
222
|
+
return {}
|
|
223
|
+
try:
|
|
224
|
+
data = json.loads(raw)
|
|
225
|
+
except json.JSONDecodeError:
|
|
226
|
+
return {}
|
|
227
|
+
return data.get("__DEFAULT_SCOPE__") or {}
|
|
228
|
+
|
|
229
|
+
@staticmethod
|
|
230
|
+
def _extract_web_item(universal_data: dict[str, Any]) -> dict[str, Any]:
|
|
231
|
+
detail = universal_data.get("webapp.video-detail") or {}
|
|
232
|
+
status = detail.get("statusCode") or 0
|
|
233
|
+
try:
|
|
234
|
+
status = int(status)
|
|
235
|
+
except (TypeError, ValueError):
|
|
236
|
+
status = 0
|
|
237
|
+
item = ((detail.get("itemInfo") or {}).get("itemStruct")) or {}
|
|
238
|
+
if item:
|
|
239
|
+
return item
|
|
240
|
+
if status in (10216, 10222):
|
|
241
|
+
raise RuntimeError("这个 TikTok 内容需要登录或无权访问")
|
|
242
|
+
if status == 10204:
|
|
243
|
+
raise RuntimeError("当前 IP 被 TikTok 阻止访问这个内容")
|
|
244
|
+
status_msg = detail.get("statusMsg") or detail.get("statusMessage") or status
|
|
245
|
+
raise RuntimeError(f"页面中没有作品详情,status={status_msg}")
|
|
@@ -1,124 +0,0 @@
|
|
|
1
|
-
import asyncio
|
|
2
|
-
import re
|
|
3
|
-
from typing import Any
|
|
4
|
-
from urllib.parse import urlencode
|
|
5
|
-
|
|
6
|
-
import httpx
|
|
7
|
-
|
|
8
|
-
from ..config import GlobalConfig
|
|
9
|
-
|
|
10
|
-
TIKTOK_APP_FEED = "https://api22-normal-c-alisg.tiktokv.com/aweme/v1/feed/"
|
|
11
|
-
|
|
12
|
-
TIKTOK_HEADERS = {
|
|
13
|
-
"User-Agent": GlobalConfig.ua,
|
|
14
|
-
"Referer": "https://www.tiktok.com/",
|
|
15
|
-
"x-ladon": "Hello From Evil0ctal!",
|
|
16
|
-
}
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
class TikTokWebCrawler:
|
|
20
|
-
_ITEM = re.compile(r"/(?:video|photo)/(\d+)")
|
|
21
|
-
_URL = re.compile(r"https?://\S+")
|
|
22
|
-
|
|
23
|
-
def __init__(
|
|
24
|
-
self,
|
|
25
|
-
cookie: str = None,
|
|
26
|
-
proxy: str = None,
|
|
27
|
-
user_agent: str = None,
|
|
28
|
-
max_retries: int = 3,
|
|
29
|
-
timeout: int = 15,
|
|
30
|
-
):
|
|
31
|
-
self.headers = dict(TIKTOK_HEADERS)
|
|
32
|
-
if user_agent:
|
|
33
|
-
self.headers["User-Agent"] = user_agent
|
|
34
|
-
if cookie:
|
|
35
|
-
self.headers["Cookie"] = cookie
|
|
36
|
-
|
|
37
|
-
self.proxy = proxy
|
|
38
|
-
self.max_retries = max_retries
|
|
39
|
-
self.timeout = timeout
|
|
40
|
-
|
|
41
|
-
async def parse(self, url: str) -> dict:
|
|
42
|
-
aweme_id = await self.get_aweme_id(url)
|
|
43
|
-
return await self.fetch_one_video(aweme_id)
|
|
44
|
-
|
|
45
|
-
def _client(self) -> httpx.AsyncClient:
|
|
46
|
-
return httpx.AsyncClient(
|
|
47
|
-
headers=self.headers,
|
|
48
|
-
timeout=self.timeout,
|
|
49
|
-
follow_redirects=True,
|
|
50
|
-
proxy=self.proxy,
|
|
51
|
-
)
|
|
52
|
-
|
|
53
|
-
@classmethod
|
|
54
|
-
def extract_url(cls, text: str) -> str:
|
|
55
|
-
match = cls._URL.search(text)
|
|
56
|
-
if not match:
|
|
57
|
-
raise ValueError("未找到 TikTok URL")
|
|
58
|
-
return match.group(0).rstrip(".,;,。;'\")]})】>」』")
|
|
59
|
-
|
|
60
|
-
@classmethod
|
|
61
|
-
def extract_aweme_id_from_url(cls, url: str) -> str | None:
|
|
62
|
-
match = cls._ITEM.search(url)
|
|
63
|
-
return match.group(1) if match else None
|
|
64
|
-
|
|
65
|
-
async def resolve_url(self, url_or_text: str) -> str:
|
|
66
|
-
url = self.extract_url(url_or_text)
|
|
67
|
-
if self.extract_aweme_id_from_url(url):
|
|
68
|
-
return url
|
|
69
|
-
|
|
70
|
-
async with self._client() as client:
|
|
71
|
-
response = await client.get(url)
|
|
72
|
-
response.raise_for_status()
|
|
73
|
-
resolved = str(response.url)
|
|
74
|
-
|
|
75
|
-
if "notfound" in resolved.lower():
|
|
76
|
-
raise ValueError("TikTok 页面不可用,可能是地区、代理或链接问题")
|
|
77
|
-
return resolved
|
|
78
|
-
|
|
79
|
-
async def get_aweme_id(self, url_or_text: str) -> str:
|
|
80
|
-
resolved_url = await self.resolve_url(url_or_text)
|
|
81
|
-
aweme_id = self.extract_aweme_id_from_url(resolved_url)
|
|
82
|
-
if not aweme_id:
|
|
83
|
-
raise ValueError(f"无法从链接中提取作品 ID: {resolved_url}")
|
|
84
|
-
return aweme_id
|
|
85
|
-
|
|
86
|
-
async def fetch_one_video(self, aweme_id: str) -> dict[str, Any]:
|
|
87
|
-
params = {
|
|
88
|
-
"iid": 7318518857994389254,
|
|
89
|
-
"device_id": 7318517321748022790,
|
|
90
|
-
"channel": "googleplay",
|
|
91
|
-
"app_name": "musical_ly",
|
|
92
|
-
"version_code": "300904",
|
|
93
|
-
"device_platform": "android",
|
|
94
|
-
"device_type": "SM-ASUS_Z01QD",
|
|
95
|
-
"os_version": "9",
|
|
96
|
-
"aweme_id": aweme_id,
|
|
97
|
-
}
|
|
98
|
-
endpoint = f"{TIKTOK_APP_FEED}?{urlencode(params)}"
|
|
99
|
-
last_error: Exception | None = None
|
|
100
|
-
|
|
101
|
-
for attempt in range(self.max_retries):
|
|
102
|
-
try:
|
|
103
|
-
async with self._client() as client:
|
|
104
|
-
response = await client.get(endpoint)
|
|
105
|
-
response.raise_for_status()
|
|
106
|
-
payload = response.json()
|
|
107
|
-
|
|
108
|
-
aweme_list = payload.get("aweme_list") or []
|
|
109
|
-
for item in aweme_list:
|
|
110
|
-
if str(item.get("aweme_id")) == str(aweme_id):
|
|
111
|
-
return item
|
|
112
|
-
|
|
113
|
-
if aweme_list:
|
|
114
|
-
first_id = aweme_list[0].get("aweme_id")
|
|
115
|
-
raise RuntimeError(f"返回作品 ID 不匹配: expected={aweme_id}, got={first_id}")
|
|
116
|
-
|
|
117
|
-
status_msg = payload.get("status_msg") or payload.get("statusMessage") or payload
|
|
118
|
-
raise RuntimeError(f"接口未返回 aweme_list: {status_msg}")
|
|
119
|
-
except Exception as exc:
|
|
120
|
-
last_error = exc
|
|
121
|
-
if attempt + 1 < self.max_retries:
|
|
122
|
-
await asyncio.sleep(1)
|
|
123
|
-
|
|
124
|
-
raise RuntimeError(f"获取 TikTok 作品失败: {last_error}")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|