parsehub 2.0.16__tar.gz → 2.0.17__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. {parsehub-2.0.16/src/parsehub.egg-info → parsehub-2.0.17}/PKG-INFO +1 -1
  2. {parsehub-2.0.16 → parsehub-2.0.17}/pyproject.toml +1 -1
  3. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/tiktok.py +58 -17
  4. parsehub-2.0.17/src/parsehub/provider_api/tiktok.py +245 -0
  5. {parsehub-2.0.16 → parsehub-2.0.17/src/parsehub.egg-info}/PKG-INFO +1 -1
  6. parsehub-2.0.16/src/parsehub/provider_api/tiktok.py +0 -124
  7. {parsehub-2.0.16 → parsehub-2.0.17}/LICENSE +0 -0
  8. {parsehub-2.0.16 → parsehub-2.0.17}/README.md +0 -0
  9. {parsehub-2.0.16 → parsehub-2.0.17}/setup.cfg +0 -0
  10. {parsehub-2.0.16 → parsehub-2.0.17}/src/__init__.py +0 -0
  11. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/__init__.py +0 -0
  12. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/config/__init__.py +0 -0
  13. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/config/config.py +0 -0
  14. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/errors.py +0 -0
  15. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/__init__.py +0 -0
  16. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/base/__init__.py +0 -0
  17. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/base/base.py +0 -0
  18. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/base/ytdlp.py +0 -0
  19. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/__init__.py +0 -0
  20. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/bilibili.py +0 -0
  21. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/coolapk.py +0 -0
  22. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/douyin.py +0 -0
  23. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/facebook.py +0 -0
  24. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/instagram.py +0 -0
  25. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/kuaishou.py +0 -0
  26. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/pipix.py +0 -0
  27. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/threads.py +0 -0
  28. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/tieba.py +0 -0
  29. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/twitter.py +0 -0
  30. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/weibo.py +0 -0
  31. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/weixin.py +0 -0
  32. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/xhs.py +0 -0
  33. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/xiaoheihe.py +0 -0
  34. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/youtube.py +0 -0
  35. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/parsers/parser/zuiyou.py +0 -0
  36. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/__init__.py +0 -0
  37. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/bilibili.py +0 -0
  38. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/coolapk.py +0 -0
  39. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/douyin.py +0 -0
  40. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/instagram.py +0 -0
  41. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/kuaishou.py +0 -0
  42. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/pipix.py +0 -0
  43. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/threads.py +0 -0
  44. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/tieba.py +0 -0
  45. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/twitter.py +0 -0
  46. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/weibo.py +0 -0
  47. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/weixin.py +0 -0
  48. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/xhs.py +0 -0
  49. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/xiaoheihe.py +0 -0
  50. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/provider_api/zuiyou.py +0 -0
  51. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/types/__init__.py +0 -0
  52. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/types/callback.py +0 -0
  53. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/types/media_file.py +0 -0
  54. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/types/media_ref.py +0 -0
  55. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/types/platform.py +0 -0
  56. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/types/post.py +0 -0
  57. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/types/result.py +0 -0
  58. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/utils/downloader.py +0 -0
  59. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/utils/media_info.py +0 -0
  60. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub/utils/utils.py +0 -0
  61. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub.egg-info/SOURCES.txt +0 -0
  62. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub.egg-info/dependency_links.txt +0 -0
  63. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub.egg-info/requires.txt +0 -0
  64. {parsehub-2.0.16 → parsehub-2.0.17}/src/parsehub.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: parsehub
3
- Version: 2.0.16
3
+ Version: 2.0.17
4
4
  Summary: 轻量、异步、开箱即用的社交媒体聚合解析库
5
5
  Author-email: 梓澪 <zilingmio@gmail.com>
6
6
  License: MIT
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "parsehub"
3
- version = "2.0.16"
3
+ version = "2.0.17"
4
4
  description = "轻量、异步、开箱即用的社交媒体聚合解析库"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12.0"
@@ -4,14 +4,11 @@ from pathlib import Path
4
4
  from typing import Self, Union
5
5
 
6
6
  from ... import ProgressCallback
7
- from ...config import GlobalConfig
8
7
  from ...provider_api.tiktok import TikTokWebCrawler
9
8
  from ...types import (
10
9
  DownloadResult,
11
10
  ImageParseResult,
12
11
  ImageRef,
13
- LivePhotoRef,
14
- MultimediaParseResult,
15
12
  ParseError,
16
13
  Platform,
17
14
  VideoParseResult,
@@ -26,7 +23,7 @@ class TikTokParser(BaseParser):
26
23
  __match__ = r"^(http(s)?://)?.+tiktok.com/(?!share/user|qishui).+"
27
24
  __redirect_keywords__ = ["vt.tiktok"]
28
25
 
29
- async def _do_parse(self, raw_url: str) -> Union["VideoParseResult", "ImageParseResult", "MultimediaParseResult"]:
26
+ async def _do_parse(self, raw_url: str) -> Union["VideoParseResult", "ImageParseResult"]:
30
27
  result = await self._fetch_api_result(raw_url)
31
28
 
32
29
  match result.type:
@@ -72,7 +69,6 @@ class TikTokVideoParseResult(VideoParseResult):
72
69
  headers: dict | None = None,
73
70
  ) -> "DownloadResult":
74
71
  headers = {
75
- "User-Agent": GlobalConfig.ua,
76
72
  "Referer": "https://www.tiktok.com/",
77
73
  }
78
74
  return await super()._do_download(
@@ -85,9 +81,31 @@ class TikTokVideoParseResult(VideoParseResult):
85
81
  )
86
82
 
87
83
 
88
- def first_url(data: dict | None) -> str | None:
89
- url_list = (data or {}).get("url_list") or (data or {}).get("UrlList") or []
90
- return next((url for url in url_list if url), None)
84
+ def media_urls(data: dict | str | list | None) -> list[str]:
85
+ if isinstance(data, str):
86
+ if data.startswith("//"):
87
+ return [f"https:{data}"]
88
+ return [data] if data.startswith(("http://", "https://")) else []
89
+ if isinstance(data, list):
90
+ result = []
91
+ for item in data:
92
+ result.extend(media_urls(item))
93
+ return result
94
+ if not isinstance(data, dict):
95
+ return []
96
+ for key in ("url_list", "UrlList", "urlList"):
97
+ if key in data:
98
+ return media_urls(data[key])
99
+ return []
100
+
101
+
102
+ def first_url(data: dict | str | list | None) -> str | None:
103
+ return next(iter(media_urls(data)), None)
104
+
105
+
106
+ def preferred_video_url(data: dict | str | list | None) -> str | None:
107
+ urls = media_urls(data)
108
+ return next((url for url in urls if "aweme" in url), None) or (urls[0] if urls else None)
91
109
 
92
110
 
93
111
  def as_int(value) -> int:
@@ -102,8 +120,7 @@ def pick_cover(video_data: dict) -> str | None:
102
120
  cover_url = first_url(video_data.get(key))
103
121
  if cover_url:
104
122
  return cover_url
105
- cover = video_data.get("cover")
106
- return cover if isinstance(cover, str) else None
123
+ return None
107
124
 
108
125
 
109
126
  def parse_video_info(video_data: dict) -> dict:
@@ -112,7 +129,7 @@ def parse_video_info(video_data: dict) -> dict:
112
129
 
113
130
  for bit_rate in bit_rates:
114
131
  play_addr = bit_rate.get("play_addr") or bit_rate.get("PlayAddr") or {}
115
- video_url = first_url(play_addr)
132
+ video_url = preferred_video_url(play_addr)
116
133
  if not video_url:
117
134
  continue
118
135
 
@@ -135,21 +152,39 @@ def parse_video_info(video_data: dict) -> dict:
135
152
 
136
153
  if not candidates:
137
154
  play_addr = video_data.get("play_addr") or video_data.get("playAddr") or {}
138
- video_url = first_url(play_addr)
155
+ video_url = preferred_video_url(play_addr)
139
156
  if video_url:
140
- width = as_int(play_addr.get("width") or video_data.get("width"))
141
- height = as_int(play_addr.get("height") or video_data.get("height"))
157
+ play_addr_data = play_addr if isinstance(play_addr, dict) else {}
158
+ width = as_int(play_addr_data.get("width") or video_data.get("width"))
159
+ height = as_int(play_addr_data.get("height") or video_data.get("height"))
142
160
  candidates.append(
143
161
  {
144
162
  "video_url": video_url,
145
163
  "thumb_url": pick_cover(video_data),
146
- "duration": as_int(play_addr.get("duration") or video_data.get("duration")),
164
+ "duration": as_int(play_addr_data.get("duration") or video_data.get("duration")),
147
165
  "width": width,
148
166
  "height": height,
149
167
  "quality": (width * height, 0, 0),
150
168
  }
151
169
  )
152
170
 
171
+ if not candidates:
172
+ download_addr = video_data.get("download_addr") or video_data.get("downloadAddr") or {}
173
+ video_url = preferred_video_url(download_addr)
174
+ if video_url:
175
+ width = as_int(video_data.get("width"))
176
+ height = as_int(video_data.get("height"))
177
+ candidates.append(
178
+ {
179
+ "video_url": video_url,
180
+ "thumb_url": pick_cover(video_data),
181
+ "duration": as_int(video_data.get("duration")),
182
+ "width": width,
183
+ "height": height,
184
+ "quality": (width * height, -1, 0),
185
+ }
186
+ )
187
+
153
188
  if not candidates:
154
189
  raise ParseError("TikTok 解析失败: 未获取到无水印视频下载地址")
155
190
 
@@ -166,7 +201,7 @@ class TikTokApiResult:
166
201
  type: TikTokMediaType
167
202
  video: VideoRef = None
168
203
  desc: str = ""
169
- image_list: list[ImageRef | LivePhotoRef] = None
204
+ image_list: list[ImageRef] = None
170
205
 
171
206
  @classmethod
172
207
  def parse(cls, json_dict: dict) -> Self:
@@ -184,7 +219,13 @@ class TikTokApiResult:
184
219
  image_list = []
185
220
 
186
221
  for image in image_post_info.get("images", []):
187
- display_image = image.get("display_image") or image.get("displayImage") or image.get("image") or {}
222
+ display_image = (
223
+ image.get("display_image")
224
+ or image.get("displayImage")
225
+ or image.get("imageURL")
226
+ or image.get("image")
227
+ or {}
228
+ )
188
229
  url = first_url(display_image)
189
230
  if url:
190
231
  image_list.append(
@@ -0,0 +1,245 @@
1
+ import asyncio
2
+ import html
3
+ import json
4
+ import re
5
+ from typing import Any, NamedTuple
6
+ from urllib.parse import urlencode, urlparse
7
+
8
+ import httpx
9
+
10
+ from ..config import GlobalConfig
11
+
12
+ TIKTOK_APP_FEED = "https://api22-normal-c-alisg.tiktokv.com/aweme/v1/feed/"
13
+
14
+ FACEBOOK_EXTERNAL_HIT_UA = "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)"
15
+ UNIVERSAL_DATA_RE = re.compile(
16
+ r'<script[^>]+id=["\']__UNIVERSAL_DATA_FOR_REHYDRATION__["\'][^>]*>(?P<json>.*?)</script>',
17
+ re.DOTALL,
18
+ )
19
+
20
+ TIKTOK_HEADERS = {
21
+ "User-Agent": GlobalConfig.ua,
22
+ "Referer": "https://www.tiktok.com/",
23
+ "x-ladon": "Hello From Evil0ctal!",
24
+ }
25
+
26
+ TIKTOK_WEB_HEADERS = {
27
+ "User-Agent": GlobalConfig.ua,
28
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
29
+ "Accept-Language": "en-US,en;q=0.9",
30
+ "Referer": "https://www.tiktok.com/",
31
+ }
32
+
33
+
34
+ class TikTokItemRef(NamedTuple):
35
+ media_type: str
36
+ aweme_id: str
37
+
38
+
39
+ class TikTokWebCrawler:
40
+ _ITEM = re.compile(r"/(?P<media_type>video|photo)/(?P<aweme_id>\d+)")
41
+ _URL = re.compile(r"https?://\S+")
42
+
43
+ def __init__(
44
+ self,
45
+ cookie: dict | None = None,
46
+ proxy: str | None = None,
47
+ user_agent: str | None = None,
48
+ max_retries: int = 3,
49
+ timeout: int = 15,
50
+ ):
51
+ self.headers = dict(TIKTOK_HEADERS)
52
+ if user_agent:
53
+ self.headers["User-Agent"] = user_agent
54
+ self.cookies = httpx.Cookies()
55
+ for key, value in (cookie or {}).items():
56
+ self.cookies.set(str(key), "" if value is None else str(value))
57
+ self.proxy = proxy
58
+ self.max_retries = max_retries
59
+ self.timeout = timeout
60
+
61
+ async def parse(self, url: str) -> dict:
62
+ item_ref = None
63
+ primary_error: Exception | None = None
64
+
65
+ try:
66
+ resolved_url = await self.resolve_url(url)
67
+ item_ref = self.extract_item_ref_from_url(resolved_url)
68
+ if not item_ref:
69
+ raise ValueError(f"无法从链接中提取作品 ID: {resolved_url}")
70
+ return await self.fetch_one_video(item_ref.aweme_id)
71
+ except Exception as exc:
72
+ primary_error = exc
73
+
74
+ if item_ref and item_ref.media_type == "photo":
75
+ raise RuntimeError(f"获取 TikTok 图文作品失败: {primary_error}") from primary_error
76
+
77
+ try:
78
+ return await self.fetch_video_from_web(url, expected_aweme_id=item_ref.aweme_id if item_ref else None)
79
+ except Exception as web_error:
80
+ raise RuntimeError(f"获取 TikTok 作品失败: feed={primary_error}; web={web_error}") from web_error
81
+
82
+ def _client(self, *, headers: dict[str, str] | None = None) -> httpx.AsyncClient:
83
+ return httpx.AsyncClient(
84
+ headers=headers or self.headers,
85
+ timeout=self.timeout,
86
+ follow_redirects=True,
87
+ proxy=self.proxy,
88
+ cookies=self.cookies,
89
+ )
90
+
91
+ @classmethod
92
+ def extract_url(cls, text: str) -> str:
93
+ text = html.unescape(text.strip())
94
+ markdown_match = re.search(r"]\((https?://[^)]+)\)", text)
95
+ if markdown_match:
96
+ return markdown_match.group(1).rstrip(".,;,。;'\")]})】>」』")
97
+ match = cls._URL.search(text)
98
+ if not match:
99
+ raise ValueError("未找到 TikTok URL")
100
+ return match.group(0).rstrip(".,;,。;'\")]})】>」』")
101
+
102
+ @classmethod
103
+ def extract_item_ref_from_url(cls, url: str) -> TikTokItemRef | None:
104
+ match = cls._ITEM.search(url)
105
+ if not match:
106
+ return None
107
+ return TikTokItemRef(match.group("media_type"), match.group("aweme_id"))
108
+
109
+ async def resolve_url(self, url_or_text: str) -> str:
110
+ url = self.extract_url(url_or_text)
111
+ if self.extract_item_ref_from_url(url):
112
+ return url
113
+
114
+ async with self._client() as client:
115
+ response = await client.get(url)
116
+ response.raise_for_status()
117
+ resolved = str(response.url)
118
+
119
+ if "notfound" in resolved.lower():
120
+ raise ValueError("TikTok 页面不可用,可能是地区、代理或链接问题")
121
+ return resolved
122
+
123
+ async def resolve_web_url(self, url_or_text: str) -> str:
124
+ url = self.extract_url(url_or_text)
125
+ if self.extract_item_ref_from_url(url):
126
+ return url
127
+
128
+ headers = dict(TIKTOK_WEB_HEADERS)
129
+ headers["User-Agent"] = FACEBOOK_EXTERNAL_HIT_UA
130
+ async with self._client(headers=headers) as client:
131
+ response = await client.head(url)
132
+ if response.status_code >= 400:
133
+ response = await client.get(url)
134
+ response.raise_for_status()
135
+ return str(response.url)
136
+
137
+ async def fetch_one_video(self, aweme_id: str) -> dict[str, Any]:
138
+ params = {
139
+ "iid": 7318518857994389254,
140
+ "device_id": 7318517321748022790,
141
+ "channel": "googleplay",
142
+ "app_name": "musical_ly",
143
+ "version_code": "300904",
144
+ "device_platform": "android",
145
+ "device_type": "SM-ASUS_Z01QD",
146
+ "os_version": "9",
147
+ "aweme_id": aweme_id,
148
+ }
149
+ endpoint = f"{TIKTOK_APP_FEED}?{urlencode(params)}"
150
+ last_error: Exception | None = None
151
+
152
+ for attempt in range(self.max_retries):
153
+ try:
154
+ async with self._client() as client:
155
+ response = await client.get(endpoint)
156
+ response.raise_for_status()
157
+ payload = response.json()
158
+
159
+ aweme_list = payload.get("aweme_list") or []
160
+ for item in aweme_list:
161
+ if str(item.get("aweme_id")) == str(aweme_id):
162
+ return item
163
+
164
+ if aweme_list:
165
+ first_id = aweme_list[0].get("aweme_id")
166
+ raise RuntimeError(f"返回作品 ID 不匹配: expected={aweme_id}, got={first_id}")
167
+
168
+ status_msg = payload.get("status_msg") or payload.get("statusMessage") or payload
169
+ raise RuntimeError(f"接口未返回 aweme_list: {status_msg}")
170
+ except Exception as exc:
171
+ last_error = exc
172
+ if attempt + 1 < self.max_retries:
173
+ await asyncio.sleep(1)
174
+
175
+ raise RuntimeError(f"获取 TikTok 作品失败: {last_error}")
176
+
177
+ async def fetch_video_from_web(self, url_or_text: str, expected_aweme_id: str | None = None) -> dict[str, Any]:
178
+ url = await self.resolve_web_url(url_or_text)
179
+ item_ref = self.extract_item_ref_from_url(url)
180
+ if not item_ref:
181
+ raise ValueError(f"无法从链接中提取作品 ID: {url}")
182
+ if item_ref.media_type == "photo":
183
+ raise ValueError("TikTok 图文作品不支持 Web hydration fallback")
184
+
185
+ webpage = await self.download_webpage(url)
186
+ universal_data = self._search_universal_data(webpage)
187
+ if not universal_data:
188
+ raise RuntimeError("无法从页面提取 __UNIVERSAL_DATA_FOR_REHYDRATION__")
189
+
190
+ item = self._extract_web_item(universal_data)
191
+ item_id = str(item.get("aweme_id") or item.get("id") or "")
192
+ expected_id = str(expected_aweme_id or item_ref.aweme_id)
193
+ if item_id and item_id != expected_id:
194
+ raise RuntimeError(f"返回作品 ID 不匹配: expected={expected_id}, got={item_id}")
195
+ if item_id and not item.get("aweme_id"):
196
+ item["aweme_id"] = item_id
197
+ return item
198
+
199
+ async def download_webpage(self, url: str) -> str:
200
+ async with self._client(headers=TIKTOK_WEB_HEADERS) as client:
201
+ last_webpage = ""
202
+ for attempt in range(self.max_retries):
203
+ response = await client.get(url)
204
+ if urlparse(str(response.url)).path == "/login":
205
+ raise RuntimeError("TikTok 要求登录才能访问这个内容")
206
+ response.raise_for_status()
207
+ webpage = response.text
208
+ if self._search_universal_data(webpage):
209
+ return webpage
210
+ last_webpage = webpage
211
+ if attempt + 1 < self.max_retries:
212
+ await asyncio.sleep(1)
213
+ return last_webpage
214
+
215
+ @staticmethod
216
+ def _search_universal_data(webpage: str) -> dict[str, Any]:
217
+ match = UNIVERSAL_DATA_RE.search(webpage)
218
+ if not match:
219
+ return {}
220
+ raw = html.unescape(match.group("json")).strip()
221
+ if not raw:
222
+ return {}
223
+ try:
224
+ data = json.loads(raw)
225
+ except json.JSONDecodeError:
226
+ return {}
227
+ return data.get("__DEFAULT_SCOPE__") or {}
228
+
229
+ @staticmethod
230
+ def _extract_web_item(universal_data: dict[str, Any]) -> dict[str, Any]:
231
+ detail = universal_data.get("webapp.video-detail") or {}
232
+ status = detail.get("statusCode") or 0
233
+ try:
234
+ status = int(status)
235
+ except (TypeError, ValueError):
236
+ status = 0
237
+ item = ((detail.get("itemInfo") or {}).get("itemStruct")) or {}
238
+ if item:
239
+ return item
240
+ if status in (10216, 10222):
241
+ raise RuntimeError("这个 TikTok 内容需要登录或无权访问")
242
+ if status == 10204:
243
+ raise RuntimeError("当前 IP 被 TikTok 阻止访问这个内容")
244
+ status_msg = detail.get("statusMsg") or detail.get("statusMessage") or status
245
+ raise RuntimeError(f"页面中没有作品详情,status={status_msg}")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: parsehub
3
- Version: 2.0.16
3
+ Version: 2.0.17
4
4
  Summary: 轻量、异步、开箱即用的社交媒体聚合解析库
5
5
  Author-email: 梓澪 <zilingmio@gmail.com>
6
6
  License: MIT
@@ -1,124 +0,0 @@
1
- import asyncio
2
- import re
3
- from typing import Any
4
- from urllib.parse import urlencode
5
-
6
- import httpx
7
-
8
- from ..config import GlobalConfig
9
-
10
- TIKTOK_APP_FEED = "https://api22-normal-c-alisg.tiktokv.com/aweme/v1/feed/"
11
-
12
- TIKTOK_HEADERS = {
13
- "User-Agent": GlobalConfig.ua,
14
- "Referer": "https://www.tiktok.com/",
15
- "x-ladon": "Hello From Evil0ctal!",
16
- }
17
-
18
-
19
- class TikTokWebCrawler:
20
- _ITEM = re.compile(r"/(?:video|photo)/(\d+)")
21
- _URL = re.compile(r"https?://\S+")
22
-
23
- def __init__(
24
- self,
25
- cookie: str = None,
26
- proxy: str = None,
27
- user_agent: str = None,
28
- max_retries: int = 3,
29
- timeout: int = 15,
30
- ):
31
- self.headers = dict(TIKTOK_HEADERS)
32
- if user_agent:
33
- self.headers["User-Agent"] = user_agent
34
- if cookie:
35
- self.headers["Cookie"] = cookie
36
-
37
- self.proxy = proxy
38
- self.max_retries = max_retries
39
- self.timeout = timeout
40
-
41
- async def parse(self, url: str) -> dict:
42
- aweme_id = await self.get_aweme_id(url)
43
- return await self.fetch_one_video(aweme_id)
44
-
45
- def _client(self) -> httpx.AsyncClient:
46
- return httpx.AsyncClient(
47
- headers=self.headers,
48
- timeout=self.timeout,
49
- follow_redirects=True,
50
- proxy=self.proxy,
51
- )
52
-
53
- @classmethod
54
- def extract_url(cls, text: str) -> str:
55
- match = cls._URL.search(text)
56
- if not match:
57
- raise ValueError("未找到 TikTok URL")
58
- return match.group(0).rstrip(".,;,。;'\")]})】>」』")
59
-
60
- @classmethod
61
- def extract_aweme_id_from_url(cls, url: str) -> str | None:
62
- match = cls._ITEM.search(url)
63
- return match.group(1) if match else None
64
-
65
- async def resolve_url(self, url_or_text: str) -> str:
66
- url = self.extract_url(url_or_text)
67
- if self.extract_aweme_id_from_url(url):
68
- return url
69
-
70
- async with self._client() as client:
71
- response = await client.get(url)
72
- response.raise_for_status()
73
- resolved = str(response.url)
74
-
75
- if "notfound" in resolved.lower():
76
- raise ValueError("TikTok 页面不可用,可能是地区、代理或链接问题")
77
- return resolved
78
-
79
- async def get_aweme_id(self, url_or_text: str) -> str:
80
- resolved_url = await self.resolve_url(url_or_text)
81
- aweme_id = self.extract_aweme_id_from_url(resolved_url)
82
- if not aweme_id:
83
- raise ValueError(f"无法从链接中提取作品 ID: {resolved_url}")
84
- return aweme_id
85
-
86
- async def fetch_one_video(self, aweme_id: str) -> dict[str, Any]:
87
- params = {
88
- "iid": 7318518857994389254,
89
- "device_id": 7318517321748022790,
90
- "channel": "googleplay",
91
- "app_name": "musical_ly",
92
- "version_code": "300904",
93
- "device_platform": "android",
94
- "device_type": "SM-ASUS_Z01QD",
95
- "os_version": "9",
96
- "aweme_id": aweme_id,
97
- }
98
- endpoint = f"{TIKTOK_APP_FEED}?{urlencode(params)}"
99
- last_error: Exception | None = None
100
-
101
- for attempt in range(self.max_retries):
102
- try:
103
- async with self._client() as client:
104
- response = await client.get(endpoint)
105
- response.raise_for_status()
106
- payload = response.json()
107
-
108
- aweme_list = payload.get("aweme_list") or []
109
- for item in aweme_list:
110
- if str(item.get("aweme_id")) == str(aweme_id):
111
- return item
112
-
113
- if aweme_list:
114
- first_id = aweme_list[0].get("aweme_id")
115
- raise RuntimeError(f"返回作品 ID 不匹配: expected={aweme_id}, got={first_id}")
116
-
117
- status_msg = payload.get("status_msg") or payload.get("statusMessage") or payload
118
- raise RuntimeError(f"接口未返回 aweme_list: {status_msg}")
119
- except Exception as exc:
120
- last_error = exc
121
- if attempt + 1 < self.max_retries:
122
- await asyncio.sleep(1)
123
-
124
- raise RuntimeError(f"获取 TikTok 作品失败: {last_error}")
File without changes
File without changes
File without changes
File without changes