parsehub 2.0.10__tar.gz → 2.0.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. {parsehub-2.0.10/src/parsehub.egg-info → parsehub-2.0.12}/PKG-INFO +2 -2
  2. {parsehub-2.0.10 → parsehub-2.0.12}/README.md +1 -1
  3. {parsehub-2.0.10 → parsehub-2.0.12}/pyproject.toml +1 -1
  4. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/parsers/base/ytdlp.py +31 -33
  5. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/parsers/parser/twitter.py +19 -16
  6. parsehub-2.0.12/src/parsehub/provider_api/twitter.py +408 -0
  7. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/provider_api/xiaoheihe.py +81 -95
  8. {parsehub-2.0.10 → parsehub-2.0.12/src/parsehub.egg-info}/PKG-INFO +2 -2
  9. parsehub-2.0.10/src/parsehub/provider_api/twitter.py +0 -203
  10. {parsehub-2.0.10 → parsehub-2.0.12}/LICENSE +0 -0
  11. {parsehub-2.0.10 → parsehub-2.0.12}/setup.cfg +0 -0
  12. {parsehub-2.0.10 → parsehub-2.0.12}/src/__init__.py +0 -0
  13. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/__init__.py +0 -0
  14. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/config/__init__.py +0 -0
  15. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/config/config.py +0 -0
  16. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/errors.py +0 -0
  17. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/parsers/__init__.py +0 -0
  18. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/parsers/base/__init__.py +0 -0
  19. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/parsers/base/base.py +0 -0
  20. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/parsers/parser/__init__.py +0 -0
  21. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/parsers/parser/bilibili.py +0 -0
  22. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/parsers/parser/coolapk.py +0 -0
  23. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/parsers/parser/douyin.py +0 -0
  24. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/parsers/parser/facebook.py +0 -0
  25. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/parsers/parser/instagram.py +0 -0
  26. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/parsers/parser/kuaishou.py +0 -0
  27. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/parsers/parser/pipix.py +0 -0
  28. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/parsers/parser/threads.py +0 -0
  29. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/parsers/parser/tieba.py +0 -0
  30. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/parsers/parser/weibo.py +0 -0
  31. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/parsers/parser/weixin.py +0 -0
  32. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/parsers/parser/xhs.py +0 -0
  33. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/parsers/parser/xiaoheihe.py +0 -0
  34. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/parsers/parser/youtube.py +0 -0
  35. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/parsers/parser/zuiyou.py +0 -0
  36. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/provider_api/__init__.py +0 -0
  37. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/provider_api/bilibili.py +0 -0
  38. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/provider_api/coolapk.py +0 -0
  39. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/provider_api/instagram.py +0 -0
  40. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/provider_api/kuaishou.py +0 -0
  41. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/provider_api/pipix.py +0 -0
  42. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/provider_api/threads.py +0 -0
  43. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/provider_api/tieba.py +0 -0
  44. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/provider_api/weibo.py +0 -0
  45. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/provider_api/weixin.py +0 -0
  46. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/provider_api/xhs.py +0 -0
  47. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/provider_api/zuiyou.py +0 -0
  48. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/types/__init__.py +0 -0
  49. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/types/callback.py +0 -0
  50. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/types/media_file.py +0 -0
  51. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/types/media_ref.py +0 -0
  52. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/types/platform.py +0 -0
  53. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/types/post.py +0 -0
  54. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/types/result.py +0 -0
  55. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/utils/downloader.py +0 -0
  56. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/utils/media_info.py +0 -0
  57. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub/utils/utils.py +0 -0
  58. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub.egg-info/SOURCES.txt +0 -0
  59. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub.egg-info/dependency_links.txt +0 -0
  60. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub.egg-info/requires.txt +0 -0
  61. {parsehub-2.0.10 → parsehub-2.0.12}/src/parsehub.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: parsehub
3
- Version: 2.0.10
3
+ Version: 2.0.12
4
4
  Summary: 轻量、异步、开箱即用的社交媒体聚合解析库
5
5
  Author-email: 梓澪 <zilingmio@gmail.com>
6
6
  License: MIT
@@ -98,7 +98,7 @@ print(result)
98
98
 
99
99
  | 平台 | 视频 | 图文 | 其他 |
100
100
  |:----------------|:--:|:--:|:-----:|
101
- | **Twitter / X** | ✅ | ✅ | |
101
+ | **Twitter / X** | ✅ | ✅ | 📝 文章 |
102
102
  | **Instagram** | ✅ | ✅ | |
103
103
  | **YouTube** | ✅ | | 🎵 音乐 |
104
104
  | **Facebook** | ✅ | | |
@@ -61,7 +61,7 @@ print(result)
61
61
 
62
62
  | 平台 | 视频 | 图文 | 其他 |
63
63
  |:----------------|:--:|:--:|:-----:|
64
- | **Twitter / X** | ✅ | ✅ | |
64
+ | **Twitter / X** | ✅ | ✅ | 📝 文章 |
65
65
  | **Instagram** | ✅ | ✅ | |
66
66
  | **YouTube** | ✅ | | 🎵 音乐 |
67
67
  | **Facebook** | ✅ | | |
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "parsehub"
3
- version = "2.0.10"
3
+ version = "2.0.12"
4
4
  description = "轻量、异步、开箱即用的社交媒体聚合解析库"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12.0"
@@ -150,33 +150,33 @@ class YtVideoParseResult(VideoParseResult):
150
150
  # paramss["format"] = "worstvideo* + worstaudio / worst"
151
151
 
152
152
  if callback:
153
- loop = asyncio.get_running_loop()
154
- progress_mode = "bytes"
155
-
156
- def _progress_hook(d):
157
- nonlocal progress_mode
158
- if d["status"] == "downloading":
159
- # 已知问题: yt-dlp 返回的总进度不统一
160
- downloaded = int(d.get("downloaded_bytes", 0))
161
- total = int(d.get("total_bytes") or d.get("total_bytes_estimate") or 0)
162
- if total and progress_mode == "bytes":
163
- asyncio.run_coroutine_threadsafe(
164
- callback(downloaded, total, "bytes", *callback_args, **callback_kwargs),
165
- loop,
166
- )
167
- else:
168
- progress_mode = "count"
169
- asyncio.run_coroutine_threadsafe(
170
- callback(0, 1, "count", *callback_args, **callback_kwargs),
171
- loop,
172
- )
173
- elif d["status"] == "finished" and progress_mode == "count":
174
- asyncio.run_coroutine_threadsafe(
175
- callback(1, 1, "count", *callback_args, **callback_kwargs),
176
- loop,
177
- )
178
-
179
- paramss["progress_hooks"] = [_progress_hook]
153
+ # 已知问题: yt-dlp 返回的总进度不统一, 暂使用 count 进度
154
+ await callback(0, 1, "count", *callback_args, **callback_kwargs)
155
+ # loop = asyncio.get_running_loop()
156
+ # progress_mode = "bytes"
157
+ # def _progress_hook(d):
158
+ # nonlocal progress_mode
159
+ # if d["status"] == "downloading":
160
+ # downloaded = int(d.get("downloaded_bytes", 0))
161
+ # total = int(d.get("total_bytes") or d.get("total_bytes_estimate") or 0)
162
+ # if total and progress_mode == "bytes":
163
+ # asyncio.run_coroutine_threadsafe(
164
+ # callback(downloaded, total, "bytes", *callback_args, **callback_kwargs),
165
+ # loop,
166
+ # )
167
+ # else:
168
+ # progress_mode = "count"
169
+ # asyncio.run_coroutine_threadsafe(
170
+ # callback(0, 1, "count", *callback_args, **callback_kwargs),
171
+ # loop,
172
+ # )
173
+ # elif d["status"] == "finished" and progress_mode == "count":
174
+ # asyncio.run_coroutine_threadsafe(
175
+ # callback(1, 1, "count", *callback_args, **callback_kwargs),
176
+ # loop,
177
+ # )
178
+ #
179
+ # paramss["progress_hooks"] = [_progress_hook]
180
180
 
181
181
  await self._run_download(paramss)
182
182
 
@@ -184,6 +184,9 @@ class YtVideoParseResult(VideoParseResult):
184
184
  if not v:
185
185
  raise DownloadError("下载失败 -1")
186
186
 
187
+ if callback:
188
+ await callback(1, 1, "count", *callback_args, **callback_kwargs)
189
+
187
190
  video_path = v[0]
188
191
  return DownloadResult(
189
192
  VideoFile(
@@ -200,12 +203,7 @@ class YtVideoParseResult(VideoParseResult):
200
203
  raise DownloadError("下载失败 -2")
201
204
 
202
205
  try:
203
- await asyncio.wait_for(
204
- asyncio.to_thread(download_video, paramss, [self.dl.url]),
205
- timeout=300,
206
- )
207
- except TimeoutError as e:
208
- raise DownloadError("下载超时") from e
206
+ await asyncio.to_thread(download_video, paramss, [self.dl.url])
209
207
  except RuntimeError as e:
210
208
  error = str(e)
211
209
  if any(
@@ -7,7 +7,7 @@ from ...provider_api.twitter import (
7
7
  TwitterTweet,
8
8
  TwitterVideo,
9
9
  )
10
- from ...types import AniRef, ImageRef, MultimediaParseResult, ParseError, Platform, VideoRef
10
+ from ...types import AniRef, ImageRef, MultimediaParseResult, ParseError, Platform, RichTextParseResult, VideoRef
11
11
  from ...utils.utils import cookie_ellipsis
12
12
  from ..base.base import BaseParser
13
13
 
@@ -48,21 +48,24 @@ class TwitterParser(BaseParser):
48
48
  @staticmethod
49
49
  async def media_parse(tweet: TwitterTweet):
50
50
  media = []
51
- for m in tweet.media:
52
- match m:
53
- case TwitterPhoto():
54
- path = ImageRef(url=m.url, height=m.height, width=m.width, thumb_url=m.thumb_url)
55
- case TwitterVideo():
56
- path = VideoRef(
57
- url=m.url,
58
- height=m.height,
59
- width=m.width,
60
- duration=int(m.duration_millis / 1000),
61
- thumb_url=m.thumb_url,
62
- )
63
- case TwitterAni():
64
- path = AniRef(url=m.url, ext="mp4", height=m.height, width=m.width, thumb_url=m.thumb_url)
65
- media.append(path)
51
+ if tweet.media:
52
+ for m in tweet.media:
53
+ match m:
54
+ case TwitterPhoto():
55
+ path = ImageRef(url=m.url, height=m.height, width=m.width, thumb_url=m.thumb_url)
56
+ case TwitterVideo():
57
+ path = VideoRef(
58
+ url=m.url,
59
+ height=m.height,
60
+ width=m.width,
61
+ duration=int(m.duration_millis / 1000),
62
+ thumb_url=m.thumb_url,
63
+ )
64
+ case TwitterAni():
65
+ path = AniRef(url=m.url, ext="mp4", height=m.height, width=m.width, thumb_url=m.thumb_url)
66
+ media.append(path)
67
+ if article := tweet.article:
68
+ return RichTextParseResult(markdown_content=article.content, title=article.title, media=media)
66
69
  return MultimediaParseResult(content=tweet.full_text, media=media)
67
70
 
68
71
 
@@ -0,0 +1,408 @@
1
+ import re
2
+ from dataclasses import dataclass
3
+ from typing import Literal, NamedTuple, Union
4
+
5
+ import httpx
6
+ from loguru import logger
7
+
8
+ from ..config import GlobalConfig
9
+ from ..types import ParseError
10
+
11
+
12
+ class Twitter:
13
+ def __init__(self, proxy: str | None = None, cookie: dict = None):
14
+ self.proxy = proxy
15
+ self.authorization = (
16
+ "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOu"
17
+ "H5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA"
18
+ )
19
+ self.cookie = cookie
20
+
21
+ async def fetch_tweet(self, url: str) -> "TwitterTweet":
22
+ tweet_id = self.get_id_by_url(url)
23
+ headers = {
24
+ "accept-language": "zh-CN,zh;q=0.9",
25
+ "authorization": self.authorization,
26
+ "content-type": "application/json",
27
+ "user-agent": GlobalConfig.ua,
28
+ "x-guest-token": await self.get_guest_token(url),
29
+ "x-twitter-active-user": "yes",
30
+ "x-twitter-client-language": "zh-cn",
31
+ }
32
+
33
+ cookie = None
34
+ if self.cookie and self.check_cookie():
35
+ headers["x-csrf-token"] = self.cookie.get("ct0")
36
+ cookie = self.cookie
37
+
38
+ params = {
39
+ "variables": f'{{"tweetId":"{tweet_id}","withComm'
40
+ f'unity":false,"includePromotedContent":false,"withVoice":false}}',
41
+ "features": '{"creator_subscriptions_tweet_preview_api_enabled":true,'
42
+ '"communities_web_enable_tweet_community_results_fetch":true,'
43
+ '"c9s_tweet_anatomy_moderator_badge_enabled":true,"tweetypie_unmention_optimization_enabled":true,'
44
+ '"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled"'
45
+ ":true,"
46
+ '"view_counts_everywhere_api_enabled":true,"longform_notetweets_consumption_enabled":true,'
47
+ '"responsive_web_twitter_article_tweet_consumption_enabled":true,"tweet_awards_web_tipping_enabled":false,'
48
+ '"creator_subscriptions_quote_tweet_preview_enabled":false,"freedom_of_speech_not_reach_fetch_enabled"'
49
+ ":true,"
50
+ '"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enable'
51
+ 'd":true,'
52
+ '"tweet_with_visibility_results_prefer_gql_media_interstitial_enabled":false,"rweb_video_timestamps_enabled'
53
+ '":true,'
54
+ '"longform_notetweets_rich_text_read_enabled":true,"longform_notetweets_inline_media_enabled":true,'
55
+ '"rweb_tipjar_consumption_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_'
56
+ 'phone_label_enabled":false,'
57
+ '"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"responsive_web_graphql_timeline'
58
+ '_navigation_enabled":true,'
59
+ '"responsive_web_enhance_cards_enabled":false}',
60
+ "fieldToggles": '{"withArticleRichContentState":true,"withArticlePlainText":false}',
61
+ }
62
+
63
+ async with httpx.AsyncClient(proxy=self.proxy) as client:
64
+ response = await client.get(
65
+ "https://api.twitter.com/graphql/kPLTRmMnzbPTv70___D06w/TweetResultByRestId",
66
+ params=params,
67
+ headers=headers,
68
+ cookies=cookie,
69
+ )
70
+ response.raise_for_status()
71
+ return self.parse(response.json())
72
+
73
+ def parse(self, result: dict):
74
+ if e := result.get("errors"):
75
+ raise Exception(f"error -1: {e[0]['message']}")
76
+
77
+ result = result["data"]["tweetResult"].get("result")
78
+ if not result:
79
+ raise ParseError("error -4: 帖子或用户不存在")
80
+
81
+ if tweet := result.get("tweet"):
82
+ tweet_id = tweet.get("rest_id", {})
83
+ legacy: dict = tweet.get("legacy")
84
+ else:
85
+ tweet_id = result.get("rest_id", {})
86
+ legacy = result.get("legacy")
87
+
88
+ if not legacy:
89
+ if result.get("__typename") == "TweetTombstone":
90
+ raise Exception("error -2: 该推文开启了限制, 匿名用户无法查看")
91
+ raise Exception(f"error -3: {result.get('reason')}")
92
+
93
+ if article := result.get("article", {}):
94
+ ta = ArticleRenderer(article["article_results"]["result"]).render()
95
+ return TwitterTweet(tweet_id=tweet_id, article=ta)
96
+
97
+ if note_tweet := result.get("note_tweet"):
98
+ full_text = note_tweet.get("note_tweet_results", {}).get("result", {}).get("text", None)
99
+ if not full_text:
100
+ full_text = legacy.get("full_text", "")
101
+ else:
102
+ full_text = legacy.get("full_text", "")
103
+
104
+ media = legacy["entities"].get("media", [])
105
+ media_list = []
106
+ for i in media:
107
+ original_info = i.get("original_info", {})
108
+ height = original_info.get("height", 0)
109
+ width = original_info.get("width", 0)
110
+ media_url_https = i["media_url_https"]
111
+
112
+ match i["type"]:
113
+ case "photo":
114
+ media_list.append(
115
+ TwitterPhoto(
116
+ url=self._build_img_url(media_url_https, "orig"),
117
+ width=width,
118
+ height=height,
119
+ thumb_url=self._build_img_url(media_url_https, "small"),
120
+ )
121
+ )
122
+ case "video":
123
+ video_info = i.get("video_info", {})
124
+ media_list.append(
125
+ TwitterVideo(
126
+ url=video_info["variants"][-1]["url"],
127
+ height=height,
128
+ width=width,
129
+ duration_millis=video_info.get("duration_millis", 0),
130
+ thumb_url=self._build_img_url(media_url_https, "medium"),
131
+ )
132
+ )
133
+ case "animated_gif":
134
+ media_list.append(
135
+ TwitterAni(
136
+ url=i["video_info"]["variants"][-1]["url"],
137
+ height=height,
138
+ width=width,
139
+ thumb_url=self._build_img_url(media_url_https, "small"),
140
+ )
141
+ )
142
+
143
+ return TwitterTweet(tweet_id=tweet_id, full_text=full_text, media=media_list or None)
144
+
145
+ @staticmethod
146
+ def _build_img_url(url: str, size: Literal["orig", "large", "medium", "small", "thumb"]):
147
+ p = "&" if "?" in url else "?"
148
+ return f"{url}{p}name={size}"
149
+
150
+ @staticmethod
151
+ def get_id_by_url(url: str):
152
+ return re.search(r"status/(\d+)", url)[1]
153
+
154
+ async def get_guest_token(self, url: str):
155
+ async with httpx.AsyncClient(proxy=self.proxy) as client:
156
+ response = await client.post(url)
157
+ response.raise_for_status()
158
+ guest_token = re.search(r'cookie="gt=(\d+);', response.text)
159
+ if not guest_token:
160
+ raise Exception("error -5: 获取 guest_token 失败")
161
+ return guest_token[1]
162
+
163
+ def check_cookie(self):
164
+ if not self.cookie.get("ct0"):
165
+ logger.warning("cookie 缺少必要参数: ct0")
166
+ return False
167
+ if not self.cookie.get("auth_token"):
168
+ logger.warning("cookie 缺少必要参数: auth_token")
169
+ return False
170
+ return True
171
+
172
+
173
+ class TwitterTweet:
174
+ def __init__(
175
+ self,
176
+ tweet_id: str,
177
+ full_text: str | None = None,
178
+ media: list[Union["TwitterVideo", "TwitterPhoto", "TwitterAni"]] | None = None,
179
+ article: Union["TwitterArticle"] = None,
180
+ ):
181
+ self.tweet_id = tweet_id
182
+ self.full_text = re.sub(r"https://t\.co/[^\s,]+$", "", full_text) if media else full_text
183
+ self.media = media
184
+ self.article = article
185
+
186
+
187
+ @dataclass
188
+ class TwitterArticle:
189
+ title: str
190
+ content: str
191
+ media: list[Union["TwitterVideo", "TwitterPhoto"]] | None = None
192
+
193
+
194
+ @dataclass
195
+ class TwitterVideo:
196
+ url: str
197
+ height: int
198
+ width: int
199
+ duration_millis: int
200
+ thumb_url: str | None = None
201
+
202
+
203
+ @dataclass
204
+ class TwitterPhoto:
205
+ url: str
206
+ height: int
207
+ width: int
208
+ thumb_url: str | None = None
209
+
210
+
211
+ @dataclass
212
+ class TwitterAni:
213
+ url: str
214
+ height: int
215
+ width: int
216
+ thumb_url: str | None = None
217
+
218
+
219
+ class _Insertion(NamedTuple):
220
+ """待插入原文的 Markdown 标记。"""
221
+
222
+ idx: int
223
+ text: str
224
+ kind: str # "start" | "end" | "atomic"
225
+ length: int = 0
226
+
227
+
228
+ class ArticleRenderer:
229
+ """将 Twitter Article JSON 解析并渲染为 Markdown。"""
230
+
231
+ # 行内样式 → Markdown 标记
232
+ _INLINE_STYLES: dict[str, str] = {
233
+ "Bold": "**",
234
+ "Italic": "*",
235
+ "Strikethrough": "~~",
236
+ }
237
+
238
+ # 块级类型 → 格式化函数
239
+ _BLOCK_FORMATTERS: dict[str, callable] = {
240
+ "header-one": lambda t: f"# {t}",
241
+ "header-two": lambda t: f"## {t}",
242
+ "header-three": lambda t: f"### {t}",
243
+ "blockquote": lambda t: "\n".join(f"> {line}" for line in t.split("\n")),
244
+ "ordered-list-item": lambda t: f"1. {t}",
245
+ "unordered-list-item": lambda t: f"- {t}",
246
+ }
247
+
248
+ def __init__(self, article_data: dict):
249
+ self._data = article_data
250
+ self._media_dict: dict = {}
251
+ self._media_result: list[TwitterPhoto | TwitterVideo] = []
252
+
253
+ # ── 公共入口 ──────────────────────────────
254
+
255
+ def render(self) -> "TwitterArticle":
256
+ content_state = self._data.get("content_state", {})
257
+ blocks = content_state.get("blocks", [])
258
+ entity_map = {str(item["key"]): item["value"] for item in content_state.get("entityMap", [])}
259
+ title = self._data.get("title", "")
260
+
261
+ self._parse_media_entities()
262
+ cover_url = self._data.get("cover_media", {}).get("media_info", {}).get("original_img_url", "")
263
+
264
+ md_lines: list[str] = []
265
+ if cover_url:
266
+ md_lines.append(f"![Cover Image]({cover_url})\n")
267
+
268
+ for block in blocks:
269
+ md_lines.append(self._render_block(block, entity_map))
270
+
271
+ return TwitterArticle(
272
+ title=title,
273
+ content="\n\n".join(md_lines),
274
+ media=self._media_result or None,
275
+ )
276
+
277
+ # ── 媒体解析 ──────────────────────────────
278
+
279
+ def _parse_media_entities(self) -> None:
280
+ for media in self._data.get("media_entities", []):
281
+ media_id = media.get("media_id")
282
+ media_info = media.get("media_info", {})
283
+ typename = media_info.get("__typename")
284
+
285
+ if typename == "ApiImage":
286
+ self._parse_image(media_id, media_info)
287
+ elif typename == "ApiVideo":
288
+ self._parse_video(media_id, media_info)
289
+
290
+ def _parse_image(self, media_id, info: dict) -> None:
291
+ url = info.get("original_img_url", "")
292
+ if media_id and url:
293
+ self._media_dict[media_id] = {"type": "image", "url": url}
294
+ self._media_result.append(
295
+ TwitterPhoto(
296
+ url=url,
297
+ height=info.get("original_img_height", 0),
298
+ width=info.get("original_img_width", 0),
299
+ )
300
+ )
301
+
302
+ def _parse_video(self, media_id, info: dict) -> None:
303
+ preview = info.get("preview_image", {})
304
+ preview_url = preview.get("original_img_url", "")
305
+ video_url = self._best_mp4_url(info.get("variants", []))
306
+
307
+ if media_id and preview_url:
308
+ self._media_dict[media_id] = {
309
+ "type": "video",
310
+ "preview_url": preview_url,
311
+ "video_url": video_url,
312
+ }
313
+ self._media_result.append(
314
+ TwitterVideo(
315
+ url=video_url,
316
+ height=preview.get("original_img_height", 0),
317
+ width=preview.get("original_img_width", 0),
318
+ duration_millis=info.get("duration_millis", 0),
319
+ thumb_url=preview_url,
320
+ )
321
+ )
322
+
323
+ @staticmethod
324
+ def _best_mp4_url(variants: list) -> str:
325
+ mp4s = [v for v in variants if v.get("content_type") == "video/mp4"]
326
+ if not mp4s:
327
+ return ""
328
+ return max(mp4s, key=lambda v: v.get("bit_rate", 0)).get("url", "")
329
+
330
+ # ── Block 渲染 ────────────────────────────
331
+
332
+ def _render_block(self, block: dict, entity_map: dict) -> str:
333
+ b_type = block.get("type", "unstyled")
334
+ text = block.get("text", "")
335
+
336
+ insertions = self._collect_inline_styles(block)
337
+ insertions += self._collect_entities(block, entity_map)
338
+ insertions.sort(key=self._insertion_sort_key)
339
+
340
+ final_text = self._apply_insertions(text, insertions)
341
+ formatter = self._BLOCK_FORMATTERS.get(b_type)
342
+ return formatter(final_text) if formatter else final_text
343
+
344
+ @staticmethod
345
+ def _collect_inline_styles(block: dict) -> list[_Insertion]:
346
+ result: list[_Insertion] = []
347
+ for style in block.get("inlineStyleRanges", []):
348
+ marker = ArticleRenderer._INLINE_STYLES.get(style["style"])
349
+ if not marker:
350
+ continue
351
+ offset, length = style["offset"], style["length"]
352
+ result.append(_Insertion(offset, marker, "start", length))
353
+ result.append(_Insertion(offset + length, marker, "end", length))
354
+ return result
355
+
356
+ def _collect_entities(self, block: dict, entity_map: dict) -> list[_Insertion]:
357
+ result: list[_Insertion] = []
358
+ for ent in block.get("entityRanges", []):
359
+ offset, length = ent["offset"], ent["length"]
360
+ ent_data = entity_map.get(str(ent["key"]), {})
361
+ ent_type = ent_data.get("type")
362
+
363
+ if ent_type == "LINK":
364
+ url = ent_data.get("data", {}).get("url", "")
365
+ result.append(_Insertion(offset, "[", "start", length))
366
+ result.append(_Insertion(offset + length, f"]({url})", "end", length))
367
+
368
+ elif ent_type == "MEDIA":
369
+ md = self._media_entity_to_md(ent_data)
370
+ if md:
371
+ result.append(_Insertion(offset, md, "atomic", length))
372
+
373
+ elif ent_type == "DIVIDER":
374
+ result.append(_Insertion(offset, "\n---\n", "atomic", length))
375
+
376
+ return result
377
+
378
+ def _media_entity_to_md(self, ent_data: dict) -> str:
379
+ media_items = ent_data.get("data", {}).get("mediaItems", [])
380
+ if not media_items:
381
+ return ""
382
+ obj = self._media_dict.get(media_items[0].get("mediaId"))
383
+ if not obj:
384
+ return ""
385
+
386
+ if obj["type"] == "image":
387
+ return f"![Image]({obj['url']})"
388
+ if obj["type"] == "video":
389
+ p, v = obj["preview_url"], obj["video_url"]
390
+ return f"[![Video]({p})]({v})" if v else f"![Video Preview]({p})"
391
+ return ""
392
+
393
+ # ── 文本拼装 ──────────────────────────────
394
+
395
+ @staticmethod
396
+ def _insertion_sort_key(ins: _Insertion) -> tuple:
397
+ weight = {"end": 1, "atomic": 0, "start": -1}.get(ins.kind, 0)
398
+ return -ins.idx, weight, ins.length
399
+
400
+ @staticmethod
401
+ def _apply_insertions(text: str, insertions: list[_Insertion]) -> str:
402
+ chars = list(text)
403
+ for ins in insertions:
404
+ if ins.kind == "atomic" and ins.idx < len(chars):
405
+ chars[ins.idx] = ins.text
406
+ else:
407
+ chars.insert(ins.idx, ins.text)
408
+ return "".join(chars)
@@ -10,104 +10,90 @@ from urllib.parse import parse_qs, urlparse
10
10
  import httpx
11
11
  from markdownify import MarkdownConverter
12
12
 
13
+ # TODO: 逆向 EP 和 DATA
13
14
  V4_EP = (
14
- "V1ZCERzVgMWrKv+VcTl5QmS9JuPWLOQ8A0mACeTyYXtTbiguOrHhwaqnagZ6zdAgF"
15
- "4WpAYBvUH3EDnPRlNWut4CTDU1tCa80BSnvTMC9X1j9Kh6IMlGmzPIqpBzzx9r7Nt"
16
- "9XtUhv2WiQ2BgPnUwOFe7gN9r8Yj3184qxn1btJL8="
15
+ "CFcLOAE8E7Ew0J7yxtc9hPtklLIOym8yh1eU5jpB6D0M86gJERnbWbE7wPEWM95v8cWsxACqGq7iU"
16
+ "OEnrD2ODeFIj5VZdvbD3zhhOgT4FB6QfskCkuCN+JP/+aLz0rg/B+c/9fd5513ESuZxFVqUmrwe/v"
17
+ "jqZh5nS6Bsyt50VN8="
17
18
  )
18
19
  V4_DATA = (
19
- "abbbe96a1579aa6fe4fa84e875851b7d7a843a14c5c9573c771d9c1443c9b3a"
20
- "d7603a8d9d67dbc9bd001bf42702ac82e4a6979323ff305eecd74b9620ee140"
21
- "0c135f840b35d9402ec3e3a93fcb3d0d3d6b3e740f5176b72225b6fb8a0d483"
22
- "cab753aa71062dc9b59bc8de950628f23607301c6cd94e75f680b86485a11ac"
23
- "36eba1413e9f14b274eadff30114dfb1cedadc4bd08ef83c5b2d048970d07d3"
24
- "943afef809b44e3b9fee602c91e274fee1523a8beee7e7cec85680b279d616d"
25
- "da15e98b1b0aa718276bcdb05d4ac3e44e72da220e0ea798ad7452aec01d0db"
26
- "c31ad6bf147eab7f7e539d35fe5149110aae5c7069a67eba4aae638505819f8"
27
- "9e2a58bc3b5001c8a5045334121ef04a8e442d7dbb7776bd6013674d2c0028a"
28
- "f131bf6bde47b90dce5c8b9463c9f83d0e7264145c2f6f259d70c4d63a4996b"
29
- "b7c0074e8a59fa298ad144ec139cb29bc94074fbe2f4a88400d85c003793e2b"
30
- "e2077184c3ba2e792926fce25f24d3a764a7c2667446173c74aa704d0d517f2"
31
- "10926aaef05376230b43c3a676dad6ff1c9603553d66eadfb492445eac44745"
32
- "acc620b325560d4941c10e05f3099a17a553fd763a1b7d6ef29f512e436bdfa"
33
- "9fa7c5a70b6a5f91bbcb21946fc2ce92db0c92930008b0fc82e90c3c73f9265"
34
- "2ca388f77b262a918cf59160fa88e481138ee7fe9a9b51d7949a74d22d1dab4"
35
- "e865c12325bfb5b9e748526afb6d8a05c543fd6dc72e81b06a4ebbf8149fca5"
36
- "37a19330da2011eec0229e2302babe239397aa1c2292ab3807cf0aa129d078a"
37
- "a9da010003eac5bb2c06435fbbe9bee7543290c1224745bb485d78f42ee4e82"
38
- "afb27a38befc60a688fb2514795064926bf205357bd46b7c14dd15aea2cab48"
39
- "5c993f0df5a20811d0a7b3bfb1fcb0737c8305675e9bdac396ef8cffb0b6bc4"
40
- "700c3d881c1945329b721b9080bed46b18105b7c9fea4f8276f0fcd09fe99ec"
41
- "52fa50b11e12a19eb9d091ecde701ab2879e2d7727386b28bbde8d62832e1ad"
42
- "822ea57b383cdd3767e8ee64e201bf00fe9cc8428ece3262550764fea47c69e"
43
- "e4339de98767f034d8852993fdefa315d9dcda71a74b665804706d4f9a8c139"
44
- "3670c2220e4ceac833620e0dc8175eb7a77b8b37c1a9d9940c67d44c8bc6b5f"
45
- "9e46273e2f5149d3d3148e8f7a02c4a4c3c998924b7d0e93528952034adc20d"
46
- "c342404a8606f0c07cb2b98c4a5434e69b69282daf952f586b9eed4b4f1ef0c"
47
- "fe5c6d156d14fb5057c8c32a355d07e2f56737d1ccfad573d42c840bbe8b750"
48
- "388211f2c0c5d6a1e34e7741389a742dff58bb0b9f339707a349a09519ca78d"
49
- "5e4f1baaf2598ab9001c15824494eecc17735e69a193e5437cbe44c6f156a0b"
50
- "b8df4fed5edefd4f56f4ef0b4d8cc40fe623836da3c5e662005825c9d344074"
51
- "be2306d6241c163fe92a6ce40ff60538d7464f5a06b6bb9ca1e6f18491ca3c7"
52
- "d6c00e299cbb1ca1c525a981fc6c6f2bb05f709101099b8bd0d2c2a628d94c6"
53
- "1aa97fdd58c9f357359fbd5be9e8f0f534f4481fb780d58e3e599e01fdd5a7f"
54
- "c5fb7e01b76fd58b2f264947d2149fefa57577ef326e264fc827939329031d9"
55
- "01be7579ecf5fccdab11c615c1a053f198297c0723faf8b17ea3335d49df2bf"
56
- "dd17271c2b64745b1f412d87297edd4404a4ae5312debf73b66afcc3d884b93"
57
- "8de41b6ee87265ce624897f3557ebe2d97e6fb17f1dc6a893e48dfa16ef2bff"
58
- "d8f3e06f0a1fcf44c7f2efa372e0ff61344c93f4a2a66538fcc134cd0bf94d5"
59
- "4c969cda4392af70608cbab6cfa340b674ba3a59385c0ed9bb236ff6ed10e1e"
60
- "5a9d4b6529c075dc1ac23cfdae18ab1651a5ee747322e51e3cc6035ca929789"
61
- "00924e661a2694a47873569baa95fd821711dc53a1e0299ed707e337b570591"
62
- "a3f61a5e39f8a75771da1613e8236c9b1b94cb5617fdaf2424d68a7fbd83ebf"
63
- "356fc87e8a805bee5bbd20a55a70881394d7624b1dcf5a135f1cf40b842eca3"
64
- "3d46b72447e0a2e85adf6c26efa6cc73b63573840f7b6229fb03ab45a8b639b"
65
- "5a66bbd6f63d10e59db49d7a9c9af3e3aeb79b7b756e24d5002917e7e788018"
66
- "4f80fcc605a1ba825c779e6083fd7fb0920bbcee021ec8e35427391b871b149"
67
- "c306c2dbda602044cd53ec424dd70cfd1c14a23c9964c039258cff4b75112f8"
68
- "15d9717433c1989ec398cd2acd67c89be82a409e0ef8f3e9ea8ec8b51b5ea5a"
69
- "005b5e735978d9a2987a76d62a2af230e30dc6327f7c0d153add27c7e8a320e"
70
- "4df6c05ab91fe0b9f6f9e13c50f39454066776503eb2ec84b74b4b2d5228627"
71
- "d81c938f7201610c9b703e4fd283a94835b7387db2880443a050d3eb0859aa1"
72
- "efd0f9bb7613b6b918ec2f7b5bb3e7722105b595e7973a93e3de8153a0f8e5b"
73
- "fd1aa6cefc6285fea85e8381ddcce98b31dda33db2a3c80ac04df14b872c805"
74
- "15373f231c3653fb2db799b32e83e59fb0f5763febca3d291b49bf83dd7ebd6"
75
- "1229300b65d44964d9e679f6061a0b2ea1bcd9f5af9bf710047237d87d13394"
76
- "ea8b4627c6997589d0b58379d025b076460eab88d6615ee92b0aa6c47f721f9"
77
- "7e0b5bbe721f06544d0a1bb81402697f2d72ad32c791dab45064b4d18460602"
78
- "9494b268feaebb268e7f92352dc3482f857c14885aabbad98a43e5f8fa5d77d"
79
- "61dc22f23080b9e6403c76f5fb862d7520ab85ae7c1d0e339729f664e7d668f"
80
- "4b9d1301acabb62fda5940db236ea9d2ca896cbb6a13eda6120fa5881453cb4"
81
- "490438460c00db4cd4bdf5df993d3a8d5726c756015eed542e0a4b910570f39"
82
- "7211c3f84f6a0d038e82270f94543e8da1e8d0cffd8f4f561daaf6003ad1fad"
83
- "fdd89c50f057a79225d8647aead74b33216e328c4204686b4ae93ce5f7ee25e"
84
- "1c83fe2cb72c67589aa4865d278ff7a112d09c16707de8acd61b49b901a3266"
85
- "e8ef55f1351fdc3013154635e51e649cbf31fc9b32f6956800834ca73e0b75b"
86
- "2b54d7125257eb6c24ebff52b741109be6da99bb6e0ffab85c3c219550ec3fc"
87
- "b12e2e4d0234627b061193c290baa1be73241be70925c08d33e6efdd44eca9a"
88
- "5160bdc5b47bd1f9d3f2cbf38848cf1aaa2a4827f86e43e06246b3bf94cb0b9"
89
- "f050c89533a3be9ffecefebd1a92e04197f18d7fadc0bfc8664de18425d5c03"
90
- "59b58049267934756f513bd68ea427b38f15213f42cce05cd59f5ea502967ec"
91
- "6a096daaa5e5d2a373227f2fe4514e27dfa012d708f7e94a286452972b5fab4"
92
- "581ecee3df40bad802cbb50b1a5d9dd3323a5f7c61ab893b16782a0ba64fd42"
93
- "10c30ac00f9d21b9124e5e5b323f43badf56761e1eea5c86ff61f19ce1485f4"
94
- "2cf6cadd751bbfb2ef87229eee5068ef6e209f123d29a571a374974ceac2e77"
95
- "f143faba60fc5d16f88d801fa01d879420b5d1393ad5b2bc913e3b0ba7155a6"
96
- "7648196573126273cccc79f2eac32ab68d72cc0f7170feca9c9726af9d65962"
97
- "663d5281372386ec88bd2fa82316f687535ecd39f00658523708ca4785529f5"
98
- "93baf100597ed00c15ae8ff87baa295871680b4096ac03a550f0f015297198b"
99
- "1a93f38cfefbeceabc099c1026664d77f616b4f069cf8bf53d2684b9a4d933c"
100
- "3c65a3aef21559527bfc6586e0247efa244a0a355b43751bc09be8012699468"
101
- "a8c332d60b11bb4881bf56b92ead10e059ac40f83a4d6725cacbc1bb307c839"
102
- "c4edc8b5484b9e2935842e867e739223f2eaaaff04d9701cfa49e3f80be4f2d"
103
- "1b7e8eb76fd7f33dfa79831f75ee65a75b7c7fff98254818f1ab77bca856656"
104
- "4d48e0012733dd426bf841f27f960394b1bacb8a3e36b96c41d751584cd580f"
105
- "ef1b6a8bf990487268348f682a27549ecbb9674b14f2fc97f203f3468f248ec"
106
- "3cf5171aa5e8a8d31a9a433c4f7644736aaf6695b28771fe66b4736e3afb322"
107
- "11ad534b05641600d2cdc79a251fc4c4e5540df9a40aaad329fedd49a429b20"
108
- "70e1345a4146c297ee2a03f056675054e83207d17de21242032c30398259440"
109
- "84e60cbd70eb4c469859824cd7d04340de0d19e614a0826a63c63e15c3372b1"
110
- "7515d4b6951ff6c612f65c3e6538fd0515bcb4814bb641fca5a45c7dae9"
20
+ "7ccf4483919143daa17cca371b849651ab10c58aa97415e3fcc9b2f4c0bc776844997f4059"
21
+ "512c213b3cc965e84693188b08f1ddb8924922598173e0cfa0bab40f242bcd20e11c728da7"
22
+ "5a75d64b75d4070affa0d64831d0b32efde8c74ac4e6adeef18bbcbd1d21131746d131e30c"
23
+ "8ba5939ea8247e79534f6688fed7545d5060b069e85c19d11c0277ee8015d2a989d84ce1bd"
24
+ "01ed2754a365959496343de0152044cef7db82d0353a091f566253f2f8ca14a192c64b610f"
25
+ "643309079d235d355438c84f566943df3df71c2cc979a68c6f36ce62861d6ddb64874d03f8"
26
+ "b596b1380de9f84a60aff650ec59e4b2427ba7492f541354ee4dfe09b02c7296539978d281"
27
+ "2269a7d37121ba96133b7e2b5fdba4922efc6f4bacd31855ca2604b86096ed5abfa6b87656"
28
+ "8298f4bd75c1de979608714b5f0ec2bd852ec6974f929891cfff70392b0c42c7efd9f53e6b"
29
+ "52541d08d654f85d92b29b553b3ea4de3c0ddf88ea77815871e476d5ba8b61dfbb427e3147"
30
+ "62b58a306479eeeb7831864bb593c91af9c85004e891efe5d495b3d1cb4885996ffeda2d50"
31
+ "7f747be1022544cf6ca1e4663bba30d7e7be129b23c5dd4ee1b56d2c48969eeee5b7b0e062"
32
+ "8cfc0527c5e2880c43a61dd753c72b76a0ec1556cac7682f54f0582b50419dbfaa504a9363"
33
+ "54dcb289d282dfa94ded53d926a4385cf437e35afed207c8ccf9eea2e2d493b645034a79ea"
34
+ "115b5df365cc3c6b160d0de25d5d94efd576538386521cc617058831a39bd9009555fe8bc3"
35
+ "419e1f4c9c51271d3996dd5616d0071d850a36799296abb9084a8a6b406f62341ebe581d50"
36
+ "11029c18e88074a2cd7e9fb6be16b948da4d696c624412a8adb4651af89e43db779ed90114"
37
+ "001c7ad552a6baf80447c751c39ce85ce713a661dd7b67be37aa749b46d8827b2187401e8c"
38
+ "3e26a5993b654d3b7e6a6323a512a00f925f887d7ce231f20788d999c527b63160b6b1893a"
39
+ "5891ab183760ca28c95232c164563857a98b963838d385b9638295ded7b69eeb7a43185463"
40
+ "d2278bd59409f5badc24abffcf5cab137f93d89657992b72c340d1a87ddec55a828d33857d"
41
+ "ae8b27fc0aad082e14cd8ef294938dddd095f11dd842f94aa055f3b0ba880cc87771f0d61d"
42
+ "cdc419027c010afb23d668b337cf63ce8359f51623326a81e7513beebfd98d3531b8c701b4"
43
+ "cc58b42937245244228fcefe0c74b491e765e98ec0f71814788c347b5340163aaa8aae7c97"
44
+ "332acb3270583f0d77c15c3216696ad4951e24a19107fd5fe150fc275198fe4c9794f2785d"
45
+ "a3b0b840ebfe75e823b997f0d2eda75f5debbeced24462f1b976e5fc9d643858143d1b0ee4"
46
+ "6dc3936991f50b5d9d7040a5d9f1cb202fbbc06420cdee16fafc0a6929789088ce8e695332"
47
+ "b0178a64761a352b15d87aa3a40529febc881d46a3ae80933e407fc2b28c5e0771dd426b02"
48
+ "1cf177e2ef53c94a0cc5fcc83212843955af3e5f3bb8b24e9ed121669dadd689d54644b507"
49
+ "1581b0e882d4513220cdf1fd5345b76d1fe1d824357bf3acd8a1c58d4bfb4fe3f39922f72e"
50
+ "2eb9a74ee4b5f248bf7e279569597f45ef0e7fcbefa2619dcf367fe3638cc93fe90583a72e"
51
+ "4190729c8c5ab6dd6fb6a37b43eaa90c2e25530ac9d9e923492037f1f14c0da73e4968391f"
52
+ "c96fe10e2bfbefd620bbd6ea4e948cf04d6219e2c32ee6875cb0c2515b3a9ff993438412d3"
53
+ "b1b71ba4c50ea98216b50778a1c909cbb7802acc8348aad6a9118a91a9be87f8610a1ab363"
54
+ "ba06beb726e0a5ce56820e6baf9de2d87a10ce1d5cdd2d94c9e0bab0a3b7b8809d52dd3926"
55
+ "873caf244ab322a0f2f4c4d9c119153d0b3105c8321dd30378b5345418c5a509fe731aef31"
56
+ "7b156cdd606d71b291954181fc3efd71467d809b90d2b02a876ddbe7c758c3189ff6ecca21"
57
+ "44b2a63ef949d7b8b643e3ca7a20c2e5c843e6e34f0260d3963982510a8c077dd7f47158de"
58
+ "ee71befbab650ef1fad54a622bf4d1c297d9a39995fb1420bdba52d20a939b2da9ec3d8a13"
59
+ "b156a597f9de8a683ad68a5725a3d2afbdfdbf9c024793558ba6bbd1f6d5f520988358f6d8"
60
+ "02c0ea8580d4f93218d729cf3bbec52e6224175a0f37dd5bb4901ec5efaa6625c6b6c3b452"
61
+ "752584d2e634fdee181ef7772857de3831725a6bbb6a22c29a4ddee5e8d1bf5c9aebc1b863"
62
+ "5ee14584163dae9d4fb2c28be4220a23bb889d1965b870c32273b0166f3195b22cb85fc570"
63
+ "fb3b13335c49792aaef7b675135a5ced82efe0c36713d7b40123254a7cb0099139bc6634c3"
64
+ "c1af20595392a6436b192b8e6bb43038a33dff4d22f6f11497cbcb5662e11f2d1510a77b61"
65
+ "0d1150b15a76b6c916767f1f7f0883db4a0f7b96e9d9b0884249f965212ec1cb54056ee26d"
66
+ "a2a883f29acdfc7040d4e2e99c4ffd42a8bb1c7852cb5b4c758cdc295baaf973eebd6e720c"
67
+ "bf0bd6b30ad4a7133929e4b1223c4a579dc1dde1f4fdc1fec5a83c0e3d5335f2dc79e57efc"
68
+ "74f64b4d69d0151d4025ee5392fd844f783e2c614903e0b3685362f142fa091dce36382c1d"
69
+ "dc3a6a63815fe062c59e86cad9d26bb54dbf93297ad4ae75039719eddf659c22f0922f08fe"
70
+ "9a2241200f87bfe60f92d9983062d868d5eaced8df5b2851f86b9ee00055d386bf1276ad9b"
71
+ "b27f2fa4b04ca6e773ff7348eb078e7b3b20ac5f878552133a652793f630304d28f1dc8ecb"
72
+ "eedf571f743ffb494c9b34a47df86df8530af4243f0fedfea466c374ff920571a998ebb799"
73
+ "6c9b0ec4ef5780bd519f19106ad1a5b16183bf62cbc0d7d7e4c297df6c0870fd07825d29c9"
74
+ "b51ecdc227efeda8848eaca34a4c65ef35c0d5d3fa6e02f416cf25c84ef054206906e0950e"
75
+ "24250b6e8cbea114c42de785f2ac69204ff675c7bd8f89bb1f683b9adb1c08d73cea3b5cfe"
76
+ "420fa46a893b9b4ba5674c502bebc59d492942af6eef30a09eb9ff94ead00ebc2007702868"
77
+ "63ec52c88a45ec7cbe5414485d28c64112aca5015f1976c2bd772cacb7baa5ae267035c7a1"
78
+ "d9703289821b84ef386f6998777f72f44392f28daa1dc23d26445ed5ca382405ae8b2b47a0"
79
+ "06d56a040b55c6796328ace7d8faa040d3009e5b627e12c30ec6c02bff8de7173b9f393320"
80
+ "3e0fb8e06f812ee8ba5a673f3fa31c27e5309a3f7e0a8a55829c0f5c8c7433bbc4db4cfce9"
81
+ "aea6f37058dd0bcaef20b54546466bdef7b5f69745d4d4ba59c61bc64fd4202f9ce95cc8e1"
82
+ "a56273db05551b6de959c5e2d5f2ccc6d9893d99e48a1ff043889c5bdcb96512ccff7237bd"
83
+ "95fd344d3dd46e8d19743a65cde0aeace9ec6563f4c5d2a1dd6e72a32b48dc9444246d6d9e"
84
+ "a5a9a8d4216b9e0b41f1e54179c52c9f456dbe6c4e8872627b54d7ca6957a270bac31a98cb"
85
+ "2bacf895f30ed6a508b9bdeb288ccfbf5166cec8535ab73c5fa90b41f4ba5d8a55a7cfb8d9"
86
+ "783e00356ee534676215463f0aa1333b3388c13c8c0f176af6d7d2a01e2dd01cac2eb73574"
87
+ "bd6c0930c412cf12bcb80708706cc94b2b9546621f64547b8543179a203d9d871dfc4d5cd4"
88
+ "8334f42598f62e7c8199782bd605c75dd719c0db51ed801a47938746caf258966fc3132f6c"
89
+ "77b0a97ba78ece0e150fee450a90433d2b8534d276b07e8d4586043de0ffe1af106f026d45"
90
+ "41ad961aea6f69fa92344ed9a93f76f2a9f0f29110a4f0a7bda6a84a46d815c68784ab6685"
91
+ "466059376f0f8866107623c49d59acf60a010c923a73177ea9f58e187bcec2d6feb94a5220"
92
+ "56325e1651b5499fd28c17456a756e171840b7f8f1d6785e3e63d0bb5a690cc148f45ba0b0"
93
+ "6b5e0c8da2c6711a6b5011fdfc57221767bce9925d149f357cfa8f108965f9f6037f9b3bc9"
94
+ "46d90499ec8c40108216ed10eea155cb8d8e7bf76cc17efc1fda962101dc22114ca7b3b39c"
95
+ "44c3345d0e1c525e4cbdc1f49dbb66ad1f5874bb91a577cf66428fa861624febfb03c369d1"
96
+ "9d794544"
111
97
  )
112
98
 
113
99
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: parsehub
3
- Version: 2.0.10
3
+ Version: 2.0.12
4
4
  Summary: 轻量、异步、开箱即用的社交媒体聚合解析库
5
5
  Author-email: 梓澪 <zilingmio@gmail.com>
6
6
  License: MIT
@@ -98,7 +98,7 @@ print(result)
98
98
 
99
99
  | 平台 | 视频 | 图文 | 其他 |
100
100
  |:----------------|:--:|:--:|:-----:|
101
- | **Twitter / X** | ✅ | ✅ | |
101
+ | **Twitter / X** | ✅ | ✅ | 📝 文章 |
102
102
  | **Instagram** | ✅ | ✅ | |
103
103
  | **YouTube** | ✅ | | 🎵 音乐 |
104
104
  | **Facebook** | ✅ | | |
@@ -1,203 +0,0 @@
1
- import re
2
- from dataclasses import dataclass
3
- from typing import Literal, Union
4
-
5
- import httpx
6
- from loguru import logger
7
-
8
- from ..config import GlobalConfig
9
- from ..types import ParseError
10
-
11
-
12
- class Twitter:
13
- def __init__(self, proxy: str | None = None, cookie: dict = None):
14
- self.proxy = proxy
15
- self.authorization = (
16
- "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOu"
17
- "H5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA"
18
- )
19
- self.cookie = cookie
20
-
21
- async def fetch_tweet(self, url: str) -> "TwitterTweet":
22
- tweet_id = self.get_id_by_url(url)
23
- headers = {
24
- "accept-language": "zh-CN,zh;q=0.9",
25
- "authorization": self.authorization,
26
- "content-type": "application/json",
27
- "user-agent": GlobalConfig.ua,
28
- "x-guest-token": await self.get_guest_token(url),
29
- "x-twitter-active-user": "yes",
30
- "x-twitter-client-language": "zh-cn",
31
- }
32
-
33
- cookie = None
34
- if self.cookie and self.check_cookie():
35
- headers["x-csrf-token"] = self.cookie.get("ct0")
36
- cookie = self.cookie
37
-
38
- params = {
39
- "variables": f'{{"tweetId":"{tweet_id}","withComm'
40
- f'unity":false,"includePromotedContent":false,"withVoice":false}}',
41
- "features": '{"creator_subscriptions_tweet_preview_api_enabled":true,'
42
- '"communities_web_enable_tweet_community_results_fetch":true,'
43
- '"c9s_tweet_anatomy_moderator_badge_enabled":true,"tweetypie_unmention_optimization_enabled":true,'
44
- '"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled"'
45
- ":true,"
46
- '"view_counts_everywhere_api_enabled":true,"longform_notetweets_consumption_enabled":true,'
47
- '"responsive_web_twitter_article_tweet_consumption_enabled":true,"tweet_awards_web_tipping_enabled":false,'
48
- '"creator_subscriptions_quote_tweet_preview_enabled":false,"freedom_of_speech_not_reach_fetch_enabled"'
49
- ":true,"
50
- '"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enable'
51
- 'd":true,'
52
- '"tweet_with_visibility_results_prefer_gql_media_interstitial_enabled":false,"rweb_video_timestamps_enabled'
53
- '":true,'
54
- '"longform_notetweets_rich_text_read_enabled":true,"longform_notetweets_inline_media_enabled":true,'
55
- '"rweb_tipjar_consumption_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_'
56
- 'phone_label_enabled":false,'
57
- '"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"responsive_web_graphql_timeline'
58
- '_navigation_enabled":true,'
59
- '"responsive_web_enhance_cards_enabled":false}',
60
- "fieldToggles": '{"withArticleRichContentState":true,"withArticlePlainText":false}',
61
- }
62
-
63
- async with httpx.AsyncClient(proxy=self.proxy) as client:
64
- response = await client.get(
65
- "https://api.twitter.com/graphql/kPLTRmMnzbPTv70___D06w/TweetResultByRestId",
66
- params=params,
67
- headers=headers,
68
- cookies=cookie,
69
- )
70
- response.raise_for_status()
71
- return self.parse(response.json())
72
-
73
- def parse(self, result: dict):
74
- if e := result.get("errors"):
75
- raise Exception(f"error -1: {e[0]['message']}")
76
-
77
- result = result["data"]["tweetResult"].get("result")
78
- if not result:
79
- raise ParseError("error -4: 帖子或用户不存在")
80
-
81
- if tweet := result.get("tweet"):
82
- tweet_id = tweet.get("rest_id", {})
83
- legacy: dict = tweet.get("legacy")
84
- else:
85
- tweet_id = result.get("rest_id", {})
86
- legacy = result.get("legacy")
87
-
88
- if not legacy:
89
- if result.get("__typename") == "TweetTombstone":
90
- raise Exception("error -2: 该推文开启了限制, 匿名用户无法查看")
91
- raise Exception(f"error -3: {result.get('reason')}")
92
-
93
- if note_tweet := result.get("note_tweet"):
94
- full_text = note_tweet.get("note_tweet_results", {}).get("result", {}).get("text", None)
95
- if not full_text:
96
- full_text = legacy.get("full_text", "")
97
- else:
98
- full_text = legacy.get("full_text", "")
99
-
100
- media = legacy["entities"].get("media", [])
101
- medias = []
102
- for i in media:
103
- original_info = i.get("original_info", {})
104
- height = original_info.get("height", 0)
105
- width = original_info.get("width", 0)
106
- media_url_https = i["media_url_https"]
107
-
108
- match i["type"]:
109
- case "photo":
110
- medias.append(
111
- TwitterPhoto(
112
- url=self._build_img_url(media_url_https, "orig"),
113
- width=width,
114
- height=height,
115
- thumb_url=self._build_img_url(media_url_https, "small"),
116
- )
117
- )
118
- case "video":
119
- video_info = i.get("video_info", {})
120
- medias.append(
121
- TwitterVideo(
122
- url=video_info["variants"][-1]["url"],
123
- height=height,
124
- width=width,
125
- duration_millis=video_info.get("duration_millis", 0),
126
- thumb_url=self._build_img_url(media_url_https, "medium"),
127
- )
128
- )
129
- case "animated_gif":
130
- medias.append(
131
- TwitterAni(
132
- url=i["video_info"]["variants"][-1]["url"],
133
- height=height,
134
- width=width,
135
- thumb_url=self._build_img_url(media_url_https, "small"),
136
- )
137
- )
138
-
139
- return TwitterTweet(tweet_id=tweet_id, full_text=full_text, media=medias)
140
-
141
- @staticmethod
142
- def _build_img_url(url: str, size: Literal["orig", "large", "medium", "small", "thumb"]):
143
- p = "&" if "?" in url else "?"
144
- return f"{url}{p}name={size}"
145
-
146
- @staticmethod
147
- def get_id_by_url(url: str):
148
- return re.search(r"status/(\d+)", url)[1]
149
-
150
- async def get_guest_token(self, url: str):
151
- async with httpx.AsyncClient(proxy=self.proxy) as client:
152
- response = await client.post(url)
153
- response.raise_for_status()
154
- guest_token = re.search(r'cookie="gt=(\d+);', response.text)
155
- if not guest_token:
156
- raise Exception("error -5: 获取 guest_token 失败")
157
- return guest_token[1]
158
-
159
- def check_cookie(self):
160
- if not self.cookie.get("ct0"):
161
- logger.warning("cookie 缺少必要参数: ct0")
162
- return False
163
- if not self.cookie.get("auth_token"):
164
- logger.warning("cookie 缺少必要参数: auth_token")
165
- return False
166
- return True
167
-
168
-
169
- class TwitterTweet:
170
- def __init__(
171
- self,
172
- tweet_id: str,
173
- full_text: str,
174
- media: list[Union["TwitterVideo", "TwitterPhoto", "TwitterAni"]],
175
- ):
176
- self.tweet_id = tweet_id
177
- self.full_text = re.sub(r"https://t\.co/[^\s,]+$", "", full_text) if media else full_text
178
- self.media = media
179
-
180
-
181
- @dataclass
182
- class TwitterVideo:
183
- url: str
184
- height: int
185
- width: int
186
- duration_millis: int
187
- thumb_url: str | None = None
188
-
189
-
190
- @dataclass
191
- class TwitterPhoto:
192
- url: str
193
- height: int
194
- width: int
195
- thumb_url: str | None = None
196
-
197
-
198
- @dataclass
199
- class TwitterAni:
200
- url: str
201
- height: int
202
- width: int
203
- thumb_url: str | None = None
File without changes
File without changes
File without changes