parsehub 2.0.11__tar.gz → 2.0.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {parsehub-2.0.11/src/parsehub.egg-info → parsehub-2.0.12}/PKG-INFO +2 -2
- {parsehub-2.0.11 → parsehub-2.0.12}/README.md +1 -1
- {parsehub-2.0.11 → parsehub-2.0.12}/pyproject.toml +1 -1
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/parsers/parser/twitter.py +19 -16
- parsehub-2.0.12/src/parsehub/provider_api/twitter.py +408 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/provider_api/xiaoheihe.py +81 -95
- {parsehub-2.0.11 → parsehub-2.0.12/src/parsehub.egg-info}/PKG-INFO +2 -2
- parsehub-2.0.11/src/parsehub/provider_api/twitter.py +0 -203
- {parsehub-2.0.11 → parsehub-2.0.12}/LICENSE +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/setup.cfg +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/__init__.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/__init__.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/config/__init__.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/config/config.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/errors.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/parsers/__init__.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/parsers/base/__init__.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/parsers/base/base.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/parsers/base/ytdlp.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/parsers/parser/__init__.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/parsers/parser/bilibili.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/parsers/parser/coolapk.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/parsers/parser/douyin.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/parsers/parser/facebook.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/parsers/parser/instagram.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/parsers/parser/kuaishou.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/parsers/parser/pipix.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/parsers/parser/threads.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/parsers/parser/tieba.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/parsers/parser/weibo.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/parsers/parser/weixin.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/parsers/parser/xhs.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/parsers/parser/xiaoheihe.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/parsers/parser/youtube.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/parsers/parser/zuiyou.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/provider_api/__init__.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/provider_api/bilibili.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/provider_api/coolapk.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/provider_api/instagram.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/provider_api/kuaishou.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/provider_api/pipix.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/provider_api/threads.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/provider_api/tieba.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/provider_api/weibo.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/provider_api/weixin.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/provider_api/xhs.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/provider_api/zuiyou.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/types/__init__.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/types/callback.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/types/media_file.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/types/media_ref.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/types/platform.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/types/post.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/types/result.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/utils/downloader.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/utils/media_info.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub/utils/utils.py +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub.egg-info/SOURCES.txt +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub.egg-info/dependency_links.txt +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub.egg-info/requires.txt +0 -0
- {parsehub-2.0.11 → parsehub-2.0.12}/src/parsehub.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: parsehub
|
|
3
|
-
Version: 2.0.11
|
|
3
|
+
Version: 2.0.12
|
|
4
4
|
Summary: 轻量、异步、开箱即用的社交媒体聚合解析库
|
|
5
5
|
Author-email: 梓澪 <zilingmio@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -98,7 +98,7 @@ print(result)
|
|
|
98
98
|
|
|
99
99
|
| 平台 | 视频 | 图文 | 其他 |
|
|
100
100
|
|:----------------|:--:|:--:|:-----:|
|
|
101
|
-
| **Twitter / X** | ✅ | ✅ |
|
|
101
|
+
| **Twitter / X** | ✅ | ✅ | 📝 文章 |
|
|
102
102
|
| **Instagram** | ✅ | ✅ | |
|
|
103
103
|
| **YouTube** | ✅ | | 🎵 音乐 |
|
|
104
104
|
| **Facebook** | ✅ | | |
|
|
@@ -7,7 +7,7 @@ from ...provider_api.twitter import (
|
|
|
7
7
|
TwitterTweet,
|
|
8
8
|
TwitterVideo,
|
|
9
9
|
)
|
|
10
|
-
from ...types import AniRef, ImageRef, MultimediaParseResult, ParseError, Platform, VideoRef
|
|
10
|
+
from ...types import AniRef, ImageRef, MultimediaParseResult, ParseError, Platform, RichTextParseResult, VideoRef
|
|
11
11
|
from ...utils.utils import cookie_ellipsis
|
|
12
12
|
from ..base.base import BaseParser
|
|
13
13
|
|
|
@@ -48,21 +48,24 @@ class TwitterParser(BaseParser):
|
|
|
48
48
|
@staticmethod
|
|
49
49
|
async def media_parse(tweet: TwitterTweet):
|
|
50
50
|
media = []
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
51
|
+
if tweet.media:
|
|
52
|
+
for m in tweet.media:
|
|
53
|
+
match m:
|
|
54
|
+
case TwitterPhoto():
|
|
55
|
+
path = ImageRef(url=m.url, height=m.height, width=m.width, thumb_url=m.thumb_url)
|
|
56
|
+
case TwitterVideo():
|
|
57
|
+
path = VideoRef(
|
|
58
|
+
url=m.url,
|
|
59
|
+
height=m.height,
|
|
60
|
+
width=m.width,
|
|
61
|
+
duration=int(m.duration_millis / 1000),
|
|
62
|
+
thumb_url=m.thumb_url,
|
|
63
|
+
)
|
|
64
|
+
case TwitterAni():
|
|
65
|
+
path = AniRef(url=m.url, ext="mp4", height=m.height, width=m.width, thumb_url=m.thumb_url)
|
|
66
|
+
media.append(path)
|
|
67
|
+
if article := tweet.article:
|
|
68
|
+
return RichTextParseResult(markdown_content=article.content, title=article.title, media=media)
|
|
66
69
|
return MultimediaParseResult(content=tweet.full_text, media=media)
|
|
67
70
|
|
|
68
71
|
|
|
@@ -0,0 +1,408 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Literal, NamedTuple, Union
|
|
4
|
+
|
|
5
|
+
import httpx
|
|
6
|
+
from loguru import logger
|
|
7
|
+
|
|
8
|
+
from ..config import GlobalConfig
|
|
9
|
+
from ..types import ParseError
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Twitter:
|
|
13
|
+
def __init__(self, proxy: str | None = None, cookie: dict = None):
|
|
14
|
+
self.proxy = proxy
|
|
15
|
+
self.authorization = (
|
|
16
|
+
"Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOu"
|
|
17
|
+
"H5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA"
|
|
18
|
+
)
|
|
19
|
+
self.cookie = cookie
|
|
20
|
+
|
|
21
|
+
async def fetch_tweet(self, url: str) -> "TwitterTweet":
|
|
22
|
+
tweet_id = self.get_id_by_url(url)
|
|
23
|
+
headers = {
|
|
24
|
+
"accept-language": "zh-CN,zh;q=0.9",
|
|
25
|
+
"authorization": self.authorization,
|
|
26
|
+
"content-type": "application/json",
|
|
27
|
+
"user-agent": GlobalConfig.ua,
|
|
28
|
+
"x-guest-token": await self.get_guest_token(url),
|
|
29
|
+
"x-twitter-active-user": "yes",
|
|
30
|
+
"x-twitter-client-language": "zh-cn",
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
cookie = None
|
|
34
|
+
if self.cookie and self.check_cookie():
|
|
35
|
+
headers["x-csrf-token"] = self.cookie.get("ct0")
|
|
36
|
+
cookie = self.cookie
|
|
37
|
+
|
|
38
|
+
params = {
|
|
39
|
+
"variables": f'{{"tweetId":"{tweet_id}","withComm'
|
|
40
|
+
f'unity":false,"includePromotedContent":false,"withVoice":false}}',
|
|
41
|
+
"features": '{"creator_subscriptions_tweet_preview_api_enabled":true,'
|
|
42
|
+
'"communities_web_enable_tweet_community_results_fetch":true,'
|
|
43
|
+
'"c9s_tweet_anatomy_moderator_badge_enabled":true,"tweetypie_unmention_optimization_enabled":true,'
|
|
44
|
+
'"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled"'
|
|
45
|
+
":true,"
|
|
46
|
+
'"view_counts_everywhere_api_enabled":true,"longform_notetweets_consumption_enabled":true,'
|
|
47
|
+
'"responsive_web_twitter_article_tweet_consumption_enabled":true,"tweet_awards_web_tipping_enabled":false,'
|
|
48
|
+
'"creator_subscriptions_quote_tweet_preview_enabled":false,"freedom_of_speech_not_reach_fetch_enabled"'
|
|
49
|
+
":true,"
|
|
50
|
+
'"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enable'
|
|
51
|
+
'd":true,'
|
|
52
|
+
'"tweet_with_visibility_results_prefer_gql_media_interstitial_enabled":false,"rweb_video_timestamps_enabled'
|
|
53
|
+
'":true,'
|
|
54
|
+
'"longform_notetweets_rich_text_read_enabled":true,"longform_notetweets_inline_media_enabled":true,'
|
|
55
|
+
'"rweb_tipjar_consumption_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_'
|
|
56
|
+
'phone_label_enabled":false,'
|
|
57
|
+
'"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"responsive_web_graphql_timeline'
|
|
58
|
+
'_navigation_enabled":true,'
|
|
59
|
+
'"responsive_web_enhance_cards_enabled":false}',
|
|
60
|
+
"fieldToggles": '{"withArticleRichContentState":true,"withArticlePlainText":false}',
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
|
64
|
+
response = await client.get(
|
|
65
|
+
"https://api.twitter.com/graphql/kPLTRmMnzbPTv70___D06w/TweetResultByRestId",
|
|
66
|
+
params=params,
|
|
67
|
+
headers=headers,
|
|
68
|
+
cookies=cookie,
|
|
69
|
+
)
|
|
70
|
+
response.raise_for_status()
|
|
71
|
+
return self.parse(response.json())
|
|
72
|
+
|
|
73
|
+
def parse(self, result: dict):
|
|
74
|
+
if e := result.get("errors"):
|
|
75
|
+
raise Exception(f"error -1: {e[0]['message']}")
|
|
76
|
+
|
|
77
|
+
result = result["data"]["tweetResult"].get("result")
|
|
78
|
+
if not result:
|
|
79
|
+
raise ParseError("error -4: 帖子或用户不存在")
|
|
80
|
+
|
|
81
|
+
if tweet := result.get("tweet"):
|
|
82
|
+
tweet_id = tweet.get("rest_id", {})
|
|
83
|
+
legacy: dict = tweet.get("legacy")
|
|
84
|
+
else:
|
|
85
|
+
tweet_id = result.get("rest_id", {})
|
|
86
|
+
legacy = result.get("legacy")
|
|
87
|
+
|
|
88
|
+
if not legacy:
|
|
89
|
+
if result.get("__typename") == "TweetTombstone":
|
|
90
|
+
raise Exception("error -2: 该推文开启了限制, 匿名用户无法查看")
|
|
91
|
+
raise Exception(f"error -3: {result.get('reason')}")
|
|
92
|
+
|
|
93
|
+
if article := result.get("article", {}):
|
|
94
|
+
ta = ArticleRenderer(article["article_results"]["result"]).render()
|
|
95
|
+
return TwitterTweet(tweet_id=tweet_id, article=ta)
|
|
96
|
+
|
|
97
|
+
if note_tweet := result.get("note_tweet"):
|
|
98
|
+
full_text = note_tweet.get("note_tweet_results", {}).get("result", {}).get("text", None)
|
|
99
|
+
if not full_text:
|
|
100
|
+
full_text = legacy.get("full_text", "")
|
|
101
|
+
else:
|
|
102
|
+
full_text = legacy.get("full_text", "")
|
|
103
|
+
|
|
104
|
+
media = legacy["entities"].get("media", [])
|
|
105
|
+
media_list = []
|
|
106
|
+
for i in media:
|
|
107
|
+
original_info = i.get("original_info", {})
|
|
108
|
+
height = original_info.get("height", 0)
|
|
109
|
+
width = original_info.get("width", 0)
|
|
110
|
+
media_url_https = i["media_url_https"]
|
|
111
|
+
|
|
112
|
+
match i["type"]:
|
|
113
|
+
case "photo":
|
|
114
|
+
media_list.append(
|
|
115
|
+
TwitterPhoto(
|
|
116
|
+
url=self._build_img_url(media_url_https, "orig"),
|
|
117
|
+
width=width,
|
|
118
|
+
height=height,
|
|
119
|
+
thumb_url=self._build_img_url(media_url_https, "small"),
|
|
120
|
+
)
|
|
121
|
+
)
|
|
122
|
+
case "video":
|
|
123
|
+
video_info = i.get("video_info", {})
|
|
124
|
+
media_list.append(
|
|
125
|
+
TwitterVideo(
|
|
126
|
+
url=video_info["variants"][-1]["url"],
|
|
127
|
+
height=height,
|
|
128
|
+
width=width,
|
|
129
|
+
duration_millis=video_info.get("duration_millis", 0),
|
|
130
|
+
thumb_url=self._build_img_url(media_url_https, "medium"),
|
|
131
|
+
)
|
|
132
|
+
)
|
|
133
|
+
case "animated_gif":
|
|
134
|
+
media_list.append(
|
|
135
|
+
TwitterAni(
|
|
136
|
+
url=i["video_info"]["variants"][-1]["url"],
|
|
137
|
+
height=height,
|
|
138
|
+
width=width,
|
|
139
|
+
thumb_url=self._build_img_url(media_url_https, "small"),
|
|
140
|
+
)
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
return TwitterTweet(tweet_id=tweet_id, full_text=full_text, media=media_list or None)
|
|
144
|
+
|
|
145
|
+
@staticmethod
|
|
146
|
+
def _build_img_url(url: str, size: Literal["orig", "large", "medium", "small", "thumb"]):
|
|
147
|
+
p = "&" if "?" in url else "?"
|
|
148
|
+
return f"{url}{p}name={size}"
|
|
149
|
+
|
|
150
|
+
@staticmethod
|
|
151
|
+
def get_id_by_url(url: str):
|
|
152
|
+
return re.search(r"status/(\d+)", url)[1]
|
|
153
|
+
|
|
154
|
+
async def get_guest_token(self, url: str):
|
|
155
|
+
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
|
156
|
+
response = await client.post(url)
|
|
157
|
+
response.raise_for_status()
|
|
158
|
+
guest_token = re.search(r'cookie="gt=(\d+);', response.text)
|
|
159
|
+
if not guest_token:
|
|
160
|
+
raise Exception("error -5: 获取 guest_token 失败")
|
|
161
|
+
return guest_token[1]
|
|
162
|
+
|
|
163
|
+
def check_cookie(self):
|
|
164
|
+
if not self.cookie.get("ct0"):
|
|
165
|
+
logger.warning("cookie 缺少必要参数: ct0")
|
|
166
|
+
return False
|
|
167
|
+
if not self.cookie.get("auth_token"):
|
|
168
|
+
logger.warning("cookie 缺少必要参数: auth_token")
|
|
169
|
+
return False
|
|
170
|
+
return True
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
class TwitterTweet:
|
|
174
|
+
def __init__(
|
|
175
|
+
self,
|
|
176
|
+
tweet_id: str,
|
|
177
|
+
full_text: str | None = None,
|
|
178
|
+
media: list[Union["TwitterVideo", "TwitterPhoto", "TwitterAni"]] | None = None,
|
|
179
|
+
article: Union["TwitterArticle"] = None,
|
|
180
|
+
):
|
|
181
|
+
self.tweet_id = tweet_id
|
|
182
|
+
self.full_text = re.sub(r"https://t\.co/[^\s,]+$", "", full_text) if media else full_text
|
|
183
|
+
self.media = media
|
|
184
|
+
self.article = article
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
@dataclass
class TwitterArticle:
    """A Twitter article rendered to Markdown, as produced by ``ArticleRenderer.render``."""

    # Article title.
    title: str
    # Markdown body of the article.
    content: str
    # Photos and videos collected from the article, or None when there are none.
    media: list[Union["TwitterVideo", "TwitterPhoto"]] | None = None
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
@dataclass
class TwitterVideo:
    """A video attached to a tweet or article."""

    # URL of the video file.
    url: str
    height: int
    width: int
    # Duration as reported by the API; the field name indicates milliseconds.
    duration_millis: int
    # URL of a preview image, if any.
    thumb_url: str | None = None
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
@dataclass
class TwitterPhoto:
    """A photo attached to a tweet or article."""

    # URL of the image.
    url: str
    height: int
    width: int
    # URL of a smaller rendition of the image, if any.
    thumb_url: str | None = None
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
@dataclass
class TwitterAni:
    """An animated GIF attached to a tweet (media type ``animated_gif``)."""

    # URL taken from the item's video variants.
    url: str
    height: int
    width: int
    # URL of a preview image, if any.
    thumb_url: str | None = None
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
class _Insertion(NamedTuple):
    """A Markdown marker waiting to be inserted into a block's original text."""

    # Character offset in the block text where the marker goes.
    idx: int
    # The Markdown text to insert (or to substitute, for "atomic" insertions).
    text: str
    kind: str  # "start" | "end" | "atomic"
    # Length of the range this marker belongs to; used as a sort tie-breaker.
    length: int = 0
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
class ArticleRenderer:
    """Parse Twitter Article JSON and render it as Markdown.

    Args:
        article_data: The article result dict. ``render`` reads its
            ``content_state``, ``title``, ``cover_media`` and
            ``media_entities`` keys.
    """

    # Inline style name -> Markdown marker placed on both sides of the range.
    _INLINE_STYLES: dict[str, str] = {
        "Bold": "**",
        "Italic": "*",
        "Strikethrough": "~~",
    }

    # Block type -> function formatting the block's already-rendered text.
    _BLOCK_FORMATTERS: dict[str, callable] = {
        "header-one": lambda t: f"# {t}",
        "header-two": lambda t: f"## {t}",
        "header-three": lambda t: f"### {t}",
        "blockquote": lambda t: "\n".join(f"> {line}" for line in t.split("\n")),
        "ordered-list-item": lambda t: f"1. {t}",
        "unordered-list-item": lambda t: f"- {t}",
    }

    def __init__(self, article_data: dict):
        self._data = article_data
        # media_id -> {"type": ..., urls...}; looked up when rendering MEDIA entities.
        self._media_dict: dict = {}
        # Media objects collected for the returned TwitterArticle.
        self._media_result: list[TwitterPhoto | TwitterVideo] = []

    # ── Public entry point ────────────────────

    def render(self) -> "TwitterArticle":
        """Render the article to Markdown.

        Returns:
            A ``TwitterArticle`` whose content is the cover image (if any)
            followed by every block, joined with blank lines.
        """
        content_state = self._data.get("content_state", {})
        blocks = content_state.get("blocks", [])
        entity_map = {str(item["key"]): item["value"] for item in content_state.get("entityMap", [])}
        title = self._data.get("title", "")

        # Start from a clean slate so a second render() call does not
        # accumulate duplicate media entries.
        self._media_dict = {}
        self._media_result = []
        self._parse_media_entities()

        # ``or {}`` tolerates keys that are present but null.
        cover_media = self._data.get("cover_media") or {}
        cover_url = (cover_media.get("media_info") or {}).get("original_img_url", "")

        md_lines: list[str] = []
        if cover_url:
            md_lines.append(f"\n")
        md_lines.extend(self._render_block(block, entity_map) for block in blocks)

        return TwitterArticle(
            title=title,
            content="\n\n".join(md_lines),
            media=self._media_result or None,
        )

    # ── Media parsing ─────────────────────────

    def _parse_media_entities(self) -> None:
        """Index every ``media_entities`` item of type ApiImage / ApiVideo."""
        for media in self._data.get("media_entities", []):
            media_id = media.get("media_id")
            media_info = media.get("media_info", {})
            typename = media_info.get("__typename")

            if typename == "ApiImage":
                self._parse_image(media_id, media_info)
            elif typename == "ApiVideo":
                self._parse_video(media_id, media_info)

    def _parse_image(self, media_id, info: dict) -> None:
        """Register an image; ignored when the id or the URL is missing."""
        url = info.get("original_img_url", "")
        if media_id and url:
            self._media_dict[media_id] = {"type": "image", "url": url}
            self._media_result.append(
                TwitterPhoto(
                    url=url,
                    height=info.get("original_img_height", 0),
                    width=info.get("original_img_width", 0),
                )
            )

    def _parse_video(self, media_id, info: dict) -> None:
        """Register a video; ignored when the id or the preview image URL is missing."""
        preview = info.get("preview_image", {})
        preview_url = preview.get("original_img_url", "")
        video_url = self._best_mp4_url(info.get("variants", []))

        if media_id and preview_url:
            self._media_dict[media_id] = {
                "type": "video",
                "preview_url": preview_url,
                "video_url": video_url,
            }
            # NOTE(review): ``video_url`` may be "" when no MP4 variant exists;
            # the entry is still recorded, as before -- confirm this is wanted.
            self._media_result.append(
                TwitterVideo(
                    url=video_url,
                    height=preview.get("original_img_height", 0),
                    width=preview.get("original_img_width", 0),
                    duration_millis=info.get("duration_millis", 0),
                    thumb_url=preview_url,
                )
            )

    @staticmethod
    def _best_mp4_url(variants: list) -> str:
        """Return the URL of the ``video/mp4`` variant with the highest ``bit_rate``, or ""."""
        mp4s = [v for v in variants if v.get("content_type") == "video/mp4"]
        if not mp4s:
            return ""
        return max(mp4s, key=lambda v: v.get("bit_rate", 0)).get("url", "")

    # ── Block rendering ───────────────────────

    def _render_block(self, block: dict, entity_map: dict) -> str:
        """Render one content block: apply inline markers, then the block-level formatter."""
        b_type = block.get("type", "unstyled")
        text = block.get("text", "")

        insertions = self._collect_inline_styles(block)
        insertions += self._collect_entities(block, entity_map)

        final_text = self._apply_insertions(text, self._order_insertions(insertions))
        formatter = self._BLOCK_FORMATTERS.get(b_type)
        return formatter(final_text) if formatter else final_text

    @staticmethod
    def _collect_inline_styles(block: dict) -> "list[_Insertion]":
        """Build start/end markers for every supported entry of ``inlineStyleRanges``."""
        result = []
        for style in block.get("inlineStyleRanges", []):
            marker = ArticleRenderer._INLINE_STYLES.get(style["style"])
            if not marker:
                # Unsupported style: leave the text untouched.
                continue
            offset, length = style["offset"], style["length"]
            result.append(_Insertion(offset, marker, "start", length))
            result.append(_Insertion(offset + length, marker, "end", length))
        return result

    def _collect_entities(self, block: dict, entity_map: dict) -> "list[_Insertion]":
        """Build markers for LINK, MEDIA and DIVIDER entries of ``entityRanges``."""
        result = []
        for ent in block.get("entityRanges", []):
            offset, length = ent["offset"], ent["length"]
            ent_data = entity_map.get(str(ent["key"]), {})
            ent_type = ent_data.get("type")

            if ent_type == "LINK":
                url = ent_data.get("data", {}).get("url", "")
                result.append(_Insertion(offset, "[", "start", length))
                result.append(_Insertion(offset + length, f"]({url})", "end", length))

            elif ent_type == "MEDIA":
                md = self._media_entity_to_md(ent_data)
                if md:
                    result.append(_Insertion(offset, md, "atomic", length))

            elif ent_type == "DIVIDER":
                result.append(_Insertion(offset, "\n---\n", "atomic", length))

        return result

    def _media_entity_to_md(self, ent_data: dict) -> str:
        """Return Markdown for the first media item of a MEDIA entity, or "" if unknown."""
        media_items = ent_data.get("data", {}).get("mediaItems", [])
        if not media_items:
            return ""
        obj = self._media_dict.get(media_items[0].get("mediaId"))
        if not obj:
            return ""

        if obj["type"] == "image":
            return f""
        if obj["type"] == "video":
            p, v = obj["preview_url"], obj["video_url"]
            # Link the preview to the video when a video URL is known.
            return f"[]({v})" if v else f""
        return ""

    # ── Text assembly ─────────────────────────

    @staticmethod
    def _insertion_sort_key(ins: "_Insertion", seq: int = 0) -> tuple:
        """Sort key giving the order in which insertions must be applied.

        Insertions are applied from the highest offset down, so earlier
        offsets stay valid. Because each insert at an offset lands *before*
        what was inserted there earlier, the text order at one offset is the
        reverse of the application order. The key therefore applies:

        * "start" markers first, shortest range first (inner opens last in text);
        * "atomic" substitutions next;
        * "end" markers last, longest range first (inner closes first in text).

        ``seq`` is the position in which the insertion was collected. It
        breaks ties between identical ranges so that the range opened
        outermost is also closed outermost.
        """
        if ins.kind == "start":
            return -ins.idx, -1, ins.length, -seq
        if ins.kind == "end":
            return -ins.idx, 1, -ins.length, seq
        return -ins.idx, 0, ins.length, seq

    @staticmethod
    def _order_insertions(insertions: list) -> list:
        """Return ``insertions`` sorted into application order (see ``_insertion_sort_key``)."""
        ranked = sorted(
            enumerate(insertions),
            key=lambda pair: ArticleRenderer._insertion_sort_key(pair[1], pair[0]),
        )
        return [ins for _, ins in ranked]

    @staticmethod
    def _apply_insertions(text: str, insertions: "list[_Insertion]") -> str:
        """Apply already-ordered insertions to ``text`` and return the result.

        An "atomic" insertion replaces the character at its offset; if the
        offset is past the end of the text it is inserted instead. All other
        kinds are inserted at their offset.
        """
        chars = list(text)
        for ins in insertions:
            if ins.kind == "atomic" and ins.idx < len(chars):
                chars[ins.idx] = ins.text
            else:
                chars.insert(ins.idx, ins.text)
        return "".join(chars)
|
|
@@ -10,104 +10,90 @@ from urllib.parse import parse_qs, urlparse
|
|
|
10
10
|
import httpx
|
|
11
11
|
from markdownify import MarkdownConverter
|
|
12
12
|
|
|
13
|
+
# TODO: 逆向 EP 和 DATA
|
|
13
14
|
V4_EP = (
|
|
14
|
-
"
|
|
15
|
-
"
|
|
16
|
-
"
|
|
15
|
+
"CFcLOAE8E7Ew0J7yxtc9hPtklLIOym8yh1eU5jpB6D0M86gJERnbWbE7wPEWM95v8cWsxACqGq7iU"
|
|
16
|
+
"OEnrD2ODeFIj5VZdvbD3zhhOgT4FB6QfskCkuCN+JP/+aLz0rg/B+c/9fd5513ESuZxFVqUmrwe/v"
|
|
17
|
+
"jqZh5nS6Bsyt50VN8="
|
|
17
18
|
)
|
|
18
19
|
V4_DATA = (
|
|
19
|
-
"
|
|
20
|
-
"
|
|
21
|
-
"
|
|
22
|
-
"
|
|
23
|
-
"
|
|
24
|
-
"
|
|
25
|
-
"
|
|
26
|
-
"
|
|
27
|
-
"
|
|
28
|
-
"
|
|
29
|
-
"
|
|
30
|
-
"
|
|
31
|
-
"
|
|
32
|
-
"
|
|
33
|
-
"
|
|
34
|
-
"
|
|
35
|
-
"
|
|
36
|
-
"
|
|
37
|
-
"
|
|
38
|
-
"
|
|
39
|
-
"
|
|
40
|
-
"
|
|
41
|
-
"
|
|
42
|
-
"
|
|
43
|
-
"
|
|
44
|
-
"
|
|
45
|
-
"
|
|
46
|
-
"
|
|
47
|
-
"
|
|
48
|
-
"
|
|
49
|
-
"
|
|
50
|
-
"
|
|
51
|
-
"
|
|
52
|
-
"
|
|
53
|
-
"
|
|
54
|
-
"
|
|
55
|
-
"
|
|
56
|
-
"
|
|
57
|
-
"
|
|
58
|
-
"
|
|
59
|
-
"
|
|
60
|
-
"
|
|
61
|
-
"
|
|
62
|
-
"
|
|
63
|
-
"
|
|
64
|
-
"
|
|
65
|
-
"
|
|
66
|
-
"
|
|
67
|
-
"
|
|
68
|
-
"
|
|
69
|
-
"
|
|
70
|
-
"
|
|
71
|
-
"
|
|
72
|
-
"
|
|
73
|
-
"
|
|
74
|
-
"
|
|
75
|
-
"
|
|
76
|
-
"
|
|
77
|
-
"
|
|
78
|
-
"
|
|
79
|
-
"
|
|
80
|
-
"
|
|
81
|
-
"
|
|
82
|
-
"
|
|
83
|
-
"
|
|
84
|
-
"
|
|
85
|
-
"
|
|
86
|
-
"
|
|
87
|
-
"
|
|
88
|
-
"
|
|
89
|
-
"
|
|
90
|
-
"
|
|
91
|
-
"
|
|
92
|
-
"
|
|
93
|
-
"
|
|
94
|
-
"
|
|
95
|
-
"
|
|
96
|
-
"7648196573126273cccc79f2eac32ab68d72cc0f7170feca9c9726af9d65962"
|
|
97
|
-
"663d5281372386ec88bd2fa82316f687535ecd39f00658523708ca4785529f5"
|
|
98
|
-
"93baf100597ed00c15ae8ff87baa295871680b4096ac03a550f0f015297198b"
|
|
99
|
-
"1a93f38cfefbeceabc099c1026664d77f616b4f069cf8bf53d2684b9a4d933c"
|
|
100
|
-
"3c65a3aef21559527bfc6586e0247efa244a0a355b43751bc09be8012699468"
|
|
101
|
-
"a8c332d60b11bb4881bf56b92ead10e059ac40f83a4d6725cacbc1bb307c839"
|
|
102
|
-
"c4edc8b5484b9e2935842e867e739223f2eaaaff04d9701cfa49e3f80be4f2d"
|
|
103
|
-
"1b7e8eb76fd7f33dfa79831f75ee65a75b7c7fff98254818f1ab77bca856656"
|
|
104
|
-
"4d48e0012733dd426bf841f27f960394b1bacb8a3e36b96c41d751584cd580f"
|
|
105
|
-
"ef1b6a8bf990487268348f682a27549ecbb9674b14f2fc97f203f3468f248ec"
|
|
106
|
-
"3cf5171aa5e8a8d31a9a433c4f7644736aaf6695b28771fe66b4736e3afb322"
|
|
107
|
-
"11ad534b05641600d2cdc79a251fc4c4e5540df9a40aaad329fedd49a429b20"
|
|
108
|
-
"70e1345a4146c297ee2a03f056675054e83207d17de21242032c30398259440"
|
|
109
|
-
"84e60cbd70eb4c469859824cd7d04340de0d19e614a0826a63c63e15c3372b1"
|
|
110
|
-
"7515d4b6951ff6c612f65c3e6538fd0515bcb4814bb641fca5a45c7dae9"
|
|
20
|
+
"7ccf4483919143daa17cca371b849651ab10c58aa97415e3fcc9b2f4c0bc776844997f4059"
|
|
21
|
+
"512c213b3cc965e84693188b08f1ddb8924922598173e0cfa0bab40f242bcd20e11c728da7"
|
|
22
|
+
"5a75d64b75d4070affa0d64831d0b32efde8c74ac4e6adeef18bbcbd1d21131746d131e30c"
|
|
23
|
+
"8ba5939ea8247e79534f6688fed7545d5060b069e85c19d11c0277ee8015d2a989d84ce1bd"
|
|
24
|
+
"01ed2754a365959496343de0152044cef7db82d0353a091f566253f2f8ca14a192c64b610f"
|
|
25
|
+
"643309079d235d355438c84f566943df3df71c2cc979a68c6f36ce62861d6ddb64874d03f8"
|
|
26
|
+
"b596b1380de9f84a60aff650ec59e4b2427ba7492f541354ee4dfe09b02c7296539978d281"
|
|
27
|
+
"2269a7d37121ba96133b7e2b5fdba4922efc6f4bacd31855ca2604b86096ed5abfa6b87656"
|
|
28
|
+
"8298f4bd75c1de979608714b5f0ec2bd852ec6974f929891cfff70392b0c42c7efd9f53e6b"
|
|
29
|
+
"52541d08d654f85d92b29b553b3ea4de3c0ddf88ea77815871e476d5ba8b61dfbb427e3147"
|
|
30
|
+
"62b58a306479eeeb7831864bb593c91af9c85004e891efe5d495b3d1cb4885996ffeda2d50"
|
|
31
|
+
"7f747be1022544cf6ca1e4663bba30d7e7be129b23c5dd4ee1b56d2c48969eeee5b7b0e062"
|
|
32
|
+
"8cfc0527c5e2880c43a61dd753c72b76a0ec1556cac7682f54f0582b50419dbfaa504a9363"
|
|
33
|
+
"54dcb289d282dfa94ded53d926a4385cf437e35afed207c8ccf9eea2e2d493b645034a79ea"
|
|
34
|
+
"115b5df365cc3c6b160d0de25d5d94efd576538386521cc617058831a39bd9009555fe8bc3"
|
|
35
|
+
"419e1f4c9c51271d3996dd5616d0071d850a36799296abb9084a8a6b406f62341ebe581d50"
|
|
36
|
+
"11029c18e88074a2cd7e9fb6be16b948da4d696c624412a8adb4651af89e43db779ed90114"
|
|
37
|
+
"001c7ad552a6baf80447c751c39ce85ce713a661dd7b67be37aa749b46d8827b2187401e8c"
|
|
38
|
+
"3e26a5993b654d3b7e6a6323a512a00f925f887d7ce231f20788d999c527b63160b6b1893a"
|
|
39
|
+
"5891ab183760ca28c95232c164563857a98b963838d385b9638295ded7b69eeb7a43185463"
|
|
40
|
+
"d2278bd59409f5badc24abffcf5cab137f93d89657992b72c340d1a87ddec55a828d33857d"
|
|
41
|
+
"ae8b27fc0aad082e14cd8ef294938dddd095f11dd842f94aa055f3b0ba880cc87771f0d61d"
|
|
42
|
+
"cdc419027c010afb23d668b337cf63ce8359f51623326a81e7513beebfd98d3531b8c701b4"
|
|
43
|
+
"cc58b42937245244228fcefe0c74b491e765e98ec0f71814788c347b5340163aaa8aae7c97"
|
|
44
|
+
"332acb3270583f0d77c15c3216696ad4951e24a19107fd5fe150fc275198fe4c9794f2785d"
|
|
45
|
+
"a3b0b840ebfe75e823b997f0d2eda75f5debbeced24462f1b976e5fc9d643858143d1b0ee4"
|
|
46
|
+
"6dc3936991f50b5d9d7040a5d9f1cb202fbbc06420cdee16fafc0a6929789088ce8e695332"
|
|
47
|
+
"b0178a64761a352b15d87aa3a40529febc881d46a3ae80933e407fc2b28c5e0771dd426b02"
|
|
48
|
+
"1cf177e2ef53c94a0cc5fcc83212843955af3e5f3bb8b24e9ed121669dadd689d54644b507"
|
|
49
|
+
"1581b0e882d4513220cdf1fd5345b76d1fe1d824357bf3acd8a1c58d4bfb4fe3f39922f72e"
|
|
50
|
+
"2eb9a74ee4b5f248bf7e279569597f45ef0e7fcbefa2619dcf367fe3638cc93fe90583a72e"
|
|
51
|
+
"4190729c8c5ab6dd6fb6a37b43eaa90c2e25530ac9d9e923492037f1f14c0da73e4968391f"
|
|
52
|
+
"c96fe10e2bfbefd620bbd6ea4e948cf04d6219e2c32ee6875cb0c2515b3a9ff993438412d3"
|
|
53
|
+
"b1b71ba4c50ea98216b50778a1c909cbb7802acc8348aad6a9118a91a9be87f8610a1ab363"
|
|
54
|
+
"ba06beb726e0a5ce56820e6baf9de2d87a10ce1d5cdd2d94c9e0bab0a3b7b8809d52dd3926"
|
|
55
|
+
"873caf244ab322a0f2f4c4d9c119153d0b3105c8321dd30378b5345418c5a509fe731aef31"
|
|
56
|
+
"7b156cdd606d71b291954181fc3efd71467d809b90d2b02a876ddbe7c758c3189ff6ecca21"
|
|
57
|
+
"44b2a63ef949d7b8b643e3ca7a20c2e5c843e6e34f0260d3963982510a8c077dd7f47158de"
|
|
58
|
+
"ee71befbab650ef1fad54a622bf4d1c297d9a39995fb1420bdba52d20a939b2da9ec3d8a13"
|
|
59
|
+
"b156a597f9de8a683ad68a5725a3d2afbdfdbf9c024793558ba6bbd1f6d5f520988358f6d8"
|
|
60
|
+
"02c0ea8580d4f93218d729cf3bbec52e6224175a0f37dd5bb4901ec5efaa6625c6b6c3b452"
|
|
61
|
+
"752584d2e634fdee181ef7772857de3831725a6bbb6a22c29a4ddee5e8d1bf5c9aebc1b863"
|
|
62
|
+
"5ee14584163dae9d4fb2c28be4220a23bb889d1965b870c32273b0166f3195b22cb85fc570"
|
|
63
|
+
"fb3b13335c49792aaef7b675135a5ced82efe0c36713d7b40123254a7cb0099139bc6634c3"
|
|
64
|
+
"c1af20595392a6436b192b8e6bb43038a33dff4d22f6f11497cbcb5662e11f2d1510a77b61"
|
|
65
|
+
"0d1150b15a76b6c916767f1f7f0883db4a0f7b96e9d9b0884249f965212ec1cb54056ee26d"
|
|
66
|
+
"a2a883f29acdfc7040d4e2e99c4ffd42a8bb1c7852cb5b4c758cdc295baaf973eebd6e720c"
|
|
67
|
+
"bf0bd6b30ad4a7133929e4b1223c4a579dc1dde1f4fdc1fec5a83c0e3d5335f2dc79e57efc"
|
|
68
|
+
"74f64b4d69d0151d4025ee5392fd844f783e2c614903e0b3685362f142fa091dce36382c1d"
|
|
69
|
+
"dc3a6a63815fe062c59e86cad9d26bb54dbf93297ad4ae75039719eddf659c22f0922f08fe"
|
|
70
|
+
"9a2241200f87bfe60f92d9983062d868d5eaced8df5b2851f86b9ee00055d386bf1276ad9b"
|
|
71
|
+
"b27f2fa4b04ca6e773ff7348eb078e7b3b20ac5f878552133a652793f630304d28f1dc8ecb"
|
|
72
|
+
"eedf571f743ffb494c9b34a47df86df8530af4243f0fedfea466c374ff920571a998ebb799"
|
|
73
|
+
"6c9b0ec4ef5780bd519f19106ad1a5b16183bf62cbc0d7d7e4c297df6c0870fd07825d29c9"
|
|
74
|
+
"b51ecdc227efeda8848eaca34a4c65ef35c0d5d3fa6e02f416cf25c84ef054206906e0950e"
|
|
75
|
+
"24250b6e8cbea114c42de785f2ac69204ff675c7bd8f89bb1f683b9adb1c08d73cea3b5cfe"
|
|
76
|
+
"420fa46a893b9b4ba5674c502bebc59d492942af6eef30a09eb9ff94ead00ebc2007702868"
|
|
77
|
+
"63ec52c88a45ec7cbe5414485d28c64112aca5015f1976c2bd772cacb7baa5ae267035c7a1"
|
|
78
|
+
"d9703289821b84ef386f6998777f72f44392f28daa1dc23d26445ed5ca382405ae8b2b47a0"
|
|
79
|
+
"06d56a040b55c6796328ace7d8faa040d3009e5b627e12c30ec6c02bff8de7173b9f393320"
|
|
80
|
+
"3e0fb8e06f812ee8ba5a673f3fa31c27e5309a3f7e0a8a55829c0f5c8c7433bbc4db4cfce9"
|
|
81
|
+
"aea6f37058dd0bcaef20b54546466bdef7b5f69745d4d4ba59c61bc64fd4202f9ce95cc8e1"
|
|
82
|
+
"a56273db05551b6de959c5e2d5f2ccc6d9893d99e48a1ff043889c5bdcb96512ccff7237bd"
|
|
83
|
+
"95fd344d3dd46e8d19743a65cde0aeace9ec6563f4c5d2a1dd6e72a32b48dc9444246d6d9e"
|
|
84
|
+
"a5a9a8d4216b9e0b41f1e54179c52c9f456dbe6c4e8872627b54d7ca6957a270bac31a98cb"
|
|
85
|
+
"2bacf895f30ed6a508b9bdeb288ccfbf5166cec8535ab73c5fa90b41f4ba5d8a55a7cfb8d9"
|
|
86
|
+
"783e00356ee534676215463f0aa1333b3388c13c8c0f176af6d7d2a01e2dd01cac2eb73574"
|
|
87
|
+
"bd6c0930c412cf12bcb80708706cc94b2b9546621f64547b8543179a203d9d871dfc4d5cd4"
|
|
88
|
+
"8334f42598f62e7c8199782bd605c75dd719c0db51ed801a47938746caf258966fc3132f6c"
|
|
89
|
+
"77b0a97ba78ece0e150fee450a90433d2b8534d276b07e8d4586043de0ffe1af106f026d45"
|
|
90
|
+
"41ad961aea6f69fa92344ed9a93f76f2a9f0f29110a4f0a7bda6a84a46d815c68784ab6685"
|
|
91
|
+
"466059376f0f8866107623c49d59acf60a010c923a73177ea9f58e187bcec2d6feb94a5220"
|
|
92
|
+
"56325e1651b5499fd28c17456a756e171840b7f8f1d6785e3e63d0bb5a690cc148f45ba0b0"
|
|
93
|
+
"6b5e0c8da2c6711a6b5011fdfc57221767bce9925d149f357cfa8f108965f9f6037f9b3bc9"
|
|
94
|
+
"46d90499ec8c40108216ed10eea155cb8d8e7bf76cc17efc1fda962101dc22114ca7b3b39c"
|
|
95
|
+
"44c3345d0e1c525e4cbdc1f49dbb66ad1f5874bb91a577cf66428fa861624febfb03c369d1"
|
|
96
|
+
"9d794544"
|
|
111
97
|
)
|
|
112
98
|
|
|
113
99
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: parsehub
|
|
3
|
-
Version: 2.0.11
|
|
3
|
+
Version: 2.0.12
|
|
4
4
|
Summary: 轻量、异步、开箱即用的社交媒体聚合解析库
|
|
5
5
|
Author-email: 梓澪 <zilingmio@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -98,7 +98,7 @@ print(result)
|
|
|
98
98
|
|
|
99
99
|
| 平台 | 视频 | 图文 | 其他 |
|
|
100
100
|
|:----------------|:--:|:--:|:-----:|
|
|
101
|
-
| **Twitter / X** | ✅ | ✅ | |
|
|
101
|
+
| **Twitter / X** | ✅ | ✅ | 📝 文章 |
|
|
102
102
|
| **Instagram** | ✅ | ✅ | |
|
|
103
103
|
| **YouTube** | ✅ | | 🎵 音乐 |
|
|
104
104
|
| **Facebook** | ✅ | | |
|
|
@@ -1,203 +0,0 @@
|
|
|
1
|
-
import re
|
|
2
|
-
from dataclasses import dataclass
|
|
3
|
-
from typing import Literal, Union
|
|
4
|
-
|
|
5
|
-
import httpx
|
|
6
|
-
from loguru import logger
|
|
7
|
-
|
|
8
|
-
from ..config import GlobalConfig
|
|
9
|
-
from ..types import ParseError
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class Twitter:
|
|
13
|
-
def __init__(self, proxy: str | None = None, cookie: dict = None):
|
|
14
|
-
self.proxy = proxy
|
|
15
|
-
self.authorization = (
|
|
16
|
-
"Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOu"
|
|
17
|
-
"H5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA"
|
|
18
|
-
)
|
|
19
|
-
self.cookie = cookie
|
|
20
|
-
|
|
21
|
-
async def fetch_tweet(self, url: str) -> "TwitterTweet":
|
|
22
|
-
tweet_id = self.get_id_by_url(url)
|
|
23
|
-
headers = {
|
|
24
|
-
"accept-language": "zh-CN,zh;q=0.9",
|
|
25
|
-
"authorization": self.authorization,
|
|
26
|
-
"content-type": "application/json",
|
|
27
|
-
"user-agent": GlobalConfig.ua,
|
|
28
|
-
"x-guest-token": await self.get_guest_token(url),
|
|
29
|
-
"x-twitter-active-user": "yes",
|
|
30
|
-
"x-twitter-client-language": "zh-cn",
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
cookie = None
|
|
34
|
-
if self.cookie and self.check_cookie():
|
|
35
|
-
headers["x-csrf-token"] = self.cookie.get("ct0")
|
|
36
|
-
cookie = self.cookie
|
|
37
|
-
|
|
38
|
-
params = {
|
|
39
|
-
"variables": f'{{"tweetId":"{tweet_id}","withComm'
|
|
40
|
-
f'unity":false,"includePromotedContent":false,"withVoice":false}}',
|
|
41
|
-
"features": '{"creator_subscriptions_tweet_preview_api_enabled":true,'
|
|
42
|
-
'"communities_web_enable_tweet_community_results_fetch":true,'
|
|
43
|
-
'"c9s_tweet_anatomy_moderator_badge_enabled":true,"tweetypie_unmention_optimization_enabled":true,'
|
|
44
|
-
'"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled"'
|
|
45
|
-
":true,"
|
|
46
|
-
'"view_counts_everywhere_api_enabled":true,"longform_notetweets_consumption_enabled":true,'
|
|
47
|
-
'"responsive_web_twitter_article_tweet_consumption_enabled":true,"tweet_awards_web_tipping_enabled":false,'
|
|
48
|
-
'"creator_subscriptions_quote_tweet_preview_enabled":false,"freedom_of_speech_not_reach_fetch_enabled"'
|
|
49
|
-
":true,"
|
|
50
|
-
'"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enable'
|
|
51
|
-
'd":true,'
|
|
52
|
-
'"tweet_with_visibility_results_prefer_gql_media_interstitial_enabled":false,"rweb_video_timestamps_enabled'
|
|
53
|
-
'":true,'
|
|
54
|
-
'"longform_notetweets_rich_text_read_enabled":true,"longform_notetweets_inline_media_enabled":true,'
|
|
55
|
-
'"rweb_tipjar_consumption_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_'
|
|
56
|
-
'phone_label_enabled":false,'
|
|
57
|
-
'"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"responsive_web_graphql_timeline'
|
|
58
|
-
'_navigation_enabled":true,'
|
|
59
|
-
'"responsive_web_enhance_cards_enabled":false}',
|
|
60
|
-
"fieldToggles": '{"withArticleRichContentState":true,"withArticlePlainText":false}',
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
|
64
|
-
response = await client.get(
|
|
65
|
-
"https://api.twitter.com/graphql/kPLTRmMnzbPTv70___D06w/TweetResultByRestId",
|
|
66
|
-
params=params,
|
|
67
|
-
headers=headers,
|
|
68
|
-
cookies=cookie,
|
|
69
|
-
)
|
|
70
|
-
response.raise_for_status()
|
|
71
|
-
return self.parse(response.json())
|
|
72
|
-
|
|
73
|
-
def parse(self, result: dict):
|
|
74
|
-
if e := result.get("errors"):
|
|
75
|
-
raise Exception(f"error -1: {e[0]['message']}")
|
|
76
|
-
|
|
77
|
-
result = result["data"]["tweetResult"].get("result")
|
|
78
|
-
if not result:
|
|
79
|
-
raise ParseError("error -4: 帖子或用户不存在")
|
|
80
|
-
|
|
81
|
-
if tweet := result.get("tweet"):
|
|
82
|
-
tweet_id = tweet.get("rest_id", {})
|
|
83
|
-
legacy: dict = tweet.get("legacy")
|
|
84
|
-
else:
|
|
85
|
-
tweet_id = result.get("rest_id", {})
|
|
86
|
-
legacy = result.get("legacy")
|
|
87
|
-
|
|
88
|
-
if not legacy:
|
|
89
|
-
if result.get("__typename") == "TweetTombstone":
|
|
90
|
-
raise Exception("error -2: 该推文开启了限制, 匿名用户无法查看")
|
|
91
|
-
raise Exception(f"error -3: {result.get('reason')}")
|
|
92
|
-
|
|
93
|
-
if note_tweet := result.get("note_tweet"):
|
|
94
|
-
full_text = note_tweet.get("note_tweet_results", {}).get("result", {}).get("text", None)
|
|
95
|
-
if not full_text:
|
|
96
|
-
full_text = legacy.get("full_text", "")
|
|
97
|
-
else:
|
|
98
|
-
full_text = legacy.get("full_text", "")
|
|
99
|
-
|
|
100
|
-
media = legacy["entities"].get("media", [])
|
|
101
|
-
medias = []
|
|
102
|
-
for i in media:
|
|
103
|
-
original_info = i.get("original_info", {})
|
|
104
|
-
height = original_info.get("height", 0)
|
|
105
|
-
width = original_info.get("width", 0)
|
|
106
|
-
media_url_https = i["media_url_https"]
|
|
107
|
-
|
|
108
|
-
match i["type"]:
|
|
109
|
-
case "photo":
|
|
110
|
-
medias.append(
|
|
111
|
-
TwitterPhoto(
|
|
112
|
-
url=self._build_img_url(media_url_https, "orig"),
|
|
113
|
-
width=width,
|
|
114
|
-
height=height,
|
|
115
|
-
thumb_url=self._build_img_url(media_url_https, "small"),
|
|
116
|
-
)
|
|
117
|
-
)
|
|
118
|
-
case "video":
|
|
119
|
-
video_info = i.get("video_info", {})
|
|
120
|
-
medias.append(
|
|
121
|
-
TwitterVideo(
|
|
122
|
-
url=video_info["variants"][-1]["url"],
|
|
123
|
-
height=height,
|
|
124
|
-
width=width,
|
|
125
|
-
duration_millis=video_info.get("duration_millis", 0),
|
|
126
|
-
thumb_url=self._build_img_url(media_url_https, "medium"),
|
|
127
|
-
)
|
|
128
|
-
)
|
|
129
|
-
case "animated_gif":
|
|
130
|
-
medias.append(
|
|
131
|
-
TwitterAni(
|
|
132
|
-
url=i["video_info"]["variants"][-1]["url"],
|
|
133
|
-
height=height,
|
|
134
|
-
width=width,
|
|
135
|
-
thumb_url=self._build_img_url(media_url_https, "small"),
|
|
136
|
-
)
|
|
137
|
-
)
|
|
138
|
-
|
|
139
|
-
return TwitterTweet(tweet_id=tweet_id, full_text=full_text, media=medias)
|
|
140
|
-
|
|
141
|
-
@staticmethod
|
|
142
|
-
def _build_img_url(url: str, size: Literal["orig", "large", "medium", "small", "thumb"]):
|
|
143
|
-
p = "&" if "?" in url else "?"
|
|
144
|
-
return f"{url}{p}name={size}"
|
|
145
|
-
|
|
146
|
-
@staticmethod
|
|
147
|
-
def get_id_by_url(url: str):
|
|
148
|
-
return re.search(r"status/(\d+)", url)[1]
|
|
149
|
-
|
|
150
|
-
async def get_guest_token(self, url: str):
|
|
151
|
-
async with httpx.AsyncClient(proxy=self.proxy) as client:
|
|
152
|
-
response = await client.post(url)
|
|
153
|
-
response.raise_for_status()
|
|
154
|
-
guest_token = re.search(r'cookie="gt=(\d+);', response.text)
|
|
155
|
-
if not guest_token:
|
|
156
|
-
raise Exception("error -5: 获取 guest_token 失败")
|
|
157
|
-
return guest_token[1]
|
|
158
|
-
|
|
159
|
-
def check_cookie(self):
|
|
160
|
-
if not self.cookie.get("ct0"):
|
|
161
|
-
logger.warning("cookie 缺少必要参数: ct0")
|
|
162
|
-
return False
|
|
163
|
-
if not self.cookie.get("auth_token"):
|
|
164
|
-
logger.warning("cookie 缺少必要参数: auth_token")
|
|
165
|
-
return False
|
|
166
|
-
return True
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
class TwitterTweet:
|
|
170
|
-
def __init__(
|
|
171
|
-
self,
|
|
172
|
-
tweet_id: str,
|
|
173
|
-
full_text: str,
|
|
174
|
-
media: list[Union["TwitterVideo", "TwitterPhoto", "TwitterAni"]],
|
|
175
|
-
):
|
|
176
|
-
self.tweet_id = tweet_id
|
|
177
|
-
self.full_text = re.sub(r"https://t\.co/[^\s,]+$", "", full_text) if media else full_text
|
|
178
|
-
self.media = media
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
@dataclass
|
|
182
|
-
class TwitterVideo:
|
|
183
|
-
url: str
|
|
184
|
-
height: int
|
|
185
|
-
width: int
|
|
186
|
-
duration_millis: int
|
|
187
|
-
thumb_url: str | None = None
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
@dataclass
|
|
191
|
-
class TwitterPhoto:
|
|
192
|
-
url: str
|
|
193
|
-
height: int
|
|
194
|
-
width: int
|
|
195
|
-
thumb_url: str | None = None
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
@dataclass
|
|
199
|
-
class TwitterAni:
|
|
200
|
-
url: str
|
|
201
|
-
height: int
|
|
202
|
-
width: int
|
|
203
|
-
thumb_url: str | None = None
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|