vibesurf 0.1.26__py3-none-any.whl → 0.1.28__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vibe_surf/_version.py +2 -2
- vibe_surf/agents/vibe_surf_agent.py +4 -5
- vibe_surf/browser/agent_browser_session.py +26 -0
- vibe_surf/tools/browser_use_tools.py +168 -1
- vibe_surf/tools/vibesurf_tools.py +425 -3
- vibe_surf/tools/views.py +75 -0
- vibe_surf/tools/website_api/__init__.py +0 -0
- vibe_surf/tools/website_api/douyin/__init__.py +0 -0
- vibe_surf/tools/website_api/douyin/client.py +845 -0
- vibe_surf/tools/website_api/douyin/helpers.py +239 -0
- vibe_surf/tools/website_api/weibo/__init__.py +0 -0
- vibe_surf/tools/website_api/weibo/client.py +846 -0
- vibe_surf/tools/website_api/weibo/helpers.py +997 -0
- vibe_surf/tools/website_api/xhs/__init__.py +0 -0
- vibe_surf/tools/website_api/xhs/client.py +807 -0
- vibe_surf/tools/website_api/xhs/helpers.py +301 -0
- vibe_surf/tools/website_api/youtube/__init__.py +32 -0
- vibe_surf/tools/website_api/youtube/client.py +1179 -0
- vibe_surf/tools/website_api/youtube/helpers.py +420 -0
- {vibesurf-0.1.26.dist-info → vibesurf-0.1.28.dist-info}/METADATA +26 -5
- {vibesurf-0.1.26.dist-info → vibesurf-0.1.28.dist-info}/RECORD +25 -12
- vibesurf-0.1.28.dist-info/licenses/LICENSE +22 -0
- vibesurf-0.1.26.dist-info/licenses/LICENSE +0 -201
- {vibesurf-0.1.26.dist-info → vibesurf-0.1.28.dist-info}/WHEEL +0 -0
- {vibesurf-0.1.26.dist-info → vibesurf-0.1.28.dist-info}/entry_points.txt +0 -0
- {vibesurf-0.1.26.dist-info → vibesurf-0.1.28.dist-info}/top_level.txt +0 -0
Among the new website_api modules, the full text of vibe_surf/tools/website_api/weibo/helpers.py (new file, hunk `@@ -0,0 +1,997 @@`) follows:

```python
import pdb
import random
import time
import re
import json
import html
from typing import Dict, List, Tuple, Optional
from enum import Enum
from urllib.parse import parse_qs, unquote


class SearchType(Enum):
    """Search type enumeration for Weibo"""
    DEFAULT = "1"
    REAL_TIME = "61"
    POPULAR = "60"
    VIDEO = "64"


class TrendingType(Enum):
    """Trending type enumeration for Weibo mobile APIs"""
    TRENDING_LIST = "trending_list"
    HOT_POSTS = "hot_posts"


class TrendingConstants:
    """Constants for Weibo mobile trending APIs"""
    # Trending list API
    TRENDING_CONTAINER_ID = "102803_ctg1_8999_-_ctg1_8999_home"

    # Hot posts API
    HOT_POSTS_CONTAINER_ID = "102803"

    # Common parameters
    OPEN_APP = "0"


def generate_device_id() -> str:
    """Generate a random device ID for Weibo requests"""
    chars = "0123456789abcdef"
    return ''.join(random.choices(chars, k=32))


def create_container_id(search_type: SearchType, keyword: str) -> str:
    """Create container ID for search requests"""
    return f"100103type={search_type.value}&q={keyword}"


def extract_cookies_from_browser(web_cookies: List[Dict]) -> Tuple[str, Dict[str, str]]:
    """Extract and format cookies from browser, filtering only Weibo related cookies"""
    cookie_dict = {}
    cookie_parts = []

    # Weibo domain patterns to filter
    weibo_domains = [
        # '.weibo.com',
        '.weibo.cn',
        # 'm.weibo.cn',
        # 'www.weibo.com'
    ]
    for cookie in web_cookies:
        if 'name' in cookie and 'value' in cookie and 'domain' in cookie:
            domain = cookie['domain']

            # Filter only Weibo related cookies
            if any(wb_domain in domain for wb_domain in weibo_domains):
                name = cookie['name']
                value = cookie['value']
                cookie_dict[name] = value
                cookie_parts.append(f"{name}={value}")

    cookie_string = "; ".join(cookie_parts)
    return cookie_string, cookie_dict
```
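
The cookie and container-id helpers are small pure functions, so they can be exercised directly. A usage sketch, not part of the diff, assuming the wheel is installed so the module imports under the path listed in RECORD; the cookie values are made up:

```python
from vibe_surf.tools.website_api.weibo.helpers import (
    SearchType,
    create_container_id,
    extract_cookies_from_browser,
)

web_cookies = [
    {"name": "SUB", "value": "abc123", "domain": ".weibo.cn"},
    {"name": "tracker", "value": "x", "domain": ".example.com"},  # filtered out
]
cookie_string, cookie_dict = extract_cookies_from_browser(web_cookies)
print(cookie_string)  # SUB=abc123  (only .weibo.cn cookies survive the filter)

print(create_container_id(SearchType.REAL_TIME, "keyword"))
# 100103type=61&q=keyword
```

Note that create_container_id does not URL-encode the keyword; presumably the HTTP client is expected to handle encoding.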

helpers.py, continued:

```python
def extract_mid_from_url(weibo_url: str) -> Optional[str]:
    """Extract mid from Weibo URL"""
    patterns = [
        r'/detail/(\w+)',
        r'mid=(\w+)',
        r'/(\w+)$',
    ]

    for pattern in patterns:
        match = re.search(pattern, weibo_url)
        if match:
            return match.group(1)

    return None


def extract_user_id_from_url(user_url: str) -> Optional[str]:
    """Extract user ID from Weibo user URL"""
    patterns = [
        r'/u/(\d+)',
        r'uid=(\d+)',
        r'/profile/(\d+)',
    ]

    for pattern in patterns:
        match = re.search(pattern, user_url)
        if match:
            return match.group(1)

    return None


def parse_weibo_time(time_str: str) -> Optional[int]:
    """Parse Weibo time string to timestamp"""
    if not time_str:
        return None

    try:
        # Handle relative time like "3分钟前", "1小时前", etc.
        if "分钟前" in time_str:
            minutes = int(re.search(r'(\d+)分钟前', time_str).group(1))
            return int(time.time()) - minutes * 60
        elif "小时前" in time_str:
            hours = int(re.search(r'(\d+)小时前', time_str).group(1))
            return int(time.time()) - hours * 3600
        elif "天前" in time_str:
            days = int(re.search(r'(\d+)天前', time_str).group(1))
            return int(time.time()) - days * 86400
        elif "今天" in time_str:
            return int(time.time())
        elif "昨天" in time_str:
            return int(time.time()) - 86400
        else:
            # Try to parse as timestamp
            return int(time_str)
    except (ValueError, AttributeError):
        return None
```
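
The URL and time parsers are also easy to check in isolation. A sketch with illustrative URLs, not part of the diff:

```python
from vibe_surf.tools.website_api.weibo.helpers import (
    extract_mid_from_url,
    extract_user_id_from_url,
    parse_weibo_time,
)

print(extract_mid_from_url("https://m.weibo.cn/detail/4884357117768155"))
# 4884357117768155  (matched by the /detail/(\w+) pattern)
print(extract_user_id_from_url("https://m.weibo.cn/u/1669879400"))
# 1669879400

# Relative Chinese timestamps resolve against the current clock:
# "3分钟前" ("3 minutes ago") -> now - 3 * 60 seconds.
ts = parse_weibo_time("3分钟前")
```

Worth noting: the final fallback pattern `r'/(\w+)$'` in extract_mid_from_url matches the last path segment of almost any URL, so callers should only pass URLs they already believe point at a post.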

helpers.py, continued:

```python
def extract_image_urls(pics: List[Dict]) -> List[str]:
    """Extract image URLs from Weibo pics data"""
    image_urls = []

    for pic in pics:
        if isinstance(pic, dict):
            # Try different URL fields
            url = pic.get('url') or pic.get('large', {}).get('url') or pic.get('pic_big')
            if url:
                image_urls.append(url)

    return image_urls


def process_weibo_text(text: str) -> str:
    """Process Weibo text content, remove HTML tags and clean up"""
    if not text:
        return ""

    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text


def validate_weibo_data(weibo_data: Dict) -> bool:
    """Validate if weibo data contains required fields"""
    required_fields = ["id", "text", "user"]

    for field in required_fields:
        if field not in weibo_data:
            return False

    return True


def filter_search_result_card(card_list: List[Dict]) -> List[Dict]:
    """
    Filter Weibo search results, only keep card_type=9 data
    """
    note_list: List[Dict] = []

    for card_item in card_list:
        if card_item.get("card_type") == 9:
            note_list.append(card_item)

        # Check card_group for nested items
        card_group = card_item.get("card_group", [])
        for card_group_item in card_group:
            if card_group_item.get("card_type") == 9:
                note_list.append(card_group_item)

    return note_list


def extract_container_params(m_weibocn_params: str) -> Dict[str, str]:
    """Extract container parameters from M_WEIBOCN_PARAMS cookie"""
    try:
        params_dict = parse_qs(unquote(m_weibocn_params))
        return {
            "fid_container_id": params_dict.get("fid", [""])[0],
            "lfid_container_id": params_dict.get("lfid", [""])[0]
        }
    except Exception:
        return {"fid_container_id": "", "lfid_container_id": ""}


def build_image_proxy_url(image_url: str, proxy_host: str = "https://i1.wp.com/") -> str:
    """Build proxied image URL to bypass anti-hotlinking"""
    if not image_url.startswith("http"):
        return image_url

    # Remove https:// prefix
    clean_url = image_url[8:] if image_url.startswith("https://") else image_url[7:]

    # Split URL parts
    url_parts = clean_url.split("/")

    # Reconstruct URL with 'large' for high quality images
    processed_url = ""
    for i, part in enumerate(url_parts):
        if i == 1:  # Insert 'large' after domain
            processed_url += "large/"
        elif i == len(url_parts) - 1:  # Last part (filename)
            processed_url += part
        else:
            processed_url += part + "/"

    return f"{proxy_host}{processed_url}"


def sanitize_filename(filename: str) -> str:
    """Sanitize filename for file system"""
    # Remove invalid characters
    filename = re.sub(r'[<>:"/\\|?*]', '', filename)
    # Remove extra spaces
    filename = re.sub(r'\s+', ' ', filename).strip()
    # Limit length
    if len(filename) > 100:
        filename = filename[:100]

    return filename or "untitled"
```
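
The proxy-URL builder drops the path segment at index 1 and substitutes `large`, which is how it upgrades thumbnails to full-size images. A sketch with an illustrative sinaimg-style URL, not part of the diff:

```python
from vibe_surf.tools.website_api.weibo.helpers import build_image_proxy_url, sanitize_filename

print(build_image_proxy_url("https://wx1.sinaimg.cn/orig/abc.jpg"))
# https://i1.wp.com/wx1.sinaimg.cn/large/abc.jpg

print(sanitize_filename('weibo: "hot" <post>?'))
# weibo hot post
```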

helpers.py, continued:

```python
def extract_render_data(html_content: str) -> Optional[Dict]:
    """Extract render data from Weibo detail page HTML"""
    try:
        match = re.search(r'var \$render_data = (\[.*?\])\[0\]', html_content, re.DOTALL)
        if match:
            render_data_json = match.group(1)
            render_data_dict = json.loads(render_data_json)
            return render_data_dict[0] if render_data_dict else None
    except (json.JSONDecodeError, IndexError):
        pass

    return None


class WeiboError(Exception):
    """Base exception for Weibo API errors"""
    pass


class NetworkError(WeiboError):
    """Network connection error"""
    pass


class DataExtractionError(WeiboError):
    """Data extraction error"""
    pass


class AuthenticationError(WeiboError):
    """Authentication error"""
    pass


class RateLimitError(WeiboError):
    """Rate limit exceeded error"""
    pass


class ContentNotFoundError(WeiboError):
    """Content not found error"""
    pass


class ValidationError(WeiboError):
    """Data validation error"""
    pass
```
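
Nothing in helpers.py raises these exceptions itself; presumably the sibling client.py (also added in this release) does. Since they all subclass WeiboError, a caller can handle them at whatever granularity it needs. A hypothetical sketch, not part of the diff:

```python
from vibe_surf.tools.website_api.weibo.helpers import RateLimitError, WeiboError

def safe_fetch(fetch):
    """`fetch` stands in for any client call that may raise the errors above."""
    try:
        return fetch()
    except RateLimitError:
        return None  # back off and retry later
    except WeiboError:
        return None  # NetworkError, AuthenticationError, etc. all land here
```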

helpers.py, continued:

```python
def extract_redirect_url_from_html(html_content: str) -> Optional[str]:
    """Extract redirect URL from HTML meta refresh or JavaScript redirect"""
    try:
        # Try meta refresh tag
        meta_match = re.search(r'<meta[^>]*http-equiv=["\']refresh["\'][^>]*content=["\'][^"\']*url=([^"\']+)["\']', html_content, re.IGNORECASE)
        if meta_match:
            return html.unescape(meta_match.group(1))

        # Try JavaScript location.replace
        js_match = re.search(r'location\.replace\(["\']([^"\']+)["\']\)', html_content, re.IGNORECASE)
        if js_match:
            return html.unescape(js_match.group(1))

        # Try window.location.href
        js_match2 = re.search(r'window\.location\.href\s*=\s*["\']([^"\']+)["\']', html_content, re.IGNORECASE)
        if js_match2:
            return html.unescape(js_match2.group(1))

    except Exception:
        pass

    return None


def decode_chinese_html(html_content: bytes) -> str:
    """Decode HTML content that might be in GBK or other Chinese encodings"""
    encodings = ['utf-8', 'gbk', 'gb2312', 'gb18030', 'big5']

    for encoding in encodings:
        try:
            return html_content.decode(encoding)
        except UnicodeDecodeError:
            continue

    # If all else fails, try with error handling
    return html_content.decode('utf-8', errors='ignore')


def get_mobile_user_agent() -> str:
    ua_list = [
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (iPad; CPU OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/114.0.5735.99 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (iPad; CPU OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/114.0.5735.124 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 13; SAMSUNG SM-S918B) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/21.0 Chrome/110.0.5481.154 Mobile Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 OPR/99.0.0.0",
        "Mozilla/5.0 (Linux; Android 10; JNY-LX1; HMSCore 6.11.0.302) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.88 HuaweiBrowser/13.0.5.303 Mobile Safari/537.36"
    ]
    return random.choice(ua_list)
```
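
The redirect extractor and the encoding-tolerant decoder are meant to be chained when Weibo answers with an interstitial page. A sketch with a fabricated meta-refresh page, not part of the diff:

```python
from vibe_surf.tools.website_api.weibo.helpers import (
    decode_chinese_html,
    extract_redirect_url_from_html,
)

page = b'<meta http-equiv="refresh" content="0; url=https://m.weibo.cn/detail/123">'
text = decode_chinese_html(page)             # tries utf-8, gbk, gb2312, gb18030, big5
print(extract_redirect_url_from_html(text))  # https://m.weibo.cn/detail/123
```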

helpers.py, continued:

```python
def transform_weibo_post_data(card_data: Dict) -> Optional[Dict]:
    """
    Transform raw Weibo card data into structured post information

    Args:
        card_data: Raw card data from Weibo API

    Returns:
        Structured post information or None if invalid data
    """
    if not isinstance(card_data, dict) or card_data.get("card_type") != 9:
        return None

    mblog = card_data.get("mblog", {})
    if not mblog:
        return None

    user = mblog.get("user", {})
    if not user:
        return None

    try:
        post_info = {
            "mid": mblog.get("id"),
            "text": process_weibo_text(mblog.get("text", "")),
            "created_at": mblog.get("created_at"),
            "source": mblog.get("source"),
            "reposts_count": mblog.get("reposts_count", 0),
            "comments_count": mblog.get("comments_count", 0),
            "attitudes_count": mblog.get("attitudes_count", 0),
            "user": {
                "id": user.get("id"),
                "screen_name": user.get("screen_name"),
                "profile_image_url": user.get("profile_image_url"),
                "followers_count": user.get("followers_count", 0),
                "friends_count": user.get("friends_count", 0),
                "statuses_count": user.get("statuses_count", 0),
            },
            "pics": mblog.get("pics", []),
            "page_info": mblog.get("page_info", {}),  # Video info if present
        }

        # Clean up followers_count if it's a string with suffix
        followers_count = user.get("followers_count", 0)
        if isinstance(followers_count, str):
            # Handle cases like "11.2万"
            if "万" in followers_count:
                try:
                    num_str = followers_count.replace("万", "")
                    post_info["user"]["followers_count"] = int(float(num_str) * 10000)
                except (ValueError, TypeError):
                    post_info["user"]["followers_count"] = 0
            else:
                try:
                    post_info["user"]["followers_count"] = int(followers_count)
                except (ValueError, TypeError):
                    post_info["user"]["followers_count"] = 0

        # Validate essential fields
        if not post_info["mid"] or not post_info["user"]["id"]:
            return None

        return post_info

    except Exception as e:
        # Log error but don't fail completely
        return None
```
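
The followers_count normalization that this function (and every transform below) repeats is simple arithmetic: 万 means 10,000, so a display string like "11.2万" becomes int(11.2 * 10000) = 112000. Extracted as a standalone snippet:

```python
followers = "11.2万"  # Weibo renders large counts with the 万 (10,000) suffix
if "万" in followers:
    followers = int(float(followers.replace("万", "")) * 10000)
print(followers)  # 112000
```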

helpers.py, continued:

```python
def transform_weibo_search_results(api_response: Dict) -> List[Dict]:
    """
    Transform raw Weibo search API response into list of structured posts

    Args:
        api_response: Raw API response from search_posts_by_keyword

    Returns:
        List of structured post information
    """
    if not isinstance(api_response, dict):
        return []

    cards = api_response.get("cards", [])
    if not isinstance(cards, list):
        return []

    # Filter and transform cards
    filtered_cards = filter_search_result_card(cards)
    structured_posts = []

    for card in filtered_cards:
        post_info = transform_weibo_post_data(card)
        if post_info:
            structured_posts.append(post_info)

    return structured_posts
```
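
End to end, the search transform filters cards down to card_type 9 and then structures each mblog. A sketch with a response trimmed to the fields the transform actually reads (values are made up), not part of the diff:

```python
from vibe_surf.tools.website_api.weibo.helpers import transform_weibo_search_results

api_response = {
    "cards": [
        {
            "card_type": 9,
            "mblog": {
                "id": "4884357117768155",
                "text": "<a href='/n/x'>#topic#</a> hello",
                "reposts_count": 3,
                "user": {"id": 1669879400, "screen_name": "demo"},
            },
        },
        {"card_type": 11, "card_group": []},  # dropped by filter_search_result_card
    ]
}
posts = transform_weibo_search_results(api_response)
print(posts[0]["text"])  # "#topic# hello" - HTML tags stripped by process_weibo_text
```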

helpers.py, continued:

```python
def transform_weibo_post_detail(detail_response: Dict) -> Optional[Dict]:
    """
    Transform raw Weibo post detail response into structured post information

    Args:
        detail_response: Raw response from get_post_detail

    Returns:
        Structured post detail information or None if invalid data
    """
    if not isinstance(detail_response, dict):
        return None

    mblog = detail_response.get("mblog", {})
    if not mblog:
        return None

    user = mblog.get("user", {})
    if not user:
        return None

    try:
        post_detail = {
            "mid": mblog.get("id"),
            "text": process_weibo_text(mblog.get("text", "")),
            "created_at": mblog.get("created_at"),
            "source": mblog.get("source"),
            "reposts_count": mblog.get("reposts_count", 0),
            "comments_count": mblog.get("comments_count", 0),
            "attitudes_count": mblog.get("attitudes_count", 0),
            "user": {
                "id": user.get("id"),
                "screen_name": user.get("screen_name"),
                "profile_image_url": user.get("profile_image_url"),
                "followers_count": user.get("followers_count", 0),
                "friends_count": user.get("follow_count", 0),  # Note: different field name
                "statuses_count": user.get("statuses_count", 0),
                "verified": user.get("verified", False),
                "verified_type": user.get("verified_type", 0),
                "verified_reason": user.get("verified_reason", ""),
                "description": user.get("description", ""),
            },
            "pics": mblog.get("pic_ids", []),
            "pic_num": mblog.get("pic_num", 0),
            "page_info": mblog.get("page_info", {}),  # Video info if present
            "is_long_text": mblog.get("isLongText", False),
            "favorited": mblog.get("favorited", False),
            "can_edit": mblog.get("can_edit", False),
            "visible": mblog.get("visible", {}),
            "bid": mblog.get("bid", ""),
            "status_title": mblog.get("status_title", ""),
        }

        # Clean up followers_count if it's a string with suffix
        followers_count = user.get("followers_count", 0)
        if isinstance(followers_count, str):
            # Handle cases like "3800.8万"
            if "万" in followers_count:
                try:
                    num_str = followers_count.replace("万", "")
                    post_detail["user"]["followers_count"] = int(float(num_str) * 10000)
                except (ValueError, TypeError):
                    post_detail["user"]["followers_count"] = 0
            else:
                try:
                    post_detail["user"]["followers_count"] = int(followers_count)
                except (ValueError, TypeError):
                    post_detail["user"]["followers_count"] = 0

        # Process video information if present
        page_info = mblog.get("page_info", {})
        if page_info and page_info.get("type") == "video":
            post_detail["video_info"] = {
                "title": page_info.get("title", ""),
                "page_title": page_info.get("page_title", ""),
                "object_id": page_info.get("object_id", ""),
                "page_url": page_info.get("page_url", ""),
                "duration": page_info.get("media_info", {}).get("duration", 0),
                "video_orientation": page_info.get("video_orientation", ""),
                "urls": page_info.get("urls", {}),
                "cover_image": {
                    "url": page_info.get("page_pic", {}).get("url", ""),
                    "width": page_info.get("page_pic", {}).get("width", ""),
                    "height": page_info.get("page_pic", {}).get("height", ""),
                }
            }

        # Validate essential fields
        if not post_detail["mid"] or not post_detail["user"]["id"]:
            return None

        return post_detail

    except Exception as e:
        # Log error but don't fail completely
        return None


def transform_weibo_comment_data(comment_data: Dict) -> Optional[Dict]:
    """
    Transform raw Weibo comment data into structured comment information

    Args:
        comment_data: Raw comment data from Weibo API

    Returns:
        Structured comment information or None if invalid data
    """
    if not isinstance(comment_data, dict):
        return None

    user = comment_data.get("user", {})
    if not user:
        return None

    try:
        comment_info = {
            "id": comment_data.get("id"),
            "text": process_weibo_text(comment_data.get("text", "")),
            "created_at": comment_data.get("created_at"),
            "source": comment_data.get("source"),
            "floor_number": comment_data.get("floor_number", 0),
            "like_count": comment_data.get("like_count", 0),
            "liked": comment_data.get("liked", False),
            "user": {
                "id": user.get("id"),
                "screen_name": user.get("screen_name"),
                "profile_image_url": user.get("profile_image_url"),
                "followers_count": user.get("followers_count", 0),
                "follow_count": user.get("follow_count", 0),
                "statuses_count": user.get("statuses_count", 0),
                "verified": user.get("verified", False),
                "verified_type": user.get("verified_type", -1),
                "verified_reason": user.get("verified_reason", ""),
                "description": user.get("description", ""),
                "gender": user.get("gender", ""),
            },
            "rootid": comment_data.get("rootid"),
            "disable_reply": comment_data.get("disable_reply", 0),
            "isLikedByMblogAuthor": comment_data.get("isLikedByMblogAuthor", False),
            "bid": comment_data.get("bid", ""),
            # Sub-comments information
            "has_sub_comments": comment_data.get("comments", False),
            "sub_comments_count": comment_data.get("total_number", 0),
        }

        # Clean up followers_count if it's a string with suffix
        followers_count = user.get("followers_count", 0)
        if isinstance(followers_count, str):
            # Handle cases like "115", "11万", etc.
            if "万" in followers_count:
                try:
                    num_str = followers_count.replace("万", "")
                    comment_info["user"]["followers_count"] = int(float(num_str) * 10000)
                except (ValueError, TypeError):
                    comment_info["user"]["followers_count"] = 0
            else:
                try:
                    comment_info["user"]["followers_count"] = int(followers_count)
                except (ValueError, TypeError):
                    comment_info["user"]["followers_count"] = 0

        # Validate essential fields
        if not comment_info["id"] or not comment_info["user"]["id"]:
            return None

        return comment_info

    except Exception as e:
        # Log error but don't fail completely
        return None


def transform_weibo_comments_response(comments_response: Dict) -> List[Dict]:
    """
    Transform raw Weibo comments API response into list of structured comments

    Args:
        comments_response: Raw API response from get_post_comments

    Returns:
        List of structured comment information
    """
    if not isinstance(comments_response, dict):
        return []

    comments_data = comments_response.get("data", [])
    if not isinstance(comments_data, list):
        return []

    structured_comments = []

    for comment in comments_data:
        comment_info = transform_weibo_comment_data(comment)
        if comment_info:
            structured_comments.append(comment_info)

    return structured_comments
```
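
The comment pipeline mirrors the search one, reading from the response's data list instead of cards. A sketch with fabricated values, not part of the diff:

```python
from vibe_surf.tools.website_api.weibo.helpers import transform_weibo_comments_response

comments_response = {
    "data": [
        {
            "id": 123456,
            "text": "nice<span class='url-icon'></span>",
            "like_count": 2,
            "total_number": 1,  # surfaces as sub_comments_count
            "user": {"id": 42, "screen_name": "commenter", "followers_count": "11万"},
        }
    ]
}
comments = transform_weibo_comments_response(comments_response)
print(comments[0]["sub_comments_count"])       # 1
print(comments[0]["user"]["followers_count"])  # 110000
```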

helpers.py, continued:

```python
def transform_weibo_user_info(user_response: Dict) -> Optional[Dict]:
    """
    Transform raw Weibo user info response into structured user information

    Args:
        user_response: Raw response from get_user_info

    Returns:
        Structured user information or None if invalid data
    """
    if not isinstance(user_response, dict):
        return None

    user = user_response.get("user", {})
    if not user or not user.get("id"):
        return None

    try:
        user_info = {
            "id": user.get("id"),
            "screen_name": user.get("screen_name", ""),
            "profile_image_url": user.get("profile_image_url", ""),
            "followers_count": user.get("followers_count", 0),
            "friends_count": user.get("friends_count", 0),
            "statuses_count": user.get("statuses_count", 0),
            "verified": user.get("verified", False),
            "verified_type": user.get("verified_type", -1),
            "verified_reason": user.get("verified_reason", ""),
            "description": user.get("description", ""),
            "gender": user.get("gender", ""),
            "location": user.get("location", ""),
            "created_at": user.get("created_at", ""),
            "profile_url": user.get("profile_url", ""),
            "cover_image_phone": user.get("cover_image_phone", ""),
            "avatar_hd": user.get("avatar_hd", ""),
            # Container and navigation info
            "containerid": user_response.get("containerid", ""),
            "tabs_info": {
                "selected_tab": user_response.get("tabsInfo", {}).get("selectedTab", 1),
                "tabs": []
            }
        }

        # Process tabs information
        tabs = user_response.get("tabsInfo", {}).get("tabs", [])
        for tab in tabs:
            if isinstance(tab, dict):
                tab_info = {
                    "id": tab.get("id"),
                    "tab_key": tab.get("tabKey", ""),
                    "title": tab.get("title", ""),
                    "tab_type": tab.get("tab_type", ""),
                    "containerid": tab.get("containerid", ""),
                    "must_show": tab.get("must_show", 0),
                    "hidden": tab.get("hidden", 0),
                }

                # Add optional fields if present
                if "apipath" in tab:
                    tab_info["apipath"] = tab["apipath"]
                if "headSubTitleText" in tab:
                    tab_info["head_subtitle_text"] = tab["headSubTitleText"]
                if "tab_icon" in tab:
                    tab_info["tab_icon"] = tab["tab_icon"]
                if "tab_icon_dark" in tab:
                    tab_info["tab_icon_dark"] = tab["tab_icon_dark"]
                if "url" in tab:
                    tab_info["url"] = tab["url"]

                user_info["tabs_info"]["tabs"].append(tab_info)

        # Clean up followers_count if it's a string with suffix
        followers_count = user.get("followers_count", 0)
        if isinstance(followers_count, str):
            if "万" in followers_count:
                try:
                    num_str = followers_count.replace("万", "")
                    user_info["followers_count"] = int(float(num_str) * 10000)
                except (ValueError, TypeError):
                    user_info["followers_count"] = 0
            else:
                try:
                    user_info["followers_count"] = int(followers_count)
                except (ValueError, TypeError):
                    user_info["followers_count"] = 0

        return user_info

    except Exception as e:
        # Log error but don't fail completely
        return None


def transform_weibo_user_posts_response(user_posts_response: Dict) -> Optional[Dict]:
    """
    Transform raw Weibo user posts response into structured information

    Args:
        user_posts_response: Raw response from get_user_posts

    Returns:
        Structured user posts information or None if invalid data
    """
    if not isinstance(user_posts_response, dict):
        return None

    user_info = user_posts_response.get("userInfo", {})
    if not user_info:
        return None

    try:
        user_posts_info = {
            "user": {
                "id": user_info.get("id"),
                "screen_name": user_info.get("screen_name", ""),
                "profile_image_url": user_info.get("profile_image_url", ""),
                "followers_count": user_info.get("followers_count", 0),
                "follow_count": user_info.get("follow_count", 0),
                "statuses_count": user_info.get("statuses_count", 0),
                "verified": user_info.get("verified", False),
                "verified_type": user_info.get("verified_type", -1),
                "verified_reason": user_info.get("verified_reason", ""),
                "description": user_info.get("description", ""),
                "gender": user_info.get("gender", ""),
                "profile_url": user_info.get("profile_url", ""),
                "cover_image_phone": user_info.get("cover_image_phone", ""),
                "avatar_hd": user_info.get("avatar_hd", ""),
                "mbtype": user_info.get("mbtype", 0),
                "svip": user_info.get("svip", 0),
                "urank": user_info.get("urank", 0),
                "mbrank": user_info.get("mbrank", 0),
            },
            "style_config": {
                "is_video_cover_style": user_posts_response.get("isVideoCoverStyle", 0),
                "is_star_style": user_posts_response.get("isStarStyle", 0),
            },
            "navigation": {
                "fans_scheme": user_posts_response.get("fans_scheme", ""),
                "follow_scheme": user_posts_response.get("follow_scheme", ""),
                "profile_scheme": user_posts_response.get("scheme", ""),
            },
            "tabs_info": {
                "selected_tab": user_posts_response.get("tabsInfo", {}).get("selectedTab", 1),
                "tabs": []
            },
            "toolbar_menus": [],
            "profile_ext": user_posts_response.get("profile_ext", ""),
            "show_app_tips": user_posts_response.get("showAppTips", 0),
            # Posts data if present
            "posts": [],
            "pagination": {
                "since_id": user_posts_response.get("cardlistInfo", {}).get("since_id", ""),
                "total": user_posts_response.get("cardlistInfo", {}).get("total", 0),
            }
        }

        # Process tabs information
        tabs = user_posts_response.get("tabsInfo", {}).get("tabs", [])
        for tab in tabs:
            if isinstance(tab, dict):
                tab_info = {
                    "id": tab.get("id"),
                    "tab_key": tab.get("tabKey", ""),
                    "title": tab.get("title", ""),
                    "tab_type": tab.get("tab_type", ""),
                    "containerid": tab.get("containerid", ""),
                    "must_show": tab.get("must_show", 0),
                    "hidden": tab.get("hidden", 0),
                }

                # Add optional fields if present
                if "apipath" in tab:
                    tab_info["apipath"] = tab["apipath"]
                if "headSubTitleText" in tab:
                    tab_info["head_subtitle_text"] = tab["headSubTitleText"]
                if "tab_icon" in tab:
                    tab_info["tab_icon"] = tab["tab_icon"]
                if "tab_icon_dark" in tab:
                    tab_info["tab_icon_dark"] = tab["tab_icon_dark"]
                if "url" in tab:
                    tab_info["url"] = tab["url"]

                user_posts_info["tabs_info"]["tabs"].append(tab_info)

        # Process toolbar menus
        toolbar_menus = user_info.get("toolbar_menus", [])
        for menu in toolbar_menus:
            if isinstance(menu, dict):
                menu_info = {
                    "type": menu.get("type", ""),
                    "name": menu.get("name", ""),
                    "params": menu.get("params", {}),
                    "scheme": menu.get("scheme", ""),
                }
                user_posts_info["toolbar_menus"].append(menu_info)

        # Process posts if present in cards
        cards = user_posts_response.get("cards", [])
        if isinstance(cards, list):
            for card in cards:
                if card.get("card_type") == 9:  # Regular post card
                    post_info = transform_weibo_post_data(card)
                    if post_info:
                        user_posts_info["posts"].append(post_info)

        # Clean up followers_count if it's a string with suffix
        followers_count = user_info.get("followers_count", 0)
        if isinstance(followers_count, str):
            if "万" in followers_count:
                try:
                    num_str = followers_count.replace("万", "")
                    user_posts_info["user"]["followers_count"] = int(float(num_str) * 10000)
                except (ValueError, TypeError):
                    user_posts_info["user"]["followers_count"] = 0
            else:
                try:
                    user_posts_info["user"]["followers_count"] = int(followers_count)
                except (ValueError, TypeError):
                    user_posts_info["user"]["followers_count"] = 0

        # Validate essential fields
        if not user_posts_info["user"]["id"]:
            return None

        return user_posts_info

    except Exception as e:
        # Log error but don't fail completely
        return None


def transform_weibo_trending_response(trending_response: Dict) -> List[Dict]:
    """
    Transform raw Weibo trending API response into list of structured posts

    Args:
        trending_response: Raw API response from get_trending_list

    Returns:
        List of structured post information
    """
    if not isinstance(trending_response, dict):
        return []

    statuses = trending_response.get("statuses", [])
    if not isinstance(statuses, list):
        return []

    structured_posts = []

    for status in statuses:
        post_info = transform_weibo_status_data(status)
        if post_info:
            structured_posts.append(post_info)

    return structured_posts


def transform_weibo_status_data(status_data: Dict) -> Optional[Dict]:
    """
    Transform raw Weibo status data into structured post information
    (for trending list and similar direct status responses)

    Args:
        status_data: Raw status data from Weibo API

    Returns:
        Structured post information or None if invalid data
    """
    if not isinstance(status_data, dict):
        return None

    user = status_data.get("user", {})
    if not user:
        return None

    try:
        post_info = {
            "mid": status_data.get("id"),
            "text": process_weibo_text(status_data.get("text", "")),
            "created_at": status_data.get("created_at"),
            "source": status_data.get("source"),
            "reposts_count": status_data.get("reposts_count", 0),
            "comments_count": status_data.get("comments_count", 0),
            "attitudes_count": status_data.get("attitudes_count", 0),
            "user": {
                "id": user.get("id"),
                "screen_name": user.get("screen_name"),
                "profile_image_url": user.get("profile_image_url"),
                "followers_count": user.get("followers_count", 0),
                "friends_count": user.get("follow_count", 0),  # Note: different field name
                "statuses_count": user.get("statuses_count", 0),
                "verified": user.get("verified", False),
                "verified_type": user.get("verified_type", 0),
                "verified_reason": user.get("verified_reason", ""),
                "description": user.get("description", ""),
                "gender": user.get("gender", ""),
                "mbtype": user.get("mbtype", 0),
                "svip": user.get("svip", 0),
                "urank": user.get("urank", 0),
                "mbrank": user.get("mbrank", 0),
            },
            "pics": status_data.get("pic_ids", []),
            "pic_num": status_data.get("pic_num", 0),
            "page_info": status_data.get("page_info", {}),  # Video info if present
            "is_long_text": status_data.get("isLongText", False),
            "favorited": status_data.get("favorited", False),
            "can_edit": status_data.get("can_edit", False),
            "visible": status_data.get("visible", {}),
            "bid": status_data.get("bid", ""),
            "mixed_count": status_data.get("mixed_count", 0),
            "pending_approval_count": status_data.get("pending_approval_count", 0),
            "floor_number": status_data.get("floor_number", 0),
        }

        # Clean up followers_count if it's a string with suffix
        followers_count = user.get("followers_count", 0)
        if isinstance(followers_count, str):
            # Handle cases like "83.2万"
            if "万" in followers_count:
                try:
                    num_str = followers_count.replace("万", "")
                    post_info["user"]["followers_count"] = int(float(num_str) * 10000)
                except (ValueError, TypeError):
                    post_info["user"]["followers_count"] = 0
            else:
                try:
                    post_info["user"]["followers_count"] = int(followers_count)
                except (ValueError, TypeError):
                    post_info["user"]["followers_count"] = 0

        # Process video information if present
        page_info = status_data.get("page_info", {})
        if page_info and page_info.get("type") == "video":
            post_info["video_info"] = {
                "title": page_info.get("title", ""),
                "page_title": page_info.get("page_title", ""),
                "object_id": page_info.get("object_id", ""),
                "page_url": page_info.get("page_url", ""),
                "duration": page_info.get("media_info", {}).get("duration", 0),
                "video_orientation": page_info.get("video_orientation", ""),
                "urls": page_info.get("urls", {}),
                "cover_image": {
                    "url": page_info.get("page_pic", {}).get("url", ""),
                    "width": page_info.get("page_pic", {}).get("width", ""),
                    "height": page_info.get("page_pic", {}).get("height", ""),
                }
            }

        # Validate essential fields
        if not post_info["mid"] or not post_info["user"]["id"]:
            return None

        return post_info

    except Exception as e:
        # Log error but don't fail completely
        return None
```
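
For the mobile trending APIs the flow is the same, except statuses arrive directly rather than wrapped in cards. A final sketch with a fabricated response, not part of the diff:

```python
from vibe_surf.tools.website_api.weibo.helpers import transform_weibo_trending_response

trending_response = {
    "statuses": [
        {
            "id": "4884357117768155",
            "text": "trending post",
            "attitudes_count": 99,
            "user": {"id": 7, "screen_name": "hot"},
        }
    ]
}
for post in transform_weibo_trending_response(trending_response):
    print(post["mid"], post["attitudes_count"])  # 4884357117768155 99
```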