vibesurf 0.1.27__py3-none-any.whl → 0.1.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of vibesurf might be problematic. Click here for more details.
- vibe_surf/_version.py +2 -2
- vibe_surf/backend/shared_state.py +2 -2
- vibe_surf/chrome_extension/config.js +8 -0
- vibe_surf/chrome_extension/scripts/ui-manager.js +146 -0
- vibe_surf/chrome_extension/sidepanel.html +2 -16
- vibe_surf/tools/website_api/__init__.py +0 -0
- vibe_surf/tools/website_api/douyin/__init__.py +0 -0
- vibe_surf/tools/website_api/douyin/client.py +845 -0
- vibe_surf/tools/website_api/douyin/helpers.py +239 -0
- vibe_surf/tools/website_api/weibo/__init__.py +0 -0
- vibe_surf/tools/website_api/weibo/client.py +846 -0
- vibe_surf/tools/website_api/weibo/helpers.py +997 -0
- vibe_surf/tools/website_api/xhs/__init__.py +0 -0
- vibe_surf/tools/website_api/xhs/client.py +807 -0
- vibe_surf/tools/website_api/xhs/helpers.py +301 -0
- vibe_surf/tools/website_api/youtube/__init__.py +32 -0
- vibe_surf/tools/website_api/youtube/client.py +1179 -0
- vibe_surf/tools/website_api/youtube/helpers.py +420 -0
- {vibesurf-0.1.27.dist-info → vibesurf-0.1.29.dist-info}/METADATA +55 -23
- {vibesurf-0.1.27.dist-info → vibesurf-0.1.29.dist-info}/RECORD +24 -11
- {vibesurf-0.1.27.dist-info → vibesurf-0.1.29.dist-info}/WHEEL +0 -0
- {vibesurf-0.1.27.dist-info → vibesurf-0.1.29.dist-info}/entry_points.txt +0 -0
- {vibesurf-0.1.27.dist-info → vibesurf-0.1.29.dist-info}/licenses/LICENSE +0 -0
- {vibesurf-0.1.27.dist-info → vibesurf-0.1.29.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,846 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import json
|
|
3
|
+
import pdb
|
|
4
|
+
import re
|
|
5
|
+
import copy
|
|
6
|
+
import time
|
|
7
|
+
import urllib.parse
|
|
8
|
+
from typing import Dict, List, Optional, Callable, Union, Any
|
|
9
|
+
import httpx
|
|
10
|
+
from tenacity import retry, stop_after_attempt, wait_fixed
|
|
11
|
+
from urllib.parse import parse_qs, unquote, urlencode
|
|
12
|
+
|
|
13
|
+
from vibe_surf.browser.agent_browser_session import AgentBrowserSession
|
|
14
|
+
from vibe_surf.logger import get_logger
|
|
15
|
+
|
|
16
|
+
from .helpers import (
|
|
17
|
+
SearchType, TrendingType, TrendingConstants,
|
|
18
|
+
create_container_id, extract_cookies_from_browser,
|
|
19
|
+
filter_search_result_card, extract_container_params,
|
|
20
|
+
build_image_proxy_url, extract_render_data, process_weibo_text,
|
|
21
|
+
validate_weibo_data, sanitize_filename,
|
|
22
|
+
extract_redirect_url_from_html, decode_chinese_html,
|
|
23
|
+
WeiboError, NetworkError, DataExtractionError,
|
|
24
|
+
AuthenticationError, RateLimitError, ContentNotFoundError,
|
|
25
|
+
get_mobile_user_agent
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
logger = get_logger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class WeiboApiClient:
|
|
32
|
+
"""
|
|
33
|
+
Weibo API client with integrated browser session management.
|
|
34
|
+
This client handles API communication through browser session for authentication.
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
def __init__(self, browser_session: AgentBrowserSession, timeout: int = 60, proxy: Optional[str] = None):
    """
    Create a Weibo API client bound to an existing browser session.

    Only attributes are initialised here; cookies and the real browser
    user agent are filled in later by ``setup()``.

    Args:
        browser_session: Browser session used later to read cookies
        timeout: Request timeout in seconds
        proxy: Proxy URL if needed
    """
    # Base URLs / hosts used when building request URLs
    self._api_base = "https://m.weibo.cn"
    self._web_base = "https://www.weibo.com"
    self._image_proxy_host = "https://i1.wp.com/"

    # Connection settings
    self.timeout = timeout
    self.proxy = proxy

    # Browser binding; target_id stays None until setup() assigns a tab
    self.browser_session = browser_session
    self.target_id = None

    # Headers used for mobile Weibo requests unless a caller supplies its own
    mobile_headers = {}
    mobile_headers["User-Agent"] = get_mobile_user_agent()
    mobile_headers["Origin"] = "https://m.weibo.cn"
    mobile_headers["Referer"] = "https://m.weibo.cn"
    mobile_headers["Content-Type"] = "application/json;charset=UTF-8"
    self.default_headers = mobile_headers

    # Cookie name -> value mapping, populated by setup()
    self.cookies = {}
|
|
62
|
+
|
|
63
|
+
async def setup(self, target_id: Optional[str] = None):
    """
    Prepare the client by attaching to a browser tab and copying its cookies.

    When no ``target_id`` is given, a new tab is opened on https://weibo.com/.
    The browser's cookies and ``navigator.userAgent`` are then copied into
    ``self.cookies`` / ``self.default_headers``. If a target and cookies are
    already present, the call returns immediately.

    Args:
        target_id: Specific browser target ID to use

    Raises:
        AuthenticationError: If any step of the setup fails; the underlying
            exception is attached as ``__cause__``.
    """
    try:
        if self.target_id and self.cookies:
            logger.info("Already setup. Return!")
            return
        if target_id:
            self.target_id = target_id
        else:
            # Open Weibo in a new tab so the session's cookies can be read from it
            self.target_id = await self.browser_session.navigate_to_url(
                "https://weibo.com/", new_tab=True
            )
            await asyncio.sleep(3)  # Wait for page load

        # Extract cookies from browser
        cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=self.target_id)
        result = await asyncio.wait_for(
            cdp_session.cdp_client.send.Storage.getCookies(session_id=cdp_session.session_id),
            timeout=8.0
        )
        web_cookies = result.get('cookies', [])

        cookie_str, cookie_dict = extract_cookies_from_browser(web_cookies)
        self.default_headers["Cookie"] = cookie_str
        self.cookies = cookie_dict

        # Prefer the browser's real user agent over the default mobile one
        user_agent_result = await cdp_session.cdp_client.send.Runtime.evaluate(
            params={
                'expression': "navigator.userAgent",
                'returnByValue': True,
                'awaitPromise': True
            },
            session_id=cdp_session.session_id,
        )
        user_agent = user_agent_result.get('result', {}).get('value')
        if user_agent:
            self.default_headers["User-Agent"] = user_agent

        # NOTE: login state is not verified here. A pong()-based check that
        # redirected to the Weibo SSO login page used to live at this point
        # and is currently disabled.

        logger.info("Weibo client setup completed successfully")

    except Exception as e:
        logger.error(f"Failed to setup Weibo client: {e}")
        # Chain the cause so the original traceback is not lost
        raise AuthenticationError(f"Setup failed: {e}") from e
|
|
129
|
+
|
|
130
|
+
async def pong(self) -> bool:
    """
    Check whether the current session looks logged in.

    Three checks are tried in order; the first positive one wins:

    1. a non-empty login cookie (SUB, SUBP, ALF or SSOLoginState) in
       ``self.cookies``;
    2. the ``/api/config`` endpoint reporting a truthy ``login`` field;
    3. a script evaluated in the browser tab that looks for login cookies,
       login flags in localStorage, or ``window.__INITIAL_STATE__.user.id``.

    Returns:
        True if any check succeeds; False otherwise, including when an
        unexpected error occurs (the error is logged).
    """
    try:
        logger.info("Testing Weibo login status...")

        # Method 1: Check essential login cookies (any one of them is enough)
        login_cookies = ['SUB', 'SUBP', 'ALF', 'SSOLoginState']
        has_essential_cookies = any(
            self.cookies.get(cookie_name) for cookie_name in login_cookies
        )
        if has_essential_cookies:
            logger.info("Weibo login status: Valid (found essential cookies)")
            return True

        # Method 2: Try to access user info API
        try:
            uri = "/api/config"
            # Go through _get_request so the default headers (cookies and
            # user agent) are sent; without them the endpoint cannot
            # reflect this session's login state.
            response_data = await self._get_request(uri)

            if isinstance(response_data, dict) and response_data.get("login"):
                logger.info("Weibo login status: Valid (API check passed)")
                return True
        except Exception as api_error:
            logger.debug(f"API config check failed: {api_error}")

        # Method 3: Check browser localStorage for login indicators
        try:
            cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=self.target_id)
            js_check = """
            (function() {
                try {
                    // Check various login indicators
                    var hasLoginCookie = document.cookie.includes('SUB=') || document.cookie.includes('SUBP=');
                    var hasLoginStorage = localStorage.getItem('login_status') === '1' ||
                                         localStorage.getItem('isLogin') === 'true' ||
                                         localStorage.getItem('weiboLoginStatus') === '1';

                    // Check if there's user info in the page
                    var hasUserInfo = window.__INITIAL_STATE__ &&
                                     window.__INITIAL_STATE__.user &&
                                     window.__INITIAL_STATE__.user.id;

                    return hasLoginCookie || hasLoginStorage || hasUserInfo;
                } catch(e) {
                    return false;
                }
            })()
            """

            result = await cdp_session.cdp_client.send.Runtime.evaluate(
                params={
                    'expression': js_check,
                    'returnByValue': True,
                },
                session_id=cdp_session.session_id,
            )

            browser_login_check = result.get('result', {}).get('value', False)
            if browser_login_check:
                logger.info("Weibo login status: Valid (browser check passed)")
                return True

        except Exception as browser_error:
            logger.debug(f"Browser login check failed: {browser_error}")

        logger.warning("Weibo login status: No valid login indicators found")
        return False

    except Exception as e:
        logger.error(f"Failed to check Weibo login status: {e}")
        return False
|
|
202
|
+
|
|
203
|
+
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1), reraise=True)
async def _make_request(self, method: str, url: str, **kwargs):
    """
    Send one HTTP request and unwrap the Weibo JSON envelope.

    The call is attempted up to 3 times, 1 second apart; any exception
    triggers a retry. ``reraise=True`` makes the last attempt's own exception
    propagate (instead of tenacity's ``RetryError``), so callers can tell the
    error classes below apart.

    Args:
        method: HTTP method
        url: Absolute request URL
        **kwargs: Forwarded to ``httpx.AsyncClient.request``. The extra
            keyword ``raw_response=True`` is consumed here and makes the
            method return the response object instead of parsed JSON.

    Returns:
        The response object when ``raw_response`` is set; otherwise the
        parsed JSON. For a dict with ``"ok" == 1`` only its ``"data"`` value
        (default ``{}``) is returned; a dict without ``"ok"`` and any
        non-dict JSON are returned unchanged.

    Raises:
        AuthenticationError: HTTP 403
        RateLimitError: HTTP 429
        ContentNotFoundError: HTTP 404
        NetworkError: HTTP status >= 500
        DataExtractionError: Body is not valid JSON, or ``"ok"`` is present
            with a value other than 1
    """
    raw_response = kwargs.pop("raw_response", False)

    async with httpx.AsyncClient(proxy=self.proxy, timeout=self.timeout) as client:
        response = await client.request(method, url, **kwargs)

    # Handle common error status codes
    if response.status_code == 403:
        raise AuthenticationError("Access forbidden - may need login or verification")
    elif response.status_code == 429:
        raise RateLimitError("Rate limit exceeded")
    elif response.status_code == 404:
        raise ContentNotFoundError("Content not found")
    elif response.status_code >= 500:
        raise NetworkError(f"Server error: {response.status_code}")

    if raw_response:
        return response

    try:
        data = response.json()
    except json.JSONDecodeError as exc:
        raise DataExtractionError(f"Invalid JSON response: {response.text[:200]}") from exc

    # Only dict payloads carry the Weibo "ok" envelope
    if not isinstance(data, dict):
        return data

    ok_code = data.get("ok")
    if ok_code == 1:  # Success response
        return data.get("data", {})
    if ok_code is None:  # Some endpoints don't return 'ok' field
        return data
    if ok_code == 0:  # Weibo error response
        error_msg = data.get("msg", "Response error")
        logger.error(f"Weibo API error: {error_msg}")
        raise DataExtractionError(error_msg)

    # Any other "ok" value is treated as an unknown error
    error_msg = data.get("msg", "Unknown error")
    logger.error(f"Weibo API unknown error: {error_msg}")
    raise DataExtractionError(error_msg)
|
|
256
|
+
|
|
257
|
+
async def _get_request(self, endpoint: str, params: Optional[Dict] = None, headers: Optional[Dict] = None,
                       **kwargs) -> Dict:
    """
    Issue a GET against ``self._api_base`` + ``endpoint``.

    Args:
        endpoint: Path appended to the API base URL
        params: Query parameters; URL-encoded and appended after ``?`` when non-empty
        headers: Request headers; ``self.default_headers`` is used when omitted or empty
        **kwargs: Passed through to ``_make_request``

    Returns:
        Whatever ``_make_request`` returns
    """
    query_suffix = f"?{urllib.parse.urlencode(params)}" if params else ""
    return await self._make_request(
        "GET",
        f"{self._api_base}{endpoint}{query_suffix}",
        headers=headers if headers else self.default_headers,
        **kwargs,
    )
|
|
271
|
+
|
|
272
|
+
async def _post_request(self, endpoint: str, data: Dict, headers: Optional[Dict] = None) -> Dict:
    """
    Issue a POST against ``self._api_base`` + ``endpoint`` with a JSON body.

    Args:
        endpoint: Path appended to the API base URL
        data: Payload; serialized to compact JSON without ASCII escaping
        headers: Request headers; ``self.default_headers`` is used when omitted or empty

    Returns:
        Whatever ``_make_request`` returns
    """
    request_headers = headers or self.default_headers
    json_payload = json.dumps(data, separators=(",", ":"), ensure_ascii=False)

    # The body is already serialized text, so hand it over as raw content;
    # httpx reserves ``data=`` for form fields and deprecates raw text there.
    return await self._make_request(
        "POST", f"{self._api_base}{endpoint}",
        content=json_payload, headers=request_headers
    )
|
|
281
|
+
|
|
282
|
+
async def search_posts_by_keyword(
        self,
        keyword: str,
        page: int = 1,
        search_type: SearchType = SearchType.DEFAULT,
) -> List[Dict]:
    """
    Search Weibo posts by keyword

    Pages 1 through ``page`` are requested in order and their cards are
    merged. Only top-level cards that carry an ``mblog`` with an ``id`` are
    converted into posts.

    Args:
        keyword: Search keyword
        page: Last page number to fetch (pages start at 1); values below 1
            fetch nothing
        search_type: Search type filter

    Returns:
        List of simplified post information
    """
    endpoint = "/api/container/getIndex"
    container_id = create_container_id(search_type, keyword)

    cards = []
    # Page numbers are 1-based: iterate 1..page inclusive rather than 0..page-1
    for page_num in range(1, page + 1):
        params = {
            "containerid": container_id,
            "page_type": "searchall",
            "page": page_num,
        }

        raw_response = await self._get_request(endpoint, params)
        cards.extend(raw_response.get("cards", []))

    posts = []
    for card in cards:
        mblog = card.get("mblog", {})
        if not mblog.get("id"):
            continue

        user_info = mblog.get("user", {})
        # Strip HTML tags from the post text
        clean_text = re.sub(r"<.*?>", "", mblog.get("text", ""))

        post = {
            "note_id": mblog.get("id"),
            "content": clean_text,
            "created_at": mblog.get("created_at"),
            "liked_count": str(mblog.get("attitudes_count", 0)),
            "comments_count": str(mblog.get("comments_count", 0)),
            "shared_count": str(mblog.get("reposts_count", 0)),
            "ip_location": mblog.get("region_name", "").replace("发布于 ", ""),
            "note_url": f"https://m.weibo.cn/detail/{mblog.get('id')}",
            "user_id": str(user_info.get("id", "")),
            "nickname": user_info.get("screen_name", ""),
            "gender": user_info.get("gender", ""),
            "profile_url": user_info.get("profile_url", ""),
            "avatar": user_info.get("profile_image_url", ""),
        }
        posts.append(post)

    return posts
|
|
340
|
+
|
|
341
|
+
async def get_post_detail(self, mid: str) -> Optional[Dict]:
    """
    Fetch a post's detail page and reduce it to a simplified dict.

    Args:
        mid: Weibo post ID

    Returns:
        Simplified post detail, or None when the page's render data or its
        ``status`` entry is missing
    """
    response = await self._make_request(
        "GET",
        f"{self._api_base}/detail/{mid}",
        headers=self.default_headers,
        raw_response=True,
    )

    # The post lives under "status" in the render data embedded in the HTML
    render_data = extract_render_data(response.text)
    status = render_data.get("status") if render_data else None
    if not status:
        logger.warning(f"Could not extract render data for post {mid}")
        return None

    author = status.get("user", {})
    return {
        "note_id": status.get("id"),
        "content": re.sub(r"<.*?>", "", status.get("text", "")),
        "created_at": status.get("created_at"),
        "liked_count": str(status.get("attitudes_count", 0)),
        "comments_count": str(status.get("comments_count", 0)),
        "shared_count": str(status.get("reposts_count", 0)),
        "ip_location": status.get("region_name", "").replace("发布于 ", ""),
        "note_url": f"https://m.weibo.cn/detail/{status.get('id')}",
        "user_id": str(author.get("id", "")),
        "nickname": author.get("screen_name", ""),
        "gender": author.get("gender", ""),
        "profile_url": author.get("profile_url", ""),
        "avatar": author.get("profile_image_url", ""),
    }
|
|
382
|
+
|
|
383
|
+
async def get_post_comments(
        self,
        mid: str,
        max_id: int = 0,
        max_id_type: int = 0
) -> List[Dict]:
    """
    Fetch one page of comments for a Weibo post.

    Args:
        mid: Weibo post ID
        max_id: Pagination cursor; only sent when greater than 0
        max_id_type: Pagination type parameter

    Returns:
        List of simplified comment dicts (entries without an ``id`` are skipped)
    """
    query = {"id": mid, "mid": mid, "max_id_type": str(max_id_type)}
    if max_id > 0:
        query["max_id"] = str(max_id)

    # Work on a copy so the per-post Referer does not leak into default_headers
    request_headers = copy.deepcopy(self.default_headers)
    request_headers["Referer"] = f"https://m.weibo.cn/detail/{mid}"

    payload = await self._get_request("/comments/hotflow", query, request_headers)

    def simplify(item: Dict) -> Dict:
        author = item.get("user", {})
        return {
            "comment_id": str(item.get("id")),
            "content": re.sub(r"<.*?>", "", item.get("text", "")),
            "created_at": item.get("created_at"),
            "comment_like_count": str(item.get("like_count", 0)),
            "sub_comment_count": str(item.get("total_number", 0)),
            "ip_location": item.get("source", "").replace("来自", ""),
            "parent_comment_id": item.get("rootid", ""),
            "user_id": str(author.get("id", "")),
            "nickname": author.get("screen_name", ""),
            "gender": author.get("gender", ""),
            "profile_url": author.get("profile_url", ""),
            "avatar": author.get("profile_image_url", ""),
        }

    return [simplify(item) for item in payload.get("data", []) if item.get("id")]
|
|
443
|
+
|
|
444
|
+
async def get_all_post_comments(
        self,
        mid: str,
        fetch_interval: float = 1.0,
        include_sub_comments: bool = False,
        progress_callback: Optional[Callable] = None,
        max_comments: int = 1000,
) -> List[Dict]:
    """
    Fetch comments for a post page by page until exhausted or capped.

    Paging follows the ``max_id`` / ``max_id_type`` cursor returned by each
    response and stops when ``max_id`` comes back as 0 (or is absent).

    Args:
        mid: Weibo post ID
        fetch_interval: Pause in seconds between consecutive page requests
        include_sub_comments: Accepted for interface compatibility; this
            method currently does not use it
        progress_callback: Optional awaitable callback invoked as
            ``progress_callback(mid, batch_comments)`` after each page
        max_comments: Maximum comments to fetch

    Returns:
        List of all simplified comments (at most ``max_comments``)
    """
    all_comments = []
    is_end = False
    max_id = -1
    max_id_type = 0

    while not is_end and len(all_comments) < max_comments:
        endpoint = "/comments/hotflow"

        params = {
            "id": mid,
            "mid": mid,
            "max_id_type": str(max_id_type),
        }

        # The cursor is only sent once a previous page has provided one
        if max_id > 0:
            params["max_id"] = str(max_id)

        # Set referer for comment requests
        headers = copy.deepcopy(self.default_headers)
        headers["Referer"] = f"https://m.weibo.cn/detail/{mid}"

        raw_response = await self._get_request(endpoint, params, headers)

        # Extract pagination info from raw response
        max_id = raw_response.get("max_id", 0)
        max_id_type = raw_response.get("max_id_type", 0)
        is_end = max_id == 0

        # Transform to simplified comments
        batch_comments = []
        for comment in raw_response.get("data", []):
            if not comment.get("id"):
                continue

            user_info = comment.get("user", {})
            clean_text = re.sub(r"<.*?>", "", comment.get("text", ""))

            comment_data = {
                "comment_id": str(comment.get("id")),
                "content": clean_text,
                "created_at": comment.get("created_at"),
                "comment_like_count": str(comment.get("like_count", 0)),
                "sub_comment_count": str(comment.get("total_number", 0)),
                "ip_location": comment.get("source", "").replace("来自", ""),
                "parent_comment_id": comment.get("rootid", ""),
                "user_id": str(user_info.get("id", "")),
                "nickname": user_info.get("screen_name", ""),
                "gender": user_info.get("gender", ""),
                "profile_url": user_info.get("profile_url", ""),
                "avatar": user_info.get("profile_image_url", ""),
            }
            batch_comments.append(comment_data)

        # Limit comments if approaching max
        remaining_slots = max_comments - len(all_comments)
        if len(batch_comments) > remaining_slots:
            batch_comments = batch_comments[:remaining_slots]

        if progress_callback:
            await progress_callback(mid, batch_comments)

        all_comments.extend(batch_comments)

        # Throttle only when another page will actually be requested, so the
        # caller is not delayed after the final page.
        if not is_end and len(all_comments) < max_comments:
            await asyncio.sleep(fetch_interval)

    logger.info(f"Fetched {len(all_comments)} comments for post {mid}")
    return all_comments
|
|
532
|
+
|
|
533
|
+
async def get_user_info(self, user_id: str) -> Optional[Dict]:
    """
    Look up a user's profile.

    Args:
        user_id: User ID

    Returns:
        The response's ``userInfo`` dict with an added ``user_id`` key (the
        profile's ``id`` when present, otherwise the requested ``user_id``),
        or None if the request or processing fails
    """
    # Copy the defaults so the profile-specific Referer stays local to this call
    request_headers = copy.deepcopy(self.default_headers)
    request_headers["Referer"] = f"{self._api_base}/u/{user_id}"

    query = {
        "type": "uid",
        "value": user_id,
        "containerid": f"100505{user_id}",  # Standard user profile container
    }

    try:
        response = await self._get_request("/api/container/getIndex", query, request_headers)
        profile = response.get('userInfo', {})
        profile["user_id"] = profile.get("id", user_id)
    except Exception as e:
        logger.error(f"Failed to get user info for {user_id}: {e}")
        return None
    else:
        return profile
|
|
566
|
+
|
|
567
|
+
async def get_user_posts(
        self,
        user_id: str,
        since_id: str = "0",
) -> Optional[Dict]:
    """
    Fetch one page of a user's posts.

    Two requests are made: the first asks for the ``100505<user_id>``
    container and, if the response has ``tabsInfo``, picks the container id
    of the tab whose ``tabKey`` is ``"weibo"``; the second requests that
    container and its cards are converted into posts.

    Args:
        user_id: User ID
        since_id: Pagination parameter (last post ID from previous page)

    Returns:
        Dict with ``posts`` (simplified posts from cards of type 9) and
        ``pagination`` (``since_id`` and ``total`` from ``cardlistInfo``)
    """
    endpoint = "/api/container/getIndex"

    def build_query(container: str) -> Dict:
        return {
            "jumpfrom": "weibocom",
            "type": "uid",
            "value": user_id,
            "containerid": container,
            "since_id": since_id,
        }

    # First request: find the container of the "weibo" tab, if tabs are listed
    container_id = f"100505{user_id}"
    profile = await self._get_request(endpoint, build_query(container_id))
    tabs_info = profile.get("tabsInfo")
    if tabs_info:
        for tab in tabs_info.get("tabs", []):
            if tab.get("tabKey") == "weibo":
                container_id = tab.get("containerid")
                break

    # Second request: the actual post feed
    feed = await self._get_request(endpoint, build_query(container_id))

    posts = []
    for card in feed.get("cards", []):
        if card.get("card_type") != 9:  # 9 = Weibo post card type
            continue
        mblog = card.get("mblog", {})
        if not mblog.get("id"):
            continue

        author = mblog.get("user", {})
        posts.append({
            "note_id": mblog.get("id"),
            "content": re.sub(r"<.*?>", "", mblog.get("text", "")),
            "created_at": mblog.get("created_at"),
            "liked_count": str(mblog.get("attitudes_count", 0)),
            "comments_count": str(mblog.get("comments_count", 0)),
            "shared_count": str(mblog.get("reposts_count", 0)),
            "ip_location": mblog.get("region_name", "").replace("发布于 ", ""),
            "note_url": f"https://m.weibo.cn/detail/{mblog.get('id')}",
            "user_id": str(author.get("id", "")),
            "nickname": author.get("screen_name", ""),
            "gender": author.get("gender", ""),
            "profile_url": author.get("profile_url", ""),
            "avatar": author.get("profile_image_url", ""),
        })

    list_info = feed.get("cardlistInfo", {})
    return {
        "posts": posts,
        "pagination": {
            "since_id": list_info.get("since_id", ""),
            "total": list_info.get("total", 0),
        },
    }
|
|
651
|
+
|
|
652
|
+
async def get_all_user_posts(
        self,
        user_id: str,
        fetch_interval: float = 1.0,
        progress_callback: Optional[Callable] = None,
        max_posts: int = 1000,
) -> List[Dict]:
    """
    Fetch a user's posts page by page until exhausted or capped.

    The container id of the user's "weibo" tab is resolved once up front
    (the same lookup ``get_user_posts`` performs); if no such tab is listed,
    the ``100505<user_id>`` container is used. Paging follows the
    ``since_id`` returned in each response's ``cardlistInfo``.

    Args:
        user_id: User ID
        fetch_interval: Interval between requests in seconds
        progress_callback: Optional awaitable callback invoked as
            ``progress_callback(posts_to_add)`` after each page
        max_posts: Maximum posts to fetch

    Returns:
        List of all simplified user posts (at most ``max_posts``)
    """
    endpoint = "/api/container/getIndex"
    all_posts = []
    has_more = True
    since_id = ""
    crawler_total_count = 0

    # Resolve the "weibo" tab's container so that post cards are requested
    # from the same container get_user_posts reads them from.
    container_id = f"100505{user_id}"
    profile = await self._get_request(endpoint, {
        "jumpfrom": "weibocom",
        "type": "uid",
        "value": user_id,
        "containerid": container_id,
        "since_id": since_id,
    })
    for tab in ((profile or {}).get("tabsInfo") or {}).get("tabs", []):
        if tab.get("tabKey") == "weibo":
            # Keep the fallback if the tab entry carries no container id
            container_id = tab.get("containerid") or container_id
            break

    while has_more and len(all_posts) < max_posts:
        params = {
            "jumpfrom": "weibocom",
            "type": "uid",
            "value": user_id,
            "containerid": container_id,
            "since_id": since_id,
        }

        raw_posts_data = await self._get_request(endpoint, params)

        if not raw_posts_data:
            logger.error(f"User {user_id} may be restricted or data unavailable")
            break

        # Extract pagination info from raw response
        since_id = raw_posts_data.get("cardlistInfo", {}).get("since_id", "0")
        if "cards" not in raw_posts_data:
            logger.info(f"No posts found in response for user {user_id}")
            break

        # Transform to simplified posts
        posts = []
        cards = raw_posts_data.get("cards", [])
        for card in cards:
            if card.get("card_type") == 9:  # Weibo post card type
                mblog = card.get("mblog", {})
                if not mblog.get("id"):
                    continue

                user_info = mblog.get("user", {})
                clean_text = re.sub(r"<.*?>", "", mblog.get("text", ""))

                post = {
                    "note_id": mblog.get("id"),
                    "content": clean_text,
                    "created_at": mblog.get("created_at"),
                    "liked_count": str(mblog.get("attitudes_count", 0)),
                    "comments_count": str(mblog.get("comments_count", 0)),
                    "shared_count": str(mblog.get("reposts_count", 0)),
                    "ip_location": mblog.get("region_name", "").replace("发布于 ", ""),
                    "note_url": f"https://m.weibo.cn/detail/{mblog.get('id')}",
                    "user_id": str(user_info.get("id", "")),
                    "nickname": user_info.get("screen_name", ""),
                    "gender": user_info.get("gender", ""),
                    "profile_url": user_info.get("profile_url", ""),
                    "avatar": user_info.get("profile_image_url", ""),
                }
                posts.append(post)

        logger.info(f"Fetched {len(posts)} posts for user {user_id}")

        remaining_slots = max_posts - len(all_posts)
        if remaining_slots <= 0:
            break

        posts_to_add = posts[:remaining_slots]

        if progress_callback:
            await progress_callback(posts_to_add)

        all_posts.extend(posts_to_add)
        await asyncio.sleep(fetch_interval)

        # Progress is counted in steps of 10 per page and compared with the
        # total reported by the API; paging also stops when since_id is "0".
        crawler_total_count += 10
        total_available = raw_posts_data.get("cardlistInfo", {}).get("total", 0)
        has_more = total_available > crawler_total_count and since_id != "0"

    logger.info(f"Fetched total {len(all_posts)} posts for user {user_id}")
    return all_posts
|
|
749
|
+
|
|
750
|
+
async def get_trending_posts(self) -> List[Dict]:
    """
    Get Weibo trending posts (hot search list)

    Returns:
        List of simplified trending post information, built from the
        response's ``statuses`` entries that have an ``id``
    """
    payload = await self._get_request(
        "/api/feed/trendtop",
        {"containerid": TrendingConstants.TRENDING_CONTAINER_ID},
    )

    def to_post(status: Dict) -> Dict:
        author = status.get("user", {})
        return {
            "note_id": status.get("id"),
            "content": re.sub(r"<.*?>", "", status.get("text", "")),
            "created_at": status.get("created_at"),
            "liked_count": str(status.get("attitudes_count", 0)),
            "comments_count": str(status.get("comments_count", 0)),
            "shared_count": str(status.get("reposts_count", 0)),
            "ip_location": status.get("region_name", "").replace("发布于 ", ""),
            "note_url": f"https://m.weibo.cn/detail/{status.get('id')}",
            "user_id": str(author.get("id", "")),
            "nickname": author.get("screen_name", ""),
            "gender": author.get("gender", ""),
            "profile_url": author.get("profile_url", ""),
            "avatar": author.get("profile_image_url", ""),
        }

    # Each entry of "statuses" is itself a post object (no card wrapper)
    return [to_post(status) for status in payload.get("statuses", []) if status.get("id")]
|
|
792
|
+
|
|
793
|
+
async def get_hot_posts(self) -> List[Dict]:
    """
    Get Weibo hot posts (popular recommendations)

    Returns:
        List of simplified hot post information, built from response cards
        of type 9 whose ``mblog`` has an ``id``
    """
    query = {
        "containerid": TrendingConstants.HOT_POSTS_CONTAINER_ID,
        "openApp": TrendingConstants.OPEN_APP
    }
    payload = await self._get_request("/api/container/getIndex", query)

    hot_posts = []
    for card in payload.get("cards", []):
        if card.get("card_type") != 9:  # 9 = Weibo post card type
            continue
        mblog = card.get("mblog", {})
        if not mblog.get("id"):
            continue

        author = mblog.get("user", {})
        hot_posts.append({
            "note_id": mblog.get("id"),
            "content": re.sub(r"<.*?>", "", mblog.get("text", "")),
            "created_at": mblog.get("created_at"),
            "liked_count": str(mblog.get("attitudes_count", 0)),
            "comments_count": str(mblog.get("comments_count", 0)),
            "shared_count": str(mblog.get("reposts_count", 0)),
            "ip_location": mblog.get("region_name", "").replace("发布于 ", ""),
            "note_url": f"https://m.weibo.cn/detail/{mblog.get('id')}",
            "user_id": str(author.get("id", "")),
            "nickname": author.get("screen_name", ""),
            "gender": author.get("gender", ""),
            "profile_url": author.get("profile_url", ""),
            "avatar": author.get("profile_image_url", ""),
        })

    return hot_posts
|
|
838
|
+
|
|
839
|
+
async def close(self):
    """
    Close the browser target tracked by this client, if there is one.

    Does nothing when no browser session or target id is set. Errors raised
    while closing are logged as warnings and not propagated.
    """
    if not (self.browser_session and self.target_id):
        return

    try:
        logger.info(f"Close target id: {self.target_id}")
        await self.browser_session.cdp_client.send.Target.closeTarget(params={'targetId': self.target_id})
    except Exception as e:
        logger.warning(f"Error closing target {self.target_id}: {e}")
|
|
846
|
+
|