vibesurf 0.1.27__py3-none-any.whl → 0.1.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of vibesurf might be problematic. Click here for more details.

@@ -0,0 +1,846 @@
1
+ import asyncio
2
+ import json
3
+ import pdb
4
+ import re
5
+ import copy
6
+ import time
7
+ import urllib.parse
8
+ from typing import Dict, List, Optional, Callable, Union, Any
9
+ import httpx
10
+ from tenacity import retry, stop_after_attempt, wait_fixed
11
+ from urllib.parse import parse_qs, unquote, urlencode
12
+
13
+ from vibe_surf.browser.agent_browser_session import AgentBrowserSession
14
+ from vibe_surf.logger import get_logger
15
+
16
+ from .helpers import (
17
+ SearchType, TrendingType, TrendingConstants,
18
+ create_container_id, extract_cookies_from_browser,
19
+ filter_search_result_card, extract_container_params,
20
+ build_image_proxy_url, extract_render_data, process_weibo_text,
21
+ validate_weibo_data, sanitize_filename,
22
+ extract_redirect_url_from_html, decode_chinese_html,
23
+ WeiboError, NetworkError, DataExtractionError,
24
+ AuthenticationError, RateLimitError, ContentNotFoundError,
25
+ get_mobile_user_agent
26
+ )
27
+
28
+ logger = get_logger(__name__)
29
+
30
+
31
class WeiboApiClient:
    """
    Weibo API client with integrated browser session management.

    Cookies and the User-Agent are read from a browser tab in ``setup`` and
    then attached to plain ``httpx`` requests sent to ``https://m.weibo.cn``.
    """

    def __init__(self, browser_session: AgentBrowserSession, timeout: int = 60, proxy: Optional[str] = None):
        """
        Initialize the Weibo API client

        Args:
            browser_session: Browser session for authentication
            timeout: Request timeout in seconds
            proxy: Proxy URL if needed
        """
        self.browser_session = browser_session
        self.target_id = None
        self.proxy = proxy
        self.timeout = timeout
        self._api_base = "https://m.weibo.cn"
        self._web_base = "https://www.weibo.com"
        self._image_proxy_host = "https://i1.wp.com/"

        # Default headers for mobile Weibo; "Cookie" and "User-Agent" are
        # overwritten by setup() with the values taken from the browser.
        self.default_headers = {
            "User-Agent": get_mobile_user_agent(),
            "Origin": "https://m.weibo.cn",
            "Referer": "https://m.weibo.cn",
            "Content-Type": "application/json;charset=UTF-8",
        }
        self.cookies = {}

    # ------------------------------------------------------------------
    # Session setup / login state
    # ------------------------------------------------------------------

    async def setup(self, target_id: Optional[str] = None):
        """
        Setup Weibo client by navigating to the site and extracting cookies

        Login state is NOT verified here; call ``pong`` to check it.

        Args:
            target_id: Specific browser target ID to use. When omitted, a new
                tab is opened on https://weibo.com/.

        Raises:
            AuthenticationError: If any step of the setup fails (the original
                exception is chained as ``__cause__``)
        """
        try:
            if self.target_id and self.cookies:
                logger.info("Already setup. Return!")
                return

            if target_id:
                self.target_id = target_id
            else:
                # Open Weibo in a new tab and give the page time to load
                # before reading cookies.
                self.target_id = await self.browser_session.navigate_to_url(
                    "https://weibo.com/", new_tab=True
                )
                await asyncio.sleep(3)

            # Extract cookies from browser
            cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=self.target_id)
            result = await asyncio.wait_for(
                cdp_session.cdp_client.send.Storage.getCookies(session_id=cdp_session.session_id),
                timeout=8.0
            )
            web_cookies = result.get('cookies', [])

            cookie_str, cookie_dict = extract_cookies_from_browser(web_cookies)
            self.default_headers["Cookie"] = cookie_str
            self.cookies = cookie_dict

            # Reuse the browser's own User-Agent so the replayed requests
            # carry the same UA as the session the cookies came from.
            user_agent_result = await cdp_session.cdp_client.send.Runtime.evaluate(
                params={
                    'expression': "navigator.userAgent",
                    'returnByValue': True,
                    'awaitPromise': True
                },
                session_id=cdp_session.session_id,
            )
            user_agent = user_agent_result.get('result', {}).get('value')
            if user_agent:
                self.default_headers["User-Agent"] = user_agent

            logger.info("Weibo client setup completed successfully")

        except Exception as e:
            logger.error(f"Failed to setup Weibo client: {e}")
            raise AuthenticationError(f"Setup failed: {e}") from e

    async def pong(self) -> bool:
        """
        Check if login state is valid using multiple methods

        Tries, in order: login cookies captured by ``setup``, the
        ``/api/config`` endpoint, and a JavaScript probe in the browser tab.

        Returns:
            True as soon as one method reports a logged-in state, otherwise
            False. Never raises; failures are logged and treated as "not
            logged in".
        """
        try:
            logger.info("Testing Weibo login status...")

            # Method 1: Check essential login cookies
            login_cookies = ['SUB', 'SUBP', 'ALF', 'SSOLoginState']
            if any(self.cookies.get(cookie_name) for cookie_name in login_cookies):
                logger.info("Weibo login status: Valid (found essential cookies)")
                return True

            # Method 2: Try to access user info API.
            # Goes through _get_request so the captured cookies/User-Agent are
            # actually sent; without them the endpoint cannot see the session.
            try:
                response_data = await self._get_request("/api/config")
                if isinstance(response_data, dict) and response_data.get("login"):
                    logger.info("Weibo login status: Valid (API check passed)")
                    return True
            except Exception as api_error:
                logger.debug(f"API config check failed: {api_error}")

            # Method 3: Check browser localStorage for login indicators
            try:
                cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=self.target_id)
                js_check = """
                (function() {
                    try {
                        // Check various login indicators
                        var hasLoginCookie = document.cookie.includes('SUB=') || document.cookie.includes('SUBP=');
                        var hasLoginStorage = localStorage.getItem('login_status') === '1' ||
                                              localStorage.getItem('isLogin') === 'true' ||
                                              localStorage.getItem('weiboLoginStatus') === '1';

                        // Check if there's user info in the page
                        var hasUserInfo = window.__INITIAL_STATE__ &&
                                          window.__INITIAL_STATE__.user &&
                                          window.__INITIAL_STATE__.user.id;

                        return hasLoginCookie || hasLoginStorage || hasUserInfo;
                    } catch(e) {
                        return false;
                    }
                })()
                """

                result = await cdp_session.cdp_client.send.Runtime.evaluate(
                    params={
                        'expression': js_check,
                        'returnByValue': True,
                    },
                    session_id=cdp_session.session_id,
                )

                if result.get('result', {}).get('value', False):
                    logger.info("Weibo login status: Valid (browser check passed)")
                    return True

            except Exception as browser_error:
                logger.debug(f"Browser login check failed: {browser_error}")

            logger.warning("Weibo login status: No valid login indicators found")
            return False

        except Exception as e:
            logger.error(f"Failed to check Weibo login status: {e}")
            return False

    # ------------------------------------------------------------------
    # HTTP plumbing
    # ------------------------------------------------------------------

    # reraise=True: after the last attempt the ORIGINAL exception is raised
    # instead of tenacity.RetryError, so callers can catch the WeiboError
    # subclasses raised below.
    @retry(stop=stop_after_attempt(3), wait=wait_fixed(1), reraise=True)
    async def _make_request(self, method: str, url: str, **kwargs):
        """
        Make HTTP request with error handling and retry logic

        Args:
            method: HTTP method
            url: Request URL
            **kwargs: Additional request parameters passed to httpx. The extra
                key ``raw_response`` (default False) is consumed here: when
                true, the httpx response object is returned unparsed.

        Returns:
            The response object when ``raw_response`` is true; otherwise the
            decoded JSON. For a dict with ``"ok" == 1`` only its ``"data"``
            member is returned; a dict without ``"ok"`` and any non-dict JSON
            are returned as-is.

        Raises:
            AuthenticationError: HTTP 403
            RateLimitError: HTTP 429
            ContentNotFoundError: HTTP 404
            NetworkError: HTTP 5xx
            DataExtractionError: Body is not JSON, or ``"ok"`` is present and
                not 1
        """
        raw_response = kwargs.pop("raw_response", False)

        async with httpx.AsyncClient(proxy=self.proxy, timeout=self.timeout) as client:
            response = await client.request(method, url, **kwargs)

            # Handle common error status codes
            status = response.status_code
            if status == 403:
                raise AuthenticationError("Access forbidden - may need login or verification")
            if status == 429:
                raise RateLimitError("Rate limit exceeded")
            if status == 404:
                raise ContentNotFoundError("Content not found")
            if status >= 500:
                raise NetworkError(f"Server error: {status}")

            if raw_response:
                return response

            try:
                data = response.json()
            except json.JSONDecodeError as e:
                raise DataExtractionError(f"Invalid JSON response: {response.text[:200]}") from e

            if not isinstance(data, dict):
                return data

            # Check Weibo API response format
            ok_code = data.get("ok")
            if ok_code is None:  # Some endpoints don't return 'ok' field
                return data
            if ok_code == 1:  # Success response
                return data.get("data", {})
            if ok_code == 0:  # Weibo error response
                error_msg = data.get("msg", "Response error")
                logger.error(f"Weibo API error: {error_msg}")
            else:  # Unknown error
                error_msg = data.get("msg", "Unknown error")
                logger.error(f"Weibo API unknown error: {error_msg}")
            raise DataExtractionError(error_msg)

    async def _get_request(self, endpoint: str, params: Optional[Dict] = None, headers: Optional[Dict] = None,
                           **kwargs) -> Dict:
        """
        Make GET request with proper headers and parameters

        Args:
            endpoint: Path appended to the mobile API base URL
            params: Query parameters, URL-encoded into the path
            headers: Request headers; ``self.default_headers`` when omitted
            **kwargs: Forwarded to ``_make_request``

        Returns:
            Whatever ``_make_request`` returns for the request
        """
        final_endpoint = endpoint
        if params:
            final_endpoint = f"{endpoint}?{urllib.parse.urlencode(params)}"

        return await self._make_request(
            "GET", f"{self._api_base}{final_endpoint}",
            headers=headers or self.default_headers,
            **kwargs
        )

    async def _post_request(self, endpoint: str, data: Dict, headers: Optional[Dict] = None) -> Dict:
        """
        Make POST request with proper headers and data

        Args:
            endpoint: Path appended to the mobile API base URL
            data: Payload, serialized to compact JSON
            headers: Request headers; ``self.default_headers`` when omitted

        Returns:
            Whatever ``_make_request`` returns for the request
        """
        json_payload = json.dumps(data, separators=(",", ":"), ensure_ascii=False)

        # httpx takes a raw, pre-encoded body via `content=`; `data=` is for
        # form fields.
        return await self._make_request(
            "POST", f"{self._api_base}{endpoint}",
            content=json_payload, headers=headers or self.default_headers
        )

    # ------------------------------------------------------------------
    # Response simplification helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _strip_tags(text: Optional[str]) -> str:
        """Remove anything that looks like an HTML tag from ``text``."""
        return re.sub(r"<.*?>", "", text or "")

    @staticmethod
    def _simplify_post(mblog: Dict) -> Dict:
        """Convert one raw post dict into the simplified post structure."""
        # `or {}` / `or ""`: tolerate keys that are present but null.
        user_info = mblog.get("user") or {}
        post_id = mblog.get("id")
        return {
            "note_id": post_id,
            "content": WeiboApiClient._strip_tags(mblog.get("text")),
            "created_at": mblog.get("created_at"),
            "liked_count": str(mblog.get("attitudes_count", 0)),
            "comments_count": str(mblog.get("comments_count", 0)),
            "shared_count": str(mblog.get("reposts_count", 0)),
            "ip_location": (mblog.get("region_name") or "").replace("发布于 ", ""),
            "note_url": f"https://m.weibo.cn/detail/{post_id}",
            "user_id": str(user_info.get("id", "")),
            "nickname": user_info.get("screen_name", ""),
            "gender": user_info.get("gender", ""),
            "profile_url": user_info.get("profile_url", ""),
            "avatar": user_info.get("profile_image_url", ""),
        }

    @staticmethod
    def _simplify_comment(comment: Dict) -> Dict:
        """Convert one raw comment dict into the simplified comment structure."""
        user_info = comment.get("user") or {}
        return {
            "comment_id": str(comment.get("id")),
            "content": WeiboApiClient._strip_tags(comment.get("text")),
            "created_at": comment.get("created_at"),
            "comment_like_count": str(comment.get("like_count", 0)),
            "sub_comment_count": str(comment.get("total_number", 0)),
            "ip_location": (comment.get("source") or "").replace("来自", ""),
            "parent_comment_id": comment.get("rootid", ""),
            "user_id": str(user_info.get("id", "")),
            "nickname": user_info.get("screen_name", ""),
            "gender": user_info.get("gender", ""),
            "profile_url": user_info.get("profile_url", ""),
            "avatar": user_info.get("profile_image_url", ""),
        }

    @staticmethod
    def _posts_from_cards(cards: List[Dict], post_cards_only: bool = True) -> List[Dict]:
        """
        Simplify every card that carries a post with an id.

        Args:
            cards: Raw ``cards`` list from a container response
            post_cards_only: When true, skip cards whose ``card_type`` is not 9
        """
        posts = []
        for card in cards:
            if post_cards_only and card.get("card_type") != 9:  # Weibo post card type
                continue
            mblog = card.get("mblog") or {}
            if mblog.get("id"):
                posts.append(WeiboApiClient._simplify_post(mblog))
        return posts

    @staticmethod
    def _simplify_comments(raw_response: Dict) -> List[Dict]:
        """Simplify every entry of ``raw_response["data"]`` that has an id."""
        return [
            WeiboApiClient._simplify_comment(comment)
            for comment in (raw_response.get("data") or [])
            if comment.get("id")
        ]

    # ------------------------------------------------------------------
    # Posts
    # ------------------------------------------------------------------

    async def search_posts_by_keyword(
            self,
            keyword: str,
            page: int = 1,
            search_type: SearchType = SearchType.DEFAULT,
    ) -> List[Dict]:
        """
        Search Weibo posts by keyword

        Args:
            keyword: Search keyword
            page: Last page to fetch (starting from 1); pages 1..page are
                requested and their results concatenated
            search_type: Search type filter

        Returns:
            List of simplified post information
        """
        endpoint = "/api/container/getIndex"
        container_id = create_container_id(search_type, keyword)

        cards: List[Dict] = []
        # Pages are 1-based; range(page) would have requested page 0 and
        # stopped one page short of `page`.
        for page_num in range(1, page + 1):
            params = {
                "containerid": container_id,
                "page_type": "searchall",
                "page": page_num,
            }
            raw_response = await self._get_request(endpoint, params)
            cards.extend(raw_response.get("cards") or [])

        # Search results are not filtered by card_type, only by the presence
        # of a post with an id.
        return self._posts_from_cards(cards, post_cards_only=False)

    async def get_post_detail(self, mid: str) -> Optional[Dict]:
        """
        Get detailed post information by mid ID

        Args:
            mid: Weibo post ID

        Returns:
            Simplified post detail information, or None when no post data
            could be extracted from the detail page
        """
        url = f"{self._api_base}/detail/{mid}"

        # The detail endpoint serves HTML, so ask for the raw response.
        response = await self._make_request(
            "GET", url, headers=self.default_headers, raw_response=True,
        )

        # Extract render data from HTML
        render_data = extract_render_data(response.text)
        note_detail = render_data.get("status") if render_data else None
        if note_detail:
            return self._simplify_post(note_detail)

        logger.warning(f"Could not extract render data for post {mid}")
        return None

    # ------------------------------------------------------------------
    # Comments
    # ------------------------------------------------------------------

    async def _fetch_comment_page(self, mid: str, max_id: int, max_id_type: int) -> Dict:
        """Request one page of ``/comments/hotflow`` for post ``mid``."""
        params = {
            "id": mid,
            "mid": mid,
            "max_id_type": str(max_id_type),
        }
        if max_id > 0:
            params["max_id"] = str(max_id)

        # Set referer for comment requests
        headers = dict(self.default_headers)
        headers["Referer"] = f"https://m.weibo.cn/detail/{mid}"

        return await self._get_request("/comments/hotflow", params, headers)

    async def get_post_comments(
            self,
            mid: str,
            max_id: int = 0,
            max_id_type: int = 0
    ) -> List[Dict]:
        """
        Get comments for a Weibo post

        Args:
            mid: Weibo post ID
            max_id: Pagination parameter (only sent when greater than 0)
            max_id_type: Pagination type parameter

        Returns:
            List of simplified comment information
        """
        raw_response = await self._fetch_comment_page(mid, max_id, max_id_type)
        return self._simplify_comments(raw_response)

    async def get_all_post_comments(
            self,
            mid: str,
            fetch_interval: float = 1.0,
            include_sub_comments: bool = False,
            progress_callback: Optional[Callable] = None,
            max_comments: int = 1000,
    ) -> List[Dict]:
        """
        Fetch comments for a post page by page

        Args:
            mid: Weibo post ID
            fetch_interval: Interval between requests in seconds
            include_sub_comments: Accepted for compatibility; currently
                ignored (no sub-comment requests are made)
            progress_callback: Async callable awaited once per page as
                ``progress_callback(mid, batch_comments)``
            max_comments: Maximum comments to fetch

        Returns:
            List of all simplified comments (at most ``max_comments``)
        """
        all_comments: List[Dict] = []
        max_id = -1  # not > 0, so the first request carries no max_id
        max_id_type = 0
        is_end = False

        while not is_end and len(all_comments) < max_comments:
            raw_response = await self._fetch_comment_page(mid, max_id, max_id_type)

            # Extract pagination info from raw response; max_id == 0 marks
            # the last page.
            max_id = raw_response.get("max_id", 0)
            max_id_type = raw_response.get("max_id_type", 0)
            is_end = max_id == 0

            # Limit comments if approaching max
            remaining_slots = max_comments - len(all_comments)
            batch_comments = self._simplify_comments(raw_response)[:remaining_slots]

            if progress_callback:
                await progress_callback(mid, batch_comments)

            all_comments.extend(batch_comments)

            # Only pause when another request will follow.
            if not is_end and len(all_comments) < max_comments:
                await asyncio.sleep(fetch_interval)

        logger.info(f"Fetched {len(all_comments)} comments for post {mid}")
        return all_comments

    # ------------------------------------------------------------------
    # Users
    # ------------------------------------------------------------------

    async def get_user_info(self, user_id: str) -> Optional[Dict]:
        """
        Get user profile information

        Args:
            user_id: User ID

        Returns:
            The ``userInfo`` dict from the response with an added
            ``"user_id"`` key, or None if the request fails
        """
        endpoint = "/api/container/getIndex"

        # Set proper headers for user info request
        headers = dict(self.default_headers)
        headers["Referer"] = f"{self._api_base}/u/{user_id}"

        # Use standard user profile container ID
        params = {
            "type": "uid",
            "value": user_id,
            "containerid": f"100505{user_id}",  # Standard user profile container
        }

        try:
            user_data = await self._get_request(endpoint, params, headers)
            user_info = user_data.get('userInfo') or {}
            user_info["user_id"] = user_info.get("id", user_id)
            return user_info

        except Exception as e:
            logger.error(f"Failed to get user info for {user_id}: {e}")
            return None

    @staticmethod
    def _user_posts_params(user_id: str, containerid: str, since_id: str) -> Dict:
        """Build the query parameters for a user container request."""
        return {
            "jumpfrom": "weibocom",
            "type": "uid",
            "value": user_id,
            "containerid": containerid,
            "since_id": since_id,
        }

    async def _resolve_posts_container_id(self, user_id: str, since_id: str = "") -> str:
        """
        Find the container id of the user's "weibo" tab.

        Requests the profile container ``100505{user_id}`` and looks for a
        tab whose ``tabKey`` is ``"weibo"``. Falls back to the profile
        container id when no such tab (or no id on it) is found.
        """
        profile_container = f"100505{user_id}"
        response = await self._get_request(
            "/api/container/getIndex",
            self._user_posts_params(user_id, profile_container, since_id),
        )

        tabs_info = response.get("tabsInfo")
        if tabs_info:
            for tab in tabs_info.get("tabs", []):
                if tab.get("tabKey") == "weibo":
                    return tab.get("containerid") or profile_container
        return profile_container

    async def get_user_posts(
            self,
            user_id: str,
            since_id: str = "0",
    ) -> Optional[Dict]:
        """
        Get posts by user

        Args:
            user_id: User ID
            since_id: Pagination parameter (last post ID from previous page)

        Returns:
            Dict with ``"posts"`` (simplified posts) and ``"pagination"``
            (``since_id`` and ``total`` taken from ``cardlistInfo``)
        """
        endpoint = "/api/container/getIndex"

        containerid = await self._resolve_posts_container_id(user_id, since_id)
        response = await self._get_request(
            endpoint, self._user_posts_params(user_id, containerid, since_id)
        )

        cardlist_info = response.get("cardlistInfo", {})
        return {
            "posts": self._posts_from_cards(response.get("cards") or []),
            "pagination": {
                "since_id": cardlist_info.get("since_id", ""),
                "total": cardlist_info.get("total", 0)
            }
        }

    async def get_all_user_posts(
            self,
            user_id: str,
            fetch_interval: float = 1.0,
            progress_callback: Optional[Callable] = None,
            max_posts: int = 1000,
    ) -> List[Dict]:
        """
        Fetch all posts by a user

        Args:
            user_id: User ID
            fetch_interval: Interval between requests in seconds
            progress_callback: Async callable awaited once per page as
                ``progress_callback(posts_to_add)``
            max_posts: Maximum posts to fetch

        Returns:
            List of all simplified user posts (at most ``max_posts``)
        """
        endpoint = "/api/container/getIndex"
        all_posts: List[Dict] = []
        since_id = ""
        crawler_total_count = 0

        # Page through the same "weibo" tab container that get_user_posts
        # uses, rather than the profile container.
        containerid = await self._resolve_posts_container_id(user_id)

        while len(all_posts) < max_posts:
            raw_posts_data = await self._get_request(
                endpoint, self._user_posts_params(user_id, containerid, since_id)
            )

            if not raw_posts_data:
                logger.error(f"User {user_id} may be restricted or data unavailable")
                break

            # Extract pagination info from raw response
            cardlist_info = raw_posts_data.get("cardlistInfo", {})
            since_id = cardlist_info.get("since_id", "0")
            if "cards" not in raw_posts_data:
                logger.info(f"No posts found in response for user {user_id}")
                break

            posts = self._posts_from_cards(raw_posts_data.get("cards") or [])
            logger.info(f"Fetched {len(posts)} posts for user {user_id}")

            posts_to_add = posts[:max_posts - len(all_posts)]

            if progress_callback:
                await progress_callback(posts_to_add)

            all_posts.extend(posts_to_add)

            # Progress is estimated at 10 posts per page, as before.
            crawler_total_count += 10
            total_available = cardlist_info.get("total", 0)
            # Compare as text so a numeric 0 also ends the loop; an empty
            # cursor would just re-request the first page.
            has_cursor = str(since_id) not in ("", "0")
            if not (total_available > crawler_total_count and has_cursor):
                break

            # Only pause when another request will follow.
            if len(all_posts) < max_posts:
                await asyncio.sleep(fetch_interval)

        logger.info(f"Fetched total {len(all_posts)} posts for user {user_id}")
        return all_posts

    # ------------------------------------------------------------------
    # Trending / hot feeds
    # ------------------------------------------------------------------

    async def get_trending_posts(self) -> List[Dict]:
        """
        Get Weibo trending posts (热搜榜)

        Returns:
            List of simplified trending post information
        """
        endpoint = "/api/feed/trendtop"
        params = {
            "containerid": TrendingConstants.TRENDING_CONTAINER_ID
        }

        raw_response = await self._get_request(endpoint, params)

        # Entries of "statuses" are read as post dicts directly (no
        # surrounding card).
        return [
            self._simplify_post(mblog)
            for mblog in (raw_response.get("statuses") or [])
            if mblog.get("id")
        ]

    async def get_hot_posts(self) -> List[Dict]:
        """
        Get Weibo hot posts (热门推荐)

        Returns:
            List of simplified hot post information
        """
        endpoint = "/api/container/getIndex"
        params = {
            "containerid": TrendingConstants.HOT_POSTS_CONTAINER_ID,
            "openApp": TrendingConstants.OPEN_APP
        }

        raw_response = await self._get_request(endpoint, params)
        return self._posts_from_cards(raw_response.get("cards") or [])

    # ------------------------------------------------------------------
    # Teardown
    # ------------------------------------------------------------------

    async def close(self):
        """
        Close the browser tab used by this client, if any.

        Errors while closing are logged, not raised. ``target_id`` is cleared
        afterwards so a later ``setup()`` does not short-circuit on a tab
        that no longer exists.
        """
        if self.browser_session and self.target_id:
            try:
                logger.info(f"Close target id: {self.target_id}")
                await self.browser_session.cdp_client.send.Target.closeTarget(params={'targetId': self.target_id})
            except Exception as e:
                logger.warning(f"Error closing target {self.target_id}: {e}")
            finally:
                self.target_id = None
846
+