vibesurf 0.1.27__py3-none-any.whl → 0.1.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of vibesurf might be problematic. Click here for more details.

@@ -0,0 +1,997 @@
1
+ import pdb
2
+ import random
3
+ import time
4
+ import re
5
+ import json
6
+ import html
7
+ from typing import Dict, List, Tuple, Optional
8
+ from enum import Enum
9
+ from urllib.parse import parse_qs, unquote
10
+
11
+
12
class SearchType(Enum):
    """Search type enumeration for Weibo.

    Values are the numeric codes the Weibo search API expects as the
    ``type`` component of the container ID (see create_container_id).
    """
    DEFAULT = "1"     # comprehensive / default search
    REAL_TIME = "61"  # real-time results
    POPULAR = "60"    # popular / hot results
    VIDEO = "64"      # video results
18
+
19
+
20
class TrendingType(Enum):
    """Trending type enumeration for Weibo mobile APIs.

    Distinguishes the two mobile endpoints whose container IDs are
    defined in TrendingConstants.
    """
    TRENDING_LIST = "trending_list"  # hot-timeline trending list
    HOT_POSTS = "hot_posts"          # hot posts feed
24
+
25
+
26
class TrendingConstants:
    """Constants for Weibo mobile trending APIs.

    The container IDs are the opaque values the m.weibo.cn container
    endpoints expect; OPEN_APP suppresses the "open in app" redirect.
    """
    # Trending list API
    TRENDING_CONTAINER_ID = "102803_ctg1_8999_-_ctg1_8999_home"

    # Hot posts API
    HOT_POSTS_CONTAINER_ID = "102803"

    # Common parameters ("0" = do not open the native app)
    OPEN_APP = "0"
36
+
37
+
38
def generate_device_id() -> str:
    """Build a pseudo-random 32-character lowercase-hex device ID for Weibo requests."""
    hex_digits = "0123456789abcdef"
    return "".join(random.choice(hex_digits) for _ in range(32))
42
+
43
+
44
def create_container_id(search_type: SearchType, keyword: str) -> str:
    """Compose the container-ID query value used by Weibo search requests.

    The format is ``100103type=<code>&q=<keyword>`` where ``<code>`` is the
    numeric value of the given SearchType member.
    """
    return "100103type={}&q={}".format(search_type.value, keyword)
47
+
48
+
49
def extract_cookies_from_browser(web_cookies: List[Dict]) -> Tuple[str, Dict[str, str]]:
    """Collect Weibo-related cookies from raw browser cookie records.

    Only records carrying 'name', 'value' and 'domain' keys whose domain
    matches a known Weibo domain suffix are kept.

    Returns:
        Tuple of (Cookie header string, name -> value mapping).
    """
    # NOTE(review): only '.weibo.cn' is currently enabled; the other Weibo
    # domains were deliberately disabled upstream.
    allowed_domains = ('.weibo.cn',)

    cookie_map: Dict[str, str] = {}
    header_parts: List[str] = []

    for record in web_cookies:
        if not all(key in record for key in ('name', 'value', 'domain')):
            continue
        if any(suffix in record['domain'] for suffix in allowed_domains):
            name, value = record['name'], record['value']
            cookie_map[name] = value
            header_parts.append(f"{name}={value}")

    return "; ".join(header_parts), cookie_map
74
+
75
+
76
def extract_mid_from_url(weibo_url: str) -> Optional[str]:
    """Pull the post mid out of a Weibo URL; return None when no pattern matches.

    Patterns are tried in priority order: /detail/<mid>, mid=<mid>,
    then a bare trailing path segment.
    """
    candidate_patterns = (
        r'/detail/(\w+)',
        r'mid=(\w+)',
        r'/(\w+)$',
    )
    for pattern in candidate_patterns:
        found = re.search(pattern, weibo_url)
        if found:
            return found.group(1)
    return None
90
+
91
+
92
def extract_user_id_from_url(user_url: str) -> Optional[str]:
    """Pull the numeric user ID out of a Weibo user URL; None when absent.

    Recognizes /u/<id>, uid=<id> and /profile/<id> forms.
    """
    candidate_patterns = (
        r'/u/(\d+)',
        r'uid=(\d+)',
        r'/profile/(\d+)',
    )
    for pattern in candidate_patterns:
        found = re.search(pattern, user_url)
        if found:
            return found.group(1)
    return None
106
+
107
+
108
def parse_weibo_time(time_str: str) -> Optional[int]:
    """Convert a Weibo display-time string into a unix timestamp.

    Handles the relative Chinese forms "N分钟前" (minutes ago),
    "N小时前" (hours ago), "N天前" (days ago), "今天" (today) and
    "昨天" (yesterday); anything else is tried as a raw integer
    timestamp.  Returns None for empty or unparseable input.
    """
    if not time_str:
        return None

    relative_units = (
        ("分钟前", 60),
        ("小时前", 3600),
        ("天前", 86400),
    )
    try:
        for marker, unit_seconds in relative_units:
            if marker in time_str:
                amount = int(re.search(r'(\d+)' + marker, time_str).group(1))
                return int(time.time()) - amount * unit_seconds
        if "今天" in time_str:
            return int(time.time())
        if "昨天" in time_str:
            return int(time.time()) - 86400
        # Fall back to treating the string as a raw timestamp.
        return int(time_str)
    except (ValueError, AttributeError):
        # AttributeError: the regex found no digits before the marker.
        return None
133
+
134
+
135
def extract_image_urls(pics: List[Dict]) -> List[str]:
    """Collect usable image URLs from Weibo 'pics' entries.

    For each dict entry the direct 'url' is preferred, then the
    'large' variant, then the legacy 'pic_big' field; non-dict
    entries and entries without any URL are skipped.
    """
    collected: List[str] = []
    for entry in pics:
        if not isinstance(entry, dict):
            continue
        candidate = (
            entry.get('url')
            or entry.get('large', {}).get('url')
            or entry.get('pic_big')
        )
        if candidate:
            collected.append(candidate)
    return collected
147
+
148
+
149
def process_weibo_text(text: str) -> str:
    """Strip HTML tags and collapse runs of whitespace in Weibo post text."""
    if not text:
        return ""
    without_tags = re.sub(r'<[^>]+>', '', text)
    return re.sub(r'\s+', ' ', without_tags).strip()
161
+
162
+
163
def validate_weibo_data(weibo_data: Dict) -> bool:
    """Return True when the weibo payload carries all mandatory fields (id, text, user)."""
    return all(field in weibo_data for field in ("id", "text", "user"))
172
+
173
+
174
def filter_search_result_card(card_list: List[Dict]) -> List[Dict]:
    """
    Filter Weibo search results, only keep card_type=9 data.

    Cards may nest further cards under "card_group"; matching nested
    cards are collected as well.

    Args:
        card_list: Raw card dicts from the search API.

    Returns:
        Flat list of card dicts whose card_type is 9, in encounter order.
    """
    note_list: List[Dict] = []

    for card_item in card_list:
        # Robustness fix: the API payload occasionally mixes in non-dict
        # items; previously these raised AttributeError on .get().
        if not isinstance(card_item, dict):
            continue

        if card_item.get("card_type") == 9:
            note_list.append(card_item)

        # Check card_group for nested items
        for card_group_item in card_item.get("card_group", []):
            if isinstance(card_group_item, dict) and card_group_item.get("card_type") == 9:
                note_list.append(card_group_item)

    return note_list
191
+
192
+
193
def extract_container_params(m_weibocn_params: str) -> Dict[str, str]:
    """Decode the fid/lfid container IDs from the M_WEIBOCN_PARAMS cookie value.

    The cookie value is URL-encoded querystring data; missing keys map
    to empty strings, as does any decoding failure.
    """
    fallback = {"fid_container_id": "", "lfid_container_id": ""}
    try:
        decoded = parse_qs(unquote(m_weibocn_params))
    except Exception:
        return fallback
    return {
        "fid_container_id": decoded.get("fid", [""])[0],
        "lfid_container_id": decoded.get("lfid", [""])[0],
    }
203
+
204
+
205
def build_image_proxy_url(image_url: str, proxy_host: str = "https://i1.wp.com/") -> str:
    """Build a proxied image URL to bypass Weibo anti-hotlinking.

    The scheme is stripped, the first path segment (Weibo's size/quality
    segment, e.g. "orj360") is replaced with "large" to request the
    high-quality variant, and the result is prefixed with the proxy host.

    Args:
        image_url: Original image URL; returned unchanged unless it
            starts with "http".
        proxy_host: Proxy prefix; assumed to end with '/'.

    Returns:
        The proxied URL string.
    """
    if not image_url.startswith("http"):
        return image_url

    # Drop the "https://" or "http://" scheme prefix.
    clean_url = image_url[8:] if image_url.startswith("https://") else image_url[7:]

    parts = clean_url.split("/")
    if len(parts) >= 3:
        # domain / size-segment / ... / filename -> swap size for 'large'
        rebuilt = "/".join([parts[0], "large", *parts[2:]])
    elif len(parts) == 2:
        # Bug fix: a plain domain/filename URL used to come out as
        # "domain/large/" with the filename dropped; insert the 'large'
        # segment instead of replacing the final component.
        rebuilt = f"{parts[0]}/large/{parts[1]}"
    else:
        # Bare domain, nothing to rewrite.
        rebuilt = parts[0]

    return f"{proxy_host}{rebuilt}"
227
+
228
+
229
def sanitize_filename(filename: str) -> str:
    """Make a string safe to use as a file name.

    Removes characters invalid on common file systems, collapses
    whitespace, caps the length at 100 characters, and falls back to
    "untitled" when nothing remains.
    """
    cleaned = re.sub(r'[<>:"/\\|?*]', '', filename)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned[:100] if cleaned else "untitled"
240
+
241
+
242
def extract_render_data(html_content: str) -> Optional[Dict]:
    """Extract the $render_data JSON object embedded in a Weibo detail page.

    Looks for the ``var $render_data = [...][0]`` assignment, parses the
    JSON array, and returns its first element.  Returns None when the
    marker is missing, the JSON is invalid, or the array is empty.
    """
    found = re.search(r'var \$render_data = (\[.*?\])\[0\]', html_content, re.DOTALL)
    if found:
        try:
            payload = json.loads(found.group(1))
            if payload:
                return payload[0]
        except (json.JSONDecodeError, IndexError):
            pass
    return None
254
+
255
+
256
class WeiboError(Exception):
    """Base exception for Weibo API errors; catch this to handle any of them."""
    pass


class NetworkError(WeiboError):
    """Raised when a network connection to Weibo fails."""
    pass


class DataExtractionError(WeiboError):
    """Raised when expected data cannot be extracted from a response."""
    pass


class AuthenticationError(WeiboError):
    """Raised when Weibo rejects the request for authentication reasons."""
    pass


class RateLimitError(WeiboError):
    """Raised when Weibo's rate limit has been exceeded."""
    pass


class ContentNotFoundError(WeiboError):
    """Raised when the requested content does not exist (or was removed)."""
    pass


class ValidationError(WeiboError):
    """Raised when response data fails validation checks."""
    pass
289
+
290
def extract_redirect_url_from_html(html_content: str) -> Optional[str]:
    """Find a redirect target in an HTML page, or return None.

    Checks, in order: a <meta http-equiv="refresh"> tag, a JavaScript
    ``location.replace(...)`` call, and a ``window.location.href = ...``
    assignment.  The extracted URL is HTML-unescaped before returning.
    """
    redirect_patterns = (
        # <meta http-equiv="refresh" content="0; url=...">
        r'<meta[^>]*http-equiv=["\']refresh["\'][^>]*content=["\'][^"\']*url=([^"\']+)["\']',
        # location.replace("...")
        r'location\.replace\(["\']([^"\']+)["\']\)',
        # window.location.href = "..."
        r'window\.location\.href\s*=\s*["\']([^"\']+)["\']',
    )
    try:
        for pattern in redirect_patterns:
            found = re.search(pattern, html_content, re.IGNORECASE)
            if found:
                return html.unescape(found.group(1))
    except Exception:
        pass
    return None
312
+
313
+
314
def decode_chinese_html(html_content: bytes) -> str:
    """Decode raw HTML bytes, trying UTF-8 then common Chinese encodings.

    Falls back to lossy UTF-8 decoding (undecodable bytes dropped) when
    every candidate encoding fails.
    """
    for encoding in ('utf-8', 'gbk', 'gb2312', 'gb18030', 'big5'):
        try:
            return html_content.decode(encoding)
        except UnicodeDecodeError:
            continue
    # Last resort: keep whatever decodes, drop the rest.
    return html_content.decode('utf-8', errors='ignore')
326
+
327
+
328
def get_mobile_user_agent() -> str:
    """Return a randomly chosen mobile-browser User-Agent string."""
    user_agents = (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (iPad; CPU OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/114.0.5735.99 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (iPad; CPU OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/114.0.5735.124 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 13; SAMSUNG SM-S918B) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/21.0 Chrome/110.0.5481.154 Mobile Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 OPR/99.0.0.0",
        "Mozilla/5.0 (Linux; Android 10; JNY-LX1; HMSCore 6.11.0.302) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.88 HuaweiBrowser/13.0.5.303 Mobile Safari/537.36",
    )
    return random.choice(user_agents)
340
+
341
+
342
def transform_weibo_post_data(card_data: Dict) -> Optional[Dict]:
    """
    Transform raw Weibo card data into structured post information.

    Args:
        card_data: Raw card data from the Weibo API; only cards with
            card_type == 9 (regular post cards) are accepted.

    Returns:
        Structured post information, or None when the card is not a post
        card, lacks the mblog/user payload, or is missing essential IDs.
    """
    if not isinstance(card_data, dict) or card_data.get("card_type") != 9:
        return None

    mblog = card_data.get("mblog", {})
    if not mblog:
        return None

    user = mblog.get("user", {})
    if not user:
        return None

    def _normalize_followers(raw):
        """Convert follower counts like '11.2万' or '115' to int; pass non-strings through."""
        if not isinstance(raw, str):
            return raw
        try:
            if "万" in raw:
                # "11.2万" -> 112000 (万 = 10,000)
                return int(float(raw.replace("万", "")) * 10000)
            return int(raw)
        except (ValueError, TypeError):
            return 0

    try:
        post_info = {
            "mid": mblog.get("id"),
            "text": process_weibo_text(mblog.get("text", "")),
            "created_at": mblog.get("created_at"),
            "source": mblog.get("source"),
            "reposts_count": mblog.get("reposts_count", 0),
            "comments_count": mblog.get("comments_count", 0),
            "attitudes_count": mblog.get("attitudes_count", 0),
            "user": {
                "id": user.get("id"),
                "screen_name": user.get("screen_name"),
                "profile_image_url": user.get("profile_image_url"),
                # Normalized once here instead of the duplicated inline
                # "万"-suffix handling the original carried.
                "followers_count": _normalize_followers(user.get("followers_count", 0)),
                "friends_count": user.get("friends_count", 0),
                "statuses_count": user.get("statuses_count", 0),
            },
            "pics": mblog.get("pics", []),
            "page_info": mblog.get("page_info", {}),  # Video info if present
        }

        # Validate essential fields
        if not post_info["mid"] or not post_info["user"]["id"]:
            return None

        return post_info

    except Exception:
        # Malformed cards are tolerated: callers treat None as "skip".
        return None
409
+
410
+
411
def transform_weibo_search_results(api_response: Dict) -> List[Dict]:
    """
    Transform a raw Weibo search API response into a list of structured posts.

    Args:
        api_response: Raw API response from search_posts_by_keyword.

    Returns:
        List of structured post dicts; cards that fail transformation
        are silently dropped.
    """
    if not isinstance(api_response, dict):
        return []

    cards = api_response.get("cards", [])
    if not isinstance(cards, list):
        return []

    transformed = (transform_weibo_post_data(card) for card in filter_search_result_card(cards))
    return [post for post in transformed if post]
438
+
439
+
440
def transform_weibo_post_detail(detail_response: Dict) -> Optional[Dict]:
    """
    Transform raw Weibo post detail response into structured post information.

    Args:
        detail_response: Raw response from get_post_detail; expected to
            carry an "mblog" dict with a nested "user" dict.

    Returns:
        Structured post detail information or None if invalid data
        (missing mblog/user, missing essential IDs, or any mapping error).
    """
    if not isinstance(detail_response, dict):
        return None

    mblog = detail_response.get("mblog", {})
    if not mblog:
        return None

    user = mblog.get("user", {})
    if not user:
        return None

    try:
        post_detail = {
            "mid": mblog.get("id"),
            "text": process_weibo_text(mblog.get("text", "")),
            "created_at": mblog.get("created_at"),
            "source": mblog.get("source"),
            "reposts_count": mblog.get("reposts_count", 0),
            "comments_count": mblog.get("comments_count", 0),
            "attitudes_count": mblog.get("attitudes_count", 0),
            "user": {
                "id": user.get("id"),
                "screen_name": user.get("screen_name"),
                "profile_image_url": user.get("profile_image_url"),
                "followers_count": user.get("followers_count", 0),
                "friends_count": user.get("follow_count", 0),  # Note: different field name
                "statuses_count": user.get("statuses_count", 0),
                "verified": user.get("verified", False),
                "verified_type": user.get("verified_type", 0),
                "verified_reason": user.get("verified_reason", ""),
                "description": user.get("description", ""),
            },
            # Detail responses expose "pic_ids" (IDs only), unlike the
            # search-card "pics" list used elsewhere.
            "pics": mblog.get("pic_ids", []),
            "pic_num": mblog.get("pic_num", 0),
            "page_info": mblog.get("page_info", {}),  # Video info if present
            "is_long_text": mblog.get("isLongText", False),
            "favorited": mblog.get("favorited", False),
            "can_edit": mblog.get("can_edit", False),
            "visible": mblog.get("visible", {}),
            "bid": mblog.get("bid", ""),
            "status_title": mblog.get("status_title", ""),
        }

        # Clean up followers_count if it's a string with suffix
        # (e.g. "3800.8万" -> 38008000; 万 = 10,000).
        followers_count = user.get("followers_count", 0)
        if isinstance(followers_count, str):
            if "万" in followers_count:
                try:
                    num_str = followers_count.replace("万", "")
                    post_detail["user"]["followers_count"] = int(float(num_str) * 10000)
                except (ValueError, TypeError):
                    post_detail["user"]["followers_count"] = 0
            else:
                try:
                    post_detail["user"]["followers_count"] = int(followers_count)
                except (ValueError, TypeError):
                    post_detail["user"]["followers_count"] = 0

        # Process video information if present
        page_info = mblog.get("page_info", {})
        if page_info and page_info.get("type") == "video":
            post_detail["video_info"] = {
                "title": page_info.get("title", ""),
                "page_title": page_info.get("page_title", ""),
                "object_id": page_info.get("object_id", ""),
                "page_url": page_info.get("page_url", ""),
                "duration": page_info.get("media_info", {}).get("duration", 0),
                "video_orientation": page_info.get("video_orientation", ""),
                "urls": page_info.get("urls", {}),
                "cover_image": {
                    "url": page_info.get("page_pic", {}).get("url", ""),
                    "width": page_info.get("page_pic", {}).get("width", ""),
                    "height": page_info.get("page_pic", {}).get("height", ""),
                }
            }

        # Validate essential fields
        if not post_detail["mid"] or not post_detail["user"]["id"]:
            return None

        return post_detail

    except Exception as e:
        # Log error but don't fail completely: callers treat None as "skip".
        return None
536
+
537
+
538
def transform_weibo_comment_data(comment_data: Dict) -> Optional[Dict]:
    """
    Transform raw Weibo comment data into structured comment information.

    Args:
        comment_data: Raw comment data from the Weibo API; must carry a
            nested "user" dict.

    Returns:
        Structured comment information or None if invalid data
        (missing user, missing essential IDs, or any mapping error).
    """
    if not isinstance(comment_data, dict):
        return None

    user = comment_data.get("user", {})
    if not user:
        return None

    try:
        comment_info = {
            "id": comment_data.get("id"),
            "text": process_weibo_text(comment_data.get("text", "")),
            "created_at": comment_data.get("created_at"),
            "source": comment_data.get("source"),
            "floor_number": comment_data.get("floor_number", 0),
            "like_count": comment_data.get("like_count", 0),
            "liked": comment_data.get("liked", False),
            "user": {
                "id": user.get("id"),
                "screen_name": user.get("screen_name"),
                "profile_image_url": user.get("profile_image_url"),
                "followers_count": user.get("followers_count", 0),
                "follow_count": user.get("follow_count", 0),
                "statuses_count": user.get("statuses_count", 0),
                "verified": user.get("verified", False),
                "verified_type": user.get("verified_type", -1),
                "verified_reason": user.get("verified_reason", ""),
                "description": user.get("description", ""),
                "gender": user.get("gender", ""),
            },
            # ID of the root comment this one replies to (if any).
            "rootid": comment_data.get("rootid"),
            "disable_reply": comment_data.get("disable_reply", 0),
            "isLikedByMblogAuthor": comment_data.get("isLikedByMblogAuthor", False),
            "bid": comment_data.get("bid", ""),
            # Sub-comments information; "comments" is truthy when replies exist.
            "has_sub_comments": comment_data.get("comments", False),
            "sub_comments_count": comment_data.get("total_number", 0),
        }

        # Clean up followers_count if it's a string with suffix
        # (e.g. "11万" -> 110000; 万 = 10,000).
        followers_count = user.get("followers_count", 0)
        if isinstance(followers_count, str):
            if "万" in followers_count:
                try:
                    num_str = followers_count.replace("万", "")
                    comment_info["user"]["followers_count"] = int(float(num_str) * 10000)
                except (ValueError, TypeError):
                    comment_info["user"]["followers_count"] = 0
            else:
                try:
                    comment_info["user"]["followers_count"] = int(followers_count)
                except (ValueError, TypeError):
                    comment_info["user"]["followers_count"] = 0

        # Validate essential fields
        if not comment_info["id"] or not comment_info["user"]["id"]:
            return None

        return comment_info

    except Exception as e:
        # Log error but don't fail completely: callers treat None as "skip".
        return None
611
+
612
+
613
def transform_weibo_comments_response(comments_response: Dict) -> List[Dict]:
    """
    Transform a raw Weibo comments API response into a list of structured comments.

    Args:
        comments_response: Raw API response from get_post_comments.

    Returns:
        List of structured comment dicts; entries that fail
        transformation are silently dropped.
    """
    if not isinstance(comments_response, dict):
        return []

    raw_comments = comments_response.get("data", [])
    if not isinstance(raw_comments, list):
        return []

    parsed = (transform_weibo_comment_data(item) for item in raw_comments)
    return [comment for comment in parsed if comment]
638
+
639
+
640
def transform_weibo_user_info(user_response: Dict) -> Optional[Dict]:
    """
    Transform raw Weibo user info response into structured user information.

    Args:
        user_response: Raw response from get_user_info; expected to carry
            a "user" dict plus optional "containerid" / "tabsInfo" keys.

    Returns:
        Structured user information or None if invalid data
        (missing user or user id, or any mapping error).
    """
    if not isinstance(user_response, dict):
        return None

    user = user_response.get("user", {})
    if not user or not user.get("id"):
        return None

    try:
        user_info = {
            "id": user.get("id"),
            "screen_name": user.get("screen_name", ""),
            "profile_image_url": user.get("profile_image_url", ""),
            "followers_count": user.get("followers_count", 0),
            "friends_count": user.get("friends_count", 0),
            "statuses_count": user.get("statuses_count", 0),
            "verified": user.get("verified", False),
            "verified_type": user.get("verified_type", -1),
            "verified_reason": user.get("verified_reason", ""),
            "description": user.get("description", ""),
            "gender": user.get("gender", ""),
            "location": user.get("location", ""),
            "created_at": user.get("created_at", ""),
            "profile_url": user.get("profile_url", ""),
            "cover_image_phone": user.get("cover_image_phone", ""),
            "avatar_hd": user.get("avatar_hd", ""),
            # Container and navigation info
            "containerid": user_response.get("containerid", ""),
            "tabs_info": {
                "selected_tab": user_response.get("tabsInfo", {}).get("selectedTab", 1),
                "tabs": []
            }
        }

        # Process tabs information (profile page tab bar entries).
        tabs = user_response.get("tabsInfo", {}).get("tabs", [])
        for tab in tabs:
            if isinstance(tab, dict):
                tab_info = {
                    "id": tab.get("id"),
                    "tab_key": tab.get("tabKey", ""),
                    "title": tab.get("title", ""),
                    "tab_type": tab.get("tab_type", ""),
                    "containerid": tab.get("containerid", ""),
                    "must_show": tab.get("must_show", 0),
                    "hidden": tab.get("hidden", 0),
                }

                # Add optional fields if present
                if "apipath" in tab:
                    tab_info["apipath"] = tab["apipath"]
                if "headSubTitleText" in tab:
                    tab_info["head_subtitle_text"] = tab["headSubTitleText"]
                if "tab_icon" in tab:
                    tab_info["tab_icon"] = tab["tab_icon"]
                if "tab_icon_dark" in tab:
                    tab_info["tab_icon_dark"] = tab["tab_icon_dark"]
                if "url" in tab:
                    tab_info["url"] = tab["url"]

                user_info["tabs_info"]["tabs"].append(tab_info)

        # Clean up followers_count if it's a string with suffix
        # (e.g. "11.2万" -> 112000; 万 = 10,000).
        followers_count = user.get("followers_count", 0)
        if isinstance(followers_count, str):
            if "万" in followers_count:
                try:
                    num_str = followers_count.replace("万", "")
                    user_info["followers_count"] = int(float(num_str) * 10000)
                except (ValueError, TypeError):
                    user_info["followers_count"] = 0
            else:
                try:
                    user_info["followers_count"] = int(followers_count)
                except (ValueError, TypeError):
                    user_info["followers_count"] = 0

        return user_info

    except Exception as e:
        # Log error but don't fail completely: callers treat None as "skip".
        return None
731
+
732
+
733
def transform_weibo_user_posts_response(user_posts_response: Dict) -> Optional[Dict]:
    """
    Transform raw Weibo user posts response into structured information.

    Args:
        user_posts_response: Raw response from get_user_posts; expected to
            carry a "userInfo" dict plus optional "tabsInfo", "cards",
            "cardlistInfo" and style/navigation keys.

    Returns:
        Structured user posts information (user profile, tabs, toolbar
        menus, transformed posts and pagination cursor) or None if
        invalid data.
    """
    if not isinstance(user_posts_response, dict):
        return None

    user_info = user_posts_response.get("userInfo", {})
    if not user_info:
        return None

    try:
        user_posts_info = {
            "user": {
                "id": user_info.get("id"),
                "screen_name": user_info.get("screen_name", ""),
                "profile_image_url": user_info.get("profile_image_url", ""),
                "followers_count": user_info.get("followers_count", 0),
                "follow_count": user_info.get("follow_count", 0),
                "statuses_count": user_info.get("statuses_count", 0),
                "verified": user_info.get("verified", False),
                "verified_type": user_info.get("verified_type", -1),
                "verified_reason": user_info.get("verified_reason", ""),
                "description": user_info.get("description", ""),
                "gender": user_info.get("gender", ""),
                "profile_url": user_info.get("profile_url", ""),
                "cover_image_phone": user_info.get("cover_image_phone", ""),
                "avatar_hd": user_info.get("avatar_hd", ""),
                # Membership/rank flags as reported by the API.
                "mbtype": user_info.get("mbtype", 0),
                "svip": user_info.get("svip", 0),
                "urank": user_info.get("urank", 0),
                "mbrank": user_info.get("mbrank", 0),
            },
            "style_config": {
                "is_video_cover_style": user_posts_response.get("isVideoCoverStyle", 0),
                "is_star_style": user_posts_response.get("isStarStyle", 0),
            },
            # In-app navigation scheme URLs.
            "navigation": {
                "fans_scheme": user_posts_response.get("fans_scheme", ""),
                "follow_scheme": user_posts_response.get("follow_scheme", ""),
                "profile_scheme": user_posts_response.get("scheme", ""),
            },
            "tabs_info": {
                "selected_tab": user_posts_response.get("tabsInfo", {}).get("selectedTab", 1),
                "tabs": []
            },
            "toolbar_menus": [],
            "profile_ext": user_posts_response.get("profile_ext", ""),
            "show_app_tips": user_posts_response.get("showAppTips", 0),
            # Posts data if present
            "posts": [],
            # since_id is the cursor for fetching the next page.
            "pagination": {
                "since_id": user_posts_response.get("cardlistInfo", {}).get("since_id", ""),
                "total": user_posts_response.get("cardlistInfo", {}).get("total", 0),
            }
        }

        # Process tabs information (profile page tab bar entries).
        tabs = user_posts_response.get("tabsInfo", {}).get("tabs", [])
        for tab in tabs:
            if isinstance(tab, dict):
                tab_info = {
                    "id": tab.get("id"),
                    "tab_key": tab.get("tabKey", ""),
                    "title": tab.get("title", ""),
                    "tab_type": tab.get("tab_type", ""),
                    "containerid": tab.get("containerid", ""),
                    "must_show": tab.get("must_show", 0),
                    "hidden": tab.get("hidden", 0),
                }

                # Add optional fields if present
                if "apipath" in tab:
                    tab_info["apipath"] = tab["apipath"]
                if "headSubTitleText" in tab:
                    tab_info["head_subtitle_text"] = tab["headSubTitleText"]
                if "tab_icon" in tab:
                    tab_info["tab_icon"] = tab["tab_icon"]
                if "tab_icon_dark" in tab:
                    tab_info["tab_icon_dark"] = tab["tab_icon_dark"]
                if "url" in tab:
                    tab_info["url"] = tab["url"]

                user_posts_info["tabs_info"]["tabs"].append(tab_info)

        # Process toolbar menus (action buttons on the profile page).
        toolbar_menus = user_info.get("toolbar_menus", [])
        for menu in toolbar_menus:
            if isinstance(menu, dict):
                menu_info = {
                    "type": menu.get("type", ""),
                    "name": menu.get("name", ""),
                    "params": menu.get("params", {}),
                    "scheme": menu.get("scheme", ""),
                }
                user_posts_info["toolbar_menus"].append(menu_info)

        # Process posts if present in cards
        cards = user_posts_response.get("cards", [])
        if isinstance(cards, list):
            for card in cards:
                if card.get("card_type") == 9:  # Regular post card
                    post_info = transform_weibo_post_data(card)
                    if post_info:
                        user_posts_info["posts"].append(post_info)

        # Clean up followers_count if it's a string with suffix
        # (e.g. "83.2万" -> 832000; 万 = 10,000).
        followers_count = user_info.get("followers_count", 0)
        if isinstance(followers_count, str):
            if "万" in followers_count:
                try:
                    num_str = followers_count.replace("万", "")
                    user_posts_info["user"]["followers_count"] = int(float(num_str) * 10000)
                except (ValueError, TypeError):
                    user_posts_info["user"]["followers_count"] = 0
            else:
                try:
                    user_posts_info["user"]["followers_count"] = int(followers_count)
                except (ValueError, TypeError):
                    user_posts_info["user"]["followers_count"] = 0

        # Validate essential fields
        if not user_posts_info["user"]["id"]:
            return None

        return user_posts_info

    except Exception as e:
        # Log error but don't fail completely: callers treat None as "skip".
        return None
869
+
870
+
871
def transform_weibo_trending_response(trending_response: Dict) -> List[Dict]:
    """
    Transform a raw Weibo trending API response into a list of structured posts.

    Args:
        trending_response: Raw API response from get_trending_list.

    Returns:
        List of structured post dicts; statuses that fail transformation
        are silently dropped.
    """
    if not isinstance(trending_response, dict):
        return []

    statuses = trending_response.get("statuses", [])
    if not isinstance(statuses, list):
        return []

    parsed = (transform_weibo_status_data(status) for status in statuses)
    return [post for post in parsed if post]
896
+
897
+
898
def transform_weibo_status_data(status_data: Dict) -> Optional[Dict]:
    """
    Transform raw Weibo status data into structured post information
    (for trending list and similar direct status responses).

    Args:
        status_data: Raw status data from the Weibo API; must carry a
            nested "user" dict.

    Returns:
        Structured post information or None if invalid data
        (missing user, missing essential IDs, or any mapping error).
    """
    if not isinstance(status_data, dict):
        return None

    user = status_data.get("user", {})
    if not user:
        return None

    try:
        post_info = {
            "mid": status_data.get("id"),
            "text": process_weibo_text(status_data.get("text", "")),
            "created_at": status_data.get("created_at"),
            "source": status_data.get("source"),
            "reposts_count": status_data.get("reposts_count", 0),
            "comments_count": status_data.get("comments_count", 0),
            "attitudes_count": status_data.get("attitudes_count", 0),
            "user": {
                "id": user.get("id"),
                "screen_name": user.get("screen_name"),
                "profile_image_url": user.get("profile_image_url"),
                "followers_count": user.get("followers_count", 0),
                "friends_count": user.get("follow_count", 0),  # Note: different field name
                "statuses_count": user.get("statuses_count", 0),
                "verified": user.get("verified", False),
                "verified_type": user.get("verified_type", 0),
                "verified_reason": user.get("verified_reason", ""),
                "description": user.get("description", ""),
                "gender": user.get("gender", ""),
                # Membership/rank flags as reported by the API.
                "mbtype": user.get("mbtype", 0),
                "svip": user.get("svip", 0),
                "urank": user.get("urank", 0),
                "mbrank": user.get("mbrank", 0),
            },
            # Status responses expose "pic_ids" (IDs only), unlike the
            # search-card "pics" list used elsewhere.
            "pics": status_data.get("pic_ids", []),
            "pic_num": status_data.get("pic_num", 0),
            "page_info": status_data.get("page_info", {}),  # Video info if present
            "is_long_text": status_data.get("isLongText", False),
            "favorited": status_data.get("favorited", False),
            "can_edit": status_data.get("can_edit", False),
            "visible": status_data.get("visible", {}),
            "bid": status_data.get("bid", ""),
            "mixed_count": status_data.get("mixed_count", 0),
            "pending_approval_count": status_data.get("pending_approval_count", 0),
            "floor_number": status_data.get("floor_number", 0),
        }

        # Clean up followers_count if it's a string with suffix
        # (e.g. "83.2万" -> 832000; 万 = 10,000).
        followers_count = user.get("followers_count", 0)
        if isinstance(followers_count, str):
            if "万" in followers_count:
                try:
                    num_str = followers_count.replace("万", "")
                    post_info["user"]["followers_count"] = int(float(num_str) * 10000)
                except (ValueError, TypeError):
                    post_info["user"]["followers_count"] = 0
            else:
                try:
                    post_info["user"]["followers_count"] = int(followers_count)
                except (ValueError, TypeError):
                    post_info["user"]["followers_count"] = 0

        # Process video information if present
        page_info = status_data.get("page_info", {})
        if page_info and page_info.get("type") == "video":
            post_info["video_info"] = {
                "title": page_info.get("title", ""),
                "page_title": page_info.get("page_title", ""),
                "object_id": page_info.get("object_id", ""),
                "page_url": page_info.get("page_url", ""),
                "duration": page_info.get("media_info", {}).get("duration", 0),
                "video_orientation": page_info.get("video_orientation", ""),
                "urls": page_info.get("urls", {}),
                "cover_image": {
                    "url": page_info.get("page_pic", {}).get("url", ""),
                    "width": page_info.get("page_pic", {}).get("width", ""),
                    "height": page_info.get("page_pic", {}).get("height", ""),
                }
            }

        # Validate essential fields
        if not post_info["mid"] or not post_info["user"]["id"]:
            return None

        return post_info

    except Exception as e:
        # Log error but don't fail completely: callers treat None as "skip".
        return None