vibesurf 0.1.26__py3-none-any.whl → 0.1.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of vibesurf might be problematic. Click here for more details.

@@ -0,0 +1,807 @@
1
+ import asyncio
2
+ import json
3
+ import pdb
4
+ import time
5
+ from typing import Dict, List, Optional, Callable, Union, Any
6
+ import httpx
7
+ from urllib.parse import urlencode
8
+ import random
9
+ import copy
10
+ from tenacity import retry, stop_after_attempt, wait_fixed
11
+
12
+ from vibe_surf.browser.agent_browser_session import AgentBrowserSession
13
+ from vibe_surf.logger import get_logger
14
+
15
+ from .helpers import (
16
+ generate_trace_id, create_session_id, create_signature_headers,
17
+ extract_cookies_from_browser, XHSError, NetworkError,
18
+ DataExtractionError, AuthenticationError, extract_user_info_from_html
19
+ )
20
+
21
+ logger = get_logger(__name__)
22
+
23
+
24
class SearchType:
    """Sort-order constants accepted by the search API's ``sort`` field."""

    # Default relevance-based ordering.
    GENERAL = "general"
    # Newest notes first.
    LATEST = "time"
    # Most-liked notes first.
    POPULAR = "popularity_descending"
29
+
30
+
31
class ContentType:
    """Note-type filter constants accepted by the search API's ``note_type`` field."""

    ALL = 0    # no filtering
    VIDEO = 1  # video notes only
    IMAGE = 2  # image notes only
36
+
37
+
38
class XiaoHongShuApiClient:
    """
    XiaoHongShu API client with integrated browser session management.
    This client handles API communication through browser session for authentication.
    """

    def __init__(self, browser_session: AgentBrowserSession, timeout: int = 60, proxy: Optional[str] = None):
        """
        Initialize the RedBook API client

        Args:
            browser_session: Browser session used for cookie extraction and
                in-page request signing
            timeout: Request timeout in seconds
            proxy: Proxy URL if needed
        """
        self.browser_session = browser_session
        # CDP target id of the xiaohongshu.com tab; populated by setup()
        self.target_id = None
        self.proxy = proxy
        self.timeout = timeout
        self._api_base = "https://edith.xiaohongshu.com"
        self._web_base = "https://www.xiaohongshu.com"

        # Error constants (codes/messages reported by the XHS API)
        self.NETWORK_ERROR_MSG = "Network connection error, please check network settings or restart"
        self.NETWORK_ERROR_CODE = 300012
        self.CONTENT_ERROR_MSG = "Content status abnormal, please check later"
        self.CONTENT_ERROR_CODE = -510001

        # Default headers; "Cookie" and "User-Agent" are refreshed by setup()
        self.default_headers = {
            'content-type': 'application/json;charset=UTF-8',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
        }
        # Cookie name -> value mapping; populated by setup()
        self.cookies = {}
71
+
72
    async def _prepare_request_headers(self, endpoint, payload: Optional[Dict] = None):
        """
        Build request headers for a signed API call.

        Runs the site's in-page ``window._webmsxyw`` signing routine via CDP so
        the signature matches what the real web client would send. If the signer
        is unavailable (page not loaded), the default headers are returned as-is.

        Args:
            endpoint: API endpoint path (query string included for GET requests).
            payload: JSON body for POST requests, or None for GET requests.

        Returns:
            A copy of the default headers, extended with signature headers when
            the in-page signer produced a result.
        """
        headers = copy.deepcopy(self.default_headers)

        # Invoke the page's signing function with the endpoint and serialized
        # payload; evaluates falsy if window._webmsxyw does not exist.
        js_expression = f"window._webmsxyw && window._webmsxyw('{endpoint}', {json.dumps(payload) if payload else 'null'})"
        cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=self.target_id)
        result = await cdp_session.cdp_client.send.Runtime.evaluate(
            params={
                'expression': js_expression,
                'returnByValue': True,
                'awaitPromise': True
            },
            session_id=cdp_session.session_id,
        )

        encrypt_result = result.get('result', {}).get('value') if result else None
        if encrypt_result:
            # Get browser storage value ('b1' is one of the signature inputs)
            b1_result = await cdp_session.cdp_client.send.Runtime.evaluate(
                params={
                    'expression': "window.localStorage.getItem('b1')",
                    'returnByValue': True,
                    'awaitPromise': True
                },
                session_id=cdp_session.session_id,
            )

            b1_storage = b1_result.get('result', {}).get('value') if b1_result else None
            # Create signature headers from the a1 cookie, the b1 storage value
            # and the X-s/X-t values produced by the in-page signer.
            signature_headers = create_signature_headers(
                a1=self.cookies.get('a1', ''),
                b1=b1_storage or '',
                x_s=encrypt_result.get('X-s', ''),
                x_t=str(encrypt_result.get('X-t', ''))
            )
            headers.update(signature_headers)

        return headers
109
+
110
+ async def get_me(self) -> Dict:
111
+ """
112
+ Get current user information to check login status
113
+
114
+ Returns:
115
+ User information dictionary
116
+ """
117
+ uri = '/api/sns/web/v2/user/me'
118
+ return await self._make_request(
119
+ "GET", f"{self._api_base}{uri}", headers=self.default_headers
120
+ )
121
+
122
+ async def setup(self, target_id: Optional[str] = None):
123
+ """
124
+ Get XiaoHongShu cookies and verify login status
125
+
126
+ Args:
127
+ browser_session: Main browser session to use for navigation
128
+
129
+ Returns:
130
+ Dict containing status and message
131
+
132
+ Raises:
133
+ AuthenticationError: If user is not logged in
134
+ """
135
+ try:
136
+ if self.target_id and self.cookies:
137
+ logger.info("Already setup. Return!")
138
+ return
139
+
140
+ if target_id:
141
+ self.target_id = target_id
142
+ else:
143
+ self.target_id = await self.browser_session.navigate_to_url("https://www.xiaohongshu.com/",
144
+ new_tab=True)
145
+ await asyncio.sleep(2)
146
+
147
+ cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=target_id)
148
+ result = await asyncio.wait_for(
149
+ cdp_session.cdp_client.send.Storage.getCookies(session_id=cdp_session.session_id), timeout=8.0
150
+ )
151
+ web_cookies = result.get('cookies', [])
152
+
153
+ cookie_str, cookie_dict = extract_cookies_from_browser(web_cookies)
154
+ self.default_headers["Cookie"] = cookie_str
155
+ self.cookies = cookie_dict
156
+
157
+ if not self.cookies:
158
+ raise AuthenticationError("No valid cookies found! Please Login first!")
159
+
160
+ user_agent_result = await cdp_session.cdp_client.send.Runtime.evaluate(
161
+ params={
162
+ 'expression': "navigator.userAgent",
163
+ 'returnByValue': True,
164
+ 'awaitPromise': True
165
+ },
166
+ session_id=cdp_session.session_id,
167
+ )
168
+ user_agent = user_agent_result.get('result', {}).get('value')
169
+ if user_agent:
170
+ self.default_headers["User-Agent"] = user_agent
171
+
172
+ user_info = await self.get_me()
173
+ if not user_info or 'user_id' not in user_info:
174
+ self.cookies = {}
175
+ del self.default_headers["Cookie"]
176
+ raise AuthenticationError("No user login in xiaohongshu!")
177
+
178
+ except Exception as e:
179
+ logger.error(f"Failed to get XiaoHongShu cookies: {e}")
180
+ raise e
181
+
182
    @retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
    async def _make_request(self, method: str, url: str, **kwargs) -> Union[str, Dict]:
        """
        Make HTTP request with error handling.

        Retried up to 3 times, 1s apart (tenacity retries on any exception, so
        the domain errors raised below are retried too).

        Args:
            method: HTTP method
            url: Request URL
            **kwargs: Additional request parameters; the special key
                ``raw_response=True`` returns the body text without JSON parsing.

        Returns:
            The "data" payload of a successful JSON response, or the raw body
            text when ``raw_response`` is set.

        Raises:
            AuthenticationError: On anti-bot verification challenge (HTTP 461/471).
            NetworkError: When the API reports its network-error code.
            DataExtractionError: On API-level failure or a non-JSON response.
        """
        raw_response = kwargs.pop("raw_response", False)

        async with httpx.AsyncClient(proxy=self.proxy) as client:
            response = await client.request(method, url, timeout=self.timeout, **kwargs)

            # Handle verification challenges
            if response.status_code in [471, 461]:
                verify_type = response.headers.get("Verifytype", "")
                verify_uuid = response.headers.get("Verifyuuid", "")
                error_msg = f"Verification challenge detected, Verifytype: {verify_type}, Verifyuuid: {verify_uuid}"
                logger.error(error_msg)
                raise AuthenticationError(error_msg)

            if raw_response:
                return response.text

            try:
                data = response.json()
                if data.get("success"):
                    # Some endpoints put the payload under "data"; others only
                    # signal success — fall back accordingly.
                    return data.get("data", data.get("success", {}))
                elif data.get("code") == self.NETWORK_ERROR_CODE:
                    raise NetworkError(self.NETWORK_ERROR_MSG)
                else:
                    raise DataExtractionError(data.get("msg", "Request failed"))
            except json.JSONDecodeError:
                raise DataExtractionError(f"Invalid JSON response: {response.text}")
221
+
222
+ async def _get_request(self, endpoint: str, params: Optional[Dict] = None) -> Dict:
223
+ """
224
+ Make GET request with signature
225
+
226
+ Args:
227
+ endpoint: API endpoint
228
+ params: URL parameters
229
+
230
+ Returns:
231
+ Response data
232
+ """
233
+ final_endpoint = endpoint
234
+ if params:
235
+ final_endpoint = f"{endpoint}?{urlencode(params)}"
236
+
237
+ headers = await self._prepare_request_headers(final_endpoint)
238
+ return await self._make_request(
239
+ "GET", f"{self._api_base}{final_endpoint}", headers=headers
240
+ )
241
+
242
+ async def _post_request(self, endpoint: str, data: Dict, **kwargs) -> Dict:
243
+ """
244
+ Make POST request with signature
245
+
246
+ Args:
247
+ endpoint: API endpoint
248
+ data: Request body data
249
+ **kwargs: Additional parameters
250
+
251
+ Returns:
252
+ Response data
253
+ """
254
+ headers = await self._prepare_request_headers(endpoint, data)
255
+ json_payload = json.dumps(data, separators=(",", ":"), ensure_ascii=False)
256
+ return await self._make_request(
257
+ "POST", f"{self._api_base}{endpoint}",
258
+ data=json_payload, headers=headers, **kwargs
259
+ )
260
+
261
+ async def search_content_by_keyword(
262
+ self,
263
+ keyword: str,
264
+ session_id: Optional[str] = None,
265
+ page: int = 1,
266
+ page_size: int = 20,
267
+ sort_type: str = SearchType.GENERAL,
268
+ content_type: int = ContentType.ALL,
269
+ ) -> List[Dict]:
270
+ """
271
+ Search content by keyword
272
+
273
+ Args:
274
+ keyword: Search keyword
275
+ session_id: Search session ID (auto-generated if not provided)
276
+ page: Page number
277
+ page_size: Items per page
278
+ sort_type: Sort method
279
+ content_type: Content type filter
280
+
281
+ Returns:
282
+ List of simplified search results
283
+ """
284
+ if session_id is None:
285
+ session_id = create_session_id()
286
+
287
+ endpoint = "/api/sns/web/v1/search/notes"
288
+ payload = {
289
+ "keyword": keyword,
290
+ "page": page,
291
+ "page_size": page_size,
292
+ "search_id": session_id,
293
+ "sort": sort_type,
294
+ "note_type": content_type,
295
+ }
296
+ result = await self._post_request(endpoint, payload)
297
+ # Return simplified note list
298
+ note_list = []
299
+ for item in result.get('items', []):
300
+ if not item.get('id'):
301
+ continue
302
+
303
+ note_card = item.get("note_card", {})
304
+ user_info = note_card.get('user', {})
305
+ interact_info = note_card.get('interact_info', {})
306
+ image_list = note_card.get('image_list', [])
307
+ tag_list = note_card.get('tag_list', [])
308
+
309
+ note_data = {
310
+ "note_id": note_card.get("note_id"),
311
+ "type": note_card.get("type"),
312
+ "title": note_card.get("display_title", "")[:255],
313
+ "desc": note_card.get("desc", ""),
314
+ "time": note_card.get("time"),
315
+ "last_update_time": note_card.get("last_update_time", 0),
316
+ "user_id": user_info.get("user_id"),
317
+ "nickname": user_info.get("nickname"),
318
+ "avatar": user_info.get("avatar"),
319
+ "liked_count": interact_info.get("liked_count", 0),
320
+ "collected_count": interact_info.get("collected_count", 0),
321
+ "comment_count": interact_info.get("comment_count", 0),
322
+ "share_count": interact_info.get("share_count", 0),
323
+ "ip_location": note_card.get("ip_location", ""),
324
+ "image_list": ','.join([img.get('url', '') for img in image_list]),
325
+ "tag_list": ','.join([tag.get('name', '') for tag in tag_list if tag.get('type') == 'topic']),
326
+ "note_url": f"https://www.xiaohongshu.com/explore/{item.get('id')}",
327
+ "xsec_token": item.get("xsec_token", ""),
328
+ }
329
+ note_list.append(note_data)
330
+
331
+ return note_list
332
+
333
+ async def fetch_content_details(
334
+ self,
335
+ content_id: str,
336
+ xsec_token: str,
337
+ source_channel: str = "pc_search",
338
+ ) -> Dict:
339
+ """
340
+ Fetch detailed content information
341
+
342
+ Args:
343
+ content_id: Content ID
344
+ source_channel: Source channel identifier
345
+ security_token: Security token
346
+
347
+ Returns:
348
+ Simplified content details
349
+ """
350
+ payload = {
351
+ "source_note_id": content_id,
352
+ "image_formats": ["jpg", "webp", "avif"],
353
+ "extra": {"need_body_topic": 1},
354
+ "xsec_source": source_channel,
355
+ "xsec_token": xsec_token,
356
+ }
357
+ endpoint = "/api/sns/web/v1/feed"
358
+ result = await self._post_request(endpoint, payload)
359
+
360
+ if result and result.get("items"):
361
+ note_item = result.get("items")[0]
362
+ note_card = note_item.get("note_card", {})
363
+ user_info = note_card.get('user', {})
364
+ interact_info = note_card.get('interact_info', {})
365
+ image_list = note_card.get('image_list', [])
366
+ tag_list = note_card.get('tag_list', [])
367
+
368
+ return {
369
+ "note_id": note_card.get("note_id"),
370
+ "type": note_card.get("type"),
371
+ "title": note_card.get("title", ""),
372
+ "desc": note_card.get("desc", ""),
373
+ "time": note_card.get("time"),
374
+ "last_update_time": note_card.get("last_update_time", 0),
375
+ "user_id": user_info.get("user_id"),
376
+ "nickname": user_info.get("nickname"),
377
+ "avatar": user_info.get("avatar"),
378
+ "liked_count": interact_info.get("liked_count", 0),
379
+ "collected_count": interact_info.get("collected_count", 0),
380
+ "comment_count": interact_info.get("comment_count", 0),
381
+ "share_count": interact_info.get("share_count", 0),
382
+ "ip_location": note_card.get("ip_location", ""),
383
+ "image_list": ','.join([img.get('url', '') for img in image_list]),
384
+ "tag_list": ','.join([tag.get('name', '') for tag in tag_list if tag.get('type') == 'topic']),
385
+ "note_url": f"https://www.xiaohongshu.com/explore/{note_card.get('note_id')}",
386
+ "xsec_token": xsec_token,
387
+ }
388
+
389
+ logger.error(f"Failed to fetch content {content_id}, response: {result}")
390
+ return {}
391
+
392
+ async def fetch_content_comments(
393
+ self,
394
+ content_id: str,
395
+ xsec_token: str,
396
+ cursor: str = "",
397
+ ) -> List[Dict]:
398
+ """
399
+ Fetch content comments (first level)
400
+
401
+ Args:
402
+ content_id: Content ID
403
+ security_token: Security token
404
+ cursor: Pagination cursor
405
+
406
+ Returns:
407
+ List of simplified comments data
408
+ """
409
+ endpoint = "/api/sns/web/v2/comment/page"
410
+ params = {
411
+ "note_id": content_id,
412
+ "cursor": cursor,
413
+ "top_comment_id": "",
414
+ "image_formats": "jpg,webp,avif",
415
+ "xsec_token": xsec_token,
416
+ }
417
+ response = await self._get_request(endpoint, params)
418
+
419
+ # Return simplified comments
420
+ comments = []
421
+ for comment_item in response.get("comments", []):
422
+ if not comment_item.get("id"):
423
+ continue
424
+
425
+ user_info = comment_item.get("user_info", {})
426
+ comment_pictures = [item.get("url_default", "") for item in comment_item.get("pictures", [])]
427
+ target_comment = comment_item.get("target_comment", {})
428
+
429
+ comment_data = {
430
+ "comment_id": comment_item.get("id"),
431
+ "create_time": comment_item.get("create_time"),
432
+ "ip_location": comment_item.get("ip_location"),
433
+ "note_id": content_id,
434
+ "content": comment_item.get("content"),
435
+ "user_id": user_info.get("user_id"),
436
+ "nickname": user_info.get("nickname"),
437
+ "avatar": user_info.get("image"),
438
+ "sub_comment_count": comment_item.get("sub_comment_count", 0),
439
+ "pictures": ",".join(comment_pictures),
440
+ "parent_comment_id": target_comment.get("id", 0),
441
+ "like_count": comment_item.get("like_count", 0),
442
+ }
443
+ comments.append(comment_data)
444
+
445
+ return comments
446
+
447
+ async def fetch_all_content_comments(
448
+ self,
449
+ content_id: str,
450
+ xsec_token: str,
451
+ fetch_interval: float = 1.0,
452
+ progress_callback: Optional[Callable] = None,
453
+ max_comments: int = 1000,
454
+ ) -> List[Dict]:
455
+ """
456
+ Fetch all comments for content (including pagination)
457
+
458
+ Args:
459
+ content_id: Content ID
460
+ security_token: Security token
461
+ fetch_interval: Interval between requests in seconds
462
+ progress_callback: Callback function for progress updates
463
+ max_comments: Maximum comments to fetch
464
+
465
+ Returns:
466
+ List of all simplified comments
467
+ """
468
+ all_comments = []
469
+ has_more = True
470
+ cursor = ""
471
+
472
+ while has_more and len(all_comments) < max_comments:
473
+ endpoint = "/api/sns/web/v2/comment/page"
474
+ params = {
475
+ "note_id": content_id,
476
+ "cursor": cursor,
477
+ "top_comment_id": "",
478
+ "image_formats": "jpg,webp,avif",
479
+ "xsec_token": xsec_token,
480
+ }
481
+ comments_data = await self._get_request(endpoint, params)
482
+ has_more = comments_data.get("has_more", False)
483
+ cursor = comments_data.get("cursor", "")
484
+
485
+ if "comments" not in comments_data:
486
+ logger.info(f"No more comments found: {comments_data}")
487
+ break
488
+
489
+ # Get simplified comments from this batch
490
+ batch_comments = []
491
+ for comment_item in comments_data["comments"]:
492
+ if not comment_item.get("id"):
493
+ continue
494
+
495
+ user_info = comment_item.get("user_info", {})
496
+ comment_pictures = [item.get("url_default", "") for item in comment_item.get("pictures", [])]
497
+ target_comment = comment_item.get("target_comment", {})
498
+
499
+ comment_data = {
500
+ "comment_id": comment_item.get("id"),
501
+ "create_time": comment_item.get("create_time"),
502
+ "ip_location": comment_item.get("ip_location"),
503
+ "note_id": content_id,
504
+ "content": comment_item.get("content"),
505
+ "user_id": user_info.get("user_id"),
506
+ "nickname": user_info.get("nickname"),
507
+ "avatar": user_info.get("image"),
508
+ "sub_comment_count": comment_item.get("sub_comment_count", 0),
509
+ "pictures": ",".join(comment_pictures),
510
+ "parent_comment_id": target_comment.get("id", 0),
511
+ "like_count": comment_item.get("like_count", 0),
512
+ }
513
+ batch_comments.append(comment_data)
514
+
515
+ remaining_slots = max_comments - len(all_comments)
516
+ if remaining_slots <= 0:
517
+ break
518
+
519
+ if len(batch_comments) > remaining_slots:
520
+ batch_comments = batch_comments[:remaining_slots]
521
+
522
+ if progress_callback:
523
+ await progress_callback(content_id, batch_comments)
524
+
525
+ await asyncio.sleep(fetch_interval)
526
+ all_comments.extend(batch_comments)
527
+
528
+ logger.info(f"Fetched {len(all_comments)} comments for content {content_id}")
529
+ return all_comments
530
+
531
+ async def get_user_profile(self, user_id: str) -> Dict:
532
+ """
533
+ Get user profile information
534
+
535
+ Args:
536
+ user_id: User ID
537
+
538
+ Returns:
539
+ Simplified user profile data
540
+ """
541
+ endpoint = f"/user/profile/{user_id}"
542
+ try:
543
+ html_response = await self._make_request(
544
+ "GET", self._web_base + endpoint,
545
+ raw_response=True, headers=self.default_headers
546
+ )
547
+
548
+ # Extract user info from HTML response
549
+ if "window.__INITIAL_STATE__" in html_response:
550
+ # For now, return basic info since full extraction would need HTML parsing
551
+ user_info = extract_user_info_from_html(html_response)
552
+ return user_info
553
+ else:
554
+ return {}
555
+
556
+ except Exception as e:
557
+ logger.error(f"Failed to get user profile for {user_id}: {e}")
558
+ return {}
559
+
560
+ async def fetch_user_content(
561
+ self,
562
+ user_id: str,
563
+ cursor: str = "",
564
+ page_size: int = 30,
565
+ ) -> List[Dict]:
566
+ """
567
+ Fetch content by user
568
+
569
+ Args:
570
+ user_id: User ID
571
+ cursor: Last content ID for pagination
572
+ page_size: Number of items per page
573
+
574
+ Returns:
575
+ List of simplified user content data
576
+ """
577
+ endpoint = "/api/sns/web/v1/user_posted"
578
+ params = {
579
+ "user_id": user_id,
580
+ "cursor": cursor,
581
+ "num": page_size,
582
+ "image_formats": "jpg,webp,avif",
583
+ }
584
+ response = await self._get_request(endpoint, params)
585
+
586
+ # Return simplified note list
587
+ note_list = []
588
+ for note_item in response.get("notes", []):
589
+ if not note_item.get('id'):
590
+ continue
591
+
592
+ user_info = note_item.get('user', {})
593
+ interact_info = note_item.get('interact_info', {})
594
+ image_list = note_item.get('image_list', [])
595
+ tag_list = note_item.get('tag_list', [])
596
+
597
+ note_data = {
598
+ "note_id": note_item.get("id"),
599
+ "type": note_item.get("type"),
600
+ "title": note_item.get("display_title", "")[:255],
601
+ "desc": note_item.get("desc", ""),
602
+ "time": note_item.get("time"),
603
+ "last_update_time": note_item.get("last_update_time", 0),
604
+ "user_id": user_info.get("user_id"),
605
+ "nickname": user_info.get("nickname"),
606
+ "avatar": user_info.get("avatar"),
607
+ "liked_count": interact_info.get("liked_count", 0),
608
+ "collected_count": interact_info.get("collected_count", 0),
609
+ "comment_count": interact_info.get("comment_count", 0),
610
+ "share_count": interact_info.get("share_count", 0),
611
+ "ip_location": note_item.get("ip_location", ""),
612
+ "image_list": ','.join([img.get('url', '') for img in image_list]),
613
+ "tag_list": ','.join([tag.get('name', '') for tag in tag_list if tag.get('type') == 'topic']),
614
+ "note_url": f"https://www.xiaohongshu.com/explore/{note_item.get('id')}",
615
+ "xsec_token": note_item.get("xsec_token", ""),
616
+ }
617
+ note_list.append(note_data)
618
+
619
+ return note_list
620
+
621
+ async def fetch_all_user_content(
622
+ self,
623
+ user_id: str,
624
+ fetch_interval: float = 1.0,
625
+ progress_callback: Optional[Callable] = None,
626
+ max_content: int = 1000,
627
+ ) -> List[Dict]:
628
+ """
629
+ Fetch all content by user
630
+
631
+ Args:
632
+ user_id: User ID
633
+ fetch_interval: Interval between requests in seconds
634
+ progress_callback: Callback function for progress updates
635
+ max_content: Maximum content items to fetch
636
+
637
+ Returns:
638
+ List of all simplified user content
639
+ """
640
+ all_content = []
641
+ has_more = True
642
+ cursor = ""
643
+
644
+ while has_more and len(all_content) < max_content:
645
+ endpoint = "/api/sns/web/v1/user_posted"
646
+ params = {
647
+ "user_id": user_id,
648
+ "cursor": cursor,
649
+ "num": 30,
650
+ "image_formats": "jpg,webp,avif",
651
+ }
652
+ content_data = await self._get_request(endpoint, params)
653
+ if not content_data:
654
+ logger.error(f"User {user_id} may be restricted or data unavailable")
655
+ break
656
+
657
+ has_more = content_data.get("has_more", False)
658
+ cursor = content_data.get("cursor", "")
659
+
660
+ if "notes" not in content_data:
661
+ logger.info(f"No content found: {content_data}")
662
+ break
663
+
664
+ # Get simplified content from this batch
665
+ batch_content = []
666
+ for note_item in content_data["notes"]:
667
+ if not note_item.get('note_id'):
668
+ continue
669
+
670
+ user_info = note_item.get('user', {})
671
+ interact_info = note_item.get('interact_info', {})
672
+ image_list = note_item.get('image_list', [])
673
+ tag_list = note_item.get('tag_list', [])
674
+
675
+ note_data = {
676
+ "note_id": note_item.get("note_id"),
677
+ "type": note_item.get("type"),
678
+ "title": note_item.get("display_title", ""),
679
+ "desc": note_item.get("desc", ""),
680
+ "time": note_item.get("time"),
681
+ "last_update_time": note_item.get("last_update_time", 0),
682
+ "user_id": user_info.get("user_id"),
683
+ "nickname": user_info.get("nickname"),
684
+ "avatar": user_info.get("avatar"),
685
+ "liked_count": interact_info.get("liked_count", 0),
686
+ "collected_count": interact_info.get("collected_count", 0),
687
+ "comment_count": interact_info.get("comment_count", 0),
688
+ "share_count": interact_info.get("share_count", 0),
689
+ "ip_location": note_item.get("ip_location", ""),
690
+ "image_list": ','.join([img.get('url', '') for img in image_list]),
691
+ "tag_list": ','.join([tag.get('name', '') for tag in tag_list if tag.get('type') == 'topic']),
692
+ "note_url": f"https://www.xiaohongshu.com/explore/{note_item.get('note_id')}",
693
+ "xsec_token": note_item.get("xsec_token", ""),
694
+ }
695
+ batch_content.append(note_data)
696
+
697
+ logger.info(f"Fetched {len(batch_content)} content items for user {user_id}")
698
+
699
+ remaining_slots = max_content - len(all_content)
700
+ if remaining_slots <= 0:
701
+ break
702
+
703
+ content_to_add = batch_content[:remaining_slots]
704
+ if progress_callback:
705
+ await progress_callback(content_to_add)
706
+
707
+ all_content.extend(content_to_add)
708
+ await asyncio.sleep(fetch_interval)
709
+
710
+ logger.info(f"Fetched {len(all_content)} content items for user {user_id}")
711
+ return all_content
712
+
713
+ async def get_home_recommendations(self) -> List[Dict]:
714
+ """
715
+ Get home feed recommendations with proper header signature
716
+
717
+ Returns:
718
+ List of simplified home feed data
719
+ """
720
+ payload = {
721
+ "category": "homefeed_recommend",
722
+ "cursor_score": "",
723
+ "image_formats": json.dumps(["jpg", "webp", "avif"], separators=(",", ":")),
724
+ "need_filter_image": False,
725
+ "need_num": 8,
726
+ "num": 18,
727
+ "note_index": 33,
728
+ "refresh_type": 1,
729
+ "search_key": "",
730
+ "unread_begin_note_id": "",
731
+ "unread_end_note_id": "",
732
+ "unread_note_count": 0
733
+ }
734
+ endpoint = "/api/sns/web/v1/homefeed"
735
+
736
+ # Prepare headers with signature specifically for home feed
737
+ headers = await self._prepare_request_headers(endpoint, payload)
738
+
739
+ # Make the request with proper headers
740
+ json_payload = json.dumps(payload, separators=(",", ":"), ensure_ascii=False)
741
+ result = await self._make_request(
742
+ "POST", f"{self._api_base}{endpoint}",
743
+ data=json_payload, headers=headers
744
+ )
745
+
746
+ # Return simplified note list
747
+ note_list = []
748
+ for item in result.get("items", []):
749
+ if not item.get('id'):
750
+ continue
751
+ note_card = item.get('note_card', {})
752
+ user_info = note_card.get('user', {})
753
+ interact_info = note_card.get('interact_info', {})
754
+ image_list = note_card.get('image_list', [])
755
+ tag_list = note_card.get('tag_list', [])
756
+
757
+ note_data = {
758
+ "note_id": item.get("id"),
759
+ "type": note_card.get("type"),
760
+ "title": note_card.get("display_title", ""),
761
+ "desc": note_card.get("desc", ""),
762
+ "time": note_card.get("time"),
763
+ "last_update_time": note_card.get("last_update_time", 0),
764
+ "user_id": user_info.get("user_id"),
765
+ "nickname": user_info.get("nickname"),
766
+ "avatar": user_info.get("avatar"),
767
+ "liked_count": interact_info.get("liked_count", 0),
768
+ "collected_count": interact_info.get("collected_count", 0),
769
+ "comment_count": interact_info.get("comment_count", 0),
770
+ "share_count": interact_info.get("share_count", 0),
771
+ "ip_location": note_card.get("ip_location", ""),
772
+ "image_list": ','.join([img.get('url', '') for img in image_list]),
773
+ "tag_list": ','.join([tag.get('name', '') for tag in tag_list if tag.get('type') == 'topic']),
774
+ "note_url": f"https://www.xiaohongshu.com/explore/{item.get('id')}",
775
+ "xsec_token": item.get("xsec_token", ""),
776
+ }
777
+ note_list.append(note_data)
778
+
779
+ return note_list
780
+
781
+ async def submit_comment(self, content_id: str, comment_text: str) -> Dict:
782
+ """
783
+ Submit comment to content
784
+
785
+ Args:
786
+ content_id: Content ID
787
+ comment_text: Comment text
788
+
789
+ Returns:
790
+ Submit result
791
+ """
792
+ endpoint = '/api/sns/web/v1/comment/post'
793
+ payload = {
794
+ "note_id": content_id,
795
+ "content": comment_text,
796
+ "at_users": []
797
+ }
798
+ return await self._post_request(endpoint, payload)
799
+
800
+ async def close(self):
801
+ if self.browser_session and self.target_id:
802
+ try:
803
+ logger.info(f"Close target id: {self.target_id}")
804
+ await self.browser_session.cdp_client.send.Target.closeTarget(params={'targetId': self.target_id})
805
+ except Exception as e:
806
+ logger.warning(f"Error closing target {self.target_id}: {e}")
807
+