vibesurf 0.1.27__py3-none-any.whl → 0.1.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of vibesurf might be problematic. Click here for more details.
- vibe_surf/_version.py +2 -2
- vibe_surf/backend/shared_state.py +2 -2
- vibe_surf/chrome_extension/config.js +8 -0
- vibe_surf/chrome_extension/scripts/ui-manager.js +146 -0
- vibe_surf/chrome_extension/sidepanel.html +2 -16
- vibe_surf/tools/website_api/__init__.py +0 -0
- vibe_surf/tools/website_api/douyin/__init__.py +0 -0
- vibe_surf/tools/website_api/douyin/client.py +845 -0
- vibe_surf/tools/website_api/douyin/helpers.py +239 -0
- vibe_surf/tools/website_api/weibo/__init__.py +0 -0
- vibe_surf/tools/website_api/weibo/client.py +846 -0
- vibe_surf/tools/website_api/weibo/helpers.py +997 -0
- vibe_surf/tools/website_api/xhs/__init__.py +0 -0
- vibe_surf/tools/website_api/xhs/client.py +807 -0
- vibe_surf/tools/website_api/xhs/helpers.py +301 -0
- vibe_surf/tools/website_api/youtube/__init__.py +32 -0
- vibe_surf/tools/website_api/youtube/client.py +1179 -0
- vibe_surf/tools/website_api/youtube/helpers.py +420 -0
- {vibesurf-0.1.27.dist-info → vibesurf-0.1.29.dist-info}/METADATA +55 -23
- {vibesurf-0.1.27.dist-info → vibesurf-0.1.29.dist-info}/RECORD +24 -11
- {vibesurf-0.1.27.dist-info → vibesurf-0.1.29.dist-info}/WHEEL +0 -0
- {vibesurf-0.1.27.dist-info → vibesurf-0.1.29.dist-info}/entry_points.txt +0 -0
- {vibesurf-0.1.27.dist-info → vibesurf-0.1.29.dist-info}/licenses/LICENSE +0 -0
- {vibesurf-0.1.27.dist-info → vibesurf-0.1.29.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,807 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import json
|
|
3
|
+
import pdb
|
|
4
|
+
import time
|
|
5
|
+
from typing import Dict, List, Optional, Callable, Union, Any
|
|
6
|
+
import httpx
|
|
7
|
+
from urllib.parse import urlencode
|
|
8
|
+
import random
|
|
9
|
+
import copy
|
|
10
|
+
from tenacity import retry, stop_after_attempt, wait_fixed
|
|
11
|
+
|
|
12
|
+
from vibe_surf.browser.agent_browser_session import AgentBrowserSession
|
|
13
|
+
from vibe_surf.logger import get_logger
|
|
14
|
+
|
|
15
|
+
from .helpers import (
|
|
16
|
+
generate_trace_id, create_session_id, create_signature_headers,
|
|
17
|
+
extract_cookies_from_browser, XHSError, NetworkError,
|
|
18
|
+
DataExtractionError, AuthenticationError, extract_user_info_from_html
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
logger = get_logger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class SearchType:
    """Search type constants (values sent in the 'sort' field of the search API)."""
    GENERAL = "general"                    # default relevance ordering
    LATEST = "time"                        # newest first
    POPULAR = "popularity_descending"      # most popular first
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class ContentType:
    """Content type constants (values sent in the 'note_type' field of the search API)."""
    ALL = 0     # both videos and image notes
    VIDEO = 1   # video notes only
    IMAGE = 2   # image notes only
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class XiaoHongShuApiClient:
    """
    XiaoHongShu API client with integrated browser session management.
    This client handles API communication through browser session for authentication.
    """

    def __init__(self, browser_session: AgentBrowserSession, timeout: int = 60, proxy: Optional[str] = None):
        """
        Initialize the XiaoHongShu API client.

        Args:
            browser_session: Browser session used for navigation, cookie
                extraction and in-page request signing
            timeout: Request timeout in seconds
            proxy: Proxy URL if needed
        """
        self.browser_session = browser_session
        # CDP target id of the tab used for signing; populated by setup()
        self.target_id = None
        self.proxy = proxy
        self.timeout = timeout
        self._api_base = "https://edith.xiaohongshu.com"
        self._web_base = "https://www.xiaohongshu.com"

        # Error constants
        self.NETWORK_ERROR_MSG = "Network connection error, please check network settings or restart"
        self.NETWORK_ERROR_CODE = 300012
        self.CONTENT_ERROR_MSG = "Content status abnormal, please check later"
        self.CONTENT_ERROR_CODE = -510001

        # Default headers; "Cookie" and "User-Agent" are filled in by setup()
        self.default_headers = {
            'content-type': 'application/json;charset=UTF-8',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
        }
        self.cookies = {}
|
|
71
|
+
|
|
72
|
+
async def _prepare_request_headers(self, endpoint, payload: Optional[Dict] = None):
    """
    Build request headers including the site's X-s/X-t signature.

    The signature is computed inside the page by evaluating the site-provided
    window._webmsxyw function via CDP, then combined with the 'a1' cookie and
    the 'b1' localStorage value through create_signature_headers().

    Args:
        endpoint: API endpoint path (include the query string for GET requests)
        payload: JSON body for POST requests, or None

    Returns:
        A copy of default_headers, extended with signature headers when the
        page-side signing function is available.
    """
    headers = copy.deepcopy(self.default_headers)

    # Evaluates to the signature object, or a falsy value when the signing
    # function is not present on the page (e.g. page not loaded yet).
    js_expression = f"window._webmsxyw && window._webmsxyw('{endpoint}', {json.dumps(payload) if payload else 'null'})"
    cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=self.target_id)
    result = await cdp_session.cdp_client.send.Runtime.evaluate(
        params={
            'expression': js_expression,
            'returnByValue': True,
            'awaitPromise': True
        },
        session_id=cdp_session.session_id,
    )

    encrypt_result = result.get('result', {}).get('value') if result else None
    if encrypt_result:
        # Get browser storage value ('b1' is part of the signature input)
        b1_result = await cdp_session.cdp_client.send.Runtime.evaluate(
            params={
                'expression': "window.localStorage.getItem('b1')",
                'returnByValue': True,
                'awaitPromise': True
            },
            session_id=cdp_session.session_id,
        )

        b1_storage = b1_result.get('result', {}).get('value') if b1_result else None
        # Create signature headers from cookie 'a1', storage 'b1' and the
        # page-computed X-s / X-t values
        signature_headers = create_signature_headers(
            a1=self.cookies.get('a1', ''),
            b1=b1_storage or '',
            x_s=encrypt_result.get('X-s', ''),
            x_t=str(encrypt_result.get('X-t', ''))
        )
        headers.update(signature_headers)

    return headers
|
|
109
|
+
|
|
110
|
+
async def get_me(self) -> Dict:
    """
    Get current user information to check login status.

    Returns:
        User information dictionary
    """
    endpoint = '/api/sns/web/v2/user/me'
    url = self._api_base + endpoint
    return await self._make_request("GET", url, headers=self.default_headers)
|
|
121
|
+
|
|
122
|
+
async def setup(self, target_id: Optional[str] = None):
    """
    Prepare the client: open (or attach to) a XiaoHongShu tab, collect
    cookies and the real browser user-agent, then verify login status.

    Args:
        target_id: Existing CDP target id to attach to; if None, a new tab
            is opened at https://www.xiaohongshu.com/

    Raises:
        AuthenticationError: If no valid cookies are found or no user is
            logged in on xiaohongshu.
    """
    try:
        # Idempotent: skip when already initialized
        if self.target_id and self.cookies:
            logger.info("Already setup. Return!")
            return

        if target_id:
            self.target_id = target_id
        else:
            self.target_id = await self.browser_session.navigate_to_url("https://www.xiaohongshu.com/",
                                                                        new_tab=True)
            await asyncio.sleep(2)

        # BUGFIX: attach to self.target_id (set above), not the raw target_id
        # argument, which is None when a fresh tab was just opened.
        cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=self.target_id)
        result = await asyncio.wait_for(
            cdp_session.cdp_client.send.Storage.getCookies(session_id=cdp_session.session_id), timeout=8.0
        )
        web_cookies = result.get('cookies', [])

        cookie_str, cookie_dict = extract_cookies_from_browser(web_cookies)
        self.default_headers["Cookie"] = cookie_str
        self.cookies = cookie_dict

        if not self.cookies:
            raise AuthenticationError("No valid cookies found! Please Login first!")

        # Use the browser's actual user-agent so API calls match the signing context
        user_agent_result = await cdp_session.cdp_client.send.Runtime.evaluate(
            params={
                'expression': "navigator.userAgent",
                'returnByValue': True,
                'awaitPromise': True
            },
            session_id=cdp_session.session_id,
        )
        user_agent = user_agent_result.get('result', {}).get('value')
        if user_agent:
            self.default_headers["User-Agent"] = user_agent

        # Verify login by fetching the current user's profile
        user_info = await self.get_me()
        if not user_info or 'user_id' not in user_info:
            # Roll back partially-applied auth state before failing
            self.cookies = {}
            del self.default_headers["Cookie"]
            raise AuthenticationError("No user login in xiaohongshu!")

    except Exception as e:
        logger.error(f"Failed to get XiaoHongShu cookies: {e}")
        raise
|
|
181
|
+
|
|
182
|
+
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def _make_request(self, method: str, url: str, **kwargs) -> Union[str, Dict]:
    """
    Make HTTP request with error handling.

    Retried up to 3 times, 1 second apart, on any raised exception
    (including the authentication/network errors below).

    Args:
        method: HTTP method
        url: Request URL
        **kwargs: Additional request parameters; the special key
            'raw_response' (bool) makes this return the body text unparsed

    Returns:
        The API's 'data' payload (dict), or raw text when raw_response=True

    Raises:
        AuthenticationError: Verification challenge (HTTP status 461/471)
        NetworkError: API reported NETWORK_ERROR_CODE
        DataExtractionError: API failure message or non-JSON response body
    """
    raw_response = kwargs.pop("raw_response", False)

    async with httpx.AsyncClient(proxy=self.proxy) as client:
        response = await client.request(method, url, timeout=self.timeout, **kwargs)

        # Handle verification challenges (anti-bot check pages)
        if response.status_code in [471, 461]:
            verify_type = response.headers.get("Verifytype", "")
            verify_uuid = response.headers.get("Verifyuuid", "")
            error_msg = f"Verification challenge detected, Verifytype: {verify_type}, Verifyuuid: {verify_uuid}"
            logger.error(error_msg)
            raise AuthenticationError(error_msg)

        if raw_response:
            return response.text

        try:
            data = response.json()
            if data.get("success"):
                # Fall back to the 'success' value when 'data' is absent
                return data.get("data", data.get("success", {}))
            elif data.get("code") == self.NETWORK_ERROR_CODE:
                raise NetworkError(self.NETWORK_ERROR_MSG)
            else:
                raise DataExtractionError(data.get("msg", "Request failed"))
        except json.JSONDecodeError:
            raise DataExtractionError(f"Invalid JSON response: {response.text}")
|
|
221
|
+
|
|
222
|
+
async def _get_request(self, endpoint: str, params: Optional[Dict] = None) -> Dict:
    """
    Make a signed GET request against the API base.

    The query string is appended to the endpoint BEFORE signing, since the
    signature covers the full path.

    Args:
        endpoint: API endpoint
        params: URL parameters

    Returns:
        Response data
    """
    query_suffix = f"?{urlencode(params)}" if params else ""
    signed_endpoint = endpoint + query_suffix

    request_headers = await self._prepare_request_headers(signed_endpoint)
    return await self._make_request(
        "GET", self._api_base + signed_endpoint, headers=request_headers
    )
|
|
241
|
+
|
|
242
|
+
async def _post_request(self, endpoint: str, data: Dict, **kwargs) -> Dict:
    """
    Make a signed POST request against the API base.

    The body is serialized compactly (no spaces) so it matches the payload
    that was signed in _prepare_request_headers.

    Args:
        endpoint: API endpoint
        data: Request body data
        **kwargs: Additional parameters

    Returns:
        Response data
    """
    signed_headers = await self._prepare_request_headers(endpoint, data)
    body = json.dumps(data, separators=(",", ":"), ensure_ascii=False)
    url = f"{self._api_base}{endpoint}"
    return await self._make_request("POST", url, data=body, headers=signed_headers, **kwargs)
|
|
260
|
+
|
|
261
|
+
async def search_content_by_keyword(
        self,
        keyword: str,
        session_id: Optional[str] = None,
        page: int = 1,
        page_size: int = 20,
        sort_type: str = SearchType.GENERAL,
        content_type: int = ContentType.ALL,
) -> List[Dict]:
    """
    Search content by keyword.

    Args:
        keyword: Search keyword
        session_id: Search session ID (auto-generated if not provided)
        page: Page number
        page_size: Items per page
        sort_type: Sort method (see SearchType)
        content_type: Content type filter (see ContentType)

    Returns:
        List of simplified search results (one flat dict per note)
    """
    if session_id is None:
        session_id = create_session_id()

    endpoint = "/api/sns/web/v1/search/notes"
    payload = {
        "keyword": keyword,
        "page": page,
        "page_size": page_size,
        "search_id": session_id,
        "sort": sort_type,
        "note_type": content_type,
    }
    result = await self._post_request(endpoint, payload)
    # Return simplified note list
    note_list = []
    for item in result.get('items', []):
        # Skip non-note items (e.g. ads/banners) that carry no id
        if not item.get('id'):
            continue

        note_card = item.get("note_card", {})
        user_info = note_card.get('user', {})
        interact_info = note_card.get('interact_info', {})
        image_list = note_card.get('image_list', [])
        tag_list = note_card.get('tag_list', [])

        note_data = {
            "note_id": note_card.get("note_id"),
            "type": note_card.get("type"),
            # Truncated to 255 chars, presumably a downstream storage limit — TODO confirm
            "title": note_card.get("display_title", "")[:255],
            "desc": note_card.get("desc", ""),
            "time": note_card.get("time"),
            "last_update_time": note_card.get("last_update_time", 0),
            "user_id": user_info.get("user_id"),
            "nickname": user_info.get("nickname"),
            "avatar": user_info.get("avatar"),
            "liked_count": interact_info.get("liked_count", 0),
            "collected_count": interact_info.get("collected_count", 0),
            "comment_count": interact_info.get("comment_count", 0),
            "share_count": interact_info.get("share_count", 0),
            "ip_location": note_card.get("ip_location", ""),
            "image_list": ','.join([img.get('url', '') for img in image_list]),
            "tag_list": ','.join([tag.get('name', '') for tag in tag_list if tag.get('type') == 'topic']),
            "note_url": f"https://www.xiaohongshu.com/explore/{item.get('id')}",
            # Token required by fetch_content_details / fetch_content_comments
            "xsec_token": item.get("xsec_token", ""),
        }
        note_list.append(note_data)

    return note_list
|
|
332
|
+
|
|
333
|
+
async def fetch_content_details(
        self,
        content_id: str,
        xsec_token: str,
        source_channel: str = "pc_search",
) -> Dict:
    """
    Fetch detailed content information.

    Args:
        content_id: Content ID
        xsec_token: Security token (returned by search/list endpoints)
        source_channel: Source channel identifier

    Returns:
        Simplified content details, or {} when the note could not be fetched
    """
    payload = {
        "source_note_id": content_id,
        "image_formats": ["jpg", "webp", "avif"],
        "extra": {"need_body_topic": 1},
        "xsec_source": source_channel,
        "xsec_token": xsec_token,
    }
    endpoint = "/api/sns/web/v1/feed"
    result = await self._post_request(endpoint, payload)

    if result and result.get("items"):
        # The feed endpoint returns exactly the requested note as the first item
        note_item = result.get("items")[0]
        note_card = note_item.get("note_card", {})
        user_info = note_card.get('user', {})
        interact_info = note_card.get('interact_info', {})
        image_list = note_card.get('image_list', [])
        tag_list = note_card.get('tag_list', [])

        return {
            "note_id": note_card.get("note_id"),
            "type": note_card.get("type"),
            "title": note_card.get("title", ""),
            "desc": note_card.get("desc", ""),
            "time": note_card.get("time"),
            "last_update_time": note_card.get("last_update_time", 0),
            "user_id": user_info.get("user_id"),
            "nickname": user_info.get("nickname"),
            "avatar": user_info.get("avatar"),
            "liked_count": interact_info.get("liked_count", 0),
            "collected_count": interact_info.get("collected_count", 0),
            "comment_count": interact_info.get("comment_count", 0),
            "share_count": interact_info.get("share_count", 0),
            "ip_location": note_card.get("ip_location", ""),
            "image_list": ','.join([img.get('url', '') for img in image_list]),
            "tag_list": ','.join([tag.get('name', '') for tag in tag_list if tag.get('type') == 'topic']),
            "note_url": f"https://www.xiaohongshu.com/explore/{note_card.get('note_id')}",
            "xsec_token": xsec_token,
        }

    logger.error(f"Failed to fetch content {content_id}, response: {result}")
    return {}
|
|
391
|
+
|
|
392
|
+
async def fetch_content_comments(
        self,
        content_id: str,
        xsec_token: str,
        cursor: str = "",
) -> List[Dict]:
    """
    Fetch content comments (first level, single page).

    Args:
        content_id: Content ID
        xsec_token: Security token (returned by search/list endpoints)
        cursor: Pagination cursor

    Returns:
        List of simplified comments data
    """
    endpoint = "/api/sns/web/v2/comment/page"
    params = {
        "note_id": content_id,
        "cursor": cursor,
        "top_comment_id": "",
        "image_formats": "jpg,webp,avif",
        "xsec_token": xsec_token,
    }
    response = await self._get_request(endpoint, params)

    # Return simplified comments
    comments = []
    for comment_item in response.get("comments", []):
        if not comment_item.get("id"):
            continue

        user_info = comment_item.get("user_info", {})
        comment_pictures = [item.get("url_default", "") for item in comment_item.get("pictures", [])]
        target_comment = comment_item.get("target_comment", {})

        comment_data = {
            "comment_id": comment_item.get("id"),
            "create_time": comment_item.get("create_time"),
            "ip_location": comment_item.get("ip_location"),
            "note_id": content_id,
            "content": comment_item.get("content"),
            "user_id": user_info.get("user_id"),
            "nickname": user_info.get("nickname"),
            "avatar": user_info.get("image"),
            "sub_comment_count": comment_item.get("sub_comment_count", 0),
            "pictures": ",".join(comment_pictures),
            # 0 when this is a top-level comment (no target_comment present)
            "parent_comment_id": target_comment.get("id", 0),
            "like_count": comment_item.get("like_count", 0),
        }
        comments.append(comment_data)

    return comments
|
|
446
|
+
|
|
447
|
+
async def fetch_all_content_comments(
        self,
        content_id: str,
        xsec_token: str,
        fetch_interval: float = 1.0,
        progress_callback: Optional[Callable] = None,
        max_comments: int = 1000,
) -> List[Dict]:
    """
    Fetch all comments for content, following pagination cursors.

    Args:
        content_id: Content ID
        xsec_token: Security token (returned by search/list endpoints)
        fetch_interval: Interval between requests in seconds
        progress_callback: Async callback awaited per batch as
            progress_callback(content_id, batch_comments)
        max_comments: Maximum comments to fetch

    Returns:
        List of all simplified comments (at most max_comments)
    """
    all_comments = []
    has_more = True
    cursor = ""

    while has_more and len(all_comments) < max_comments:
        endpoint = "/api/sns/web/v2/comment/page"
        params = {
            "note_id": content_id,
            "cursor": cursor,
            "top_comment_id": "",
            "image_formats": "jpg,webp,avif",
            "xsec_token": xsec_token,
        }
        comments_data = await self._get_request(endpoint, params)
        has_more = comments_data.get("has_more", False)
        cursor = comments_data.get("cursor", "")

        if "comments" not in comments_data:
            logger.info(f"No more comments found: {comments_data}")
            break

        # Get simplified comments from this batch (same shape as
        # fetch_content_comments)
        batch_comments = []
        for comment_item in comments_data["comments"]:
            if not comment_item.get("id"):
                continue

            user_info = comment_item.get("user_info", {})
            comment_pictures = [item.get("url_default", "") for item in comment_item.get("pictures", [])]
            target_comment = comment_item.get("target_comment", {})

            comment_data = {
                "comment_id": comment_item.get("id"),
                "create_time": comment_item.get("create_time"),
                "ip_location": comment_item.get("ip_location"),
                "note_id": content_id,
                "content": comment_item.get("content"),
                "user_id": user_info.get("user_id"),
                "nickname": user_info.get("nickname"),
                "avatar": user_info.get("image"),
                "sub_comment_count": comment_item.get("sub_comment_count", 0),
                "pictures": ",".join(comment_pictures),
                "parent_comment_id": target_comment.get("id", 0),
                "like_count": comment_item.get("like_count", 0),
            }
            batch_comments.append(comment_data)

        # Trim the batch so the total never exceeds max_comments
        remaining_slots = max_comments - len(all_comments)
        if remaining_slots <= 0:
            break

        if len(batch_comments) > remaining_slots:
            batch_comments = batch_comments[:remaining_slots]

        if progress_callback:
            await progress_callback(content_id, batch_comments)

        # Rate-limit between pages
        await asyncio.sleep(fetch_interval)
        all_comments.extend(batch_comments)

    logger.info(f"Fetched {len(all_comments)} comments for content {content_id}")
    return all_comments
|
|
530
|
+
|
|
531
|
+
async def get_user_profile(self, user_id: str) -> Dict:
    """
    Get user profile information by scraping the public profile page.

    Args:
        user_id: User ID

    Returns:
        Simplified user profile data, or {} on any failure
    """
    profile_url = f"{self._web_base}/user/profile/{user_id}"
    try:
        page_html = await self._make_request(
            "GET", profile_url,
            raw_response=True, headers=self.default_headers
        )

        # The profile data lives in the page's initial-state blob; bail out
        # early when it is missing (e.g. login wall or error page).
        if "window.__INITIAL_STATE__" not in page_html:
            return {}
        return extract_user_info_from_html(page_html)

    except Exception as e:
        logger.error(f"Failed to get user profile for {user_id}: {e}")
        return {}
|
|
559
|
+
|
|
560
|
+
async def fetch_user_content(
        self,
        user_id: str,
        cursor: str = "",
        page_size: int = 30,
) -> List[Dict]:
    """
    Fetch one page of content posted by a user.

    Args:
        user_id: User ID
        cursor: Last content ID for pagination
        page_size: Number of items per page

    Returns:
        List of simplified user content data
    """
    endpoint = "/api/sns/web/v1/user_posted"
    params = {
        "user_id": user_id,
        "cursor": cursor,
        "num": page_size,
        "image_formats": "jpg,webp,avif",
    }
    response = await self._get_request(endpoint, params)

    # Return simplified note list
    note_list = []
    for note_item in response.get("notes", []):
        # NOTE(review): this path keys notes on 'id', while
        # fetch_all_user_content keys on 'note_id' — verify against the API.
        if not note_item.get('id'):
            continue

        user_info = note_item.get('user', {})
        interact_info = note_item.get('interact_info', {})
        image_list = note_item.get('image_list', [])
        tag_list = note_item.get('tag_list', [])

        note_data = {
            "note_id": note_item.get("id"),
            "type": note_item.get("type"),
            # Truncated to 255 chars, presumably a downstream storage limit — TODO confirm
            "title": note_item.get("display_title", "")[:255],
            "desc": note_item.get("desc", ""),
            "time": note_item.get("time"),
            "last_update_time": note_item.get("last_update_time", 0),
            "user_id": user_info.get("user_id"),
            "nickname": user_info.get("nickname"),
            "avatar": user_info.get("avatar"),
            "liked_count": interact_info.get("liked_count", 0),
            "collected_count": interact_info.get("collected_count", 0),
            "comment_count": interact_info.get("comment_count", 0),
            "share_count": interact_info.get("share_count", 0),
            "ip_location": note_item.get("ip_location", ""),
            "image_list": ','.join([img.get('url', '') for img in image_list]),
            "tag_list": ','.join([tag.get('name', '') for tag in tag_list if tag.get('type') == 'topic']),
            "note_url": f"https://www.xiaohongshu.com/explore/{note_item.get('id')}",
            "xsec_token": note_item.get("xsec_token", ""),
        }
        note_list.append(note_data)

    return note_list
|
|
620
|
+
|
|
621
|
+
async def fetch_all_user_content(
        self,
        user_id: str,
        fetch_interval: float = 1.0,
        progress_callback: Optional[Callable] = None,
        max_content: int = 1000,
) -> List[Dict]:
    """
    Fetch all content posted by a user, following pagination cursors.

    Args:
        user_id: User ID
        fetch_interval: Interval between requests in seconds
        progress_callback: Async callback awaited per batch as
            progress_callback(content_to_add)
        max_content: Maximum content items to fetch

    Returns:
        List of all simplified user content (at most max_content)
    """
    all_content = []
    has_more = True
    cursor = ""

    while has_more and len(all_content) < max_content:
        endpoint = "/api/sns/web/v1/user_posted"
        params = {
            "user_id": user_id,
            "cursor": cursor,
            "num": 30,
            "image_formats": "jpg,webp,avif",
        }
        content_data = await self._get_request(endpoint, params)
        if not content_data:
            logger.error(f"User {user_id} may be restricted or data unavailable")
            break

        has_more = content_data.get("has_more", False)
        cursor = content_data.get("cursor", "")

        if "notes" not in content_data:
            logger.info(f"No content found: {content_data}")
            break

        # Get simplified content from this batch
        batch_content = []
        for note_item in content_data["notes"]:
            # NOTE(review): this path keys notes on 'note_id', while
            # fetch_user_content keys on 'id' — verify against the API.
            if not note_item.get('note_id'):
                continue

            user_info = note_item.get('user', {})
            interact_info = note_item.get('interact_info', {})
            image_list = note_item.get('image_list', [])
            tag_list = note_item.get('tag_list', [])

            note_data = {
                "note_id": note_item.get("note_id"),
                "type": note_item.get("type"),
                "title": note_item.get("display_title", ""),
                "desc": note_item.get("desc", ""),
                "time": note_item.get("time"),
                "last_update_time": note_item.get("last_update_time", 0),
                "user_id": user_info.get("user_id"),
                "nickname": user_info.get("nickname"),
                "avatar": user_info.get("avatar"),
                "liked_count": interact_info.get("liked_count", 0),
                "collected_count": interact_info.get("collected_count", 0),
                "comment_count": interact_info.get("comment_count", 0),
                "share_count": interact_info.get("share_count", 0),
                "ip_location": note_item.get("ip_location", ""),
                "image_list": ','.join([img.get('url', '') for img in image_list]),
                "tag_list": ','.join([tag.get('name', '') for tag in tag_list if tag.get('type') == 'topic']),
                "note_url": f"https://www.xiaohongshu.com/explore/{note_item.get('note_id')}",
                "xsec_token": note_item.get("xsec_token", ""),
            }
            batch_content.append(note_data)

        logger.info(f"Fetched {len(batch_content)} content items for user {user_id}")

        # Trim the batch so the total never exceeds max_content
        remaining_slots = max_content - len(all_content)
        if remaining_slots <= 0:
            break

        content_to_add = batch_content[:remaining_slots]
        if progress_callback:
            await progress_callback(content_to_add)

        all_content.extend(content_to_add)
        # Rate-limit between pages
        await asyncio.sleep(fetch_interval)

    logger.info(f"Fetched {len(all_content)} content items for user {user_id}")
    return all_content
|
|
712
|
+
|
|
713
|
+
async def get_home_recommendations(self) -> List[Dict]:
    """
    Get home feed recommendations with proper header signature.

    Returns:
        List of simplified home feed data
    """
    payload = {
        "category": "homefeed_recommend",
        "cursor_score": "",
        "image_formats": json.dumps(["jpg", "webp", "avif"], separators=(",", ":")),
        "need_filter_image": False,
        "need_num": 8,
        "num": 18,
        "note_index": 33,
        "refresh_type": 1,
        "search_key": "",
        "unread_begin_note_id": "",
        "unread_end_note_id": "",
        "unread_note_count": 0
    }
    endpoint = "/api/sns/web/v1/homefeed"

    # Prepare headers with signature specifically for home feed
    headers = await self._prepare_request_headers(endpoint, payload)

    # Make the request with proper headers (compact body matches the signed payload)
    json_payload = json.dumps(payload, separators=(",", ":"), ensure_ascii=False)
    result = await self._make_request(
        "POST", f"{self._api_base}{endpoint}",
        data=json_payload, headers=headers
    )

    # Return simplified note list
    note_list = []
    for item in result.get("items", []):
        # Skip non-note items (e.g. ads/banners) that carry no id
        if not item.get('id'):
            continue
        note_card = item.get('note_card', {})
        user_info = note_card.get('user', {})
        interact_info = note_card.get('interact_info', {})
        image_list = note_card.get('image_list', [])
        tag_list = note_card.get('tag_list', [])

        note_data = {
            "note_id": item.get("id"),
            "type": note_card.get("type"),
            "title": note_card.get("display_title", ""),
            "desc": note_card.get("desc", ""),
            "time": note_card.get("time"),
            "last_update_time": note_card.get("last_update_time", 0),
            "user_id": user_info.get("user_id"),
            "nickname": user_info.get("nickname"),
            "avatar": user_info.get("avatar"),
            "liked_count": interact_info.get("liked_count", 0),
            "collected_count": interact_info.get("collected_count", 0),
            "comment_count": interact_info.get("comment_count", 0),
            "share_count": interact_info.get("share_count", 0),
            "ip_location": note_card.get("ip_location", ""),
            "image_list": ','.join([img.get('url', '') for img in image_list]),
            "tag_list": ','.join([tag.get('name', '') for tag in tag_list if tag.get('type') == 'topic']),
            "note_url": f"https://www.xiaohongshu.com/explore/{item.get('id')}",
            "xsec_token": item.get("xsec_token", ""),
        }
        note_list.append(note_data)

    return note_list
|
|
780
|
+
|
|
781
|
+
async def submit_comment(self, content_id: str, comment_text: str) -> Dict:
    """
    Submit a comment to content.

    Args:
        content_id: Content ID
        comment_text: Comment text

    Returns:
        Submit result
    """
    endpoint = '/api/sns/web/v1/comment/post'
    request_body = {
        "note_id": content_id,
        "content": comment_text,
        "at_users": []
    }
    return await self._post_request(endpoint, request_body)
|
|
799
|
+
|
|
800
|
+
async def close(self):
    """Close the browser tab this client opened, swallowing CDP errors."""
    if not (self.browser_session and self.target_id):
        return
    try:
        logger.info(f"Close target id: {self.target_id}")
        await self.browser_session.cdp_client.send.Target.closeTarget(params={'targetId': self.target_id})
    except Exception as e:
        # Best-effort cleanup: the tab may already be gone
        logger.warning(f"Error closing target {self.target_id}: {e}")
|
|
807
|
+
|