vibesurf 0.1.27__py3-none-any.whl → 0.1.28__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of vibesurf might be problematic. Click here for more details.
- vibe_surf/_version.py +2 -2
- vibe_surf/tools/website_api/__init__.py +0 -0
- vibe_surf/tools/website_api/douyin/__init__.py +0 -0
- vibe_surf/tools/website_api/douyin/client.py +845 -0
- vibe_surf/tools/website_api/douyin/helpers.py +239 -0
- vibe_surf/tools/website_api/weibo/__init__.py +0 -0
- vibe_surf/tools/website_api/weibo/client.py +846 -0
- vibe_surf/tools/website_api/weibo/helpers.py +997 -0
- vibe_surf/tools/website_api/xhs/__init__.py +0 -0
- vibe_surf/tools/website_api/xhs/client.py +807 -0
- vibe_surf/tools/website_api/xhs/helpers.py +301 -0
- vibe_surf/tools/website_api/youtube/__init__.py +32 -0
- vibe_surf/tools/website_api/youtube/client.py +1179 -0
- vibe_surf/tools/website_api/youtube/helpers.py +420 -0
- {vibesurf-0.1.27.dist-info → vibesurf-0.1.28.dist-info}/METADATA +1 -1
- {vibesurf-0.1.27.dist-info → vibesurf-0.1.28.dist-info}/RECORD +20 -7
- {vibesurf-0.1.27.dist-info → vibesurf-0.1.28.dist-info}/WHEEL +0 -0
- {vibesurf-0.1.27.dist-info → vibesurf-0.1.28.dist-info}/entry_points.txt +0 -0
- {vibesurf-0.1.27.dist-info → vibesurf-0.1.28.dist-info}/licenses/LICENSE +0 -0
- {vibesurf-0.1.27.dist-info → vibesurf-0.1.28.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,845 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import json
|
|
3
|
+
import copy
|
|
4
|
+
import pdb
|
|
5
|
+
import time
|
|
6
|
+
import urllib.parse
|
|
7
|
+
import os
|
|
8
|
+
from typing import Dict, List, Optional, Callable, Union, Any
|
|
9
|
+
import httpx
|
|
10
|
+
import random
|
|
11
|
+
from tenacity import retry, stop_after_attempt, wait_fixed
|
|
12
|
+
|
|
13
|
+
try:
|
|
14
|
+
import execjs
|
|
15
|
+
|
|
16
|
+
HAS_EXECJS = True
|
|
17
|
+
except ImportError:
|
|
18
|
+
HAS_EXECJS = False
|
|
19
|
+
|
|
20
|
+
from vibe_surf.browser.agent_browser_session import AgentBrowserSession
|
|
21
|
+
from vibe_surf.logger import get_logger
|
|
22
|
+
|
|
23
|
+
from .helpers import (
|
|
24
|
+
SearchChannelType, SearchSortType, PublishTimeType,
|
|
25
|
+
generate_web_id, generate_trace_id, create_common_params,
|
|
26
|
+
extract_cookies_from_browser, create_referer_url,
|
|
27
|
+
extract_aweme_media_urls, DouyinError, NetworkError,
|
|
28
|
+
DataExtractionError, AuthenticationError, RateLimitError,
|
|
29
|
+
VerificationError
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
logger = get_logger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class DouyinApiClient:
|
|
36
|
+
"""
|
|
37
|
+
Douyin API client with integrated browser session management.
|
|
38
|
+
This client handles API communication through browser session for authentication.
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
def __init__(self, browser_session: AgentBrowserSession, timeout: int = 60, proxy: Optional[str] = None):
|
|
42
|
+
"""
|
|
43
|
+
Initialize the Douyin API client
|
|
44
|
+
|
|
45
|
+
Args:
|
|
46
|
+
browser_session: Browser session for authentication
|
|
47
|
+
timeout: Request timeout in seconds
|
|
48
|
+
proxy: Proxy URL if needed
|
|
49
|
+
"""
|
|
50
|
+
self.browser_session = browser_session
|
|
51
|
+
self.target_id = None
|
|
52
|
+
self.proxy = proxy
|
|
53
|
+
self.timeout = timeout
|
|
54
|
+
self._host = "https://www.douyin.com"
|
|
55
|
+
|
|
56
|
+
# Default headers
|
|
57
|
+
self.default_headers = {
|
|
58
|
+
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
|
|
59
|
+
"Host": "www.douyin.com",
|
|
60
|
+
"Origin": "https://www.douyin.com/",
|
|
61
|
+
"Referer": "https://www.douyin.com/",
|
|
62
|
+
"Content-Type": "application/json;charset=UTF-8",
|
|
63
|
+
}
|
|
64
|
+
self.cookies = {}
|
|
65
|
+
|
|
66
|
+
    async def setup(self, target_id: Optional[str] = None):
        """
        Setup Douyin client by navigating to the site and extracting cookies

        Args:
            target_id: Specific target ID to use, or None to create new

        Raises:
            AuthenticationError: If unable to access Douyin properly
        """
        try:
            # Idempotent: a previous successful setup() leaves both a tab id
            # and cookies behind, so we can return immediately.
            if self.target_id and self.cookies:
                logger.info("Douyin client already setup. Returning!")
                return

            if target_id:
                self.target_id = target_id
            else:
                # Open douyin.com in a fresh tab so cookies get populated.
                self.target_id = await self.browser_session.navigate_to_url(
                    "https://www.douyin.com/", new_tab=True
                )
                await asyncio.sleep(3)  # Wait for page to load

            cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=self.target_id)
            # Fetch browser cookies via CDP; bounded so a hung CDP call
            # cannot stall setup indefinitely.
            result = await asyncio.wait_for(
                cdp_session.cdp_client.send.Storage.getCookies(session_id=cdp_session.session_id),
                timeout=8.0
            )
            web_cookies = result.get('cookies', [])
            # Mirror the real browser's User-Agent so signed API requests
            # match the environment the signature was generated in.
            user_agent_result = await cdp_session.cdp_client.send.Runtime.evaluate(
                params={
                    'expression': "navigator.userAgent",
                    'returnByValue': True,
                    'awaitPromise': True
                },
                session_id=cdp_session.session_id,
            )
            user_agent = user_agent_result.get('result', {}).get('value')
            if user_agent:
                self.default_headers["User-Agent"] = user_agent
            cookie_str, cookie_dict = extract_cookies_from_browser(web_cookies)
            if cookie_str:
                self.default_headers["Cookie"] = cookie_str
                self.cookies = cookie_dict

            logger.info(f"Douyin client setup completed with {len(cookie_dict)} cookies")

        except Exception as e:
            # Any failure here (navigation, CDP, cookie extraction) is
            # surfaced uniformly as an AuthenticationError.
            logger.error(f"Failed to setup Douyin client: {e}")
            raise AuthenticationError(f"Douyin client setup failed: {e}")
|
|
116
|
+
|
|
117
|
+
async def _get_local_storage_token(self) -> Optional[str]:
|
|
118
|
+
"""Get msToken from browser local storage"""
|
|
119
|
+
try:
|
|
120
|
+
cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=self.target_id)
|
|
121
|
+
result = await cdp_session.cdp_client.send.Runtime.evaluate(
|
|
122
|
+
params={
|
|
123
|
+
'expression': "window.localStorage.getItem('xmst')",
|
|
124
|
+
'returnByValue': True,
|
|
125
|
+
'awaitPromise': True
|
|
126
|
+
},
|
|
127
|
+
session_id=cdp_session.session_id,
|
|
128
|
+
)
|
|
129
|
+
return result.get('result', {}).get('value')
|
|
130
|
+
except Exception as e:
|
|
131
|
+
logger.warning(f"Failed to get local storage token: {e}")
|
|
132
|
+
return None
|
|
133
|
+
|
|
134
|
+
def _init_js_context(self):
|
|
135
|
+
"""Initialize JavaScript context for signature generation"""
|
|
136
|
+
if not HAS_EXECJS:
|
|
137
|
+
logger.warning("execjs not available, signature generation disabled")
|
|
138
|
+
return None
|
|
139
|
+
|
|
140
|
+
try:
|
|
141
|
+
js_file_path = os.path.join(os.path.dirname(__file__), 'douyin.js')
|
|
142
|
+
if not os.path.exists(js_file_path):
|
|
143
|
+
logger.warning(f"douyin.js file not found at {js_file_path}")
|
|
144
|
+
return None
|
|
145
|
+
|
|
146
|
+
with open(js_file_path, 'r', encoding='utf-8-sig') as f:
|
|
147
|
+
js_content = f.read()
|
|
148
|
+
|
|
149
|
+
return execjs.compile(js_content)
|
|
150
|
+
except Exception as e:
|
|
151
|
+
logger.error(f"Failed to initialize JS context: {e}")
|
|
152
|
+
return None
|
|
153
|
+
|
|
154
|
+
    async def _get_a_bogus_signature(self, uri: str, params: str, post_data: Optional[Dict] = None) -> str:
        """
        Get a-bogus signature using JavaScript execution

        Args:
            uri: Request URI (only used to pick the signing function)
            params: URL parameters string
            post_data: POST data if applicable.
                NOTE(review): currently unused — only the query string and
                User-Agent are passed to the JS signer; confirm intent.

        Returns:
            a-bogus signature string, or "" when signing is unavailable or fails
        """
        try:
            # Lazily compile the JS context once and cache it on the instance.
            # A failed init is cached as None, so we don't retry every call.
            if not hasattr(self, '_js_context'):
                self._js_context = self._init_js_context()

            if not self._js_context:
                return ""

            user_agent = self.default_headers.get('User-Agent', '')

            # Determine the signature function name based on URI
            # NOTE(review): "sign_datail" looks like a typo for "sign_detail",
            # but it must match the function name inside douyin.js — verify there.
            sign_function_name = "sign_datail"
            if "/reply" in uri:
                sign_function_name = "sign_reply"

            # Call the JavaScript function
            a_bogus = self._js_context.call(sign_function_name, params, user_agent)
            return a_bogus or ""

        except Exception as e:
            # Signing is best-effort: callers simply omit a_bogus on failure.
            logger.warning(f"Failed to generate a-bogus signature: {e}")
            return ""
|
|
187
|
+
|
|
188
|
+
async def _prepare_request_params(self, uri: str, params: Optional[Dict] = None,
|
|
189
|
+
headers: Optional[Dict] = None, request_method: str = "GET",
|
|
190
|
+
post_data: Optional[Dict] = None):
|
|
191
|
+
"""
|
|
192
|
+
Prepare request parameters with common Douyin parameters and signatures
|
|
193
|
+
|
|
194
|
+
Args:
|
|
195
|
+
uri: Request URI
|
|
196
|
+
params: Request parameters
|
|
197
|
+
headers: Request headers
|
|
198
|
+
request_method: HTTP method
|
|
199
|
+
post_data: POST data if applicable
|
|
200
|
+
"""
|
|
201
|
+
if not params:
|
|
202
|
+
params = {}
|
|
203
|
+
|
|
204
|
+
headers = headers or copy.deepcopy(self.default_headers)
|
|
205
|
+
|
|
206
|
+
# Add common parameters
|
|
207
|
+
common_params = create_common_params()
|
|
208
|
+
|
|
209
|
+
# Add msToken from local storage
|
|
210
|
+
ms_token = await self._get_local_storage_token()
|
|
211
|
+
if ms_token:
|
|
212
|
+
common_params["msToken"] = ms_token
|
|
213
|
+
|
|
214
|
+
params.update(common_params)
|
|
215
|
+
|
|
216
|
+
# Generate query string
|
|
217
|
+
query_string = urllib.parse.urlencode(params)
|
|
218
|
+
|
|
219
|
+
# Get a-bogus signature
|
|
220
|
+
post_data = post_data or {}
|
|
221
|
+
a_bogus = await self._get_a_bogus_signature(uri, query_string, post_data)
|
|
222
|
+
if a_bogus:
|
|
223
|
+
params["a_bogus"] = a_bogus
|
|
224
|
+
|
|
225
|
+
return params, headers
|
|
226
|
+
|
|
227
|
+
    @retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
    async def _make_request(self, method: str, url: str, **kwargs) -> Union[str, Dict]:
        """
        Make HTTP request with error handling and retries (3 attempts, 2s apart
        via tenacity; any raised exception triggers a retry).

        Args:
            method: HTTP method
            url: Request URL
            **kwargs: Additional request parameters (params/data/headers/...)

        Returns:
            Parsed JSON on success, or raw text for a non-JSON 200 response.

        Raises:
            VerificationError: empty or "blocked" body (anti-bot triggered)
            DataExtractionError: non-200 status or unparseable error response
        """
        # A fresh client per call: no connection reuse, but also no shared
        # state across retry attempts.
        async with httpx.AsyncClient(proxy=self.proxy) as client:
            response = await client.request(method, url, timeout=self.timeout, **kwargs)

            # Handle common error responses
            if response.text == "" or response.text == "blocked":
                logger.error(f"Request blocked, response.text: {response.text}")
                raise VerificationError("Account may be blocked or requires verification")

            try:
                data = response.json()

                # Check for successful response
                if response.status_code == 200:
                    return data
                else:
                    # NOTE(review): assumes the JSON error body is an object;
                    # a JSON array here would raise AttributeError instead.
                    error_msg = data.get("message", "Request failed")
                    raise DataExtractionError(f"API error: {error_msg}")

            except json.JSONDecodeError:
                # Non-JSON payload: pass raw text through on 200, else fail.
                if response.status_code == 200:
                    return response.text
                else:
                    raise DataExtractionError(f"Invalid response: {response.text[:200]}")
|
|
263
|
+
|
|
264
|
+
async def get_request(self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None):
|
|
265
|
+
"""Make GET request with Douyin-specific parameter preparation"""
|
|
266
|
+
params, headers = await self._prepare_request_params(uri, params, headers, "GET")
|
|
267
|
+
return await self._make_request("GET", f"{self._host}{uri}", params=params, headers=headers)
|
|
268
|
+
|
|
269
|
+
    async def post_request(self, uri: str, data: Dict, headers: Optional[Dict] = None):
        """Make POST request with Douyin-specific parameter preparation"""
        data, headers = await self._prepare_request_params(uri, data, headers, "POST", post_data=data)
        # NOTE(review): httpx's data= sends a form-encoded body, while the
        # default Content-Type header is application/json — confirm whether
        # json= was intended here.
        return await self._make_request("POST", f"{self._host}{uri}", data=data, headers=headers)
|
|
273
|
+
|
|
274
|
+
    async def search_content_by_keyword(
        self,
        keyword: str,
        offset: int = 0,
        search_channel: SearchChannelType = SearchChannelType.GENERAL,
        sort_type: SearchSortType = SearchSortType.GENERAL,
        publish_time: PublishTimeType = PublishTimeType.UNLIMITED,
        search_id: str = "",
    ) -> List[Dict]:
        """
        Search content by keyword using Douyin Web Search API

        Args:
            keyword: Search keyword
            offset: Pagination offset
            search_channel: Search channel type
            sort_type: Sort method
            publish_time: Time filter
            search_id: Search session ID (carried across pages of one search)

        Returns:
            List of simplified aweme data
        """
        query_params = {
            'search_channel': search_channel.value,
            'enable_history': '1',
            'keyword': keyword,
            'search_source': 'tab_search',
            'query_correct_type': '1',
            'is_filter_search': '0',
            'offset': offset,
            'count': '15',
            'need_filter_settings': '1',
            'list_type': 'multi',
            'search_id': search_id,
        }

        # Add filters if not default
        if sort_type.value != SearchSortType.GENERAL.value or publish_time.value != PublishTimeType.UNLIMITED.value:
            query_params["filter_selected"] = json.dumps({
                "sort_type": str(sort_type.value),
                "publish_time": str(publish_time.value)
            })
            query_params["is_filter_search"] = 1
            query_params["search_source"] = "tab_search"

        # Referer must look like a real search page for the request to pass.
        referer_url = create_referer_url(keyword=keyword)
        headers = copy.copy(self.default_headers)
        headers["Referer"] = referer_url

        search_result = await self.get_request("/aweme/v1/web/general/search/single/", query_params, headers)

        # Return simplified aweme list
        aweme_list = []
        for post_item in search_result.get("data", []):
            try:
                # Regular results carry "aweme_info"; mix results nest items
                # under "aweme_mix_info.mix_items" — take the first item.
                # A missing/None mix_items raises TypeError/IndexError, which
                # we treat as "skip this result".
                aweme_info: Dict = (
                    post_item.get("aweme_info") or post_item.get("aweme_mix_info", {}).get("mix_items")[0])
            except (TypeError, IndexError):
                continue

            if not aweme_info or not aweme_info.get("aweme_id"):
                continue

            user_info = aweme_info.get("author", {})
            interact_info = aweme_info.get("statistics", {})

            # Simplified aweme data
            aweme_data = {
                "aweme_id": aweme_info.get("aweme_id"),
                "aweme_type": str(aweme_info.get("aweme_type", "")),
                "title": aweme_info.get("desc", ""),
                "desc": aweme_info.get("desc", ""),
                "create_time": aweme_info.get("create_time"),
                "user_id": user_info.get("uid"),
                "sec_uid": user_info.get("sec_uid"),
                "short_user_id": user_info.get("short_id"),
                "user_unique_id": user_info.get("unique_id"),
                "nickname": user_info.get("nickname"),
                "avatar": user_info.get("avatar_thumb", {}).get("url_list", [""])[0],
                "liked_count": str(interact_info.get("digg_count", 0)),
                "collected_count": str(interact_info.get("collect_count", 0)),
                "comment_count": str(interact_info.get("comment_count", 0)),
                "share_count": str(interact_info.get("share_count", 0)),
                "ip_location": aweme_info.get("ip_label", ""),
                "aweme_url": f"https://www.douyin.com/video/{aweme_info.get('aweme_id')}",
            }
            aweme_list.append(aweme_data)

        return aweme_list
|
|
364
|
+
|
|
365
|
+
async def fetch_video_details(self, aweme_id: str) -> Dict:
|
|
366
|
+
"""
|
|
367
|
+
Fetch detailed video information by aweme ID
|
|
368
|
+
|
|
369
|
+
Args:
|
|
370
|
+
aweme_id: Video ID
|
|
371
|
+
|
|
372
|
+
Returns:
|
|
373
|
+
Simplified video details data
|
|
374
|
+
"""
|
|
375
|
+
params = {"aweme_id": aweme_id}
|
|
376
|
+
headers = copy.copy(self.default_headers)
|
|
377
|
+
if "Origin" in headers:
|
|
378
|
+
del headers["Origin"]
|
|
379
|
+
|
|
380
|
+
response = await self.get_request("/aweme/v1/web/aweme/detail/", params, headers)
|
|
381
|
+
aweme_detail = response.get("aweme_detail", {})
|
|
382
|
+
|
|
383
|
+
if not aweme_detail:
|
|
384
|
+
return {}
|
|
385
|
+
|
|
386
|
+
user_info = aweme_detail.get("author", {})
|
|
387
|
+
interact_info = aweme_detail.get("statistics", {})
|
|
388
|
+
|
|
389
|
+
return {
|
|
390
|
+
"aweme_id": aweme_detail.get("aweme_id"),
|
|
391
|
+
"aweme_type": str(aweme_detail.get("aweme_type", "")),
|
|
392
|
+
"title": aweme_detail.get("desc", ""),
|
|
393
|
+
"desc": aweme_detail.get("desc", ""),
|
|
394
|
+
"create_time": aweme_detail.get("create_time"),
|
|
395
|
+
"user_id": user_info.get("uid"),
|
|
396
|
+
"sec_uid": user_info.get("sec_uid"),
|
|
397
|
+
"short_user_id": user_info.get("short_id"),
|
|
398
|
+
"user_unique_id": user_info.get("unique_id"),
|
|
399
|
+
"nickname": user_info.get("nickname"),
|
|
400
|
+
"avatar": user_info.get("avatar_thumb", {}).get("url_list", [""])[0],
|
|
401
|
+
"liked_count": str(interact_info.get("digg_count", 0)),
|
|
402
|
+
"collected_count": str(interact_info.get("collect_count", 0)),
|
|
403
|
+
"comment_count": str(interact_info.get("comment_count", 0)),
|
|
404
|
+
"share_count": str(interact_info.get("share_count", 0)),
|
|
405
|
+
"ip_location": aweme_detail.get("ip_label", ""),
|
|
406
|
+
"aweme_url": f"https://www.douyin.com/video/{aweme_detail.get('aweme_id')}",
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
async def fetch_video_comments(self, aweme_id: str, cursor: int = 0) -> List[Dict]:
|
|
410
|
+
"""
|
|
411
|
+
Fetch video comments with pagination
|
|
412
|
+
|
|
413
|
+
Args:
|
|
414
|
+
aweme_id: Video ID
|
|
415
|
+
cursor: Pagination cursor
|
|
416
|
+
|
|
417
|
+
Returns:
|
|
418
|
+
List of simplified comments data
|
|
419
|
+
"""
|
|
420
|
+
uri = "/aweme/v1/web/comment/list/"
|
|
421
|
+
params = {
|
|
422
|
+
"aweme_id": aweme_id,
|
|
423
|
+
"cursor": cursor,
|
|
424
|
+
"count": 20,
|
|
425
|
+
"item_type": 0
|
|
426
|
+
}
|
|
427
|
+
|
|
428
|
+
headers = copy.copy(self.default_headers)
|
|
429
|
+
headers["Referer"] = create_referer_url(aweme_id=aweme_id)
|
|
430
|
+
|
|
431
|
+
response = await self.get_request(uri, params, headers)
|
|
432
|
+
|
|
433
|
+
# Return simplified comments
|
|
434
|
+
comments = []
|
|
435
|
+
for comment_item in response.get("comments", []):
|
|
436
|
+
if not comment_item.get("cid"):
|
|
437
|
+
continue
|
|
438
|
+
|
|
439
|
+
user_info = comment_item.get("user", {})
|
|
440
|
+
avatar_info = (user_info.get("avatar_medium", {}) or
|
|
441
|
+
user_info.get("avatar_300x300", {}) or
|
|
442
|
+
user_info.get("avatar_168x168", {}) or
|
|
443
|
+
user_info.get("avatar_thumb", {}) or {})
|
|
444
|
+
|
|
445
|
+
comment_data = {
|
|
446
|
+
"comment_id": comment_item.get("cid"),
|
|
447
|
+
"create_time": comment_item.get("create_time"),
|
|
448
|
+
"ip_location": comment_item.get("ip_label", ""),
|
|
449
|
+
"aweme_id": aweme_id,
|
|
450
|
+
"content": comment_item.get("text"),
|
|
451
|
+
"user_id": user_info.get("uid"),
|
|
452
|
+
"sec_uid": user_info.get("sec_uid"),
|
|
453
|
+
"short_user_id": user_info.get("short_id"),
|
|
454
|
+
"user_unique_id": user_info.get("unique_id"),
|
|
455
|
+
"nickname": user_info.get("nickname"),
|
|
456
|
+
"avatar": avatar_info.get("url_list", [""])[0],
|
|
457
|
+
"sub_comment_count": str(comment_item.get("reply_comment_total", 0)),
|
|
458
|
+
"like_count": comment_item.get("digg_count", 0),
|
|
459
|
+
"parent_comment_id": comment_item.get("reply_id", "0"),
|
|
460
|
+
}
|
|
461
|
+
comments.append(comment_data)
|
|
462
|
+
|
|
463
|
+
return comments
|
|
464
|
+
|
|
465
|
+
async def fetch_comment_replies(self, aweme_id: str, comment_id: str, cursor: int = 0) -> List[Dict]:
|
|
466
|
+
"""
|
|
467
|
+
Fetch replies to a specific comment
|
|
468
|
+
|
|
469
|
+
Args:
|
|
470
|
+
aweme_id: Video ID
|
|
471
|
+
comment_id: Parent comment ID
|
|
472
|
+
cursor: Pagination cursor
|
|
473
|
+
|
|
474
|
+
Returns:
|
|
475
|
+
List of simplified reply comments data
|
|
476
|
+
"""
|
|
477
|
+
uri = "/aweme/v1/web/comment/list/reply/"
|
|
478
|
+
params = {
|
|
479
|
+
'comment_id': comment_id,
|
|
480
|
+
"cursor": cursor,
|
|
481
|
+
"count": 20,
|
|
482
|
+
"item_type": 0,
|
|
483
|
+
"item_id": aweme_id,
|
|
484
|
+
}
|
|
485
|
+
|
|
486
|
+
headers = copy.copy(self.default_headers)
|
|
487
|
+
headers["Referer"] = create_referer_url(aweme_id=aweme_id)
|
|
488
|
+
|
|
489
|
+
response = await self.get_request(uri, params, headers)
|
|
490
|
+
|
|
491
|
+
# Return simplified reply comments
|
|
492
|
+
replies = []
|
|
493
|
+
for comment_item in response.get("comments", []):
|
|
494
|
+
if not comment_item.get("cid"):
|
|
495
|
+
continue
|
|
496
|
+
|
|
497
|
+
user_info = comment_item.get("user", {})
|
|
498
|
+
avatar_info = (user_info.get("avatar_medium", {}) or
|
|
499
|
+
user_info.get("avatar_300x300", {}) or
|
|
500
|
+
user_info.get("avatar_168x168", {}) or
|
|
501
|
+
user_info.get("avatar_thumb", {}) or {})
|
|
502
|
+
|
|
503
|
+
reply_data = {
|
|
504
|
+
"comment_id": comment_item.get("cid"),
|
|
505
|
+
"create_time": comment_item.get("create_time"),
|
|
506
|
+
"ip_location": comment_item.get("ip_label", ""),
|
|
507
|
+
"aweme_id": aweme_id,
|
|
508
|
+
"content": comment_item.get("text"),
|
|
509
|
+
"user_id": user_info.get("uid"),
|
|
510
|
+
"sec_uid": user_info.get("sec_uid"),
|
|
511
|
+
"short_user_id": user_info.get("short_id"),
|
|
512
|
+
"user_unique_id": user_info.get("unique_id"),
|
|
513
|
+
"nickname": user_info.get("nickname"),
|
|
514
|
+
"avatar": avatar_info.get("url_list", [""])[0],
|
|
515
|
+
"sub_comment_count": str(comment_item.get("reply_comment_total", 0)),
|
|
516
|
+
"like_count": comment_item.get("digg_count", 0),
|
|
517
|
+
"parent_comment_id": comment_id,
|
|
518
|
+
}
|
|
519
|
+
replies.append(reply_data)
|
|
520
|
+
|
|
521
|
+
return replies
|
|
522
|
+
|
|
523
|
+
async def fetch_all_video_comments(
|
|
524
|
+
self,
|
|
525
|
+
aweme_id: str,
|
|
526
|
+
fetch_interval: float = 1.0,
|
|
527
|
+
include_replies: bool = False,
|
|
528
|
+
progress_callback: Optional[Callable] = None,
|
|
529
|
+
max_comments: int = 1000,
|
|
530
|
+
) -> List[Dict]:
|
|
531
|
+
"""
|
|
532
|
+
Fetch all comments for a video, including replies if requested
|
|
533
|
+
|
|
534
|
+
Args:
|
|
535
|
+
aweme_id: Video ID
|
|
536
|
+
fetch_interval: Delay between requests
|
|
537
|
+
include_replies: Whether to fetch comment replies
|
|
538
|
+
progress_callback: Callback for progress updates
|
|
539
|
+
max_comments: Maximum comments to fetch
|
|
540
|
+
|
|
541
|
+
Returns:
|
|
542
|
+
List of all simplified comments
|
|
543
|
+
"""
|
|
544
|
+
all_comments = []
|
|
545
|
+
has_more = True
|
|
546
|
+
cursor = 0
|
|
547
|
+
|
|
548
|
+
while has_more and len(all_comments) < max_comments:
|
|
549
|
+
uri = "/aweme/v1/web/comment/list/"
|
|
550
|
+
params = {
|
|
551
|
+
"aweme_id": aweme_id,
|
|
552
|
+
"cursor": cursor,
|
|
553
|
+
"count": 20,
|
|
554
|
+
"item_type": 0
|
|
555
|
+
}
|
|
556
|
+
|
|
557
|
+
headers = copy.copy(self.default_headers)
|
|
558
|
+
headers["Referer"] = create_referer_url(aweme_id=aweme_id)
|
|
559
|
+
|
|
560
|
+
comments_data = await self.get_request(uri, params, headers)
|
|
561
|
+
has_more = comments_data.get("has_more", False)
|
|
562
|
+
cursor = comments_data.get("cursor", 0)
|
|
563
|
+
|
|
564
|
+
# Get simplified comments from this batch
|
|
565
|
+
batch_comments = []
|
|
566
|
+
for comment_item in comments_data.get("comments", []):
|
|
567
|
+
if not comment_item.get("cid"):
|
|
568
|
+
continue
|
|
569
|
+
|
|
570
|
+
user_info = comment_item.get("user", {})
|
|
571
|
+
avatar_info = (user_info.get("avatar_medium", {}) or
|
|
572
|
+
user_info.get("avatar_300x300", {}) or
|
|
573
|
+
user_info.get("avatar_168x168", {}) or
|
|
574
|
+
user_info.get("avatar_thumb", {}) or {})
|
|
575
|
+
|
|
576
|
+
comment_data = {
|
|
577
|
+
"comment_id": comment_item.get("cid"),
|
|
578
|
+
"create_time": comment_item.get("create_time"),
|
|
579
|
+
"ip_location": comment_item.get("ip_label", ""),
|
|
580
|
+
"aweme_id": aweme_id,
|
|
581
|
+
"content": comment_item.get("text"),
|
|
582
|
+
"user_id": user_info.get("uid"),
|
|
583
|
+
"sec_uid": user_info.get("sec_uid"),
|
|
584
|
+
"short_user_id": user_info.get("short_id"),
|
|
585
|
+
"user_unique_id": user_info.get("unique_id"),
|
|
586
|
+
"nickname": user_info.get("nickname"),
|
|
587
|
+
"avatar": avatar_info.get("url_list", [""])[0],
|
|
588
|
+
"sub_comment_count": str(comment_item.get("reply_comment_total", 0)),
|
|
589
|
+
"like_count": comment_item.get("digg_count", 0),
|
|
590
|
+
"parent_comment_id": comment_item.get("reply_id", "0"),
|
|
591
|
+
}
|
|
592
|
+
batch_comments.append(comment_data)
|
|
593
|
+
|
|
594
|
+
if not batch_comments:
|
|
595
|
+
break
|
|
596
|
+
|
|
597
|
+
# Limit comments to max_comments
|
|
598
|
+
remaining_slots = max_comments - len(all_comments)
|
|
599
|
+
if remaining_slots <= 0:
|
|
600
|
+
break
|
|
601
|
+
|
|
602
|
+
if len(batch_comments) > remaining_slots:
|
|
603
|
+
batch_comments = batch_comments[:remaining_slots]
|
|
604
|
+
|
|
605
|
+
all_comments.extend(batch_comments)
|
|
606
|
+
|
|
607
|
+
if progress_callback:
|
|
608
|
+
await progress_callback(aweme_id, batch_comments)
|
|
609
|
+
|
|
610
|
+
await asyncio.sleep(fetch_interval)
|
|
611
|
+
|
|
612
|
+
# Fetch replies if requested
|
|
613
|
+
if include_replies:
|
|
614
|
+
for comment in batch_comments:
|
|
615
|
+
reply_count = int(comment.get("sub_comment_count", 0))
|
|
616
|
+
|
|
617
|
+
if reply_count > 0:
|
|
618
|
+
comment_id = comment.get("comment_id")
|
|
619
|
+
replies = await self.fetch_comment_replies(aweme_id, comment_id, 0)
|
|
620
|
+
all_comments.extend(replies)
|
|
621
|
+
|
|
622
|
+
if progress_callback:
|
|
623
|
+
await progress_callback(aweme_id, replies)
|
|
624
|
+
|
|
625
|
+
await asyncio.sleep(fetch_interval)
|
|
626
|
+
|
|
627
|
+
logger.info(f"Fetched {len(all_comments)} comments for video {aweme_id}")
|
|
628
|
+
return all_comments
|
|
629
|
+
|
|
630
|
+
async def fetch_user_info(self, sec_user_id: str) -> Dict:
|
|
631
|
+
"""
|
|
632
|
+
Fetch user profile information
|
|
633
|
+
|
|
634
|
+
Args:
|
|
635
|
+
sec_user_id: User's security ID
|
|
636
|
+
|
|
637
|
+
Returns:
|
|
638
|
+
Simplified user information data
|
|
639
|
+
"""
|
|
640
|
+
uri = "/aweme/v1/web/user/profile/other/"
|
|
641
|
+
params = {
|
|
642
|
+
"sec_user_id": sec_user_id,
|
|
643
|
+
"publish_video_strategy_type": 2,
|
|
644
|
+
"personal_center_strategy": 1,
|
|
645
|
+
}
|
|
646
|
+
response = await self.get_request(uri, params)
|
|
647
|
+
|
|
648
|
+
user_data = response.get("user", {})
|
|
649
|
+
if not user_data:
|
|
650
|
+
return {}
|
|
651
|
+
|
|
652
|
+
gender_map = {0: "未知", 1: "男", 2: "女"}
|
|
653
|
+
avatar_uri = user_data.get("avatar_300x300", {}).get("uri", "")
|
|
654
|
+
|
|
655
|
+
return {
|
|
656
|
+
"user_id": user_data.get("uid"),
|
|
657
|
+
"nickname": user_data.get("nickname"),
|
|
658
|
+
"gender": gender_map.get(user_data.get("gender"), "未知"),
|
|
659
|
+
"avatar": f"https://p3-pc.douyinpic.com/img/{avatar_uri}~c5_300x300.jpeg?from=2956013662" if avatar_uri else "",
|
|
660
|
+
"desc": user_data.get("signature"),
|
|
661
|
+
"ip_location": user_data.get("ip_location"),
|
|
662
|
+
"follows": user_data.get("following_count", 0),
|
|
663
|
+
"fans": user_data.get("max_follower_count", 0),
|
|
664
|
+
"interaction": user_data.get("total_favorited", 0),
|
|
665
|
+
"videos_count": user_data.get("aweme_count", 0),
|
|
666
|
+
}
|
|
667
|
+
|
|
668
|
+
async def fetch_user_videos(self, sec_user_id: str, max_cursor: str = "") -> List[Dict]:
|
|
669
|
+
"""
|
|
670
|
+
Fetch user's videos with pagination
|
|
671
|
+
|
|
672
|
+
Args:
|
|
673
|
+
sec_user_id: User's security ID
|
|
674
|
+
max_cursor: Pagination cursor
|
|
675
|
+
|
|
676
|
+
Returns:
|
|
677
|
+
List of simplified user videos data
|
|
678
|
+
"""
|
|
679
|
+
uri = "/aweme/v1/web/aweme/post/"
|
|
680
|
+
params = {
|
|
681
|
+
"sec_user_id": sec_user_id,
|
|
682
|
+
"count": 18,
|
|
683
|
+
"max_cursor": max_cursor,
|
|
684
|
+
"locate_query": "false",
|
|
685
|
+
"publish_video_strategy_type": 2,
|
|
686
|
+
}
|
|
687
|
+
response = await self.get_request(uri, params)
|
|
688
|
+
|
|
689
|
+
# Return simplified aweme list
|
|
690
|
+
aweme_list = []
|
|
691
|
+
for aweme_info in response.get("aweme_list", []):
|
|
692
|
+
if not aweme_info.get("aweme_id"):
|
|
693
|
+
continue
|
|
694
|
+
|
|
695
|
+
user_info = aweme_info.get("author", {})
|
|
696
|
+
interact_info = aweme_info.get("statistics", {})
|
|
697
|
+
|
|
698
|
+
aweme_data = {
|
|
699
|
+
"aweme_id": aweme_info.get("aweme_id"),
|
|
700
|
+
"aweme_type": str(aweme_info.get("aweme_type", "")),
|
|
701
|
+
"title": aweme_info.get("desc", ""),
|
|
702
|
+
"desc": aweme_info.get("desc", ""),
|
|
703
|
+
"create_time": aweme_info.get("create_time"),
|
|
704
|
+
"user_id": user_info.get("uid"),
|
|
705
|
+
"sec_uid": user_info.get("sec_uid"),
|
|
706
|
+
"short_user_id": user_info.get("short_id"),
|
|
707
|
+
"user_unique_id": user_info.get("unique_id"),
|
|
708
|
+
"nickname": user_info.get("nickname"),
|
|
709
|
+
"avatar": user_info.get("avatar_thumb", {}).get("url_list", [""])[0],
|
|
710
|
+
"liked_count": str(interact_info.get("digg_count", 0)),
|
|
711
|
+
"collected_count": str(interact_info.get("collect_count", 0)),
|
|
712
|
+
"comment_count": str(interact_info.get("comment_count", 0)),
|
|
713
|
+
"share_count": str(interact_info.get("share_count", 0)),
|
|
714
|
+
"ip_location": aweme_info.get("ip_label", ""),
|
|
715
|
+
"aweme_url": f"https://www.douyin.com/video/{aweme_info.get('aweme_id')}",
|
|
716
|
+
}
|
|
717
|
+
aweme_list.append(aweme_data)
|
|
718
|
+
|
|
719
|
+
return aweme_list
|
|
720
|
+
|
|
721
|
+
async def fetch_all_user_videos(
|
|
722
|
+
self,
|
|
723
|
+
sec_user_id: str,
|
|
724
|
+
progress_callback: Optional[Callable] = None,
|
|
725
|
+
max_videos: int = 1000
|
|
726
|
+
) -> List[Dict]:
|
|
727
|
+
"""
|
|
728
|
+
Fetch all videos from a user
|
|
729
|
+
|
|
730
|
+
Args:
|
|
731
|
+
sec_user_id: User's security ID
|
|
732
|
+
progress_callback: Callback for progress updates
|
|
733
|
+
max_videos: Maximum videos to fetch
|
|
734
|
+
|
|
735
|
+
Returns:
|
|
736
|
+
List of all simplified user videos
|
|
737
|
+
"""
|
|
738
|
+
all_videos = []
|
|
739
|
+
has_more = True
|
|
740
|
+
max_cursor = ""
|
|
741
|
+
|
|
742
|
+
while has_more and len(all_videos) < max_videos:
|
|
743
|
+
uri = "/aweme/v1/web/aweme/post/"
|
|
744
|
+
params = {
|
|
745
|
+
"sec_user_id": sec_user_id,
|
|
746
|
+
"count": 18,
|
|
747
|
+
"max_cursor": max_cursor,
|
|
748
|
+
"locate_query": "false",
|
|
749
|
+
"publish_video_strategy_type": 2,
|
|
750
|
+
}
|
|
751
|
+
videos_data = await self.get_request(uri, params)
|
|
752
|
+
has_more = videos_data.get("has_more", False)
|
|
753
|
+
max_cursor = videos_data.get("max_cursor", "")
|
|
754
|
+
|
|
755
|
+
# Get simplified videos from this batch
|
|
756
|
+
batch_videos = []
|
|
757
|
+
for aweme_info in videos_data.get("aweme_list", []):
|
|
758
|
+
if not aweme_info.get("aweme_id"):
|
|
759
|
+
continue
|
|
760
|
+
|
|
761
|
+
user_info = aweme_info.get("author", {})
|
|
762
|
+
interact_info = aweme_info.get("statistics", {})
|
|
763
|
+
|
|
764
|
+
aweme_data = {
|
|
765
|
+
"aweme_id": aweme_info.get("aweme_id"),
|
|
766
|
+
"aweme_type": str(aweme_info.get("aweme_type", "")),
|
|
767
|
+
"title": aweme_info.get("desc", ""),
|
|
768
|
+
"desc": aweme_info.get("desc", ""),
|
|
769
|
+
"create_time": aweme_info.get("create_time"),
|
|
770
|
+
"user_id": user_info.get("uid"),
|
|
771
|
+
"sec_uid": user_info.get("sec_uid"),
|
|
772
|
+
"short_user_id": user_info.get("short_id"),
|
|
773
|
+
"user_unique_id": user_info.get("unique_id"),
|
|
774
|
+
"nickname": user_info.get("nickname"),
|
|
775
|
+
"avatar": user_info.get("avatar_thumb", {}).get("url_list", [""])[0],
|
|
776
|
+
"liked_count": str(interact_info.get("digg_count", 0)),
|
|
777
|
+
"collected_count": str(interact_info.get("collect_count", 0)),
|
|
778
|
+
"comment_count": str(interact_info.get("comment_count", 0)),
|
|
779
|
+
"share_count": str(interact_info.get("share_count", 0)),
|
|
780
|
+
"ip_location": aweme_info.get("ip_label", ""),
|
|
781
|
+
"aweme_url": f"https://www.douyin.com/video/{aweme_info.get('aweme_id')}",
|
|
782
|
+
}
|
|
783
|
+
batch_videos.append(aweme_data)
|
|
784
|
+
|
|
785
|
+
if not batch_videos:
|
|
786
|
+
break
|
|
787
|
+
|
|
788
|
+
remaining_slots = max_videos - len(all_videos)
|
|
789
|
+
if remaining_slots <= 0:
|
|
790
|
+
break
|
|
791
|
+
|
|
792
|
+
if len(batch_videos) > remaining_slots:
|
|
793
|
+
batch_videos = batch_videos[:remaining_slots]
|
|
794
|
+
|
|
795
|
+
all_videos.extend(batch_videos)
|
|
796
|
+
logger.info(f"Fetched {len(batch_videos)} videos for user {sec_user_id}, total: {len(all_videos)}")
|
|
797
|
+
|
|
798
|
+
if progress_callback:
|
|
799
|
+
await progress_callback(batch_videos)
|
|
800
|
+
|
|
801
|
+
await asyncio.sleep(1.0) # Rate limiting
|
|
802
|
+
|
|
803
|
+
return all_videos
|
|
804
|
+
|
|
805
|
+
    async def check_login_status(self) -> bool:
        """
        Check if user is logged in to Douyin

        Returns:
            True if logged in, False otherwise
        """
        try:
            # No tab means setup() never ran — treat as logged out.
            if not self.target_id:
                return False

            cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=self.target_id)

            # Check localStorage for login status
            result = await cdp_session.cdp_client.send.Runtime.evaluate(
                params={
                    'expression': "window.localStorage.getItem('HasUserLogin')",
                    'returnByValue': True,
                },
                session_id=cdp_session.session_id,
            )

            has_user_login = result.get('result', {}).get('value')
            if has_user_login == "1":
                return True

            # Also check cookies for LOGIN_STATUS
            return self.cookies.get("LOGIN_STATUS") == "1"

        except Exception as e:
            # Any CDP failure is treated as "not logged in" rather than raised.
            logger.error(f"Failed to check login status: {e}")
            return False
|
|
837
|
+
|
|
838
|
+
async def close(self):
|
|
839
|
+
if self.browser_session and self.target_id:
|
|
840
|
+
try:
|
|
841
|
+
logger.info(f"Close target id: {self.target_id}")
|
|
842
|
+
await self.browser_session.cdp_client.send.Target.closeTarget(params={'targetId': self.target_id})
|
|
843
|
+
except Exception as e:
|
|
844
|
+
logger.warning(f"Error closing target {self.target_id}: {e}")
|
|
845
|
+
|