vibesurf 0.1.26__py3-none-any.whl → 0.1.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of vibesurf might be problematic. Click here for more details.

@@ -0,0 +1,845 @@
1
+ import asyncio
2
+ import json
3
+ import copy
4
+ import pdb
5
+ import time
6
+ import urllib.parse
7
+ import os
8
+ from typing import Dict, List, Optional, Callable, Union, Any
9
+ import httpx
10
+ import random
11
+ from tenacity import retry, stop_after_attempt, wait_fixed
12
+
13
+ try:
14
+ import execjs
15
+
16
+ HAS_EXECJS = True
17
+ except ImportError:
18
+ HAS_EXECJS = False
19
+
20
+ from vibe_surf.browser.agent_browser_session import AgentBrowserSession
21
+ from vibe_surf.logger import get_logger
22
+
23
+ from .helpers import (
24
+ SearchChannelType, SearchSortType, PublishTimeType,
25
+ generate_web_id, generate_trace_id, create_common_params,
26
+ extract_cookies_from_browser, create_referer_url,
27
+ extract_aweme_media_urls, DouyinError, NetworkError,
28
+ DataExtractionError, AuthenticationError, RateLimitError,
29
+ VerificationError
30
+ )
31
+
32
+ logger = get_logger(__name__)
33
+
34
+
35
class DouyinApiClient:
    """
    Douyin API client with integrated browser session management.

    The client borrows cookies and the user agent from a live browser tab
    (through CDP) and then talks to the Douyin web API directly with httpx.
    All ``fetch_*`` / ``search_*`` methods return "simplified" dictionaries
    built by the private ``_simplify_*`` helpers below.
    """

    def __init__(self, browser_session: AgentBrowserSession, timeout: int = 60, proxy: Optional[str] = None):
        """
        Initialize the Douyin API client

        Args:
            browser_session: Browser session for authentication
            timeout: Request timeout in seconds
            proxy: Proxy URL if needed
        """
        self.browser_session = browser_session
        self.target_id = None
        self.proxy = proxy
        self.timeout = timeout
        self._host = "https://www.douyin.com"

        # Default headers; "User-Agent" and "Cookie" are overwritten by setup()
        # with the values read from the real browser tab.
        self.default_headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
            "Host": "www.douyin.com",
            "Origin": "https://www.douyin.com/",
            "Referer": "https://www.douyin.com/",
            "Content-Type": "application/json;charset=UTF-8",
        }
        self.cookies = {}

    # ------------------------------------------------------------------
    # Session / signature plumbing
    # ------------------------------------------------------------------

    async def setup(self, target_id: Optional[str] = None):
        """
        Setup Douyin client by navigating to the site and extracting cookies

        Args:
            target_id: Specific target ID to use, or None to create new

        Raises:
            AuthenticationError: If unable to access Douyin properly
        """
        try:
            if self.target_id and self.cookies:
                logger.info("Douyin client already setup. Returning!")
                return

            if target_id:
                self.target_id = target_id
            else:
                self.target_id = await self.browser_session.navigate_to_url(
                    "https://www.douyin.com/", new_tab=True
                )
                await asyncio.sleep(3)  # Wait for page to load

            cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=self.target_id)
            result = await asyncio.wait_for(
                cdp_session.cdp_client.send.Storage.getCookies(session_id=cdp_session.session_id),
                timeout=8.0
            )
            web_cookies = result.get('cookies', [])

            # Reuse the browser's own user agent so API requests match the tab
            # the cookies were issued to.
            user_agent_result = await cdp_session.cdp_client.send.Runtime.evaluate(
                params={
                    'expression': "navigator.userAgent",
                    'returnByValue': True,
                    'awaitPromise': True
                },
                session_id=cdp_session.session_id,
            )
            user_agent = user_agent_result.get('result', {}).get('value')
            if user_agent:
                self.default_headers["User-Agent"] = user_agent

            cookie_str, cookie_dict = extract_cookies_from_browser(web_cookies)
            if cookie_str:
                self.default_headers["Cookie"] = cookie_str
            self.cookies = cookie_dict

            logger.info(f"Douyin client setup completed with {len(cookie_dict)} cookies")

        except Exception as e:
            logger.error(f"Failed to setup Douyin client: {e}")
            # Chain the original exception so the root cause stays visible.
            raise AuthenticationError(f"Douyin client setup failed: {e}") from e

    async def _get_local_storage_token(self) -> Optional[str]:
        """Read the 'xmst' entry (used as msToken) from the tab's local storage.

        Returns:
            The stored value, or None when it is missing or the CDP call fails.
        """
        try:
            cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=self.target_id)
            result = await cdp_session.cdp_client.send.Runtime.evaluate(
                params={
                    'expression': "window.localStorage.getItem('xmst')",
                    'returnByValue': True,
                    'awaitPromise': True
                },
                session_id=cdp_session.session_id,
            )
            return result.get('result', {}).get('value')
        except Exception as e:
            logger.warning(f"Failed to get local storage token: {e}")
            return None

    def _init_js_context(self):
        """Compile the bundled douyin.js for signature generation.

        Returns:
            The compiled execjs context, or None when execjs is not installed,
            the script file is missing, or compilation fails.
        """
        if not HAS_EXECJS:
            logger.warning("execjs not available, signature generation disabled")
            return None

        try:
            js_file_path = os.path.join(os.path.dirname(__file__), 'douyin.js')
            if not os.path.exists(js_file_path):
                logger.warning(f"douyin.js file not found at {js_file_path}")
                return None

            # utf-8-sig transparently drops a BOM if the file has one.
            with open(js_file_path, 'r', encoding='utf-8-sig') as f:
                js_content = f.read()

            return execjs.compile(js_content)
        except Exception as e:
            logger.error(f"Failed to initialize JS context: {e}")
            return None

    async def _get_a_bogus_signature(self, uri: str, params: str, post_data: Optional[Dict] = None) -> str:
        """
        Get a-bogus signature using JavaScript execution

        Args:
            uri: Request URI (selects which JS signing function is called)
            params: URL parameters string
            post_data: POST data if applicable (currently not used for signing)

        Returns:
            a-bogus signature string, or "" when no signature could be produced
        """
        try:
            # Lazily build the JS context once; a failed build is cached as None
            # so the script is not re-read on every request.
            if not hasattr(self, '_js_context'):
                self._js_context = self._init_js_context()

            if not self._js_context:
                return ""

            user_agent = self.default_headers.get('User-Agent', '')

            # Reply endpoints use a dedicated signing function in douyin.js.
            sign_function_name = "sign_reply" if "/reply" in uri else "sign_datail"

            a_bogus = self._js_context.call(sign_function_name, params, user_agent)
            return a_bogus or ""

        except Exception as e:
            logger.warning(f"Failed to generate a-bogus signature: {e}")
            return ""

    async def _prepare_request_params(self, uri: str, params: Optional[Dict] = None,
                                      headers: Optional[Dict] = None, request_method: str = "GET",
                                      post_data: Optional[Dict] = None):
        """
        Prepare request parameters with common Douyin parameters and signatures

        The caller's ``params`` dict is never modified; a merged copy is returned.

        Args:
            uri: Request URI
            params: Request parameters
            headers: Request headers (defaults to a copy of ``default_headers``)
            request_method: HTTP method
            post_data: POST data if applicable

        Returns:
            Tuple ``(params, headers)`` ready to be sent.
        """
        # Work on a copy: the original implementation updated the caller's dict
        # in place, leaking common params / a_bogus into caller-owned data.
        merged_params = dict(params) if params else {}

        headers = headers or copy.deepcopy(self.default_headers)

        # Add common parameters
        common_params = create_common_params()

        # Add msToken from local storage
        ms_token = await self._get_local_storage_token()
        if ms_token:
            common_params["msToken"] = ms_token

        merged_params.update(common_params)

        # The signature is computed over the encoded query string, so it must
        # be generated after every other parameter is in place.
        query_string = urllib.parse.urlencode(merged_params)
        a_bogus = await self._get_a_bogus_signature(uri, query_string, post_data or {})
        if a_bogus:
            merged_params["a_bogus"] = a_bogus

        return merged_params, headers

    # NOTE(review): without ``reraise=True`` tenacity is documented to wrap the
    # last failure in ``tenacity.RetryError``, so callers probably never see
    # VerificationError / DataExtractionError directly - confirm whether that is
    # intended before changing it, since it alters the exception type raised.
    @retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
    async def _make_request(self, method: str, url: str, **kwargs) -> Union[str, Dict]:
        """
        Make HTTP request with error handling and retries

        Any exception triggers a retry (3 attempts, 2 seconds apart).

        Args:
            method: HTTP method
            url: Request URL
            **kwargs: Additional request parameters forwarded to httpx

        Returns:
            Parsed JSON on HTTP 200, or the raw text when a 200 body is not JSON

        Raises:
            VerificationError: Body is empty or the literal "blocked"
            DataExtractionError: Non-200 status
        """
        async with httpx.AsyncClient(proxy=self.proxy) as client:
            response = await client.request(method, url, timeout=self.timeout, **kwargs)

        # Handle common error responses
        if response.text == "" or response.text == "blocked":
            logger.error(f"Request blocked, response.text: {response.text}")
            raise VerificationError("Account may be blocked or requires verification")

        succeeded = response.status_code == 200
        try:
            data = response.json()
        except json.JSONDecodeError:
            if succeeded:
                return response.text
            raise DataExtractionError(f"Invalid response: {response.text[:200]}")

        if succeeded:
            return data

        # Error bodies are not guaranteed to be JSON objects.
        error_msg = data.get("message", "Request failed") if isinstance(data, dict) else "Request failed"
        raise DataExtractionError(f"API error: {error_msg}")

    async def get_request(self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None):
        """Make GET request with Douyin-specific parameter preparation"""
        params, headers = await self._prepare_request_params(uri, params, headers, "GET")
        return await self._make_request("GET", f"{self._host}{uri}", params=params, headers=headers)

    async def post_request(self, uri: str, data: Dict, headers: Optional[Dict] = None):
        """Make POST request with Douyin-specific parameter preparation.

        The caller's ``data`` dict is left untouched; the signed copy is sent.
        """
        form_data, headers = await self._prepare_request_params(uri, data, headers, "POST", post_data=data)
        return await self._make_request("POST", f"{self._host}{uri}", data=form_data, headers=headers)

    # ------------------------------------------------------------------
    # Response simplification helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _first_url(image_info: Any) -> str:
        """Return the first entry of ``image_info["url_list"]``, or "" if unavailable."""
        if not isinstance(image_info, dict):
            return ""
        url_list = image_info.get("url_list") or [""]
        return url_list[0]

    @classmethod
    def _simplify_aweme(cls, aweme_info: Dict) -> Dict:
        """Flatten a raw aweme (video) object into the simplified dict returned to callers."""
        # ``or {}`` also covers keys that are present but null in the JSON.
        user_info = aweme_info.get("author") or {}
        interact_info = aweme_info.get("statistics") or {}
        aweme_id = aweme_info.get("aweme_id")

        return {
            "aweme_id": aweme_id,
            "aweme_type": str(aweme_info.get("aweme_type", "")),
            "title": aweme_info.get("desc", ""),
            "desc": aweme_info.get("desc", ""),
            "create_time": aweme_info.get("create_time"),
            "user_id": user_info.get("uid"),
            "sec_uid": user_info.get("sec_uid"),
            "short_user_id": user_info.get("short_id"),
            "user_unique_id": user_info.get("unique_id"),
            "nickname": user_info.get("nickname"),
            "avatar": cls._first_url(user_info.get("avatar_thumb")),
            "liked_count": str(interact_info.get("digg_count", 0)),
            "collected_count": str(interact_info.get("collect_count", 0)),
            "comment_count": str(interact_info.get("comment_count", 0)),
            "share_count": str(interact_info.get("share_count", 0)),
            "ip_location": aweme_info.get("ip_label", ""),
            "aweme_url": f"https://www.douyin.com/video/{aweme_id}",
        }

    @classmethod
    def _simplify_comment(cls, comment_item: Dict, aweme_id: str,
                          parent_comment_id: Optional[str] = None) -> Dict:
        """Flatten a raw comment object into the simplified dict returned to callers.

        Args:
            comment_item: Raw comment object from the API
            aweme_id: Video the comment belongs to
            parent_comment_id: Forced parent ID (used for replies); when None the
                comment's own ``reply_id`` is used, defaulting to "0"
        """
        user_info = comment_item.get("user") or {}
        # Use the first avatar variant that is present and non-empty.
        avatar_info = (user_info.get("avatar_medium") or
                       user_info.get("avatar_300x300") or
                       user_info.get("avatar_168x168") or
                       user_info.get("avatar_thumb") or {})
        if parent_comment_id is None:
            parent_comment_id = comment_item.get("reply_id", "0")

        return {
            "comment_id": comment_item.get("cid"),
            "create_time": comment_item.get("create_time"),
            "ip_location": comment_item.get("ip_label", ""),
            "aweme_id": aweme_id,
            "content": comment_item.get("text"),
            "user_id": user_info.get("uid"),
            "sec_uid": user_info.get("sec_uid"),
            "short_user_id": user_info.get("short_id"),
            "user_unique_id": user_info.get("unique_id"),
            "nickname": user_info.get("nickname"),
            "avatar": cls._first_url(avatar_info),
            # ``or 0`` keeps this parseable with int() even if the API sends null.
            "sub_comment_count": str(comment_item.get("reply_comment_total") or 0),
            "like_count": comment_item.get("digg_count", 0),
            "parent_comment_id": parent_comment_id,
        }

    @classmethod
    def _simplify_comments(cls, raw_comments: Any, aweme_id: str,
                           parent_comment_id: Optional[str] = None) -> List[Dict]:
        """Simplify every raw comment that carries a ``cid``; tolerate a null list."""
        return [
            cls._simplify_comment(item, aweme_id, parent_comment_id)
            for item in raw_comments or []
            if item.get("cid")
        ]

    @classmethod
    def _simplify_awemes(cls, raw_awemes: Any) -> List[Dict]:
        """Simplify every raw aweme that carries an ``aweme_id``; tolerate a null list."""
        return [cls._simplify_aweme(item) for item in raw_awemes or [] if item.get("aweme_id")]

    @staticmethod
    def _extract_search_aweme(post_item: Any) -> Optional[Dict]:
        """Pick the aweme object out of one search result entry.

        Uses ``aweme_info`` when present, otherwise the first item of
        ``aweme_mix_info.mix_items``. Returns None when neither is usable.
        """
        if not isinstance(post_item, dict):
            return None
        aweme_info = post_item.get("aweme_info")
        if not aweme_info:
            mix_items = (post_item.get("aweme_mix_info") or {}).get("mix_items") or []
            aweme_info = mix_items[0] if mix_items else None
        return aweme_info if isinstance(aweme_info, dict) else None

    async def _request_comment_page(self, aweme_id: str, cursor: int):
        """Request one raw page (20 items) of top-level comments for a video."""
        params = {
            "aweme_id": aweme_id,
            "cursor": cursor,
            "count": 20,
            "item_type": 0
        }
        headers = copy.copy(self.default_headers)
        headers["Referer"] = create_referer_url(aweme_id=aweme_id)
        return await self.get_request("/aweme/v1/web/comment/list/", params, headers)

    async def _request_user_video_page(self, sec_user_id: str, max_cursor: Any):
        """Request one raw page (18 items) of a user's posted videos."""
        params = {
            "sec_user_id": sec_user_id,
            "count": 18,
            "max_cursor": max_cursor,
            "locate_query": "false",
            "publish_video_strategy_type": 2,
        }
        return await self.get_request("/aweme/v1/web/aweme/post/", params)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def search_content_by_keyword(
            self,
            keyword: str,
            offset: int = 0,
            search_channel: SearchChannelType = SearchChannelType.GENERAL,
            sort_type: SearchSortType = SearchSortType.GENERAL,
            publish_time: PublishTimeType = PublishTimeType.UNLIMITED,
            search_id: str = "",
    ) -> List[Dict]:
        """
        Search content by keyword using Douyin Web Search API

        Args:
            keyword: Search keyword
            offset: Pagination offset
            search_channel: Search channel type
            sort_type: Sort method
            publish_time: Time filter
            search_id: Search session ID

        Returns:
            List of simplified aweme data (entries without a usable aweme are skipped)
        """
        query_params = {
            'search_channel': search_channel.value,
            'enable_history': '1',
            'keyword': keyword,
            'search_source': 'tab_search',
            'query_correct_type': '1',
            'is_filter_search': '0',
            'offset': offset,
            'count': '15',
            'need_filter_settings': '1',
            'list_type': 'multi',
            'search_id': search_id,
        }

        # Only send the filter payload when at least one filter is non-default.
        uses_filter = (sort_type.value != SearchSortType.GENERAL.value
                       or publish_time.value != PublishTimeType.UNLIMITED.value)
        if uses_filter:
            query_params["filter_selected"] = json.dumps({
                "sort_type": str(sort_type.value),
                "publish_time": str(publish_time.value)
            })
            query_params["is_filter_search"] = 1

        headers = copy.copy(self.default_headers)
        headers["Referer"] = create_referer_url(keyword=keyword)

        search_result = await self.get_request("/aweme/v1/web/general/search/single/", query_params, headers)

        aweme_list = []
        for post_item in search_result.get("data") or []:
            aweme_info = self._extract_search_aweme(post_item)
            if not aweme_info or not aweme_info.get("aweme_id"):
                continue
            aweme_list.append(self._simplify_aweme(aweme_info))

        return aweme_list

    async def fetch_video_details(self, aweme_id: str) -> Dict:
        """
        Fetch detailed video information by aweme ID

        Args:
            aweme_id: Video ID

        Returns:
            Simplified video details data, or {} when the response has no detail
        """
        # This endpoint is requested without the Origin header.
        headers = copy.copy(self.default_headers)
        headers.pop("Origin", None)

        response = await self.get_request("/aweme/v1/web/aweme/detail/", {"aweme_id": aweme_id}, headers)
        aweme_detail = response.get("aweme_detail", {})
        if not aweme_detail:
            return {}

        return self._simplify_aweme(aweme_detail)

    async def fetch_video_comments(self, aweme_id: str, cursor: int = 0) -> List[Dict]:
        """
        Fetch video comments with pagination

        Args:
            aweme_id: Video ID
            cursor: Pagination cursor

        Returns:
            List of simplified comments data
        """
        response = await self._request_comment_page(aweme_id, cursor)
        return self._simplify_comments(response.get("comments"), aweme_id)

    async def fetch_comment_replies(self, aweme_id: str, comment_id: str, cursor: int = 0) -> List[Dict]:
        """
        Fetch replies to a specific comment

        Args:
            aweme_id: Video ID
            comment_id: Parent comment ID
            cursor: Pagination cursor

        Returns:
            List of simplified reply comments data (one page, up to 20 items)
        """
        uri = "/aweme/v1/web/comment/list/reply/"
        params = {
            'comment_id': comment_id,
            "cursor": cursor,
            "count": 20,
            "item_type": 0,
            "item_id": aweme_id,
        }

        headers = copy.copy(self.default_headers)
        headers["Referer"] = create_referer_url(aweme_id=aweme_id)

        response = await self.get_request(uri, params, headers)

        # Replies always report the requested comment as their parent.
        return self._simplify_comments(response.get("comments"), aweme_id, parent_comment_id=comment_id)

    async def fetch_all_video_comments(
            self,
            aweme_id: str,
            fetch_interval: float = 1.0,
            include_replies: bool = False,
            progress_callback: Optional[Callable] = None,
            max_comments: int = 1000,
    ) -> List[Dict]:
        """
        Fetch all comments for a video, including replies if requested

        Args:
            aweme_id: Video ID
            fetch_interval: Delay between requests
            include_replies: Whether to fetch comment replies (first page per comment)
            progress_callback: Awaited as ``progress_callback(aweme_id, batch)``
                after each batch of comments or replies
            max_comments: Maximum comments to fetch; replies count towards it

        Returns:
            List of all simplified comments, never longer than ``max_comments``
        """
        all_comments = []
        has_more = True
        cursor = 0

        while has_more and len(all_comments) < max_comments:
            comments_data = await self._request_comment_page(aweme_id, cursor)
            has_more = comments_data.get("has_more", False)
            cursor = comments_data.get("cursor", 0)

            batch_comments = self._simplify_comments(comments_data.get("comments"), aweme_id)
            if not batch_comments:
                break

            # The loop condition guarantees at least one free slot here.
            batch_comments = batch_comments[:max_comments - len(all_comments)]
            all_comments.extend(batch_comments)

            if progress_callback:
                await progress_callback(aweme_id, batch_comments)

            await asyncio.sleep(fetch_interval)

            if not include_replies:
                continue

            for comment in batch_comments:
                # Replies used to be appended unconditionally, letting the result
                # grow past max_comments; stop as soon as the budget is used up.
                if len(all_comments) >= max_comments:
                    break

                if int(comment.get("sub_comment_count", 0)) <= 0:
                    continue

                replies = await self.fetch_comment_replies(aweme_id, comment.get("comment_id"), 0)
                replies = replies[:max_comments - len(all_comments)]
                all_comments.extend(replies)

                if progress_callback:
                    await progress_callback(aweme_id, replies)

                await asyncio.sleep(fetch_interval)

        logger.info(f"Fetched {len(all_comments)} comments for video {aweme_id}")
        return all_comments

    async def fetch_user_info(self, sec_user_id: str) -> Dict:
        """
        Fetch user profile information

        Args:
            sec_user_id: User's security ID

        Returns:
            Simplified user information data, or {} when the response has no user
        """
        uri = "/aweme/v1/web/user/profile/other/"
        params = {
            "sec_user_id": sec_user_id,
            "publish_video_strategy_type": 2,
            "personal_center_strategy": 1,
        }
        response = await self.get_request(uri, params)

        user_data = response.get("user", {})
        if not user_data:
            return {}

        # Gender codes are mapped to display labels; unknown codes fall back to
        # the same label as code 0.
        gender_map = {0: "未知", 1: "男", 2: "女"}
        avatar_uri = (user_data.get("avatar_300x300") or {}).get("uri", "")

        return {
            "user_id": user_data.get("uid"),
            "nickname": user_data.get("nickname"),
            "gender": gender_map.get(user_data.get("gender"), "未知"),
            "avatar": f"https://p3-pc.douyinpic.com/img/{avatar_uri}~c5_300x300.jpeg?from=2956013662" if avatar_uri else "",
            "desc": user_data.get("signature"),
            "ip_location": user_data.get("ip_location"),
            "follows": user_data.get("following_count", 0),
            "fans": user_data.get("max_follower_count", 0),
            "interaction": user_data.get("total_favorited", 0),
            "videos_count": user_data.get("aweme_count", 0),
        }

    async def fetch_user_videos(self, sec_user_id: str, max_cursor: str = "") -> List[Dict]:
        """
        Fetch user's videos with pagination

        Args:
            sec_user_id: User's security ID
            max_cursor: Pagination cursor

        Returns:
            List of simplified user videos data
        """
        response = await self._request_user_video_page(sec_user_id, max_cursor)
        return self._simplify_awemes(response.get("aweme_list"))

    async def fetch_all_user_videos(
            self,
            sec_user_id: str,
            progress_callback: Optional[Callable] = None,
            max_videos: int = 1000
    ) -> List[Dict]:
        """
        Fetch all videos from a user

        Args:
            sec_user_id: User's security ID
            progress_callback: Awaited as ``progress_callback(batch)`` after each page
            max_videos: Maximum videos to fetch

        Returns:
            List of all simplified user videos, never longer than ``max_videos``
        """
        all_videos = []
        has_more = True
        max_cursor = ""

        while has_more and len(all_videos) < max_videos:
            videos_data = await self._request_user_video_page(sec_user_id, max_cursor)
            has_more = videos_data.get("has_more", False)
            max_cursor = videos_data.get("max_cursor", "")

            batch_videos = self._simplify_awemes(videos_data.get("aweme_list"))
            if not batch_videos:
                break

            # The loop condition guarantees at least one free slot here.
            batch_videos = batch_videos[:max_videos - len(all_videos)]
            all_videos.extend(batch_videos)
            logger.info(f"Fetched {len(batch_videos)} videos for user {sec_user_id}, total: {len(all_videos)}")

            if progress_callback:
                await progress_callback(batch_videos)

            await asyncio.sleep(1.0)  # Rate limiting

        return all_videos

    async def check_login_status(self) -> bool:
        """
        Check if user is logged in to Douyin

        Looks at the tab's ``HasUserLogin`` local-storage entry first and falls
        back to the ``LOGIN_STATUS`` cookie captured during setup().

        Returns:
            True if logged in, False otherwise (including on any error)
        """
        try:
            if not self.target_id:
                return False

            cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=self.target_id)

            # Check localStorage for login status
            result = await cdp_session.cdp_client.send.Runtime.evaluate(
                params={
                    'expression': "window.localStorage.getItem('HasUserLogin')",
                    'returnByValue': True,
                },
                session_id=cdp_session.session_id,
            )

            if result.get('result', {}).get('value') == "1":
                return True

            # Also check cookies for LOGIN_STATUS
            return self.cookies.get("LOGIN_STATUS") == "1"

        except Exception as e:
            logger.error(f"Failed to check login status: {e}")
            return False

    async def close(self):
        """Close the browser tab used by this client (best effort).

        Failures are logged, not raised. ``target_id`` is cleared afterwards so
        a later setup() opens a fresh tab instead of reusing a closed one.
        """
        if not (self.browser_session and self.target_id):
            return

        try:
            logger.info(f"Close target id: {self.target_id}")
            await self.browser_session.cdp_client.send.Target.closeTarget(params={'targetId': self.target_id})
        except Exception as e:
            logger.warning(f"Error closing target {self.target_id}: {e}")
        finally:
            self.target_id = None
845
+