vibesurf 0.1.26__py3-none-any.whl → 0.1.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of vibesurf might be problematic.

The release adds one new file (1,179 lines):
import asyncio
import json
import re
import copy
import time
import urllib.parse
import xml.etree.ElementTree as ET
from typing import Dict, List, Optional, Callable, Union, Any
import httpx
from tenacity import retry, stop_after_attempt, wait_fixed
from urllib.parse import parse_qs, unquote, urlencode

from vibe_surf.browser.agent_browser_session import AgentBrowserSession
from vibe_surf.logger import get_logger

from .helpers import (
    SearchType, SortType, Duration, UploadDate,
    extract_cookies_from_browser, extract_video_id_from_url,
    extract_channel_id_from_url, extract_playlist_id_from_url,
    parse_youtube_duration, format_view_count, parse_youtube_time,
    process_youtube_text, validate_youtube_data, sanitize_filename,
    extract_ytcfg_data, extract_initial_data, get_desktop_user_agent,
    build_search_url, extract_continuation_token, decode_html_entities,
    extract_thumbnail_url, generate_visitor_data,
    YouTubeError, NetworkError, DataExtractionError,
    AuthenticationError, RateLimitError, ContentNotFoundError
)

logger = get_logger(__name__)


class YouTubeApiClient:
    """
    YouTube API client with integrated browser session management.
    This client handles API communication through a browser session for authentication.
    """

    def __init__(self, browser_session: AgentBrowserSession, timeout: int = 60, proxy: Optional[str] = None):
        """
        Initialize the YouTube API client.

        Args:
            browser_session: Browser session for authentication
            timeout: Request timeout in seconds
            proxy: Proxy URL, if needed
        """
        self.browser_session = browser_session
        self.target_id = None
        self.proxy = proxy
        self.timeout = timeout
        self._base_url = "https://www.youtube.com"
        self._api_base = "https://www.youtube.com/youtubei/v1"

        # YouTube API key and client version (these are usually extracted from the page)
        self._api_key = None
        self._client_version = "2.20240229.01.00"
        self._visitor_data = generate_visitor_data()

        # Default headers for YouTube
        self.default_headers = {
            "User-Agent": get_desktop_user_agent(),
            "Origin": "https://www.youtube.com",
            "Referer": "https://www.youtube.com/",
            "Content-Type": "application/json;charset=UTF-8",
        }
        self.cookies = {}

    async def setup(self, target_id: Optional[str] = None):
        """
        Set up the YouTube client by navigating to the site and extracting cookies.

        Args:
            target_id: Specific browser target ID to use

        Raises:
            AuthenticationError: If setup fails
        """
        try:
            if self.target_id and self.cookies and self._api_key:
                logger.info("YouTube client already set up. Returning.")
                return

            if target_id:
                self.target_id = target_id
            else:
                # Navigate to YouTube home page
                self.target_id = await self.browser_session.navigate_to_url(
                    "https://www.youtube.com/", new_tab=True
                )
                await asyncio.sleep(3)  # Wait for page load

            # Extract cookies from browser
            cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=self.target_id)
            result = await asyncio.wait_for(
                cdp_session.cdp_client.send.Storage.getCookies(session_id=cdp_session.session_id),
                timeout=8.0
            )
            web_cookies = result.get('cookies', [])

            cookie_str, cookie_dict = extract_cookies_from_browser(web_cookies)
            self.default_headers["Cookie"] = cookie_str
            self.cookies = cookie_dict

            # Get user agent from browser
            user_agent_result = await cdp_session.cdp_client.send.Runtime.evaluate(
                params={
                    'expression': "navigator.userAgent",
                    'returnByValue': True,
                    'awaitPromise': True
                },
                session_id=cdp_session.session_id,
            )
            user_agent = user_agent_result.get('result', {}).get('value')
            if user_agent:
                self.default_headers["User-Agent"] = user_agent

            # Extract API key and configuration from page
            await self._extract_api_config()

            logger.info("YouTube client setup completed successfully")

        except Exception as e:
            logger.error(f"Failed to setup YouTube client: {e}")
            raise AuthenticationError(f"Setup failed: {e}")

    async def _extract_api_config(self):
        """Extract API key and configuration from YouTube page"""
        try:
            cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=self.target_id)

            # Get page content to extract API key
            content_result = await cdp_session.cdp_client.send.Runtime.evaluate(
                params={
                    'expression': "document.documentElement.outerHTML",
                    'returnByValue': True,
                },
                session_id=cdp_session.session_id,
            )

            html_content = content_result.get('result', {}).get('value', '')

            # Extract API key from page
            api_key_match = re.search(r'"INNERTUBE_API_KEY":"([^"]+)"', html_content)
            if api_key_match:
                self._api_key = api_key_match.group(1)
                logger.info(f"Extracted YouTube API key: {self._api_key[:10]}...")

            # Extract client version
            version_match = re.search(r'"clientVersion":"([^"]+)"', html_content)
            if version_match:
                self._client_version = version_match.group(1)
                self.default_headers["X-YouTube-Client-Version"] = self._client_version

            # Extract visitor data if available
            visitor_match = re.search(r'"visitorData":"([^"]+)"', html_content)
            if visitor_match:
                self._visitor_data = visitor_match.group(1)

        except Exception as e:
            logger.warning(f"Failed to extract YouTube API config: {e}")
            # Use default values if extraction fails

    async def pong(self) -> bool:
        """Check if the client is working by making a simple request"""
        try:
            logger.info("Testing YouTube client status...")

            # Try to make a simple search request
            test_response = await self.search_videos("test", max_results=1)

            if test_response:
                logger.info("YouTube client status: Valid")
                return True

            logger.warning("YouTube client status: Invalid response")
            return False

        except Exception as e:
            logger.error(f"Failed to check YouTube client status: {e}")
            return False

    @retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
    async def _make_request(self, method: str, url: str, **kwargs):
        """
        Make an HTTP request with error handling and retry logic

        Args:
            method: HTTP method
            url: Request URL
            **kwargs: Additional request parameters

        Returns:
            Response data
        """
        raw_response = kwargs.pop("raw_response", False)

        async with httpx.AsyncClient(proxy=self.proxy, timeout=self.timeout) as client:
            response = await client.request(method, url, **kwargs)

            # Handle common error status codes
            if response.status_code == 403:
                raise AuthenticationError("Access forbidden - may need login or verification")
            elif response.status_code == 429:
                raise RateLimitError("Rate limit exceeded")
            elif response.status_code == 404:
                raise ContentNotFoundError("Content not found")
            elif response.status_code >= 500:
                raise NetworkError(f"Server error: {response.status_code}")

            if raw_response:
                return response

            try:
                if 'application/json' in response.headers.get('content-type', ''):
                    return response.json()
                else:
                    return response.text

            except json.JSONDecodeError:
                return response.text

    async def _make_api_request(self, endpoint: str, data: Dict, **kwargs) -> Dict:
        """Make a YouTube InnerTube API request"""
        if not self._api_key:
            raise AuthenticationError("YouTube API key not available")

        url = f"{self._api_base}/{endpoint}?key={self._api_key}"

        # Add default context to request data
        request_data = {
            "context": {
                "client": {
                    "clientName": "WEB",
                    "clientVersion": self._client_version,
                    "visitorData": self._visitor_data,
                },
                "user": {
                    "lockedSafetyMode": False
                },
                "request": {
                    "useSsl": True
                }
            },
            **data
        }

        json_payload = json.dumps(request_data, separators=(",", ":"), ensure_ascii=False)

        return await self._make_request(
            "POST", url,
            content=json_payload,
            headers=self.default_headers,
            **kwargs
        )
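
    # For reference (an illustrative sketch, not from the package source): for the
    # "search" endpoint, the JSON body POSTed to /youtubei/v1/search comes out as
    #     {"context": {"client": {"clientName": "WEB",
    #                             "clientVersion": "2.20240229.01.00",
    #                             "visitorData": "..."},
    #                  "user": {"lockedSafetyMode": false},
    #                  "request": {"useSsl": true}},
    #      "query": "<search terms>"}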

    def _find_video_renderers(self, data: Any) -> List[Dict]:
        """
        Recursively find all videoRenderer objects in the YouTube API response

        Args:
            data: YouTube API response data (can be a dict, list, or any type)

        Returns:
            List of videoRenderer data
        """
        video_renderers = []

        def recursive_search(obj):
            if isinstance(obj, dict):
                # Check if this dict contains a videoRenderer
                if "videoRenderer" in obj:
                    video_data = obj["videoRenderer"]
                    if video_data and isinstance(video_data, dict):
                        video_renderers.append(video_data)

                # Recursively search all values in the dict
                for value in obj.values():
                    recursive_search(value)

            elif isinstance(obj, list):
                # Recursively search all items in the list
                for item in obj:
                    recursive_search(item)

        recursive_search(data)
        return video_renderers
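
    # For illustration: given {"contents": [{"videoRenderer": {"videoId": "abc"}}]},
    # _find_video_renderers returns [{"videoId": "abc"}].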

    async def search_videos(
        self,
        query: str,
        max_results: int = 20,
        continuation_token: Optional[str] = None,
        sleep_time: float = 0.1
    ) -> List[Dict]:
        """
        Search YouTube videos with pagination support

        Args:
            query: Search query
            max_results: Maximum number of results to fetch (0 for all available)
            continuation_token: Token for pagination
            sleep_time: Sleep time between requests

        Returns:
            List of simplified video information
        """
        try:
            videos = []
            continuations = []

            if continuation_token:
                # Use provided continuation token
                continuations.append(continuation_token)
            else:
                # Initial search request
                data = {"query": query}
                response = await self._make_api_request("search", data)

                # Extract videos from initial response
                video_renderers = self._find_video_renderers(response)
                for video_data in video_renderers:
                    if max_results > 0 and len(videos) >= max_results:
                        break
                    video_info = self._extract_video_info(video_data)
                    if video_info:
                        videos.append(video_info)

                # Extract continuation tokens for more results
                continuation_tokens = self._extract_continuation_tokens(response)
                continuations.extend(continuation_tokens)

            # Process continuation tokens for more videos
            while continuations and (max_results == 0 or len(videos) < max_results):
                current_continuation = continuations.pop(0)

                # Make API request with continuation token
                data = {"continuation": current_continuation}
                response = await self._make_api_request("search", data)

                if not response:
                    break

                # Extract videos from continuation response
                video_renderers = self._find_video_renderers(response)
                batch_videos = []

                for video_data in video_renderers:
                    if max_results > 0 and len(videos) + len(batch_videos) >= max_results:
                        break
                    video_info = self._extract_video_info(video_data)
                    if video_info:
                        batch_videos.append(video_info)

                videos.extend(batch_videos)

                # Look for more continuation tokens
                continuation_tokens = self._extract_continuation_tokens(response)
                for token in continuation_tokens:
                    if token not in continuations:
                        continuations.append(token)

                logger.info(f"Fetched {len(batch_videos)} videos, total: {len(videos)}")

                # Sleep between requests to avoid rate limiting
                if continuations and sleep_time > 0:
                    await asyncio.sleep(sleep_time)

            return videos[:max_results] if max_results > 0 else videos

        except Exception as e:
            logger.error(f"Failed to search videos: {e}")
            return []
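
    # Typical call (illustrative sketch):
    #     videos = await client.search_videos("lofi hip hop", max_results=10)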

    def _extract_video_info(self, video_data: Dict) -> Optional[Dict]:
        """Extract simplified video information from YouTube video data"""
        try:
            video_id = video_data.get("videoId")
            if not video_id:
                return None

            title = video_data.get("title", {}).get("runs", [{}])[0].get("text", "")
            if not title and "accessibility" in video_data.get("title", {}):
                title = video_data["title"]["accessibility"]["accessibilityData"]["label"]

            # Extract view count
            view_count_runs = video_data.get("viewCountText", {}).get("simpleText", "")
            if not view_count_runs:
                view_count_runs = video_data.get("shortViewCountText", {}).get("simpleText", "")
            view_count = format_view_count(view_count_runs)

            # Extract duration
            duration_text = video_data.get("lengthText", {}).get("simpleText", "")
            duration_seconds = 0
            if duration_text:
                # Convert MM:SS or HH:MM:SS to seconds
                time_parts = duration_text.split(":")
                if len(time_parts) == 2:  # MM:SS
                    duration_seconds = int(time_parts[0]) * 60 + int(time_parts[1])
                elif len(time_parts) == 3:  # HH:MM:SS
                    duration_seconds = int(time_parts[0]) * 3600 + int(time_parts[1]) * 60 + int(time_parts[2])

            # Extract channel info
            channel_data = video_data.get("longBylineText", {}).get("runs", [{}])[0]
            channel_name = channel_data.get("text", "")
            channel_url = channel_data.get("navigationEndpoint", {}).get("commandMetadata", {}).get(
                "webCommandMetadata", {}).get("url", "")
            channel_id = extract_channel_id_from_url(channel_url) if channel_url else ""

            # Extract thumbnail
            thumbnails = video_data.get("thumbnail", {}).get("thumbnails", [])
            thumbnail_url = extract_thumbnail_url(thumbnails)

            # Extract published time
            published_time_text = video_data.get("publishedTimeText", {}).get("simpleText", "")

            description = ''
            if 'descriptionSnippet' in video_data:
                for desc in video_data.get('descriptionSnippet', {}).get('runs', []):
                    description += desc.get('text', '')

            return {
                "video_id": video_id,
                "title": process_youtube_text(title),
                "description": description,
                "duration": duration_seconds,
                "view_count": view_count,
                "like_count": -1,  # Not available in search results
                "comment_count": -1,  # Not available in search results
                "published_time": published_time_text,
                "thumbnail_url": thumbnail_url,
                "video_url": f"https://www.youtube.com/watch?v={video_id}",
                "channel_id": channel_id,
                "channel_name": channel_name,
                "channel_url": f"https://www.youtube.com{channel_url}" if channel_url else "",
            }

        except Exception as e:
            logger.error(f"Failed to extract video info: {e}")
            return None

    async def get_video_details(self, video_id: str) -> Optional[Dict]:
        """
        Get detailed video information

        Args:
            video_id: YouTube video ID

        Returns:
            Detailed video information
        """
        try:
            # Use the player API to get video details
            data = {"videoId": video_id}

            response = await self._make_api_request("player", data)

            video_details = response.get("videoDetails", {})
            if not video_details:
                return None

            # Extract basic video information
            title = video_details.get("title", "")
            description = video_details.get("shortDescription", "")
            duration = int(video_details.get("lengthSeconds", 0))
            view_count = int(video_details.get("viewCount", 0))

            # Extract channel information
            channel_id = video_details.get("channelId", "")
            channel_name = video_details.get("author", "")

            # Extract thumbnail
            thumbnails = video_details.get("thumbnail", {}).get("thumbnails", [])
            thumbnail_url = extract_thumbnail_url(thumbnails)

            return {
                "video_id": video_id,
                "title": process_youtube_text(title),
                "description": process_youtube_text(description),
                "duration": duration,
                "view_count": view_count,
                "like_count": 0,  # Would need additional API call
                "comment_count": 0,  # Would need additional API call
                "published_time": "",  # Not available in player API
                "thumbnail_url": thumbnail_url,
                "video_url": f"https://www.youtube.com/watch?v={video_id}",
                "channel_id": channel_id,
                "channel_name": channel_name,
                "channel_url": f"https://www.youtube.com/channel/{channel_id}" if channel_id else "",
                "keywords": video_details.get("keywords", []),
                "category": video_details.get("category", ""),
                "is_live": video_details.get("isLiveContent", False),
            }

        except Exception as e:
            logger.error(f"Failed to get video details for {video_id}: {e}")
            return None

    async def get_video_comments(
        self,
        video_id: str,
        max_comments: int = 200,
        continuation_token: Optional[str] = None,
        sort_by: int = 0,  # 0 = popular, 1 = recent
        sleep_time: float = 0.1
    ) -> List[Dict]:
        """
        Get comments for a YouTube video with full pagination support

        Args:
            video_id: YouTube video ID
            max_comments: Maximum number of comments to fetch (0 for all)
            continuation_token: Token for pagination
            sort_by: Comment sorting (0=popular, 1=recent)
            sleep_time: Sleep time between requests

        Returns:
            List of simplified comment information
        """
        try:
            comments = []
            continuations = []

            if continuation_token:
                # Use provided continuation token
                continuations.append(continuation_token)
            else:
                # Initial request - fetch the video page first to get the comments section
                video_url = f"https://www.youtube.com/watch?v={video_id}"
                response = await self._make_request("GET", video_url, headers=self.default_headers, raw_response=True)
                html_content = response.text

                # Extract initial data from the page
                initial_data = self._extract_initial_data_from_html(html_content)
                if not initial_data:
                    logger.error("Failed to extract initial data from video page")
                    return []

                # Find comments section
                continuation_endpoint = self._find_comments_continuation(initial_data, sort_by)
                if not continuation_endpoint:
                    logger.warning(f"No comments found for video {video_id}")
                    return []

                continuations.append(continuation_endpoint)

            # Process all continuation tokens
            while continuations:
                if max_comments > 0 and len(comments) >= max_comments:
                    break

                current_continuation = continuations.pop(0)
                # Make API request for comments
                data = {"continuation": current_continuation}
                response = await self._make_api_request("next", data)

                if not response:
                    break

                # Check for errors
                error_messages = self._search_dict_recursive(response, 'externalErrorMessage')
                if error_messages:
                    logger.error(f"YouTube API error: {error_messages[0]}")
                    break

                # Process response actions to find more comments and continuations
                actions = []
                actions.extend(self._search_dict_recursive(response, 'reloadContinuationItemsCommand'))
                actions.extend(self._search_dict_recursive(response, 'appendContinuationItemsAction'))

                # Process each action to extract comments and find new continuations
                for action in actions:
                    target_id = action.get('targetId', '')
                    continuation_items = action.get('continuationItems', [])

                    # Process continuations for comments and replies
                    if target_id in ['comments-section', 'engagement-panel-comments-section',
                                     'shorts-engagement-panel-comments-section']:
                        for item in continuation_items:
                            # Look for continuation endpoints for more comments
                            continuation_endpoints = self._search_dict_recursive(item, 'continuationEndpoint')
                            for endpoint in continuation_endpoints:
                                if 'continuationCommand' in endpoint:
                                    token = endpoint['continuationCommand']['token']
                                    if token not in continuations:
                                        continuations.insert(0, token)  # Insert at beginning for breadth-first

                    # Process 'Show more replies' buttons
                    elif target_id.startswith('comment-replies-item'):
                        for item in continuation_items:
                            if 'continuationItemRenderer' in item:
                                button_renderers = self._search_dict_recursive(item, 'buttonRenderer')
                                for button in button_renderers:
                                    command = button.get('command', {})
                                    if 'continuationCommand' in command:
                                        token = command['continuationCommand']['token']
                                        if token not in continuations:
                                            continuations.append(token)

                # Extract comment entity payloads for the new comment format
                comment_entities = {}
                for payload in self._search_dict_recursive(response, 'commentEntityPayload'):
                    if 'properties' in payload and 'commentId' in payload['properties']:
                        comment_id = payload['properties']['commentId']
                        comment_entities[comment_id] = payload

                # Extract toolbar states
                toolbar_states = {}
                for payload in self._search_dict_recursive(response, 'engagementToolbarStateEntityPayload'):
                    if 'key' in payload:
                        toolbar_states[payload['key']] = payload

                # Process comment entities and extract comment information
                batch_comments = []
                for comment_id in comment_entities:
                    if max_comments > 0 and len(comments) + len(batch_comments) >= max_comments:
                        break

                    entity = comment_entities[comment_id]
                    comment_info = self._extract_comment_from_entity(entity, toolbar_states)
                    if comment_info:
                        batch_comments.append(comment_info)

                # Reverse to maintain chronological order (YouTube returns in reverse)
                batch_comments.reverse()
                comments.extend(batch_comments)

                logger.info(f"Fetched {len(batch_comments)} comments, total: {len(comments)}")

                # Sleep between requests to avoid rate limiting
                if continuations and sleep_time > 0:
                    await asyncio.sleep(sleep_time)

            return comments[:max_comments] if max_comments > 0 else comments

        except Exception as e:
            logger.error(f"Failed to get comments for video {video_id}: {e}")
            return []
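
    # Typical call (illustrative sketch; the video ID is just an example):
    #     comments = await client.get_video_comments("dQw4w9WgXcQ", max_comments=50, sort_by=1)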

    def _extract_initial_data_from_html(self, html_content: str) -> Optional[Dict]:
        """Extract ytInitialData from HTML content"""
        try:
            # Pattern for ytInitialData
            pattern = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;\s*(?:var\s+meta|</script|\n)'
            match = re.search(pattern, html_content)
            if match:
                return json.loads(match.group(1))
            return None
        except Exception as e:
            logger.error(f"Failed to extract initial data: {e}")
            return None

    def _find_comments_continuation(self, initial_data: Dict, sort_by: int = 1) -> Optional[str]:
        """Find the comments section continuation token"""
        try:
            # Look for itemSectionRenderer in the data
            for item_section in self._search_dict_recursive(initial_data, 'itemSectionRenderer'):
                for continuation_renderer in self._search_dict_recursive(item_section, 'continuationItemRenderer'):
                    continuation_endpoint = continuation_renderer.get('continuationEndpoint', {})
                    if continuation_endpoint:
                        # Check if we need to handle the sort menu
                        sort_menu = None
                        for sort_filter in self._search_dict_recursive(initial_data, 'sortFilterSubMenuRenderer'):
                            sort_menu = sort_filter.get('subMenuItems', [])
                            break

                        if sort_menu and sort_by < len(sort_menu):
                            # Use the specified sort option
                            sort_endpoint = sort_menu[sort_by].get('serviceEndpoint', {})
                            if 'continuationCommand' in sort_endpoint:
                                return sort_endpoint['continuationCommand']['token']

                        # Fall back to the default continuation
                        if 'continuationCommand' in continuation_endpoint:
                            return continuation_endpoint['continuationCommand']['token']

            return None
        except Exception as e:
            logger.error(f"Failed to find comments continuation: {e}")
            return None

    def _search_dict_recursive(self, data: Any, search_key: str) -> List[Any]:
        """Recursively search for a key in a nested dict/list structure"""
        results = []
        stack = [data]

        while stack:
            current = stack.pop()
            if isinstance(current, dict):
                for key, value in current.items():
                    if key == search_key:
                        results.append(value)
                    else:
                        stack.append(value)
            elif isinstance(current, list):
                stack.extend(current)

        return results
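
    # For illustration: _search_dict_recursive({'a': {'x': 1}, 'b': [{'x': 2}]}, 'x')
    # collects both 1 and 2 (in stack order, so not necessarily document order).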

    def _extract_comment_from_entity(self, entity: Dict, toolbar_states: Dict) -> Optional[Dict]:
        """Extract comment info from the commentEntityPayload format"""
        try:
            properties = entity.get('properties', {})
            author = entity.get('author', {})
            toolbar = entity.get('toolbar', {})

            comment_id = properties.get('commentId', '')
            content = properties.get('content', {}).get('content', '')
            published_time = properties.get('publishedTime', '')

            # Author info
            author_name = author.get('displayName', '')
            author_channel_id = author.get('channelId', '')
            author_avatar = author.get('avatarThumbnailUrl', '')

            # Engagement info
            like_count_text = toolbar.get('likeCountNotliked', '0').strip() or "0"
            like_count = self._parse_count_string(like_count_text)
            reply_count = toolbar.get('replyCount', 0)

            # Check if the comment is hearted
            toolbar_state_key = properties.get('toolbarStateKey', '')
            is_hearted = False
            if toolbar_state_key in toolbar_states:
                heart_state = toolbar_states[toolbar_state_key].get('heartState', '')
                is_hearted = heart_state == 'TOOLBAR_HEART_STATE_HEARTED'

            # Check if it's a reply (the comment ID contains '.')
            is_reply = '.' in comment_id

            return {
                "comment_id": comment_id,
                "content": process_youtube_text(content),
                "author_name": author_name,
                "author_channel_id": author_channel_id,
                "author_avatar": author_avatar,
                "like_count": like_count,
                "reply_count": reply_count,
                "published_time": published_time,
                "is_hearted": is_hearted,
                "is_reply": is_reply,
                "time_parsed": self._parse_time_string(published_time)
            }

        except Exception as e:
            logger.error(f"Failed to extract comment from entity: {e}")
            return None

    def _parse_count_string(self, count_str: str) -> int:
        """Parse YouTube count strings like '1.2K', '500', etc."""
        try:
            if not count_str or count_str == '0':
                return 0

            count_str = count_str.strip().upper()

            # Handle K, M, B suffixes
            multipliers = {'K': 1000, 'M': 1000000, 'B': 1000000000}

            for suffix, multiplier in multipliers.items():
                if count_str.endswith(suffix):
                    number_part = count_str[:-1]
                    return int(float(number_part) * multiplier)

            # Handle comma-separated numbers
            count_str = count_str.replace(',', '')
            return int(count_str)

        except (ValueError, AttributeError):
            return 0
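
    # Examples, traced through the code above:
    #     _parse_count_string("1.2K")  -> 1200
    #     _parse_count_string("3,405") -> 3405
    #     _parse_count_string("2M")    -> 2000000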

    def _parse_time_string(self, time_str: str) -> Optional[float]:
        """Parse time string and return timestamp"""
        try:
            if not time_str:
                return None

            # Remove any parenthetical content
            clean_time = time_str.split('(')[0].strip()

            # Try to parse with dateparser if available
            try:
                import dateparser
                parsed = dateparser.parse(clean_time)
                if parsed:
                    return parsed.timestamp()
            except ImportError:
                pass

            return None
        except Exception:
            return None

    async def search_all_videos(
        self,
        query: str,
        sleep_time: float = 0.1
    ) -> List[Dict]:
        """
        Search for all available YouTube videos for a query (no limit)

        Args:
            query: Search query
            sleep_time: Sleep time between requests

        Returns:
            List of all available video information
        """
        return await self.search_videos(
            query=query,
            max_results=0,  # 0 means no limit
            sleep_time=sleep_time
        )

    async def get_all_video_comments(
        self,
        video_id: str,
        sort_by: int = 1,  # 0 = popular, 1 = recent
        sleep_time: float = 0.1
    ) -> List[Dict]:
        """
        Get all comments for a YouTube video (no limit)

        Args:
            video_id: YouTube video ID
            sort_by: Comment sorting (0=popular, 1=recent)
            sleep_time: Sleep time between requests

        Returns:
            List of all comments for the video
        """
        return await self.get_video_comments(
            video_id=video_id,
            max_comments=0,  # 0 means no limit
            sort_by=sort_by,
            sleep_time=sleep_time
        )

    def _extract_continuation_tokens(self, data: Any) -> List[str]:
        """
        Extract all continuation tokens from a YouTube API response

        Args:
            data: YouTube API response data

        Returns:
            List of continuation tokens
        """
        tokens = []

        # Search for continuation endpoints
        continuation_endpoints = self._search_dict_recursive(data, 'continuationEndpoint')
        for endpoint in continuation_endpoints:
            if 'continuationCommand' in endpoint:
                token = endpoint['continuationCommand']['token']
                if token and token not in tokens:
                    tokens.append(token)

        # Search for continuation commands
        continuation_commands = self._search_dict_recursive(data, 'continuationCommand')
        for command in continuation_commands:
            token = command.get('token')
            if token and token not in tokens:
                tokens.append(token)

        return tokens

    def _extract_comment_info(self, comment_data: Dict) -> Optional[Dict]:
        """Extract simplified comment information from traditional YouTube comment data"""
        try:
            # Extract comment ID
            comment_id = comment_data.get("commentId", "")

            # Extract comment text
            content_text = comment_data.get("contentText", {})
            text = ""
            if "runs" in content_text:
                text = "".join([run.get("text", "") for run in content_text["runs"]])
            elif "simpleText" in content_text:
                text = content_text["simpleText"]

            # Extract author information
            author_text = comment_data.get("authorText", {}).get("simpleText", "")
            author_thumbnail = comment_data.get("authorThumbnail", {}).get("thumbnails", [])
            author_avatar = extract_thumbnail_url(author_thumbnail)

            # Extract author channel ID if available
            author_endpoint = comment_data.get("authorEndpoint", {}).get("commandMetadata", {}).get(
                "webCommandMetadata", {})
            author_url = author_endpoint.get("url", "")
            author_channel_id = extract_channel_id_from_url(author_url) if author_url else ""

            # Extract like count
            like_count_text = comment_data.get("voteCount", {}).get("simpleText", "0")
            like_count = self._parse_count_string(like_count_text)

            # Extract published time
            published_time_data = comment_data.get("publishedTimeText", {})
            published_time = ""
            if "runs" in published_time_data:
                published_time = published_time_data["runs"][0].get("text", "")
            elif "simpleText" in published_time_data:
                published_time = published_time_data["simpleText"]

            # Extract reply count
            reply_count = 0
            reply_text = comment_data.get("replyCount", 0)
            if isinstance(reply_text, dict):
                reply_text = reply_text.get("simpleText", "0")
            if isinstance(reply_text, str):
                reply_count = self._parse_count_string(reply_text)
            elif isinstance(reply_text, int):
                reply_count = reply_text

            # Check if the comment is hearted by the creator
            is_hearted = False
            if "actionButtons" in comment_data:
                buttons = comment_data["actionButtons"].get("commentActionButtonsRenderer", {})
                heart_button = buttons.get("creatorHeart", {})
                is_hearted = bool(heart_button.get("creatorHeartRenderer", {}))

            # Check if it's a reply (the comment ID contains '.')
            is_reply = '.' in comment_id

            return {
                "comment_id": comment_id,
                "content": process_youtube_text(text),
                "author_name": author_text,
                "author_channel_id": author_channel_id,
                "author_avatar": author_avatar,
                "like_count": like_count,
                "reply_count": reply_count,
                "published_time": published_time,
                "is_hearted": is_hearted,
                "is_reply": is_reply,
                "time_parsed": self._parse_time_string(published_time)
            }

        except Exception as e:
            logger.error(f"Failed to extract comment info: {e}")
            return None

    async def get_channel_info(self, channel_id: str) -> Optional[Dict]:
        """
        Get YouTube channel information

        Args:
            channel_id: YouTube channel handle (used to build the @handle URL)

        Returns:
            Simplified channel information
        """
        try:
            # Fetch the channel page to get information
            channel_url = f"https://www.youtube.com/@{channel_id}"
            response = await self._make_request(
                "GET", channel_url, headers=self.default_headers, raw_response=True
            )
            html_content = response.text
            initial_data = extract_initial_data(html_content)

            if not initial_data:
                return None

            # Extract channel information from the initial data
            metadata = initial_data.get("metadata", {}).get("channelMetadataRenderer", {})
            header = initial_data.get("header", {})
            # Try different header types
            channel_header = (header.get("c4TabbedHeaderRenderer") or
                              header.get("pageHeaderRenderer") or
                              header.get("interactiveTabbedHeaderRenderer") or {})

            title = metadata.get("title", "") or channel_header.get("title", "")
            description = metadata.get("description", "")

            # Extract subscriber count and video count from pageHeaderRenderer if available.
            # Note: these remain localized display strings once extracted, not parsed integers.
            subscriber_count = 0
            video_count = 0

            if "pageHeaderRenderer" in header:
                page_header = header["pageHeaderRenderer"]
                metadata_rows = page_header.get("content", {}).get("pageHeaderViewModel", {}).get("metadata", {}).get(
                    "contentMetadataViewModel", {}).get("metadataRows", [])

                if len(metadata_rows) > 1:
                    # The second row contains the subscriber and video counts
                    metadata_parts = metadata_rows[1].get("metadataParts", [])
                    if len(metadata_parts) > 0:
                        # Subscriber count (e.g., "21.2万位订阅者", i.e. "212K subscribers")
                        subscriber_text = metadata_parts[0].get("text", {}).get("content", "")
                        subscriber_count = subscriber_text.replace("位订阅者", "").replace("订阅者", "").replace(
                            "subscribers", "").strip()

                    if len(metadata_parts) > 1:
                        # Video count (e.g., "67 个视频", i.e. "67 videos")
                        video_text = metadata_parts[1].get("text", {}).get("content", "")
                        video_count = video_text.replace("个视频", "").replace("视频", "").replace("videos", "").strip()

            # Extract avatar
            avatar_thumbnails = channel_header.get("avatar", {}).get("thumbnails", [])
            avatar_url = extract_thumbnail_url(avatar_thumbnails)

            # Extract banner
            banner_thumbnails = channel_header.get("banner", {}).get("thumbnails", [])
            banner_url = extract_thumbnail_url(banner_thumbnails)

            return {
                "channel_id": channel_id,
                "title": process_youtube_text(title),
                "description": process_youtube_text(description),
                "subscriber_count": subscriber_count,
                "video_count": video_count,
                "avatar_url": avatar_url,
                "banner_url": banner_url,
                "channel_url": channel_url,
                "verified": False,  # Would need additional processing
            }

        except Exception as e:
            logger.error(f"Failed to get channel info for {channel_id}: {e}")
            return None

    async def get_channel_videos(
        self,
        channel_id: str,
        max_videos: int = 20,
        continuation_token: Optional[str] = None,
        sleep_time: float = 0.1
    ) -> List[Dict]:
        """
        Get videos from a YouTube channel with pagination support

        Args:
            channel_id: YouTube channel handle or custom name (the URL is built as @{channel_id}/videos)
            max_videos: Maximum number of videos to fetch (0 for all available)
            continuation_token: Token for pagination
            sleep_time: Sleep time between requests

        Returns:
            List of simplified video information
        """
        try:
            videos = []
            continuations = []

            if continuation_token:
                # Use provided continuation token
                continuations.append(continuation_token)
            else:
                # Initial request to get the videos page and extract initial data
                videos_url = f"https://www.youtube.com/@{channel_id}/videos"
                response = await self._make_request(
                    "GET", videos_url, headers=self.default_headers, raw_response=True
                )

                html_content = response.text
                initial_data = extract_initial_data(html_content)
                if not initial_data:
                    logger.error("Failed to extract initial data from videos page")
                    return []

                # Find video renderers in the initial page data
                video_renderers = self._find_video_renderers(initial_data)

                for video_data in video_renderers:
                    if max_videos > 0 and len(videos) >= max_videos:
                        break
                    video_info = self._extract_video_info(video_data)
                    if video_info:
                        video_info['channel_id'] = channel_id
                        videos.append(video_info)

                # Extract continuation tokens for more results
                continuation_tokens = self._extract_continuation_tokens(initial_data)
                continuations.extend(continuation_tokens)

                logger.info(
                    f"Initial page: extracted {len(videos)} videos, found {len(continuations)} continuation tokens")

            # Process continuation tokens for more videos
            while continuations and (max_videos == 0 or len(videos) < max_videos):
                current_continuation = continuations.pop(0)

                # Make API request with continuation token
                data = {"continuation": current_continuation}
                response = await self._make_api_request("browse", data)

                if not response:
                    break

                # Extract videos from continuation response
                video_renderers = self._find_video_renderers(response)
                batch_videos = []

                for video_data in video_renderers:
                    if max_videos > 0 and len(videos) + len(batch_videos) >= max_videos:
                        break
                    video_info = self._extract_video_info(video_data)
                    if video_info:
                        video_info['channel_id'] = channel_id
                        batch_videos.append(video_info)

                videos.extend(batch_videos)

                # Look for more continuation tokens
                continuation_tokens = self._extract_continuation_tokens(response)
                for token in continuation_tokens:
                    if token not in continuations:
                        continuations.append(token)

                logger.info(f"Continuation batch: fetched {len(batch_videos)} videos, total: {len(videos)}")

                # Sleep between requests to avoid rate limiting
                if continuations and sleep_time > 0:
                    await asyncio.sleep(sleep_time)

            return videos[:max_videos] if max_videos > 0 else videos

        except Exception as e:
            logger.error(f"Failed to get channel videos for {channel_id}: {e}")
            return []

    async def get_trending_videos(self) -> List[Dict]:
        """
        Get trending YouTube videos

        Returns:
            List of simplified trending video information
        """
        try:
            data = {"browseId": "FEtrending"}

            response = await self._make_api_request("browse", data)

            videos = []

            # Navigate to the trending video list
            contents = response.get("contents", {}).get("twoColumnBrowseResultsRenderer", {}).get("tabs", [])
            for tab in contents:
                tab_content = tab.get("tabRenderer", {}).get("content", {})
                sections = tab_content.get("sectionListRenderer", {}).get("contents", [])

                for section in sections:
                    items_up = section.get("itemSectionRenderer", {}).get("contents", [])
                    for item_up in items_up:
                        items = item_up.get('shelfRenderer', {}).get(
                            'content', {}).get('expandedShelfContentsRenderer', {}).get('items', [])
                        for item in items:
                            # Check for different video renderer types
                            video_data = (item.get("videoRenderer") or
                                          item.get("compactVideoRenderer") or
                                          item.get("gridVideoRenderer"))
                            if video_data:
                                video_info = self._extract_video_info(video_data)
                                if video_info:
                                    videos.append(video_info)

            return videos

        except Exception as e:
            logger.error(f"Failed to get trending videos: {e}")
            return []

    async def close(self):
        if self.browser_session and self.target_id:
            try:
                logger.info(f"Close target id: {self.target_id}")
                await self.browser_session.cdp_client.send.Target.closeTarget(params={'targetId': self.target_id})
            except Exception as e:
                logger.warning(f"Error closing target {self.target_id}: {e}")
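
For orientation, a minimal usage sketch (an editor's illustration, not shipped in the package; the AgentBrowserSession constructor arguments and the import path for YouTubeApiClient are assumptions):

import asyncio

from vibe_surf.browser.agent_browser_session import AgentBrowserSession
# Hypothetical import path for the new module shown above:
from vibe_surf.tools.youtube_client import YouTubeApiClient


async def main():
    browser_session = AgentBrowserSession()  # constructor arguments are an assumption
    client = YouTubeApiClient(browser_session, timeout=60)
    try:
        await client.setup()  # opens youtube.com, pulls cookies and the InnerTube API key
        videos = await client.search_videos("lofi hip hop", max_results=5)
        for video in videos:
            print(video["title"], video["video_url"])
        if videos:
            comments = await client.get_video_comments(videos[0]["video_id"], max_comments=20)
            print(f"{len(comments)} comments on the first result")
    finally:
        await client.close()


asyncio.run(main())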