vibesurf 0.1.26__py3-none-any.whl → 0.1.28__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
Potentially problematic release: this version of vibesurf might be problematic.
- vibe_surf/_version.py +2 -2
- vibe_surf/agents/vibe_surf_agent.py +4 -5
- vibe_surf/browser/agent_browser_session.py +26 -0
- vibe_surf/tools/browser_use_tools.py +168 -1
- vibe_surf/tools/vibesurf_tools.py +425 -3
- vibe_surf/tools/views.py +75 -0
- vibe_surf/tools/website_api/__init__.py +0 -0
- vibe_surf/tools/website_api/douyin/__init__.py +0 -0
- vibe_surf/tools/website_api/douyin/client.py +845 -0
- vibe_surf/tools/website_api/douyin/helpers.py +239 -0
- vibe_surf/tools/website_api/weibo/__init__.py +0 -0
- vibe_surf/tools/website_api/weibo/client.py +846 -0
- vibe_surf/tools/website_api/weibo/helpers.py +997 -0
- vibe_surf/tools/website_api/xhs/__init__.py +0 -0
- vibe_surf/tools/website_api/xhs/client.py +807 -0
- vibe_surf/tools/website_api/xhs/helpers.py +301 -0
- vibe_surf/tools/website_api/youtube/__init__.py +32 -0
- vibe_surf/tools/website_api/youtube/client.py +1179 -0
- vibe_surf/tools/website_api/youtube/helpers.py +420 -0
- {vibesurf-0.1.26.dist-info → vibesurf-0.1.28.dist-info}/METADATA +26 -5
- {vibesurf-0.1.26.dist-info → vibesurf-0.1.28.dist-info}/RECORD +25 -12
- vibesurf-0.1.28.dist-info/licenses/LICENSE +22 -0
- vibesurf-0.1.26.dist-info/licenses/LICENSE +0 -201
- {vibesurf-0.1.26.dist-info → vibesurf-0.1.28.dist-info}/WHEEL +0 -0
- {vibesurf-0.1.26.dist-info → vibesurf-0.1.28.dist-info}/entry_points.txt +0 -0
- {vibesurf-0.1.26.dist-info → vibesurf-0.1.28.dist-info}/top_level.txt +0 -0
vibe_surf/tools/website_api/youtube/client.py (new file)

@@ -0,0 +1,1179 @@

import asyncio
import json
import pdb
import re
import copy
import time
import urllib.parse
import xml.etree.ElementTree as ET
from typing import Dict, List, Optional, Callable, Union, Any
import httpx
from tenacity import retry, stop_after_attempt, wait_fixed
from urllib.parse import parse_qs, unquote, urlencode

from vibe_surf.browser.agent_browser_session import AgentBrowserSession
from vibe_surf.logger import get_logger

from .helpers import (
    SearchType, SortType, Duration, UploadDate,
    extract_cookies_from_browser, extract_video_id_from_url,
    extract_channel_id_from_url, extract_playlist_id_from_url,
    parse_youtube_duration, format_view_count, parse_youtube_time,
    process_youtube_text, validate_youtube_data, sanitize_filename,
    extract_ytcfg_data, extract_initial_data, get_desktop_user_agent,
    build_search_url, extract_continuation_token, decode_html_entities,
    extract_thumbnail_url, generate_visitor_data,
    YouTubeError, NetworkError, DataExtractionError,
    AuthenticationError, RateLimitError, ContentNotFoundError
)

logger = get_logger(__name__)


class YouTubeApiClient:
    """
    YouTube API client with integrated browser session management.
    This client handles API communication through browser session for authentication.
    """

    def __init__(self, browser_session: AgentBrowserSession, timeout: int = 60, proxy: Optional[str] = None):
        """
        Initialize the YouTube API client

        Args:
            browser_session: Browser session for authentication
            timeout: Request timeout in seconds
            proxy: Proxy URL if needed
        """
        self.browser_session = browser_session
        self.target_id = None
        self.proxy = proxy
        self.timeout = timeout
        self._base_url = "https://www.youtube.com"
        self._api_base = "https://www.youtube.com/youtubei/v1"

        # YouTube API key and client version (these are usually extracted from the page)
        self._api_key = None
        self._client_version = "2.20240229.01.00"
        self._visitor_data = generate_visitor_data()

        # Default headers for YouTube
        self.default_headers = {
            "User-Agent": get_desktop_user_agent(),
            "Origin": "https://www.youtube.com",
            "Referer": "https://www.youtube.com/",
            "Content-Type": "application/json;charset=UTF-8",
        }
        self.cookies = {}

    async def setup(self, target_id: Optional[str] = None):
        """
        Setup YouTube client by navigating to the site and extracting cookies

        Args:
            target_id: Specific browser target ID to use

        Raises:
            AuthenticationError: If setup fails
        """
        try:
            if self.target_id and self.cookies and self._api_key:
                logger.info("YouTube client already setup. Return!")
                return

            if target_id:
                self.target_id = target_id
            else:
                # Navigate to YouTube home page
                self.target_id = await self.browser_session.navigate_to_url(
                    "https://www.youtube.com/", new_tab=True
                )
                await asyncio.sleep(3)  # Wait for page load

            # Extract cookies from browser
            cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=self.target_id)
            result = await asyncio.wait_for(
                cdp_session.cdp_client.send.Storage.getCookies(session_id=cdp_session.session_id),
                timeout=8.0
            )
            web_cookies = result.get('cookies', [])

            cookie_str, cookie_dict = extract_cookies_from_browser(web_cookies)
            self.default_headers["Cookie"] = cookie_str
            self.cookies = cookie_dict

            # Get user agent from browser
            user_agent_result = await cdp_session.cdp_client.send.Runtime.evaluate(
                params={
                    'expression': "navigator.userAgent",
                    'returnByValue': True,
                    'awaitPromise': True
                },
                session_id=cdp_session.session_id,
            )
            user_agent = user_agent_result.get('result', {}).get('value')
            if user_agent:
                self.default_headers["User-Agent"] = user_agent

            # Extract API key and configuration from page
            await self._extract_api_config()

            logger.info("YouTube client setup completed successfully")

        except Exception as e:
            logger.error(f"Failed to setup YouTube client: {e}")
            raise AuthenticationError(f"Setup failed: {e}")

    async def _extract_api_config(self):
        """Extract API key and configuration from YouTube page"""
        try:
            cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=self.target_id)

            # Get page content to extract API key
            content_result = await cdp_session.cdp_client.send.Runtime.evaluate(
                params={
                    'expression': "document.documentElement.outerHTML",
                    'returnByValue': True,
                },
                session_id=cdp_session.session_id,
            )

            html_content = content_result.get('result', {}).get('value', '')

            # Extract API key from page
            api_key_match = re.search(r'"INNERTUBE_API_KEY":"([^"]+)"', html_content)
            if api_key_match:
                self._api_key = api_key_match.group(1)
                logger.info(f"Extracted YouTube API key: {self._api_key[:10]}...")

            # Extract client version
            version_match = re.search(r'"clientVersion":"([^"]+)"', html_content)
            if version_match:
                self._client_version = version_match.group(1)
                self.default_headers["X-YouTube-Client-Version"] = self._client_version

            # Extract visitor data if available
            visitor_match = re.search(r'"visitorData":"([^"]+)"', html_content)
            if visitor_match:
                self._visitor_data = visitor_match.group(1)

        except Exception as e:
            logger.warning(f"Failed to extract YouTube API config: {e}")
            # Use default values if extraction fails

    async def pong(self) -> bool:
        """Check if the client is working by making a simple request"""
        try:
            logger.info("Testing YouTube client status...")

            # Try to make a simple search request
            test_response = await self.search_videos("test", max_results=1)

            if test_response and len(test_response) >= 0:
                logger.info("YouTube client status: Valid")
                return True

            logger.warning("YouTube client status: Invalid response")
            return False

        except Exception as e:
            logger.error(f"Failed to check YouTube client status: {e}")
            return False

    @retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
    async def _make_request(self, method: str, url: str, **kwargs):
        """
        Make HTTP request with error handling and retry logic

        Args:
            method: HTTP method
            url: Request URL
            **kwargs: Additional request parameters

        Returns:
            Response data
        """
        raw_response = kwargs.pop("raw_response", False)

        async with httpx.AsyncClient(proxy=self.proxy, timeout=self.timeout) as client:
            response = await client.request(method, url, **kwargs)

            # Handle common error status codes
            if response.status_code == 403:
                raise AuthenticationError("Access forbidden - may need login or verification")
            elif response.status_code == 429:
                raise RateLimitError("Rate limit exceeded")
            elif response.status_code == 404:
                raise ContentNotFoundError("Content not found")
            elif response.status_code >= 500:
                raise NetworkError(f"Server error: {response.status_code}")

            if raw_response:
                return response

            try:
                if 'application/json' in response.headers.get('content-type', ''):
                    return response.json()
                else:
                    return response.text

            except json.JSONDecodeError:
                return response.text

    async def _make_api_request(self, endpoint: str, data: Dict, **kwargs) -> Dict:
        """Make YouTube Internal API request"""
        if not self._api_key:
            raise AuthenticationError("YouTube API key not available")

        url = f"{self._api_base}/{endpoint}?key={self._api_key}"

        # Add default context to request data
        request_data = {
            "context": {
                "client": {
                    "clientName": "WEB",
                    "clientVersion": self._client_version,
                    "visitorData": self._visitor_data,
                },
                "user": {
                    "lockedSafetyMode": False
                },
                "request": {
                    "useSsl": True
                }
            },
            **data
        }

        json_payload = json.dumps(request_data, separators=(",", ":"), ensure_ascii=False)

        return await self._make_request(
            "POST", url,
            data=json_payload,
            headers=self.default_headers,
            **kwargs
        )

    def _find_video_renderers(self, data: Any) -> List[Dict]:
        """
        Recursively find all videoRenderer objects in the YouTube API response

        Args:
            data: YouTube API response data (can be dict, list, or any type)
            max_results: Maximum number of video renderers to find

        Returns:
            List of videoRenderer data
        """
        video_renderers = []

        def recursive_search(obj, current_count=0):

            if isinstance(obj, dict):
                # Check if this dict contains a videoRenderer
                if "videoRenderer" in obj:
                    video_data = obj["videoRenderer"]
                    if video_data and isinstance(video_data, dict):
                        video_renderers.append(video_data)
                        current_count += 1

                # Recursively search all values in the dict
                for value in obj.values():
                    current_count = recursive_search(value, current_count)

            elif isinstance(obj, list):
                # Recursively search all items in the list
                for item in obj:
                    current_count = recursive_search(item, current_count)

            return current_count

        recursive_search(data)
        return video_renderers

    async def search_videos(
        self,
        query: str,
        max_results: int = 20,
        continuation_token: Optional[str] = None,
        sleep_time: float = 0.1
    ) -> List[Dict]:
        """
        Search YouTube videos with pagination support

        Args:
            query: Search query
            max_results: Maximum number of results to fetch (0 for all available)
            continuation_token: Token for pagination
            sleep_time: Sleep time between requests

        Returns:
            List of simplified video information
        """
        try:
            videos = []
            continuations = []

            if continuation_token:
                # Use provided continuation token
                continuations.append(continuation_token)
            else:
                # Initial search request
                data = {"query": query}
                response = await self._make_api_request("search", data)

                # Extract videos from initial response
                video_renderers = self._find_video_renderers(response)
                for video_data in video_renderers:
                    if max_results > 0 and len(videos) >= max_results:
                        break
                    video_info = self._extract_video_info(video_data)
                    if video_info:
                        videos.append(video_info)

                # Extract continuation tokens for more results
                continuation_tokens = self._extract_continuation_tokens(response)
                continuations.extend(continuation_tokens)

            # Process continuation tokens for more videos
            while continuations and (max_results == 0 or len(videos) < max_results):
                current_continuation = continuations.pop(0)

                # Make API request with continuation token
                data = {"continuation": current_continuation}
                response = await self._make_api_request("search", data)

                if not response:
                    break

                # Extract videos from continuation response
                video_renderers = self._find_video_renderers(response)
                batch_videos = []

                for video_data in video_renderers:
                    if max_results > 0 and len(videos) + len(batch_videos) >= max_results:
                        break
                    video_info = self._extract_video_info(video_data)
                    if video_info:
                        batch_videos.append(video_info)

                videos.extend(batch_videos)

                # Look for more continuation tokens
                continuation_tokens = self._extract_continuation_tokens(response)
                for token in continuation_tokens:
                    if token not in continuations:
                        continuations.append(token)

                logger.info(f"Fetched {len(batch_videos)} videos, total: {len(videos)}")

                # Sleep between requests to avoid rate limiting
                if continuations and sleep_time > 0:
                    await asyncio.sleep(sleep_time)

            return videos[:max_results] if max_results > 0 else videos

        except Exception as e:
            logger.error(f"Failed to search videos: {e}")
            return []

    def _extract_video_info(self, video_data: Dict) -> Optional[Dict]:
        """Extract simplified video information from YouTube video data"""
        try:
            video_id = video_data.get("videoId")
            if not video_id:
                return None

            title = video_data.get("title", {}).get("runs", [{}])[0].get("text", "")
            if not title and "accessibility" in video_data.get("title", {}):
                title = video_data["title"]["accessibility"]["accessibilityData"]["label"]

            # Extract view count
            view_count_text = ""
            view_count_runs = video_data.get("viewCountText", {}).get("simpleText", "")
            if not view_count_runs:
                view_count_runs = video_data.get("shortViewCountText", {}).get("simpleText", "")
            view_count = format_view_count(view_count_runs)

            # Extract duration
            duration_text = video_data.get("lengthText", {}).get("simpleText", "")
            duration_seconds = 0
            if duration_text:
                # Convert MM:SS or HH:MM:SS to seconds
                time_parts = duration_text.split(":")
                if len(time_parts) == 2:  # MM:SS
                    duration_seconds = int(time_parts[0]) * 60 + int(time_parts[1])
                elif len(time_parts) == 3:  # HH:MM:SS
                    duration_seconds = int(time_parts[0]) * 3600 + int(time_parts[1]) * 60 + int(time_parts[2])

            # Extract channel info
            channel_data = video_data.get("longBylineText", {}).get("runs", [{}])[0]
            channel_name = channel_data.get("text", "")
            channel_url = channel_data.get("navigationEndpoint", {}).get("commandMetadata", {}).get(
                "webCommandMetadata", {}).get("url", "")
            channel_id = extract_channel_id_from_url(channel_url) if channel_url else ""

            # Extract thumbnail
            thumbnails = video_data.get("thumbnail", {}).get("thumbnails", [])
            thumbnail_url = extract_thumbnail_url(thumbnails)

            # Extract published time
            published_time_text = video_data.get("publishedTimeText", {}).get("simpleText", "")

            description = ''
            if 'descriptionSnippet' in video_data:
                for desc in video_data.get('descriptionSnippet', {}).get('runs', {}):
                    description += desc.get('text', '')

            return {
                "video_id": video_id,
                "title": process_youtube_text(title),
                "description": description,
                "duration": duration_seconds,
                "view_count": view_count,
                "like_count": -1,  # Not available in search results
                "comment_count": -1,  # Not available in search results
                "published_time": published_time_text,
                "thumbnail_url": thumbnail_url,
                "video_url": f"https://www.youtube.com/watch?v={video_id}",
                "channel_id": channel_id,
                "channel_name": channel_name,
                "channel_url": f"https://www.youtube.com{channel_url}" if channel_url else "",
            }

        except Exception as e:
            logger.error(f"Failed to extract video info: {e}")
            return None

    async def get_video_details(self, video_id: str) -> Optional[Dict]:
        """
        Get detailed video information

        Args:
            video_id: YouTube video ID

        Returns:
            Detailed video information
        """
        try:
            # Use the player API to get video details
            data = {"videoId": video_id}

            response = await self._make_api_request("player", data)

            video_details = response.get("videoDetails", {})
            if not video_details:
                return None

            # Extract basic video information
            title = video_details.get("title", "")
            description = video_details.get("shortDescription", "")
            duration = int(video_details.get("lengthSeconds", 0))
            view_count = int(video_details.get("viewCount", 0))

            # Extract channel information
            channel_id = video_details.get("channelId", "")
            channel_name = video_details.get("author", "")

            # Extract thumbnail
            thumbnails = video_details.get("thumbnail", {}).get("thumbnails", [])
            thumbnail_url = extract_thumbnail_url(thumbnails)

            return {
                "video_id": video_id,
                "title": process_youtube_text(title),
                "description": process_youtube_text(description),
                "duration": duration,
                "view_count": view_count,
                "like_count": 0,  # Would need additional API call
                "comment_count": 0,  # Would need additional API call
                "published_time": "",  # Not available in player API
                "thumbnail_url": thumbnail_url,
                "video_url": f"https://www.youtube.com/watch?v={video_id}",
                "channel_id": channel_id,
                "channel_name": channel_name,
                "channel_url": f"https://www.youtube.com/channel/{channel_id}" if channel_id else "",
                "keywords": video_details.get("keywords", []),
                "category": video_details.get("category", ""),
                "is_live": video_details.get("isLiveContent", False),
            }

        except Exception as e:
            logger.error(f"Failed to get video details for {video_id}: {e}")
            return None

    async def get_video_comments(
        self,
        video_id: str,
        max_comments: int = 200,
        continuation_token: Optional[str] = None,
        sort_by: int = 0,  # 0 = popular, 1 = recent
        sleep_time: float = 0.1
    ) -> List[Dict]:
        """
        Get comments for a YouTube video with full pagination support

        Args:
            video_id: YouTube video ID
            max_comments: Maximum number of comments to fetch (0 for all)
            continuation_token: Token for pagination
            sort_by: Comment sorting (0=popular, 1=recent)
            sleep_time: Sleep time between requests

        Returns:
            List of simplified comment information
        """
        try:
            comments = []
            continuations = []

            if continuation_token:
                # Use provided continuation token
                continuations.append(continuation_token)
            else:
                # Initial request - need to navigate to video page first to get comments section
                video_url = f"https://www.youtube.com/watch?v={video_id}"
                response = await self._make_request("GET", video_url, headers=self.default_headers, raw_response=True)
                html_content = response.text

                # Extract initial data from the page
                initial_data = self._extract_initial_data_from_html(html_content)
                if not initial_data:
                    logger.error("Failed to extract initial data from video page")
                    return []

                # Find comments section
                continuation_endpoint = self._find_comments_continuation(initial_data, sort_by)
                if not continuation_endpoint:
                    logger.warning(f"No comments found for video {video_id}")
                    return []

                continuations.append(continuation_endpoint)

            # Process all continuation tokens
            while continuations:
                if max_comments > 0 and len(comments) >= max_comments:
                    break

                current_continuation = continuations.pop(0)
                # Make API request for comments
                data = {"continuation": current_continuation}
                response = await self._make_api_request("next", data)

                if not response:
                    break

                # Check for errors
                error_messages = self._search_dict_recursive(response, 'externalErrorMessage')
                if error_messages:
                    logger.error(f"YouTube API error: {error_messages[0]}")
                    break

                # Process response actions to find more comments and continuations
                actions = []
                actions.extend(self._search_dict_recursive(response, 'reloadContinuationItemsCommand'))
                actions.extend(self._search_dict_recursive(response, 'appendContinuationItemsAction'))

                # Process each action to extract comments and find new continuations
                for action in actions:
                    target_id = action.get('targetId', '')
                    continuation_items = action.get('continuationItems', [])

                    # Process continuations for comments and replies
                    if target_id in ['comments-section', 'engagement-panel-comments-section',
                                     'shorts-engagement-panel-comments-section']:
                        for item in continuation_items:
                            # Look for continuation endpoints for more comments
                            continuation_endpoints = self._search_dict_recursive(item, 'continuationEndpoint')
                            for endpoint in continuation_endpoints:
                                if 'continuationCommand' in endpoint:
                                    token = endpoint['continuationCommand']['token']
                                    if token not in continuations:
                                        continuations.insert(0, token)  # Insert at beginning for breadth-first

                    # Process 'Show more replies' buttons
                    elif target_id.startswith('comment-replies-item'):
                        for item in continuation_items:
                            if 'continuationItemRenderer' in item:
                                button_renderers = self._search_dict_recursive(item, 'buttonRenderer')
                                for button in button_renderers:
                                    command = button.get('command', {})
                                    if 'continuationCommand' in command:
                                        token = command['continuationCommand']['token']
                                        if token not in continuations:
                                            continuations.append(token)

                # Extract comment entity payloads for new comment format
                comment_entities = {}
                for payload in self._search_dict_recursive(response, 'commentEntityPayload'):
                    if 'properties' in payload and 'commentId' in payload['properties']:
                        comment_id = payload['properties']['commentId']
                        comment_entities[comment_id] = payload

                # Extract toolbar states
                toolbar_states = {}
                for payload in self._search_dict_recursive(response, 'engagementToolbarStateEntityPayload'):
                    if 'key' in payload:
                        toolbar_states[payload['key']] = payload

                # Process comment entities and extract comment information
                batch_comments = []
                for comment_id in comment_entities:
                    if max_comments > 0 and len(comments) + len(batch_comments) >= max_comments:
                        break

                    entity = comment_entities[comment_id]
                    comment_info = self._extract_comment_from_entity(entity, toolbar_states)
                    if comment_info:
                        batch_comments.append(comment_info)

                # Reverse to maintain chronological order (YouTube returns in reverse)
                batch_comments.reverse()
                comments.extend(batch_comments)

                logger.info(f"Fetched {len(batch_comments)} comments, total: {len(comments)}")

                # Sleep between requests to avoid rate limiting
                if continuations and sleep_time > 0:
                    await asyncio.sleep(sleep_time)

            return comments[:max_comments] if max_comments > 0 else comments

        except Exception as e:
            logger.error(f"Failed to get comments for video {video_id}: {e}")
            return []

    def _extract_initial_data_from_html(self, html_content: str) -> Optional[Dict]:
        """Extract ytInitialData from HTML content"""
        try:
            # Pattern for ytInitialData
            pattern = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;\s*(?:var\s+meta|</script|\n)'
            match = re.search(pattern, html_content)
            if match:
                return json.loads(match.group(1))
            return None
        except Exception as e:
            logger.error(f"Failed to extract initial data: {e}")
            return None

    def _find_comments_continuation(self, initial_data: Dict, sort_by: int = 1) -> Optional[str]:
        """Find comments section continuation token"""
        try:
            # Look for itemSectionRenderer in the data
            for item_section in self._search_dict_recursive(initial_data, 'itemSectionRenderer'):
                for continuation_renderer in self._search_dict_recursive(item_section, 'continuationItemRenderer'):
                    continuation_endpoint = continuation_renderer.get('continuationEndpoint', {})
                    if continuation_endpoint:
                        # Check if we need to handle sort menu
                        sort_menu = None
                        for sort_filter in self._search_dict_recursive(initial_data, 'sortFilterSubMenuRenderer'):
                            sort_menu = sort_filter.get('subMenuItems', [])
                            break

                        if sort_menu and sort_by < len(sort_menu):
                            # Use the specified sort option
                            sort_endpoint = sort_menu[sort_by].get('serviceEndpoint', {})
                            if 'continuationCommand' in sort_endpoint:
                                return sort_endpoint['continuationCommand']['token']

                        # Fallback to default continuation
                        if 'continuationCommand' in continuation_endpoint:
                            return continuation_endpoint['continuationCommand']['token']

            return None
        except Exception as e:
            logger.error(f"Failed to find comments continuation: {e}")
            return None

    def _search_dict_recursive(self, data: Any, search_key: str) -> List[Any]:
        """Recursively search for a key in nested dict/list structure"""
        results = []
        stack = [data]

        while stack:
            current = stack.pop()
            if isinstance(current, dict):
                for key, value in current.items():
                    if key == search_key:
                        results.append(value)
                    else:
                        stack.append(value)
            elif isinstance(current, list):
                stack.extend(current)

        return results

    def _extract_comment_from_entity(self, entity: Dict, toolbar_states: Dict) -> Optional[Dict]:
        """Extract comment info from commentEntityPayload format"""
        try:
            properties = entity.get('properties', {})
            author = entity.get('author', {})
            toolbar = entity.get('toolbar', {})

            comment_id = properties.get('commentId', '')
            content = properties.get('content', {}).get('content', '')
            published_time = properties.get('publishedTime', '')

            # Author info
            author_name = author.get('displayName', '')
            author_channel_id = author.get('channelId', '')
            author_avatar = author.get('avatarThumbnailUrl', '')

            # Engagement info
            like_count_text = toolbar.get('likeCountNotliked', '0').strip() or "0"
            like_count = self._parse_count_string(like_count_text)
            reply_count = toolbar.get('replyCount', 0)

            # Check if comment is hearted
            toolbar_state_key = properties.get('toolbarStateKey', '')
            is_hearted = False
            if toolbar_state_key in toolbar_states:
                heart_state = toolbar_states[toolbar_state_key].get('heartState', '')
                is_hearted = heart_state == 'TOOLBAR_HEART_STATE_HEARTED'

            # Check if it's a reply (comment ID contains '.')
            is_reply = '.' in comment_id

            return {
                "comment_id": comment_id,
                "content": process_youtube_text(content),
                "author_name": author_name,
                "author_channel_id": author_channel_id,
                "author_avatar": author_avatar,
                "like_count": like_count,
                "reply_count": reply_count,
                "published_time": published_time,
                "is_hearted": is_hearted,
                "is_reply": is_reply,
                "time_parsed": self._parse_time_string(published_time)
            }

        except Exception as e:
            logger.error(f"Failed to extract comment from entity: {e}")
            return None

    def _parse_count_string(self, count_str: str) -> int:
        """Parse YouTube count strings like '1.2K', '500', etc."""
        try:
            if not count_str or count_str == '0':
                return 0

            count_str = count_str.strip().upper()

            # Handle K, M, B suffixes
            multipliers = {'K': 1000, 'M': 1000000, 'B': 1000000000}

            for suffix, multiplier in multipliers.items():
                if count_str.endswith(suffix):
                    number_part = count_str[:-1]
                    return int(float(number_part) * multiplier)

            # Handle comma-separated numbers
            count_str = count_str.replace(',', '')
            return int(count_str)

        except (ValueError, AttributeError):
            return 0

    def _parse_time_string(self, time_str: str) -> Optional[float]:
        """Parse time string and return timestamp"""
        try:
            if not time_str:
                return None

            # Remove any parenthetical content
            clean_time = time_str.split('(')[0].strip()

            # Try to parse with dateparser if available
            try:
                import dateparser
                parsed = dateparser.parse(clean_time)
                if parsed:
                    return parsed.timestamp()
            except ImportError:
                pass

            return None
        except Exception:
            return None

    async def search_all_videos(
        self,
        query: str,
        sleep_time: float = 0.1
    ) -> List[Dict]:
        """
        Search for all available YouTube videos for a query (no limit)

        Args:
            query: Search query
            sleep_time: Sleep time between requests

        Returns:
            List of all available video information
        """
        return await self.search_videos(
            query=query,
            max_results=0,  # 0 means no limit
            sleep_time=sleep_time
        )

    async def get_all_video_comments(
        self,
        video_id: str,
        sort_by: int = 1,  # 0 = popular, 1 = recent
        sleep_time: float = 0.1
    ) -> List[Dict]:
        """
        Get all comments for a YouTube video (no limit)

        Args:
            video_id: YouTube video ID
            sort_by: Comment sorting (0=popular, 1=recent)
            sleep_time: Sleep time between requests

        Returns:
            List of all comments for the video
        """
        return await self.get_video_comments(
            video_id=video_id,
            max_comments=0,  # 0 means no limit
            sort_by=sort_by,
            sleep_time=sleep_time
        )

    def _extract_continuation_tokens(self, data: Any) -> List[str]:
        """
        Extract all continuation tokens from YouTube API response

        Args:
            data: YouTube API response data

        Returns:
            List of continuation tokens
        """
        tokens = []

        # Search for continuation endpoints
        continuation_endpoints = self._search_dict_recursive(data, 'continuationEndpoint')
        for endpoint in continuation_endpoints:
            if 'continuationCommand' in endpoint:
                token = endpoint['continuationCommand']['token']
                if token and token not in tokens:
                    tokens.append(token)

        # Search for continuation commands
        continuation_commands = self._search_dict_recursive(data, 'continuationCommand')
        for command in continuation_commands:
            token = command.get('token')
            if token and token not in tokens:
                tokens.append(token)

        return tokens

    def _extract_comment_info(self, comment_data: Dict) -> Optional[Dict]:
        """Extract simplified comment information from traditional YouTube comment data"""
        try:
            # Extract comment ID
            comment_id = comment_data.get("commentId", "")

            # Extract comment text
            content_text = comment_data.get("contentText", {})
            text = ""
            if "runs" in content_text:
                text = "".join([run.get("text", "") for run in content_text["runs"]])
            elif "simpleText" in content_text:
                text = content_text["simpleText"]

            # Extract author information
            author_text = comment_data.get("authorText", {}).get("simpleText", "")
            author_thumbnail = comment_data.get("authorThumbnail", {}).get("thumbnails", [])
            author_avatar = extract_thumbnail_url(author_thumbnail)

            # Extract author channel ID if available
            author_endpoint = comment_data.get("authorEndpoint", {}).get("commandMetadata", {}).get(
                "webCommandMetadata", {})
            author_url = author_endpoint.get("url", "")
            author_channel_id = extract_channel_id_from_url(author_url) if author_url else ""

            # Extract like count
            like_count_text = comment_data.get("voteCount", {}).get("simpleText", "0")
            like_count = self._parse_count_string(like_count_text)

            # Extract published time
            published_time_data = comment_data.get("publishedTimeText", {})
            published_time = ""
            if "runs" in published_time_data:
                published_time = published_time_data["runs"][0].get("text", "")
            elif "simpleText" in published_time_data:
                published_time = published_time_data["simpleText"]

            # Extract reply count
            reply_count = 0
            reply_text = comment_data.get("replyCount", 0)
            if isinstance(reply_text, dict):
                reply_text = reply_text.get("simpleText", "0")
            if isinstance(reply_text, str):
                reply_count = self._parse_count_string(reply_text)
            elif isinstance(reply_text, int):
                reply_count = reply_text

            # Check if comment is hearted by creator
            is_hearted = False
            if "actionButtons" in comment_data:
                buttons = comment_data["actionButtons"].get("commentActionButtonsRenderer", {})
                heart_button = buttons.get("creatorHeart", {})
                is_hearted = bool(heart_button.get("creatorHeartRenderer", {}))

            # Check if it's a reply (comment ID contains '.')
            is_reply = '.' in comment_id

            return {
                "comment_id": comment_id,
                "content": process_youtube_text(text),
                "author_name": author_text,
                "author_channel_id": author_channel_id,
                "author_avatar": author_avatar,
                "like_count": like_count,
                "reply_count": reply_count,
                "published_time": published_time,
                "is_hearted": is_hearted,
                "is_reply": is_reply,
                "time_parsed": self._parse_time_string(published_time)
            }

        except Exception as e:
            logger.error(f"Failed to extract comment info: {e}")
            return None

    async def get_channel_info(self, channel_id: str) -> Optional[Dict]:
        """
        Get YouTube channel information

        Args:
            channel_id: YouTube channel ID

        Returns:
            Simplified channel information
        """
        try:
            # Navigate to channel page to get information
            channel_url = f"https://www.youtube.com/@{channel_id}"
            response = await self._make_request(
                "GET", channel_url, headers=self.default_headers, raw_response=True
            )
            html_content = response.text
            initial_data = extract_initial_data(html_content)

            if not initial_data:
                return None

            # Extract channel information from initial data
            metadata = initial_data.get("metadata", {}).get("channelMetadataRenderer", {})
            header = initial_data.get("header", {})
            # Try different header types
            channel_header = (header.get("c4TabbedHeaderRenderer") or
                              header.get("pageHeaderRenderer") or
                              header.get("interactiveTabbedHeaderRenderer") or {})

            title = metadata.get("title", "") or channel_header.get("title", "")
            description = metadata.get("description", "")

            # Extract subscriber count and video count from pageHeaderRenderer if available
            subscriber_count = 0
            video_count = 0

            if "pageHeaderRenderer" in header:
                page_header = header["pageHeaderRenderer"]
                metadata_rows = page_header.get("content", {}).get("pageHeaderViewModel", {}).get("metadata", {}).get(
                    "contentMetadataViewModel", {}).get("metadataRows", [])

                if len(metadata_rows) > 1:
                    # Second row contains subscriber and video counts
                    metadata_parts = metadata_rows[1].get("metadataParts", [])
                    if len(metadata_parts) > 0:
                        # Subscriber count (e.g., "21.2万位订阅者")
                        subscriber_text = metadata_parts[0].get("text", {}).get("content", "")
                        subscriber_count = subscriber_text.replace("位订阅者", "").replace("订阅者", "").replace(
                            "subscribers", "").strip()

                    if len(metadata_parts) > 1:
                        # Video count (e.g., "67 个视频")
                        video_text = metadata_parts[1].get("text", {}).get("content", "")
                        video_count = video_text.replace("个视频", "").replace("视频", "").replace("videos", "").strip()

            # Extract avatar
            avatar_thumbnails = channel_header.get("avatar", {}).get("thumbnails", [])
            avatar_url = extract_thumbnail_url(avatar_thumbnails)

            # Extract banner
            banner_thumbnails = channel_header.get("banner", {}).get("thumbnails", [])
            banner_url = extract_thumbnail_url(banner_thumbnails)

            return {
                "channel_id": channel_id,
                "title": process_youtube_text(title),
                "description": process_youtube_text(description),
                "subscriber_count": subscriber_count,
                "video_count": video_count,
                "avatar_url": avatar_url,
                "banner_url": banner_url,
                "channel_url": channel_url,
                "verified": False,  # Would need additional processing
            }

        except Exception as e:
            logger.error(f"Failed to get channel info for {channel_id}: {e}")
            return None

    async def get_channel_videos(
        self,
        channel_id: str,
        max_videos: int = 20,
        continuation_token: Optional[str] = None,
        sleep_time: float = 0.1
    ) -> List[Dict]:
        """
        Get videos from a YouTube channel with pagination support

        Args:
            channel_id: YouTube channel ID (can be UC... format, @username, or custom name)
            max_videos: Maximum number of videos to fetch (0 for all available)
            continuation_token: Token for pagination
            sleep_time: Sleep time between requests

        Returns:
            List of simplified video information
        """
        try:
            videos = []
            continuations = []

            if continuation_token:
                # Use provided continuation token
                continuations.append(continuation_token)
            else:
                # Initial request to get videos page and extract initial data
                videos_url = f"https://www.youtube.com/@{channel_id}/videos"
                response = await self._make_request(
                    "GET", videos_url, headers=self.default_headers, raw_response=True
                )

                html_content = response.text
                initial_data = extract_initial_data(html_content)
                if not initial_data:
                    logger.error("Failed to extract initial data from videos page")
                    return []

                # Find video renderers in the initial page data
                video_renderers = self._find_video_renderers(initial_data)

                for video_data in video_renderers:
                    if max_videos > 0 and len(videos) >= max_videos:
                        break
                    video_info = self._extract_video_info(video_data)
                    if video_info:
                        video_info['channel_id'] = channel_id
                        videos.append(video_info)

                # Extract continuation tokens for more results
                continuation_tokens = self._extract_continuation_tokens(initial_data)
                continuations.extend(continuation_tokens)

                logger.info(
                    f"Initial page: extracted {len(videos)} videos, found {len(continuations)} continuation tokens")

            # Process continuation tokens for more videos
            while continuations and (max_videos == 0 or len(videos) < max_videos):
                current_continuation = continuations.pop(0)

                # Make API request with continuation token
                data = {"continuation": current_continuation}
                response = await self._make_api_request("browse", data)

                if not response:
                    break

                # Extract videos from continuation response
                video_renderers = self._find_video_renderers(response)
                batch_videos = []

                for video_data in video_renderers:
                    if max_videos > 0 and len(videos) + len(batch_videos) >= max_videos:
                        break
                    video_info = self._extract_video_info(video_data)
                    if video_info:
                        video_info['channel_id'] = channel_id
                        batch_videos.append(video_info)

                videos.extend(batch_videos)

                # Look for more continuation tokens
                continuation_tokens = self._extract_continuation_tokens(response)
                for token in continuation_tokens:
                    if token not in continuations:
                        continuations.append(token)

                logger.info(f"Continuation batch: fetched {len(batch_videos)} videos, total: {len(videos)}")

                # Sleep between requests to avoid rate limiting
                if continuations and sleep_time > 0:
                    await asyncio.sleep(sleep_time)

            return videos[:max_videos] if max_videos > 0 else videos

        except Exception as e:
            logger.error(f"Failed to get channel videos for {channel_id}: {e}")
            return []

    async def get_trending_videos(self) -> List[Dict]:
        """
        Get trending YouTube videos

        Args:
            max_videos: Maximum number of videos to fetch

        Returns:
            List of simplified trending video information
        """
        try:
            data = {"browseId": "FEtrending"}

            response = await self._make_api_request("browse", data)

            videos = []

            # Navigate to trending video list
            contents = response.get("contents", {}).get("twoColumnBrowseResultsRenderer", {}).get("tabs", [])
            for tab in contents:
                tab_content = tab.get("tabRenderer", {}).get("content", {})
                sections = tab_content.get("sectionListRenderer", {}).get("contents", [])

                for section in sections:
                    items_up = section.get("itemSectionRenderer", {}).get("contents", [])
                    for item_up in items_up:
                        items = item_up.get('shelfRenderer', {}).get(
                            'content').get('expandedShelfContentsRenderer').get('items', [])
                        for item in items:
                            # Check for different video renderer types
                            video_data = (item.get("videoRenderer") or
                                          item.get("compactVideoRenderer") or
                                          item.get("gridVideoRenderer"))
                            if video_data:
                                video_info = self._extract_video_info(video_data)
                                if video_info:
                                    videos.append(video_info)

            return videos

        except Exception as e:
            logger.error(f"Failed to get trending videos: {e}")
            return []

    async def close(self):
        if self.browser_session and self.target_id:
            try:
                logger.info(f"Close target id: {self.target_id}")
                await self.browser_session.cdp_client.send.Target.closeTarget(params={'targetId': self.target_id})
            except Exception as e:
                logger.warning(f"Error closing target {self.target_id}: {e}")