headless-youtube-captions 1.0.2 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +13 -0
- package/LICENSE +21 -0
- package/README.md +213 -10
- package/package.json +7 -3
- package/src/channel.js +243 -0
- package/src/comments.js +156 -0
- package/src/index.d.ts +145 -1
- package/src/index.js +11 -38
- package/src/utils/browser.js +53 -0
- package/src/utils/extract.js +101 -0
- package/src/utils/scroll.js +48 -0
- package/.claude/settings.local.json +0 -25
- package/test/index.test.js +0 -23
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,18 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [1.2.0] - 2025-07-06
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
- Docker support with `PUPPETEER_EXECUTABLE_PATH` environment variable
|
|
7
|
+
- Ability to specify custom Chrome/Chromium executable path
|
|
8
|
+
- Comprehensive Docker usage documentation with example Dockerfile
|
|
9
|
+
- npm badges in README (version, license, Node.js version)
|
|
10
|
+
- Features section highlighting key capabilities
|
|
11
|
+
|
|
12
|
+
### Changed
|
|
13
|
+
- Enhanced README with better organization and documentation
|
|
14
|
+
- Updated Node.js requirement documentation to correctly show v18+
|
|
15
|
+
|
|
3
16
|
## [1.0.1] - 2025-06-10
|
|
4
17
|
|
|
5
18
|
### Added
|
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 andrewlwn77
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
CHANGED
|
@@ -1,6 +1,21 @@
|
|
|
1
1
|
# Headless YouTube Captions
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
[![npm version](https://img.shields.io/npm/v/headless-youtube-captions.svg)](https://www.npmjs.com/package/headless-youtube-captions)
|
|
4
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
5
|
+
[![Node.js version](https://img.shields.io/node/v/headless-youtube-captions.svg)](https://nodejs.org)
|
|
6
|
+
|
|
7
|
+
> Extract YouTube video transcripts, channel videos, and comments by interacting with YouTube's UI using Puppeteer
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
- 🎯 Extract video transcripts/captions in multiple languages
|
|
12
|
+
- 📺 Get channel videos with pagination support
|
|
13
|
+
- 🔍 Search videos within a specific channel
|
|
14
|
+
- 💬 Extract video comments with sorting options
|
|
15
|
+
- 🐳 Docker support with configurable Chrome executable path
|
|
16
|
+
- 📦 Zero build dependencies - runs directly from source
|
|
17
|
+
- 🚀 Modern ES modules with async/await
|
|
18
|
+
- 🛡️ Handles cookie consent and ad skipping automatically
|
|
4
19
|
|
|
5
20
|
## Installation
|
|
6
21
|
|
|
@@ -12,7 +27,7 @@ yarn add headless-youtube-captions
|
|
|
12
27
|
|
|
13
28
|
## Usage
|
|
14
29
|
|
|
15
|
-
###
|
|
30
|
+
### Extract Video Transcripts
|
|
16
31
|
```js
|
|
17
32
|
import { getSubtitles } from 'headless-youtube-captions';
|
|
18
33
|
|
|
@@ -24,16 +39,42 @@ const captions = await getSubtitles({
|
|
|
24
39
|
console.log(captions);
|
|
25
40
|
```
|
|
26
41
|
|
|
27
|
-
###
|
|
42
|
+
### Get Channel Videos
|
|
28
43
|
```js
|
|
29
|
-
|
|
44
|
+
import { getChannelVideos } from 'headless-youtube-captions';
|
|
30
45
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
})
|
|
35
|
-
|
|
46
|
+
const result = await getChannelVideos({
|
|
47
|
+
channelURL: '@mkbhd', // or full URL like 'https://youtube.com/@mkbhd'
|
|
48
|
+
limit: 30 // Optional, default: 30
|
|
49
|
+
});
|
|
50
|
+
|
|
51
|
+
console.log(result.videos);
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### Search Channel Videos
|
|
55
|
+
```js
|
|
56
|
+
import { searchChannelVideos } from 'headless-youtube-captions';
|
|
57
|
+
|
|
58
|
+
const result = await searchChannelVideos({
|
|
59
|
+
channelURL: '@mkbhd',
|
|
60
|
+
query: 'iphone review',
|
|
61
|
+
limit: 20 // Optional, default: 30
|
|
62
|
+
});
|
|
63
|
+
|
|
64
|
+
console.log(result.results);
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Get Video Comments
|
|
68
|
+
```js
|
|
69
|
+
import { getVideoComments } from 'headless-youtube-captions';
|
|
70
|
+
|
|
71
|
+
const result = await getVideoComments({
|
|
72
|
+
videoID: 'JueUvj6X3DA',
|
|
73
|
+
limit: 50, // Optional, default: 50
|
|
74
|
+
sortBy: 'top' // Optional, 'top' or 'newest', default: 'top'
|
|
36
75
|
});
|
|
76
|
+
|
|
77
|
+
console.log(result.comments);
|
|
37
78
|
```
|
|
38
79
|
|
|
39
80
|
## API
|
|
@@ -100,9 +141,58 @@ This library uses Puppeteer to:
|
|
|
100
141
|
|
|
101
142
|
## Requirements
|
|
102
143
|
|
|
103
|
-
- Node.js
|
|
144
|
+
- Node.js 18 or higher (ES modules support required)
|
|
104
145
|
- Puppeteer (installed as a dependency)
|
|
105
146
|
|
|
147
|
+
## Docker Usage
|
|
148
|
+
|
|
149
|
+
When running in Docker containers, you may need to specify the Chrome executable path using the `PUPPETEER_EXECUTABLE_PATH` environment variable:
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
# Set the environment variable
|
|
153
|
+
export PUPPETEER_EXECUTABLE_PATH=/usr/bin/google-chrome-stable
|
|
154
|
+
|
|
155
|
+
# Or run directly
|
|
156
|
+
PUPPETEER_EXECUTABLE_PATH=/usr/bin/google-chrome-stable node your-script.js
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
Example Dockerfile configuration:
|
|
160
|
+
```dockerfile
|
|
161
|
+
# Install Chrome dependencies
|
|
162
|
+
RUN apt-get update && apt-get install -y \
|
|
163
|
+
wget \
|
|
164
|
+
gnupg \
|
|
165
|
+
ca-certificates \
|
|
166
|
+
fonts-liberation \
|
|
167
|
+
libasound2 \
|
|
168
|
+
libatk-bridge2.0-0 \
|
|
169
|
+
libatk1.0-0 \
|
|
170
|
+
libatspi2.0-0 \
|
|
171
|
+
libcups2 \
|
|
172
|
+
libdbus-1-3 \
|
|
173
|
+
libdrm2 \
|
|
174
|
+
libgbm1 \
|
|
175
|
+
libgtk-3-0 \
|
|
176
|
+
libnspr4 \
|
|
177
|
+
libnss3 \
|
|
178
|
+
libxcomposite1 \
|
|
179
|
+
libxdamage1 \
|
|
180
|
+
libxfixes3 \
|
|
181
|
+
libxkbcommon0 \
|
|
182
|
+
libxrandr2 \
|
|
183
|
+
xdg-utils
|
|
184
|
+
|
|
185
|
+
# Install Chrome
|
|
186
|
+
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
|
|
187
|
+
&& echo "deb http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list \
|
|
188
|
+
&& apt-get update \
|
|
189
|
+
&& apt-get install -y google-chrome-stable \
|
|
190
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
191
|
+
|
|
192
|
+
# Set the Chrome executable path
|
|
193
|
+
ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/google-chrome-stable
|
|
194
|
+
```
|
|
195
|
+
|
|
106
196
|
## Error Handling
|
|
107
197
|
|
|
108
198
|
The function will throw an error if:
|
|
@@ -129,6 +219,119 @@ try {
|
|
|
129
219
|
- The library respects YouTube's UI structure as of the last update
|
|
130
220
|
- Some videos may not have transcripts available
|
|
131
221
|
|
|
222
|
+
### `getChannelVideos(options)`
|
|
223
|
+
|
|
224
|
+
Extracts videos from a YouTube channel with pagination support.
|
|
225
|
+
|
|
226
|
+
#### Parameters
|
|
227
|
+
|
|
228
|
+
- `options` (Object):
|
|
229
|
+
- `channelURL` (String, required): Channel identifier (@handle, channel ID, or full URL)
|
|
230
|
+
- `limit` (Number, optional): Maximum videos to return. Default: `30`
|
|
231
|
+
- `pageToken` (String, optional): For pagination (future use)
|
|
232
|
+
|
|
233
|
+
#### Returns
|
|
234
|
+
|
|
235
|
+
```js
|
|
236
|
+
{
|
|
237
|
+
channel: {
|
|
238
|
+
name: "Channel Name",
|
|
239
|
+
subscribers: "1.2M subscribers",
|
|
240
|
+
videoCount: "500 videos"
|
|
241
|
+
},
|
|
242
|
+
videos: [
|
|
243
|
+
{
|
|
244
|
+
id: "videoId123",
|
|
245
|
+
title: "Video Title",
|
|
246
|
+
views: "1.2M views",
|
|
247
|
+
uploadTime: "2 days ago",
|
|
248
|
+
duration: "10:45",
|
|
249
|
+
thumbnail: "https://...",
|
|
250
|
+
url: "https://youtube.com/watch?v=videoId123"
|
|
251
|
+
}
|
|
252
|
+
// ... more videos
|
|
253
|
+
],
|
|
254
|
+
totalLoaded: 30,
|
|
255
|
+
hasMore: true
|
|
256
|
+
}
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
### `searchChannelVideos(options)`
|
|
260
|
+
|
|
261
|
+
Search for videos within a specific YouTube channel.
|
|
262
|
+
|
|
263
|
+
#### Parameters
|
|
264
|
+
|
|
265
|
+
- `options` (Object):
|
|
266
|
+
- `channelURL` (String, required): Channel identifier (@handle, channel ID, or full URL)
|
|
267
|
+
- `query` (String, required): Search query
|
|
268
|
+
- `limit` (Number, optional): Maximum results. Default: `30`
|
|
269
|
+
|
|
270
|
+
#### Returns
|
|
271
|
+
|
|
272
|
+
```js
|
|
273
|
+
{
|
|
274
|
+
query: "iphone review",
|
|
275
|
+
results: [
|
|
276
|
+
{
|
|
277
|
+
id: "videoId123",
|
|
278
|
+
title: "iPhone 15 Review",
|
|
279
|
+
views: "2.5M views",
|
|
280
|
+
uploadTime: "1 week ago",
|
|
281
|
+
duration: "15:23",
|
|
282
|
+
thumbnail: "https://...",
|
|
283
|
+
url: "https://youtube.com/watch?v=videoId123"
|
|
284
|
+
}
|
|
285
|
+
// ... more results
|
|
286
|
+
],
|
|
287
|
+
totalFound: 25
|
|
288
|
+
}
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
### `getVideoComments(options)`
|
|
292
|
+
|
|
293
|
+
Extract comments from a YouTube video with pagination support.
|
|
294
|
+
|
|
295
|
+
#### Parameters
|
|
296
|
+
|
|
297
|
+
- `options` (Object):
|
|
298
|
+
- `videoID` (String, required): YouTube video ID
|
|
299
|
+
- `limit` (Number, optional): Maximum comments to return. Default: `50`
|
|
300
|
+
- `sortBy` (String, optional): Sort order - `'top'` or `'newest'`. Default: `'top'`
|
|
301
|
+
- `pageToken` (String, optional): For pagination (future use)
|
|
302
|
+
|
|
303
|
+
#### Returns
|
|
304
|
+
|
|
305
|
+
```js
|
|
306
|
+
{
|
|
307
|
+
video: {
|
|
308
|
+
id: "JueUvj6X3DA",
|
|
309
|
+
title: "Video Title",
|
|
310
|
+
channel: {
|
|
311
|
+
name: "Channel Name",
|
|
312
|
+
url: "https://youtube.com/@channel"
|
|
313
|
+
},
|
|
314
|
+
views: "1.5M views"
|
|
315
|
+
},
|
|
316
|
+
comments: [
|
|
317
|
+
{
|
|
318
|
+
author: "Username",
|
|
319
|
+
authorUrl: "https://youtube.com/@username",
|
|
320
|
+
authorAvatar: "https://...",
|
|
321
|
+
text: "Great video! Thanks for sharing...",
|
|
322
|
+
time: "2 days ago",
|
|
323
|
+
likes: "245",
|
|
324
|
+
replyCount: "12"
|
|
325
|
+
}
|
|
326
|
+
// ... more comments
|
|
327
|
+
],
|
|
328
|
+
totalComments: 1566,
|
|
329
|
+
totalLoaded: 50,
|
|
330
|
+
hasMore: true,
|
|
331
|
+
sortBy: "top"
|
|
332
|
+
}
|
|
333
|
+
```
|
|
334
|
+
|
|
132
335
|
## License
|
|
133
336
|
|
|
134
337
|
MIT
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "headless-youtube-captions",
|
|
3
|
-
"version": "1.0
|
|
4
|
-
"description": "Extract YouTube video transcripts using headless browser automation",
|
|
3
|
+
"version": "1.2.0",
|
|
4
|
+
"description": "Extract YouTube video transcripts, channel videos, and comments using headless browser automation",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"types": "src/index.d.ts",
|
|
7
7
|
"type": "module",
|
|
@@ -35,8 +35,12 @@
|
|
|
35
35
|
"captions",
|
|
36
36
|
"subtitles",
|
|
37
37
|
"transcript",
|
|
38
|
+
"comments",
|
|
39
|
+
"channel",
|
|
40
|
+
"videos",
|
|
38
41
|
"puppeteer",
|
|
39
42
|
"headless",
|
|
40
|
-
"scraper"
|
|
43
|
+
"scraper",
|
|
44
|
+
"api"
|
|
41
45
|
]
|
|
42
46
|
}
|
package/src/channel.js
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
import { createBrowser, createPage, handleCookieConsent } from './utils/browser.js';
|
|
2
|
+
import { scrollAndWaitForMore } from './utils/scroll.js';
|
|
3
|
+
import { extractVideoData, extractChannelInfo } from './utils/extract.js';
|
|
4
|
+
|
|
5
|
+
export async function getChannelVideos({ channelURL, limit = 30, pageToken = null }) {
|
|
6
|
+
const browser = await createBrowser();
|
|
7
|
+
|
|
8
|
+
try {
|
|
9
|
+
const page = await createPage(browser);
|
|
10
|
+
|
|
11
|
+
// Construct the full URL
|
|
12
|
+
let fullURL;
|
|
13
|
+
if (channelURL.startsWith('http')) {
|
|
14
|
+
// Ensure we're on the videos tab
|
|
15
|
+
fullURL = channelURL.includes('/videos') ? channelURL : channelURL.replace(/\/?$/, '/videos');
|
|
16
|
+
} else if (channelURL.startsWith('@')) {
|
|
17
|
+
fullURL = `https://youtube.com/${channelURL}/videos`;
|
|
18
|
+
} else if (channelURL.startsWith('UC')) {
|
|
19
|
+
fullURL = `https://youtube.com/channel/${channelURL}/videos`;
|
|
20
|
+
} else {
|
|
21
|
+
fullURL = `https://youtube.com/c/${channelURL}/videos`;
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
console.error(`Navigating to ${fullURL}`);
|
|
25
|
+
await page.goto(fullURL, {
|
|
26
|
+
waitUntil: 'networkidle2',
|
|
27
|
+
timeout: 60000
|
|
28
|
+
});
|
|
29
|
+
|
|
30
|
+
// Handle cookie consent
|
|
31
|
+
await handleCookieConsent(page);
|
|
32
|
+
|
|
33
|
+
// Wait a bit for dynamic content
|
|
34
|
+
await new Promise(resolve => setTimeout(resolve, 3000));
|
|
35
|
+
|
|
36
|
+
// Wait for initial videos to load
|
|
37
|
+
await page.waitForSelector('ytd-rich-item-renderer', { timeout: 30000 });
|
|
38
|
+
console.error('Initial videos loaded');
|
|
39
|
+
|
|
40
|
+
// Extract channel info
|
|
41
|
+
const channelInfo = await extractChannelInfo(page);
|
|
42
|
+
|
|
43
|
+
let allVideos = [];
|
|
44
|
+
let currentCount = 0;
|
|
45
|
+
|
|
46
|
+
// Load videos up to the limit
|
|
47
|
+
while (allVideos.length < limit) {
|
|
48
|
+
const videos = await extractVideoData(page);
|
|
49
|
+
allVideos = videos;
|
|
50
|
+
|
|
51
|
+
if (videos.length === currentCount) {
|
|
52
|
+
// No more videos to load
|
|
53
|
+
break;
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
currentCount = videos.length;
|
|
57
|
+
|
|
58
|
+
if (currentCount < limit) {
|
|
59
|
+
// Try to load more videos
|
|
60
|
+
const newCount = await scrollAndWaitForMore(page, 'ytd-rich-item-renderer', currentCount);
|
|
61
|
+
if (newCount === currentCount) {
|
|
62
|
+
break; // No new videos loaded
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
// Trim to requested limit
|
|
68
|
+
const resultVideos = allVideos.slice(0, limit);
|
|
69
|
+
|
|
70
|
+
console.error(`Successfully extracted ${resultVideos.length} videos`);
|
|
71
|
+
|
|
72
|
+
return {
|
|
73
|
+
channel: channelInfo,
|
|
74
|
+
videos: resultVideos,
|
|
75
|
+
totalLoaded: allVideos.length,
|
|
76
|
+
hasMore: allVideos.length > limit
|
|
77
|
+
};
|
|
78
|
+
|
|
79
|
+
} catch (error) {
|
|
80
|
+
console.error('Error extracting channel videos:', error);
|
|
81
|
+
throw error;
|
|
82
|
+
} finally {
|
|
83
|
+
await browser.close();
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
export async function searchChannelVideos({ channelURL, query, limit = 30 }) {
|
|
88
|
+
const browser = await createBrowser();
|
|
89
|
+
|
|
90
|
+
try {
|
|
91
|
+
const page = await createPage(browser);
|
|
92
|
+
|
|
93
|
+
// Navigate to channel page
|
|
94
|
+
let fullURL;
|
|
95
|
+
if (channelURL.startsWith('http')) {
|
|
96
|
+
// Remove /videos if present to get to main channel page
|
|
97
|
+
fullURL = channelURL.replace(/\/videos\/?$/, '');
|
|
98
|
+
} else if (channelURL.startsWith('@')) {
|
|
99
|
+
fullURL = `https://youtube.com/${channelURL}`;
|
|
100
|
+
} else if (channelURL.startsWith('UC')) {
|
|
101
|
+
fullURL = `https://youtube.com/channel/${channelURL}`;
|
|
102
|
+
} else {
|
|
103
|
+
fullURL = `https://youtube.com/c/${channelURL}`;
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
console.error(`Navigating to ${fullURL}`);
|
|
107
|
+
await page.goto(fullURL, {
|
|
108
|
+
waitUntil: 'networkidle2',
|
|
109
|
+
timeout: 60000
|
|
110
|
+
});
|
|
111
|
+
|
|
112
|
+
// Handle cookie consent
|
|
113
|
+
await handleCookieConsent(page);
|
|
114
|
+
|
|
115
|
+
// Wait for page to load
|
|
116
|
+
await new Promise(resolve => setTimeout(resolve, 3000));
|
|
117
|
+
|
|
118
|
+
// Look for search icon in channel header
|
|
119
|
+
const searchButtonSelectors = [
|
|
120
|
+
'ytd-channel-header-renderer yt-icon-button[aria-label*="Search"]',
|
|
121
|
+
'ytd-channel-header-renderer button[aria-label*="Search"]',
|
|
122
|
+
'#channel-header yt-icon-button[aria-label*="Search"]',
|
|
123
|
+
'yt-icon[icon="yt-icons:search"]'
|
|
124
|
+
];
|
|
125
|
+
|
|
126
|
+
let searchClicked = false;
|
|
127
|
+
for (const selector of searchButtonSelectors) {
|
|
128
|
+
try {
|
|
129
|
+
const searchButton = await page.$(selector);
|
|
130
|
+
if (searchButton) {
|
|
131
|
+
const isVisible = await searchButton.evaluate(el => {
|
|
132
|
+
const rect = el.getBoundingClientRect();
|
|
133
|
+
return rect.width > 0 && rect.height > 0;
|
|
134
|
+
});
|
|
135
|
+
|
|
136
|
+
if (isVisible) {
|
|
137
|
+
await searchButton.click();
|
|
138
|
+
console.error('Clicked search button');
|
|
139
|
+
searchClicked = true;
|
|
140
|
+
break;
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
} catch (e) {
|
|
144
|
+
// Try next selector
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
if (!searchClicked) {
|
|
149
|
+
// Try clicking on the search icon itself
|
|
150
|
+
const clicked = await page.evaluate(() => {
|
|
151
|
+
const icons = document.querySelectorAll('yt-icon');
|
|
152
|
+
for (const icon of icons) {
|
|
153
|
+
if (icon.getAttribute('icon') === 'yt-icons:search') {
|
|
154
|
+
const button = icon.closest('button') || icon.closest('yt-icon-button');
|
|
155
|
+
if (button) {
|
|
156
|
+
button.click();
|
|
157
|
+
return true;
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
return false;
|
|
162
|
+
});
|
|
163
|
+
|
|
164
|
+
if (clicked) {
|
|
165
|
+
console.error('Clicked search icon');
|
|
166
|
+
searchClicked = true;
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
if (!searchClicked) {
|
|
171
|
+
throw new Error('Could not find channel search button');
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
// Wait for search input to appear
|
|
175
|
+
await page.waitForSelector('input[placeholder*="Search"]', { timeout: 5000 });
|
|
176
|
+
|
|
177
|
+
// Type search query
|
|
178
|
+
await page.type('input[placeholder*="Search"]', query);
|
|
179
|
+
await page.keyboard.press('Enter');
|
|
180
|
+
|
|
181
|
+
// Wait for search results
|
|
182
|
+
await new Promise(resolve => setTimeout(resolve, 3000));
|
|
183
|
+
await page.waitForSelector('ytd-video-renderer, ytd-rich-item-renderer', { timeout: 10000 });
|
|
184
|
+
|
|
185
|
+
// Extract search results
|
|
186
|
+
const searchResults = await page.evaluate(() => {
|
|
187
|
+
// Try different selectors for search results
|
|
188
|
+
let videos = document.querySelectorAll('ytd-video-renderer');
|
|
189
|
+
if (videos.length === 0) {
|
|
190
|
+
videos = document.querySelectorAll('ytd-rich-item-renderer');
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
return Array.from(videos).map(video => {
|
|
194
|
+
// Extract video ID
|
|
195
|
+
const link = video.querySelector('a#video-title, a#video-title-link');
|
|
196
|
+
const href = link ? link.href : '';
|
|
197
|
+
const videoId = href.match(/watch\?v=([^&]+)/)?.[1] || '';
|
|
198
|
+
|
|
199
|
+
// Extract title
|
|
200
|
+
const titleElement = video.querySelector('#video-title');
|
|
201
|
+
const title = titleElement ? titleElement.textContent.trim() : '';
|
|
202
|
+
|
|
203
|
+
// Extract metadata
|
|
204
|
+
const viewsElement = video.querySelector('#metadata-line span:first-child, .view-count');
|
|
205
|
+
const views = viewsElement ? viewsElement.textContent : '';
|
|
206
|
+
|
|
207
|
+
const timeElement = video.querySelector('#metadata-line span:last-child, .published-time');
|
|
208
|
+
const uploadTime = timeElement ? timeElement.textContent : '';
|
|
209
|
+
|
|
210
|
+
// Extract duration
|
|
211
|
+
const durationElement = video.querySelector('ytd-thumbnail-overlay-time-status-renderer span, .video-time');
|
|
212
|
+
const duration = durationElement ? durationElement.textContent.trim() : '';
|
|
213
|
+
|
|
214
|
+
// Extract thumbnail
|
|
215
|
+
const thumbnail = video.querySelector('img#img')?.src || '';
|
|
216
|
+
|
|
217
|
+
return {
|
|
218
|
+
id: videoId,
|
|
219
|
+
title,
|
|
220
|
+
views,
|
|
221
|
+
uploadTime,
|
|
222
|
+
duration,
|
|
223
|
+
thumbnail,
|
|
224
|
+
url: `https://youtube.com/watch?v=${videoId}`
|
|
225
|
+
};
|
|
226
|
+
}).filter(video => video.id && video.title);
|
|
227
|
+
});
|
|
228
|
+
|
|
229
|
+
console.error(`Found ${searchResults.length} videos matching "${query}"`);
|
|
230
|
+
|
|
231
|
+
return {
|
|
232
|
+
query,
|
|
233
|
+
results: searchResults.slice(0, limit),
|
|
234
|
+
totalFound: searchResults.length
|
|
235
|
+
};
|
|
236
|
+
|
|
237
|
+
} catch (error) {
|
|
238
|
+
console.error('Error searching channel videos:', error);
|
|
239
|
+
throw error;
|
|
240
|
+
} finally {
|
|
241
|
+
await browser.close();
|
|
242
|
+
}
|
|
243
|
+
}
|
package/src/comments.js
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
import { createBrowser, createPage, handleCookieConsent, skipAds } from './utils/browser.js';
|
|
2
|
+
import { scrollToLoadComments, scrollAndWaitForMore } from './utils/scroll.js';
|
|
3
|
+
import { extractCommentData } from './utils/extract.js';
|
|
4
|
+
|
|
5
|
+
export async function getVideoComments({ videoID, limit = 50, sortBy = 'top', pageToken = null }) {
|
|
6
|
+
const browser = await createBrowser();
|
|
7
|
+
|
|
8
|
+
try {
|
|
9
|
+
const page = await createPage(browser);
|
|
10
|
+
|
|
11
|
+
// Navigate to the YouTube video page
|
|
12
|
+
console.error(`Navigating to https://youtube.com/watch?v=${videoID}`);
|
|
13
|
+
await page.goto(`https://youtube.com/watch?v=${videoID}`, {
|
|
14
|
+
waitUntil: 'networkidle2',
|
|
15
|
+
timeout: 60000
|
|
16
|
+
});
|
|
17
|
+
|
|
18
|
+
// Wait for video player to load
|
|
19
|
+
await page.waitForSelector('#movie_player, video', { timeout: 30000 });
|
|
20
|
+
console.error('Video player loaded');
|
|
21
|
+
|
|
22
|
+
// Handle cookie consent
|
|
23
|
+
await handleCookieConsent(page);
|
|
24
|
+
|
|
25
|
+
// Skip ads if present
|
|
26
|
+
await skipAds(page);
|
|
27
|
+
|
|
28
|
+
// Wait for page to stabilize
|
|
29
|
+
await new Promise(resolve => setTimeout(resolve, 3000));
|
|
30
|
+
|
|
31
|
+
// Scroll to load comments
|
|
32
|
+
const commentsLoaded = await scrollToLoadComments(page);
|
|
33
|
+
if (!commentsLoaded) {
|
|
34
|
+
throw new Error('Could not load comments section');
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
// Wait for comment threads to load
|
|
38
|
+
await page.waitForSelector('ytd-comment-thread-renderer', {
|
|
39
|
+
timeout: 10000,
|
|
40
|
+
visible: true
|
|
41
|
+
});
|
|
42
|
+
|
|
43
|
+
console.error('Comments section loaded');
|
|
44
|
+
|
|
45
|
+
// Extract total comment count
|
|
46
|
+
const commentCount = await page.evaluate(() => {
|
|
47
|
+
const countElement = document.querySelector('ytd-comments-header-renderer h2 yt-formatted-string');
|
|
48
|
+
if (countElement) {
|
|
49
|
+
const text = countElement.textContent;
|
|
50
|
+
const match = text.match(/[\d,]+/);
|
|
51
|
+
return match ? match[0].replace(/,/g, '') : '0';
|
|
52
|
+
}
|
|
53
|
+
return '0';
|
|
54
|
+
});
|
|
55
|
+
|
|
56
|
+
// Check if we need to change sort order
|
|
57
|
+
if (sortBy === 'newest') {
|
|
58
|
+
// Click on sort menu
|
|
59
|
+
const sortMenuClicked = await page.evaluate(() => {
|
|
60
|
+
const sortButton = document.querySelector('ytd-comments-header-renderer tp-yt-paper-dropdown-menu-light');
|
|
61
|
+
if (sortButton) {
|
|
62
|
+
sortButton.click();
|
|
63
|
+
return true;
|
|
64
|
+
}
|
|
65
|
+
return false;
|
|
66
|
+
});
|
|
67
|
+
|
|
68
|
+
if (sortMenuClicked) {
|
|
69
|
+
await new Promise(resolve => setTimeout(resolve, 1000));
|
|
70
|
+
|
|
71
|
+
// Click on "Newest first" option
|
|
72
|
+
await page.evaluate(() => {
|
|
73
|
+
const menuItems = document.querySelectorAll('tp-yt-paper-listbox tp-yt-paper-item');
|
|
74
|
+
for (const item of menuItems) {
|
|
75
|
+
if (item.textContent.includes('Newest') || item.textContent.includes('newest')) {
|
|
76
|
+
item.click();
|
|
77
|
+
break;
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
});
|
|
81
|
+
|
|
82
|
+
// Wait for comments to reload
|
|
83
|
+
await new Promise(resolve => setTimeout(resolve, 3000));
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
let allComments = [];
|
|
88
|
+
let currentCount = 0;
|
|
89
|
+
|
|
90
|
+
// Load comments up to the limit
|
|
91
|
+
while (allComments.length < limit) {
|
|
92
|
+
const comments = await extractCommentData(page);
|
|
93
|
+
allComments = comments;
|
|
94
|
+
|
|
95
|
+
if (comments.length === currentCount) {
|
|
96
|
+
// No more comments to load
|
|
97
|
+
break;
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
currentCount = comments.length;
|
|
101
|
+
|
|
102
|
+
if (currentCount < limit) {
|
|
103
|
+
// Try to load more comments
|
|
104
|
+
const newCount = await scrollAndWaitForMore(page, 'ytd-comment-thread-renderer', currentCount, 3000);
|
|
105
|
+
if (newCount === currentCount) {
|
|
106
|
+
break; // No new comments loaded
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
// Trim to requested limit
|
|
112
|
+
const resultComments = allComments.slice(0, limit);
|
|
113
|
+
|
|
114
|
+
console.error(`Successfully extracted ${resultComments.length} comments`);
|
|
115
|
+
|
|
116
|
+
// Extract video info
|
|
117
|
+
const videoInfo = await page.evaluate(() => {
|
|
118
|
+
const titleElement = document.querySelector('h1.ytd-video-primary-info-renderer');
|
|
119
|
+
const title = titleElement ? titleElement.textContent.trim() : '';
|
|
120
|
+
|
|
121
|
+
const channelElement = document.querySelector('ytd-channel-name a');
|
|
122
|
+
const channelName = channelElement ? channelElement.textContent.trim() : '';
|
|
123
|
+
const channelUrl = channelElement ? channelElement.href : '';
|
|
124
|
+
|
|
125
|
+
const viewsElement = document.querySelector('.view-count');
|
|
126
|
+
const views = viewsElement ? viewsElement.textContent : '';
|
|
127
|
+
|
|
128
|
+
return {
|
|
129
|
+
title,
|
|
130
|
+
channel: {
|
|
131
|
+
name: channelName,
|
|
132
|
+
url: channelUrl
|
|
133
|
+
},
|
|
134
|
+
views
|
|
135
|
+
};
|
|
136
|
+
});
|
|
137
|
+
|
|
138
|
+
return {
|
|
139
|
+
video: {
|
|
140
|
+
id: videoID,
|
|
141
|
+
...videoInfo
|
|
142
|
+
},
|
|
143
|
+
comments: resultComments,
|
|
144
|
+
totalComments: parseInt(commentCount),
|
|
145
|
+
totalLoaded: allComments.length,
|
|
146
|
+
hasMore: allComments.length > limit,
|
|
147
|
+
sortBy
|
|
148
|
+
};
|
|
149
|
+
|
|
150
|
+
} catch (error) {
|
|
151
|
+
console.error('Error extracting comments:', error);
|
|
152
|
+
throw error;
|
|
153
|
+
} finally {
|
|
154
|
+
await browser.close();
|
|
155
|
+
}
|
|
156
|
+
}
|
package/src/index.d.ts
CHANGED
|
@@ -19,4 +19,148 @@ export interface GetSubtitlesOptions {
|
|
|
19
19
|
* @param options - Configuration options
|
|
20
20
|
* @returns Promise that resolves to an array of subtitle segments
|
|
21
21
|
*/
|
|
22
|
-
export function getSubtitles(options: GetSubtitlesOptions): Promise<SubtitleSegment[]>;
|
|
22
|
+
export function getSubtitles(options: GetSubtitlesOptions): Promise<SubtitleSegment[]>;
|
|
23
|
+
|
|
24
|
+
// New types for channel videos
|
|
25
|
+
export interface VideoInfo {
|
|
26
|
+
/** YouTube video ID */
|
|
27
|
+
id: string;
|
|
28
|
+
/** Video title */
|
|
29
|
+
title: string;
|
|
30
|
+
/** View count text */
|
|
31
|
+
views: string;
|
|
32
|
+
/** Upload time text (e.g., "2 days ago") */
|
|
33
|
+
uploadTime: string;
|
|
34
|
+
/** Video duration text (e.g., "10:45") */
|
|
35
|
+
duration: string;
|
|
36
|
+
/** Thumbnail URL */
|
|
37
|
+
thumbnail: string;
|
|
38
|
+
/** Full YouTube video URL */
|
|
39
|
+
url: string;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
export interface ChannelInfo {
|
|
43
|
+
/** Channel name */
|
|
44
|
+
name: string;
|
|
45
|
+
/** Subscriber count text */
|
|
46
|
+
subscribers: string;
|
|
47
|
+
/** Total video count text */
|
|
48
|
+
videoCount: string;
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
export interface GetChannelVideosOptions {
|
|
52
|
+
/** Channel URL, @handle, or channel ID */
|
|
53
|
+
channelURL: string;
|
|
54
|
+
/** Maximum number of videos to return (default: 30) */
|
|
55
|
+
limit?: number;
|
|
56
|
+
/** Page token for pagination (optional) */
|
|
57
|
+
pageToken?: string | null;
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
export interface ChannelVideosResult {
|
|
61
|
+
/** Channel information */
|
|
62
|
+
channel: ChannelInfo;
|
|
63
|
+
/** Array of videos */
|
|
64
|
+
videos: VideoInfo[];
|
|
65
|
+
/** Total videos loaded */
|
|
66
|
+
totalLoaded: number;
|
|
67
|
+
/** Whether there are more videos available */
|
|
68
|
+
hasMore: boolean;
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
/**
|
|
72
|
+
* Get videos from a YouTube channel with pagination support
|
|
73
|
+
* @param options - Configuration options
|
|
74
|
+
* @returns Promise that resolves to channel videos result
|
|
75
|
+
*/
|
|
76
|
+
export function getChannelVideos(options: GetChannelVideosOptions): Promise<ChannelVideosResult>;
|
|
77
|
+
|
|
78
|
+
export interface SearchChannelVideosOptions {
|
|
79
|
+
/** Channel URL, @handle, or channel ID */
|
|
80
|
+
channelURL: string;
|
|
81
|
+
/** Search query */
|
|
82
|
+
query: string;
|
|
83
|
+
/** Maximum number of videos to return (default: 30) */
|
|
84
|
+
limit?: number;
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
export interface SearchChannelVideosResult {
|
|
88
|
+
/** Search query used */
|
|
89
|
+
query: string;
|
|
90
|
+
/** Array of matching videos */
|
|
91
|
+
results: VideoInfo[];
|
|
92
|
+
/** Total videos found */
|
|
93
|
+
totalFound: number;
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
/**
|
|
97
|
+
* Search for videos within a YouTube channel
|
|
98
|
+
* @param options - Configuration options
|
|
99
|
+
* @returns Promise that resolves to search results
|
|
100
|
+
*/
|
|
101
|
+
export function searchChannelVideos(options: SearchChannelVideosOptions): Promise<SearchChannelVideosResult>;
|
|
102
|
+
|
|
103
|
+
// Types for comments
|
|
104
|
+
export interface Comment {
|
|
105
|
+
/** Comment author name */
|
|
106
|
+
author: string;
|
|
107
|
+
/** Author channel URL */
|
|
108
|
+
authorUrl: string;
|
|
109
|
+
/** Author avatar URL */
|
|
110
|
+
authorAvatar: string;
|
|
111
|
+
/** Comment text */
|
|
112
|
+
text: string;
|
|
113
|
+
/** Time ago text (e.g., "2 days ago") */
|
|
114
|
+
time: string;
|
|
115
|
+
/** Like count */
|
|
116
|
+
likes: string;
|
|
117
|
+
/** Number of replies */
|
|
118
|
+
replyCount: string;
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
export interface VideoDetails {
|
|
122
|
+
/** Video ID */
|
|
123
|
+
id: string;
|
|
124
|
+
/** Video title */
|
|
125
|
+
title: string;
|
|
126
|
+
/** Channel information */
|
|
127
|
+
channel: {
|
|
128
|
+
name: string;
|
|
129
|
+
url: string;
|
|
130
|
+
};
|
|
131
|
+
/** View count text */
|
|
132
|
+
views: string;
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
/** Options accepted by getVideoComments. */
export interface GetVideoCommentsOptions {
|
|
136
|
+
/** YouTube video ID */
|
|
137
|
+
videoID: string;
|
|
138
|
+
/** Maximum number of comments to return (optional; default: 50) */
|
|
139
|
+
limit?: number;
|
|
140
|
+
/** Sort order: 'top' or 'newest' (optional; default: 'top') */
|
|
141
|
+
sortBy?: 'top' | 'newest';
|
|
142
|
+
/** Page token for pagination (optional; may also be null) */
|
|
143
|
+
pageToken?: string | null;
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
/** Result that the promise returned by getVideoComments resolves to. */
export interface VideoCommentsResult {
|
|
147
|
+
/** Video information */
|
|
148
|
+
video: VideoDetails;
|
|
149
|
+
/** Array of comments */
|
|
150
|
+
comments: Comment[];
|
|
151
|
+
/** Total comment count */
|
|
152
|
+
totalComments: number;
|
|
153
|
+
/** Total comments loaded */
|
|
154
|
+
totalLoaded: number;
|
|
155
|
+
/** Whether there are more comments available */
|
|
156
|
+
hasMore: boolean;
|
|
157
|
+
/** Sort order used: 'top' or 'newest' */
|
|
158
|
+
sortBy: 'top' | 'newest';
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
/**
|
|
162
|
+
* Get comments from a YouTube video with pagination support
|
|
163
|
+
* @param options - Video ID plus optional limit, sort order and page token (see GetVideoCommentsOptions)
|
|
164
|
+
* @returns Promise that resolves to a VideoCommentsResult
|
|
165
|
+
*/
|
|
166
|
+
export function getVideoComments(options: GetVideoCommentsOptions): Promise<VideoCommentsResult>;
|
package/src/index.js
CHANGED
|
@@ -1,24 +1,11 @@
|
|
|
1
|
-
import
|
|
2
|
-
import lodash from 'lodash';
|
|
3
|
-
import striptags from 'striptags';
|
|
4
|
-
import puppeteer from 'puppeteer';
|
|
5
|
-
|
|
6
|
-
const { find } = lodash;
|
|
1
|
+
import { createBrowser, createPage, handleCookieConsent, skipAds } from './utils/browser.js';
|
|
7
2
|
|
|
3
|
+
// Export existing function
|
|
8
4
|
export async function getSubtitles({ videoID, lang = 'en' }) {
|
|
9
|
-
const browser = await
|
|
10
|
-
headless: true,
|
|
11
|
-
args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080', '--disable-dev-shm-usage']
|
|
12
|
-
});
|
|
5
|
+
const browser = await createBrowser();
|
|
13
6
|
|
|
14
7
|
try {
|
|
15
|
-
const page = await browser
|
|
16
|
-
|
|
17
|
-
// Set viewport to a standard desktop size
|
|
18
|
-
await page.setViewport({ width: 1920, height: 1080 });
|
|
19
|
-
|
|
20
|
-
// Set a realistic user agent
|
|
21
|
-
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
|
|
8
|
+
const page = await createPage(browser);
|
|
22
9
|
|
|
23
10
|
// Navigate to the YouTube video page
|
|
24
11
|
console.error(`Navigating to https://youtube.com/watch?v=${videoID}`);
|
|
@@ -35,28 +22,10 @@ export async function getSubtitles({ videoID, lang = 'en' }) {
|
|
|
35
22
|
await new Promise(resolve => setTimeout(resolve, 5000));
|
|
36
23
|
|
|
37
24
|
// Handle cookie consent if present
|
|
38
|
-
|
|
39
|
-
const consentButton = await page.$('[aria-label*="Accept all"], [aria-label*="Accept cookies"], button:has-text("Accept all")');
|
|
40
|
-
if (consentButton) {
|
|
41
|
-
await consentButton.click();
|
|
42
|
-
console.error('Accepted cookies');
|
|
43
|
-
await new Promise(resolve => setTimeout(resolve, 1000));
|
|
44
|
-
}
|
|
45
|
-
} catch (e) {
|
|
46
|
-
// Cookie consent not present or already accepted
|
|
47
|
-
}
|
|
25
|
+
await handleCookieConsent(page);
|
|
48
26
|
|
|
49
27
|
// Skip ads if present
|
|
50
|
-
|
|
51
|
-
const skipButton = await page.$('.ytp-ad-skip-button, .ytp-skip-ad-button');
|
|
52
|
-
if (skipButton) {
|
|
53
|
-
await skipButton.click();
|
|
54
|
-
console.error('Skipped ad');
|
|
55
|
-
await new Promise(resolve => setTimeout(resolve, 2000));
|
|
56
|
-
}
|
|
57
|
-
} catch (e) {
|
|
58
|
-
// No skip button
|
|
59
|
-
}
|
|
28
|
+
await skipAds(page);
|
|
60
29
|
|
|
61
30
|
// Scroll down to load more content
|
|
62
31
|
await page.evaluate(() => window.scrollBy(0, 800));
|
|
@@ -285,4 +254,8 @@ export async function getSubtitles({ videoID, lang = 'en' }) {
|
|
|
285
254
|
} finally {
|
|
286
255
|
await browser.close();
|
|
287
256
|
}
|
|
288
|
-
}
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
// Export new functions
|
|
260
|
+
export { getChannelVideos, searchChannelVideos } from './channel.js';
|
|
261
|
+
export { getVideoComments } from './comments.js';
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import puppeteer from 'puppeteer';
|
|
2
|
+
|
|
3
|
+
/**
 * Launch a headless Puppeteer browser with the package's fixed launch flags.
 *
 * When the PUPPETEER_EXECUTABLE_PATH environment variable holds a non-empty
 * value, it is forwarded to Puppeteer as `executablePath`.
 *
 * @returns {Promise<object>} Whatever `puppeteer.launch` resolves to.
 */
export async function createBrowser() {
  const customExecutable = process.env.PUPPETEER_EXECUTABLE_PATH;

  const launchConfig = {
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080', '--disable-dev-shm-usage'],
  };

  // Only set the key when a path was actually provided.
  if (customExecutable) {
    launchConfig.executablePath = customExecutable;
  }

  return await puppeteer.launch(launchConfig);
}
|
|
16
|
+
|
|
17
|
+
/**
 * Open a new page on the given browser and configure it as a desktop client.
 *
 * @param {object} browser - Object exposing an async `newPage()` method.
 * @returns {Promise<object>} The page, after its viewport and user agent are set.
 */
export async function createPage(browser) {
  const desktopViewport = { width: 1920, height: 1080 };
  const desktopUserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

  const tab = await browser.newPage();
  await tab.setViewport(desktopViewport);
  await tab.setUserAgent(desktopUserAgent);

  return tab;
}
|
|
28
|
+
|
|
29
|
+
/**
 * Best-effort dismissal of a cookie consent dialog.
 *
 * Looks for an accept button by aria-label first, then falls back to the
 * first <button> whose text contains "Accept all". The previous selector list
 * included `button:has-text("Accept all")`, which is not valid CSS; a single
 * invalid entry makes the whole selector list fail, so the lookup always threw
 * and the dialog was never dismissed.
 *
 * @param {object} page - Puppeteer page (uses `$` and `evaluateHandle`).
 * @returns {Promise<void>} Resolves once the attempt is finished; never rejects.
 */
export async function handleCookieConsent(page) {
  try {
    let consentButton = await page.$('[aria-label*="Accept all"], [aria-label*="Accept cookies"]');

    if (!consentButton) {
      // Text match has to run in the page: plain CSS cannot select by text content.
      const handle = await page.evaluateHandle(() => {
        const buttons = Array.from(document.querySelectorAll('button'));
        return buttons.find((button) => (button.textContent || '').includes('Accept all')) ?? null;
      });
      consentButton = handle.asElement();
      if (!consentButton) {
        await handle.dispose();
      }
    }

    if (!consentButton) {
      return;
    }

    await consentButton.click();
    console.error('Accepted cookies');
    // Give the dialog a moment to close before the caller continues.
    await new Promise((resolve) => setTimeout(resolve, 1000));
  } catch (e) {
    // Consent handling is optional, so report the failure and carry on.
    console.error(`Cookie consent handling failed: ${e?.message ?? e}`);
  }
}
|
|
41
|
+
|
|
42
|
+
/**
 * Click the ad "skip" button if one is currently on the page.
 *
 * Any error raised while looking for or clicking the button is ignored.
 *
 * @param {object} page - Page object exposing an async `$(selector)` method.
 * @returns {Promise<void>}
 */
export async function skipAds(page) {
  const skipButtonSelector = '.ytp-ad-skip-button, .ytp-skip-ad-button';

  try {
    const button = await page.$(skipButtonSelector);
    if (!button) {
      return;
    }

    await button.click();
    console.error('Skipped ad');
    // Pause after the click before returning to the caller.
    await new Promise((done) => setTimeout(done, 2000));
  } catch {
    // No skip button
  }
}
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
/**
 * Collect the videos currently rendered as `ytd-rich-item-renderer` elements.
 *
 * Runs inside the page via `page.evaluate`, so the callback must stay
 * self-contained. Entries without both an id and a title are dropped.
 *
 * @param {object} page - Page object exposing `evaluate(fn)`.
 * @returns {Promise<Array<{id: string, title: string, views: string, uploadTime: string, duration: string, thumbnail: string, url: string}>>}
 */
export async function extractVideoData(page) {
  const collectVideos = () => {
    const items = Array.from(document.querySelectorAll('ytd-rich-item-renderer'));
    const collected = [];

    for (const item of items) {
      const anchor = item.querySelector('a#video-title-link');
      const href = anchor ? anchor.href : '';
      const idMatch = href.match(/watch\?v=([^&]+)/);
      const id = idMatch?.[1] || '';

      const titleNode = item.querySelector('#video-title');
      const title = titleNode ? titleNode.textContent.trim() : '';

      // First span is reported as views, last span as upload time.
      const metaLine = item.querySelector('#metadata-line');
      const spans = metaLine ? metaLine.querySelectorAll('span') : [];
      const firstSpan = spans[0];
      const lastSpan = spans[spans.length - 1];
      const views = firstSpan?.textContent || '';
      const uploadTime = lastSpan?.textContent || '';

      const durationNode = item.querySelector('ytd-thumbnail-overlay-time-status-renderer span');
      const duration = durationNode ? durationNode.textContent.trim() : '';

      const thumbnail = item.querySelector('img#img')?.src || '';

      if (id && title) {
        collected.push({
          id,
          title,
          views,
          uploadTime,
          duration,
          thumbnail,
          url: `https://youtube.com/watch?v=${id}`
        });
      }
    }

    return collected;
  };

  return await page.evaluate(collectVideos);
}
|
|
34
|
+
|
|
35
|
+
/**
 * Collect the comments currently rendered as `ytd-comment-thread-renderer`
 * elements.
 *
 * Runs inside the page via `page.evaluate`, so the callback must stay
 * self-contained. Threads without both text and an author are dropped.
 *
 * `replyCount` keeps the whole count token from the replies toggle text, so
 * "1.2K replies" yields "1.2K" and "1,234 replies" yields "1,234". The earlier
 * `/\d+/` match stopped at the first separator and reported "1" for both.
 *
 * @param {object} page - Page object exposing `evaluate(fn)`.
 * @returns {Promise<Array<{author: string, authorUrl: string, authorAvatar: string, text: string, time: string, likes: string, replyCount: string}>>}
 */
export async function extractCommentData(page) {
  return await page.evaluate(() => {
    // Trimmed text of the first match under `root`, or `fallback` when nothing matches.
    const trimmedText = (root, selector, fallback = '') => {
      const node = root.querySelector(selector);
      return node ? node.textContent.trim() : fallback;
    };

    const threads = document.querySelectorAll('ytd-comment-thread-renderer');
    return Array.from(threads).map((thread) => {
      const authorElement = thread.querySelector('#author-text');
      const avatarElement = thread.querySelector('#author-thumbnail img');
      const replyText = thread.querySelector('#more-replies')?.textContent ?? '';

      // Digits with optional ./, groups and an optional K/M/B suffix.
      const replyCount = replyText.match(/\d+(?:[.,]\d+)*(?:[KMB]\b)?/i)?.[0] || '0';

      return {
        author: authorElement ? authorElement.textContent.trim() : '',
        authorUrl: authorElement ? authorElement.href : '',
        authorAvatar: avatarElement ? avatarElement.src : '',
        text: trimmedText(thread, '#content-text'),
        time: trimmedText(thread, '#published-time-text'),
        likes: trimmedText(thread, '#vote-count-middle', '0'),
        replyCount
      };
    }).filter((comment) => comment.text && comment.author);
  });
}
|
|
71
|
+
|
|
72
|
+
/**
 * Read the channel name, subscriber count text and video count text from the
 * current page.
 *
 * Runs inside the page via `page.evaluate`, so the callback must stay
 * self-contained. Several selectors are tried for the name, and the first one
 * whose trimmed text is non-empty wins. Previously a match containing only
 * whitespace ended the search with an empty name.
 *
 * @param {object} page - Page object exposing `evaluate(fn)`.
 * @returns {Promise<{name: string, subscribers: string, videoCount: string}>}
 *   Each field is '' when no usable element is found.
 */
export async function extractChannelInfo(page) {
  return await page.evaluate(() => {
    // Try multiple selectors for channel name
    const nameSelectors = [
      'ytd-channel-name yt-formatted-string',
      '#channel-name yt-formatted-string',
      '.ytd-channel-name',
      '#text.ytd-channel-name',
      'yt-formatted-string.ytd-channel-name'
    ];

    // Trimmed text of the first element matching `selector`, or '' when missing.
    const trimmedText = (selector) => document.querySelector(selector)?.textContent?.trim() || '';

    let channelName = '';
    for (const selector of nameSelectors) {
      const candidate = trimmedText(selector);
      // Test the trimmed value so blank placeholders fall through to the next selector.
      if (candidate) {
        channelName = candidate;
        break;
      }
    }

    return {
      name: channelName,
      subscribers: trimmedText('#subscriber-count'),
      videoCount: trimmedText('#videos-count')
    };
  });
}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
 * Scroll the page window to the full height of the document element.
 *
 * @param {object} page - Page object exposing `evaluate(fn)`.
 * @returns {Promise<void>}
 */
export async function scrollToBottom(page) {
  // Self-contained because it is executed in the page context.
  const jumpToPageEnd = () => {
    const { scrollHeight } = document.documentElement;
    window.scrollTo(0, scrollHeight);
  };

  await page.evaluate(jumpToPageEnd);
}
|
|
6
|
+
|
|
7
|
+
/**
 * Scroll the first element matching `selector` into the centre of the view.
 * Does nothing when no element matches.
 *
 * @param {object} page - Page object exposing `evaluate(fn, ...args)`.
 * @param {string} selector - CSS selector of the element to bring into view.
 * @returns {Promise<void>}
 */
export async function scrollToElement(page, selector) {
  // Self-contained because it is executed in the page context.
  const centerOnMatch = (cssSelector) => {
    const target = document.querySelector(cssSelector);
    if (!target) {
      return;
    }
    target.scrollIntoView({ behavior: 'smooth', block: 'center' });
  };

  await page.evaluate(centerOnMatch, selector);
}
|
|
15
|
+
|
|
16
|
+
/**
 * Scroll to the bottom of the page, then poll until more elements matching
 * `itemSelector` exist than `currentCount`, or until `maxWaitTime` elapses.
 *
 * The delay between polls used to be fixed at 1000 ms. It is now the optional
 * `pollInterval` parameter, with the same default, so callers can poll faster
 * or slower without changing existing call sites.
 *
 * @param {object} page - Page object exposing `evaluate(fn, ...args)`.
 * @param {string} itemSelector - CSS selector of the items being counted.
 * @param {number} currentCount - Number of items known before scrolling.
 * @param {number} [maxWaitTime=5000] - Polling budget in milliseconds.
 * @param {number} [pollInterval=1000] - Delay before each count check, in milliseconds.
 * @returns {Promise<number>} The first observed count greater than
 *   `currentCount`, or `currentCount` when nothing new appeared in time.
 */
export async function scrollAndWaitForMore(page, itemSelector, currentCount, maxWaitTime = 5000, pollInterval = 1000) {
  await scrollToBottom(page);

  const deadline = Date.now() + maxWaitTime;
  while (Date.now() < deadline) {
    // Wait first: content needs time to load after the scroll.
    await new Promise((resolve) => setTimeout(resolve, pollInterval));

    const newCount = await page.evaluate(
      (selector) => document.querySelectorAll(selector).length,
      itemSelector
    );

    if (newCount > currentCount) {
      return newCount;
    }
  }

  return currentCount;
}
|
|
34
|
+
|
|
35
|
+
/**
 * Scroll down by 800 pixels, pause for two seconds, then wait up to ten
 * seconds for a `ytd-comments` element to appear.
 *
 * @param {object} page - Page object exposing `evaluate` and `waitForSelector`.
 * @returns {Promise<boolean>} true when the element appeared, false when the
 *   wait failed (the failure is reported on stderr).
 */
export async function scrollToLoadComments(page) {
  // Scroll down to trigger comment loading
  await page.evaluate(() => window.scrollBy(0, 800));
  await new Promise((done) => setTimeout(done, 2000));

  // Wait for comments section to appear
  let sectionFound = true;
  try {
    await page.waitForSelector('ytd-comments', { timeout: 10000 });
  } catch {
    console.error('Comments section not found');
    sectionFound = false;
  }

  return sectionFound;
}
|
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"permissions": {
|
|
3
|
-
"allow": [
|
|
4
|
-
"Bash(find:*)",
|
|
5
|
-
"Bash(npm test)",
|
|
6
|
-
"Bash(node:*)",
|
|
7
|
-
"Bash(npm run build:*)",
|
|
8
|
-
"WebFetch(domain:github.com)",
|
|
9
|
-
"Bash(npm install:*)",
|
|
10
|
-
"Bash(npm run test:*)",
|
|
11
|
-
"mcp__server-sequential-thinking__sequentialthinking",
|
|
12
|
-
"Bash(ls:*)",
|
|
13
|
-
"mcp__puppeteer__puppeteer_navigate",
|
|
14
|
-
"mcp__puppeteer__puppeteer_screenshot",
|
|
15
|
-
"mcp__puppeteer__puppeteer_evaluate",
|
|
16
|
-
"mcp__puppeteer__puppeteer_click",
|
|
17
|
-
"Bash(rm:*)",
|
|
18
|
-
"Bash(npm audit:*)",
|
|
19
|
-
"Bash(npm whoami:*)",
|
|
20
|
-
"Bash(npm publish:*)",
|
|
21
|
-
"Bash(npm view:*)"
|
|
22
|
-
],
|
|
23
|
-
"deny": []
|
|
24
|
-
}
|
|
25
|
-
}
|
package/test/index.test.js
DELETED
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
import { test } from 'node:test';
|
|
2
|
-
import assert from 'node:assert';
|
|
3
|
-
import { getSubtitles } from '../src/index.js';
|
|
4
|
-
|
|
5
|
-
test('Extract passive income video captions', async () => {
|
|
6
|
-
const captions = await getSubtitles({ videoID: 'JueUvj6X3DA' });
|
|
7
|
-
|
|
8
|
-
// Check that captions were extracted
|
|
9
|
-
assert(Array.isArray(captions), 'Captions should be an array');
|
|
10
|
-
assert(captions.length > 0, 'Should extract at least one caption');
|
|
11
|
-
|
|
12
|
-
// Check structure of first caption
|
|
13
|
-
const firstCaption = captions[0];
|
|
14
|
-
assert(typeof firstCaption.start === 'string', 'Start time should be a string');
|
|
15
|
-
assert(typeof firstCaption.dur === 'string', 'Duration should be a string');
|
|
16
|
-
assert(typeof firstCaption.text === 'string', 'Text should be a string');
|
|
17
|
-
|
|
18
|
-
// Check that the first caption contains expected content
|
|
19
|
-
assert(
|
|
20
|
-
firstCaption.text.toLowerCase().includes('creating passive income'),
|
|
21
|
-
`First caption should contain "creating passive income", got: "${firstCaption.text}"`
|
|
22
|
-
);
|
|
23
|
-
});
|