headless-youtube-captions 1.0.2 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,18 @@
1
1
  # Changelog
2
2
 
3
+ ## [1.2.0] - 2025-07-06
4
+
5
+ ### Added
6
+ - Docker support with `PUPPETEER_EXECUTABLE_PATH` environment variable
7
+ - Ability to specify custom Chrome/Chromium executable path
8
+ - Comprehensive Docker usage documentation with example Dockerfile
9
+ - npm badges in README (version, license, Node.js version)
10
+ - Features section highlighting key capabilities
11
+
12
+ ### Changed
13
+ - Enhanced README with better organization and documentation
14
+ - Updated Node.js requirement documentation to correctly show v18+
15
+
3
16
  ## [1.0.1] - 2025-06-10
4
17
 
5
18
  ### Added
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 andrewlwn77
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md CHANGED
@@ -1,6 +1,21 @@
1
1
  # Headless YouTube Captions
2
2
 
3
- > Extract YouTube video transcripts by interacting with YouTube's UI using Puppeteer
3
+ [![npm version](https://badge.fury.io/js/headless-youtube-captions.svg)](https://www.npmjs.com/package/headless-youtube-captions)
4
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
5
+ [![Node.js Version](https://img.shields.io/node/v/headless-youtube-captions.svg)](https://nodejs.org)
6
+
7
+ > Extract YouTube video transcripts, channel videos, and comments by interacting with YouTube's UI using Puppeteer
8
+
9
+ ## Features
10
+
11
+ - 🎯 Extract video transcripts/captions in multiple languages
12
+ - 📺 Get channel videos with pagination support
13
+ - 🔍 Search videos within a specific channel
14
+ - 💬 Extract video comments with sorting options
15
+ - 🐳 Docker support with configurable Chrome executable path
16
+ - 📦 Zero build dependencies - runs directly from source
17
+ - 🚀 Modern ES modules with async/await
18
+ - 🛡️ Handles cookie consent and ad skipping automatically
4
19
 
5
20
  ## Installation
6
21
 
@@ -12,7 +27,7 @@ yarn add headless-youtube-captions
12
27
 
13
28
  ## Usage
14
29
 
15
- ### ES6 / TypeScript
30
+ ### Extract Video Transcripts
16
31
  ```js
17
32
  import { getSubtitles } from 'headless-youtube-captions';
18
33
 
@@ -24,16 +39,42 @@ const captions = await getSubtitles({
24
39
  console.log(captions);
25
40
  ```
26
41
 
27
- ### ES5 / CommonJS
42
+ ### Get Channel Videos
28
43
  ```js
29
- const { getSubtitles } = require('headless-youtube-captions');
44
+ import { getChannelVideos } from 'headless-youtube-captions';
30
45
 
31
- getSubtitles({
32
- videoID: 'JueUvj6X3DA', // YouTube video ID
33
- lang: 'en' // Optional, default: 'en'
34
- }).then(captions => {
35
- console.log(captions);
46
+ const result = await getChannelVideos({
47
+ channelURL: '@mkbhd', // or full URL like 'https://youtube.com/@mkbhd'
48
+ limit: 30 // Optional, default: 30
49
+ });
50
+
51
+ console.log(result.videos);
52
+ ```
53
+
54
+ ### Search Channel Videos
55
+ ```js
56
+ import { searchChannelVideos } from 'headless-youtube-captions';
57
+
58
+ const result = await searchChannelVideos({
59
+ channelURL: '@mkbhd',
60
+ query: 'iphone review',
61
+ limit: 20 // Optional, default: 30
62
+ });
63
+
64
+ console.log(result.results);
65
+ ```
66
+
67
+ ### Get Video Comments
68
+ ```js
69
+ import { getVideoComments } from 'headless-youtube-captions';
70
+
71
+ const result = await getVideoComments({
72
+ videoID: 'JueUvj6X3DA',
73
+ limit: 50, // Optional, default: 50
74
+ sortBy: 'top' // Optional, 'top' or 'newest', default: 'top'
36
75
  });
76
+
77
+ console.log(result.comments);
37
78
  ```
38
79
 
39
80
  ## API
@@ -100,9 +141,58 @@ This library uses Puppeteer to:
100
141
 
101
142
  ## Requirements
102
143
 
103
- - Node.js 12 or higher
144
+ - Node.js 18 or higher (ES modules support required)
104
145
  - Puppeteer (installed as a dependency)
105
146
 
147
+ ## Docker Usage
148
+
149
+ When running in Docker containers, you may need to specify the Chrome executable path using the `PUPPETEER_EXECUTABLE_PATH` environment variable:
150
+
151
+ ```bash
152
+ # Set the environment variable
153
+ export PUPPETEER_EXECUTABLE_PATH=/usr/bin/google-chrome-stable
154
+
155
+ # Or run directly
156
+ PUPPETEER_EXECUTABLE_PATH=/usr/bin/google-chrome-stable node your-script.js
157
+ ```
158
+
159
+ Example Dockerfile configuration:
160
+ ```dockerfile
161
+ # Install Chrome dependencies
162
+ RUN apt-get update && apt-get install -y \
163
+ wget \
164
+ gnupg \
165
+ ca-certificates \
166
+ fonts-liberation \
167
+ libasound2 \
168
+ libatk-bridge2.0-0 \
169
+ libatk1.0-0 \
170
+ libatspi2.0-0 \
171
+ libcups2 \
172
+ libdbus-1-3 \
173
+ libdrm2 \
174
+ libgbm1 \
175
+ libgtk-3-0 \
176
+ libnspr4 \
177
+ libnss3 \
178
+ libxcomposite1 \
179
+ libxdamage1 \
180
+ libxfixes3 \
181
+ libxkbcommon0 \
182
+ libxrandr2 \
183
+ xdg-utils
184
+
185
+ # Install Chrome
186
+ RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
187
+ && echo "deb http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list \
188
+ && apt-get update \
189
+ && apt-get install -y google-chrome-stable \
190
+ && rm -rf /var/lib/apt/lists/*
191
+
192
+ # Set the Chrome executable path
193
+ ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/google-chrome-stable
194
+ ```
195
+
106
196
  ## Error Handling
107
197
 
108
198
  The function will throw an error if:
@@ -129,6 +219,119 @@ try {
129
219
  - The library respects YouTube's UI structure as of the last update
130
220
  - Some videos may not have transcripts available
131
221
 
222
+ ### `getChannelVideos(options)`
223
+
224
+ Extracts videos from a YouTube channel with pagination support.
225
+
226
+ #### Parameters
227
+
228
+ - `options` (Object):
229
+ - `channelURL` (String, required): Channel identifier (@handle, channel ID, or full URL)
230
+ - `limit` (Number, optional): Maximum videos to return. Default: `30`
231
+ - `pageToken` (String, optional): Reserved for future pagination support; currently ignored
232
+
233
+ #### Returns
234
+
235
+ ```js
236
+ {
237
+ channel: {
238
+ name: "Channel Name",
239
+ subscribers: "1.2M subscribers",
240
+ videoCount: "500 videos"
241
+ },
242
+ videos: [
243
+ {
244
+ id: "videoId123",
245
+ title: "Video Title",
246
+ views: "1.2M views",
247
+ uploadTime: "2 days ago",
248
+ duration: "10:45",
249
+ thumbnail: "https://...",
250
+ url: "https://youtube.com/watch?v=videoId123"
251
+ }
252
+ // ... more videos
253
+ ],
254
+ totalLoaded: 30,
255
+ hasMore: true
256
+ }
257
+ ```
258
+
259
+ ### `searchChannelVideos(options)`
260
+
261
+ Search for videos within a specific YouTube channel.
262
+
263
+ #### Parameters
264
+
265
+ - `options` (Object):
266
+ - `channelURL` (String, required): Channel identifier (@handle, channel ID, or full URL)
267
+ - `query` (String, required): Search query
268
+ - `limit` (Number, optional): Maximum results. Default: `30`
269
+
270
+ #### Returns
271
+
272
+ ```js
273
+ {
274
+ query: "iphone review",
275
+ results: [
276
+ {
277
+ id: "videoId123",
278
+ title: "iPhone 15 Review",
279
+ views: "2.5M views",
280
+ uploadTime: "1 week ago",
281
+ duration: "15:23",
282
+ thumbnail: "https://...",
283
+ url: "https://youtube.com/watch?v=videoId123"
284
+ }
285
+ // ... more results
286
+ ],
287
+ totalFound: 25
288
+ }
289
+ ```
290
+
291
+ ### `getVideoComments(options)`
292
+
293
+ Extract comments from a YouTube video with pagination support.
294
+
295
+ #### Parameters
296
+
297
+ - `options` (Object):
298
+ - `videoID` (String, required): YouTube video ID
299
+ - `limit` (Number, optional): Maximum comments to return. Default: `50`
300
+ - `sortBy` (String, optional): Sort order - `'top'` or `'newest'`. Default: `'top'`
301
+ - `pageToken` (String, optional): Reserved for future pagination support; currently ignored
302
+
303
+ #### Returns
304
+
305
+ ```js
306
+ {
307
+ video: {
308
+ id: "JueUvj6X3DA",
309
+ title: "Video Title",
310
+ channel: {
311
+ name: "Channel Name",
312
+ url: "https://youtube.com/@channel"
313
+ },
314
+ views: "1.5M views"
315
+ },
316
+ comments: [
317
+ {
318
+ author: "Username",
319
+ authorUrl: "https://youtube.com/@username",
320
+ authorAvatar: "https://...",
321
+ text: "Great video! Thanks for sharing...",
322
+ time: "2 days ago",
323
+ likes: "245",
324
+ replyCount: "12"
325
+ }
326
+ // ... more comments
327
+ ],
328
+ totalComments: 1566,
329
+ totalLoaded: 50,
330
+ hasMore: true,
331
+ sortBy: "top"
332
+ }
333
+ ```
334
+
132
335
  ## License
133
336
 
134
337
  MIT
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "headless-youtube-captions",
3
- "version": "1.0.2",
4
- "description": "Extract YouTube video transcripts using headless browser automation",
3
+ "version": "1.2.0",
4
+ "description": "Extract YouTube video transcripts, channel videos, and comments using headless browser automation",
5
5
  "main": "src/index.js",
6
6
  "types": "src/index.d.ts",
7
7
  "type": "module",
@@ -35,8 +35,12 @@
35
35
  "captions",
36
36
  "subtitles",
37
37
  "transcript",
38
+ "comments",
39
+ "channel",
40
+ "videos",
38
41
  "puppeteer",
39
42
  "headless",
40
- "scraper"
43
+ "scraper",
44
+ "api"
41
45
  ]
42
46
  }
package/src/channel.js ADDED
@@ -0,0 +1,243 @@
1
+ import { createBrowser, createPage, handleCookieConsent } from './utils/browser.js';
2
+ import { scrollAndWaitForMore } from './utils/scroll.js';
3
+ import { extractVideoData, extractChannelInfo } from './utils/extract.js';
4
+
5
/**
 * Resolves a channel identifier to the URL of that channel's "Videos" tab.
 *
 * Accepts a full URL, an `@handle`, a `UC…` channel ID, or a custom channel
 * name. For full URLs the `/videos` segment is appended to the *path*, so a
 * query string or fragment on the input (e.g. a `?si=…` share link) is kept
 * intact instead of having `/videos` glued onto its end.
 *
 * @param {string} channelURL - Channel identifier as supplied by the caller.
 * @returns {string} URL to navigate to.
 */
function resolveChannelVideosURL(channelURL) {
  if (channelURL.startsWith('http')) {
    try {
      const parsed = new URL(channelURL);
      if (parsed.pathname.includes('/videos')) {
        return channelURL;
      }
      parsed.pathname = parsed.pathname.replace(/\/?$/, '/videos');
      return parsed.href;
    } catch {
      // Not parseable as an absolute URL: fall back to plain string handling.
      return channelURL.includes('/videos') ? channelURL : channelURL.replace(/\/?$/, '/videos');
    }
  }
  if (channelURL.startsWith('@')) {
    return `https://youtube.com/${channelURL}/videos`;
  }
  if (channelURL.startsWith('UC')) {
    return `https://youtube.com/channel/${channelURL}/videos`;
  }
  return `https://youtube.com/c/${channelURL}/videos`;
}

/**
 * Collects videos from a channel's "Videos" tab, scrolling until `limit`
 * videos are rendered or no further videos appear.
 *
 * @param {object} options
 * @param {string} options.channelURL - Full URL, `@handle`, `UC…` channel ID or custom name.
 * @param {number} [options.limit=30] - Maximum number of videos to return.
 * @param {string|null} [options.pageToken=null] - Accepted for forward compatibility; currently unused.
 * @returns {Promise<{channel: object, videos: object[], totalLoaded: number, hasMore: boolean}>}
 * @throws {TypeError} If `channelURL` is not a non-empty string.
 */
export async function getChannelVideos({ channelURL, limit = 30, pageToken = null }) {
  // Validate before launching a browser so bad input fails fast and cheaply.
  if (typeof channelURL !== 'string' || channelURL.trim() === '') {
    throw new TypeError('channelURL must be a non-empty string');
  }
  const fullURL = resolveChannelVideosURL(channelURL.trim());

  const browser = await createBrowser();

  try {
    const page = await createPage(browser);

    console.error(`Navigating to ${fullURL}`);
    await page.goto(fullURL, {
      waitUntil: 'networkidle2',
      timeout: 60000
    });

    await handleCookieConsent(page);

    // Give dynamically rendered content a moment to settle.
    await new Promise(resolve => setTimeout(resolve, 3000));

    await page.waitForSelector('ytd-rich-item-renderer', { timeout: 30000 });
    console.error('Initial videos loaded');

    const channelInfo = await extractChannelInfo(page);

    let allVideos = [];
    let currentCount = 0;

    while (allVideos.length < limit) {
      // Each pass re-reads every rendered card, so the latest snapshot
      // replaces the previous one rather than being appended to it.
      allVideos = await extractVideoData(page);

      if (allVideos.length === currentCount) {
        break; // Scrolling produced nothing new.
      }
      currentCount = allVideos.length;

      if (currentCount < limit) {
        const newCount = await scrollAndWaitForMore(page, 'ytd-rich-item-renderer', currentCount);
        if (newCount === currentCount) {
          break; // No new videos loaded
        }
      }
    }

    const resultVideos = allVideos.slice(0, limit);

    console.error(`Successfully extracted ${resultVideos.length} videos`);

    return {
      channel: channelInfo,
      videos: resultVideos,
      totalLoaded: allVideos.length,
      hasMore: allVideos.length > limit
    };
  } catch (error) {
    console.error('Error extracting channel videos:', error);
    throw error;
  } finally {
    await browser.close();
  }
}
86
+
87
/**
 * Resolves a channel identifier to the URL of the channel's main page.
 *
 * For full URLs a trailing `/videos` path segment is removed; the segment is
 * stripped from the path itself so it is also found when the URL carries a
 * query string or fragment.
 *
 * @param {string} channelURL - Full URL, `@handle`, `UC…` channel ID or custom name.
 * @returns {string} URL to navigate to.
 */
function resolveChannelHomeURL(channelURL) {
  if (channelURL.startsWith('http')) {
    try {
      const parsed = new URL(channelURL);
      if (!/\/videos\/?$/.test(parsed.pathname)) {
        return channelURL;
      }
      parsed.pathname = parsed.pathname.replace(/\/videos\/?$/, '');
      return parsed.href;
    } catch {
      // Not parseable as an absolute URL: fall back to plain string handling.
      return channelURL.replace(/\/videos\/?$/, '');
    }
  }
  if (channelURL.startsWith('@')) {
    return `https://youtube.com/${channelURL}`;
  }
  if (channelURL.startsWith('UC')) {
    return `https://youtube.com/channel/${channelURL}`;
  }
  return `https://youtube.com/c/${channelURL}`;
}

/**
 * Clicks the channel's search button, trying known selectors first and then
 * falling back to locating the search icon from inside the page.
 *
 * @param {object} page - Puppeteer page showing the channel.
 * @returns {Promise<boolean>} Whether a search control was clicked.
 */
async function openChannelSearchBox(page) {
  const searchButtonSelectors = [
    'ytd-channel-header-renderer yt-icon-button[aria-label*="Search"]',
    'ytd-channel-header-renderer button[aria-label*="Search"]',
    '#channel-header yt-icon-button[aria-label*="Search"]',
    'yt-icon[icon="yt-icons:search"]'
  ];

  for (const selector of searchButtonSelectors) {
    try {
      const searchButton = await page.$(selector);
      if (!searchButton) {
        continue;
      }

      // Elements with an empty bounding box are treated as not visible.
      const isVisible = await searchButton.evaluate(el => {
        const rect = el.getBoundingClientRect();
        return rect.width > 0 && rect.height > 0;
      });

      if (isVisible) {
        await searchButton.click();
        console.error('Clicked search button');
        return true;
      }
    } catch (e) {
      // Best effort: a failing selector just means we try the next one.
    }
  }

  const clicked = await page.evaluate(() => {
    const icons = document.querySelectorAll('yt-icon');
    for (const icon of icons) {
      if (icon.getAttribute('icon') === 'yt-icons:search') {
        const button = icon.closest('button') || icon.closest('yt-icon-button');
        if (button) {
          button.click();
          return true;
        }
      }
    }
    return false;
  });

  if (clicked) {
    console.error('Clicked search icon');
  }
  return Boolean(clicked);
}

/**
 * Searches for videos inside a single channel by driving the channel page's
 * own search box and scraping the rendered results.
 *
 * @param {object} options
 * @param {string} options.channelURL - Full URL, `@handle`, `UC…` channel ID or custom name.
 * @param {string} options.query - Text to search for.
 * @param {number} [options.limit=30] - Maximum number of results to return.
 * @returns {Promise<{query: string, results: object[], totalFound: number}>}
 *   `totalFound` is the number of results scraped from the page before
 *   `limit` is applied.
 * @throws {TypeError} If `channelURL` or `query` is not a non-empty string.
 * @throws {Error} If the channel search button cannot be found.
 */
export async function searchChannelVideos({ channelURL, query, limit = 30 }) {
  // Validate before launching a browser so bad input fails fast and cheaply.
  if (typeof channelURL !== 'string' || channelURL.trim() === '') {
    throw new TypeError('channelURL must be a non-empty string');
  }
  if (typeof query !== 'string' || query.trim() === '') {
    throw new TypeError('query must be a non-empty string');
  }
  const fullURL = resolveChannelHomeURL(channelURL.trim());

  const browser = await createBrowser();

  try {
    const page = await createPage(browser);

    console.error(`Navigating to ${fullURL}`);
    await page.goto(fullURL, {
      waitUntil: 'networkidle2',
      timeout: 60000
    });

    await handleCookieConsent(page);

    // Wait for page to load
    await new Promise(resolve => setTimeout(resolve, 3000));

    const searchClicked = await openChannelSearchBox(page);
    if (!searchClicked) {
      throw new Error('Could not find channel search button');
    }

    // NOTE(review): this selector is not scoped to the channel header and may
    // also match a site-wide search input — confirm against the live page.
    await page.waitForSelector('input[placeholder*="Search"]', { timeout: 5000 });

    await page.type('input[placeholder*="Search"]', query);
    await page.keyboard.press('Enter');

    // Wait for search results
    await new Promise(resolve => setTimeout(resolve, 3000));
    await page.waitForSelector('ytd-video-renderer, ytd-rich-item-renderer', { timeout: 10000 });

    const searchResults = await page.evaluate(() => {
      // Prefer list-style results; fall back to grid-style cards.
      let videos = document.querySelectorAll('ytd-video-renderer');
      if (videos.length === 0) {
        videos = document.querySelectorAll('ytd-rich-item-renderer');
      }

      return Array.from(videos).map(video => {
        const link = video.querySelector('a#video-title, a#video-title-link');
        const href = link ? link.href : '';
        const videoId = href.match(/watch\?v=([^&]+)/)?.[1] || '';

        const titleElement = video.querySelector('#video-title');
        const title = titleElement ? titleElement.textContent.trim() : '';

        const viewsElement = video.querySelector('#metadata-line span:first-child, .view-count');
        const views = viewsElement ? viewsElement.textContent : '';

        const timeElement = video.querySelector('#metadata-line span:last-child, .published-time');
        const uploadTime = timeElement ? timeElement.textContent : '';

        const durationElement = video.querySelector('ytd-thumbnail-overlay-time-status-renderer span, .video-time');
        const duration = durationElement ? durationElement.textContent.trim() : '';

        const thumbnail = video.querySelector('img#img')?.src || '';

        return {
          id: videoId,
          title,
          views,
          uploadTime,
          duration,
          thumbnail,
          url: `https://youtube.com/watch?v=${videoId}`
        };
      }).filter(video => video.id && video.title); // drop cards without a watch link or title
    });

    console.error(`Found ${searchResults.length} videos matching "${query}"`);

    return {
      query,
      results: searchResults.slice(0, limit),
      totalFound: searchResults.length
    };
  } catch (error) {
    console.error('Error searching channel videos:', error);
    throw error;
  } finally {
    await browser.close();
  }
}
@@ -0,0 +1,156 @@
1
+ import { createBrowser, createPage, handleCookieConsent, skipAds } from './utils/browser.js';
2
+ import { scrollToLoadComments, scrollAndWaitForMore } from './utils/scroll.js';
3
+ import { extractCommentData } from './utils/extract.js';
4
+
5
/**
 * Switches the comment list to "Newest first" through the sort dropdown.
 * Best effort: if the dropdown cannot be found the default order is kept and
 * a diagnostic is logged.
 *
 * @param {object} page - Puppeteer page with the comments section loaded.
 * @returns {Promise<void>}
 */
async function switchCommentsToNewestFirst(page) {
  const sortMenuClicked = await page.evaluate(() => {
    const sortButton = document.querySelector('ytd-comments-header-renderer tp-yt-paper-dropdown-menu-light');
    if (sortButton) {
      sortButton.click();
      return true;
    }
    return false;
  });

  if (!sortMenuClicked) {
    console.error('Comment sort menu not found; keeping default order');
    return;
  }

  // Let the dropdown open before looking for its items.
  await new Promise(resolve => setTimeout(resolve, 1000));

  await page.evaluate(() => {
    const menuItems = document.querySelectorAll('tp-yt-paper-listbox tp-yt-paper-item');
    for (const item of menuItems) {
      if (item.textContent.includes('Newest') || item.textContent.includes('newest')) {
        item.click();
        break;
      }
    }
  });

  // Wait for comments to reload
  await new Promise(resolve => setTimeout(resolve, 3000));
}

/**
 * Collects comment threads from a video's watch page, scrolling until
 * `limit` threads are rendered or no further threads appear.
 *
 * @param {object} options
 * @param {string} options.videoID - YouTube video ID.
 * @param {number} [options.limit=50] - Maximum number of comments to return.
 * @param {string} [options.sortBy='top'] - `'newest'` switches the sort order; any other value keeps the default.
 * @param {string|null} [options.pageToken=null] - Accepted for forward compatibility; currently unused.
 * @returns {Promise<{video: object, comments: object[], totalComments: number, totalLoaded: number, hasMore: boolean, sortBy: string}>}
 * @throws {TypeError} If `videoID` is not a non-empty string.
 * @throws {Error} If the comments section cannot be loaded.
 */
export async function getVideoComments({ videoID, limit = 50, sortBy = 'top', pageToken = null }) {
  // Validate before launching a browser so bad input fails fast and cheaply.
  if (typeof videoID !== 'string' || videoID.trim() === '') {
    throw new TypeError('videoID must be a non-empty string');
  }

  const browser = await createBrowser();

  try {
    const page = await createPage(browser);

    console.error(`Navigating to https://youtube.com/watch?v=${videoID}`);
    await page.goto(`https://youtube.com/watch?v=${videoID}`, {
      waitUntil: 'networkidle2',
      timeout: 60000
    });

    await page.waitForSelector('#movie_player, video', { timeout: 30000 });
    console.error('Video player loaded');

    await handleCookieConsent(page);
    await skipAds(page);

    // Wait for page to stabilize
    await new Promise(resolve => setTimeout(resolve, 3000));

    const commentsLoaded = await scrollToLoadComments(page);
    if (!commentsLoaded) {
      throw new Error('Could not load comments section');
    }

    await page.waitForSelector('ytd-comment-thread-renderer', {
      timeout: 10000,
      visible: true
    });

    console.error('Comments section loaded');

    // The header text is reduced to its first run of digits/commas, with the
    // commas removed; '0' is used when no count can be read.
    const commentCount = await page.evaluate(() => {
      const countElement = document.querySelector('ytd-comments-header-renderer h2 yt-formatted-string');
      if (countElement) {
        const text = countElement.textContent;
        const match = text.match(/[\d,]+/);
        return match ? match[0].replace(/,/g, '') : '0';
      }
      return '0';
    });

    if (sortBy === 'newest') {
      await switchCommentsToNewestFirst(page);
    }

    let allComments = [];
    let currentCount = 0;

    while (allComments.length < limit) {
      // Each pass re-reads every rendered thread, so the latest snapshot
      // replaces the previous one rather than being appended to it.
      allComments = await extractCommentData(page);

      if (allComments.length === currentCount) {
        break; // Scrolling produced nothing new.
      }
      currentCount = allComments.length;

      if (currentCount < limit) {
        const newCount = await scrollAndWaitForMore(page, 'ytd-comment-thread-renderer', currentCount, 3000);
        if (newCount === currentCount) {
          break; // No new comments loaded
        }
      }
    }

    const resultComments = allComments.slice(0, limit);

    console.error(`Successfully extracted ${resultComments.length} comments`);

    const videoInfo = await page.evaluate(() => {
      const titleElement = document.querySelector('h1.ytd-video-primary-info-renderer');
      const title = titleElement ? titleElement.textContent.trim() : '';

      const channelElement = document.querySelector('ytd-channel-name a');
      const channelName = channelElement ? channelElement.textContent.trim() : '';
      const channelUrl = channelElement ? channelElement.href : '';

      const viewsElement = document.querySelector('.view-count');
      const views = viewsElement ? viewsElement.textContent : '';

      return {
        title,
        channel: {
          name: channelName,
          url: channelUrl
        },
        views
      };
    });

    return {
      video: {
        id: videoID,
        ...videoInfo
      },
      comments: resultComments,
      // Explicit radix: the count is always a base-10 digit string.
      totalComments: Number.parseInt(commentCount, 10),
      totalLoaded: allComments.length,
      hasMore: allComments.length > limit,
      sortBy
    };
  } catch (error) {
    console.error('Error extracting comments:', error);
    throw error;
  } finally {
    await browser.close();
  }
}
package/src/index.d.ts CHANGED
@@ -19,4 +19,148 @@ export interface GetSubtitlesOptions {
19
19
  * @param options - Configuration options
20
20
  * @returns Promise that resolves to an array of subtitle segments
21
21
  */
22
- export function getSubtitles(options: GetSubtitlesOptions): Promise<SubtitleSegment[]>;
22
+ export function getSubtitles(options: GetSubtitlesOptions): Promise<SubtitleSegment[]>;
23
+
24
+ // New types for channel videos
25
+ export interface VideoInfo {
26
+ /** YouTube video ID */
27
+ id: string;
28
+ /** Video title */
29
+ title: string;
30
+ /** View count text */
31
+ views: string;
32
+ /** Upload time text (e.g., "2 days ago") */
33
+ uploadTime: string;
34
+ /** Video duration text (e.g., "10:45") */
35
+ duration: string;
36
+ /** Thumbnail URL */
37
+ thumbnail: string;
38
+ /** Full YouTube video URL */
39
+ url: string;
40
+ }
41
+
42
+ export interface ChannelInfo {
43
+ /** Channel name */
44
+ name: string;
45
+ /** Subscriber count text */
46
+ subscribers: string;
47
+ /** Total video count text */
48
+ videoCount: string;
49
+ }
50
+
51
+ export interface GetChannelVideosOptions {
52
+ /** Channel URL, @handle, or channel ID */
53
+ channelURL: string;
54
+ /** Maximum number of videos to return (default: 30) */
55
+ limit?: number;
56
+ /** Page token for pagination (optional) */
57
+ pageToken?: string | null;
58
+ }
59
+
60
+ export interface ChannelVideosResult {
61
+ /** Channel information */
62
+ channel: ChannelInfo;
63
+ /** Array of videos */
64
+ videos: VideoInfo[];
65
+ /** Total videos loaded */
66
+ totalLoaded: number;
67
+ /** Whether there are more videos available */
68
+ hasMore: boolean;
69
+ }
70
+
71
+ /**
72
+ * Get videos from a YouTube channel with pagination support
73
+ * @param options - Configuration options
74
+ * @returns Promise that resolves to channel videos result
75
+ */
76
+ export function getChannelVideos(options: GetChannelVideosOptions): Promise<ChannelVideosResult>;
77
+
78
+ export interface SearchChannelVideosOptions {
79
+ /** Channel URL, @handle, or channel ID */
80
+ channelURL: string;
81
+ /** Search query */
82
+ query: string;
83
+ /** Maximum number of videos to return (default: 30) */
84
+ limit?: number;
85
+ }
86
+
87
+ export interface SearchChannelVideosResult {
88
+ /** Search query used */
89
+ query: string;
90
+ /** Array of matching videos */
91
+ results: VideoInfo[];
92
+ /** Total videos found */
93
+ totalFound: number;
94
+ }
95
+
96
+ /**
97
+ * Search for videos within a YouTube channel
98
+ * @param options - Configuration options
99
+ * @returns Promise that resolves to search results
100
+ */
101
+ export function searchChannelVideos(options: SearchChannelVideosOptions): Promise<SearchChannelVideosResult>;
102
+
103
+ // Types for comments
104
+ export interface Comment {
105
+ /** Comment author name */
106
+ author: string;
107
+ /** Author channel URL */
108
+ authorUrl: string;
109
+ /** Author avatar URL */
110
+ authorAvatar: string;
111
+ /** Comment text */
112
+ text: string;
113
+ /** Time ago text (e.g., "2 days ago") */
114
+ time: string;
115
+ /** Like count */
116
+ likes: string;
117
+ /** Number of replies */
118
+ replyCount: string;
119
+ }
120
+
121
+ export interface VideoDetails {
122
+ /** Video ID */
123
+ id: string;
124
+ /** Video title */
125
+ title: string;
126
+ /** Channel information */
127
+ channel: {
128
+ name: string;
129
+ url: string;
130
+ };
131
+ /** View count text */
132
+ views: string;
133
+ }
134
+
135
+ export interface GetVideoCommentsOptions {
136
+ /** YouTube video ID */
137
+ videoID: string;
138
+ /** Maximum number of comments to return (default: 50) */
139
+ limit?: number;
140
+ /** Sort order: 'top' or 'newest' (default: 'top') */
141
+ sortBy?: 'top' | 'newest';
142
+ /** Page token for pagination (optional) */
143
+ pageToken?: string | null;
144
+ }
145
+
146
+ export interface VideoCommentsResult {
147
+ /** Video information */
148
+ video: VideoDetails;
149
+ /** Array of comments */
150
+ comments: Comment[];
151
+ /** Total comment count */
152
+ totalComments: number;
153
+ /** Total comments loaded */
154
+ totalLoaded: number;
155
+ /** Whether there are more comments available */
156
+ hasMore: boolean;
157
+ /** Sort order used */
158
+ sortBy: 'top' | 'newest';
159
+ }
160
+
161
+ /**
162
+ * Get comments from a YouTube video with pagination support
163
+ * @param options - Configuration options
164
+ * @returns Promise that resolves to video comments result
165
+ */
166
+ export function getVideoComments(options: GetVideoCommentsOptions): Promise<VideoCommentsResult>;
package/src/index.js CHANGED
@@ -1,24 +1,11 @@
1
- import he from 'he';
2
- import lodash from 'lodash';
3
- import striptags from 'striptags';
4
- import puppeteer from 'puppeteer';
5
-
6
- const { find } = lodash;
1
+ import { createBrowser, createPage, handleCookieConsent, skipAds } from './utils/browser.js';
7
2
 
3
+ // Export existing function
8
4
  export async function getSubtitles({ videoID, lang = 'en' }) {
9
- const browser = await puppeteer.launch({
10
- headless: true,
11
- args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080', '--disable-dev-shm-usage']
12
- });
5
+ const browser = await createBrowser();
13
6
 
14
7
  try {
15
- const page = await browser.newPage();
16
-
17
- // Set viewport to a standard desktop size
18
- await page.setViewport({ width: 1920, height: 1080 });
19
-
20
- // Set a realistic user agent
21
- await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
8
+ const page = await createPage(browser);
22
9
 
23
10
  // Navigate to the YouTube video page
24
11
  console.error(`Navigating to https://youtube.com/watch?v=${videoID}`);
@@ -35,28 +22,10 @@ export async function getSubtitles({ videoID, lang = 'en' }) {
35
22
  await new Promise(resolve => setTimeout(resolve, 5000));
36
23
 
37
24
  // Handle cookie consent if present
38
- try {
39
- const consentButton = await page.$('[aria-label*="Accept all"], [aria-label*="Accept cookies"], button:has-text("Accept all")');
40
- if (consentButton) {
41
- await consentButton.click();
42
- console.error('Accepted cookies');
43
- await new Promise(resolve => setTimeout(resolve, 1000));
44
- }
45
- } catch (e) {
46
- // Cookie consent not present or already accepted
47
- }
25
+ await handleCookieConsent(page);
48
26
 
49
27
  // Skip ads if present
50
- try {
51
- const skipButton = await page.$('.ytp-ad-skip-button, .ytp-skip-ad-button');
52
- if (skipButton) {
53
- await skipButton.click();
54
- console.error('Skipped ad');
55
- await new Promise(resolve => setTimeout(resolve, 2000));
56
- }
57
- } catch (e) {
58
- // No skip button
59
- }
28
+ await skipAds(page);
60
29
 
61
30
  // Scroll down to load more content
62
31
  await page.evaluate(() => window.scrollBy(0, 800));
@@ -285,4 +254,8 @@ export async function getSubtitles({ videoID, lang = 'en' }) {
285
254
  } finally {
286
255
  await browser.close();
287
256
  }
288
- }
257
+ }
258
+
259
+ // Export new functions
260
+ export { getChannelVideos, searchChannelVideos } from './channel.js';
261
+ export { getVideoComments } from './comments.js';
@@ -0,0 +1,53 @@
1
+ import puppeteer from 'puppeteer';
2
+
3
/**
 * Launches a headless browser through Puppeteer.
 *
 * When the `PUPPETEER_EXECUTABLE_PATH` environment variable is set to a
 * non-empty value it is passed along as the browser executable to launch.
 *
 * @returns {Promise<object>} Whatever `puppeteer.launch` resolves to.
 */
export async function createBrowser() {
  const chromePath = process.env.PUPPETEER_EXECUTABLE_PATH;

  const launchOptions = {
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080', '--disable-dev-shm-usage'],
    // Only include the key when a path was actually provided.
    ...(chromePath ? { executablePath: chromePath } : {})
  };

  return await puppeteer.launch(launchOptions);
}
16
+
17
/**
 * Opens a new page in the given browser and configures it with a 1920x1080
 * viewport and a desktop Chrome user-agent string.
 *
 * @param {object} browser - Browser instance exposing `newPage()`.
 * @returns {Promise<object>} The configured page.
 */
export async function createPage(browser) {
  const desktopViewport = { width: 1920, height: 1080 };
  const desktopUserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

  const tab = await browser.newPage();
  await tab.setViewport(desktopViewport);
  await tab.setUserAgent(desktopUserAgent);

  return tab;
}
28
+
29
/**
 * Dismisses a cookie-consent dialog when one is showing.
 *
 * Best effort: any failure while probing or clicking is swallowed so that a
 * missing or already-dismissed dialog never aborts the caller.
 *
 * @param {object} page - Puppeteer page to inspect.
 * @returns {Promise<void>}
 */
export async function handleCookieConsent(page) {
  try {
    // Standard CSS only. The earlier selector list also contained
    // `button:has-text("Accept all")`, which is not valid CSS; one invalid
    // entry makes the whole list throw, and because the error was swallowed
    // the dialog was never dismissed.
    const consentButton = await page.$('[aria-label*="Accept all"], [aria-label*="Accept cookies"]');

    let accepted = false;
    if (consentButton) {
      await consentButton.click();
      accepted = true;
    } else {
      // Fallback for buttons that are only identifiable by their visible text.
      accepted = await page.evaluate(() => {
        const candidate = Array.from(document.querySelectorAll('button'))
          .find(button => (button.textContent || '').includes('Accept all'));
        if (!candidate) {
          return false;
        }
        candidate.click();
        return true;
      });
    }

    if (accepted) {
      console.error('Accepted cookies');
      // Give the dialog time to close before the caller continues.
      await new Promise(resolve => setTimeout(resolve, 1000));
    }
  } catch (e) {
    // Cookie consent not present or already accepted
  }
}
41
+
42
/**
 * Clicks the player's "skip ad" button if one is currently present, then
 * pauses briefly. Best effort: any error is ignored.
 *
 * @param {object} page - Puppeteer page showing the video player.
 * @returns {Promise<void>}
 */
export async function skipAds(page) {
  try {
    const adSkipHandle = await page.$('.ytp-ad-skip-button, .ytp-skip-ad-button');
    if (!adSkipHandle) {
      return;
    }

    await adSkipHandle.click();
    console.error('Skipped ad');
    await new Promise((done) => {
      setTimeout(done, 2000);
    });
  } catch (ignored) {
    // No skip button
  }
}
@@ -0,0 +1,101 @@
1
/**
 * Collect the video cards (`ytd-rich-item-renderer`) present on the page.
 *
 * Cards without a resolvable video id or without a title are dropped.
 *
 * @param {import('puppeteer').Page} page - Page to read from.
 * @returns {Promise<Array<{id: string, title: string, views: string,
 *   uploadTime: string, duration: string, thumbnail: string, url: string}>>}
 */
export async function extractVideoData(page) {
  // Everything inside the callback runs in the page context, so it must not
  // reference anything from this module's scope.
  return page.evaluate(() => {
    const trimmedText = (node) => (node ? node.textContent.trim() : '');
    const rawText = (node) => (node ? node.textContent || '' : '');

    const collected = [];
    for (const card of Array.from(document.querySelectorAll('ytd-rich-item-renderer'))) {
      const anchor = card.querySelector('a#video-title-link');
      const href = anchor ? anchor.href : '';
      const idMatch = href.match(/watch\?v=([^&]+)/);
      const id = (idMatch ? idMatch[1] : '') || '';

      const title = trimmedText(card.querySelector('#video-title'));

      // First span is reported as views, last span as upload time.
      const metaLine = card.querySelector('#metadata-line');
      const spans = metaLine ? metaLine.querySelectorAll('span') : [];
      const views = rawText(spans[0]);
      const uploadTime = rawText(spans[spans.length - 1]);

      const duration = trimmedText(
        card.querySelector('ytd-thumbnail-overlay-time-status-renderer span')
      );

      const image = card.querySelector('img#img');
      const thumbnail = (image ? image.src : '') || '';

      if (id && title) {
        collected.push({
          id,
          title,
          views,
          uploadTime,
          duration,
          thumbnail,
          url: `https://youtube.com/watch?v=${id}`
        });
      }
    }
    return collected;
  });
}
34
+
35
/**
 * Collect the comment threads (`ytd-comment-thread-renderer`) present on the
 * page. Threads without text or without an author are dropped.
 *
 * Every field is always a string:
 *   - `likes` is `'0'` both when the counter element is missing and when it
 *     exists but is blank (previously a blank counter produced `''`);
 *   - `authorUrl` is `''` when the author element has no `href`
 *     (previously `undefined`);
 *   - `replyCount` is the first run of digits in the "more replies" text,
 *     or `'0'` when there is none.
 *
 * @param {import('puppeteer').Page} page - Page to read from.
 * @returns {Promise<Array<{author: string, authorUrl: string,
 *   authorAvatar: string, text: string, time: string, likes: string,
 *   replyCount: string}>>}
 */
export async function extractCommentData(page) {
  // The callback runs in the page context, so helpers must be defined inside it.
  return await page.evaluate(() => {
    const textOf = (element) => (element?.textContent ?? '').trim();

    const threads = document.querySelectorAll('ytd-comment-thread-renderer');
    return Array.from(threads).map(thread => {
      const authorElement = thread.querySelector('#author-text');
      const author = textOf(authorElement);
      const authorUrl = authorElement?.href || '';

      const text = textOf(thread.querySelector('#content-text'));
      const time = textOf(thread.querySelector('#published-time-text'));

      // A present-but-blank counter is treated the same as a missing one.
      const likes = textOf(thread.querySelector('#vote-count-middle')) || '0';

      const replyText = thread.querySelector('#more-replies')?.textContent || '';
      const replyCount = replyText.match(/\d+/)?.[0] || '0';

      const authorAvatar = thread.querySelector('#author-thumbnail img')?.src || '';

      return {
        author,
        authorUrl,
        authorAvatar,
        text,
        time,
        likes,
        replyCount
      };
    }).filter(comment => comment.text && comment.author);
  });
}
71
+
72
/**
 * Read the channel name, subscriber count and video count from the page.
 *
 * The name is taken from the first selector in `nameSelectors` whose element
 * has non-blank text. Text is trimmed before the check, so an element that
 * contains only whitespace no longer stops the search with an empty name;
 * the remaining selectors are still tried.
 *
 * @param {import('puppeteer').Page} page - Page to read from.
 * @returns {Promise<{name: string, subscribers: string, videoCount: string}>}
 *   Each field is `''` when nothing usable was found.
 */
export async function extractChannelInfo(page) {
  return await page.evaluate(() => {
    // Try multiple selectors for channel name, in this order.
    const nameSelectors = [
      'ytd-channel-name yt-formatted-string',
      '#channel-name yt-formatted-string',
      '.ytd-channel-name',
      '#text.ytd-channel-name',
      'yt-formatted-string.ytd-channel-name'
    ];

    let channelName = '';
    for (const selector of nameSelectors) {
      const candidate = document.querySelector(selector)?.textContent?.trim();
      if (candidate) {
        channelName = candidate;
        break;
      }
    }

    const subscriberCount = document.querySelector('#subscriber-count')?.textContent?.trim() || '';
    const videoCount = document.querySelector('#videos-count')?.textContent?.trim() || '';

    return {
      name: channelName,
      subscribers: subscriberCount,
      videoCount: videoCount
    };
  });
}
@@ -0,0 +1,48 @@
1
/**
 * Scroll the window to the document's current full scroll height.
 *
 * @param {import('puppeteer').Page} page - Page to scroll.
 * @returns {Promise<void>}
 */
export async function scrollToBottom(page) {
  await page.evaluate(() => {
    const { scrollHeight } = document.documentElement;
    window.scrollTo(0, scrollHeight);
  });
}
6
+
7
/**
 * Scroll the first element matching `selector` into the centre of the view,
 * requesting smooth scrolling. Does nothing when no element matches.
 *
 * @param {import('puppeteer').Page} page - Page to scroll.
 * @param {string} selector - CSS selector of the target element.
 * @returns {Promise<void>}
 */
export async function scrollToElement(page, selector) {
  // Runs in the page context; the selector is passed in as an argument.
  const centreOn = (sel) => {
    const target = document.querySelector(sel);
    if (!target) {
      return;
    }
    target.scrollIntoView({ behavior: 'smooth', block: 'center' });
  };

  await page.evaluate(centreOn, selector);
}
15
+
16
/**
 * Scroll to the bottom of the page, then poll once per second until more
 * elements match `itemSelector` than `currentCount`, or until `maxWaitTime`
 * has elapsed.
 *
 * The time limit is checked before each one-second pause, so the call can
 * run up to about one second past `maxWaitTime`.
 *
 * @param {import('puppeteer').Page} page - Page to scroll and inspect.
 * @param {string} itemSelector - CSS selector of the items being counted.
 * @param {number} currentCount - Number of items already known.
 * @param {number} [maxWaitTime=5000] - Time budget in milliseconds.
 * @returns {Promise<number>} The new item count if it grew, otherwise
 *   `currentCount` unchanged.
 */
export async function scrollAndWaitForMore(page, itemSelector, currentCount, maxWaitTime = 5000) {
  await scrollToBottom(page);

  const startedAt = Date.now();
  const elapsed = () => Date.now() - startedAt;
  const pause = (ms) => new Promise((done) => setTimeout(done, ms));
  // Runs in the page context; the selector is passed in as an argument.
  const countMatches = (selector) => document.querySelectorAll(selector).length;

  for (;;) {
    if (!(elapsed() < maxWaitTime)) {
      return currentCount;
    }

    await pause(1000);

    const latestCount = await page.evaluate(countMatches, itemSelector);
    if (latestCount > currentCount) {
      return latestCount;
    }
  }
}
34
+
35
/**
 * Scroll down by 800 pixels, pause for two seconds, then wait up to ten
 * seconds for a `ytd-comments` element to appear.
 *
 * @param {import('puppeteer').Page} page - Page to scroll.
 * @returns {Promise<boolean>} `true` once `ytd-comments` is present, `false`
 *   if waiting for it failed (the failure is logged, not thrown).
 */
export async function scrollToLoadComments(page) {
  const pause = (ms) => new Promise((done) => setTimeout(done, ms));

  // Scroll first, then give the page a moment before waiting on the selector.
  await page.evaluate(() => window.scrollBy(0, 800));
  await pause(2000);

  let commentsFound = true;
  try {
    await page.waitForSelector('ytd-comments', { timeout: 10000 });
  } catch (e) {
    console.error('Comments section not found');
    commentsFound = false;
  }
  return commentsFound;
}
@@ -1,25 +0,0 @@
1
- {
2
- "permissions": {
3
- "allow": [
4
- "Bash(find:*)",
5
- "Bash(npm test)",
6
- "Bash(node:*)",
7
- "Bash(npm run build:*)",
8
- "WebFetch(domain:github.com)",
9
- "Bash(npm install:*)",
10
- "Bash(npm run test:*)",
11
- "mcp__server-sequential-thinking__sequentialthinking",
12
- "Bash(ls:*)",
13
- "mcp__puppeteer__puppeteer_navigate",
14
- "mcp__puppeteer__puppeteer_screenshot",
15
- "mcp__puppeteer__puppeteer_evaluate",
16
- "mcp__puppeteer__puppeteer_click",
17
- "Bash(rm:*)",
18
- "Bash(npm audit:*)",
19
- "Bash(npm whoami:*)",
20
- "Bash(npm publish:*)",
21
- "Bash(npm view:*)"
22
- ],
23
- "deny": []
24
- }
25
- }
@@ -1,23 +0,0 @@
1
- import { test } from 'node:test';
2
- import assert from 'node:assert';
3
- import { getSubtitles } from '../src/index.js';
4
-
5
// End-to-end check: fetches captions for a fixed video id through
// getSubtitles and validates both the result's shape and the opening text.
test('Extract passive income video captions', async () => {
  const captions = await getSubtitles({ videoID: 'JueUvj6X3DA' });

  // The result must be a non-empty array.
  assert(Array.isArray(captions), 'Captions should be an array');
  assert(captions.length > 0, 'Should extract at least one caption');

  // Every field of the first entry is expected to be a string.
  const firstCaption = captions[0];
  const { start, dur, text } = firstCaption;
  assert(typeof start === 'string', 'Start time should be a string');
  assert(typeof dur === 'string', 'Duration should be a string');
  assert(typeof text === 'string', 'Text should be a string');

  // The opening caption should mention the expected phrase (case-insensitive).
  const mentionsTopic = text.toLowerCase().includes('creating passive income');
  assert(
    mentionsTopic,
    `First caption should contain "creating passive income", got: "${firstCaption.text}"`
  );
});