headless-youtube-captions 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(find:*)",
5
+ "Bash(npm test)",
6
+ "Bash(node:*)",
7
+ "Bash(npm run build:*)",
8
+ "WebFetch(domain:github.com)",
9
+ "Bash(npm install:*)",
10
+ "Bash(npm run test:*)",
11
+ "mcp__server-sequential-thinking__sequentialthinking",
12
+ "Bash(ls:*)",
13
+ "mcp__puppeteer__puppeteer_navigate",
14
+ "mcp__puppeteer__puppeteer_screenshot",
15
+ "mcp__puppeteer__puppeteer_evaluate",
16
+ "mcp__puppeteer__puppeteer_click",
17
+ "Bash(rm:*)",
18
+ "Bash(npm audit:*)",
19
+ "Bash(npm whoami:*)",
20
+ "Bash(npm publish:*)"
21
+ ],
22
+ "deny": []
23
+ }
24
+ }
package/CHANGELOG.md ADDED
@@ -0,0 +1,24 @@
1
+ # Changelog
2
+
3
+ ## [1.0.0] - 2025-01-06
4
+
5
+ ### Added
6
+ - Complete rewrite using Puppeteer for headless browser automation
7
+ - UI-based transcript extraction by clicking "Show transcript" button
8
+ - ES modules support with Node.js 18+ requirement
9
+ - Built-in Node.js test runner for testing
10
+ - Zero build dependencies - runs directly from source
11
+
12
+ ### Changed
13
+ - Project renamed from `youtube-captions-scraper` to `headless-youtube-captions`
14
+ - Switched from API-based extraction to UI automation approach
15
+ - Removed all build tools (Babel, Flow, ESLint configurations)
16
+ - Simplified to 4 core dependencies only
17
+ - Updated to modern ES6+ syntax without transpilation
18
+
19
+ ### Removed
20
+ - Legacy API-based caption extraction method
21
+ - Babel build pipeline and configuration
22
+ - Flow type checking
23
+ - Complex dev dependency chain
24
+ - Vulnerability-prone legacy dependencies
package/README.md ADDED
@@ -0,0 +1,134 @@
1
+ # Headless YouTube Captions
2
+
3
+ > Extract YouTube video transcripts by interacting with YouTube's UI using Puppeteer
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ npm install -S headless-youtube-captions
9
+ # OR
10
+ yarn add headless-youtube-captions
11
+ ```
12
+
13
+ ## Usage
14
+
15
+ ### ES6 / TypeScript
16
+ ```js
17
+ import { getSubtitles } from 'headless-youtube-captions';
18
+
19
+ const captions = await getSubtitles({
20
+ videoID: 'JueUvj6X3DA', // YouTube video ID
21
+ lang: 'en' // Optional, default: 'en'
22
+ });
23
+
24
+ console.log(captions);
25
+ ```
26
+
27
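+ ### CommonJS (via dynamic `import()`)
28
+ ```js
29
+ const getSubtitles = (options) => import('headless-youtube-captions').then((m) => m.getSubtitles(options)); // ESM-only package: load it with dynamic import() from CommonJS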
30
+
31
+ getSubtitles({
32
+ videoID: 'JueUvj6X3DA', // YouTube video ID
33
+ lang: 'en' // Optional, default: 'en'
34
+ }).then(captions => {
35
+ console.log(captions);
36
+ });
37
+ ```
38
+
39
+ ## API
40
+
41
+ ### `getSubtitles(options)`
42
+
43
+ Extracts captions/transcripts from a YouTube video by automating browser interactions.
44
+
45
+ #### Parameters
46
+
47
+ - `options` (Object):
48
+ - `videoID` (String, required): The YouTube video ID
49
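+ - `lang` (String, optional): Language code for captions (e.g. `'en'`, `'de'`, `'fr'`). Default: `'en'`. Note: the current UI-based implementation accepts this option but does not use it; the transcript that YouTube displays for the video is returned.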
50
+
51
+ #### Returns
52
+
53
+ A Promise that resolves to an array of caption objects.
54
+
55
+ #### Caption Object Format
56
+
57
+ Each caption object contains:
58
+
59
+ ```js
60
+ {
61
+ "start": "0", // Start time in seconds (as string)
62
+ "dur": "3.0", // Duration in seconds (as string)
63
+ "text": "Caption text here" // The actual caption text
64
+ }
65
+ ```
66
+
67
+ #### Example Response
68
+
69
+ ```js
70
+ [
71
+ {
72
+ "start": "0",
73
+ "dur": "3.0",
74
+ "text": "- Creating passive income takes work,"
75
+ },
76
+ {
77
+ "start": "3",
78
+ "dur": "2.0",
79
+ "text": "but once you implement those processes,"
80
+ },
81
+ {
82
+ "start": "5",
83
+ "dur": "3.0",
84
+ "text": "it's one of the most fruitful income sources"
85
+ }
86
+ // ... more captions
87
+ ]
88
+ ```
89
+
90
+ ## How It Works
91
+
92
+ This library uses Puppeteer to:
93
+
94
+ 1. Navigate to the YouTube video page
95
+ 2. Handle cookie consent and ads if present
96
+ 3. Click the "Show transcript" button in the video description
97
+ 4. Extract transcript segments from the opened transcript panel
98
+ 5. Parse timestamps and text content
99
+ 6. Calculate proper durations for each caption segment
100
+
101
+ ## Requirements
102
+
103
+ - Node.js 18 or higher
104
+ - Puppeteer (installed as a dependency)
105
+
106
+ ## Error Handling
107
+
108
+ The function will throw an error if:
109
+ - The video ID is invalid or the video doesn't exist
110
+ - The video has no available captions/transcripts
111
+ - The "Show transcript" button cannot be found
112
+ - Network issues prevent loading the page
113
+
114
+ Example error handling:
115
+
116
+ ```js
117
+ try {
118
+ const captions = await getSubtitles({ videoID: 'XXXXX' });
119
+ console.log(captions);
120
+ } catch (error) {
121
+ console.error('Failed to extract captions:', error.message);
122
+ }
123
+ ```
124
+
125
+ ## Notes
126
+
127
+ - The library runs Puppeteer in headless mode by default
128
+ - Extraction time depends on video page load time and transcript length
129
+ - The library respects YouTube's UI structure as of the last update
130
+ - Some videos may not have transcripts available
131
+
132
+ ## License
133
+
134
+ MIT
package/package.json ADDED
@@ -0,0 +1,41 @@
1
+ {
2
+ "name": "headless-youtube-captions",
3
+ "version": "1.0.0",
4
+ "description": "Extract YouTube video transcripts using headless browser automation",
5
+ "main": "src/index.js",
6
+ "type": "module",
7
+ "engines": {
8
+ "node": ">=18.0.0"
9
+ },
10
+ "author": {
11
+ "name": "andrewlwn77"
12
+ },
13
+ "repository": {
14
+ "type": "git",
15
+ "url": "https://github.com/andrewlwn77/headless-youtube-captions.git"
16
+ },
17
+ "homepage": "https://github.com/andrewlwn77/headless-youtube-captions",
18
+ "bugs": {
19
+ "url": "https://github.com/andrewlwn77/headless-youtube-captions/issues"
20
+ },
21
+ "license": "MIT",
22
+ "scripts": {
23
+ "test": "node --test test/*.test.js"
24
+ },
25
+ "dependencies": {
26
+ "he": "^1.2.0",
27
+ "lodash": "^4.17.21",
28
+ "puppeteer": "^24.10.0",
29
+ "striptags": "^3.2.0"
30
+ },
31
+ "devDependencies": {},
32
+ "keywords": [
33
+ "youtube",
34
+ "captions",
35
+ "subtitles",
36
+ "transcript",
37
+ "puppeteer",
38
+ "headless",
39
+ "scraper"
40
+ ]
41
+ }
package/src/index.js ADDED
@@ -0,0 +1,290 @@
1
+ import he from 'he';
2
+ import lodash from 'lodash';
3
+ import striptags from 'striptags';
4
+ import puppeteer from 'puppeteer';
5
+
6
+ const { find } = lodash;
7
+
8
/** Resolves after `ms` milliseconds; used for the fixed settle delays between UI steps. */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Clicks the first element matched by any selector in `selectors`.
 *
 * Every selector is queried on its own, so a selector that the page rejects
 * (or that matches nothing) cannot stop the remaining ones from being tried.
 *
 * @param {object} page - Puppeteer page.
 * @param {string[]} selectors - CSS selectors, tried in order.
 * @param {{ requireVisible?: boolean }} [options] - When `requireVisible` is
 *   true, elements with an empty bounding box are skipped.
 * @returns {Promise<string|null>} The selector that was clicked, or `null`.
 */
async function clickFirstMatch(page, selectors, { requireVisible = false } = {}) {
  for (const selector of selectors) {
    try {
      const handle = await page.$(selector);
      if (!handle) {
        continue;
      }

      if (requireVisible) {
        const hasSize = await handle.evaluate((el) => {
          const rect = el.getBoundingClientRect();
          return rect.width > 0 && rect.height > 0;
        });
        if (!hasSize) {
          continue;
        }
      }

      await handle.click();
      return selector;
    } catch {
      // Deliberate best effort: these are optional UI elements, try the next selector.
    }
  }
  return null;
}

/**
 * Clicks the first `button` / `yt-button-shape` whose text content or
 * `aria-label` contains one of `needles` (case-insensitive).
 *
 * @param {object} page - Puppeteer page.
 * @param {string[]} needles - Substrings to look for.
 * @returns {Promise<boolean>} Whether an element was clicked.
 */
function clickButtonByText(page, needles) {
  return page.evaluate(
    (wanted) => {
      const candidates = Array.from(document.querySelectorAll('button, yt-button-shape'));
      for (const candidate of candidates) {
        const text = (candidate.textContent || '').toLowerCase();
        const label = (candidate.getAttribute('aria-label') || '').toLowerCase();
        if (wanted.some((needle) => text.includes(needle) || label.includes(needle))) {
          candidate.click();
          return true;
        }
      }
      return false;
    },
    needles.map((needle) => needle.toLowerCase())
  );
}

/**
 * Runs inside the browser page (it is serialized by `page.evaluate`, so it
 * must not reference anything from module scope). Reads the rendered
 * transcript panel and returns one `{ start, text }` record per segment.
 *
 * @returns {{ start: string, text: string }[]} `start` is whole seconds as a string.
 */
function extractSegmentsInPage() {
  // Matches m:ss as well as h:mm:ss so a bare timestamp is never taken for caption text.
  const TIMESTAMP_ONLY = /^\d+(?::\d+)+$/;

  let segments = document.querySelectorAll('ytd-transcript-segment-renderer');
  if (segments.length === 0) {
    // Fall back to a looser class-based match.
    segments = document.querySelectorAll('[class*="transcript"][class*="segment"]');
  }

  // Returns the trimmed text of the first selector match that `accept` approves, else ''.
  const firstText = (root, selectors, accept) => {
    for (const selector of selectors) {
      const content = root.querySelector(selector)?.textContent?.trim();
      if (content && accept(content)) {
        return content;
      }
    }
    return '';
  };

  return Array.from(segments, (segment) => {
    const timestampText = firstText(
      segment,
      [
        '.segment-timestamp',
        '[class*="timestamp"]',
        '.ytd-transcript-segment-renderer:first-child',
        'div:first-child'
      ],
      (content) => /\d+:\d+/.test(content)
    );

    let text = firstText(
      segment,
      [
        '.segment-text',
        'yt-formatted-string.segment-text',
        '[class*="segment-text"]',
        'yt-formatted-string:last-child',
        '.ytd-transcript-segment-renderer:last-child'
      ],
      (content) => !TIMESTAMP_ONLY.test(content)
    );

    // Last resort: take the segment's whole text and strip the timestamp from it.
    if (!text) {
      text = (segment.textContent || '').replace(timestampText, '').trim();
    }

    // "h:mm:ss" / "m:ss" -> seconds; reversing puts seconds at index 0 so the weight is 60^index.
    const startSeconds = timestampText.includes(':')
      ? timestampText
          .split(':')
          .reverse()
          .reduce((total, part, index) => total + (Number.parseInt(part, 10) || 0) * 60 ** index, 0)
      : 0;

    return { start: String(startSeconds), text };
  }).filter((item) => item.text);
}

/**
 * Adds a `dur` field to each segment: the gap to the next segment's start,
 * or "3.0" for the last segment (there is no following start to measure against).
 *
 * @param {{ start: string, text: string }[]} segments
 * @returns {{ start: string, dur: string, text: string }[]}
 */
function withDurations(segments) {
  return segments.map(({ start, text }, index) => {
    const next = segments[index + 1];
    const dur = next ? (parseFloat(next.start) - parseFloat(start)).toFixed(1) : '3.0';
    return { start, dur, text };
  });
}

/**
 * Extracts the transcript of a YouTube video by driving the watch page in a
 * headless browser: it opens the page, dismisses the cookie banner and a
 * skippable ad if present, expands the description, clicks "Show transcript"
 * and reads the rendered transcript panel.
 *
 * @param {object} options
 * @param {string} options.videoID - YouTube video ID (required, non-empty).
 * @param {string} [options.lang='en'] - Accepted for API compatibility. The
 *   current implementation does not use it; whichever transcript the page
 *   shows is returned.
 * @returns {Promise<{ start: string, dur: string, text: string }[]>} Captions
 *   in page order; `start` and `dur` are seconds encoded as strings.
 * @throws {TypeError} If `videoID` is missing or not a non-empty string
 *   (checked before any browser is launched).
 * @throws {Error} If the transcript button or transcript segments cannot be
 *   found, or if navigation / selector waits time out.
 */
export async function getSubtitles({ videoID, lang = 'en' } = {}) {
  // Fail fast: launching Chromium just to discover a missing ID is wasteful.
  if (typeof videoID !== 'string' || videoID.trim() === '') {
    throw new TypeError('getSubtitles: "videoID" must be a non-empty string');
  }

  // Build the URL through URL/searchParams so the ID is always correctly encoded.
  const watchUrl = new URL('https://youtube.com/watch');
  watchUrl.searchParams.set('v', videoID);

  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--window-size=1920,1080', '--disable-dev-shm-usage']
  });

  try {
    const page = await browser.newPage();

    // Desktop-sized viewport and a desktop user agent.
    await page.setViewport({ width: 1920, height: 1080 });
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');

    console.log(`Navigating to ${watchUrl.href}`);
    await page.goto(watchUrl.href, {
      waitUntil: 'networkidle2',
      timeout: 60000
    });

    await page.waitForSelector('#movie_player, video', { timeout: 30000 });
    console.log('Video player loaded');

    // Give dynamically loaded content time to appear.
    await sleep(5000);

    // Cookie consent (optional). Selectors are tried one by one; the text
    // fallback replaces the former `:has-text()` selector, which is not valid
    // for page.$ and made the whole lookup throw.
    try {
      const consentClicked =
        (await clickFirstMatch(page, ['[aria-label*="Accept all"]', '[aria-label*="Accept cookies"]'])) !== null ||
        (await clickButtonByText(page, ['Accept all']));
      if (consentClicked) {
        console.log('Accepted cookies');
        await sleep(1000);
      }
    } catch {
      // Cookie consent not present or already accepted.
    }

    // Skippable ad (optional).
    if ((await clickFirstMatch(page, ['.ytp-ad-skip-button, .ytp-skip-ad-button'])) !== null) {
      console.log('Skipped ad');
      await sleep(2000);
    }

    // Scroll so the description area below the player is rendered.
    await page.evaluate(() => window.scrollBy(0, 800));
    await sleep(2000);

    // Expand the description if it is collapsed.
    const expandedWith = await clickFirstMatch(
      page,
      [
        'tp-yt-paper-button#expand',
        'tp-yt-paper-button[id="expand"]',
        '#expand',
        '#more',
        '[aria-label*="more"]'
      ],
      { requireVisible: true }
    );
    if (expandedWith !== null) {
      console.log('Clicked "more" button');
      await sleep(1000);
    }

    console.log('Looking for "Show transcript" button...');
    const transcriptButtonSelectors = [
      'button[aria-label="Show transcript"]',
      'yt-button-shape button[aria-label="Show transcript"]',
      'button[title*="transcript" i]',
      'button[aria-label*="transcript" i]',
      'yt-button-shape[aria-label*="transcript" i]',
      '#button[aria-label*="transcript" i]',
      'ytd-button-renderer[aria-label*="transcript" i]'
    ];

    let transcriptClicked = false;
    for (const selector of transcriptButtonSelectors) {
      try {
        await page.waitForSelector(selector, { timeout: 3000, visible: true });
        await page.click(selector);
        console.log(`Clicked transcript button with selector: ${selector}`);
        transcriptClicked = true;
        break;
      } catch {
        // Selector did not appear in time; try the next one.
      }
    }

    if (!transcriptClicked) {
      console.log('Trying to find transcript button by text...');
      if (await clickButtonByText(page, ['transcript'])) {
        console.log('Clicked transcript button by text search');
        transcriptClicked = true;
      }
    }

    if (!transcriptClicked) {
      throw new Error('Could not find or click "Show transcript" button');
    }

    console.log('Waiting for transcript panel...');
    await sleep(3000);

    await page.waitForSelector('ytd-transcript-segment-renderer, ytd-transcript-body-renderer', {
      timeout: 10000,
      visible: true
    });

    console.log('Extracting transcript content...');
    const transcriptData = await page.evaluate(extractSegmentsInPage);

    if (!transcriptData || transcriptData.length === 0) {
      throw new Error('No transcript data extracted');
    }

    console.log(`Successfully extracted ${transcriptData.length} transcript segments`);
    console.log('First segment:', transcriptData[0]);

    return withDurations(transcriptData);
  } catch (error) {
    console.error('Error extracting subtitles:', error);
    throw error;
  } finally {
    // Always release the browser process, on success and on failure.
    await browser.close();
  }
}
@@ -0,0 +1,23 @@
1
+ import { test } from 'node:test';
2
+ import assert from 'node:assert';
3
+ import { getSubtitles } from '../src/index.js';
4
+
5
// End-to-end check against a live YouTube video: the result must be a
// non-empty array whose first entry has string `start`/`dur`/`text` fields
// and begins with the expected opening line of the video.
test('Extract passive income video captions', async () => {
  const captions = await getSubtitles({ videoID: 'JueUvj6X3DA' });

  // Something must have been extracted.
  assert.ok(Array.isArray(captions), 'Captions should be an array');
  assert.ok(captions.length > 0, 'Should extract at least one caption');

  // Every field of the first caption is a string.
  const [firstCaption] = captions;
  const stringFields = [
    ['start', 'Start time should be a string'],
    ['dur', 'Duration should be a string'],
    ['text', 'Text should be a string']
  ];
  for (const [field, message] of stringFields) {
    assert.ok(typeof firstCaption[field] === 'string', message);
  }

  // The first caption carries the video's known opening words.
  const openingText = firstCaption.text.toLowerCase();
  assert.ok(
    openingText.includes('creating passive income'),
    `First caption should contain "creating passive income", got: "${firstCaption.text}"`
  );
});