headless-youtube-captions 2.0.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -4,398 +4,176 @@
4
4
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
5
5
  [![Node.js Version](https://img.shields.io/node/v/headless-youtube-captions.svg)](https://nodejs.org)
6
6
 
7
- > Extract YouTube video transcripts, channel videos, and comments by interacting with YouTube's UI using Puppeteer
7
+ > Extract YouTube transcripts, channel videos, comments, search results, and video metadata using yt-dlp
8
8
 
9
9
  ## Features
10
10
 
11
- - 🎯 Extract video transcripts/captions in multiple languages
12
- - 📺 Get channel videos with pagination support
13
- - 🔍 Search videos within a specific channel
14
- - 🌍 **NEW**: Search across all of YouTube globally
15
- - 💬 Extract video comments with sorting options
16
- - 🐳 Docker support with configurable Chrome executable path
17
- - 📦 Zero build dependencies - runs directly from source
18
- - 🚀 Modern ES modules with async/await
19
- - 🛡️ Handles cookie consent and ad skipping automatically
11
+ - Transcripts/captions in multiple languages
12
+ - Channel video listings with pagination
13
+ - Channel-scoped video search
14
+ - Global YouTube search
15
+ - Video comments with sort options
16
+ - Full video metadata (description, tags, categories, like/view counts)
17
+ - Zero npm dependencies — uses the [yt-dlp](https://github.com/yt-dlp/yt-dlp) CLI
18
+ - Modern ES modules with TypeScript definitions
19
+
20
+ ## Prerequisites
21
+
22
+ [yt-dlp](https://github.com/yt-dlp/yt-dlp#installation) must be installed and available in your PATH.
23
+
24
+ ```bash
25
+ # macOS
26
+ brew install yt-dlp
27
+
28
+ # pip
29
+ pip install yt-dlp
30
+
31
+ # Linux
32
+ sudo curl -L https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp -o /usr/local/bin/yt-dlp
33
+ sudo chmod a+rx /usr/local/bin/yt-dlp
34
+ ```
20
35
 
21
36
  ## Installation
22
37
 
23
38
  ```bash
24
- npm install -S headless-youtube-captions
25
- # OR
26
- yarn add headless-youtube-captions
39
+ npm install headless-youtube-captions
27
40
  ```
28
41
 
29
42
  ## Usage
30
43
 
31
44
  ### Extract Video Transcripts
45
+
32
46
  ```js
33
47
  import { getSubtitles } from 'headless-youtube-captions';
34
48
 
35
49
  const captions = await getSubtitles({
36
- videoID: 'JueUvj6X3DA', // YouTube video ID
50
+ videoID: 'dQw4w9WgXcQ',
37
51
  lang: 'en' // Optional, default: 'en'
38
52
  });
39
-
40
- console.log(captions);
53
+ // => [{ start: "0.0", dur: "3.0", text: "We're no strangers to love" }, ...]
41
54
  ```
42
55
 
43
56
  ### Get Channel Videos
57
+
44
58
  ```js
45
59
  import { getChannelVideos } from 'headless-youtube-captions';
46
60
 
47
61
  const result = await getChannelVideos({
48
- channelURL: '@mkbhd', // or full URL like 'https://youtube.com/@mkbhd'
49
- limit: 30 // Optional, default: 30
62
+ channelURL: '@mkbhd', // or full URL, or channel ID
63
+ page: 1, // Optional, default: 1
64
+ pageSize: 20 // Optional, default: 20
50
65
  });
51
66
 
52
67
  console.log(result.videos);
68
+ // => [{ id, title, views, uploadTime, duration, thumbnail, url }, ...]
69
+ console.log(result.hasMore); // true if more pages available
53
70
  ```
54
71
 
55
72
  ### Search Channel Videos
73
+
56
74
  ```js
57
75
  import { searchChannelVideos } from 'headless-youtube-captions';
58
76
 
59
77
  const result = await searchChannelVideos({
60
78
  channelURL: '@mkbhd',
61
79
  query: 'iphone review',
62
- limit: 20 // Optional, default: 30
80
+ page: 1, // Optional
81
+ pageSize: 20 // Optional
63
82
  });
64
83
 
65
84
  console.log(result.results);
66
85
  ```
67
86
 
68
87
  ### Search YouTube Globally
88
+
69
89
  ```js
70
90
  import { searchYouTubeGlobal } from 'headless-youtube-captions';
71
91
 
72
92
  const result = await searchYouTubeGlobal({
73
93
  query: 'javascript tutorial',
74
- maxResults: 10, // Optional, 1-20, default: 10
75
- resultTypes: ['videos'] // Optional, ['videos', 'channels', 'all'], default: ['all']
94
+ page: 1, // Optional, default: 1
95
+ pageSize: 10 // Optional, default: 10
76
96
  });
77
97
 
78
98
  console.log(result.results);
99
+ // => [{ id, type: 'video', title, url, channel, views, duration, uploadTime, thumbnail }, ...]
79
100
  ```
80
101
 
81
102
  ### Get Video Comments
103
+
82
104
  ```js
83
105
  import { getVideoComments } from 'headless-youtube-captions';
84
106
 
85
107
  const result = await getVideoComments({
86
- videoID: 'JueUvj6X3DA',
87
- limit: 50, // Optional, default: 50
88
- sortBy: 'top' // Optional, 'top' or 'newest', default: 'top'
108
+ videoID: 'dQw4w9WgXcQ',
109
+ sortBy: 'top', // Optional, 'top' or 'newest', default: 'top'
110
+ page: 1, // Optional
111
+ pageSize: 20 // Optional
89
112
  });
90
113
 
91
114
  console.log(result.comments);
115
+ // => [{ author, authorUrl, text, time, likes, isReply }, ...]
92
116
  ```
93
117
 
94
- ## API
95
-
96
- ### `getSubtitles(options)`
97
-
98
- Extracts captions/transcripts from a YouTube video by automating browser interactions.
99
-
100
- #### Parameters
101
-
102
- - `options` (Object):
103
- - `videoID` (String, required): The YouTube video ID
104
- - `lang` (String, optional): Language code for captions. Default: `'en'`. Supported: `'en'`, `'de'`, `'fr'`
105
-
106
- #### Returns
107
-
108
- A Promise that resolves to an array of caption objects.
109
-
110
- #### Caption Object Format
111
-
112
- Each caption object contains:
113
-
114
- ```js
115
- {
116
- "start": "0", // Start time in seconds (as string)
117
- "dur": "3.0", // Duration in seconds (as string)
118
- "text": "Caption text here" // The actual caption text
119
- }
120
- ```
121
-
122
- #### Example Response
123
-
124
- ```js
125
- [
126
- {
127
- "start": "0",
128
- "dur": "3.0",
129
- "text": "- Creating passive income takes work,"
130
- },
131
- {
132
- "start": "3",
133
- "dur": "2.0",
134
- "text": "but once you implement those processes,"
135
- },
136
- {
137
- "start": "5",
138
- "dur": "3.0",
139
- "text": "it's one of the most fruitful income sources"
140
- }
141
- // ... more captions
142
- ]
143
- ```
144
-
145
- ## How It Works
146
-
147
- This library uses Puppeteer to:
148
-
149
- 1. Navigate to the YouTube video page
150
- 2. Handle cookie consent and ads if present
151
- 3. Click the "Show transcript" button in the video description
152
- 4. Extract transcript segments from the opened transcript panel
153
- 5. Parse timestamps and text content
154
- 6. Calculate proper durations for each caption segment
155
-
156
- ## Requirements
157
-
158
- - Node.js 18 or higher (ES modules support required)
159
- - Puppeteer (installed as a dependency)
160
-
161
- ## Docker Usage
162
-
163
- When running in Docker containers, you may need to specify the Chrome executable path using the `PUPPETEER_EXECUTABLE_PATH` environment variable:
164
-
165
- ```bash
166
- # Set the environment variable
167
- export PUPPETEER_EXECUTABLE_PATH=/usr/bin/google-chrome-stable
168
-
169
- # Or run directly
170
- PUPPETEER_EXECUTABLE_PATH=/usr/bin/google-chrome-stable node your-script.js
171
- ```
172
-
173
- Example Dockerfile configuration:
174
- ```dockerfile
175
- # Install Chrome dependencies
176
- RUN apt-get update && apt-get install -y \
177
- wget \
178
- gnupg \
179
- ca-certificates \
180
- fonts-liberation \
181
- libasound2 \
182
- libatk-bridge2.0-0 \
183
- libatk1.0-0 \
184
- libatspi2.0-0 \
185
- libcups2 \
186
- libdbus-1-3 \
187
- libdrm2 \
188
- libgbm1 \
189
- libgtk-3-0 \
190
- libnspr4 \
191
- libnss3 \
192
- libxcomposite1 \
193
- libxdamage1 \
194
- libxfixes3 \
195
- libxkbcommon0 \
196
- libxrandr2 \
197
- xdg-utils
198
-
199
- # Install Chrome
200
- RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
201
- && echo "deb http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list \
202
- && apt-get update \
203
- && apt-get install -y google-chrome-stable \
204
- && rm -rf /var/lib/apt/lists/*
205
-
206
- # Set the Chrome executable path
207
- ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/google-chrome-stable
208
- ```
209
-
210
- ## Error Handling
211
-
212
- The function will throw an error if:
213
- - The video ID is invalid or the video doesn't exist
214
- - The video has no available captions/transcripts
215
- - The "Show transcript" button cannot be found
216
- - Network issues prevent loading the page
217
-
218
- Example error handling:
118
+ ### Get Video Metadata
219
119
 
220
120
  ```js
221
- try {
222
- const captions = await getSubtitles({ videoID: 'XXXXX' });
223
- console.log(captions);
224
- } catch (error) {
225
- console.error('Failed to extract captions:', error.message);
226
- }
227
- ```
228
-
229
- ## Notes
121
+ import { getVideoMetadata } from 'headless-youtube-captions';
230
122
 
231
- - The library runs Puppeteer in headless mode by default
232
- - Extraction time depends on video page load time and transcript length
233
- - The library respects YouTube's UI structure as of the last update
234
- - Some videos may not have transcripts available
235
-
236
- ### `getChannelVideos(options)`
237
-
238
- Extracts videos from a YouTube channel with pagination support.
239
-
240
- #### Parameters
241
-
242
- - `options` (Object):
243
- - `channelURL` (String, required): Channel identifier (@handle, channel ID, or full URL)
244
- - `limit` (Number, optional): Maximum videos to return. Default: `30`
245
- - `pageToken` (String, optional): For pagination (future use)
246
-
247
- #### Returns
123
+ const result = await getVideoMetadata({
124
+ videoID: 'dQw4w9WgXcQ'
125
+ });
248
126
 
249
- ```js
250
- {
251
- channel: {
252
- name: "Channel Name",
253
- subscribers: "1.2M subscribers",
254
- videoCount: "500 videos"
255
- },
256
- videos: [
257
- {
258
- id: "videoId123",
259
- title: "Video Title",
260
- views: "1.2M views",
261
- uploadTime: "2 days ago",
262
- duration: "10:45",
263
- thumbnail: "https://...",
264
- url: "https://youtube.com/watch?v=videoId123"
265
- }
266
- // ... more videos
267
- ],
268
- totalLoaded: 30,
269
- hasMore: true
270
- }
127
+ console.log(result.video);
128
+ // => { id, title, description, uploadDate, viewCount, likeCount, duration, tags, categories }
129
+ console.log(result.channel);
130
+ // => { name, url, subscriberCount }
271
131
  ```
272
132
 
273
- ### `searchChannelVideos(options)`
133
+ ## Pagination
274
134
 
275
- Search for videos within a specific YouTube channel.
135
+ The functions listed below accept `page` and `pageSize` parameters:
276
136
 
277
- #### Parameters
137
+ | Function | Default pageSize | Pagination type |
138
+ |----------|-----------------|-----------------|
139
+ | `getChannelVideos` | 20 | Server-side (only fetches requested page) |
140
+ | `searchChannelVideos` | 20 | Server-side |
141
+ | `getVideoComments` | 20 | Fetch up to page*pageSize, return slice |
142
+ | `searchYouTubeGlobal` | 10 | Fetch up to page*pageSize, return slice |
278
143
 
279
- - `options` (Object):
280
- - `channelURL` (String, required): Channel identifier (@handle, channel ID, or full URL)
281
- - `query` (String, required): Search query
282
- - `limit` (Number, optional): Maximum results. Default: `30`
144
+ Every paginated result includes a `hasMore` boolean indicating whether more pages are available.
283
145
 
284
- #### Returns
146
+ ## API Reference
285
147
 
286
- ```js
287
- {
288
- query: "iphone review",
289
- results: [
290
- {
291
- id: "videoId123",
292
- title: "iPhone 15 Review",
293
- views: "2.5M views",
294
- uploadTime: "1 week ago",
295
- duration: "15:23",
296
- thumbnail: "https://...",
297
- url: "https://youtube.com/watch?v=videoId123"
298
- }
299
- // ... more results
300
- ],
301
- totalFound: 25
302
- }
303
- ```
148
+ ### `getSubtitles({ videoID, lang? })`
304
149
 
305
- ### `getVideoComments(options)`
150
+ Returns `Promise<{ start: string, dur: string, text: string }[]>`
306
151
 
307
- Extract comments from a YouTube video with pagination support.
152
+ ### `getChannelVideos({ channelURL, page?, pageSize? })`
308
153
 
309
- #### Parameters
154
+ Returns `Promise<{ channel, videos, page, pageSize, hasMore }>`
310
155
 
311
- - `options` (Object):
312
- - `videoID` (String, required): YouTube video ID
313
- - `limit` (Number, optional): Maximum comments to return. Default: `50`
314
- - `sortBy` (String, optional): Sort order - `'top'` or `'newest'`. Default: `'top'`
315
- - `pageToken` (String, optional): For pagination (future use)
156
+ ### `searchChannelVideos({ channelURL, query, page?, pageSize? })`
316
157
 
317
- #### Returns
158
+ Returns `Promise<{ query, results, page, pageSize, totalFound, hasMore }>`
318
159
 
319
- ```js
320
- {
321
- video: {
322
- id: "JueUvj6X3DA",
323
- title: "Video Title",
324
- channel: {
325
- name: "Channel Name",
326
- url: "https://youtube.com/@channel"
327
- },
328
- views: "1.5M views"
329
- },
330
- comments: [
331
- {
332
- author: "Username",
333
- authorUrl: "https://youtube.com/@username",
334
- authorAvatar: "https://...",
335
- text: "Great video! Thanks for sharing...",
336
- time: "2 days ago",
337
- likes: "245",
338
- replyCount: "12"
339
- }
340
- // ... more comments
341
- ],
342
- totalComments: 1566,
343
- totalLoaded: 50,
344
- hasMore: true,
345
- sortBy: "top"
346
- }
347
- ```
160
+ ### `getVideoComments({ videoID, sortBy?, page?, pageSize? })`
348
161
 
349
- ### `searchYouTubeGlobal(options)`
162
+ Returns `Promise<{ comments, page, pageSize, totalFetched, hasMore, sortBy }>`
350
163
 
351
- Search across all of YouTube for videos and channels.
164
+ ### `searchYouTubeGlobal({ query, page?, pageSize? })`
352
165
 
353
- #### Parameters
166
+ Returns `Promise<{ query, results, page, pageSize, hasMore }>`
354
167
 
355
- - `options` (Object):
356
- - `query` (String, required): Search term to find content
357
- - `maxResults` (Number, optional): Maximum results to return (1-20). Default: `10`
358
- - `resultTypes` (Array, optional): Types of results to include. Options: `['videos']`, `['channels']`, `['all']`. Default: `['all']`
168
+ ### `getVideoMetadata({ videoID })`
359
169
 
360
- #### Returns
170
+ Returns `Promise<{ video, channel }>` — no pagination needed.
361
171
 
362
- ```js
363
- {
364
- query: "javascript tutorial",
365
- resultTypes: ["videos"],
366
- maxResults: 10,
367
- totalFound: 8,
368
- results: [
369
- {
370
- id: "videoId123",
371
- type: "video",
372
- title: "JavaScript Tutorial for Beginners",
373
- url: "https://youtube.com/watch?v=videoId123",
374
- channel: "Code Academy",
375
- views: "2.1M views",
376
- uploadTime: "1 year ago",
377
- duration: "1:23:45",
378
- thumbnail: "https://i.ytimg.com/vi/..."
379
- },
380
- {
381
- id: "channelId456",
382
- type: "channel",
383
- title: "JavaScript Mastery",
384
- url: "https://youtube.com/@javascriptmastery",
385
- subscribers: "1.2M subscribers",
386
- videoCount: "200 videos",
387
- thumbnail: "https://yt3.ggpht.com/..."
388
- }
389
- // ... more results
390
- ]
391
- }
392
- ```
393
-
394
- #### Result Types
172
+ ## Requirements
395
173
 
396
- - **Video Results** include: `id`, `type`, `title`, `url`, `channel`, `views`, `uploadTime`, `duration`, `thumbnail`
397
- - **Channel Results** include: `id`, `type`, `title`, `url`, `subscribers`, `videoCount`, `thumbnail`
174
+ - Node.js 18+
175
+ - [yt-dlp](https://github.com/yt-dlp/yt-dlp) installed and in PATH
398
176
 
399
177
  ## License
400
178
 
401
- MIT
179
+ MIT
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "headless-youtube-captions",
3
- "version": "2.0.0",
4
- "description": "Extract YouTube video transcripts (via yt-dlp), channel videos, comments, and comprehensive video metadata",
3
+ "version": "3.0.0",
4
+ "description": "Extract YouTube video transcripts, channel videos, comments, search results, and video metadata via yt-dlp",
5
5
  "main": "src/index.js",
6
6
  "types": "src/index.d.ts",
7
7
  "type": "module",
@@ -23,12 +23,7 @@
23
23
  "scripts": {
24
24
  "test": "node --test test/*.test.js"
25
25
  },
26
- "dependencies": {
27
- "he": "^1.2.0",
28
- "lodash": "^4.17.21",
29
- "patchright": "^1.51.1",
30
- "striptags": "^3.2.0"
31
- },
26
+ "dependencies": {},
32
27
  "devDependencies": {},
33
28
  "keywords": [
34
29
  "youtube",
@@ -42,10 +37,7 @@
42
37
  "metadata",
43
38
  "description",
44
39
  "yt-dlp",
45
- "patchright",
46
40
  "scraper",
47
- "api",
48
- "automation",
49
- "global-search"
41
+ "api"
50
42
  ]
51
43
  }