@tacone/prosey 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +146 -0
- package/bin/prosey.js +604 -0
- package/package.json +64 -0
- package/src/format.test.ts +74 -0
- package/src/format.ts +57 -0
- package/src/index.ts +239 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 tacone
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
# Prosey
|
|
2
|
+
|
|
3
|
+
**Prosey** downloads YouTube transcripts to be printed or saved to a file.
|
|
4
|
+
|
|
5
|
+
You can read, skim, search, copy, and manipulate the text using the tools you love the most.
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npx @tacone/prosey https://www.youtube.com/watch?v=dQw4w9WgXcQ
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Install
|
|
12
|
+
|
|
13
|
+
### Quick — no install (npx)
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
npx @tacone/prosey dQw4w9WgXcQ
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
### Global install (npm)
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
npm install -g @tacone/prosey
|
|
23
|
+
prosey dQw4w9WgXcQ
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
### From source (development)
|
|
27
|
+
|
|
28
|
+
Uses **[Bun](https://bun.sh)** for development — scripts, package management,
|
|
29
|
+
and running the TypeScript source directly.
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
git clone https://github.com/tacone/prosey.git
|
|
33
|
+
cd prosey
|
|
34
|
+
bun install
|
|
35
|
+
bun run start -- dQw4w9WgXcQ
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
### Prebuilt binary
|
|
39
|
+
|
|
40
|
+
After running `bun run build`, grab the compiled binary from the `dist/` directory (requires no runtime).
|
|
41
|
+
|
|
42
|
+
## Usage
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
prosey [options] <video-url-or-id>
|
|
46
|
+
prosey info [options] <video-url-or-id>
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Pass a full YouTube URL or a bare video ID. The transcript is printed to
|
|
50
|
+
stdout by default, with video details prepended.
|
|
51
|
+
|
|
52
|
+
### Examples
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
# Basic — plain text with details
|
|
56
|
+
prosey dQw4w9WgXcQ
|
|
57
|
+
|
|
58
|
+
# Specify language
|
|
59
|
+
prosey https://www.youtube.com/watch?v=dQw4w9WgXcQ --lang es
|
|
60
|
+
|
|
61
|
+
# Include timestamps
|
|
62
|
+
prosey dQw4w9WgXcQ -t
|
|
63
|
+
|
|
64
|
+
# Save to file
|
|
65
|
+
prosey dQw4w9WgXcQ -o transcript.txt
|
|
66
|
+
|
|
67
|
+
# JSON output (timestamps always included)
|
|
68
|
+
prosey dQw4w9WgXcQ --json
|
|
69
|
+
|
|
70
|
+
# Transcript only, no video details
|
|
71
|
+
prosey dQw4w9WgXcQ --no-details
|
|
72
|
+
|
|
73
|
+
# List available transcript languages
|
|
74
|
+
prosey dQw4w9WgXcQ --list
|
|
75
|
+
|
|
76
|
+
# Show video metadata
|
|
77
|
+
prosey info dQw4w9WgXcQ
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Options
|
|
81
|
+
|
|
82
|
+
| Flag | Description |
|
|
83
|
+
| ----------------------- | -------------------------------------------------------------------------------------------------------------- |
|
|
84
|
+
| `--lang <code>` | Language code (e.g. `en`, `fr`). Auto-detected if omitted. |
|
|
85
|
+
| `-t`, `--timestamps` | Prepend `[MM:SS]` to each line in text output. |
|
|
86
|
+
| `--list` | List available transcript languages for the video, then exit. |
|
|
87
|
+
| `-o`, `--output <path>` | Write output to file instead of stdout. |
|
|
88
|
+
| `--json` | Output transcript as a JSON array. Each item includes `text`, `offset` (seconds), `duration`, and `timestamp`. |
|
|
89
|
+
| `--text` | Output as plain text (default). |
|
|
90
|
+
| `--details` | Prepend video details (title, channel, duration, views, description) to the transcript (default). |
|
|
91
|
+
| `--no-details` | Suppress video details, transcript only. |
|
|
92
|
+
| `--no-decode-entities` | Preserve raw HTML entities (e.g. `&#39;`). Decoded by default in text mode. |
|
|
93
|
+
| `--help` | Show help message and exit. |
|
|
94
|
+
| `--version` | Show version number and exit. |
|
|
95
|
+
|
|
96
|
+
## JSON format
|
|
97
|
+
|
|
98
|
+
When `--json` is used, the output is an array of objects:
|
|
99
|
+
|
|
100
|
+
```json
|
|
101
|
+
[
|
|
102
|
+
{
|
|
103
|
+
"text": "♪ We're no strangers to love ♪",
|
|
104
|
+
"offset": 18.64,
|
|
105
|
+
"duration": 3.24,
|
|
106
|
+
"timestamp": "00:18"
|
|
107
|
+
}
|
|
108
|
+
]
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
`timestamp` is always present in JSON mode. Video details are suppressed
|
|
112
|
+
(silently) since JSON is structured data.
|
|
113
|
+
|
|
114
|
+
## Binary
|
|
115
|
+
|
|
116
|
+
Build a single-file binary with no external dependencies:
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
bun run build
|
|
120
|
+
./dist/prosey dQw4w9WgXcQ
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Development
|
|
124
|
+
|
|
125
|
+
This project uses **[Bun](https://bun.sh)** for development.
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
bun run typecheck # TypeScript check
|
|
129
|
+
bun run start # Run the CLI from source
|
|
130
|
+
bun run test # Run unit tests
|
|
131
|
+
bun run build # Compile standalone binary
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
Before publishing to npm, `bun run build:node` compiles the TypeScript source
|
|
135
|
+
into a Node-compatible JS bundle at `bin/prosey.js`. This happens automatically
|
|
136
|
+
via the `prepack` hook.
|
|
137
|
+
|
|
138
|
+
## How it works
|
|
139
|
+
|
|
140
|
+
prosey uses YouTube's Innertube API via the
|
|
141
|
+
[youtube-transcript-plus](https://github.com/ericmmartin/youtube-transcript-plus)
|
|
142
|
+
library. No API keys or browser automation required.
|
|
143
|
+
|
|
144
|
+
The tool works with both manually created captions and auto-generated
|
|
145
|
+
transcripts (YouTube's speech-to-text). When `--lang` is omitted, prosey uses the
|
|
146
|
+
first caption track listed in YouTube's player response.
|
package/bin/prosey.js
ADDED
|
@@ -0,0 +1,604 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// src/index.ts
|
|
4
|
+
import { writeFile } from "node:fs/promises";
|
|
5
|
+
|
|
6
|
+
// node_modules/youtube-transcript-plus/dist/youtube-transcript-plus.mjs
|
|
7
|
+
/**
 * Drives a generator function as if it were an async function.
 *
 * Each yielded value is wrapped in a promise; its fulfilment is fed back via
 * `next`, its rejection via `throw`. The returned promise resolves with the
 * generator's return value and rejects with any error that escapes it.
 *
 * @param thisArg    `this` value for the generator function.
 * @param _arguments Arguments for the generator function (defaults to none).
 * @param P          Promise constructor to use (defaults to the global `Promise`).
 * @param generator  Generator function to run.
 */
function __awaiter(thisArg, _arguments, P, generator) {
  const PromiseImpl = P || Promise;
  const asPromise = (value) =>
    value instanceof PromiseImpl ? value : new PromiseImpl((settle) => settle(value));

  return new PromiseImpl((resolve, reject) => {
    let iterator;

    // Resume the generator with `next` or `throw`, then either finish or chain
    // onto the next yielded value. Any synchronous error rejects the result.
    const advance = (method, input) => {
      try {
        const result = iterator[method](input);
        if (result.done) {
          resolve(result.value);
        } else {
          asPromise(result.value).then(onFulfilled, onRejected);
        }
      } catch (error) {
        reject(error);
      }
    };
    const onFulfilled = (value) => advance("next", value);
    const onRejected = (reason) => advance("throw", reason);

    iterator = generator.apply(thisArg, _arguments || []);
    advance("next");
  });
}
|
|
34
|
+
// User-Agent header sent when the caller does not supply one.
const DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";

// Captures an 11-character video ID (group 1) from a YouTube URL.
const RE_YOUTUBE = /(?:v=|\/|v\/|embed\/|watch\?.*v=|youtu\.be\/|\/v\/|e\/|watch\?.*vi?=|\/embed\/|\/v\/|vi?\/|watch\?.*vi?=|youtu\.be\/|\/vi?\/|\/e\/)([a-zA-Z0-9_-]{11})/i;

// Matches one <text> element of the transcript XML: start (1), dur (2), text (3).
const RE_XML_TRANSCRIPT = /<text start="([^"]*)" dur="([^"]*)">([^<]*)<\/text>/g;
|
|
37
|
+
/**
 * Raised when YouTube appears to be throttling the client: the watch page
 * contains a reCAPTCHA challenge, or the transcript request returns HTTP 429.
 */
class YoutubeTranscriptTooManyRequestError extends Error {
  constructor() {
    const message = "YouTube is receiving too many requests from your IP address. Please try again later or use a proxy. If the issue persists, consider reducing the frequency of requests.";
    super(message);
    Object.assign(this, { name: "YoutubeTranscriptTooManyRequestError" });
  }
}
|
|
43
|
+
|
|
44
|
+
/**
 * Raised when the watch page or the player endpoint answers with a non-OK
 * HTTP status. The offending ID is exposed as `videoId`.
 */
class YoutubeTranscriptVideoUnavailableError extends Error {
  constructor(videoId) {
    const message = `The video with ID "${videoId}" is no longer available or has been removed. Please check the video URL or ID and try again.`;
    super(message);
    Object.assign(this, { name: "YoutubeTranscriptVideoUnavailableError", videoId });
  }
}
|
|
51
|
+
|
|
52
|
+
/**
 * Raised when the player response carries no usable caption tracks for a
 * playable video. The offending ID is exposed as `videoId`.
 */
class YoutubeTranscriptDisabledError extends Error {
  constructor(videoId) {
    const message = `Transcripts are disabled for the video with ID "${videoId}". This may be due to the video owner disabling captions or the video not supporting transcripts.`;
    super(message);
    Object.assign(this, { name: "YoutubeTranscriptDisabledError", videoId });
  }
}
|
|
59
|
+
|
|
60
|
+
/**
 * Raised when no transcript could be obtained for a video (for example no API
 * key on the watch page, no track URL, or an empty transcript body).
 * The offending ID is exposed as `videoId`.
 */
class YoutubeTranscriptNotAvailableError extends Error {
  constructor(videoId) {
    const message = `No transcripts are available for the video with ID "${videoId}". This may be because the video does not have captions or the captions are not accessible.`;
    super(message);
    Object.assign(this, { name: "YoutubeTranscriptNotAvailableError", videoId });
  }
}
|
|
67
|
+
|
|
68
|
+
/**
 * Raised when a video has caption tracks but none in the requested language.
 * Exposes `videoId`, the requested `lang`, and the `availableLangs` list.
 */
class YoutubeTranscriptNotAvailableLanguageError extends Error {
  constructor(lang, availableLangs, videoId) {
    const message = `No transcripts are available in "${lang}" for the video with ID "${videoId}". Available languages: ${availableLangs.join(", ")}. Please try a different language.`;
    super(message);
    Object.assign(this, {
      name: "YoutubeTranscriptNotAvailableLanguageError",
      videoId,
      lang,
      availableLangs
    });
  }
}
|
|
77
|
+
|
|
78
|
+
/**
 * Raised when a language code fails validation.
 * The rejected value is exposed as `lang`.
 */
class YoutubeTranscriptInvalidLangError extends Error {
  constructor(lang) {
    const message = `Invalid language code "${lang}". Please provide a valid BCP 47 language code (e.g., "en", "fr", "pt-BR").`;
    super(message);
    Object.assign(this, { name: "YoutubeTranscriptInvalidLangError", lang });
  }
}
|
|
85
|
+
|
|
86
|
+
/**
 * Raised when the input is neither a bare 11-character video ID nor a URL
 * from which one can be extracted.
 */
class YoutubeTranscriptInvalidVideoIdError extends Error {
  constructor() {
    const message = 'Invalid YouTube video ID or URL. Please provide a valid video ID or URL. Example: "dQw4w9WgXcQ" or "https://www.youtube.com/watch?v=dQw4w9WgXcQ".';
    super(message);
    Object.assign(this, { name: "YoutubeTranscriptInvalidVideoIdError" });
  }
}
|
|
92
|
+
// A bare YouTube video ID: exactly 11 URL-safe characters.
var RE_VIDEO_ID = /^[a-zA-Z0-9_-]{11}$/;

// A language tag: 2-3 letters followed by optional "-subtag" parts.
var RE_BCP47_LANG = /^[a-zA-Z]{2,3}(-[a-zA-Z0-9]{2,8})*$/;

// Replacement table for the entities matched by RE_XML_ENTITY.
// Keys must be the full entity text ("&amp;", not "&"), because the lookup in
// decodeXmlEntities uses the whole regex match as the key.
var XML_ENTITIES = {
  "&amp;": "&",
  "&lt;": "<",
  "&gt;": ">",
  "&quot;": '"',
  "&#39;": "'",
  "&apos;": "'"
};

// Matches exactly the entities listed in XML_ENTITIES.
var RE_XML_ENTITY = /&(?:amp|lt|gt|quot|apos|#39);/g;

/**
 * Decodes the predefined XML entities (plus `&#39;`) in one pass.
 *
 * @param text Raw text taken from the transcript XML.
 * @returns The text with each known entity replaced by its character;
 *          anything else is left untouched.
 */
function decodeXmlEntities(text) {
  return text.replace(RE_XML_ENTITY, (match) => {
    const replacement = XML_ENTITIES[match];
    return replacement !== null && replacement !== undefined ? replacement : match;
  });
}
|
|
109
|
+
/**
 * Normalizes user input to a YouTube video ID.
 *
 * @param videoId A bare 11-character ID or a YouTube URL.
 * @returns The input itself when it already is an ID, otherwise the ID
 *          captured from the URL by RE_YOUTUBE.
 * @throws {YoutubeTranscriptInvalidVideoIdError} When neither form matches.
 */
function retrieveVideoId(videoId) {
  if (!RE_VIDEO_ID.test(videoId)) {
    const urlMatch = videoId.match(RE_YOUTUBE);
    if (!urlMatch || urlMatch.length === 0) {
      throw new YoutubeTranscriptInvalidVideoIdError();
    }
    return urlMatch[1];
  }
  return videoId;
}
|
|
119
|
+
/**
 * Ensures a language code matches RE_BCP47_LANG.
 *
 * @param lang Language code to check.
 * @throws {YoutubeTranscriptInvalidLangError} When the code does not match.
 */
function validateLang(lang) {
  const isWellFormed = RE_BCP47_LANG.test(lang);
  if (isWellFormed) {
    return;
  }
  throw new YoutubeTranscriptInvalidLangError(lang);
}
|
|
124
|
+
/**
 * Default transport: issues the request with the global `fetch`.
 *
 * Headers are layered in this order, later entries winning: the User-Agent
 * (falling back to DEFAULT_USER_AGENT), `Accept-Language` when `lang` is
 * given, then any caller-supplied `headers`. A body is attached only for
 * POST requests.
 *
 * @param params `{ url, lang, userAgent, method, body, headers, signal }`.
 * @returns A promise for the `fetch` response.
 */
async function defaultFetch(params) {
  const { url, lang, userAgent, method = "GET", body, headers = {}, signal } = params;

  const fetchOptions = {
    method,
    headers: {
      "User-Agent": userAgent || DEFAULT_USER_AGENT,
      ...(lang && { "Accept-Language": lang }),
      ...headers
    },
    signal
  };

  const shouldSendBody = method === "POST" && body;
  if (shouldSendBody) {
    fetchOptions.body = body;
  }
  return fetch(url, fetchOptions);
}
|
|
139
|
+
/**
 * Tells whether an HTTP status is worth retrying: 429 or any 5xx.
 *
 * @param status HTTP status code.
 */
function isRetryableStatus(status) {
  return status === 429 || status >= 500 && status <= 599;
}

/**
 * Waits `ms` milliseconds, rejecting early if `signal` aborts.
 *
 * The abort listener is removed once the timer fires, so a signal that is
 * reused across many sleeps does not accumulate stale listeners.
 *
 * @param ms     Delay in milliseconds.
 * @param signal Optional abort signal; an already-aborted signal rejects immediately.
 * @returns A promise that resolves with `undefined` after the delay, or
 *          rejects with `signal.reason` on abort.
 */
function sleep(ms, signal) {
  return new Promise((resolve, reject) => {
    // Throwing inside the executor turns into a rejection of this promise.
    signal === null || signal === undefined || signal.throwIfAborted();
    if (!signal) {
      setTimeout(resolve, ms);
      return;
    }
    const onAbort = () => {
      clearTimeout(timer);
      reject(signal.reason);
    };
    const timer = setTimeout(() => {
      signal.removeEventListener("abort", onAbort);
      resolve();
    }, ms);
    signal.addEventListener("abort", onAbort, { once: true });
  });
}

/**
 * Calls `fetchFn` until it returns a non-retryable status or the retry budget
 * is used up, with exponential backoff (`retryDelay * 2^attempt`) in between.
 *
 * Only responses with a retryable status trigger a retry; an error thrown by
 * `fetchFn` propagates immediately.
 *
 * @param fetchFn    Function returning a promise for a response-like object with `status`.
 * @param retries    Number of retries after the first attempt.
 * @param retryDelay Base delay in milliseconds.
 * @param signal     Optional abort signal checked before each attempt and during waits.
 * @returns The first non-retryable response, or the last response once retries run out.
 */
function fetchWithRetry(fetchFn, retries, retryDelay, signal) {
  return __awaiter(this, undefined, undefined, function* () {
    for (let attempt = 0; attempt <= retries; attempt++) {
      signal === null || signal === undefined || signal.throwIfAborted();
      const response = yield fetchFn();
      if (!isRetryableStatus(response.status) || attempt === retries) {
        return response;
      }
      const delay = retryDelay * Math.pow(2, attempt);
      yield sleep(delay, signal);
    }
    // Reached only when the loop body never runs (e.g. negative `retries`).
    throw new Error("Unexpected: retry loop exited without returning");
  });
}
|
|
169
|
+
/**
 * Fetches caption tracks and transcripts for a YouTube video.
 *
 * `config` is optional. The fields read here are: `userAgent`, `disableHttps`,
 * `retries`, `retryDelay`, `signal`, `lang`, `videoDetails`, `cache`,
 * `cacheTTL`, and the transport overrides `videoFetch`, `playerFetch` and
 * `transcriptFetch`.
 */
class YoutubeTranscript {
  constructor(config) {
    this.config = config;
  }

  /**
   * Loads the watch page, extracts the Innertube API key from it, queries the
   * player endpoint and returns the caption tracks.
   *
   * @param identifier Video ID.
   * @param lang       Optional language forwarded to the fetch parameters.
   * @returns `{ tracks, playerJson }`, where `tracks` is a non-empty array.
   * @throws {YoutubeTranscriptVideoUnavailableError} On a non-OK page or player response.
   * @throws {YoutubeTranscriptTooManyRequestError} When the page contains a reCAPTCHA.
   * @throws {YoutubeTranscriptNotAvailableError} When no API key or captions are found.
   * @throws {YoutubeTranscriptDisabledError} When the video is playable but has no tracks.
   */
  async _fetchCaptionTracks(identifier, lang) {
    const cfg = this.config;
    const userAgent = cfg?.userAgent ?? DEFAULT_USER_AGENT;
    const protocol = cfg?.disableHttps ? "http" : "https";
    const retries = cfg?.retries ?? 0;
    const retryDelay = cfg?.retryDelay ?? 1000;
    const signal = cfg?.signal;

    // Step 1: watch page, needed only for the Innertube API key.
    const watchFetchParams = {
      url: `${protocol}://www.youtube.com/watch?v=${identifier}`,
      lang,
      userAgent,
      signal
    };
    const videoPageResponse = await fetchWithRetry(
      () => this.config?.videoFetch ? this.config.videoFetch(watchFetchParams) : defaultFetch(watchFetchParams),
      retries,
      retryDelay,
      signal
    );
    if (!videoPageResponse.ok) {
      throw new YoutubeTranscriptVideoUnavailableError(identifier);
    }
    const videoPageBody = await videoPageResponse.text();
    if (videoPageBody.includes('class="g-recaptcha"')) {
      throw new YoutubeTranscriptTooManyRequestError();
    }
    // The key appears either as plain JSON or as JSON escaped inside a string.
    const apiKeyMatch = videoPageBody.match(/"INNERTUBE_API_KEY":"([^"]+)"/) || videoPageBody.match(/INNERTUBE_API_KEY\\":\\"([^\\"]+)\\"/);
    if (!apiKeyMatch) {
      throw new YoutubeTranscriptNotAvailableError(identifier);
    }

    // Step 2: player endpoint, queried as the ANDROID client.
    const playerFetchParams = {
      url: `${protocol}://www.youtube.com/youtubei/v1/player?key=${apiKeyMatch[1]}`,
      method: "POST",
      lang,
      userAgent,
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        context: {
          client: {
            clientName: "ANDROID",
            clientVersion: "20.10.38"
          }
        },
        videoId: identifier
      }),
      signal
    };
    const playerRes = await fetchWithRetry(
      () => this.config?.playerFetch ? this.config.playerFetch(playerFetchParams) : defaultFetch(playerFetchParams),
      retries,
      retryDelay,
      signal
    );
    if (!playerRes.ok) {
      throw new YoutubeTranscriptVideoUnavailableError(identifier);
    }

    // Step 3: pull the caption track list out of the player response.
    const playerJson = await playerRes.json();
    const tracklist = playerJson.captions?.playerCaptionsTracklistRenderer ?? playerJson.playerCaptionsTracklistRenderer;
    const tracks = tracklist?.captionTracks;
    const isPlayableOk = playerJson.playabilityStatus?.status === "OK";
    if (!playerJson.captions || !tracklist) {
      throw isPlayableOk
        ? new YoutubeTranscriptDisabledError(identifier)
        : new YoutubeTranscriptNotAvailableError(identifier);
    }
    if (!Array.isArray(tracks) || tracks.length === 0) {
      throw new YoutubeTranscriptDisabledError(identifier);
    }
    return { tracks, playerJson };
  }

  /**
   * Builds the video-details object from `playerJson.videoDetails`, filling in
   * defaults for missing fields.
   *
   * @param playerJson Player endpoint response.
   * @param identifier Video ID used when the response carries none.
   */
  _extractVideoDetails(playerJson, identifier) {
    const raw = playerJson.videoDetails;
    return {
      videoId: raw?.videoId ?? identifier,
      title: raw?.title ?? "",
      author: raw?.author ?? "",
      channelId: raw?.channelId ?? "",
      lengthSeconds: parseInt(raw?.lengthSeconds ?? "0", 10),
      viewCount: parseInt(raw?.viewCount ?? "0", 10),
      description: raw?.shortDescription ?? "",
      keywords: raw?.keywords ?? [],
      thumbnails: raw?.thumbnail?.thumbnails ?? [],
      isLiveContent: raw?.isLiveContent ?? false
    };
  }

  /**
   * Fetches and parses the transcript of a video.
   *
   * Uses `config.lang` when set, otherwise the first caption track. When
   * `config.cache` is set, a parseable cached entry is returned directly and
   * fresh results are stored (cache write failures are ignored).
   *
   * @param videoId Video ID or YouTube URL.
   * @returns The segment array, or `{ videoDetails, segments }` when
   *          `config.videoDetails === true`.
   * @throws {YoutubeTranscriptInvalidVideoIdError} For unusable input.
   * @throws {YoutubeTranscriptInvalidLangError} For a malformed `config.lang`.
   * @throws {YoutubeTranscriptNotAvailableLanguageError} When no track matches `config.lang`.
   * @throws {YoutubeTranscriptTooManyRequestError} When the transcript request returns 429.
   * @throws {YoutubeTranscriptNotAvailableError} When the transcript cannot be obtained or is empty.
   */
  async fetchTranscript(videoId) {
    const identifier = retrieveVideoId(videoId);
    const lang = this.config?.lang;
    if (lang) {
      validateLang(lang);
    }
    const userAgent = this.config?.userAgent ?? DEFAULT_USER_AGENT;
    const includeDetails = this.config?.videoDetails === true;
    const cache = this.config?.cache;
    const cacheTTL = this.config?.cacheTTL;
    const keyPrefix = includeDetails ? "yt:transcript+details" : "yt:transcript";
    const cacheKey = `${keyPrefix}:${identifier}:${lang ?? ""}`;

    if (cache) {
      const cached = await cache.get(cacheKey);
      if (cached) {
        try {
          return JSON.parse(cached);
        } catch {
          // Unparseable cache entry: fall through and fetch again.
        }
      }
    }

    const { tracks, playerJson } = await this._fetchCaptionTracks(identifier, lang);
    const selectedTrack = lang ? tracks.find((track) => track.languageCode === lang) : tracks[0];
    if (!selectedTrack) {
      const available = tracks.map((track) => track.languageCode).filter(Boolean);
      throw new YoutubeTranscriptNotAvailableLanguageError(lang, available, identifier);
    }

    const transcriptBaseURL = selectedTrack.baseUrl ?? selectedTrack.url;
    if (!transcriptBaseURL) {
      throw new YoutubeTranscriptNotAvailableError(identifier);
    }
    // Drop any "&fmt=" parameter from the track URL before requesting it.
    let transcriptURL = transcriptBaseURL.replace(/&fmt=[^&]+/, "");
    if (this.config?.disableHttps) {
      transcriptURL = transcriptURL.replace(/^https:\/\//, "http://");
    }

    const retries = this.config?.retries ?? 0;
    const retryDelay = this.config?.retryDelay ?? 1000;
    const signal = this.config?.signal;
    const transcriptFetchParams = { url: transcriptURL, lang, userAgent, signal };
    const transcriptResponse = await fetchWithRetry(
      () => this.config?.transcriptFetch ? this.config.transcriptFetch(transcriptFetchParams) : defaultFetch(transcriptFetchParams),
      retries,
      retryDelay,
      signal
    );
    if (!transcriptResponse.ok) {
      throw transcriptResponse.status === 429
        ? new YoutubeTranscriptTooManyRequestError()
        : new YoutubeTranscriptNotAvailableError(identifier);
    }

    const transcriptBody = await transcriptResponse.text();
    const segmentLang = lang ?? selectedTrack.languageCode;
    const segments = Array.from(transcriptBody.matchAll(RE_XML_TRANSCRIPT), (m) => ({
      text: decodeXmlEntities(m[3]),
      duration: parseFloat(m[2]),
      offset: parseFloat(m[1]),
      lang: segmentLang
    }));
    if (segments.length === 0) {
      throw new YoutubeTranscriptNotAvailableError(identifier);
    }

    const result = includeDetails
      ? { videoDetails: this._extractVideoDetails(playerJson, identifier), segments }
      : segments;
    if (cache) {
      try {
        await cache.set(cacheKey, JSON.stringify(result), cacheTTL);
      } catch {
        // Cache writes are best-effort.
      }
    }
    return result;
  }

  /**
   * Lists the caption tracks of a video.
   *
   * @param videoId Video ID or YouTube URL.
   * @returns One `{ languageCode, languageName, isAutoGenerated }` entry per
   *          track; `isAutoGenerated` is true for tracks of kind "asr".
   */
  async listLanguages(videoId) {
    const identifier = retrieveVideoId(videoId);
    const { tracks } = await this._fetchCaptionTracks(identifier);
    return tracks.map((track) => ({
      languageCode: track.languageCode,
      languageName: track.name?.simpleText ?? track.languageCode,
      isAutoGenerated: track.kind === "asr"
    }));
  }

  /** One-shot form of the instance method `fetchTranscript`. */
  static async fetchTranscript(videoId, config) {
    return new YoutubeTranscript(config).fetchTranscript(videoId);
  }

  /** One-shot form of the instance method `listLanguages`. */
  static async listLanguages(videoId, config) {
    return new YoutubeTranscript(config).listLanguages(videoId);
  }
}
|
|
354
|
+
/**
 * Module-level shortcut for `YoutubeTranscript.fetchTranscript`.
 *
 * @param videoId Video ID or YouTube URL.
 * @param config  Optional configuration forwarded to `YoutubeTranscript`.
 */
function fetchTranscript(videoId, config) {
  const pending = YoutubeTranscript.fetchTranscript(videoId, config);
  return pending;
}

// Module-level alias for the static `YoutubeTranscript.listLanguages`.
var listLanguages = YoutubeTranscript.listLanguages;
|
|
358
|
+
|
|
359
|
+
// src/format.ts
|
|
360
|
+
// Named character references that decodeEntities understands.
var namedEntities = {
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'"
};

/**
 * Formats an offset as zero-padded `MM:SS`. Minutes are not wrapped at 60,
 * so offsets of an hour or more yield values such as `75:30`.
 *
 * @param seconds Offset in seconds (fractions are truncated).
 */
function formatTime(seconds) {
  const minutes = Math.floor(seconds / 60);
  const secs = Math.floor(seconds % 60);
  return `${String(minutes).padStart(2, "0")}:${String(secs).padStart(2, "0")}`;
}

/**
 * Formats a duration as `H:MM:SS`, or `M:SS` when shorter than an hour.
 *
 * @param seconds Duration in seconds.
 */
function formatDuration(seconds) {
  const hours = Math.floor(seconds / 3600);
  const minutes = Math.floor(seconds % 3600 / 60);
  const secs = String(Math.floor(seconds % 60)).padStart(2, "0");
  if (hours > 0) {
    return `${hours}:${String(minutes).padStart(2, "0")}:${secs}`;
  }
  return `${minutes}:${secs}`;
}

/**
 * Decodes HTML character references in a single pass.
 *
 * Handles decimal (`&#39;`) and hexadecimal (`&#x27;`) references, including
 * code points above U+FFFF, and the names listed in `namedEntities`.
 * Unknown names and out-of-range code points are left as written. Because
 * decoding is single-pass, the output of one replacement is never decoded
 * again (`&#38;lt;` becomes `&lt;`, not `<`).
 *
 * @param text Text that may contain character references.
 * @returns The decoded text.
 */
function decodeEntities(text) {
  return text.replace(/&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(\w+));/g, (match, dec, hex, name) => {
    if (name !== undefined) {
      // Own-property check so names like "constructor" are not resolved
      // through Object.prototype.
      return Object.prototype.hasOwnProperty.call(namedEntities, name) ? namedEntities[name] : match;
    }
    const codePoint = dec !== undefined ? Number(dec) : parseInt(hex, 16);
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
  });
}

/**
 * Renders one `[MM:SS] text` line per segment, joined by newlines.
 *
 * @param segments Segments with `text` and `offset` (seconds).
 * @param decode   Whether to run `decodeEntities` on each text.
 */
function formatWithTimestamps(segments, decode) {
  return segments.map((segment) => {
    const text = decode ? decodeEntities(segment.text) : segment.text;
    return `[${formatTime(segment.offset)}] ${text}`;
  }).join("\n");
}

/**
 * Joins all segment texts with single spaces, collapsing runs of spaces.
 *
 * @param segments Segments with `text`.
 * @param decode   Whether to run `decodeEntities` on each text.
 */
function toText(segments, decode) {
  return segments.map((segment) => decode ? decodeEntities(segment.text) : segment.text).join(" ").replace(/ +/g, " ");
}

/**
 * Serializes segments as a pretty-printed JSON array of
 * `{ text, offset, duration, timestamp }` objects.
 *
 * @param segments Segments with `text`, `offset` and `duration`.
 * @param decode   Whether to run `decodeEntities` on each text.
 */
function toJSON(segments, decode) {
  const data = segments.map((segment) => ({
    text: decode ? decodeEntities(segment.text) : segment.text,
    offset: segment.offset,
    duration: segment.duration,
    timestamp: formatTime(segment.offset)
  }));
  return JSON.stringify(data, null, 2);
}
|
|
403
|
+
|
|
404
|
+
// src/index.ts
|
|
405
|
+
// Program name and version shown in the help text and by --version.
var NAME = "prosey";
var VERSION = "0.1.0";

/**
 * Builds the usage text printed for `--help` or when no arguments are given.
 *
 * @returns The complete help message as one newline-separated string.
 */
function help() {
  const header = [
    `${NAME} v${VERSION}`,
    "",
    `Usage: ${NAME} [options] <video-url-or-id>`,
    `${NAME} info [options] <video-url-or-id>`,
    "",
    "Download a YouTube video transcript or show video details.",
    ""
  ];
  const commands = [
    "Commands:",
    "info Show video metadata (title, channel, duration, etc.)",
    "",
    "Arguments:",
    "video-url-or-id YouTube URL (full or short) or bare video ID",
    ""
  ];
  const options = [
    "Options:",
    "--lang <code> Language code (e.g. en, fr). Auto-detect if omitted.",
    "-t, --timestamps Include timestamps [MM:SS] in output.",
    "--list List available transcript languages and exit.",
    "-o, --output <path> Write output to file instead of stdout.",
    "--json Output as JSON (suppresses details).",
    "--text Output as plain text (default).",
    "--details Prepend video details to transcript (default, text only).",
    "--no-details Suppress video details, transcript only.",
    "--no-decode-entities Preserve HTML entities (decoded by default).",
    "--help Show this help message.",
    "--version Show version.",
    ""
  ];
  const examples = [
    "Examples:",
    `${NAME} dQw4w9WgXcQ`,
    `${NAME} https://www.youtube.com/watch?v=dQw4w9WgXcQ --lang es`,
    `${NAME} dQw4w9WgXcQ -t -o transcript.txt`,
    `${NAME} dQw4w9WgXcQ --list`,
    `${NAME} dQw4w9WgXcQ --json`,
    `${NAME} dQw4w9WgXcQ --no-details`,
    `${NAME} info dQw4w9WgXcQ`
  ];
  return [...header, ...commands, ...options, ...examples].join("\n");
}
|
|
443
|
+
/**
 * Builds the details header placed in front of a text transcript: title,
 * channel, duration, views and, when present, the description cut to at
 * most 500 characters (with a trailing ellipsis when cut).
 *
 * @param details Video details with `title`, `author`, `lengthSeconds`,
 *                `viewCount` and `description`.
 * @returns The header lines joined by newlines.
 */
function formatDetailsBlock(details) {
  const out = [];
  out.push(`Title: ${decodeEntities(details.title)}`);
  out.push(`Channel: ${details.author}`);
  out.push(`Duration: ${formatDuration(details.lengthSeconds)}`);
  out.push(`Views: ${details.viewCount.toLocaleString()}`);

  const { description } = details;
  if (description) {
    let desc = description;
    if (description.length > 500) {
      desc = description.slice(0, 500) + "…";
    }
    // NOTE(review): as written this replaces each newline with a newline;
    // presumably continuation-line indentation was intended — confirm.
    out.push("Description:\n" + desc.replace(/\n/g, "\n"));
  }
  return out.join("\n");
}
|
|
459
|
+
/**
 * Prints video metadata to stdout as label/value rows, with labels padded to
 * a common width. Keywords and description are printed only when non-empty;
 * each non-empty description line goes on its own row.
 *
 * @param details Video details with `title`, `author`, `lengthSeconds`,
 *                `viewCount`, `videoId`, `channelId`, `keywords` and `description`.
 */
function printVideoInfo(details) {
  const labels = ["Title:", "Channel:", "Duration:", "Views:", "Video ID:", "Channel ID:", "Keywords:", "Description:"];
  const width = Math.max(...labels.map((label) => label.length));
  const row = (label, value) => `${label.padEnd(width)} ${value}`;

  const out = [
    row("Title:", decodeEntities(details.title)),
    row("Channel:", details.author),
    row("Duration:", formatDuration(details.lengthSeconds)),
    row("Views:", details.viewCount.toLocaleString()),
    row("Video ID:", details.videoId),
    row("Channel ID:", details.channelId)
  ];
  if (details.keywords.length > 0) {
    out.push(row("Keywords:", details.keywords.join(", ")));
  }
  if (details.description) {
    out.push("Description:".padEnd(width));
    details.description
      .split("\n")
      .filter(Boolean)
      .forEach((line) => {
        out.push(` ${line}`);
      });
  }
  console.log(out.join("\n"));
}
|
|
484
|
+
/**
 * Prints the available transcript languages to stdout: a count header
 * followed by one row per language (code padded to 8 columns, name, and an
 * "(auto-generated)" marker where applicable).
 *
 * @param languages Entries with `languageCode`, `languageName` and `isAutoGenerated`.
 */
function printLanguages(languages) {
  const rows = [];
  for (const { languageCode, languageName, isAutoGenerated } of languages) {
    let row = ` ${languageCode.padEnd(8)}${languageName}`;
    if (isAutoGenerated) {
      row += " (auto-generated)";
    }
    rows.push(row);
  }
  const header = `Available transcripts (${languages.length}):`;
  console.log(header + "\n" + rows.join("\n"));
}
|
|
493
|
+
/**
 * Turn the raw command-line arguments into an options object.
 *
 * A leading `info` selects info mode. Any argument that does not start with
 * "-" is taken as the video URL/ID; if several are given the last one wins.
 *
 * @param argv Arguments after the script name.
 * @returns `{ options }` on success or `{ error }` with the message to print.
 */
function parseProseyArgs(argv) {
  const args = [...argv];
  const options = {
    mode: "transcript",
    videoId: "",
    lang: undefined,
    timestamps: false,
    listOnly: false,
    outputPath: undefined,
    outputJson: false,
    noDecode: false,
    showDetails: true
  };
  if (args[0] === "info") {
    options.mode = "info";
    args.shift();
  }
  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (!arg)
      continue;
    switch (arg) {
      case "--lang": {
        const value = args[++i];
        // A following flag (e.g. `--lang --json`) is a missing value, not a
        // language code.
        if (!value || value.startsWith("-")) {
          return { error: "Error: --lang requires a language code" };
        }
        options.lang = value;
        break;
      }
      case "--timestamps":
      case "-t":
        options.timestamps = true;
        break;
      case "--list":
        options.listOnly = true;
        break;
      case "-o":
      case "--output": {
        const value = args[++i];
        if (!value) {
          return { error: "Error: -o/--output requires a file path" };
        }
        options.outputPath = value;
        break;
      }
      case "--json":
        options.outputJson = true;
        break;
      case "--text":
        options.outputJson = false;
        break;
      case "--details":
        options.showDetails = true;
        break;
      case "--no-details":
        options.showDetails = false;
        break;
      case "--no-decode-entities":
        options.noDecode = true;
        break;
      default:
        if (arg.startsWith("-")) {
          return { error: `Unknown option: ${arg}` };
        }
        options.videoId = arg;
    }
  }
  return { options };
}

/**
 * Deliver the finished output either to a file or to stdout.
 *
 * `output` already ends with a newline, so it is written verbatim; using
 * `console.log` here would append a second one and make stdout differ from
 * what `-o` writes to disk.
 */
async function emitProseyOutput(output, outputPath) {
  if (outputPath) {
    await writeFile(outputPath, output, "utf8");
    return;
  }
  process.stdout.write(output);
}

/**
 * Run the CLI and report the exit status instead of calling `process.exit()`.
 *
 * @param argv Arguments after the script name.
 * @returns 0 on success, 1 on a usage or runtime error.
 */
async function runProseyCli(argv) {
  if (argv.length === 0 || argv.includes("--help")) {
    console.log(help());
    return 0;
  }
  if (argv.includes("--version")) {
    console.log(VERSION);
    return 0;
  }

  const parsed = parseProseyArgs(argv);
  if (parsed.error) {
    console.error(parsed.error);
    return 1;
  }
  const { mode, videoId, lang, timestamps, listOnly, outputPath, outputJson, noDecode, showDetails } = parsed.options;

  if (!videoId) {
    console.error("Error: missing video URL or ID");
    console.log(help());
    return 1;
  }

  try {
    if (mode === "info") {
      const result = await fetchTranscript(videoId, { videoDetails: true, lang });
      if (outputJson) {
        console.log(JSON.stringify(result.videoDetails, null, 2));
      } else {
        printVideoInfo(result.videoDetails);
      }
      return 0;
    }

    if (listOnly) {
      printLanguages(await listLanguages(videoId));
      return 0;
    }

    const decode = !noDecode;

    if (showDetails && !outputJson) {
      // Text output with the details header in front of the transcript.
      const config = lang ? { lang, videoDetails: true } : { videoDetails: true };
      const result = await fetchTranscript(videoId, config);
      const detailsBlock = formatDetailsBlock(result.videoDetails);
      const transcript = timestamps ? formatWithTimestamps(result.segments, decode) : toText(result.segments, decode);
      await emitProseyOutput(detailsBlock + "\n\n\n" + transcript + "\n", outputPath);
      return 0;
    }

    const segments = lang ? await fetchTranscript(videoId, { lang }) : await fetchTranscript(videoId);
    const body = outputJson ? toJSON(segments, decode) : timestamps ? formatWithTimestamps(segments, decode) : toText(segments, decode);
    await emitProseyOutput(body + "\n", outputPath);
    return 0;
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    console.error(`Error: ${message}`);
    return 1;
  }
}

// Set the exit code and let the process end on its own: process.exit() right
// after a write can cut off output that has not been flushed yet.
process.exitCode = await runProseyCli(process.argv.slice(2));
|
package/package.json
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@tacone/prosey",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Download YouTube video transcripts from the CLI",
|
|
5
|
+
"module": "src/index.ts",
|
|
6
|
+
"type": "module",
|
|
7
|
+
"bin": {
|
|
8
|
+
"prosey": "bin/prosey.js"
|
|
9
|
+
},
|
|
10
|
+
"files": [
|
|
11
|
+
"bin/",
|
|
12
|
+
"src/",
|
|
13
|
+
"package.json",
|
|
14
|
+
"README.md",
|
|
15
|
+
"LICENSE"
|
|
16
|
+
],
|
|
17
|
+
"scripts": {
|
|
18
|
+
"start": "bun run src/index.ts",
|
|
19
|
+
"build": "bun build src/index.ts --compile --outfile dist/prosey",
|
|
20
|
+
"build:node": "bun build src/index.ts --target node --outfile bin/prosey.js",
|
|
21
|
+
"test": "bun test",
|
|
22
|
+
"typecheck": "tsc --noEmit",
|
|
23
|
+
"prettier": "prettier --write .",
|
|
24
|
+
"prepack": "bun run build:node",
|
|
25
|
+
"prepare": "husky || true"
|
|
26
|
+
},
|
|
27
|
+
"lint-staged": {
|
|
28
|
+
"*": "prettier --write --ignore-unknown"
|
|
29
|
+
},
|
|
30
|
+
"author": "tacone <tacone@gmail.com>",
|
|
31
|
+
"license": "MIT",
|
|
32
|
+
"repository": {
|
|
33
|
+
"type": "git",
|
|
34
|
+
"url": "git+https://github.com/tacone/prosey.git"
|
|
35
|
+
},
|
|
36
|
+
"bugs": {
|
|
37
|
+
"url": "https://github.com/tacone/prosey/issues"
|
|
38
|
+
},
|
|
39
|
+
"homepage": "https://github.com/tacone/prosey#readme",
|
|
40
|
+
"keywords": [
|
|
41
|
+
"youtube",
|
|
42
|
+
"transcript",
|
|
43
|
+
"subtitles",
|
|
44
|
+
"captions",
|
|
45
|
+
"cli",
|
|
46
|
+
"npx",
|
|
47
|
+
"bun"
|
|
48
|
+
],
|
|
49
|
+
"engines": {
|
|
50
|
+
"node": ">=20"
|
|
51
|
+
},
|
|
52
|
+
"devDependencies": {
|
|
53
|
+
"@types/bun": "latest",
|
|
54
|
+
"husky": "^9.1.7",
|
|
55
|
+
"lint-staged": "^17.0.7",
|
|
56
|
+
"prettier": "^3.8.4"
|
|
57
|
+
},
|
|
58
|
+
"peerDependencies": {
|
|
59
|
+
"typescript": "^5"
|
|
60
|
+
},
|
|
61
|
+
"dependencies": {
|
|
62
|
+
"youtube-transcript-plus": "^2.0.0"
|
|
63
|
+
}
|
|
64
|
+
}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import { describe, expect, test } from "bun:test";
|
|
2
|
+
import type { TranscriptSegment } from "youtube-transcript-plus";
|
|
3
|
+
import { formatTime, decodeEntities, formatWithTimestamps, toText, toJSON } from "./format";
|
|
4
|
+
|
|
5
|
+
// NOTE(review): the entity literals in this file had been replaced by their
// decoded characters, which made the "decodes" and "preserves entities"
// tests assert the same strings. They are restored here as `&#39;`,
// `&amp;` and `&#38;`; confirm the exact spellings against the repository.
const segments: TranscriptSegment[] = [
  { text: "Hello world", offset: 1.5, duration: 2.0, lang: "en" },
  { text: "This is &#39;text&#39;", offset: 10, duration: 3.0, lang: "en" },
  { text: "Line three", offset: 60, duration: 5.0, lang: "en" },
];

describe("formatTime", () => {
  test("zero", () => expect(formatTime(0)).toBe("00:00"));
  test("seconds only", () => expect(formatTime(5)).toBe("00:05"));
  test("minute boundary", () => expect(formatTime(60)).toBe("01:00"));
  test("minutes and seconds", () => expect(formatTime(65)).toBe("01:05"));
  test("long duration", () => expect(formatTime(3661)).toBe("61:01"));
  test("fractional truncated", () => expect(formatTime(90.7)).toBe("01:30"));
});

describe("decodeEntities", () => {
  test("plain text unchanged", () => expect(decodeEntities("hello")).toBe("hello"));
  test("apostrophe", () => expect(decodeEntities("&#39;")).toBe("'"));
  test("ampersand", () => expect(decodeEntities("&amp;")).toBe("&"));
  test("multiple entities", () =>
    expect(decodeEntities("&#39;hello&#39; &amp; &#39;world&#39;")).toBe("'hello' & 'world'"));
  test("numeric entity", () => expect(decodeEntities("&#38;")).toBe("&"));
});

describe("formatWithTimestamps", () => {
  test("includes timestamps and decodes by default", () => {
    const result = formatWithTimestamps(segments, true);
    expect(result).toBe("[00:01] Hello world\n[00:10] This is 'text'\n[01:00] Line three");
  });

  test("skips decoding when false", () => {
    const result = formatWithTimestamps(segments, false);
    expect(result).toBe(
      "[00:01] Hello world\n[00:10] This is &#39;text&#39;\n[01:00] Line three",
    );
  });
});

describe("toText", () => {
  test("joins segments with space and decodes", () => {
    const result = toText(segments, true);
    expect(result).toBe("Hello world This is 'text' Line three");
  });

  test("skips decoding when false", () => {
    const result = toText(segments, false);
    expect(result).toBe("Hello world This is &#39;text&#39; Line three");
  });
});

describe("toJSON", () => {
  test("includes all fields and always has timestamp", () => {
    const result = JSON.parse(toJSON(segments, true));
    expect(result).toHaveLength(3);
    expect(result[0]).toEqual({
      text: "Hello world",
      offset: 1.5,
      duration: 2.0,
      timestamp: "00:01",
    });
  });

  test("decodes entities by default", () => {
    const result = JSON.parse(toJSON(segments, true));
    expect(result[1].text).toBe("This is 'text'");
  });

  test("preserves entities when decode is false", () => {
    const result = JSON.parse(toJSON(segments, false));
    expect(result[1].text).toBe("This is &#39;text&#39;");
  });
});
|
package/src/format.ts
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import type { TranscriptSegment } from "youtube-transcript-plus";
|
|
2
|
+
|
|
3
|
+
/** Named character references that `decodeEntities` understands. */
const namedEntities: Record<string, string> = {
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
};

/**
 * One character reference: decimal (`&#39;`), hexadecimal (`&#x27;`) or
 * named (`&amp;`). Exactly one of the three capture groups is set per match.
 */
const ENTITY_PATTERN = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(\w+));/g;

/** Left-pad a number with zeros to at least two digits. */
const pad2 = (value: number): string => String(value).padStart(2, "0");

/**
 * Format an offset in seconds as `MM:SS`.
 *
 * Minutes are not wrapped at 60, so one hour and one minute is `61:01`.
 * Fractions of a second are dropped.
 */
export function formatTime(seconds: number): string {
  const minutes = Math.floor(seconds / 60);
  const secs = Math.floor(seconds % 60);
  return `${pad2(minutes)}:${pad2(secs)}`;
}

/**
 * Format a length in seconds as `H:MM:SS`, or as `M:SS` when it is shorter
 * than one hour. The leading unit is not zero-padded.
 */
export function formatDuration(seconds: number): string {
  const hours = Math.floor(seconds / 3600);
  const minutes = Math.floor((seconds % 3600) / 60);
  const secs = Math.floor(seconds % 60);
  if (hours > 0) {
    return `${hours}:${pad2(minutes)}:${pad2(secs)}`;
  }
  return `${minutes}:${pad2(secs)}`;
}

/**
 * Replace HTML character references in `text` with the characters they
 * stand for.
 *
 * Decimal and hexadecimal numeric references and the names listed in
 * `namedEntities` are decoded; anything else is left exactly as written.
 * The text is scanned once, so the result of one replacement is never
 * decoded a second time (`&#38;lt;` becomes `&lt;`, not `<`).
 *
 * @param text Text that may contain character references.
 * @returns The text with every recognised reference decoded.
 */
export function decodeEntities(text: string): string {
  return text.replace(
    ENTITY_PATTERN,
    (match: string, dec?: string, hex?: string, name?: string): string => {
      if (name !== undefined) {
        // Own-property check: a plain index lookup would also find inherited
        // members such as `constructor` or `toString` and print a function.
        const known = Object.prototype.hasOwnProperty.call(namedEntities, name);
        return (known ? namedEntities[name] : undefined) ?? match;
      }
      const codePoint =
        dec !== undefined ? Number.parseInt(dec, 10) : Number.parseInt(hex ?? "", 16);
      // Leave references alone that do not denote a Unicode scalar value:
      // out-of-range numbers would make fromCodePoint throw, and surrogates
      // would put an unpaired half into the output.
      const isScalar =
        Number.isInteger(codePoint) &&
        codePoint >= 0 &&
        codePoint <= 0x10ffff &&
        !(codePoint >= 0xd800 && codePoint <= 0xdfff);
      // fromCodePoint (unlike fromCharCode) handles code points above U+FFFF.
      return isScalar ? String.fromCodePoint(codePoint) : match;
    },
  );
}
|
|
32
|
+
|
|
33
|
+
/**
 * Render a transcript with one segment per line, each prefixed by its start
 * time in square brackets.
 *
 * @param segments Transcript segments in display order.
 * @param decode When true, each segment's text is passed through
 *   `decodeEntities` first.
 * @returns The lines joined with "\n" (no trailing newline).
 */
export function formatWithTimestamps(segments: TranscriptSegment[], decode: boolean): string {
  const rendered: string[] = [];
  for (const segment of segments) {
    const body = decode ? decodeEntities(segment.text) : segment.text;
    rendered.push(`[${formatTime(segment.offset)}] ${body}`);
  }
  return rendered.join("\n");
}
|
|
41
|
+
|
|
42
|
+
/**
 * Flatten a transcript into a single line of running text.
 *
 * Segment texts are joined with one space, and any run of spaces in the
 * result is then collapsed to a single space.
 *
 * @param segments Transcript segments in display order.
 * @param decode When true, each segment's text is passed through
 *   `decodeEntities` first.
 */
export function toText(segments: TranscriptSegment[], decode: boolean): string {
  const pieces: string[] = [];
  for (const segment of segments) {
    pieces.push(decode ? decodeEntities(segment.text) : segment.text);
  }
  const joined = pieces.join(" ");
  return joined.replace(/ +/g, " ");
}
|
|
48
|
+
|
|
49
|
+
/**
 * Serialise a transcript as a pretty-printed JSON array.
 *
 * Each element carries `text`, `offset`, `duration` and a `timestamp` string
 * produced by `formatTime` from the offset; other segment fields are omitted.
 *
 * @param segments Transcript segments in display order.
 * @param decode When true, each segment's text is passed through
 *   `decodeEntities` first.
 * @returns JSON text indented with two spaces.
 */
export function toJSON(segments: TranscriptSegment[], decode: boolean): string {
  const rows: { text: string; offset: number; duration: number; timestamp: string }[] = [];
  for (const { text, offset, duration } of segments) {
    rows.push({
      text: decode ? decodeEntities(text) : text,
      offset,
      duration,
      timestamp: formatTime(offset),
    });
  }
  return JSON.stringify(rows, null, 2);
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { writeFile } from "node:fs/promises";
|
|
4
|
+
import { fetchTranscript, listLanguages } from "youtube-transcript-plus";
|
|
5
|
+
import type { CaptionTrackInfo, VideoDetails } from "youtube-transcript-plus";
|
|
6
|
+
import { formatWithTimestamps, toText, toJSON, formatDuration, decodeEntities } from "./format";
|
|
7
|
+
|
|
8
|
+
/** Program name shown in the usage text. */
const NAME = "prosey";
/** Version reported by `--version` and in the usage header. */
const VERSION = "0.1.0";

/**
 * Build the usage text printed for `--help`, for an empty command line and
 * after a missing-argument error.
 *
 * @returns The complete help text, without a trailing newline.
 */
function help(): string {
  const header = [
    `${NAME} v${VERSION}`,
    "",
    `Usage: ${NAME} [options] <video-url-or-id>`,
    `${NAME} info [options] <video-url-or-id>`,
    "",
    "Download a YouTube video transcript or show video details.",
  ];

  const commands = [
    "Commands:",
    "info Show video metadata (title, channel, duration, etc.)",
  ];

  const positional = [
    "Arguments:",
    "video-url-or-id YouTube URL (full or short) or bare video ID",
  ];

  const options = [
    "Options:",
    "--lang <code> Language code (e.g. en, fr). Auto-detect if omitted.",
    "-t, --timestamps Include timestamps [MM:SS] in output.",
    "--list List available transcript languages and exit.",
    "-o, --output <path> Write output to file instead of stdout.",
    "--json Output as JSON (suppresses details).",
    "--text Output as plain text (default).",
    "--details Prepend video details to transcript (default, text only).",
    "--no-details Suppress video details, transcript only.",
    "--no-decode-entities Preserve HTML entities (decoded by default).",
    "--help Show this help message.",
    "--version Show version.",
  ];

  const examples = [
    "Examples:",
    `${NAME} dQw4w9WgXcQ`,
    `${NAME} https://www.youtube.com/watch?v=dQw4w9WgXcQ --lang es`,
    `${NAME} dQw4w9WgXcQ -t -o transcript.txt`,
    `${NAME} dQw4w9WgXcQ --list`,
    `${NAME} dQw4w9WgXcQ --json`,
    `${NAME} dQw4w9WgXcQ --no-details`,
    `${NAME} info dQw4w9WgXcQ`,
  ];

  // Sections are separated by one blank line.
  return [header, commands, positional, options, examples]
    .map((section) => section.join("\n"))
    .join("\n\n");
}
|
|
47
|
+
|
|
48
|
+
/**
 * Build the plain-text header that is placed in front of a transcript.
 *
 * The header always lists title, channel, duration and view count. When the
 * video has a description it is appended, shortened to at most 500 UTF-16
 * code units followed by an ellipsis.
 *
 * @param details Video metadata; reads `title`, `author`, `lengthSeconds`,
 *   `viewCount` and `description`.
 * @returns The header lines joined with "\n" (no trailing newline).
 */
function formatDetailsBlock(details: VideoDetails): string {
  const descriptionLimit = 500;
  const lines: string[] = [
    `Title: ${decodeEntities(details.title)}`,
    `Channel: ${details.author}`,
    `Duration: ${formatDuration(details.lengthSeconds)}`,
    `Views: ${details.viewCount.toLocaleString()}`,
  ];

  if (details.description) {
    let desc = details.description;
    if (desc.length > descriptionLimit) {
      let cut = descriptionLimit;
      // Never cut between the two halves of a surrogate pair (emoji and other
      // astral characters); a lone high surrogate is not valid text and ends
      // up as U+FFFD once the output is encoded as UTF-8.
      const lastUnit = desc.charCodeAt(cut - 1);
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        cut -= 1;
      }
      desc = desc.slice(0, cut) + "…";
    }
    // Continuation lines of the description get the same indent as the first.
    lines.push(`Description:\n ${desc.replace(/\n/g, "\n ")}`);
  }

  return lines.join("\n");
}
|
|
66
|
+
|
|
67
|
+
/**
 * Print the metadata of a video to stdout as an aligned "label value" table.
 *
 * Keywords are listed only when there is at least one; the description is
 * printed line by line underneath its label, with empty lines dropped.
 *
 * @param details Video metadata; reads `title`, `author`, `lengthSeconds`,
 *   `viewCount`, `videoId`, `channelId`, `keywords` and `description`.
 */
function printVideoInfo(details: VideoDetails): void {
  const labels = [
    "Title:",
    "Channel:",
    "Duration:",
    "Views:",
    "Video ID:",
    "Channel ID:",
    "Keywords:",
    "Description:",
  ];
  // Every label is padded to the width of the longest one.
  const width = labels.reduce((widest, label) => Math.max(widest, label.length), 0);
  const row = (label: string, value: unknown): string => `${label.padEnd(width)} ${value}`;

  const out: string[] = [
    row("Title:", decodeEntities(details.title)),
    row("Channel:", details.author),
    row("Duration:", formatDuration(details.lengthSeconds)),
    row("Views:", details.viewCount.toLocaleString()),
    row("Video ID:", details.videoId),
    row("Channel ID:", details.channelId),
  ];

  if (details.keywords.length > 0) {
    out.push(row("Keywords:", details.keywords.join(", ")));
  }

  if (details.description) {
    out.push("Description:".padEnd(width));
    details.description.split("\n").forEach((line) => {
      if (line) {
        out.push(` ${line}`);
      }
    });
  }

  console.log(out.join("\n"));
}
|
|
103
|
+
|
|
104
|
+
/**
 * Print the available caption tracks to stdout, one per line, under a
 * heading that states how many there are.
 *
 * @param languages Caption tracks; reads `languageCode`, `languageName` and
 *   `isAutoGenerated` from each entry.
 */
function printLanguages(languages: CaptionTrackInfo[]): void {
  const rows: string[] = [];
  for (const track of languages) {
    // The language code is padded to 8 columns so the names line up.
    let row = ` ${track.languageCode.padEnd(8)}${track.languageName}`;
    if (track.isAutoGenerated) {
      row += " (auto-generated)";
    }
    rows.push(row);
  }
  console.log(`Available transcripts (${languages.length}):\n${rows.join("\n")}`);
}
|
|
111
|
+
|
|
112
|
+
/** Settings gathered from the command line. */
interface ProseyOptions {
  mode: "transcript" | "info";
  videoId: string;
  lang: string | undefined;
  timestamps: boolean;
  listOnly: boolean;
  outputPath: string | undefined;
  outputJson: boolean;
  noDecode: boolean;
  showDetails: boolean;
}

/** Outcome of argument parsing: the options, or the message to print. */
type ProseyParseResult =
  | { ok: true; options: ProseyOptions }
  | { ok: false; message: string };

/**
 * Turn the raw command-line arguments into a `ProseyOptions` object.
 *
 * A leading `info` selects info mode. Any argument that does not start with
 * "-" is taken as the video URL/ID; if several are given the last one wins.
 *
 * @param argv Arguments after the script name.
 */
function parseProseyArgs(argv: readonly string[]): ProseyParseResult {
  const args = [...argv];
  const options: ProseyOptions = {
    mode: "transcript",
    videoId: "",
    lang: undefined,
    timestamps: false,
    listOnly: false,
    outputPath: undefined,
    outputJson: false,
    noDecode: false,
    showDetails: true,
  };

  if (args[0] === "info") {
    options.mode = "info";
    args.shift();
  }

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (!arg) continue;
    switch (arg) {
      case "--lang": {
        const value = args[++i];
        // A following flag (e.g. `--lang --json`) is a missing value, not a
        // language code.
        if (!value || value.startsWith("-")) {
          return { ok: false, message: "Error: --lang requires a language code" };
        }
        options.lang = value;
        break;
      }
      case "--timestamps":
      case "-t":
        options.timestamps = true;
        break;
      case "--list":
        options.listOnly = true;
        break;
      case "-o":
      case "--output": {
        const value = args[++i];
        if (!value) {
          return { ok: false, message: "Error: -o/--output requires a file path" };
        }
        options.outputPath = value;
        break;
      }
      case "--json":
        options.outputJson = true;
        break;
      case "--text":
        options.outputJson = false;
        break;
      case "--details":
        options.showDetails = true;
        break;
      case "--no-details":
        options.showDetails = false;
        break;
      case "--no-decode-entities":
        options.noDecode = true;
        break;
      default:
        if (arg.startsWith("-")) {
          return { ok: false, message: `Unknown option: ${arg}` };
        }
        options.videoId = arg;
    }
  }

  return { ok: true, options };
}

/**
 * Deliver the finished output either to a file or to stdout.
 *
 * `output` already ends with a newline, so it is written verbatim; using
 * `console.log` here would append a second one and make stdout differ from
 * what `-o` writes to disk.
 */
async function emitProseyOutput(output: string, outputPath: string | undefined): Promise<void> {
  if (outputPath) {
    await writeFile(outputPath, output, "utf8");
    return;
  }
  process.stdout.write(output);
}

/**
 * Run the CLI and report the exit status instead of calling `process.exit()`.
 *
 * @param argv Arguments after the script name.
 * @returns 0 on success, 1 on a usage or runtime error.
 */
async function runProseyCli(argv: readonly string[]): Promise<number> {
  if (argv.length === 0 || argv.includes("--help")) {
    console.log(help());
    return 0;
  }
  if (argv.includes("--version")) {
    console.log(VERSION);
    return 0;
  }

  const parsed = parseProseyArgs(argv);
  if (!parsed.ok) {
    console.error(parsed.message);
    return 1;
  }
  const {
    mode,
    videoId,
    lang,
    timestamps,
    listOnly,
    outputPath,
    outputJson,
    noDecode,
    showDetails,
  } = parsed.options;

  if (!videoId) {
    console.error("Error: missing video URL or ID");
    console.log(help());
    return 1;
  }

  try {
    if (mode === "info") {
      // NOTE(review): the `as any` cast is carried over unchanged; presumably
      // the library's option type does not accept `lang: undefined` — confirm
      // and build the config as in the transcript branch to drop the cast.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const result = await fetchTranscript(videoId, { videoDetails: true, lang } as any);
      if (outputJson) {
        console.log(JSON.stringify(result.videoDetails, null, 2));
      } else {
        printVideoInfo(result.videoDetails);
      }
      return 0;
    }

    if (listOnly) {
      printLanguages(await listLanguages(videoId));
      return 0;
    }

    const decode = !noDecode;

    if (showDetails && !outputJson) {
      // Text output with the details header in front of the transcript.
      const config = lang
        ? { lang, videoDetails: true as const }
        : { videoDetails: true as const };
      const result = (await fetchTranscript(videoId, config)) as {
        videoDetails: VideoDetails;
        segments: { text: string; offset: number; duration: number; lang: string }[];
      };
      const detailsBlock = formatDetailsBlock(result.videoDetails);
      const transcript = timestamps
        ? formatWithTimestamps(result.segments, decode)
        : toText(result.segments, decode);
      await emitProseyOutput(detailsBlock + "\n\n\n" + transcript + "\n", outputPath);
      return 0;
    }

    const segments = lang
      ? await fetchTranscript(videoId, { lang })
      : await fetchTranscript(videoId);
    const body = outputJson
      ? toJSON(segments, decode)
      : timestamps
        ? formatWithTimestamps(segments, decode)
        : toText(segments, decode);
    await emitProseyOutput(body + "\n", outputPath);
    return 0;
  } catch (err: unknown) {
    const message = err instanceof Error ? err.message : String(err);
    console.error(`Error: ${message}`);
    return 1;
  }
}

// Set the exit code and let the process end on its own: process.exit() right
// after a write can cut off output that has not been flushed yet.
process.exitCode = await runProseyCli(process.argv.slice(2));
|