@nadimtuhin/ytranscript 1.0.2 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +210 -123
- package/dist/cli.d.ts +6 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +104 -51
- package/dist/index.d.ts +30 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +63 -25
- package/dist/lib/fetcher.d.ts +26 -0
- package/dist/lib/fetcher.d.ts.map +1 -0
- package/dist/lib/fs.d.ts +20 -0
- package/dist/lib/fs.d.ts.map +1 -0
- package/dist/lib/processor.d.ts +14 -0
- package/dist/lib/processor.d.ts.map +1 -0
- package/dist/loaders/history.d.ts +9 -0
- package/dist/loaders/history.d.ts.map +1 -0
- package/dist/loaders/index.d.ts +20 -0
- package/dist/loaders/index.d.ts.map +1 -0
- package/dist/loaders/watchLater.d.ts +9 -0
- package/dist/loaders/watchLater.d.ts.map +1 -0
- package/dist/mcp.d.ts +8 -0
- package/dist/mcp.d.ts.map +1 -0
- package/dist/mcp.js +24 -7
- package/dist/outputs/index.d.ts +30 -0
- package/dist/outputs/index.d.ts.map +1 -0
- package/dist/types.d.ts +93 -0
- package/dist/types.d.ts.map +1 -0
- package/package.json +6 -6
package/dist/cli.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
#!/usr/bin/env
|
|
2
|
-
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import {createRequire} from "node:module";
|
|
3
3
|
var __create = Object.create;
|
|
4
4
|
var __getProtoOf = Object.getPrototypeOf;
|
|
5
5
|
var __defProp = Object.defineProperty;
|
|
@@ -17,6 +17,7 @@ var __toESM = (mod, isNodeMode, target) => {
|
|
|
17
17
|
return to;
|
|
18
18
|
};
|
|
19
19
|
var __commonJS = (cb, mod) => () => (mod || cb((mod = { exports: {} }).exports, mod), mod.exports);
|
|
20
|
+
var __require = /* @__PURE__ */ createRequire(import.meta.url);
|
|
20
21
|
|
|
21
22
|
// node_modules/commander/lib/error.js
|
|
22
23
|
var require_error = __commonJS((exports) => {
|
|
@@ -616,11 +617,11 @@ var require_command = __commonJS((exports) => {
|
|
|
616
617
|
return arg;
|
|
617
618
|
});
|
|
618
619
|
}
|
|
619
|
-
var EventEmitter =
|
|
620
|
-
var childProcess =
|
|
621
|
-
var path =
|
|
622
|
-
var fs =
|
|
623
|
-
var process2 =
|
|
620
|
+
var EventEmitter = __require("node:events").EventEmitter;
|
|
621
|
+
var childProcess = __require("node:child_process");
|
|
622
|
+
var path = __require("node:path");
|
|
623
|
+
var fs = __require("node:fs");
|
|
624
|
+
var process2 = __require("node:process");
|
|
624
625
|
var { Argument, humanReadableArgName } = require_argument();
|
|
625
626
|
var { CommanderError } = require_error();
|
|
626
627
|
var { Help } = require_help();
|
|
@@ -1844,6 +1845,9 @@ var require_commander = __commonJS((exports) => {
|
|
|
1844
1845
|
exports.InvalidOptionArgumentError = InvalidArgumentError;
|
|
1845
1846
|
});
|
|
1846
1847
|
|
|
1848
|
+
// src/cli.ts
|
|
1849
|
+
import {readFile as readFile2, writeFile as writeFile2} from "node:fs/promises";
|
|
1850
|
+
|
|
1847
1851
|
// node_modules/commander/esm.mjs
|
|
1848
1852
|
var import_ = __toESM(require_commander(), 1);
|
|
1849
1853
|
var {
|
|
@@ -1860,9 +1864,10 @@ var {
|
|
|
1860
1864
|
Help
|
|
1861
1865
|
} = import_.default;
|
|
1862
1866
|
// package.json
|
|
1863
|
-
var version = "1.0
|
|
1867
|
+
var version = "1.2.0";
|
|
1864
1868
|
|
|
1865
1869
|
// src/lib/fetcher.ts
|
|
1870
|
+
import {createRequire as createRequire2} from "node:module";
|
|
1866
1871
|
function extractVideoId(input) {
|
|
1867
1872
|
if (/^[a-zA-Z0-9_-]{11}$/.test(input)) {
|
|
1868
1873
|
return input;
|
|
@@ -1888,9 +1893,20 @@ function extractVideoId(input) {
|
|
|
1888
1893
|
}
|
|
1889
1894
|
return null;
|
|
1890
1895
|
}
|
|
1891
|
-
|
|
1896
|
+
function createProxyAgent(proxy) {
|
|
1897
|
+
if (proxy) {
|
|
1898
|
+
return new ProxyAgent(proxy.url);
|
|
1899
|
+
}
|
|
1900
|
+
const envProxy = process.env.HTTP_PROXY || process.env.http_proxy || process.env.HTTPS_PROXY || process.env.https_proxy;
|
|
1901
|
+
if (envProxy) {
|
|
1902
|
+
return new ProxyAgent(envProxy);
|
|
1903
|
+
}
|
|
1904
|
+
return;
|
|
1905
|
+
}
|
|
1906
|
+
async function fetchPlayerResponse(videoId, timeout, proxy) {
|
|
1892
1907
|
const controller = new AbortController;
|
|
1893
1908
|
const timeoutId = setTimeout(() => controller.abort(), timeout);
|
|
1909
|
+
const dispatcher = createProxyAgent(proxy);
|
|
1894
1910
|
try {
|
|
1895
1911
|
const response = await fetch("https://www.youtube.com/youtubei/v1/player?prettyPrint=false", {
|
|
1896
1912
|
method: "POST",
|
|
@@ -1907,7 +1923,8 @@ async function fetchPlayerResponse(videoId, timeout) {
|
|
|
1907
1923
|
},
|
|
1908
1924
|
videoId
|
|
1909
1925
|
}),
|
|
1910
|
-
signal: controller.signal
|
|
1926
|
+
signal: controller.signal,
|
|
1927
|
+
...dispatcher && { dispatcher }
|
|
1911
1928
|
});
|
|
1912
1929
|
if (!response.ok) {
|
|
1913
1930
|
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
|
@@ -1917,14 +1934,16 @@ async function fetchPlayerResponse(videoId, timeout) {
|
|
|
1917
1934
|
clearTimeout(timeoutId);
|
|
1918
1935
|
}
|
|
1919
1936
|
}
|
|
1920
|
-
async function fetchCaptionTrack(url, timeout) {
|
|
1937
|
+
async function fetchCaptionTrack(url, timeout, proxy) {
|
|
1921
1938
|
const controller = new AbortController;
|
|
1922
1939
|
const timeoutId = setTimeout(() => controller.abort(), timeout);
|
|
1940
|
+
const dispatcher = createProxyAgent(proxy);
|
|
1923
1941
|
try {
|
|
1924
1942
|
const jsonUrl = `${url}&fmt=json3`;
|
|
1925
1943
|
const response = await fetch(jsonUrl, {
|
|
1926
1944
|
headers: { "User-Agent": USER_AGENT },
|
|
1927
|
-
signal: controller.signal
|
|
1945
|
+
signal: controller.signal,
|
|
1946
|
+
...dispatcher && { dispatcher }
|
|
1928
1947
|
});
|
|
1929
1948
|
if (!response.ok) {
|
|
1930
1949
|
throw new Error(`HTTP ${response.status}`);
|
|
@@ -1962,9 +1981,14 @@ function selectCaptionTrack(tracks, preferredLanguages, includeAutoGenerated) {
|
|
|
1962
1981
|
}
|
|
1963
1982
|
return searchOrder[0] || null;
|
|
1964
1983
|
}
|
|
1984
|
+
async function fetchVideoInfo(videoId, options = {}) {
|
|
1985
|
+
const { timeout = 30000, proxy } = options;
|
|
1986
|
+
const playerResponse = await fetchPlayerResponse(videoId, timeout, proxy);
|
|
1987
|
+
return playerResponse.captions?.playerCaptionsTracklistRenderer?.captionTracks || [];
|
|
1988
|
+
}
|
|
1965
1989
|
async function fetchTranscript(videoId, options = {}) {
|
|
1966
|
-
const { languages = ["en"], timeout = 30000, includeAutoGenerated = true } = options;
|
|
1967
|
-
const playerResponse = await fetchPlayerResponse(videoId, timeout);
|
|
1990
|
+
const { languages = ["en"], timeout = 30000, includeAutoGenerated = true, proxy } = options;
|
|
1991
|
+
const playerResponse = await fetchPlayerResponse(videoId, timeout, proxy);
|
|
1968
1992
|
const captionTracks = playerResponse.captions?.playerCaptionsTracklistRenderer?.captionTracks;
|
|
1969
1993
|
if (!captionTracks?.length) {
|
|
1970
1994
|
throw new Error("No captions available for this video");
|
|
@@ -1973,7 +1997,7 @@ async function fetchTranscript(videoId, options = {}) {
|
|
|
1973
1997
|
if (!selectedTrack) {
|
|
1974
1998
|
throw new Error("No suitable caption track found");
|
|
1975
1999
|
}
|
|
1976
|
-
const segments = await fetchCaptionTrack(selectedTrack.baseUrl, timeout);
|
|
2000
|
+
const segments = await fetchCaptionTrack(selectedTrack.baseUrl, timeout, proxy);
|
|
1977
2001
|
if (!segments.length) {
|
|
1978
2002
|
throw new Error("Caption track is empty");
|
|
1979
2003
|
}
|
|
@@ -1986,6 +2010,8 @@ async function fetchTranscript(videoId, options = {}) {
|
|
|
1986
2010
|
isAutoGenerated: selectedTrack.kind === "asr"
|
|
1987
2011
|
};
|
|
1988
2012
|
}
|
|
2013
|
+
var require2 = createRequire2(import.meta.url);
|
|
2014
|
+
var ProxyAgent = require2("undici/lib/dispatcher/proxy-agent");
|
|
1989
2015
|
var USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
|
|
1990
2016
|
// node_modules/yocto-queue/index.js
|
|
1991
2017
|
class Node {
|
|
@@ -2163,6 +2189,26 @@ async function* streamVideos(videos, options = {}) {
|
|
|
2163
2189
|
var DEFAULT_CONCURRENCY = 4;
|
|
2164
2190
|
var DEFAULT_PAUSE_AFTER = 10;
|
|
2165
2191
|
var DEFAULT_PAUSE_DURATION = 5000;
|
|
2192
|
+
// src/lib/fs.ts
|
|
2193
|
+
import {readFile, writeFile, appendFile, access, constants} from "node:fs/promises";
|
|
2194
|
+
async function fileExists(path) {
|
|
2195
|
+
try {
|
|
2196
|
+
await access(path, constants.F_OK);
|
|
2197
|
+
return true;
|
|
2198
|
+
} catch {
|
|
2199
|
+
return false;
|
|
2200
|
+
}
|
|
2201
|
+
}
|
|
2202
|
+
async function readTextFile(path) {
|
|
2203
|
+
return readFile(path, "utf-8");
|
|
2204
|
+
}
|
|
2205
|
+
async function writeTextFile(path, content) {
|
|
2206
|
+
await writeFile(path, content, "utf-8");
|
|
2207
|
+
}
|
|
2208
|
+
async function appendTextFile(path, content) {
|
|
2209
|
+
await appendFile(path, content, "utf-8");
|
|
2210
|
+
}
|
|
2211
|
+
|
|
2166
2212
|
// src/loaders/history.ts
|
|
2167
2213
|
function extractVideoIdFromUrl(url) {
|
|
2168
2214
|
try {
|
|
@@ -2178,8 +2224,7 @@ function extractVideoIdFromUrl(url) {
|
|
|
2178
2224
|
return null;
|
|
2179
2225
|
}
|
|
2180
2226
|
async function loadWatchHistory(filePath) {
|
|
2181
|
-
const
|
|
2182
|
-
const text = await file.text();
|
|
2227
|
+
const text = await readTextFile(filePath);
|
|
2183
2228
|
const data = JSON.parse(text);
|
|
2184
2229
|
const results = [];
|
|
2185
2230
|
for (const item of data) {
|
|
@@ -2243,8 +2288,7 @@ function parseCSVLine(line) {
|
|
|
2243
2288
|
return result;
|
|
2244
2289
|
}
|
|
2245
2290
|
async function loadWatchLater(filePath) {
|
|
2246
|
-
const
|
|
2247
|
-
const text = await file.text();
|
|
2291
|
+
const text = await readTextFile(filePath);
|
|
2248
2292
|
const rows = parseCSV(text);
|
|
2249
2293
|
const results = [];
|
|
2250
2294
|
for (const row of rows) {
|
|
@@ -2290,11 +2334,11 @@ function mergeVideoSources(...sources) {
|
|
|
2290
2334
|
async function loadProcessedIds(jsonlPath) {
|
|
2291
2335
|
const ids = new Set;
|
|
2292
2336
|
try {
|
|
2293
|
-
const
|
|
2294
|
-
if (!
|
|
2337
|
+
const exists = await fileExists(jsonlPath);
|
|
2338
|
+
if (!exists) {
|
|
2295
2339
|
return ids;
|
|
2296
2340
|
}
|
|
2297
|
-
const text = await
|
|
2341
|
+
const text = await readTextFile(jsonlPath);
|
|
2298
2342
|
const lines = text.split("\n").filter((l) => l.trim());
|
|
2299
2343
|
for (const line of lines) {
|
|
2300
2344
|
try {
|
|
@@ -2313,10 +2357,7 @@ async function loadProcessedIds(jsonlPath) {
|
|
|
2313
2357
|
}
|
|
2314
2358
|
// src/outputs/index.ts
|
|
2315
2359
|
async function appendJsonl(result, path) {
|
|
2316
|
-
|
|
2317
|
-
const existing = await file.exists() ? await file.text() : "";
|
|
2318
|
-
const newContent = `${existing + JSON.stringify(result)}\n`;
|
|
2319
|
-
await Bun.write(path, newContent);
|
|
2360
|
+
await appendTextFile(path, `${JSON.stringify(result)}\n`);
|
|
2320
2361
|
}
|
|
2321
2362
|
async function writeCsv(results, options) {
|
|
2322
2363
|
const headers = [
|
|
@@ -2348,12 +2389,12 @@ async function writeCsv(results, options) {
|
|
|
2348
2389
|
...rows.map((row) => row.map((cell) => `"${String(cell).replace(/"/g, '""')}"`).join(","))
|
|
2349
2390
|
].join("\n");
|
|
2350
2391
|
if (options.append) {
|
|
2351
|
-
const
|
|
2352
|
-
const
|
|
2353
|
-
const content =
|
|
2354
|
-
await
|
|
2392
|
+
const exists = await fileExists(options.path);
|
|
2393
|
+
const rowsContent = rows.map((row) => row.map((cell) => `"${String(cell).replace(/"/g, '""')}"`).join(",")).join("\n");
|
|
2394
|
+
const content = exists ? `${rowsContent}\n` : `${csvContent}\n`;
|
|
2395
|
+
await appendTextFile(options.path, content);
|
|
2355
2396
|
} else {
|
|
2356
|
-
await
|
|
2397
|
+
await writeTextFile(options.path, `${csvContent}\n`);
|
|
2357
2398
|
}
|
|
2358
2399
|
}
|
|
2359
2400
|
function formatSrt(transcript) {
|
|
@@ -2408,20 +2449,40 @@ function pad(num, size) {
|
|
|
2408
2449
|
return String(num).padStart(size, "0");
|
|
2409
2450
|
}
|
|
2410
2451
|
// src/cli.ts
|
|
2452
|
+
function validateProxyUrl(url) {
|
|
2453
|
+
try {
|
|
2454
|
+
const parsed = new URL(url);
|
|
2455
|
+
return ["http:", "https:", "socks4:", "socks5:"].includes(parsed.protocol);
|
|
2456
|
+
} catch {
|
|
2457
|
+
return false;
|
|
2458
|
+
}
|
|
2459
|
+
}
|
|
2460
|
+
function parseProxy(proxyUrl) {
|
|
2461
|
+
if (!proxyUrl)
|
|
2462
|
+
return;
|
|
2463
|
+
if (!validateProxyUrl(proxyUrl)) {
|
|
2464
|
+
console.error(red(`Invalid proxy URL: ${proxyUrl}`));
|
|
2465
|
+
console.error(dim("Expected format: http://[user:pass@]host:port"));
|
|
2466
|
+
process.exit(1);
|
|
2467
|
+
}
|
|
2468
|
+
return { url: proxyUrl };
|
|
2469
|
+
}
|
|
2411
2470
|
var green = (s) => `\x1B[32m${s}\x1B[0m`;
|
|
2412
2471
|
var red = (s) => `\x1B[31m${s}\x1B[0m`;
|
|
2413
2472
|
var yellow = (s) => `\x1B[33m${s}\x1B[0m`;
|
|
2414
2473
|
var dim = (s) => `\x1B[2m${s}\x1B[0m`;
|
|
2415
2474
|
program.name("ytranscript").description("Fast YouTube transcript extraction with bulk processing").version(version);
|
|
2416
|
-
program.command("get <video>").description("Fetch transcript for a single video (ID or URL)").option("-l, --lang <codes>", "Preferred language codes (comma-separated)", "en").option("-f, --format <format>", "Output format: text, json, srt, vtt", "text").option("-t, --timestamps", "Include timestamps in text output").option("-o, --output <file>", "Write to file instead of stdout").action(async (video, options) => {
|
|
2475
|
+
program.command("get <video>").description("Fetch transcript for a single video (ID or URL)").option("-l, --lang <codes>", "Preferred language codes (comma-separated)", "en").option("-f, --format <format>", "Output format: text, json, srt, vtt", "text").option("-t, --timestamps", "Include timestamps in text output").option("-o, --output <file>", "Write to file instead of stdout").option("--proxy <url>", "HTTP proxy URL (e.g., http://user:pass@host:port)").action(async (video, options) => {
|
|
2417
2476
|
const videoId = extractVideoId(video);
|
|
2418
2477
|
if (!videoId) {
|
|
2419
2478
|
console.error(red(`Invalid video ID or URL: ${video}`));
|
|
2420
2479
|
process.exit(1);
|
|
2421
2480
|
}
|
|
2481
|
+
const proxy = parseProxy(options.proxy);
|
|
2422
2482
|
try {
|
|
2423
2483
|
const transcript = await fetchTranscript(videoId, {
|
|
2424
|
-
languages: options.lang.split(",")
|
|
2484
|
+
languages: options.lang.split(","),
|
|
2485
|
+
proxy
|
|
2425
2486
|
});
|
|
2426
2487
|
let output;
|
|
2427
2488
|
switch (options.format) {
|
|
@@ -2438,7 +2499,7 @@ program.command("get <video>").description("Fetch transcript for a single video
|
|
|
2438
2499
|
output = formatText(transcript, options.timestamps);
|
|
2439
2500
|
}
|
|
2440
2501
|
if (options.output) {
|
|
2441
|
-
await
|
|
2502
|
+
await writeFile2(options.output, output, "utf-8");
|
|
2442
2503
|
console.log(green(`Written to ${options.output}`));
|
|
2443
2504
|
} else {
|
|
2444
2505
|
console.log(output);
|
|
@@ -2448,7 +2509,7 @@ program.command("get <video>").description("Fetch transcript for a single video
|
|
|
2448
2509
|
process.exit(1);
|
|
2449
2510
|
}
|
|
2450
2511
|
});
|
|
2451
|
-
program.command("bulk").description("Bulk fetch transcripts from Google Takeout or video list").option("--history <file>", "Path to Google Takeout watch-history.json").option("--watch-later <file>", "Path to Google Takeout watch-later.csv").option("--videos <ids>", "Comma-separated video IDs or URLs").option("--file <file>", "File with video IDs/URLs (one per line)").option("-o, --out-jsonl <file>", "Output JSONL file", "transcripts.jsonl").option("--out-csv <file>", "Also write to CSV file").option("-c, --concurrency <n>", "Concurrent requests", "4").option("--pause-after <n>", "Pause after N requests", "10").option("--pause-ms <n>", "Pause duration in ms", "5000").option("-l, --lang <codes>", "Preferred languages (comma-separated)", "en").option("--resume", "Resume from previous run (skip already processed)").action(async (options) => {
|
|
2512
|
+
program.command("bulk").description("Bulk fetch transcripts from Google Takeout or video list").option("--history <file>", "Path to Google Takeout watch-history.json").option("--watch-later <file>", "Path to Google Takeout watch-later.csv").option("--videos <ids>", "Comma-separated video IDs or URLs").option("--file <file>", "File with video IDs/URLs (one per line)").option("-o, --out-jsonl <file>", "Output JSONL file", "transcripts.jsonl").option("--out-csv <file>", "Also write to CSV file").option("-c, --concurrency <n>", "Concurrent requests", "4").option("--pause-after <n>", "Pause after N requests", "10").option("--pause-ms <n>", "Pause duration in ms", "5000").option("-l, --lang <codes>", "Preferred languages (comma-separated)", "en").option("--proxy <url>", "HTTP proxy URL (e.g., http://user:pass@host:port)").option("--resume", "Resume from previous run (skip already processed)").action(async (options) => {
|
|
2452
2513
|
const sources = [];
|
|
2453
2514
|
if (options.history) {
|
|
2454
2515
|
console.log(dim(`Loading watch history from ${options.history}...`));
|
|
@@ -2477,7 +2538,7 @@ program.command("bulk").description("Bulk fetch transcripts from Google Takeout
|
|
|
2477
2538
|
}
|
|
2478
2539
|
if (options.file) {
|
|
2479
2540
|
try {
|
|
2480
|
-
const content = await
|
|
2541
|
+
const content = await readFile2(options.file, "utf-8");
|
|
2481
2542
|
const ids = content.split("\n").map((l) => l.trim()).filter((l) => l && !l.startsWith("#"));
|
|
2482
2543
|
sources.push(fromVideoIds(ids));
|
|
2483
2544
|
console.log(` Added ${ids.length} videos from ${options.file}`);
|
|
@@ -2503,15 +2564,17 @@ program.command("bulk").description("Bulk fetch transcripts from Google Takeout
|
|
|
2503
2564
|
console.log(green("All videos already processed!"));
|
|
2504
2565
|
return;
|
|
2505
2566
|
}
|
|
2506
|
-
console.log(`Processing ${toProcess.length} videos
|
|
2567
|
+
console.log(`Processing ${toProcess.length} videos...\\n`);
|
|
2507
2568
|
let successCount = 0;
|
|
2508
2569
|
let failCount = 0;
|
|
2509
2570
|
const csvResults = [];
|
|
2571
|
+
const proxy = parseProxy(options.proxy);
|
|
2510
2572
|
for await (const result of streamVideos(toProcess, {
|
|
2511
2573
|
concurrency: Number.parseInt(options.concurrency, 10),
|
|
2512
2574
|
pauseAfter: Number.parseInt(options.pauseAfter, 10),
|
|
2513
2575
|
pauseDuration: Number.parseInt(options.pauseMs, 10),
|
|
2514
|
-
languages: options.lang.split(",")
|
|
2576
|
+
languages: options.lang.split(","),
|
|
2577
|
+
proxy
|
|
2515
2578
|
})) {
|
|
2516
2579
|
const status = result.transcript ? green("OK") : red("FAIL");
|
|
2517
2580
|
const title = result.meta.title?.slice(0, 50) || result.meta.videoId;
|
|
@@ -2533,25 +2596,15 @@ program.command("bulk").description("Bulk fetch transcripts from Google Takeout
|
|
|
2533
2596
|
console.log(`\n${green("Done!")} ${successCount} succeeded, ${failCount} failed`);
|
|
2534
2597
|
console.log(`Output: ${options.outJsonl}`);
|
|
2535
2598
|
});
|
|
2536
|
-
program.command("info <video>").description("Show available transcript languages for a video").action(async (video) => {
|
|
2599
|
+
program.command("info <video>").description("Show available transcript languages for a video").option("--proxy <url>", "HTTP proxy URL (e.g., http://user:pass@host:port)").action(async (video, options) => {
|
|
2537
2600
|
const videoId = extractVideoId(video);
|
|
2538
2601
|
if (!videoId) {
|
|
2539
2602
|
console.error(red(`Invalid video ID or URL: ${video}`));
|
|
2540
2603
|
process.exit(1);
|
|
2541
2604
|
}
|
|
2605
|
+
const proxy = parseProxy(options.proxy);
|
|
2542
2606
|
try {
|
|
2543
|
-
const
|
|
2544
|
-
method: "POST",
|
|
2545
|
-
headers: { "Content-Type": "application/json" },
|
|
2546
|
-
body: JSON.stringify({
|
|
2547
|
-
context: {
|
|
2548
|
-
client: { clientName: "WEB", clientVersion: "2.20240101.00.00" }
|
|
2549
|
-
},
|
|
2550
|
-
videoId
|
|
2551
|
-
})
|
|
2552
|
-
});
|
|
2553
|
-
const data = await response.json();
|
|
2554
|
-
const tracks = data.captions?.playerCaptionsTracklistRenderer?.captionTracks || [];
|
|
2607
|
+
const tracks = await fetchVideoInfo(videoId, { proxy });
|
|
2555
2608
|
if (!tracks.length) {
|
|
2556
2609
|
console.log(yellow("No captions available for this video"));
|
|
2557
2610
|
return;
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ytranscript - Fast YouTube transcript extraction
|
|
3
|
+
*
|
|
4
|
+
* @example
|
|
5
|
+
* ```typescript
|
|
6
|
+
* import { fetchTranscript, processVideos } from 'ytranscript';
|
|
7
|
+
*
|
|
8
|
+
* // Fetch a single transcript
|
|
9
|
+
* const transcript = await fetchTranscript('dQw4w9WgXcQ');
|
|
10
|
+
* console.log(transcript.text);
|
|
11
|
+
*
|
|
12
|
+
* // Bulk process from Google Takeout
|
|
13
|
+
* import { loadWatchHistory, loadWatchLater, mergeVideoSources } from 'ytranscript';
|
|
14
|
+
*
|
|
15
|
+
* const history = await loadWatchHistory('./watch-history.json');
|
|
16
|
+
* const watchLater = await loadWatchLater('./watch-later.csv');
|
|
17
|
+
* const videos = mergeVideoSources(history, watchLater);
|
|
18
|
+
*
|
|
19
|
+
* const results = await processVideos(videos, {
|
|
20
|
+
* concurrency: 4,
|
|
21
|
+
* onProgress: (done, total) => console.log(`${done}/${total}`)
|
|
22
|
+
* });
|
|
23
|
+
* ```
|
|
24
|
+
*/
|
|
25
|
+
export { fetchTranscript, extractVideoId, fetchVideoInfo } from './lib/fetcher';
|
|
26
|
+
export { processVideos, streamVideos } from './lib/processor';
|
|
27
|
+
export { loadWatchHistory, loadWatchLater, fromVideoIds, mergeVideoSources, loadProcessedIds, } from './loaders';
|
|
28
|
+
export { writeJsonl, appendJsonl, writeCsv, formatSrt, formatVtt, formatText, } from './outputs';
|
|
29
|
+
export type { ProxyConfig, Transcript, TranscriptSegment, TranscriptResult, WatchHistoryMeta, FetchOptions, BulkOptions, OutputFormat, OutputOptions, } from './types';
|
|
30
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAGH,OAAO,EAAE,eAAe,EAAE,cAAc,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AAGhF,OAAO,EAAE,aAAa,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AAG9D,OAAO,EACL,gBAAgB,EAChB,cAAc,EACd,YAAY,EACZ,iBAAiB,EACjB,gBAAgB,GACjB,MAAM,WAAW,CAAC;AAGnB,OAAO,EACL,UAAU,EACV,WAAW,EACX,QAAQ,EACR,SAAS,EACT,SAAS,EACT,UAAU,GACX,MAAM,WAAW,CAAC;AAGnB,YAAY,EACV,WAAW,EACX,UAAU,EACV,iBAAiB,EACjB,gBAAgB,EAChB,gBAAgB,EAChB,YAAY,EACZ,WAAW,EACX,YAAY,EACZ,aAAa,GACd,MAAM,SAAS,CAAC"}
|
package/dist/index.js
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
// src/lib/fetcher.ts
|
|
2
|
+
import {createRequire} from "node:module";
|
|
2
3
|
function extractVideoId(input) {
|
|
3
4
|
if (/^[a-zA-Z0-9_-]{11}$/.test(input)) {
|
|
4
5
|
return input;
|
|
@@ -24,9 +25,20 @@ function extractVideoId(input) {
|
|
|
24
25
|
}
|
|
25
26
|
return null;
|
|
26
27
|
}
|
|
27
|
-
|
|
28
|
+
function createProxyAgent(proxy) {
|
|
29
|
+
if (proxy) {
|
|
30
|
+
return new ProxyAgent(proxy.url);
|
|
31
|
+
}
|
|
32
|
+
const envProxy = process.env.HTTP_PROXY || process.env.http_proxy || process.env.HTTPS_PROXY || process.env.https_proxy;
|
|
33
|
+
if (envProxy) {
|
|
34
|
+
return new ProxyAgent(envProxy);
|
|
35
|
+
}
|
|
36
|
+
return;
|
|
37
|
+
}
|
|
38
|
+
async function fetchPlayerResponse(videoId, timeout, proxy) {
|
|
28
39
|
const controller = new AbortController;
|
|
29
40
|
const timeoutId = setTimeout(() => controller.abort(), timeout);
|
|
41
|
+
const dispatcher = createProxyAgent(proxy);
|
|
30
42
|
try {
|
|
31
43
|
const response = await fetch("https://www.youtube.com/youtubei/v1/player?prettyPrint=false", {
|
|
32
44
|
method: "POST",
|
|
@@ -43,7 +55,8 @@ async function fetchPlayerResponse(videoId, timeout) {
|
|
|
43
55
|
},
|
|
44
56
|
videoId
|
|
45
57
|
}),
|
|
46
|
-
signal: controller.signal
|
|
58
|
+
signal: controller.signal,
|
|
59
|
+
...dispatcher && { dispatcher }
|
|
47
60
|
});
|
|
48
61
|
if (!response.ok) {
|
|
49
62
|
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
|
@@ -53,14 +66,16 @@ async function fetchPlayerResponse(videoId, timeout) {
|
|
|
53
66
|
clearTimeout(timeoutId);
|
|
54
67
|
}
|
|
55
68
|
}
|
|
56
|
-
async function fetchCaptionTrack(url, timeout) {
|
|
69
|
+
async function fetchCaptionTrack(url, timeout, proxy) {
|
|
57
70
|
const controller = new AbortController;
|
|
58
71
|
const timeoutId = setTimeout(() => controller.abort(), timeout);
|
|
72
|
+
const dispatcher = createProxyAgent(proxy);
|
|
59
73
|
try {
|
|
60
74
|
const jsonUrl = `${url}&fmt=json3`;
|
|
61
75
|
const response = await fetch(jsonUrl, {
|
|
62
76
|
headers: { "User-Agent": USER_AGENT },
|
|
63
|
-
signal: controller.signal
|
|
77
|
+
signal: controller.signal,
|
|
78
|
+
...dispatcher && { dispatcher }
|
|
64
79
|
});
|
|
65
80
|
if (!response.ok) {
|
|
66
81
|
throw new Error(`HTTP ${response.status}`);
|
|
@@ -98,9 +113,14 @@ function selectCaptionTrack(tracks, preferredLanguages, includeAutoGenerated) {
|
|
|
98
113
|
}
|
|
99
114
|
return searchOrder[0] || null;
|
|
100
115
|
}
|
|
116
|
+
async function fetchVideoInfo(videoId, options = {}) {
|
|
117
|
+
const { timeout = 30000, proxy } = options;
|
|
118
|
+
const playerResponse = await fetchPlayerResponse(videoId, timeout, proxy);
|
|
119
|
+
return playerResponse.captions?.playerCaptionsTracklistRenderer?.captionTracks || [];
|
|
120
|
+
}
|
|
101
121
|
async function fetchTranscript(videoId, options = {}) {
|
|
102
|
-
const { languages = ["en"], timeout = 30000, includeAutoGenerated = true } = options;
|
|
103
|
-
const playerResponse = await fetchPlayerResponse(videoId, timeout);
|
|
122
|
+
const { languages = ["en"], timeout = 30000, includeAutoGenerated = true, proxy } = options;
|
|
123
|
+
const playerResponse = await fetchPlayerResponse(videoId, timeout, proxy);
|
|
104
124
|
const captionTracks = playerResponse.captions?.playerCaptionsTracklistRenderer?.captionTracks;
|
|
105
125
|
if (!captionTracks?.length) {
|
|
106
126
|
throw new Error("No captions available for this video");
|
|
@@ -109,7 +129,7 @@ async function fetchTranscript(videoId, options = {}) {
|
|
|
109
129
|
if (!selectedTrack) {
|
|
110
130
|
throw new Error("No suitable caption track found");
|
|
111
131
|
}
|
|
112
|
-
const segments = await fetchCaptionTrack(selectedTrack.baseUrl, timeout);
|
|
132
|
+
const segments = await fetchCaptionTrack(selectedTrack.baseUrl, timeout, proxy);
|
|
113
133
|
if (!segments.length) {
|
|
114
134
|
throw new Error("Caption track is empty");
|
|
115
135
|
}
|
|
@@ -122,6 +142,8 @@ async function fetchTranscript(videoId, options = {}) {
|
|
|
122
142
|
isAutoGenerated: selectedTrack.kind === "asr"
|
|
123
143
|
};
|
|
124
144
|
}
|
|
145
|
+
var require2 = createRequire(import.meta.url);
|
|
146
|
+
var ProxyAgent = require2("undici/lib/dispatcher/proxy-agent");
|
|
125
147
|
var USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
|
|
126
148
|
// node_modules/yocto-queue/index.js
|
|
127
149
|
class Node {
|
|
@@ -344,6 +366,26 @@ async function* streamVideos(videos, options = {}) {
|
|
|
344
366
|
var DEFAULT_CONCURRENCY = 4;
|
|
345
367
|
var DEFAULT_PAUSE_AFTER = 10;
|
|
346
368
|
var DEFAULT_PAUSE_DURATION = 5000;
|
|
369
|
+
// src/lib/fs.ts
|
|
370
|
+
import {readFile, writeFile, appendFile, access, constants} from "node:fs/promises";
|
|
371
|
+
async function fileExists(path) {
|
|
372
|
+
try {
|
|
373
|
+
await access(path, constants.F_OK);
|
|
374
|
+
return true;
|
|
375
|
+
} catch {
|
|
376
|
+
return false;
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
async function readTextFile(path) {
|
|
380
|
+
return readFile(path, "utf-8");
|
|
381
|
+
}
|
|
382
|
+
async function writeTextFile(path, content) {
|
|
383
|
+
await writeFile(path, content, "utf-8");
|
|
384
|
+
}
|
|
385
|
+
async function appendTextFile(path, content) {
|
|
386
|
+
await appendFile(path, content, "utf-8");
|
|
387
|
+
}
|
|
388
|
+
|
|
347
389
|
// src/loaders/history.ts
|
|
348
390
|
function extractVideoIdFromUrl(url) {
|
|
349
391
|
try {
|
|
@@ -359,8 +401,7 @@ function extractVideoIdFromUrl(url) {
|
|
|
359
401
|
return null;
|
|
360
402
|
}
|
|
361
403
|
async function loadWatchHistory(filePath) {
|
|
362
|
-
const
|
|
363
|
-
const text = await file.text();
|
|
404
|
+
const text = await readTextFile(filePath);
|
|
364
405
|
const data = JSON.parse(text);
|
|
365
406
|
const results = [];
|
|
366
407
|
for (const item of data) {
|
|
@@ -424,8 +465,7 @@ function parseCSVLine(line) {
|
|
|
424
465
|
return result;
|
|
425
466
|
}
|
|
426
467
|
async function loadWatchLater(filePath) {
|
|
427
|
-
const
|
|
428
|
-
const text = await file.text();
|
|
468
|
+
const text = await readTextFile(filePath);
|
|
429
469
|
const rows = parseCSV(text);
|
|
430
470
|
const results = [];
|
|
431
471
|
for (const row of rows) {
|
|
@@ -471,11 +511,11 @@ function mergeVideoSources(...sources) {
|
|
|
471
511
|
async function loadProcessedIds(jsonlPath) {
|
|
472
512
|
const ids = new Set;
|
|
473
513
|
try {
|
|
474
|
-
const
|
|
475
|
-
if (!
|
|
514
|
+
const exists = await fileExists(jsonlPath);
|
|
515
|
+
if (!exists) {
|
|
476
516
|
return ids;
|
|
477
517
|
}
|
|
478
|
-
const text = await
|
|
518
|
+
const text = await readTextFile(jsonlPath);
|
|
479
519
|
const lines = text.split("\n").filter((l) => l.trim());
|
|
480
520
|
for (const line of lines) {
|
|
481
521
|
try {
|
|
@@ -497,16 +537,13 @@ async function writeJsonl(results, options) {
|
|
|
497
537
|
const lines = results.map((r) => JSON.stringify(r));
|
|
498
538
|
const content = `${lines.join("\n")}\n`;
|
|
499
539
|
if (options.append) {
|
|
500
|
-
await
|
|
540
|
+
await appendTextFile(options.path, content);
|
|
501
541
|
} else {
|
|
502
|
-
await
|
|
542
|
+
await writeTextFile(options.path, content);
|
|
503
543
|
}
|
|
504
544
|
}
|
|
505
545
|
async function appendJsonl(result, path) {
|
|
506
|
-
|
|
507
|
-
const existing = await file.exists() ? await file.text() : "";
|
|
508
|
-
const newContent = `${existing + JSON.stringify(result)}\n`;
|
|
509
|
-
await Bun.write(path, newContent);
|
|
546
|
+
await appendTextFile(path, `${JSON.stringify(result)}\n`);
|
|
510
547
|
}
|
|
511
548
|
async function writeCsv(results, options) {
|
|
512
549
|
const headers = [
|
|
@@ -538,12 +575,12 @@ async function writeCsv(results, options) {
|
|
|
538
575
|
...rows.map((row) => row.map((cell) => `"${String(cell).replace(/"/g, '""')}"`).join(","))
|
|
539
576
|
].join("\n");
|
|
540
577
|
if (options.append) {
|
|
541
|
-
const
|
|
542
|
-
const
|
|
543
|
-
const content =
|
|
544
|
-
await
|
|
578
|
+
const exists = await fileExists(options.path);
|
|
579
|
+
const rowsContent = rows.map((row) => row.map((cell) => `"${String(cell).replace(/"/g, '""')}"`).join(",")).join("\n");
|
|
580
|
+
const content = exists ? `${rowsContent}\n` : `${csvContent}\n`;
|
|
581
|
+
await appendTextFile(options.path, content);
|
|
545
582
|
} else {
|
|
546
|
-
await
|
|
583
|
+
await writeTextFile(options.path, `${csvContent}\n`);
|
|
547
584
|
}
|
|
548
585
|
}
|
|
549
586
|
function formatSrt(transcript) {
|
|
@@ -610,6 +647,7 @@ export {
|
|
|
610
647
|
formatVtt,
|
|
611
648
|
formatText,
|
|
612
649
|
formatSrt,
|
|
650
|
+
fetchVideoInfo,
|
|
613
651
|
fetchTranscript,
|
|
614
652
|
extractVideoId,
|
|
615
653
|
appendJsonl
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* YouTube transcript fetcher using YouTube's innertube API
|
|
3
|
+
* No third-party services required
|
|
4
|
+
*/
|
|
5
|
+
import type { FetchOptions, Transcript } from '../types';
|
|
6
|
+
export interface CaptionTrack {
|
|
7
|
+
baseUrl: string;
|
|
8
|
+
languageCode: string;
|
|
9
|
+
kind?: string;
|
|
10
|
+
name?: {
|
|
11
|
+
simpleText?: string;
|
|
12
|
+
};
|
|
13
|
+
}
|
|
14
|
+
/**
|
|
15
|
+
* Extract video ID from various YouTube URL formats
|
|
16
|
+
*/
|
|
17
|
+
export declare function extractVideoId(input: string): string | null;
|
|
18
|
+
/**
|
|
19
|
+
* Fetch available caption tracks for a video
|
|
20
|
+
*/
|
|
21
|
+
export declare function fetchVideoInfo(videoId: string, options?: FetchOptions): Promise<CaptionTrack[]>;
|
|
22
|
+
/**
|
|
23
|
+
* Fetch transcript for a single video
|
|
24
|
+
*/
|
|
25
|
+
export declare function fetchTranscript(videoId: string, options?: FetchOptions): Promise<Transcript>;
|
|
26
|
+
//# sourceMappingURL=fetcher.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fetcher.d.ts","sourceRoot":"","sources":["../../src/lib/fetcher.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAIH,OAAO,KAAK,EAAE,YAAY,EAAe,UAAU,EAAqB,MAAM,UAAU,CAAC;AASzF,MAAM,WAAW,YAAY;IAC3B,OAAO,EAAE,MAAM,CAAC;IAChB,YAAY,EAAE,MAAM,CAAC;IACrB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,IAAI,CAAC,EAAE;QAAE,UAAU,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;CAChC;AAUD;;GAEG;AACH,wBAAgB,cAAc,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CA+B3D;AAsJD;;GAEG;AACH,wBAAsB,cAAc,CAClC,OAAO,EAAE,MAAM,EACf,OAAO,GAAE,YAAiB,GACzB,OAAO,CAAC,YAAY,EAAE,CAAC,CAIzB;AAED;;GAEG;AACH,wBAAsB,eAAe,CACnC,OAAO,EAAE,MAAM,EACf,OAAO,GAAE,YAAiB,GACzB,OAAO,CAAC,UAAU,CAAC,CAgCrB"}
|
package/dist/lib/fs.d.ts
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cross-runtime file utilities (works with Node.js and Bun)
|
|
3
|
+
*/
|
|
4
|
+
/**
|
|
5
|
+
* Check if a file exists
|
|
6
|
+
*/
|
|
7
|
+
export declare function fileExists(path: string): Promise<boolean>;
|
|
8
|
+
/**
|
|
9
|
+
* Read file contents as text
|
|
10
|
+
*/
|
|
11
|
+
export declare function readTextFile(path: string): Promise<string>;
|
|
12
|
+
/**
|
|
13
|
+
* Write content to file (overwrites existing)
|
|
14
|
+
*/
|
|
15
|
+
export declare function writeTextFile(path: string, content: string): Promise<void>;
|
|
16
|
+
/**
|
|
17
|
+
* Append content to file
|
|
18
|
+
*/
|
|
19
|
+
export declare function appendTextFile(path: string, content: string): Promise<void>;
|
|
20
|
+
//# sourceMappingURL=fs.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fs.d.ts","sourceRoot":"","sources":["../../src/lib/fs.ts"],"names":[],"mappings":"AAAA;;GAEG;AAIH;;GAEG;AACH,wBAAsB,UAAU,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC,CAO/D;AAED;;GAEG;AACH,wBAAsB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAEhE;AAED;;GAEG;AACH,wBAAsB,aAAa,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAEhF;AAED;;GAEG;AACH,wBAAsB,cAAc,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAEjF"}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Bulk transcript processor with concurrency control,
|
|
3
|
+
* rate limiting, and resume support
|
|
4
|
+
*/
|
|
5
|
+
import type { BulkOptions, TranscriptResult, WatchHistoryMeta } from '../types';
|
|
6
|
+
/**
|
|
7
|
+
* Process multiple videos in bulk with concurrency control
|
|
8
|
+
*/
|
|
9
|
+
export declare function processVideos(videos: WatchHistoryMeta[], options?: BulkOptions): Promise<TranscriptResult[]>;
|
|
10
|
+
/**
|
|
11
|
+
* Create a streaming processor that yields results as they complete
|
|
12
|
+
*/
|
|
13
|
+
export declare function streamVideos(videos: WatchHistoryMeta[], options?: BulkOptions): AsyncGenerator<TranscriptResult>;
|
|
14
|
+
//# sourceMappingURL=processor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"processor.d.ts","sourceRoot":"","sources":["../../src/lib/processor.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,KAAK,EAAE,WAAW,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,UAAU,CAAC;AAOhF;;GAEG;AACH,wBAAsB,aAAa,CACjC,MAAM,EAAE,gBAAgB,EAAE,EAC1B,OAAO,GAAE,WAAgB,GACxB,OAAO,CAAC,gBAAgB,EAAE,CAAC,CA2D7B;AAED;;GAEG;AACH,wBAAuB,YAAY,CACjC,MAAM,EAAE,gBAAgB,EAAE,EAC1B,OAAO,GAAE,WAAgB,GACxB,cAAc,CAAC,gBAAgB,CAAC,CAwClC"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Load YouTube watch history from Google Takeout JSON
|
|
3
|
+
*/
|
|
4
|
+
import type { WatchHistoryMeta } from '../types';
|
|
5
|
+
/**
|
|
6
|
+
* Load watch history from Google Takeout JSON file
|
|
7
|
+
*/
|
|
8
|
+
export declare function loadWatchHistory(filePath: string): Promise<WatchHistoryMeta[]>;
|
|
9
|
+
//# sourceMappingURL=history.d.ts.map
|