@nadimtuhin/ytranscript 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,616 @@
1
+ // src/lib/fetcher.ts
2
/**
 * Extract an 11-character YouTube video ID from a bare ID or a URL.
 *
 * Recognised inputs:
 *  - a bare ID (`dQw4w9WgXcQ`)
 *  - `?v=<id>` on any host whose name contains `youtube.com`
 *  - `/shorts/<id>`, `/live/<id>` and `/v/<id>` paths on those same hosts
 *  - `youtu.be/<id>` short links
 *  - `/embed/<id>` paths on any host
 *
 * @param {string} input - Video ID or URL; surrounding whitespace is ignored.
 * @returns {string|null} The video ID, or `null` when none can be extracted.
 */
function extractVideoId(input) {
  const VIDEO_ID = /^[a-zA-Z0-9_-]{11}$/;
  // Pasted IDs frequently carry stray whitespace or a trailing newline.
  const candidate = typeof input === "string" ? input.trim() : input;
  if (VIDEO_ID.test(candidate)) {
    return candidate;
  }

  let url;
  try {
    url = new URL(candidate);
  } catch {
    // Neither a bare ID nor a parseable URL.
    return null;
  }

  // pathname always starts with "/", so segments[0] is "" and segments[1] is the first real segment.
  const segments = url.pathname.split("/");

  if (url.hostname.includes("youtube.com")) {
    const v = url.searchParams.get("v");
    if (v && VIDEO_ID.test(v)) {
      return v;
    }
    if (["shorts", "live", "v"].includes(segments[1]) && VIDEO_ID.test(segments[2])) {
      return segments[2];
    }
  }

  if (url.hostname === "youtu.be" && VIDEO_ID.test(segments[1])) {
    return segments[1];
  }

  if (url.pathname.startsWith("/embed/") && VIDEO_ID.test(segments[2])) {
    return segments[2];
  }

  return null;
}
27
/**
 * POST a player request for `videoId` to the youtubei `v1/player` endpoint,
 * identifying as the `WEB` client, and return the parsed JSON body.
 *
 * @param {string} videoId - Video ID placed in the request body.
 * @param {number} timeout - Milliseconds after which the request is aborted.
 * @returns {Promise<object>} Parsed JSON response.
 * @throws {Error} `HTTP <status>: <statusText>` when the response is not ok;
 *   errors raised by `fetch` (including the abort) propagate unchanged.
 */
async function fetchPlayerResponse(videoId, timeout) {
  const abort = new AbortController();
  const timer = setTimeout(() => abort.abort(), timeout);
  try {
    const payload = {
      context: {
        client: { clientName: "WEB", clientVersion: "2.20240101.00.00" },
      },
      videoId,
    };
    const res = await fetch("https://www.youtube.com/youtubei/v1/player?prettyPrint=false", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "User-Agent": USER_AGENT,
      },
      body: JSON.stringify(payload),
      signal: abort.signal,
    });
    if (res.ok) {
      return await res.json();
    }
    throw new Error(`HTTP ${res.status}: ${res.statusText}`);
  } finally {
    // Always disarm the timer so it cannot fire after the request settled.
    clearTimeout(timer);
  }
}
56
/**
 * Download a caption track in `json3` format and flatten it into segments.
 *
 * Events without `segs`, and events whose joined text is empty after trimming,
 * are skipped.
 *
 * @param {string} url - Caption track URL; `fmt=json3` is appended to it.
 * @param {number} timeout - Milliseconds after which the request is aborted.
 * @returns {Promise<Array<{text: string, start: number, duration: number}>>}
 *   Segments with `start`/`duration` converted from milliseconds to seconds.
 * @throws {Error} `HTTP <status>` when the response is not ok.
 */
async function fetchCaptionTrack(url, timeout) {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);
  try {
    // Pick the right separator so a URL without an existing query string stays valid.
    const separator = String(url).includes("?") ? "&" : "?";
    const jsonUrl = `${url}${separator}fmt=json3`;
    const response = await fetch(jsonUrl, {
      headers: { "User-Agent": USER_AGENT },
      signal: controller.signal,
    });
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }
    const data = await response.json();
    // Tolerate a null body or a non-array `events` instead of crashing mid-parse.
    const events = Array.isArray(data?.events) ? data.events : [];
    const segments = [];
    for (const event of events) {
      if (!Array.isArray(event?.segs)) {
        continue;
      }
      const text = event.segs.map((seg) => seg?.utf8 || "").join("").trim();
      if (!text) {
        continue;
      }
      segments.push({
        text,
        start: (event.tStartMs || 0) / 1000,
        duration: (event.dDurationMs || 0) / 1000,
      });
    }
    return segments;
  } finally {
    clearTimeout(timeoutId);
  }
}
88
/**
 * Choose the caption track that best matches the preferred languages.
 *
 * Tracks whose `kind` is not `"asr"` are searched before `"asr"` tracks.
 * Languages are tried in the order given. A track matches a language when its
 * `languageCode` equals it or starts with `<language>-` (case-insensitive),
 * so `en` matches `en` and `en-US`, but `fi` does not match `fil`.
 *
 * @param {Array<{languageCode: string, kind?: string}>} tracks - Available tracks.
 * @param {string[]} preferredLanguages - Language codes in priority order.
 * @param {boolean} includeAutoGenerated - Whether `"asr"` tracks may be chosen.
 * @returns {object|null} The matching track, else the first eligible track,
 *   else `null` when no track is eligible.
 */
function selectCaptionTrack(tracks, preferredLanguages, includeAutoGenerated) {
  if (!tracks.length) {
    return null;
  }
  const manual = tracks.filter((t) => t.kind !== "asr");
  const auto = tracks.filter((t) => t.kind === "asr");
  const searchOrder = includeAutoGenerated ? [...manual, ...auto] : manual;

  for (const lang of preferredLanguages) {
    const wanted = lang.toLowerCase();
    const match = searchOrder.find((t) => {
      const code = String(t.languageCode ?? "").toLowerCase();
      // Require a subtag boundary: a bare prefix test would let "fi" select "fil".
      return code === wanted || code.startsWith(`${wanted}-`);
    });
    if (match) {
      return match;
    }
  }
  return searchOrder[0] || null;
}
101
/**
 * Fetch the transcript of a single video.
 *
 * Loads the player response, picks a caption track with `selectCaptionTrack`,
 * downloads it with `fetchCaptionTrack` and joins the segment texts with
 * single spaces.
 *
 * @param {string} videoId - Video to fetch.
 * @param {object} [options]
 * @param {string[]} [options.languages=["en"]] - Preferred languages, in order.
 * @param {number} [options.timeout=30000] - Per-request timeout in milliseconds.
 * @param {boolean} [options.includeAutoGenerated=true] - Allow `"asr"` tracks.
 * @returns {Promise<{videoId: string, text: string, segments: object[], language: string, isAutoGenerated: boolean}>}
 * @throws {Error} When the video has no caption tracks, no track is selected,
 *   or the selected track yields no segments.
 */
async function fetchTranscript(videoId, options = {}) {
  const { languages = ["en"], timeout = 30000, includeAutoGenerated = true } = options;

  const player = await fetchPlayerResponse(videoId, timeout);
  const tracks = player.captions?.playerCaptionsTracklistRenderer?.captionTracks;
  if (!tracks?.length) {
    throw new Error("No captions available for this video");
  }

  const track = selectCaptionTrack(tracks, languages, includeAutoGenerated);
  if (!track) {
    throw new Error("No suitable caption track found");
  }

  const segments = await fetchCaptionTrack(track.baseUrl, timeout);
  if (!segments.length) {
    throw new Error("Caption track is empty");
  }

  const text = segments.map((segment) => segment.text).join(" ");
  return {
    videoId,
    text,
    segments,
    language: track.languageCode,
    isAutoGenerated: track.kind === "asr",
  };
}
125
// User-Agent header value sent by fetchPlayerResponse and fetchCaptionTrack.
+ var USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
126
+ // node_modules/yocto-queue/index.js
127
/**
 * Singly linked list cell used by `Queue`: a payload plus a link to the cell
 * that follows it (`undefined` for the last cell).
 */
class Node {
  value = undefined;
  next = undefined;

  /** @param {*} payload - Value stored in this cell. */
  constructor(payload) {
    this.value = payload;
  }
}
134
+
135
/**
 * FIFO queue backed by a singly linked list of `Node` cells.
 * Iterating the queue visits values from oldest to newest without removing
 * them; `drain()` yields values while removing them.
 */
class Queue {
  #head;
  #tail;
  #size;

  constructor() {
    this.clear();
  }

  /** Append `value` at the back of the queue. */
  enqueue(value) {
    const cell = new Node(value);
    if (this.#head) {
      this.#tail.next = cell;
    } else {
      this.#head = cell;
    }
    this.#tail = cell;
    this.#size += 1;
  }

  /** Remove and return the oldest value, or `undefined` when empty. */
  dequeue() {
    const first = this.#head;
    if (!first) {
      return undefined;
    }
    this.#head = first.next;
    this.#size -= 1;
    if (!this.#head) {
      // The queue just became empty, so the tail must not keep the old cell alive.
      this.#tail = undefined;
    }
    return first.value;
  }

  /** Return the oldest value without removing it, or `undefined` when empty. */
  peek() {
    return this.#head ? this.#head.value : undefined;
  }

  /** Drop every queued value. */
  clear() {
    this.#head = undefined;
    this.#tail = undefined;
    this.#size = 0;
  }

  /** Number of queued values. */
  get size() {
    return this.#size;
  }

  *[Symbol.iterator]() {
    for (let cell = this.#head; cell; cell = cell.next) {
      yield cell.value;
    }
  }

  /** Yield values oldest-first, removing each one as it is produced. */
  *drain() {
    while (this.#head) {
      yield this.dequeue();
    }
  }
}
192
+
193
+ // node_modules/p-limit/index.js
194
/**
 * Reject concurrency values that are not a positive integer or
 * `Number.POSITIVE_INFINITY`.
 *
 * @param {*} concurrency - Candidate value.
 * @throws {TypeError} When the value is not acceptable.
 */
function validateConcurrency(concurrency) {
  const isWholeOrInfinite =
    Number.isInteger(concurrency) || concurrency === Number.POSITIVE_INFINITY;
  if (isWholeOrInfinite && concurrency > 0) {
    return;
  }
  throw new TypeError("Expected `concurrency` to be a number from 1 and up");
}
199
/**
 * Create a limiter that runs at most `concurrency` wrapped calls at a time.
 *
 * The returned function takes `(fn, ...args)` and returns a promise for
 * `fn(...args)`; calls beyond the limit wait in a FIFO queue. It also exposes
 * `activeCount`, `pendingCount`, `clearQueue()` and a settable `concurrency`.
 *
 * @param {number} concurrency - Positive integer or `Infinity`.
 * @returns {Function} The limiter.
 * @throws {TypeError} From `validateConcurrency` when `concurrency` is invalid.
 */
function pLimit(concurrency) {
  validateConcurrency(concurrency);

  const pending = new Queue();
  let running = 0;

  // Release the oldest waiting task when a slot is free.
  const startNext = () => {
    if (running < concurrency && pending.size > 0) {
      const release = pending.dequeue();
      release();
      running++;
    }
  };

  // Called once a task has settled: free its slot and try to start another.
  const onSettled = () => {
    running--;
    startNext();
  };

  const execute = async (task, settle, args) => {
    // Wrapping in an async function turns a synchronous throw into a rejection.
    const outcome = (async () => task(...args))();
    settle(outcome);
    try {
      await outcome;
    } catch {
      // The caller observes the rejection through `settle`; here it only marks completion.
    }
    onSettled();
  };

  const schedule = (task, settle, args) => {
    // The queued resolver acts as the "go" signal for this task.
    new Promise((release) => {
      pending.enqueue(release);
    }).then(() => execute(task, settle, args));

    // Re-check for a free slot one microtask later, after this call has returned.
    (async () => {
      await Promise.resolve();
      if (running < concurrency) {
        startNext();
      }
    })();
  };

  const generator = (task, ...args) =>
    new Promise((settle) => {
      schedule(task, settle, args);
    });

  Object.defineProperties(generator, {
    activeCount: {
      get: () => running,
    },
    pendingCount: {
      get: () => pending.size,
    },
    clearQueue: {
      value() {
        pending.clear();
      },
    },
    concurrency: {
      get: () => concurrency,
      set(newConcurrency) {
        validateConcurrency(newConcurrency);
        concurrency = newConcurrency;
        // Start as many waiting tasks as the new limit allows.
        queueMicrotask(() => {
          while (running < concurrency && pending.size > 0) {
            startNext();
          }
        });
      },
    },
  });

  return generator;
}
263
+
264
+ // src/lib/processor.ts
265
/**
 * Fetch transcripts for many videos with bounded concurrency, pausing between
 * batches.
 *
 * Videos whose `videoId` is in `skipIds` are ignored. The rest are split into
 * batches of `pauseAfter`; each batch runs through a `pLimit(concurrency)`
 * limiter and the function waits `pauseDuration` ms between batches (never
 * after the last one). A failed fetch does not reject: it produces
 * `{ meta, transcript: null, error }`.
 *
 * @param {Array<{videoId: string}>} videos - Video metadata records.
 * @param {object} [options]
 * @param {number} [options.concurrency] - Parallel fetches per batch.
 * @param {number} [options.pauseAfter] - Batch size. Values that are not a
 *   finite number >= 1 disable batching (everything runs as one batch).
 * @param {number} [options.pauseDuration] - Milliseconds to wait between batches.
 * @param {Set<string>} [options.skipIds] - IDs to leave out.
 * @param {Function} [options.onProgress] - Called as
 *   `(completed, total, result)` after each video.
 *   Remaining options are forwarded to `fetchTranscript`.
 * @returns {Promise<object[]>} One result per processed video, in input order.
 */
async function processVideos(videos, options = {}) {
  const {
    concurrency = DEFAULT_CONCURRENCY,
    pauseAfter = DEFAULT_PAUSE_AFTER,
    pauseDuration = DEFAULT_PAUSE_DURATION,
    skipIds = new Set(),
    onProgress,
    ...fetchOptions
  } = options;

  const toProcess = videos.filter((v) => !skipIds.has(v.videoId));
  if (!toProcess.length) {
    return [];
  }

  // A zero, negative or NaN step would keep the slicing loop from ever advancing.
  const batchSize =
    Number.isFinite(pauseAfter) && pauseAfter >= 1 ? Math.floor(pauseAfter) : toProcess.length;

  const limit = pLimit(concurrency);
  const results = [];
  let completed = 0;

  const processOne = async (meta) => {
    try {
      const transcript = await fetchTranscript(meta.videoId, fetchOptions);
      return { meta, transcript };
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      return { meta, transcript: null, error: message };
    }
  };

  for (let offset = 0; offset < toProcess.length; offset += batchSize) {
    const batch = toProcess.slice(offset, offset + batchSize);
    const batchResults = await Promise.all(
      batch.map((meta) =>
        limit(async () => {
          const result = await processOne(meta);
          completed++;
          onProgress?.(completed, toProcess.length, result);
          return result;
        })
      )
    );
    // Push one by one: spreading a very large batch into push() can exceed the argument limit.
    for (const result of batchResults) {
      results.push(result);
    }
    const isLastBatch = offset + batchSize >= toProcess.length;
    if (!isLastBatch && pauseDuration > 0) {
      await new Promise((resolve) => setTimeout(resolve, pauseDuration));
    }
  }
  return results;
}
310
/**
 * Async generator that fetches transcripts one video at a time and yields each
 * result as soon as it is available.
 *
 * Each fetch is awaited before the next one starts, so although a
 * `pLimit(concurrency)` limiter is created, at most one fetch is in flight.
 * After every `pauseAfter` yielded videos the generator waits `pauseDuration`
 * ms, except after the final video, where there is nothing left to throttle.
 * A failed fetch yields `{ meta, transcript: null, error }` instead of throwing.
 *
 * @param {Array<{videoId: string}>} videos - Video metadata records.
 * @param {object} [options]
 * @param {number} [options.concurrency] - Passed to `pLimit`.
 * @param {number} [options.pauseAfter] - Videos between pauses.
 * @param {number} [options.pauseDuration] - Pause length in milliseconds.
 * @param {Set<string>} [options.skipIds] - IDs to leave out.
 *   Remaining options are forwarded to `fetchTranscript`.
 * @yields {object} `{ meta, transcript }` or `{ meta, transcript: null, error }`.
 */
async function* streamVideos(videos, options = {}) {
  const {
    concurrency = DEFAULT_CONCURRENCY,
    pauseAfter = DEFAULT_PAUSE_AFTER,
    pauseDuration = DEFAULT_PAUSE_DURATION,
    skipIds = new Set(),
    ...fetchOptions
  } = options;

  const toProcess = videos.filter((v) => !skipIds.has(v.videoId));
  if (!toProcess.length) {
    return;
  }

  const limit = pLimit(concurrency);
  let processedInBatch = 0;

  for (const [index, meta] of toProcess.entries()) {
    const result = await limit(async () => {
      try {
        const transcript = await fetchTranscript(meta.videoId, fetchOptions);
        return { meta, transcript };
      } catch (error) {
        const message = error instanceof Error ? error.message : "Unknown error";
        return { meta, transcript: null, error: message };
      }
    });
    yield result;

    processedInBatch++;
    if (processedInBatch >= pauseAfter) {
      processedInBatch = 0;
      // Skip the pause after the final video: it would only delay completion.
      const isLast = index === toProcess.length - 1;
      if (pauseDuration > 0 && !isLast) {
        await new Promise((resolve) => setTimeout(resolve, pauseDuration));
      }
    }
  }
}
344
// Fallbacks used by processVideos and streamVideos when the matching option is omitted.
// Value passed to pLimit() when `concurrency` is not given.
+ var DEFAULT_CONCURRENCY = 4;
345
// Number of videos handled between pauses when `pauseAfter` is not given.
+ var DEFAULT_PAUSE_AFTER = 10;
346
// Delay handed to setTimeout (milliseconds) when `pauseDuration` is not given.
+ var DEFAULT_PAUSE_DURATION = 5000;
347
+ // src/loaders/history.ts
348
/**
 * Extract the video ID from a watch-history URL.
 *
 * Handles `?v=<id>` on hosts whose name contains `youtube.com` and
 * `youtu.be/<id>` short links. The candidate must look like a video ID
 * (11 characters from `[a-zA-Z0-9_-]`), the same rule `extractVideoId` applies.
 *
 * @param {string} url - URL taken from a history entry.
 * @returns {string|null} The video ID, or `null` when the URL is unparseable,
 *   is not a recognised video link, or carries a malformed ID.
 */
function extractVideoIdFromUrl(url) {
  const VIDEO_ID = /^[a-zA-Z0-9_-]{11}$/;

  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return null;
  }

  let candidate = null;
  if (parsed.hostname.includes("youtube.com")) {
    candidate = parsed.searchParams.get("v");
  } else if (parsed.hostname === "youtu.be") {
    // Keep only the first path segment so "/<id>/anything" still resolves to <id>.
    candidate = parsed.pathname.slice(1).split("/")[0];
  }

  return candidate !== null && VIDEO_ID.test(candidate) ? candidate : null;
}
361
/**
 * Load a JSON watch-history file and convert its entries to video metadata.
 *
 * Entries without a `titleUrl`, or whose URL yields no video ID, are skipped.
 * The first element of `subtitles`, when present, supplies the channel.
 *
 * @param {string} filePath - Path of the JSON file, read through `Bun.file`.
 * @returns {Promise<object[]>} Records with `videoId`, `title`, `url`,
 *   `channel`, `watchedAt` and `source: "history"`.
 */
async function loadWatchHistory(filePath) {
  const raw = await Bun.file(filePath).text();
  const entries = JSON.parse(raw);

  const videos = [];
  for (const entry of entries) {
    const url = entry.titleUrl;
    const videoId = url ? extractVideoIdFromUrl(url) : null;
    if (!videoId) {
      continue;
    }
    const firstSubtitle = entry.subtitles?.[0];
    let channel;
    if (firstSubtitle) {
      channel = { name: firstSubtitle.name, url: firstSubtitle.url };
    }
    videos.push({
      videoId,
      title: entry.title,
      url,
      channel,
      watchedAt: entry.time,
      source: "history",
    });
  }
  return videos;
}
385
+ // src/loaders/watchLater.ts
386
/**
 * Parse CSV text into an array of row objects keyed by the header line.
 *
 * The text is split on `\n` and blank lines are dropped; the first remaining
 * line supplies the column names. Missing or empty cells become `""`.
 *
 * @param {string} text - Raw CSV content.
 * @returns {Array<Object<string, string>>} One object per data line; empty
 *   when there is no data line.
 */
function parseCSV(text) {
  const nonBlank = text.split("\n").filter((line) => line.trim());
  if (nonBlank.length < 2) {
    return [];
  }

  const [headerLine, ...dataLines] = nonBlank;
  const headers = parseCSVLine(headerLine);

  return dataLines.map((line) => {
    const values = parseCSVLine(line);
    const row = {};
    headers.forEach((header, column) => {
      row[header] = values[column] || "";
    });
    return row;
  });
}
402
/**
 * Split one CSV line into trimmed fields.
 *
 * A double quote toggles quoted mode, in which commas are literal; inside
 * quoted mode a doubled quote (`""`) produces one literal quote character.
 *
 * @param {string} line - A single line of CSV.
 * @returns {string[]} Field values with surrounding whitespace removed.
 */
function parseCSVLine(line) {
  const fields = [];
  let field = "";
  let quoted = false;
  let pos = 0;

  while (pos < line.length) {
    const ch = line[pos];
    if (ch === '"') {
      if (quoted && line[pos + 1] === '"') {
        // Escaped quote: emit one quote and consume both characters.
        field += '"';
        pos += 2;
        continue;
      }
      quoted = !quoted;
    } else if (ch === "," && !quoted) {
      fields.push(field.trim());
      field = "";
    } else {
      field += ch;
    }
    pos += 1;
  }

  fields.push(field.trim());
  return fields;
}
426
/**
 * Load a watch-later CSV file and convert its rows to video metadata.
 *
 * The video ID is read from the first non-empty of the `Video ID`, `video_id`
 * or `Video Id` columns; the timestamp from `Playlist Video Creation
 * Timestamp`, `added_at` or `Added At`. Rows without a video ID are skipped.
 *
 * @param {string} filePath - Path of the CSV file, read through `Bun.file`.
 * @returns {Promise<Array<{videoId: string, watchedAt: string, source: string}>>}
 */
async function loadWatchLater(filePath) {
  const csvText = await Bun.file(filePath).text();

  const entries = [];
  for (const record of parseCSV(csvText)) {
    const videoId = record["Video ID"] || record.video_id || record["Video Id"];
    if (!videoId) {
      continue;
    }
    const watchedAt =
      record["Playlist Video Creation Timestamp"] || record.added_at || record["Added At"];
    entries.push({ videoId, watchedAt, source: "watch_later" });
  }
  return entries;
}
444
+
445
+ // src/loaders/index.ts
446
/**
 * Turn user-supplied IDs or URLs into video metadata records.
 *
 * Inputs from which `extractVideoId` cannot extract an ID are dropped. An
 * input starting with `http` is kept as the record's `url`; otherwise a
 * standard watch URL is built from the ID.
 *
 * @param {Iterable<string>} inputs - Video IDs and/or URLs.
 * @returns {Array<{videoId: string, url: string, source: string}>}
 */
function fromVideoIds(inputs) {
  const collected = [];
  for (const raw of inputs) {
    const videoId = extractVideoId(raw);
    if (videoId) {
      const url = raw.startsWith("http") ? raw : `https://www.youtube.com/watch?v=${videoId}`;
      collected.push({ videoId, url, source: "manual" });
    }
  }
  return collected;
}
460
/**
 * Merge several lists of video metadata, removing duplicates by `videoId`.
 * When an ID appears more than once, the first occurrence wins; output order
 * follows first appearance.
 *
 * @param {...Iterable<{videoId: string}>} sources - Lists to merge, in priority order.
 * @returns {object[]} De-duplicated records.
 */
function mergeVideoSources(...sources) {
  const byId = new Map();
  sources.forEach((list) => {
    for (const meta of list) {
      if (byId.has(meta.videoId)) {
        continue;
      }
      byId.set(meta.videoId, meta);
    }
  });
  return [...byId.values()];
}
471
/**
 * Collect the video IDs already recorded in a JSONL results file.
 *
 * Each non-blank line is parsed as JSON; the ID is taken from `meta.videoId`
 * or, failing that, from a top-level `videoId`. This is best-effort: lines
 * that cannot be parsed are skipped, and a missing or unreadable file yields
 * whatever was collected so far (an empty set if nothing was read).
 *
 * @param {string} jsonlPath - Path of the JSONL file, read through `Bun.file`.
 * @returns {Promise<Set<string>>} IDs found in the file.
 */
async function loadProcessedIds(jsonlPath) {
  const ids = new Set();
  try {
    const file = Bun.file(jsonlPath);
    const present = await file.exists();
    if (!present) {
      return ids;
    }
    const contents = await file.text();
    for (const rawLine of contents.split("\n")) {
      if (!rawLine.trim()) {
        continue;
      }
      try {
        const record = JSON.parse(rawLine);
        const id = record.meta?.videoId || record.videoId;
        if (id) {
          ids.add(id);
        }
      } catch {
        // Malformed line (or a JSON `null`): ignore it and keep scanning.
      }
    }
  } catch {
    // Reading failed: fall through and return what has been gathered.
  }
  return ids;
}
495
+ // src/outputs/index.ts
496
/**
 * Write results as JSON Lines (one `JSON.stringify`'d record per line,
 * newline-terminated).
 *
 * With `options.append` the records are added after the file's current
 * content; otherwise the file is replaced.
 *
 * @param {object[]} results - Records to serialise.
 * @param {{path: string, append?: boolean}} options - Destination and mode.
 * @returns {Promise<void>}
 */
async function writeJsonl(results, options) {
  const lines = results.map((r) => JSON.stringify(r));
  const content = `${lines.join("\n")}\n`;

  if (!options.append) {
    await Bun.write(options.path, content);
    return;
  }

  // NOTE(review): Bun.write's `mode` option is a permission value, not an open
  // flag, so `{ mode: "a" }` does not append — confirm against the Bun docs.
  // Append the same way appendJsonl and writeCsv do: read, concatenate, rewrite.
  const file = Bun.file(options.path);
  const existing = (await file.exists()) ? await file.text() : "";
  // Keep records on separate lines even if the file lacks a final newline.
  const separator = existing && !existing.endsWith("\n") ? "\n" : "";
  await Bun.write(options.path, `${existing}${separator}${content}`);
}
505
/**
 * Append one record to a JSONL file, creating the file when it does not exist.
 *
 * The file is read in full and rewritten with the new line added, so the cost
 * grows with the file size.
 *
 * @param {object} result - Record to serialise with `JSON.stringify`.
 * @param {string} path - Destination file.
 * @returns {Promise<void>}
 */
async function appendJsonl(result, path) {
  const file = Bun.file(path);
  const existing = (await file.exists()) ? await file.text() : "";
  // Without this, a file lacking a final newline would get the new record
  // glued onto its last line, corrupting both records.
  const separator = existing && !existing.endsWith("\n") ? "\n" : "";
  await Bun.write(path, `${existing}${separator}${JSON.stringify(result)}\n`);
}
511
/**
 * Write results as CSV with a fixed column set.
 *
 * Every data cell is wrapped in double quotes with embedded quotes doubled;
 * the header line is written unquoted. In append mode the header is emitted
 * only when the destination is missing or empty, and new rows are placed
 * after the existing content.
 *
 * @param {object[]} results - `{ meta, transcript, error }` records.
 * @param {{path: string, append?: boolean}} options - Destination and mode.
 * @returns {Promise<void>}
 */
async function writeCsv(results, options) {
  const headerLine = [
    "video_id",
    "source",
    "title",
    "url",
    "channel_name",
    "watched_at",
    "language",
    "is_auto_generated",
    "transcript",
    "error",
  ].join(",");

  const quote = (cell) => `"${String(cell).replace(/"/g, '""')}"`;

  const dataLines = results.map((r) => {
    const cells = [
      r.meta.videoId,
      r.meta.source,
      r.meta.title || "",
      r.meta.url || "",
      r.meta.channel?.name || "",
      r.meta.watchedAt || "",
      r.transcript?.language || "",
      r.transcript?.isAutoGenerated ? "true" : "false",
      r.transcript?.text || "",
      r.error || "",
    ];
    return cells.map(quote).join(",");
  });

  // Default output: header followed by the rows.
  let output = `${[headerLine, ...dataLines].join("\n")}\n`;

  if (options.append) {
    const target = Bun.file(options.path);
    const previous = (await target.exists()) ? await target.text() : "";
    if (previous) {
      // The file already has a header; add only the rows after its content.
      output = `${previous}${dataLines.join("\n")}\n`;
    }
  }

  await Bun.write(options.path, output);
}
549
/**
 * Render a transcript as SubRip (SRT) text.
 *
 * Each segment becomes a 1-based cue number, a `start --> end` timing line
 * built with `formatSrtTime`, the segment text and a blank line.
 *
 * @param {{segments: Array<{start: number, duration: number, text: string}>}} transcript
 * @returns {string} SRT document; empty string when there are no segments.
 */
function formatSrt(transcript) {
  return transcript.segments
    .flatMap((segment, position) => {
      const from = formatSrtTime(segment.start);
      const to = formatSrtTime(segment.start + segment.duration);
      return [String(position + 1), `${from} --> ${to}`, segment.text, ""];
    })
    .join("\n");
}
561
/**
 * Render a transcript as WebVTT text.
 *
 * Output starts with the `WEBVTT` line and a blank line; each segment then
 * contributes a `start --> end` timing line built with `formatVttTime`, the
 * segment text and a blank line.
 *
 * @param {{segments: Iterable<{start: number, duration: number, text: string}>}} transcript
 * @returns {string} WebVTT document.
 */
function formatVtt(transcript) {
  const cueLines = [...transcript.segments].flatMap((segment) => {
    const from = formatVttTime(segment.start);
    const to = formatVttTime(segment.start + segment.duration);
    return [`${from} --> ${to}`, segment.text, ""];
  });
  return ["WEBVTT", "", ...cueLines].join("\n");
}
572
/**
 * Render a transcript as plain text.
 *
 * @param {{text: string, segments: Array<{start: number, text: string}>}} transcript
 * @param {boolean} [includeTimestamps=false] - When true, emit one line per
 *   segment prefixed with `[m:ss]`; otherwise return `transcript.text` as is.
 * @returns {string}
 */
function formatText(transcript, includeTimestamps = false) {
  if (includeTimestamps) {
    const stamped = transcript.segments.map(
      (segment) => `[${formatTimestamp(segment.start)}] ${segment.text}`
    );
    return stamped.join("\n");
  }
  return transcript.text;
}
578
/**
 * Format a time offset as an SRT timestamp, `HH:MM:SS,mmm`.
 *
 * The value is first rounded to whole milliseconds and every field is derived
 * from that integer. Taking the fractional part of the float directly
 * (`seconds % 1 * 1000`) truncates values such as 1.001 to `,000`.
 *
 * @param {number} seconds - Offset in seconds (may be fractional).
 * @returns {string} Timestamp with a comma before the milliseconds.
 */
function formatSrtTime(seconds) {
  const totalMs = Math.round(seconds * 1000);
  const hours = Math.floor(totalMs / 3600000);
  const minutes = Math.floor((totalMs % 3600000) / 60000);
  const secs = Math.floor((totalMs % 60000) / 1000);
  const ms = totalMs % 1000;
  return `${pad(hours, 2)}:${pad(minutes, 2)}:${pad(secs, 2)},${pad(ms, 3)}`;
}
585
/**
 * Format a time offset as a WebVTT timestamp, `HH:MM:SS.mmm`.
 *
 * The value is first rounded to whole milliseconds and every field is derived
 * from that integer. Taking the fractional part of the float directly
 * (`seconds % 1 * 1000`) truncates values such as 1.001 to `.000`.
 *
 * @param {number} seconds - Offset in seconds (may be fractional).
 * @returns {string} Timestamp with a dot before the milliseconds.
 */
function formatVttTime(seconds) {
  const totalMs = Math.round(seconds * 1000);
  const hours = Math.floor(totalMs / 3600000);
  const minutes = Math.floor((totalMs % 3600000) / 60000);
  const secs = Math.floor((totalMs % 60000) / 1000);
  const ms = totalMs % 1000;
  return `${pad(hours, 2)}:${pad(minutes, 2)}:${pad(secs, 2)}.${pad(ms, 3)}`;
}
592
/**
 * Format a time offset as `m:ss` (minutes are not capped at 59 and not padded).
 *
 * @param {number} seconds - Offset in seconds; the fraction is discarded.
 * @returns {string}
 */
function formatTimestamp(seconds) {
  const wholeMinutes = Math.floor(seconds / 60);
  const leftoverSeconds = Math.floor(seconds % 60);
  return [wholeMinutes, pad(leftoverSeconds, 2)].join(":");
}
597
/**
 * Left-pad a value with zeros.
 *
 * @param {*} num - Value to render; converted with `String()`.
 * @param {number} size - Minimum length of the result.
 * @returns {string} The value, prefixed with `0` characters up to `size`.
 */
function pad(num, size) {
  const digits = String(num);
  return digits.padStart(size, "0");
}
600
+ export {
601
+ writeJsonl,
602
+ writeCsv,
603
+ streamVideos,
604
+ processVideos,
605
+ mergeVideoSources,
606
+ loadWatchLater,
607
+ loadWatchHistory,
608
+ loadProcessedIds,
609
+ fromVideoIds,
610
+ formatVtt,
611
+ formatText,
612
+ formatSrt,
613
+ fetchTranscript,
614
+ extractVideoId,
615
+ appendJsonl
616
+ };