@tikoci/rosetta 0.4.2 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,734 @@
1
+ /**
2
+ * extract-videos.ts — Extract MikroTik YouTube channel transcripts into the videos table.
3
+ *
4
+ * Uses yt-dlp (system dependency) to download video metadata and auto-generated
5
+ * subtitles (English VTT), then parses them into the `videos` and `video_segments` tables.
6
+ *
7
+ * Incremental: skips videos already in the DB (use --force to re-extract).
8
+ * NOT part of `make extract` — requires yt-dlp installed separately.
9
+ *
10
+ * Usage:
11
+ * bun run src/extract-videos.ts # full channel, incremental
12
+ * bun run src/extract-videos.ts --limit=10 # dev: process at most 10 new videos
13
+ * bun run src/extract-videos.ts --force # re-extract all (delete + reinsert)
14
+ * bun run src/extract-videos.ts --playlist=URL # override channel URL
15
+ * bun run src/extract-videos.ts --max-duration=600 # cap at 10 min (default: 1500)
16
+ *
17
+ * Requirements:
18
+ * brew install yt-dlp # macOS
19
+ * apt install yt-dlp # Ubuntu/Debian
20
+ * pip install yt-dlp # any platform
21
+ */
22
+
23
+ import { existsSync, mkdirSync, readdirSync, readFileSync, rmSync, writeFileSync } from "node:fs";
24
+ import { tmpdir } from "node:os";
25
+ import { dirname, join } from "node:path";
26
+ import { db, initDb } from "./db.ts";
27
+
28
// ── Config ──

/** yt-dlp executable — override with YTDLP env var for testing. */
export const YTDLP_DEFAULT = process.env.YTDLP ?? "yt-dlp";

/** Hard timeout per video download (ms). yt-dlp with --retries 2 --socket-timeout 15 should
 * self-terminate well before this, but this is the absolute backstop. */
const DOWNLOAD_TIMEOUT_MS = 120_000; // 2 minutes

/** Hard timeout for playlist listing (ms). */
const LIST_TIMEOUT_MS = 300_000; // 5 minutes

/** Default channel whose /videos tab is enumerated (overridable via --playlist). */
const CHANNEL_URL = "https://www.youtube.com/@MikroTik/videos";
// Duration window in seconds: videos outside [MIN_DURATION, max] are skipped.
const DEFAULT_MAX_DURATION = 1500; // 25 minutes — excludes long MUM talks
const MIN_DURATION = 90; // 1.5 minutes — excludes Shorts

/** Title patterns (regexes) that indicate MUM conference content to skip. */
const MUM_TITLE_PATTERNS = [
  /\bMUM\b/,
  /mikrotik user meeting/i,
  /\bpresentation\b.*\b(mum|meeting)\b/i,
];
50
+
51
+ // ── CLI flags ──
52
+
53
+ const rawArgs = process.argv.slice(2);
54
+
55
+ function getFlag(name: string): string | undefined {
56
+ for (const arg of rawArgs) {
57
+ if (arg.startsWith(`--${name}=`)) return arg.slice(name.length + 3);
58
+ }
59
+ return undefined;
60
+ }
61
+
62
/** --limit=N: process at most N new videos (dev convenience). */
const LIMIT = getFlag("limit") ? Number(getFlag("limit")) : undefined;
/** --force: delete and re-insert videos already present in the DB. */
const FORCE = rawArgs.includes("--force");
/** --playlist=URL: override the default channel URL. */
const PLAYLIST = getFlag("playlist") ?? CHANNEL_URL;
/** --max-duration=N: upper duration bound in seconds (default DEFAULT_MAX_DURATION). */
const MAX_DURATION = getFlag("max-duration") ? Number(getFlag("max-duration")) : DEFAULT_MAX_DURATION;
/** Exit non-zero if any video fails to download. Use in CI: make extract-videos ARGS=--strict */
const STRICT = rawArgs.includes("--strict");
/** Read NDJSON from latest transcripts/ dir and import into DB (no yt-dlp). */
const FROM_CACHE = rawArgs.includes("--from-cache");
/** Write NDJSON to transcripts/YYYY-MM-DD/videos.ndjson after yt-dlp extraction. */
const SAVE_CACHE = rawArgs.includes("--save-cache");
/** Path to known-bad JSON {id: reason} — skip these video IDs during yt-dlp extraction. */
const KNOWN_BAD_PATH = getFlag("known-bad");
74
+
75
// ── Types ──

/** A chapter marker as emitted in yt-dlp's info.json (times in seconds). */
type Chapter = {
  start_time: number;
  end_time: number;
  title: string;
};

/** Subset of yt-dlp's info.json that this extractor consumes; optional fields may be absent. */
type YtVideoInfo = {
  id: string;
  title: string;
  description?: string;
  channel?: string;
  upload_date?: string;
  duration?: number;
  webpage_url?: string;
  view_count?: number;
  like_count?: number;
  chapters?: Chapter[];
};

/** One deduplicated caption cue: start offset in seconds plus cleaned text. */
export type VttCue = {
  start_s: number;
  text: string;
};

/** A chapter-aligned slice of a transcript; end_s is null when unbounded (no chapters). */
export type TranscriptSegment = {
  chapter_title: string | null;
  start_s: number;
  end_s: number | null;
  transcript: string;
};
107
+
108
+ // ── Product name normalization ──
109
+
110
+ /** Map of Unicode superscript/subscript digits → ASCII digits. */
111
+ const DIGIT_SUPER_SUB: Record<string, string> = {
112
+ "⁰": "0", "¹": "1", "²": "2", "³": "3", "⁴": "4",
113
+ "⁵": "5", "⁶": "6", "⁷": "7", "⁸": "8", "⁹": "9",
114
+ "₀": "0", "₁": "1", "₂": "2", "₃": "3", "₄": "4",
115
+ "₅": "5", "₆": "6", "₇": "7", "₈": "8", "₉": "9",
116
+ };
117
+
118
+ /**
119
+ * Normalize Unicode superscript/subscript digits to ASCII digits.
120
+ * e.g. "hAP ax³" → "hAP ax3", "hAP ax²" → "hAP ax2"
121
+ * Preserves null for nullable columns.
122
+ */
123
+ function normalizeSuperscripts(s: string): string;
124
+ function normalizeSuperscripts(s: string | null): string | null;
125
+ function normalizeSuperscripts(s: string | null): string | null {
126
+ if (s === null) return null;
127
+ return s.replace(/[⁰¹²³⁴⁵⁶⁷⁸⁹₀₁₂₃₄₅₆₇₈₉]/g, (c) => DIGIT_SUPER_SUB[c] ?? c);
128
+ }
129
+
130
+ // ── yt-dlp check ──
131
+
132
+ function checkYtDlp(ytdlp = YTDLP_DEFAULT): boolean {
133
+ const result = Bun.spawnSync([ytdlp, "--version"], { stdio: ["inherit", "pipe", "pipe"] });
134
+ if (result.exitCode === 0) {
135
+ const version = new TextDecoder().decode(result.stdout).trim();
136
+ console.log(`yt-dlp ${version}`);
137
+ return true;
138
+ }
139
+ console.error(`yt-dlp not found. Install it before running this extractor:
140
+
141
+ macOS: brew install yt-dlp
142
+ Ubuntu: apt install yt-dlp
143
+ Any: pip install yt-dlp
144
+ Docs: https://github.com/yt-dlp/yt-dlp#installation`);
145
+ return false;
146
+ }
147
+
148
+ // ── VTT parsing ──
149
+
150
+ const VTT_TIMESTAMP_RE = /^(\d{2}):(\d{2}):(\d{2})\.(\d{3}) --> /;
151
+ const HTML_TAG_RE = /<[^>]+>/g;
152
+ // Inline timestamp tags like <00:00:01.234>
153
+ const INLINE_TS_RE = /<\d{2}:\d{2}:\d{2}\.\d{3}>/g;
154
+
155
+ function vttTimestampToSeconds(text: string): number {
156
+ const m = text.match(VTT_TIMESTAMP_RE);
157
+ if (!m) return 0;
158
+ return Number(m[1]) * 3600 + Number(m[2]) * 60 + Number(m[3]) + Number(m[4]) / 1000;
159
+ }
160
+
161
+ /**
162
+ * Parse a WebVTT string into cues with start time (seconds) and clean text.
163
+ * Auto-generated YouTube VTT has overlapping sliding-window cues — we deduplicate
164
+ * by skipping cues whose text is already a suffix of the accumulated line buffer.
165
+ */
166
+ export function parseVtt(vttText: string): VttCue[] {
167
+ const lines = vttText.split("\n");
168
+ const cues: VttCue[] = [];
169
+ let currentStart = 0;
170
+ let currentLines: string[] = [];
171
+ let inCueText = false;
172
+ let prevAccumulated = "";
173
+
174
+ function flushCue() {
175
+ if (currentLines.length === 0) return;
176
+ const raw = currentLines.join(" ").trim();
177
+ if (!raw) return;
178
+ // Strip HTML tags and inline timestamp tags
179
+ const clean = raw.replace(INLINE_TS_RE, "").replace(HTML_TAG_RE, "").replace(/\s+/g, " ").trim();
180
+ if (!clean) return;
181
+ // Deduplicate: skip if clean text is a suffix of what we've already emitted
182
+ if (prevAccumulated.endsWith(clean)) return;
183
+ cues.push({ start_s: currentStart, text: clean });
184
+ // Track last ~200 chars of accumulated text for dedup check
185
+ prevAccumulated = `${prevAccumulated} ${clean}`.slice(-200);
186
+ }
187
+
188
+ for (const rawLine of lines) {
189
+ const line = rawLine.replace(/\r/, "");
190
+
191
+ if (line.startsWith("WEBVTT") || line.startsWith("Kind:") || line.startsWith("Language:")) {
192
+ inCueText = false;
193
+ continue;
194
+ }
195
+
196
+ if (VTT_TIMESTAMP_RE.test(line)) {
197
+ flushCue();
198
+ currentStart = vttTimestampToSeconds(line);
199
+ currentLines = [];
200
+ inCueText = true;
201
+ continue;
202
+ }
203
+
204
+ if (line.trim() === "") {
205
+ if (inCueText) flushCue();
206
+ inCueText = false;
207
+ currentLines = [];
208
+ continue;
209
+ }
210
+
211
+ // Cue identifier lines (pure numbers or alphanumeric IDs before timestamp)
212
+ if (inCueText) {
213
+ currentLines.push(line);
214
+ }
215
+ }
216
+ flushCue();
217
+
218
+ return cues;
219
+ }
220
+
221
+ /**
222
+ * Group VTT cues into segments by chapter.
223
+ * If no chapters provided, returns a single segment covering the whole video.
224
+ */
225
+ export function segmentTranscript(cues: VttCue[], chapters?: Chapter[]): TranscriptSegment[] {
226
+ if (!chapters || chapters.length === 0) {
227
+ return [
228
+ {
229
+ chapter_title: null,
230
+ start_s: 0,
231
+ end_s: null,
232
+ transcript: cues.map((c) => c.text).join(" ").trim(),
233
+ },
234
+ ];
235
+ }
236
+
237
+ return chapters.map((ch, i) => {
238
+ const next = chapters[i + 1];
239
+ const chCues = cues.filter((c) => c.start_s >= ch.start_time && c.start_s < ch.end_time);
240
+ return {
241
+ chapter_title: ch.title,
242
+ start_s: Math.round(ch.start_time),
243
+ end_s: next ? Math.round(next.start_time) : Math.round(ch.end_time),
244
+ transcript: chCues.map((c) => c.text).join(" ").trim(),
245
+ };
246
+ });
247
+ }
248
+
249
+ // ── Filtering ──
250
+
251
+ function isMumContent(title: string): boolean {
252
+ return MUM_TITLE_PATTERNS.some((p) => p.test(title));
253
+ }
254
+
255
+ function isInDurationRange(duration: number | undefined): boolean {
256
+ if (duration === undefined) return false;
257
+ return duration >= MIN_DURATION && duration <= MAX_DURATION;
258
+ }
259
+
260
+ // ── yt-dlp invocation helpers ──
261
+
262
+ /** List all videos in a playlist/channel, return flat metadata entries. */
263
+ export function listPlaylist(
264
+ url: string,
265
+ ytdlp = YTDLP_DEFAULT,
266
+ timeoutMs = LIST_TIMEOUT_MS,
267
+ ): Array<{ id: string; title: string; duration?: number }> {
268
+ console.log(`Listing videos from: ${url}`);
269
+ const result = Bun.spawnSync(
270
+ [ytdlp, "--flat-playlist", "--dump-json", "--socket-timeout", "15", "--retries", "2", "--no-warnings", url],
271
+ { stdio: ["inherit", "pipe", "pipe"], timeout: timeoutMs },
272
+ );
273
+ if (result.exitCode === null) {
274
+ throw new Error(`yt-dlp playlist listing timed out after ${timeoutMs / 1000}s`);
275
+ }
276
+ if (result.exitCode !== 0) {
277
+ const stderr = new TextDecoder().decode(result.stderr);
278
+ throw new Error(`yt-dlp playlist listing failed (exit ${result.exitCode}): ${stderr.trim()}`);
279
+ }
280
+ const stdout = new TextDecoder().decode(result.stdout);
281
+ const entries: Array<{ id: string; title: string; duration?: number }> = [];
282
+ for (const line of stdout.split("\n")) {
283
+ const trimmed = line.trim();
284
+ if (!trimmed) continue;
285
+ try {
286
+ const obj = JSON.parse(trimmed) as { id?: string; title?: string; duration?: number };
287
+ if (obj.id && obj.title) {
288
+ entries.push({ id: obj.id, title: obj.title, duration: obj.duration });
289
+ }
290
+ } catch {
291
+ // skip malformed lines
292
+ }
293
+ }
294
+ return entries;
295
+ }
296
+
297
+ /** Download metadata + VTT transcript for one video into tmpDir.
298
+ * Returns "ok" | "timeout" | "error" — never throws.
299
+ * The ytdlp and timeoutMs params exist for testing (pass a mock binary path). */
300
+ export function downloadTranscript(
301
+ videoId: string,
302
+ tmpDir: string,
303
+ ytdlp = YTDLP_DEFAULT,
304
+ timeoutMs = DOWNLOAD_TIMEOUT_MS,
305
+ ): "ok" | "timeout" | "error" {
306
+ const url = `https://www.youtube.com/watch?v=${videoId}`;
307
+ const result = Bun.spawnSync(
308
+ [
309
+ ytdlp,
310
+ "--skip-download",
311
+ "--write-auto-subs",
312
+ "--write-info-json",
313
+ "--sub-format", "vtt",
314
+ "--sub-langs", "en",
315
+ "--socket-timeout", "15", // per-connection HTTP timeout (seconds)
316
+ "--retries", "2", // was default 10 — prevents indefinite retry loops
317
+ "--fragment-retries", "2",
318
+ "--no-warnings",
319
+ "-o", join(tmpDir, "%(id)s.%(ext)s"),
320
+ url,
321
+ ],
322
+ { stdio: ["inherit", "pipe", "pipe"], timeout: timeoutMs },
323
+ );
324
+ if (result.exitCode === null) return "timeout";
325
+ if (result.exitCode !== 0) return "error";
326
+ return "ok";
327
+ }
328
+
329
+ // ── DB helpers ──
330
+
331
+ function videoExists(videoId: string): boolean {
332
+ const row = db.prepare("SELECT id FROM videos WHERE video_id = ?").get(videoId);
333
+ return row !== null;
334
+ }
335
+
336
+ function deleteVideoData(videoId: string): void {
337
+ const row = db.prepare("SELECT id FROM videos WHERE video_id = ?").get(videoId) as { id: number } | null;
338
+ if (!row) return;
339
+ db.run("DELETE FROM video_segments WHERE video_id = ?", [row.id]);
340
+ db.run("DELETE FROM videos WHERE id = ?", [row.id]);
341
+ }
342
+
343
// ── Cache: NDJSON export / import ──
//
// Format: one JSON object per line (NDJSON). Each line is a VideoCacheEntry.
// The transcripts/ directory mirrors the matrix/ pattern:
//   transcripts/YYYY-MM-DD/videos.ndjson — committed to git, used by CI
//   transcripts/known-bad.json           — manually maintained {id: reason}
//
// Workflow:
//   Local: make extract-videos            # fetch from YouTube, slow (~30–60 min)
//          make save-videos-cache         # export DB → transcripts/YYYY-MM-DD/videos.ndjson
//          git add transcripts/ && git commit
//   CI:    make extract-videos-from-cache # import from committed NDJSON, fast (~5 s)
355
+
356
/** One transcript segment as serialized in the NDJSON cache. */
export type VideoCacheSegment = {
  chapter_title: string | null;
  start_s: number;
  end_s: number | null;
  transcript: string;
  sort_order: number;
};

/** One video (with its segments inlined) as serialized per NDJSON line. */
export type VideoCacheEntry = {
  video_id: string;
  title: string;
  description: string | null;
  channel: string | null;
  upload_date: string | null;
  duration_s: number | null;
  url: string;
  view_count: number | null;
  like_count: number | null;
  has_chapters: number;
  segments: VideoCacheSegment[];
};
377
+
378
+ /**
379
+ * Export all videos + segments from DB to an NDJSON file.
380
+ * Creates the output directory if needed. Returns the number of videos written.
381
+ */
382
+ export function saveCache(outputPath: string): number {
383
+ const videos = db
384
+ .prepare("SELECT video_id, title, description, channel, upload_date, duration_s, url, view_count, like_count, has_chapters FROM videos ORDER BY upload_date DESC, video_id")
385
+ .all() as Omit<VideoCacheEntry, "segments">[];
386
+
387
+ type SegRow = VideoCacheSegment & { video_id: string };
388
+ const segRows = db
389
+ .prepare("SELECT v.video_id, vs.chapter_title, vs.start_s, vs.end_s, vs.transcript, vs.sort_order FROM video_segments vs JOIN videos v ON v.id = vs.video_id ORDER BY v.video_id, vs.sort_order")
390
+ .all() as SegRow[];
391
+
392
+ // Index segments by video_id string
393
+ const segMap = new Map<string, VideoCacheSegment[]>();
394
+ for (const row of segRows) {
395
+ const segs = segMap.get(row.video_id) ?? [];
396
+ segs.push({ chapter_title: row.chapter_title, start_s: row.start_s, end_s: row.end_s, transcript: row.transcript, sort_order: row.sort_order });
397
+ segMap.set(row.video_id, segs);
398
+ }
399
+
400
+ const lines = videos.map((v) => {
401
+ const entry: VideoCacheEntry = { ...v, segments: segMap.get(v.video_id) ?? [] };
402
+ return JSON.stringify(entry);
403
+ });
404
+
405
+ mkdirSync(dirname(outputPath), { recursive: true });
406
+ writeFileSync(outputPath, `${lines.join("\n")}\n`, "utf8");
407
+ return videos.length;
408
+ }
409
+
410
/**
 * Import videos + segments from an NDJSON cache file into the DB.
 * Skips videos already present unless force=true.
 * knownBad is a Set of video IDs to skip entirely.
 * Returns { imported, skipped, knownBadSkipped }.
 */
export function importCache(
  ndjsonPath: string,
  opts: { force?: boolean; knownBad?: Set<string> } = {},
): { imported: number; skipped: number; knownBadSkipped: number } {
  const { force = false, knownBad = new Set<string>() } = opts;

  const text = readFileSync(ndjsonPath, "utf8");
  // Drop blank lines (the trailing newline yields one empty split entry).
  const lines = text.split("\n").filter((l) => l.trim());

  const insertVideo = db.prepare(`
    INSERT OR REPLACE INTO videos (video_id, title, description, channel, upload_date, duration_s, url, view_count, like_count, has_chapters)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
  `);
  const insertSegment = db.prepare(`
    INSERT INTO video_segments (video_id, chapter_title, start_s, end_s, transcript, sort_order)
    VALUES (?, ?, ?, ?, ?, ?)
  `);

  let imported = 0;
  let skipped = 0;
  let knownBadSkipped = 0;

  for (const line of lines) {
    let entry: VideoCacheEntry;
    try {
      entry = JSON.parse(line) as VideoCacheEntry;
    } catch {
      console.warn(` \u26a0 skipping malformed NDJSON line`);
      continue;
    }

    // Known-bad IDs are excluded outright, regardless of force.
    if (knownBad.has(entry.video_id)) {
      knownBadSkipped++;
      continue;
    }

    // Incremental mode: leave videos that are already in the DB untouched.
    if (!force && videoExists(entry.video_id)) {
      skipped++;
      continue;
    }

    // --force: remove old rows first so segments aren't duplicated on re-insert.
    if (force) deleteVideoData(entry.video_id);

    // Insert the video row and all its segments atomically.
    db.transaction(() => {
      insertVideo.run(
        entry.video_id, normalizeSuperscripts(entry.title), normalizeSuperscripts(entry.description ?? null), entry.channel,
        entry.upload_date, entry.duration_s, entry.url,
        entry.view_count, entry.like_count, entry.has_chapters,
      );
      // Re-read the rowid: INSERT OR REPLACE may have assigned a fresh one.
      const row = db.prepare("SELECT id FROM videos WHERE video_id = ?").get(entry.video_id) as { id: number };
      for (const seg of entry.segments) {
        insertSegment.run(row.id, normalizeSuperscripts(seg.chapter_title), seg.start_s, seg.end_s, seg.transcript, seg.sort_order);
      }
    })();

    imported++;
  }

  return { imported, skipped, knownBadSkipped };
}
476
+
477
+ /**
478
+ * Load the known-bad map from a JSON file ({id: reason}).
479
+ * Returns an empty Set if the file doesn't exist or can't be parsed.
480
+ * Keys starting with "_" are treated as metadata/comments and ignored.
481
+ */
482
+ export function loadKnownBad(jsonPath: string): Set<string> {
483
+ if (!existsSync(jsonPath)) return new Set();
484
+ try {
485
+ const obj = JSON.parse(readFileSync(jsonPath, "utf8")) as Record<string, string>;
486
+ return new Set(Object.keys(obj).filter((k) => !k.startsWith("_")));
487
+ } catch {
488
+ console.warn(` ⚠ could not parse known-bad file: ${jsonPath}`);
489
+ return new Set();
490
+ }
491
+ }
492
+
493
+ /**
494
+ * Find the most recent transcripts/YYYY-MM-DD/videos.ndjson under the project root.
495
+ * Returns null if none found.
496
+ */
497
+ export function findLatestCache(projectRoot: string): string | null {
498
+ const transcriptsDir = join(projectRoot, "transcripts");
499
+ if (!existsSync(transcriptsDir)) return null;
500
+
501
+ const dirs = readdirSync(transcriptsDir, { withFileTypes: true })
502
+ .filter((d) => d.isDirectory() && /^\d{4}-\d{2}-\d{2}$/.test(d.name))
503
+ .map((d) => d.name)
504
+ .sort()
505
+ .reverse(); // newest first
506
+
507
+ for (const dir of dirs) {
508
+ const candidate = join(transcriptsDir, dir, "videos.ndjson");
509
+ if (existsSync(candidate)) return candidate;
510
+ }
511
+ return null;
512
+ }
513
+
514
// ── Main ──

/**
 * Orchestrates extraction: either imports the committed NDJSON cache (--from-cache),
 * or lists the channel with yt-dlp, filters by duration/MUM/known-bad, downloads each
 * transcript, parses it, and inserts video + segment rows incrementally.
 */
async function main() {
  // ── Fast path: --from-cache ──
  if (FROM_CACHE) {
    initDb();
    // NOTE(review): URL.pathname is percent-encoded and not Windows-safe;
    // fileURLToPath would be more robust — confirm target platforms before changing.
    const projectRoot = new URL("..", import.meta.url).pathname.replace(/\/$/, "");
    const knownBadPath = KNOWN_BAD_PATH ?? join(projectRoot, "transcripts", "known-bad.json");
    const knownBad = loadKnownBad(knownBadPath);
    const cachePath = findLatestCache(projectRoot);
    if (!cachePath) {
      console.error("No cache found. Run `make save-videos-cache` after a local extraction.");
      process.exit(1);
    }
    console.log(`Importing from cache: ${cachePath}`);
    if (knownBad.size > 0) console.log(` Skipping ${knownBad.size} known-bad IDs`);
    const result = importCache(cachePath, { force: FORCE, knownBad });
    console.log(`Done: ${result.imported} imported, ${result.skipped} skipped (already present), ${result.knownBadSkipped} known-bad`);
    return;
  }

  // Slow path requires a working yt-dlp binary.
  if (!checkYtDlp()) process.exit(1);

  initDb();

  // Scratch dir for yt-dlp output (info.json + .vtt files), removed on exit/SIGINT.
  const tmpDir = join(tmpdir(), "rosetta-yt");
  mkdirSync(tmpDir, { recursive: true });

  let listed: Array<{ id: string; title: string; duration?: number }>;
  try {
    listed = listPlaylist(PLAYLIST);
  } catch (err) {
    console.error(`Failed to list playlist: ${err}`);
    process.exit(1);
  }

  // Filter by duration + MUM content
  const filtered = listed.filter(
    (v) => isInDurationRange(v.duration) && !isMumContent(v.title),
  );

  // Load known-bad list and filter those out too
  const projectRoot = new URL("..", import.meta.url).pathname.replace(/\/$/, "");
  const knownBadPath = KNOWN_BAD_PATH ?? join(projectRoot, "transcripts", "known-bad.json");
  const knownBad = loadKnownBad(knownBadPath);
  if (knownBad.size > 0) console.log(`Loaded ${knownBad.size} known-bad video IDs from ${knownBadPath}`);

  const afterKnownBad = filtered.filter((v) => !knownBad.has(v.id));
  console.log(`\nPlaylist: ${listed.length} total → ${filtered.length} after filter (duration ${MIN_DURATION}–${MAX_DURATION}s, no MUM)${knownBad.size > 0 ? ` → ${afterKnownBad.length} after known-bad` : ""}`);

  // Apply limit for dev
  const toProcess = LIMIT !== undefined ? afterKnownBad.slice(0, LIMIT) : afterKnownBad;

  // Cleanup on SIGINT so the temp dir is removed even if the user Ctrl+C's
  process.on("SIGINT", () => {
    try { rmSync(tmpDir, { recursive: true, force: true }); } catch { /* ignore */ }
    process.exit(130);
  });

  // Per-run counters for the final summary line.
  let newCount = 0;
  let skippedCount = 0;
  let failedCount = 0;
  let timedOutCount = 0;
  let noTranscriptCount = 0;
  const failedIds: string[] = [];

  const insertVideo = db.prepare(`
    INSERT OR REPLACE INTO videos (video_id, title, description, channel, upload_date, duration_s, url, view_count, like_count, has_chapters)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
  `);
  const insertSegment = db.prepare(`
    INSERT INTO video_segments (video_id, chapter_title, start_s, end_s, transcript, sort_order)
    VALUES (?, ?, ?, ?, ?, ?)
  `);

  for (let i = 0; i < toProcess.length; i++) {
    const entry = toProcess[i];
    const prefix = `[${i + 1}/${toProcess.length}]`;

    // Incremental: only --force re-extracts videos already in the DB.
    if (!FORCE && videoExists(entry.id)) {
      console.log(`${prefix} skip (already extracted): ${entry.title}`);
      skippedCount++;
      continue;
    }

    const t0 = Date.now();
    console.log(`${prefix} extracting: ${entry.title}`);

    const dlResult = downloadTranscript(entry.id, tmpDir);
    const elapsed = ((Date.now() - t0) / 1000).toFixed(1);
    if (dlResult === "timeout") {
      console.error(` ✗ TIMEOUT after ${elapsed}s — ${entry.title} (${entry.id})`);
      failedIds.push(entry.id);
      timedOutCount++;
      failedCount++;
      continue;
    }
    if (dlResult === "error") {
      console.error(` ✗ yt-dlp error after ${elapsed}s — ${entry.title} (${entry.id})`);
      failedIds.push(entry.id);
      failedCount++;
      continue;
    }

    // Find generated files
    const infoPath = join(tmpDir, `${entry.id}.info.json`);
    // yt-dlp may output en.vtt or en-orig.vtt depending on version
    const vttCandidates = [
      join(tmpDir, `${entry.id}.en.vtt`),
      join(tmpDir, `${entry.id}.en-orig.vtt`),
    ];
    const vttPath = vttCandidates.find((p) => existsSync(p));

    if (!existsSync(infoPath)) {
      console.warn(` ✗ info.json not found for ${entry.id}`);
      failedCount++;
      continue;
    }

    let info: YtVideoInfo;
    try {
      info = JSON.parse(readFileSync(infoPath, "utf8")) as YtVideoInfo;
    } catch {
      console.warn(` ✗ failed to parse info.json for ${entry.id}`);
      failedCount++;
      continue;
    }

    // Parse the VTT into chapter-aligned segments; metadata-only fallback otherwise.
    let segments: TranscriptSegment[] = [];
    if (vttPath && existsSync(vttPath)) {
      const vttText = readFileSync(vttPath, "utf8");
      const cues = parseVtt(vttText);
      segments = segmentTranscript(cues, info.chapters);
    } else {
      console.warn(` ⚠ no English transcript for ${entry.id} — storing metadata only`);
      noTranscriptCount++;
      // Store a placeholder segment so the video is still discoverable
      segments = [{ chapter_title: null, start_s: 0, end_s: null, transcript: "" }];
    }

    // Skip segments with empty transcripts if we got no VTT at all
    const hasRealTranscript = segments.some((s) => s.transcript.length > 0);
    if (!hasRealTranscript) {
      console.warn(` ⚠ empty transcript for ${entry.id}, storing metadata only`);
    }

    // Remove old data if --force
    if (FORCE) deleteVideoData(entry.id);

    // A single auto-generated chapter isn't useful; require more than one.
    const hasChapters = (info.chapters?.length ?? 0) > 1 ? 1 : 0;

    // Insert the video row and all its segments atomically.
    db.transaction(() => {
      insertVideo.run(
        info.id,
        normalizeSuperscripts(info.title),
        normalizeSuperscripts(info.description ?? null),
        info.channel ?? null,
        info.upload_date ?? null,
        info.duration != null ? Math.round(info.duration) : null,
        info.webpage_url ?? `https://youtu.be/${info.id}`,
        info.view_count ?? null,
        info.like_count ?? null,
        hasChapters,
      );
      const videoRow = db.prepare("SELECT id FROM videos WHERE video_id = ?").get(info.id) as { id: number };
      for (let si = 0; si < segments.length; si++) {
        const seg = segments[si];
        insertSegment.run(
          videoRow.id,
          normalizeSuperscripts(seg.chapter_title),
          seg.start_s,
          seg.end_s,
          seg.transcript,
          si,
        );
      }
    })();

    const chapNote = hasChapters ? ` (${info.chapters?.length} chapters)` : "";
    const segNote = hasRealTranscript ? ` → ${segments.length} segment(s)${chapNote}` : " (no transcript)";
    console.log(` ✓ ${info.title}${segNote}`);
    newCount++;

    // Clean up temp files for this video
    for (const p of [infoPath, ...vttCandidates]) {
      try { if (existsSync(p)) rmSync(p); } catch { /* ignore */ }
    }
  }

  // Final cleanup of temp dir
  try { rmSync(tmpDir, { recursive: true, force: true }); } catch { /* ignore */ }

  console.log(`\nDone: ${newCount} new, ${skippedCount} skipped, ${noTranscriptCount} no-transcript, ${failedCount} failed (${timedOutCount} timeout)`);

  if (failedIds.length > 0) {
    console.error(`\nFailed video IDs (${failedIds.length}):`);
    for (const id of failedIds) {
      console.error(` https://www.youtube.com/watch?v=${id}`);
    }
  }

  // Write NDJSON cache if requested (--save-cache)
  if (SAVE_CACHE) {
    const date = new Date().toISOString().slice(0, 10);
    const outPath = join(projectRoot, "transcripts", date, "videos.ndjson");
    const count = saveCache(outPath);
    console.log(`\nCache written: ${outPath} (${count} videos)`);
  }

  if (STRICT && failedCount > 0) {
    console.error(`\nExiting non-zero: ${failedCount} video(s) failed and --strict mode is active`);
    process.exit(1);
  }
}
728
+
729
// Run only when executed directly (not when imported by tests).
if (import.meta.main) {
  main().catch((err) => {
    console.error(err);
    process.exit(1);
  });
}