@thunderkiller/video-clipper 1.2.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +19 -0
- package/CONTRIBUTING.md +100 -0
- package/LICENSE +15 -0
- package/commitlint.config.js +25 -0
- package/package.json +3 -1
- package/.github/workflows/ci.yml +0 -42
- package/.github/workflows/release.yml +0 -76
- package/.husky/pre-commit +0 -3
- package/.prettierignore +0 -6
- package/.prettierrc +0 -7
- package/.releaserc.json +0 -21
- package/AGENTS.md +0 -122
- package/docs/free-models.md +0 -78
- package/docs/plan.md +0 -442
- package/docs/refactorPhases.md +0 -105
- package/docs/yt-downloader.md +0 -440
- package/requirements.txt +0 -5
- package/scripts/detect_events.py +0 -81
- package/scripts/detect_events_whisper.py +0 -101
- package/scripts/transcribe_whisper.py +0 -70
- package/src/cli.ts +0 -186
- package/src/config/env.ts +0 -18
- package/src/config/index.ts +0 -2
- package/src/index.ts +0 -46
- package/src/pipeline/runner.ts +0 -147
- package/src/pipeline/stages/audioProcessor.ts +0 -127
- package/src/pipeline/stages/clipExporter.ts +0 -76
- package/src/pipeline/stages/segmentAnalyzer.ts +0 -72
- package/src/pipeline/stages/segmentSelector.ts +0 -39
- package/src/pipeline/stages/videoResolver.ts +0 -44
- package/src/services/audioAnalyzers/base.ts +0 -32
- package/src/services/audioAnalyzers/factory.ts +0 -69
- package/src/services/audioAnalyzers/gemini.ts +0 -136
- package/src/services/audioAnalyzers/index.ts +0 -6
- package/src/services/audioAnalyzers/whisper.ts +0 -80
- package/src/services/audioAnalyzers/yamnet.ts +0 -54
- package/src/services/audioDownloader/index.ts +0 -102
- package/src/services/chunkBuilder/index.ts +0 -82
- package/src/services/clipGenerator/index.ts +0 -210
- package/src/services/clipRefiner/index.ts +0 -141
- package/src/services/eventDetector/index.ts +0 -68
- package/src/services/llmAnalyzer/LLMAnalyzer.ts +0 -98
- package/src/services/llmAnalyzer/index.ts +0 -231
- package/src/services/metadataExtractor/index.ts +0 -83
- package/src/services/segmentRanker/index.ts +0 -88
- package/src/services/signalMerger/index.ts +0 -53
- package/src/services/transcriptAnalyzers/base.ts +0 -26
- package/src/services/transcriptAnalyzers/factory.ts +0 -66
- package/src/services/transcriptAnalyzers/gemini.ts +0 -24
- package/src/services/transcriptAnalyzers/index.ts +0 -6
- package/src/services/transcriptAnalyzers/whisper.ts +0 -68
- package/src/services/transcriptAnalyzers/ytdlp.ts +0 -19
- package/src/services/transcriptDetector/index.ts +0 -122
- package/src/services/transcriptFetcher/index.ts +0 -147
- package/src/services/urlParser/index.ts +0 -52
- package/src/services/videoDownloader/index.ts +0 -268
- package/src/types/analyzer.ts +0 -23
- package/src/types/audio.ts +0 -19
- package/src/types/cache.ts +0 -8
- package/src/types/cli.ts +0 -22
- package/src/types/config.ts +0 -151
- package/src/types/downloader.ts +0 -15
- package/src/types/factory.ts +0 -3
- package/src/types/index.ts +0 -40
- package/src/types/pipeline.ts +0 -60
- package/src/types/segment.ts +0 -43
- package/src/types/transcript.ts +0 -22
- package/src/types/video.ts +0 -18
- package/src/utils/cache.ts +0 -224
- package/src/utils/chunker.ts +0 -60
- package/src/utils/dumper.ts +0 -41
- package/src/utils/format.ts +0 -10
- package/src/utils/logger.ts +0 -17
- package/src/utils/modelFactory.ts +0 -71
- package/src/utils/redactConfig.ts +0 -23
- package/src/utils/sliceAudio.ts +0 -35
- package/test-trigger.txt +0 -1
- package/tests/analyzerFactory.test.ts +0 -146
- package/tests/audioEventDetector.test.ts +0 -69
- package/tests/cache.test.ts +0 -203
- package/tests/chunkBuilder.test.ts +0 -146
- package/tests/chunker.test.ts +0 -95
- package/tests/eventDetector.test.ts +0 -103
- package/tests/llmAnalyzer.test.ts +0 -283
- package/tests/segmentRanker.test.ts +0 -133
- package/tests/setup.ts +0 -48
- package/tests/signalMerger.test.ts +0 -197
- package/tests/transcriptDetector.test.ts +0 -150
- package/tests/transcriptFetcher.test.ts +0 -179
- package/tests/urlParser.test.ts +0 -70
- package/tsconfig.json +0 -16
- package/tsconfig.test.json +0 -8
- package/vitest.config.ts +0 -8
|
@@ -1,122 +0,0 @@
|
|
|
1
|
-
import { buildMicroBlocks, buildLLMChunks } from '../chunkBuilder/index.js';
|
|
2
|
-
import { log } from '../../utils/logger.js';
|
|
3
|
-
import { config } from '../../config/index.js';
|
|
4
|
-
import type { TranscriptAnalyzer } from '../transcriptAnalyzers/index.js';
|
|
5
|
-
import type { Cache } from '../../utils/cache.js';
|
|
6
|
-
import type {
|
|
7
|
-
TranscriptLine,
|
|
8
|
-
MicroBlock,
|
|
9
|
-
LLMChunk,
|
|
10
|
-
TranscriptDetectorResult,
|
|
11
|
-
} from '../../types/index.js';
|
|
12
|
-
|
|
13
|
-
/**
 * Resolves the transcript for a video and prepares it for LLM analysis.
 *
 * The detector owns an ordered list ("chain") of TranscriptAnalyzer
 * instances. `detect()` asks each analyzer in turn; the first one that
 * returns lines wins. A throwing analyzer is logged and skipped, and when
 * every analyzer has failed the error of the final one is re-thrown.
 *
 * Once raw lines are available they are grouped into micro-blocks and then
 * into overlapping LLM chunks, using the sizes found in `config`.
 *
 * Transcript lines are read from / written to the Cache passed to
 * `detect()`, so a second run for the same video skips the analyzers.
 *
 * @example
 * const chain = createTranscriptChain('ytdlp,whisper');
 * const detector = new TranscriptDetector(chain);
 * const { lines, microBlocks, chunks } = await detector.detect(videoId, audioPath, cache);
 */
export class TranscriptDetector {
  /**
   * @param chain - Analyzers to try, in priority order
   * @throws {Error} if the chain is empty
   */
  constructor(private readonly chain: TranscriptAnalyzer[]) {
    if (chain.length === 0) {
      throw new Error('TranscriptDetector requires at least one TranscriptAnalyzer in the chain.');
    }
  }

  /**
   * Produces transcript lines, micro-blocks and LLM chunks for a video.
   *
   * The cache is consulted before any analyzer runs; on a miss the chain is
   * walked and the fetched lines are written back to the cache.
   *
   * @param videoId - Video ID forwarded to the cache and the analyzers
   * @param audioPath - Audio file path forwarded to the analyzers, or null
   * @param cache - Cache used to read and write transcript lines
   * @returns The raw lines plus the micro-blocks and chunks derived from them
   */
  async detect(
    videoId: string,
    audioPath: string | null,
    cache: Cache,
  ): Promise<TranscriptDetectorResult> {
    const cachedLines = await cache.readTranscript(videoId);

    let lines: TranscriptLine[];
    if (!cachedLines) {
      lines = await this.fetchFromChain(videoId, audioPath);
      await cache.writeTranscript(videoId, lines);
    } else {
      log.info(`[cache hit] Transcript loaded from cache (${cachedLines.length} lines)`);
      lines = cachedLines;
    }

    const microBlocks = this.buildMicroBlocks(lines);
    const chunks = this.buildChunks(microBlocks);

    return { lines, microBlocks, chunks };
  }

  /**
   * Tries every analyzer in chain order and returns the first successful
   * result. Failures are logged (warn while fallbacks remain, error for the
   * final analyzer) and the last failure is re-thrown if nothing succeeds.
   */
  private async fetchFromChain(
    videoId: string,
    audioPath: string | null,
  ): Promise<TranscriptLine[]> {
    const finalIndex = this.chain.length - 1;
    let failure: unknown;

    for (const [position, analyzer] of this.chain.entries()) {
      try {
        const fetched = await analyzer.detect(videoId, audioPath);
        log.info(`[transcript:${analyzer.source}] fetched ${fetched.length} lines`);
        return fetched;
      } catch (err) {
        failure = err;
        const message = err instanceof Error ? err.message : String(err);

        if (position === finalIndex) {
          log.error(`[transcript:${analyzer.source}] failed (no more fallbacks): ${message}`);
        } else {
          const nextSource = this.chain[position + 1].source;
          log.warn(
            `[transcript:${analyzer.source}] failed, falling back to ${nextSource}: ${message}`,
          );
        }
      }
    }

    throw failure;
  }

  /** Delegates to `buildMicroBlocks` with the configured block length. */
  private buildMicroBlocks(lines: TranscriptLine[]): MicroBlock[] {
    return buildMicroBlocks(lines, config.MICRO_BLOCK_SEC);
  }

  /** Delegates to `buildLLMChunks` with the configured chunk length and overlap. */
  private buildChunks(microBlocks: MicroBlock[]): LLMChunk[] {
    return buildLLMChunks(microBlocks, config.CHUNK_LENGTH_SEC, config.CHUNK_OVERLAP_SEC);
  }
}
|
|
@@ -1,147 +0,0 @@
|
|
|
1
|
-
import { execa } from 'execa';
|
|
2
|
-
import fs from 'node:fs/promises';
|
|
3
|
-
import os from 'node:os';
|
|
4
|
-
import path from 'node:path';
|
|
5
|
-
import { log } from '../../utils/logger.js';
|
|
6
|
-
import { config } from '../../config/index.js';
|
|
7
|
-
import type { TranscriptLine } from '../../types/index.js';
|
|
8
|
-
|
|
9
|
-
/**
 * Parses a WebVTT string into TranscriptLine[].
 *
 * Handles:
 *   - `HH:MM:SS.mmm --> HH:MM:SS.mmm` cue timing lines (the hours part is
 *     optional, and `,` is accepted as the fraction separator)
 *   - inline cue tags such as `<00:01.000><c>text</c>` (stripped)
 *   - the character references `&lt;`, `&gt;`, `&nbsp;` and `&amp;` (decoded)
 *   - empty cues and cues whose text equals the previous cue (skipped)
 *
 * Exported for unit testing.
 *
 * @param vttContent - Full text of a WebVTT file
 * @returns One entry per retained cue with `text`, `start` (seconds) and
 *          `duration` (seconds, never negative)
 */
export function parseVtt(vttContent: string): TranscriptLine[] {
  const lines = vttContent.split(/\r?\n/);
  const result: TranscriptLine[] = [];

  /** Matches a cue timing line; groups 1 and 5 (hours) may be absent. */
  const TIMESTAMP_RE =
    /^(?:(\d{2,}):)?(\d{2}):(\d{2})[.,](\d{3})\s+-->\s+(?:(\d{2,}):)?(\d{2}):(\d{2})[.,](\d{3})/;

  /** Converts the captured timestamp parts to seconds. */
  const toSeconds = (h: string | undefined, m: string, s: string, ms: string): number =>
    parseInt(h ?? '0', 10) * 3600 + parseInt(m, 10) * 60 + parseInt(s, 10) + parseInt(ms, 10) / 1000;

  let i = 0;
  while (i < lines.length) {
    const match = TIMESTAMP_RE.exec(lines[i].trim());

    if (!match) {
      i++;
      continue;
    }

    const startSec = toSeconds(match[1], match[2], match[3], match[4]);
    const endSec = toSeconds(match[5], match[6], match[7], match[8]);

    // Everything up to the next blank line is the cue payload.
    i++;
    const textLines: string[] = [];
    while (i < lines.length && lines[i].trim() !== '') {
      textLines.push(lines[i].trim());
      i++;
    }

    const text = textLines
      .join(' ')
      // Tags are removed before decoding so a decoded "<" is never mistaken for markup.
      .replace(/<[^>]+>/g, '')
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&nbsp;/g, ' ')
      // "&amp;" is decoded last so "&amp;lt;" yields the literal "&lt;" rather than "<".
      .replace(/&amp;/g, '&')
      .replace(/\s+/g, ' ')
      .trim();

    if (text.length === 0) {
      continue;
    }

    /** Skip duplicate cues - YouTube VTT often repeats same line as text scrolls */
    if (result.length > 0 && result[result.length - 1].text === text) {
      continue;
    }

    result.push({ text, start: startSec, duration: Math.max(0, endSec - startSec) });
  }

  return result;
}
|
|
83
|
-
|
|
84
|
-
/**
 * Fetches the transcript for a given YouTube video ID using yt-dlp
 * auto-generated subtitles (VTT format).
 *
 * yt-dlp writes the subtitle file(s) into a fresh temp directory; one file
 * is parsed into TranscriptLine[] and the directory is removed afterwards,
 * whether or not the fetch succeeded. Cookie config
 * (YT_DLP_COOKIES_FROM_BROWSER, else YT_DLP_COOKIES_FILE) is forwarded to
 * yt-dlp when set.
 *
 * Because `--sub-lang en.*` may produce several files, the candidates are
 * sorted and `<videoId>.en.vtt` is preferred, so the choice does not depend
 * on directory listing order.
 *
 * @param videoId - YouTube video ID, interpolated into the watch URL
 * @returns Parsed, non-empty list of transcript lines
 * @throws {Error} with the yt-dlp failure message if the command fails
 * @throws {Error} if no subtitle file is produced
 * @throws {Error} if the subtitle file contains no parseable cues
 */
export async function fetchTranscript(videoId: string): Promise<TranscriptLine[]> {
  const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), 'vc-vtt-'));

  try {
    const args = [
      '--write-auto-sub',
      '--sub-format',
      'vtt',
      '--sub-lang',
      'en.*',
      '--skip-download',
      '--output',
      path.join(tmpDir, '%(id)s.%(ext)s'),
      `https://www.youtube.com/watch?v=${videoId}`,
    ];

    // Browser cookies take precedence over a cookie file when both are set.
    if (config.YT_DLP_COOKIES_FROM_BROWSER) {
      args.unshift('--cookies-from-browser', config.YT_DLP_COOKIES_FROM_BROWSER);
    } else if (config.YT_DLP_COOKIES_FILE) {
      args.unshift('--cookies', config.YT_DLP_COOKIES_FILE);
    }

    try {
      await execa('yt-dlp', args);
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      throw new Error(`yt-dlp failed to fetch subtitles for "${videoId}": ${message}`);
    }

    // readdir order is unspecified, so sort before choosing a file.
    const vttFiles = (await fs.readdir(tmpDir)).filter((f) => f.endsWith('.vtt')).sort();
    // NOTE(review): assumes yt-dlp names subtitle files "<id>.<lang>.vtt" for this
    // output template; if not, the first sorted candidate is used.
    const vttFile = vttFiles.find((f) => f === `${videoId}.en.vtt`) ?? vttFiles[0];

    if (!vttFile) {
      throw new Error(
        `No subtitles found for "${videoId}". The video may not have auto-generated captions.`,
      );
    }

    const content = await fs.readFile(path.join(tmpDir, vttFile), 'utf8');
    const lines = parseVtt(content);

    log.info(`Parsed ${lines.length} cues from subtitle file "${vttFile}".`);

    if (lines.length === 0) {
      throw new Error(`Subtitle file for "${videoId}" was empty or contained no parseable cues.`);
    }

    return lines;
  } finally {
    await fs.rm(tmpDir, { recursive: true, force: true });
  }
}
|
|
@@ -1,52 +0,0 @@
|
|
|
1
|
-
const VIDEO_ID_LENGTH = 11;

/** Characters permitted in a video ID: letters, digits, "-" and "_". */
const VIDEO_ID_CHARS_RE = /^[A-Za-z0-9_-]+$/;

/** youtube.com path prefixes that are directly followed by the video ID. */
const ID_PATH_PREFIXES = ['/embed/', '/shorts/'] as const;

/**
 * Parses a YouTube URL and returns the 11-character video ID.
 * Supports (with or without a `www.`, `m.` or `music.` host prefix):
 *   - https://www.youtube.com/watch?v=VIDEO_ID
 *   - https://youtu.be/VIDEO_ID
 *   - https://www.youtube.com/embed/VIDEO_ID
 *   - https://www.youtube.com/shorts/VIDEO_ID
 *
 * The ID is validated for both length and character set because callers
 * interpolate it into a URL handed to yt-dlp.
 *
 * @param url - The URL to parse
 * @returns The extracted video ID
 * @throws {Error} if the URL cannot be parsed, no video ID can be extracted,
 *         the ID is not 11 characters long, or it contains characters other
 *         than letters, digits, "-" and "_"
 */
export function parseUrl(url: string): string {
  let parsed: URL;

  try {
    parsed = new URL(url);
  } catch {
    throw new Error(`Invalid URL: "${url}"`);
  }

  const { hostname, pathname, searchParams } = parsed;
  const host = hostname.replace(/^(?:www|m|music)\./, '');

  let videoId: string | null = null;

  if (host === 'youtube.com') {
    if (pathname === '/watch') {
      videoId = searchParams.get('v');
    } else {
      const prefix = ID_PATH_PREFIXES.find((p) => pathname.startsWith(p));
      if (prefix) {
        // The ID is the first path segment after the prefix.
        videoId = pathname.slice(prefix.length).split('/')[0] ?? null;
      }
    }
  } else if (host === 'youtu.be') {
    videoId = pathname.slice(1).split('/')[0] ?? null;
  }

  if (!videoId) {
    throw new Error(`Could not extract video ID from URL: "${url}"`);
  }

  // A decoded `v` value may still carry a trailing "?..." fragment; drop it.
  videoId = videoId.split('?')[0];

  if (videoId.length !== VIDEO_ID_LENGTH) {
    throw new Error(
      `Invalid video ID "${videoId}": expected ${VIDEO_ID_LENGTH} characters, got ${videoId.length}`,
    );
  }

  if (!VIDEO_ID_CHARS_RE.test(videoId)) {
    throw new Error(
      `Invalid video ID "${videoId}": only letters, digits, "-" and "_" are allowed`,
    );
  }

  return videoId;
}
|
|
@@ -1,268 +0,0 @@
|
|
|
1
|
-
import { execa } from 'execa';
|
|
2
|
-
import { promises as fs } from 'fs';
|
|
3
|
-
import { join } from 'path';
|
|
4
|
-
import pLimit from 'p-limit';
|
|
5
|
-
import { config } from '../../config/index.js';
|
|
6
|
-
import { log } from '../../utils/logger.js';
|
|
7
|
-
import type { RankedSegment, DownloadMode, DownloadResult } from '../../types/index.js';
|
|
8
|
-
|
|
9
|
-
/**
 * Formats a timestamp for yt-dlp --download-sections.
 * Converts seconds to HH:MM:SS.mmm format with millisecond precision.
 *
 * The value is rounded to whole milliseconds before being split into
 * fields, so a fraction such as .9996 carries into the seconds (and
 * minutes/hours) instead of producing a four-digit "1000" millisecond field.
 * Negative inputs are clamped to zero.
 *
 * @param seconds - Offset from the start of the video, in seconds
 * @returns The offset formatted as `HH:MM:SS.mmm`
 */
function formatTimestamp(seconds: number): string {
  const totalMs = Math.max(0, Math.round(seconds * 1000));

  const h = Math.floor(totalMs / 3_600_000);
  const m = Math.floor((totalMs % 3_600_000) / 60_000);
  const s = Math.floor((totalMs % 60_000) / 1000);
  const ms = totalMs % 1000;

  const pad = (value: number, width: number): string => String(value).padStart(width, '0');

  return `${pad(h, 2)}:${pad(m, 2)}:${pad(s, 2)}.${pad(ms, 3)}`;
}
|
|
21
|
-
|
|
22
|
-
/**
 * Builds a `data` listener that echoes yt-dlp download percentages.
 *
 * Each incoming chunk is split into lines; for every non-blank line that
 * contains a `[download] NN%` marker, the marker is written to
 * process.stdout prefixed with a carriage return so it overwrites the
 * previous progress output. The `stream` argument is accepted but not used.
 *
 * @param stream - Name of the stream the listener is attached to
 * @returns Listener suitable for `subprocess.stdout/stderr.on('data', ...)`
 */
function displayProgress(stream: 'stdout' | 'stderr'): (data: Buffer | string) => void {
  const PROGRESS_RE = /\[download\]\s+(\d+\.?\d*%)/;

  return (data: Buffer | string) => {
    String(data)
      .split('\n')
      .forEach((rawLine) => {
        if (!rawLine.trim()) {
          return;
        }
        const hit = rawLine.match(PROGRESS_RE);
        if (hit) {
          process.stdout.write(`\r${hit[0]}`);
        }
      });
  };
}
|
|
38
|
-
|
|
39
|
-
/**
 * Downloads a complete YouTube video as mp4 via yt-dlp.
 *
 * Behaviour:
 *   - The target directory (customPath, else config.DOWNLOAD_DIR) is created
 *     if missing.
 *   - If `<videoId>.mp4` already exists there, it is reused and yt-dlp is
 *     not invoked.
 *   - yt-dlp failures are translated into clearer errors when the message
 *     indicates a missing binary, a private video or a geo-block.
 *
 * @param videoId - 11-character YouTube video ID
 * @param customPath - Custom output directory (optional, overrides DOWNLOAD_DIR)
 * @returns Path of the mp4 file inside the download directory
 * @throws {Error} if yt-dlp is not installed or the download fails
 */
export async function downloadFullVideo(videoId: string, customPath?: string): Promise<string> {
  const targetDir = customPath || config.DOWNLOAD_DIR;
  await fs.mkdir(targetDir, { recursive: true });

  const outputPath = join(targetDir, `${videoId}.mp4`);

  try {
    await fs.access(outputPath);
    log.info(`Video already downloaded: ${outputPath}`);
    return outputPath;
  } catch {
    // File is not there yet; fall through to the download.
  }

  log.info(`Downloading full video ${videoId} via yt-dlp...`);

  try {
    // Browser cookies take precedence over a cookie file when both are set.
    let cookieArgs: string[] = [];
    if (config.YT_DLP_COOKIES_FROM_BROWSER) {
      cookieArgs = ['--cookies-from-browser', config.YT_DLP_COOKIES_FROM_BROWSER];
    } else if (config.YT_DLP_COOKIES_FILE) {
      cookieArgs = ['--cookies', config.YT_DLP_COOKIES_FILE];
    }

    const args = [
      ...cookieArgs,
      '-f',
      'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]',
      '--merge-output-format',
      'mp4',
      '-o',
      outputPath,
      '--no-playlist',
      '--newline',
      `https://www.youtube.com/watch?v=${videoId}`,
    ];

    const child = execa('yt-dlp', args);

    child.stdout?.on('data', displayProgress('stdout'));
    child.stderr?.on('data', displayProgress('stderr'));

    await child;
    // Terminate the carriage-return progress line.
    process.stdout.write('\n');
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    const mentions = (needle: string): boolean => message.includes(needle);

    if (mentions('command not found') || mentions('ENOENT')) {
      throw new Error('yt-dlp is required. Install it: https://github.com/yt-dlp/yt-dlp');
    }

    if (mentions('Private video') || mentions('Sign in')) {
      throw new Error(`Video "${videoId}" is private and cannot be downloaded.`);
    }

    if (mentions('not available in your country') || mentions('geo')) {
      throw new Error(`Video "${videoId}" is geo-blocked in your region.`);
    }

    throw new Error(`Download failed: ${message}`);
  }

  log.info(`Download complete: ${outputPath}`);
  return outputPath;
}
|
|
114
|
-
|
|
115
|
-
/**
 * Downloads a single segment using yt-dlp --download-sections.
 *
 * The segment bounds are shifted by config.TIMESTAMP_OFFSET_SECONDS, the
 * start is clamped to zero and the end is kept at least one second after the
 * start. The output file is named `<videoId>_<floor(start)>_<ceil(end)>.mp4`;
 * if that file already exists it is reused and yt-dlp is not invoked.
 *
 * @param videoId - YouTube video ID, interpolated into the watch URL
 * @param segment - Segment to download; `start`, `end` and `reason` are read
 * @param index - Zero-based position of the segment, used only in log messages
 * @param customPath - Custom output directory (optional, overrides DOWNLOAD_DIR)
 * @returns Path of the downloaded (or pre-existing) clip
 * @throws {Error} if yt-dlp is not installed or the download fails
 */
async function downloadSegment(
  videoId: string,
  segment: RankedSegment,
  index: number,
  customPath?: string,
): Promise<string> {
  const downloadDir = customPath || config.DOWNLOAD_DIR;
  await fs.mkdir(downloadDir, { recursive: true });

  const adjustedStart = Math.max(0, segment.start + config.TIMESTAMP_OFFSET_SECONDS);
  const adjustedEnd = Math.max(adjustedStart + 1, segment.end + config.TIMESTAMP_OFFSET_SECONDS);
  const startInt = Math.floor(adjustedStart);
  const endInt = Math.ceil(adjustedEnd);
  const outputPath = join(downloadDir, `${videoId}_${startInt}_${endInt}.mp4`);

  try {
    await fs.access(outputPath);
    // The total segment count is not known here, so only the ordinal is logged.
    log.info(`Segment ${index + 1} already downloaded: ${outputPath}`);
    return outputPath;
  } catch {
    // File is not there yet; fall through to the download.
  }

  const startTs = formatTimestamp(adjustedStart);
  const endTs = formatTimestamp(adjustedEnd);

  log.info(`Downloading segment ${index + 1}: ${startTs} - ${endTs} (${segment.reason})`);
  log.info(`  Requested: ${segment.start.toFixed(2)}s - ${segment.end.toFixed(2)}s`);
  if (config.TIMESTAMP_OFFSET_SECONDS !== 0) {
    log.info(
      `  Adjusted:  ${adjustedStart.toFixed(2)}s - ${adjustedEnd.toFixed(2)}s (offset: ${config.TIMESTAMP_OFFSET_SECONDS}s)`,
    );
  }

  try {
    const args = [
      '-f',
      'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]',
      '--merge-output-format',
      'mp4',
      '--download-sections',
      `*${startTs}-${endTs}`,
      '-o',
      outputPath,
      '--no-playlist',
      '--newline',
      `https://www.youtube.com/watch?v=${videoId}`,
    ];

    // Browser cookies take precedence over a cookie file when both are set.
    if (config.YT_DLP_COOKIES_FROM_BROWSER) {
      args.splice(0, 0, '--cookies-from-browser', config.YT_DLP_COOKIES_FROM_BROWSER);
    } else if (config.YT_DLP_COOKIES_FILE) {
      args.splice(0, 0, '--cookies', config.YT_DLP_COOKIES_FILE);
    }

    const subprocess = execa('yt-dlp', args);

    subprocess.stdout?.on('data', displayProgress('stdout'));
    subprocess.stderr?.on('data', displayProgress('stderr'));

    await subprocess;
    // Terminate the carriage-return progress line.
    process.stdout.write('\n');
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);

    if (message.includes('command not found') || message.includes('ENOENT')) {
      throw new Error('yt-dlp is required. Install it: https://github.com/yt-dlp/yt-dlp');
    }

    if (message.includes('Private video') || message.includes('Sign in')) {
      throw new Error(`Video "${videoId}" is private and cannot be downloaded.`);
    }

    if (message.includes('not available in your country') || message.includes('geo')) {
      throw new Error(`Video "${videoId}" is geo-blocked in your region.`);
    }

    throw new Error(`Segment download failed: ${message}`);
  }

  log.info(`Segment complete: ${outputPath}`);
  return outputPath;
}
|
|
199
|
-
|
|
200
|
-
/**
 * Downloads several segments concurrently and collects the resulting paths.
 *
 * Concurrency is capped at the smaller of config.LLM_CONCURRENCY and 3.
 * A failed segment does not abort the others: it is logged as a warning and
 * simply left out of the returned list.
 *
 * @param videoId - YouTube video ID
 * @param segments - Segments to download; an empty list returns immediately
 * @param customPath - Custom output directory (optional)
 * @returns Paths of the segments that downloaded successfully, in input order
 */
async function downloadSegments(
  videoId: string,
  segments: RankedSegment[],
  customPath?: string,
): Promise<string[]> {
  if (segments.length === 0) {
    return [];
  }

  const limit = pLimit(Math.min(config.LLM_CONCURRENCY, 3));
  const jobs = segments.map((segment, index) =>
    limit(() => downloadSegment(videoId, segment, index, customPath)),
  );
  const outcomes: Array<PromiseSettledResult<string>> = await Promise.allSettled(jobs);

  const paths: string[] = [];
  outcomes.forEach((outcome, position) => {
    if (outcome.status === 'fulfilled') {
      paths.push(outcome.value);
      return;
    }

    const segment = segments[position];
    const reason = outcome.reason instanceof Error ? outcome.reason.message : String(outcome.reason);
    log.warn(
      `Failed to download segment [${formatTimestamp(segment.start)} – ${formatTimestamp(segment.end)}] (rank ${segment.rank}): ${reason}`,
    );
  });

  return paths;
}
|
|
235
|
-
|
|
236
|
-
/**
 * Entry point for downloads: dispatches on the requested mode.
 *
 * - `'all'` downloads the full video and reports its single path.
 * - `'segments'` downloads each ranked segment; with no segments it logs a
 *   warning and reports an empty path list without downloading anything.
 *
 * @param videoId - 11-character YouTube video ID
 * @param mode - Download mode: 'all' (full video) or 'segments' (individual clips)
 * @param segments - Ranked segments (used when mode is 'segments')
 * @param customPath - Custom output directory (optional, overrides config defaults)
 * @returns Download result containing the mode and either `path` or `paths`
 * @throws {Error} if `mode` is neither 'all' nor 'segments'
 */
export async function downloadVideo(
  videoId: string,
  mode: DownloadMode = 'all',
  segments: RankedSegment[] = [],
  customPath?: string,
): Promise<DownloadResult> {
  switch (mode) {
    case 'all': {
      const fullVideoPath = await downloadFullVideo(videoId, customPath);
      return { mode: 'all', path: fullVideoPath };
    }

    case 'segments': {
      if (segments.length > 0) {
        log.info(`Downloading ${segments.length} segments in parallel...`);
        const clipPaths = await downloadSegments(videoId, segments, customPath);
        return { mode: 'segments', paths: clipPaths };
      }

      log.warn('No segments provided for download-segments mode. Skipping download.');
      return { mode: 'segments', paths: [] };
    }

    default:
      throw new Error(`Invalid download mode: ${mode}`);
  }
}
|
package/src/types/analyzer.ts
DELETED
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
import type { TranscriptLine, MicroBlock, LLMChunk, ChunkEvaluation } from './index.js';
|
|
2
|
-
|
|
3
|
-
/** Transcript data produced by the transcript detector. */
export interface TranscriptDetectorResult {
  /** Raw transcript lines. */
  lines: TranscriptLine[];
  /** Micro-blocks built from `lines`. */
  microBlocks: MicroBlock[];
  /** LLM chunks built from `microBlocks`. */
  chunks: LLMChunk[];
}

/**
 * Result of the LLM analyzer: the same transcript fields as
 * TranscriptDetectorResult plus one list of chunk evaluations.
 */
export interface LLMAnalyzerResult extends TranscriptDetectorResult {
  chunkEvals: ChunkEvaluation[];
}

/** Options accepted by the LLM analyzer. */
export interface LLMAnalyzerOpts {
  videoId: string;
  /** Path to the audio file, or null when there is none. */
  audioPath: string | null;
  audioEvents: import('./audio.js').AudioEvent[];
  /** Optional; presumably caps how many chunks are analyzed — confirm against the analyzer. */
  maxChunks?: number;
  maxParallel: number;
  noCache: boolean;
}
|
package/src/types/audio.ts
DELETED
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
import { z } from 'zod';
|
|
2
|
-
|
|
3
|
-
/** Field validators for a single detected audio event. */
const audioEventShape = {
  time: z.number(),
  event: z.string(),
  // Confidence must lie within [0, 1].
  confidence: z.number().min(0).max(1),
  source: z.enum(['gemini', 'yamnet', 'whisper']),
};

/** Schema for an audio event reported by one of the audio analyzers. */
export const AudioEventSchema = z.object(audioEventShape);
export type AudioEvent = z.infer<typeof AudioEventSchema>;

/** Field validators for a merged clip candidate. */
const mergedCandidateShape = {
  start: z.number(),
  end: z.number(),
  // Score must lie within [1, 10].
  score: z.number().min(1).max(10),
  source: z.enum(['transcript', 'audio', 'both']),
  reason: z.string(),
  audio_event: z.string().optional(),
};

/** Schema for a candidate segment whose `source` is transcript, audio or both. */
export const MergedCandidateSchema = z.object(mergedCandidateShape);
export type MergedCandidate = z.infer<typeof MergedCandidateSchema>;
|
package/src/types/cache.ts
DELETED
package/src/types/cli.ts
DELETED
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
/**
 * Shape of the parsed command-line arguments.
 *
 * Kept in the shared types so that `src/cli.ts` (the producer) and
 * `src/pipeline/runner.ts` (the consumer) agree on one definition.
 * Members typed `T | undefined` must always be present on the object;
 * members marked `?` may be omitted entirely.
 */
export interface CliArgs {
  // --- input selection ---
  url: string | undefined;
  localVideo?: string;
  videoPath: string | undefined;

  // --- clipping / download ---
  clip: boolean;
  /** Either the literal 'all' or a number; undefined when not supplied. */
  downloadSections: 'all' | number | undefined;

  // --- numeric tuning (undefined when not supplied) ---
  threshold: number | undefined;
  topN: number | undefined;
  maxDuration: number | undefined;
  maxChunks: number | undefined;
  maxParallel: number | undefined;

  // --- output ---
  outputJson: string | undefined;

  // --- switches ---
  noCache: boolean;
  noAudio: boolean;
  help: boolean;

  // --- optional profile ---
  gameProfile?: string;
}
|