@thunderkiller/video-clipper 1.2.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +19 -0
- package/CONTRIBUTING.md +100 -0
- package/LICENSE +15 -0
- package/commitlint.config.js +25 -0
- package/package.json +3 -1
- package/.github/workflows/ci.yml +0 -42
- package/.github/workflows/release.yml +0 -76
- package/.husky/pre-commit +0 -3
- package/.prettierignore +0 -6
- package/.prettierrc +0 -7
- package/.releaserc.json +0 -21
- package/AGENTS.md +0 -122
- package/docs/free-models.md +0 -78
- package/docs/plan.md +0 -442
- package/docs/refactorPhases.md +0 -105
- package/docs/yt-downloader.md +0 -440
- package/requirements.txt +0 -5
- package/scripts/detect_events.py +0 -81
- package/scripts/detect_events_whisper.py +0 -101
- package/scripts/transcribe_whisper.py +0 -70
- package/src/cli.ts +0 -186
- package/src/config/env.ts +0 -18
- package/src/config/index.ts +0 -2
- package/src/index.ts +0 -46
- package/src/pipeline/runner.ts +0 -147
- package/src/pipeline/stages/audioProcessor.ts +0 -127
- package/src/pipeline/stages/clipExporter.ts +0 -76
- package/src/pipeline/stages/segmentAnalyzer.ts +0 -72
- package/src/pipeline/stages/segmentSelector.ts +0 -39
- package/src/pipeline/stages/videoResolver.ts +0 -44
- package/src/services/audioAnalyzers/base.ts +0 -32
- package/src/services/audioAnalyzers/factory.ts +0 -69
- package/src/services/audioAnalyzers/gemini.ts +0 -136
- package/src/services/audioAnalyzers/index.ts +0 -6
- package/src/services/audioAnalyzers/whisper.ts +0 -80
- package/src/services/audioAnalyzers/yamnet.ts +0 -54
- package/src/services/audioDownloader/index.ts +0 -102
- package/src/services/chunkBuilder/index.ts +0 -82
- package/src/services/clipGenerator/index.ts +0 -210
- package/src/services/clipRefiner/index.ts +0 -141
- package/src/services/eventDetector/index.ts +0 -68
- package/src/services/llmAnalyzer/LLMAnalyzer.ts +0 -98
- package/src/services/llmAnalyzer/index.ts +0 -231
- package/src/services/metadataExtractor/index.ts +0 -83
- package/src/services/segmentRanker/index.ts +0 -88
- package/src/services/signalMerger/index.ts +0 -53
- package/src/services/transcriptAnalyzers/base.ts +0 -26
- package/src/services/transcriptAnalyzers/factory.ts +0 -66
- package/src/services/transcriptAnalyzers/gemini.ts +0 -24
- package/src/services/transcriptAnalyzers/index.ts +0 -6
- package/src/services/transcriptAnalyzers/whisper.ts +0 -68
- package/src/services/transcriptAnalyzers/ytdlp.ts +0 -19
- package/src/services/transcriptDetector/index.ts +0 -122
- package/src/services/transcriptFetcher/index.ts +0 -147
- package/src/services/urlParser/index.ts +0 -52
- package/src/services/videoDownloader/index.ts +0 -268
- package/src/types/analyzer.ts +0 -23
- package/src/types/audio.ts +0 -19
- package/src/types/cache.ts +0 -8
- package/src/types/cli.ts +0 -22
- package/src/types/config.ts +0 -151
- package/src/types/downloader.ts +0 -15
- package/src/types/factory.ts +0 -3
- package/src/types/index.ts +0 -40
- package/src/types/pipeline.ts +0 -60
- package/src/types/segment.ts +0 -43
- package/src/types/transcript.ts +0 -22
- package/src/types/video.ts +0 -18
- package/src/utils/cache.ts +0 -224
- package/src/utils/chunker.ts +0 -60
- package/src/utils/dumper.ts +0 -41
- package/src/utils/format.ts +0 -10
- package/src/utils/logger.ts +0 -17
- package/src/utils/modelFactory.ts +0 -71
- package/src/utils/redactConfig.ts +0 -23
- package/src/utils/sliceAudio.ts +0 -35
- package/test-trigger.txt +0 -1
- package/tests/analyzerFactory.test.ts +0 -146
- package/tests/audioEventDetector.test.ts +0 -69
- package/tests/cache.test.ts +0 -203
- package/tests/chunkBuilder.test.ts +0 -146
- package/tests/chunker.test.ts +0 -95
- package/tests/eventDetector.test.ts +0 -103
- package/tests/llmAnalyzer.test.ts +0 -283
- package/tests/segmentRanker.test.ts +0 -133
- package/tests/setup.ts +0 -48
- package/tests/signalMerger.test.ts +0 -197
- package/tests/transcriptDetector.test.ts +0 -150
- package/tests/transcriptFetcher.test.ts +0 -179
- package/tests/urlParser.test.ts +0 -70
- package/tsconfig.json +0 -16
- package/tsconfig.test.json +0 -8
- package/vitest.config.ts +0 -8
|
@@ -1,231 +0,0 @@
|
|
|
1
|
-
import { generateObject } from 'ai';
|
|
2
|
-
import pLimit from 'p-limit';
|
|
3
|
-
import { config } from '../../config/index.js';
|
|
4
|
-
import { log } from '../../utils/logger.js';
|
|
5
|
-
import { formatSeconds } from '../../utils/format.js';
|
|
6
|
-
import { getModel } from '../../utils/modelFactory.js';
|
|
7
|
-
import { Cache } from '../../utils/cache.js';
|
|
8
|
-
import { AnalyzedSegmentSchema } from '../../types/index.js';
|
|
9
|
-
import type {
|
|
10
|
-
LLMChunk,
|
|
11
|
-
TranscriptLine,
|
|
12
|
-
AnalyzedSegment,
|
|
13
|
-
ChunkEvaluation,
|
|
14
|
-
AudioEvent,
|
|
15
|
-
} from '../../types/index.js';
|
|
16
|
-
|
|
17
|
-
const BACKOFF_BASE_MS = 1000;
|
|
18
|
-
const BACKOFF_JITTER_MS = 500;
|
|
19
|
-
|
|
20
|
-
const DEFAULT_SYSTEM_PROMPT = `You are an expert video editor analyzing a YouTube transcript segment.
|
|
21
|
-
|
|
22
|
-
Identify if this segment contains a potentially interesting moment worth clipping.
|
|
23
|
-
|
|
24
|
-
Interesting moments include:
|
|
25
|
-
- surprising insights or revelations
|
|
26
|
-
- strong or controversial opinions
|
|
27
|
-
- humor or entertaining storytelling
|
|
28
|
-
- emotional moments
|
|
29
|
-
- key explanations of important concepts
|
|
30
|
-
- "aha" moments or turning points
|
|
31
|
-
|
|
32
|
-
If audio events are listed in the segment, treat them as strong positive signals —
|
|
33
|
-
they indicate high-action or high-energy moments that are often clip-worthy.`;
|
|
34
|
-
|
|
35
|
-
/**
 * Heuristically decides whether a thrown value represents a rate-limit failure.
 *
 * A value is treated as a rate-limit error when either:
 *  - it is an object carrying a numeric `statusCode` or `status` equal to 429, or
 *  - its message (case-insensitive) contains "rate limit" or "too many requests", or
 *  - its message contains the number 429 not embedded in a longer run of digits.
 *
 * @param err - Whatever was thrown; non-Error values are stringified before matching.
 * @returns true when the failure looks like HTTP 429 / rate limiting.
 */
function isRateLimitError(err: unknown): boolean {
  if (typeof err === 'object' && err !== null) {
    const { statusCode, status } = err as { statusCode?: unknown; status?: unknown };
    if (statusCode === 429 || status === 429) {
      return true;
    }
  }

  const message = (err instanceof Error ? err.message : String(err)).toLowerCase();
  return (
    message.includes('rate limit') ||
    message.includes('too many requests') ||
    // Digit guards keep unrelated numbers such as "14290ms" from being read as HTTP 429.
    /(?<!\d)429(?!\d)/.test(message)
  );
}
|
|
43
|
-
|
|
44
|
-
/** Returns a promise that resolves once a `setTimeout` of `ms` milliseconds fires. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
47
|
-
|
|
48
|
-
/**
 * Renders the user prompt for one transcript chunk.
 *
 * Layout of the returned text:
 *  - a header with the chunk's START / END seconds
 *  - the transcript body: one `[<start>s] <text>` row per line in `chunkLines`,
 *    or `chunk.text` when `chunkLines` is empty
 *  - an optional "Audio Events Detected" section, sorted by event time
 *  - the response rules; the two semantic scoring rules are left out when
 *    `isCustomSystemPrompt` is true
 *
 * @param chunk - Chunk whose `start`, `end` and `text` are rendered.
 * @param chunkLines - Transcript lines to list; may be empty.
 * @param chunkAudioEvents - Audio events to list; the array is copied before sorting.
 * @param isCustomSystemPrompt - When true, the semantic scoring hints are omitted.
 * @returns The prompt string.
 */
function buildPrompt(
  chunk: LLMChunk,
  chunkLines: TranscriptLine[],
  chunkAudioEvents: AudioEvent[],
  isCustomSystemPrompt: boolean,
): string {
  // Prefer timestamped rows; fall back to the chunk's raw text.
  let transcriptBody = chunk.text;
  if (chunkLines.length > 0) {
    transcriptBody = chunkLines
      .map((line) => `[${line.start.toFixed(1)}s] ${line.text}`)
      .join('\n');
  }

  let audioSection = '';
  if (chunkAudioEvents.length > 0) {
    const eventRows = [...chunkAudioEvents]
      .sort((a, b) => a.time - b.time)
      .map(
        (evt) =>
          ` [${evt.time.toFixed(1)}s] ${evt.event} (confidence: ${evt.confidence.toFixed(2)})`,
      );
    audioSection = `\nAudio Events Detected (within this segment):\n${eventRows.join('\n')}\n`;
  }

  const rules: string[] = [];
  if (!isCustomSystemPrompt) {
    rules.push(
      '- Set interesting=true only if the segment is genuinely compelling',
      '- Score 1-10 (7+ means worth clipping, 9-10 means viral potential)',
    );
  }
  rules.push(
    `- clip_start and clip_end must be within [${chunk.start}, ${chunk.end}]`,
    '- clip_start must be less than clip_end',
  );

  return [
    'Transcript Segment:',
    `START: ${chunk.start}s`,
    `END: ${chunk.end}s`,
    '',
    transcriptBody,
    audioSection,
    'Rules for your response:',
    ...rules,
  ].join('\n');
}
|
|
90
|
-
|
|
91
|
-
/**
 * Analyzes a single LLM chunk, retrying rate-limited calls with exponential backoff + jitter.
 *
 * Flow:
 *  1. Unless `noCache` is set, return the cached evaluation when one with status
 *     'success' exists for this chunk.
 *  2. Call `generateObject` up to `config.LLM_MAX_RETRIES + 1` times. Only errors
 *     recognised by `isRateLimitError` are retried; any other error is logged and
 *     rethrown immediately.
 *  3. Unless `noCache` is set, write the successful result to the chunk cache.
 *     A failing cache write is logged and ignored so it cannot discard a good result.
 *
 * @param chunk - Chunk to analyze.
 * @param chunkLines - Transcript lines rendered into the prompt.
 * @param chunkAudioEvents - Audio events rendered into the prompt and passed to the cache calls.
 * @param noCache - When true, the cache is neither read nor written.
 * @param chunkIndex - Index used in log output and stored in the cached record.
 * @returns The structured analysis from the model, or the cached one.
 * @throws The error from the model call when it is not a rate-limit error or retries are exhausted.
 */
async function analyzeChunk(
  chunk: LLMChunk,
  chunkLines: TranscriptLine[],
  chunkAudioEvents: AudioEvent[],
  noCache: boolean,
  chunkIndex: number,
): Promise<AnalyzedSegment> {
  // One cache handle serves both the lookup and the later write.
  const cache = noCache ? null : new Cache(config.CACHE_DIR);

  if (cache) {
    const cached = await cache.readChunk(chunk, chunkAudioEvents);
    if (cached && cached.status === 'success') {
      log.info(`[chunk] cache hit (${formatSeconds(chunk.start)}–${formatSeconds(chunk.end)})`);
      return {
        interesting: cached.interesting,
        score: cached.score,
        reason: cached.reason,
        clip_start: cached.clip_start,
        clip_end: cached.clip_end,
      };
    }
  }

  // The prompt does not depend on the attempt number, so it is built once.
  const prompt = buildPrompt(chunk, chunkLines, chunkAudioEvents, !!config.LLM_SYSTEM_PROMPT);
  let lastError: unknown;

  for (let attempt = 0; attempt <= config.LLM_MAX_RETRIES; attempt++) {
    try {
      const { object } = await generateObject({
        model: getModel(),
        schema: AnalyzedSegmentSchema,
        system: config.LLM_SYSTEM_PROMPT ?? DEFAULT_SYSTEM_PROMPT,
        prompt,
        // Retries are driven by this loop, so the SDK-level retry count is set to 0.
        maxRetries: 0,
      });
      log.info(
        `Chunk ${chunkIndex} analysis complete: interesting=${object.interesting}, score=${object.score}, reason=${object.reason}`,
      );

      if (cache) {
        const evaluation: ChunkEvaluation = {
          status: 'success' as const,
          chunk_index: chunkIndex,
          chunk_start: chunk.start,
          chunk_end: chunk.end,
          interesting: object.interesting,
          score: object.score,
          reason: object.reason,
          clip_start: object.clip_start,
          clip_end: object.clip_end,
        };
        try {
          await cache.writeChunk(chunk, evaluation, chunkAudioEvents);
        } catch (cacheErr) {
          // Caching is best-effort: keep the analysis even if it cannot be persisted.
          log.warn(
            `[chunk] Failed to cache result for chunk ${chunkIndex}: ${
              cacheErr instanceof Error ? cacheErr.message : String(cacheErr)
            }`,
          );
        }
      }

      return object;
    } catch (err) {
      lastError = err;

      if (isRateLimitError(err) && attempt < config.LLM_MAX_RETRIES) {
        const delay = BACKOFF_BASE_MS * Math.pow(2, attempt) + Math.random() * BACKOFF_JITTER_MS;
        log.warn(
          `[chunk] Rate limit hit (attempt ${attempt + 1}/${config.LLM_MAX_RETRIES + 1}). ` +
            `Retrying in ${Math.round(delay)}ms...`,
        );
        await sleep(delay);
        continue;
      }

      log.error(`[chunk] Analysis failed: ${err instanceof Error ? err.message : String(err)}`);
      throw err;
    }
  }

  // Only reachable when the loop body never ran (e.g. a negative LLM_MAX_RETRIES).
  throw lastError ?? new Error(`[chunk] No analysis attempt was made for chunk ${chunkIndex}`);
}
|
|
167
|
-
|
|
168
|
-
/**
 * Analyzes all LLM chunks via Promise.allSettled — one failure never aborts the rest.
 *
 * Each chunk receives only the transcript lines whose `start`, and the audio events
 * whose `time`, fall inside `[chunk.start, chunk.end)`. Calls to `analyzeChunk` are
 * throttled through `pLimit(concurrency)`.
 *
 * @param chunks - Chunks to analyze; the result array is index-aligned with this one.
 * @param lines - Full transcript; filtered per chunk.
 * @param audioEvents - All detected audio events; filtered per chunk.
 * @param concurrency - Maximum number of chunk analyses in flight at once.
 * @param noCache - Pass true to bypass the on-disk chunk cache.
 * @returns One evaluation per chunk: status 'success' with the model's fields, or
 *          status 'failed' with the error message.
 */
export async function analyzeChunks(
  chunks: LLMChunk[],
  lines: TranscriptLine[],
  audioEvents: AudioEvent[],
  concurrency: number,
  noCache = false,
): Promise<ChunkEvaluation[]> {
  log.info(
    `Analyzing ${chunks.length} chunk${chunks.length !== 1 ? 's' : ''} (max ${concurrency} parallel)...`,
  );

  const limit = pLimit(concurrency);
  const results = await Promise.allSettled(
    chunks.map((chunk, i) => {
      const chunkLines = lines.filter((l) => l.start >= chunk.start && l.start < chunk.end);
      const chunkAudioEvents = audioEvents.filter(
        (e) => e.time >= chunk.start && e.time < chunk.end,
      );
      return limit(() => analyzeChunk(chunk, chunkLines, chunkAudioEvents, noCache, i));
    }),
  );

  // Converting settled results is synchronous work, so a plain map is enough.
  const evaluations = results.map((result, i): ChunkEvaluation => {
    const chunk = chunks[i];

    if (result.status === 'fulfilled') {
      const seg: AnalyzedSegment = result.value;
      return {
        status: 'success' as const,
        chunk_index: i,
        chunk_start: chunk.start,
        chunk_end: chunk.end,
        interesting: seg.interesting,
        score: seg.score,
        reason: seg.reason,
        clip_start: seg.clip_start,
        clip_end: seg.clip_end,
      };
    }

    const error = result.reason instanceof Error ? result.reason.message : String(result.reason);
    log.warn(`[chunk ${i}] LLM analysis skipped: ${error}`);
    return {
      status: 'failed' as const,
      chunk_index: i,
      chunk_start: chunk.start,
      chunk_end: chunk.end,
      error,
    };
  });

  const succeeded = results.filter((result) => result.status === 'fulfilled').length;
  log.info(`Analysis complete: ${succeeded}/${chunks.length} chunks succeeded`);
  return evaluations;
}
|
|
@@ -1,83 +0,0 @@
|
|
|
1
|
-
import { execa } from 'execa';
|
|
2
|
-
import { config } from '../../config/index.js';
|
|
3
|
-
import { log } from '../../utils/logger.js';
|
|
4
|
-
import type { VideoMetadata } from '../../types/index.js';
|
|
5
|
-
|
|
6
|
-
const OEMBED_URL = 'https://www.youtube.com/oembed';
|
|
7
|
-
|
|
8
|
-
interface YtDlpJson {
|
|
9
|
-
title: string;
|
|
10
|
-
duration: number;
|
|
11
|
-
}
|
|
12
|
-
|
|
13
|
-
interface OEmbedResponse {
|
|
14
|
-
title: string;
|
|
15
|
-
}
|
|
16
|
-
|
|
17
|
-
/**
 * Attempts to extract video metadata via `yt-dlp --dump-json --no-playlist`.
 *
 * Cookie options are prepended to the argument list when configured;
 * `YT_DLP_COOKIES_FROM_BROWSER` takes precedence over `YT_DLP_COOKIES_FILE`.
 *
 * The JSON printed by yt-dlp is external input, so its fields are checked:
 *  - a missing or non-string `title` is treated as a failure (returns null)
 *  - a missing or non-finite `duration` is reported as 0 with a warning
 *
 * @param videoId - YouTube video ID used to build the watch URL.
 * @returns The metadata, or null if yt-dlp is not installed, the call fails,
 *          or the output has no usable title.
 */
async function extractViaYtDlp(videoId: string): Promise<VideoMetadata | null> {
  try {
    const args = ['--dump-json', '--no-playlist', `https://www.youtube.com/watch?v=${videoId}`];

    if (config.YT_DLP_COOKIES_FROM_BROWSER) {
      args.unshift('--cookies-from-browser', config.YT_DLP_COOKIES_FROM_BROWSER);
    } else if (config.YT_DLP_COOKIES_FILE) {
      args.unshift('--cookies', config.YT_DLP_COOKIES_FILE);
    }

    const { stdout } = await execa('yt-dlp', args);

    const parsed: unknown = JSON.parse(stdout);
    const data = (typeof parsed === 'object' && parsed !== null ? parsed : {}) as Partial<YtDlpJson>;
    const { title, duration } = data;

    if (typeof title !== 'string') {
      log.warn(`yt-dlp returned no title for "${videoId}"`);
      return null;
    }

    if (typeof duration !== 'number' || !Number.isFinite(duration)) {
      log.warn(`yt-dlp returned no duration for "${videoId}" — using 0.`);
      return { videoId, title, duration: 0 };
    }

    return { videoId, title, duration };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    log.warn(`yt-dlp failed for "${videoId}": ${message}`);
    return null;
  }
}
|
|
44
|
-
|
|
45
|
-
/**
 * Falls back to YouTube oEmbed for the title (no API key needed).
 * This path never reads a duration from the response, so it always reports 0 and logs a warning.
 *
 * The query string is built with `URLSearchParams` so the nested watch URL is
 * percent-encoded rather than spliced raw into the request URL.
 *
 * @param videoId - YouTube video ID used to build the watch URL.
 * @returns Metadata with the oEmbed title and `duration: 0`.
 * @throws {Error} if the HTTP response is not ok or carries no string `title`.
 */
async function extractViaOEmbed(videoId: string): Promise<VideoMetadata> {
  const endpoint = new URL(OEMBED_URL);
  endpoint.searchParams.set('url', `https://www.youtube.com/watch?v=${videoId}`);
  endpoint.searchParams.set('format', 'json');

  const res = await fetch(endpoint);
  if (!res.ok) {
    throw new Error(`oEmbed request failed for "${videoId}": HTTP ${res.status}`);
  }

  const data = (await res.json()) as Partial<OEmbedResponse> | null;
  const title = data?.title;
  if (typeof title !== 'string') {
    throw new Error(`oEmbed response for "${videoId}" did not include a title`);
  }

  log.warn(
    `yt-dlp unavailable — duration unknown for "${videoId}". Install yt-dlp for full metadata.`,
  );
  return {
    videoId,
    title,
    duration: 0,
  };
}
|
|
65
|
-
|
|
66
|
-
/**
 * Resolves title and duration for a YouTube video ID.
 *
 * Order of attempts:
 *  1. `extractViaYtDlp` — used as-is when it returns a result.
 *  2. `extractViaOEmbed` — used when the first attempt returns null; a warning
 *     is logged before falling back.
 *
 * @param videoId - YouTube video ID.
 * @returns The metadata from whichever strategy succeeded.
 * @throws {Error} if the oEmbed fallback is needed and it fails.
 */
export async function extractMetadata(videoId: string): Promise<VideoMetadata> {
  const primary = await extractViaYtDlp(videoId);

  if (!primary) {
    log.warn(`yt-dlp failed for "${videoId}", falling back to oEmbed...`);
    return extractViaOEmbed(videoId);
  }

  return primary;
}
|
|
@@ -1,88 +0,0 @@
|
|
|
1
|
-
import type { MergedCandidate, RankedSegment } from '../../types/index.js';
|
|
2
|
-
|
|
3
|
-
/**
 * Length of the intersection of the ranges [aStart, aEnd] and [bStart, bEnd].
 *
 * @returns The shared span, or 0 when the ranges do not intersect.
 */
function overlapSeconds(aStart: number, aEnd: number, bStart: number, bEnd: number): number {
  return Math.max(0, Math.min(aEnd, bEnd) - Math.max(aStart, bStart));
}
|
|
8
|
-
|
|
9
|
-
/**
 * Decides whether two segments overlap enough to count as duplicates.
 *
 * - No overlap at all: never significant.
 * - Either segment has source 'audio': significant when the overlap exceeds 8.
 * - Otherwise: significant when the overlap covers more than half of either segment.
 *
 * @param aStart - Start of the first segment.
 * @param aEnd - End of the first segment.
 * @param bStart - Start of the second segment.
 * @param bEnd - End of the second segment.
 * @param aSource - Source tag of the first segment.
 * @param bSource - Source tag of the second segment.
 */
function significantlyOverlaps(
  aStart: number,
  aEnd: number,
  bStart: number,
  bEnd: number,
  aSource: string,
  bSource: string,
): boolean {
  const shared = overlapSeconds(aStart, aEnd, bStart, bEnd);
  if (shared === 0) {
    return false;
  }

  const involvesAudio = aSource === 'audio' || bSource === 'audio';
  if (involvesAudio) {
    return shared > 8;
  }

  const shareOfA = shared / (aEnd - aStart);
  const shareOfB = shared / (bEnd - bStart);
  return shareOfA > 0.5 || shareOfB > 0.5;
}
|
|
31
|
-
|
|
32
|
-
/**
 * Drops every segment that significantly overlaps one that was already kept.
 *
 * Segments are visited in input order, so earlier entries win over later ones.
 *
 * @param segments - Candidate segments, in priority order.
 * @returns The kept segments, in their original relative order.
 */
function deduplicateSegments(
  segments: Array<{
    start: number;
    end: number;
    score: number;
    reason: string;
    source: string;
    audio_event?: string;
  }>,
): Array<{
  start: number;
  end: number;
  score: number;
  reason: string;
  source: string;
  audio_event?: string;
}> {
  return segments.reduce<typeof segments>((survivors, contender) => {
    const clashes = survivors.some((winner) =>
      significantlyOverlaps(
        winner.start,
        winner.end,
        contender.start,
        contender.end,
        winner.source,
        contender.source,
      ),
    );

    if (!clashes) {
      survivors.push(contender);
    }
    return survivors;
  }, []);
}
|
|
69
|
-
|
|
70
|
-
/**
 * Produces the final ranked clip list from merged candidates.
 *
 * Steps: keep candidates scoring at least `threshold`, order them by score
 * (highest first), remove overlapping duplicates, take the first `topN`,
 * and number them starting at rank 1.
 *
 * @param segments - Merged candidates; the input array is not modified.
 * @param threshold - Minimum score (inclusive) a candidate needs to be kept.
 * @param topN - Maximum number of segments to return.
 * @returns Ranked segments, best first.
 */
export function rankSegments(
  segments: MergedCandidate[],
  threshold: number,
  topN: number,
): RankedSegment[] {
  // filter() yields a fresh array, so sorting it leaves the caller's array untouched.
  const qualifying = segments.filter((seg) => seg.score >= threshold);
  qualifying.sort((left, right) => right.score - left.score);

  const winners = deduplicateSegments(qualifying).slice(0, topN);

  return winners.map((seg, position) => ({
    rank: position + 1,
    start: seg.start,
    end: seg.end,
    score: seg.score,
    reason: seg.reason,
    source: seg.source as 'transcript' | 'audio' | 'both',
    audio_event: seg.audio_event,
  }));
}
|
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
import type { MergedCandidate } from '../../types/index.js';
|
|
2
|
-
import type { ChunkEvaluation } from '../../types/index.js';
|
|
3
|
-
import type { AudioEvent } from '../../types/index.js';
|
|
4
|
-
import { config } from '../../config/index.js';
|
|
5
|
-
|
|
6
|
-
/**
 * Combines LLM chunk evaluations and audio events into one candidate list.
 *
 * - Each successful, interesting LLM segment becomes a candidate. If any audio
 *   event lies within the boost window of its `clip_start`, the score is boosted
 *   (capped at 10), the source becomes 'both', and the first such event is recorded.
 * - Each audio event with no successful LLM segment (interesting or not) whose
 *   `clip_start` is within the window becomes an audio-only candidate spanning
 *   `[time - preRoll, time + postRoll]`, with the start clamped at 0.
 *
 * Omitted optional arguments fall back to the matching `config.AUDIO_*` value.
 *
 * @param llmSegments - Chunk evaluations; only those with status 'success' are used.
 * @param audioEvents - Detected audio events.
 * @param boostWindow - Max distance between an event and a clip start to count as nearby.
 * @param scoreBoost - Amount added to an LLM score when a nearby audio event exists.
 * @param preRoll - Amount included before an audio-only event.
 * @param postRoll - Amount included after an audio-only event.
 * @returns LLM-derived candidates first, then audio-only candidates.
 */
export function mergeSignals(
  llmSegments: ChunkEvaluation[],
  audioEvents: AudioEvent[],
  boostWindow?: number,
  scoreBoost?: number,
  preRoll?: number,
  postRoll?: number,
): MergedCandidate[] {
  const windowSec = boostWindow ?? config.AUDIO_LLM_BOOST_WINDOW;
  const boost = scoreBoost ?? config.AUDIO_LLM_SCORE_BOOST;
  const pre = preRoll ?? config.AUDIO_CLIP_PRE_ROLL;
  const post = postRoll ?? config.AUDIO_CLIP_POST_ROLL;

  const isNear = (a: number, b: number): boolean => Math.abs(a - b) < windowSec;
  const succeeded = llmSegments.filter((seg) => seg.status === 'success');

  const fromTranscript: MergedCandidate[] = [];
  for (const seg of succeeded) {
    if (!seg.interesting) {
      continue;
    }

    const firstNearby = audioEvents.find((evt) => isNear(evt.time, seg.clip_start));
    const corroborated = firstNearby !== undefined;

    fromTranscript.push({
      start: seg.clip_start,
      end: seg.clip_end,
      score: Math.min(10, seg.score + (corroborated ? boost : 0)),
      source: corroborated ? 'both' : 'transcript',
      reason: seg.reason,
      audio_event: firstNearby?.event,
    });
  }

  const fromAudio = audioEvents
    .filter((evt) => !succeeded.some((seg) => isNear(seg.clip_start, evt.time)))
    .map(
      (evt): MergedCandidate => ({
        start: Math.max(0, evt.time - pre),
        end: evt.time + post,
        score: Math.round(evt.confidence * 10),
        source: 'audio',
        reason: `Audio event: ${evt.event} (${(evt.confidence * 100).toFixed(0)}% confidence)`,
        audio_event: evt.event,
      }),
    );

  return [...fromTranscript, ...fromAudio];
}
|
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
import type { TranscriptLine } from '../../types/index.js';
|
|
2
|
-
|
|
3
|
-
/**
 * Base contract for transcript backends.
 *
 * A concrete analyzer (YtDlp, Whisper, Gemini) subclasses this, sets `source`,
 * and implements `detect()`.
 *
 * @example
 * const analyzer = new YtDlpTranscriptAnalyzer();
 * const lines = await analyzer.detect(videoId, null);
 */
export abstract class TranscriptAnalyzer {
  /** Tag naming the backend, so downstream logging can tell which one produced the lines. */
  abstract readonly source: string;

  /**
   * Produces the transcript for a video.
   *
   * @param videoId - YouTube video ID (not a URL).
   * @param audioPath - Path of the downloaded WAV file, or null when no audio
   *                    is available yet (the ytdlp backend does not need it).
   * @returns Transcript lines, expected to be sorted by start time.
   */
  abstract detect(videoId: string, audioPath: string | null): Promise<TranscriptLine[]>;
}
|
|
@@ -1,66 +0,0 @@
|
|
|
1
|
-
import { log } from '../../utils/logger.js';
|
|
2
|
-
import { TranscriptAnalyzer } from './base.js';
|
|
3
|
-
import { YtDlpTranscriptAnalyzer } from './ytdlp.js';
|
|
4
|
-
import { WhisperTranscriptAnalyzer } from './whisper.js';
|
|
5
|
-
import { GeminiTranscriptAnalyzer } from './gemini.js';
|
|
6
|
-
import type { TranscriptProviderName } from '../../types/index.js';
|
|
7
|
-
|
|
8
|
-
const KNOWN_PROVIDERS = new Set<TranscriptProviderName>(['ytdlp', 'whisper', 'gemini']);

/**
 * Turns the TRANSCRIPT_PROVIDER setting into an ordered list of provider names.
 *
 * The input is split on commas; each piece is trimmed and lower-cased, and
 * empty pieces are dropped. Order is preserved.
 *
 *   "ytdlp,whisper" → ['ytdlp', 'whisper']
 *   "ytdlp"         → ['ytdlp']
 *
 * @param providerString - Raw comma-separated provider list.
 * @returns The validated provider names in the order given.
 * @throws {Error} if no names remain, or if a name is not in KNOWN_PROVIDERS
 *                 (the first unknown name is reported).
 */
export function parseTranscriptProviderChain(providerString: string): TranscriptProviderName[] {
  const names: string[] = [];
  for (const piece of providerString.split(',')) {
    const normalized = piece.trim().toLowerCase();
    if (normalized) {
      names.push(normalized);
    }
  }

  if (names.length === 0) {
    throw new Error(
      `TRANSCRIPT_PROVIDER is empty. Provide at least one of: ytdlp, whisper, gemini`,
    );
  }

  const unrecognized = names.find(
    (candidate) => !KNOWN_PROVIDERS.has(candidate as TranscriptProviderName),
  );
  if (unrecognized !== undefined) {
    throw new Error(
      `Unknown transcript provider "${unrecognized}". Valid options: ytdlp, whisper, gemini (comma-separated for chain)`,
    );
  }

  return names as TranscriptProviderName[];
}
|
|
38
|
-
|
|
39
|
-
/**
 * Instantiates one TranscriptAnalyzer per provider named in the chain string,
 * preserving the order given.
 *
 * The string is validated by `parseTranscriptProviderChain` first, so an empty
 * or unknown provider throws before any analyzer is created. Selecting
 * 'gemini' logs a warning because that analyzer is not implemented yet.
 * Per the original notes, the TranscriptDetector is expected to walk the
 * returned array in order, falling back to the next analyzer on failure.
 *
 * @example
 * // TRANSCRIPT_PROVIDER=ytdlp,whisper → [YtDlpTranscriptAnalyzer, WhisperTranscriptAnalyzer]
 * const chain = createTranscriptChain(config.TRANSCRIPT_PROVIDER);
 *
 * @param providerString - Raw comma-separated provider list.
 * @returns Analyzer instances in chain order.
 */
export function createTranscriptChain(providerString: string): TranscriptAnalyzer[] {
  const chain: TranscriptAnalyzer[] = [];

  for (const name of parseTranscriptProviderChain(providerString)) {
    if (name === 'ytdlp') {
      chain.push(new YtDlpTranscriptAnalyzer());
    } else if (name === 'whisper') {
      chain.push(new WhisperTranscriptAnalyzer());
    } else if (name === 'gemini') {
      log.warn(
        '[transcript] GeminiTranscriptAnalyzer is not yet implemented — ' +
          'it will throw if reached in the chain.',
      );
      chain.push(new GeminiTranscriptAnalyzer());
    }
  }

  return chain;
}
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
import type { TranscriptLine } from '../../types/index.js';
|
|
2
|
-
import { TranscriptAnalyzer } from './base.js';
|
|
3
|
-
|
|
4
|
-
/**
 * Placeholder for a transcript backend based on Google Gemini's multimodal
 * audio understanding.
 *
 * @stub Not implemented: `detect()` always rejects. The planned implementation will
 *   1. split the audio file into overlapping chunks,
 *   2. send each chunk to the Gemini audio API for transcription,
 *   3. stitch the returned text segments back into TranscriptLine[].
 *
 * Until then, set TRANSCRIPT_PROVIDER=ytdlp or TRANSCRIPT_PROVIDER=whisper.
 */
export class GeminiTranscriptAnalyzer extends TranscriptAnalyzer {
  readonly source = 'gemini' as const;

  /** Always rejects with a "not yet implemented" error; both arguments are ignored. */
  async detect(_videoId: string, _audioPath: string | null): Promise<TranscriptLine[]> {
    const notImplemented = [
      'GeminiTranscriptAnalyzer is not yet implemented.',
      'Use TRANSCRIPT_PROVIDER=ytdlp or TRANSCRIPT_PROVIDER=whisper instead.',
    ].join(' ');
    throw new Error(notImplemented);
  }
}
|
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
export { TranscriptAnalyzer } from './base.js';
|
|
2
|
-
export { YtDlpTranscriptAnalyzer } from './ytdlp.js';
|
|
3
|
-
export { WhisperTranscriptAnalyzer } from './whisper.js';
|
|
4
|
-
export { GeminiTranscriptAnalyzer } from './gemini.js';
|
|
5
|
-
export { createTranscriptChain, parseTranscriptProviderChain } from './factory.js';
|
|
6
|
-
export type { TranscriptProviderName } from '../../types/index.js';
|
|
@@ -1,68 +0,0 @@
|
|
|
1
|
-
import { execa } from 'execa';
|
|
2
|
-
import { z } from 'zod';
|
|
3
|
-
import { config } from '../../config/index.js';
|
|
4
|
-
import type { TranscriptLine } from '../../types/index.js';
|
|
5
|
-
import { TranscriptAnalyzer } from './base.js';
|
|
6
|
-
import { getPythonBin } from '../audioAnalyzers/whisper.js';
|
|
7
|
-
|
|
8
|
-
const WhisperSegmentSchema = z.object({
  text: z.string(),
  start: z.number(),
  duration: z.number(),
});

/** The whole script output: a JSON array of segments. */
const WhisperOutputSchema = z.array(WhisperSegmentSchema);

/**
 * Generates a transcript by running OpenAI Whisper locally on the downloaded
 * audio file, then normalising the output to TranscriptLine[].
 *
 * Requires:
 *   - audioPath must not be null (audio must be downloaded before calling detect)
 *   - pip install openai-whisper
 *
 * The underlying script is `scripts/transcribe_whisper.py`, which writes a JSON
 * array of `{text, start, duration}` objects to stdout.
 */
export class WhisperTranscriptAnalyzer extends TranscriptAnalyzer {
  readonly source = 'whisper' as const;

  /**
   * Runs the Whisper script on `audioPath` and returns the parsed segments.
   *
   * @param videoId - Used only to give error messages context.
   * @param audioPath - Path to the audio file; must not be null or empty.
   * @returns One line per Whisper segment, with `text` trimmed.
   * @throws {Error} if `audioPath` is missing, the script fails (with an install
   *                 hint when the whisper module is absent), or the script's
   *                 output is not a valid JSON array of segments.
   */
  async detect(videoId: string, audioPath: string | null): Promise<TranscriptLine[]> {
    if (!audioPath) {
      throw new Error(
        'WhisperTranscriptAnalyzer requires an audio file. ' +
          'Ensure downloadAudio() runs before processTranscript() in the pipeline.',
      );
    }

    const python = await getPythonBin();

    let stdout: string;
    try {
      // NOTE(review): the script path is relative, so it presumably resolves against
      // the process working directory — confirm this holds for installed usage.
      const result = await execa(python, [
        'scripts/transcribe_whisper.py',
        audioPath,
        config.AUDIO_WHISPER_MODEL,
      ]);
      stdout = result.stdout;
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);

      if (message.includes('ModuleNotFoundError') || message.includes('No module named')) {
        throw new Error(
          'openai-whisper not installed. Run: pip install openai-whisper\n' +
            'Or set TRANSCRIPT_PROVIDER=ytdlp in .env to use yt-dlp subtitles instead.',
        );
      }

      throw new Error(`Whisper transcription failed for "${videoId}": ${message}`);
    }

    // Validate the whole payload in one step so invalid JSON, a non-array result,
    // and a malformed segment all surface as the same contextual error.
    let segments: z.infer<typeof WhisperOutputSchema>;
    try {
      segments = WhisperOutputSchema.parse(JSON.parse(stdout));
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      throw new Error(`Whisper returned malformed output for "${videoId}": ${message}`);
    }

    return segments.map((seg) => ({
      text: seg.text.trim(),
      start: seg.start,
      duration: seg.duration,
    }));
  }
}
|
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
import { fetchTranscript } from '../transcriptFetcher/index.js';
|
|
2
|
-
import type { TranscriptLine } from '../../types/index.js';
|
|
3
|
-
import { TranscriptAnalyzer } from './base.js';
|
|
4
|
-
|
|
5
|
-
/**
 * Default transcript provider: returns the YouTube transcript obtained through
 * `fetchTranscript` (yt-dlp auto-generated subtitles, VTT).
 *
 * The audio file is not used — `audioPath` is accepted to satisfy the base
 * contract and then ignored.
 *
 * Requires: yt-dlp installed and available on PATH.
 */
export class YtDlpTranscriptAnalyzer extends TranscriptAnalyzer {
  readonly source = 'ytdlp' as const;

  /**
   * @param videoId - YouTube video ID handed to `fetchTranscript`.
   * @param _audioPath - Unused.
   * @returns Whatever `fetchTranscript` resolves to.
   */
  async detect(videoId: string, _audioPath: string | null): Promise<TranscriptLine[]> {
    const lines = await fetchTranscript(videoId);
    return lines;
  }
}
|