@thunderkiller/video-clipper 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140) hide show
  1. package/.env.example +130 -0
  2. package/.github/workflows/ci.yml +42 -0
  3. package/.github/workflows/release.yml +72 -0
  4. package/.husky/pre-commit +3 -0
  5. package/.prettierignore +6 -0
  6. package/.prettierrc +7 -0
  7. package/.releaserc.json +21 -0
  8. package/AGENTS.md +122 -0
  9. package/CHANGELOG.md +45 -0
  10. package/README.md +410 -0
  11. package/dist/cli.js +187 -0
  12. package/dist/config/env.js +14 -0
  13. package/dist/config/index.js +1 -0
  14. package/dist/index.js +35 -0
  15. package/dist/pipeline/runner.js +132 -0
  16. package/dist/pipeline/stages/audioProcessor.js +75 -0
  17. package/dist/pipeline/stages/clipExporter.js +44 -0
  18. package/dist/pipeline/stages/segmentAnalyzer.js +46 -0
  19. package/dist/pipeline/stages/segmentSelector.js +23 -0
  20. package/dist/pipeline/stages/videoResolver.js +34 -0
  21. package/dist/services/audioAnalyzers/base.js +13 -0
  22. package/dist/services/audioAnalyzers/factory.js +56 -0
  23. package/dist/services/audioAnalyzers/gemini.js +109 -0
  24. package/dist/services/audioAnalyzers/index.js +5 -0
  25. package/dist/services/audioAnalyzers/whisper.js +62 -0
  26. package/dist/services/audioAnalyzers/yamnet.js +40 -0
  27. package/dist/services/audioDownloader/index.js +81 -0
  28. package/dist/services/chunkBuilder/index.js +71 -0
  29. package/dist/services/clipGenerator/index.js +156 -0
  30. package/dist/services/clipRefiner/index.js +103 -0
  31. package/dist/services/eventDetector/index.js +54 -0
  32. package/dist/services/llmAnalyzer/LLMAnalyzer.js +63 -0
  33. package/dist/services/llmAnalyzer/index.js +173 -0
  34. package/dist/services/metadataExtractor/index.js +66 -0
  35. package/dist/services/segmentRanker/index.js +40 -0
  36. package/dist/services/signalMerger/index.js +36 -0
  37. package/dist/services/transcriptAnalyzers/base.js +13 -0
  38. package/dist/services/transcriptAnalyzers/factory.js +51 -0
  39. package/dist/services/transcriptAnalyzers/gemini.js +19 -0
  40. package/dist/services/transcriptAnalyzers/index.js +5 -0
  41. package/dist/services/transcriptAnalyzers/whisper.js +55 -0
  42. package/dist/services/transcriptAnalyzers/ytdlp.js +16 -0
  43. package/dist/services/transcriptDetector/index.js +102 -0
  44. package/dist/services/transcriptFetcher/index.js +124 -0
  45. package/dist/services/urlParser/index.js +46 -0
  46. package/dist/services/videoDownloader/index.js +212 -0
  47. package/dist/types/audio.js +15 -0
  48. package/dist/types/cli.js +1 -0
  49. package/dist/types/config.js +150 -0
  50. package/dist/types/index.js +5 -0
  51. package/dist/types/pipeline.js +9 -0
  52. package/dist/types/segment.js +36 -0
  53. package/dist/types/transcript.js +16 -0
  54. package/dist/types/video.js +14 -0
  55. package/dist/utils/cache.js +143 -0
  56. package/dist/utils/chunker.js +51 -0
  57. package/dist/utils/dumper.js +36 -0
  58. package/dist/utils/format.js +10 -0
  59. package/dist/utils/logger.js +16 -0
  60. package/dist/utils/modelFactory.js +60 -0
  61. package/dist/utils/redactConfig.js +20 -0
  62. package/dist/utils/sliceAudio.js +26 -0
  63. package/docs/free-models.md +78 -0
  64. package/docs/plan.md +442 -0
  65. package/docs/refactorPhases.md +105 -0
  66. package/docs/yt-downloader.md +440 -0
  67. package/package.json +65 -0
  68. package/requirements.txt +5 -0
  69. package/scripts/detect_events.py +81 -0
  70. package/scripts/detect_events_whisper.py +101 -0
  71. package/scripts/transcribe_whisper.py +70 -0
  72. package/src/cli.ts +186 -0
  73. package/src/config/env.ts +18 -0
  74. package/src/config/index.ts +2 -0
  75. package/src/index.ts +46 -0
  76. package/src/pipeline/runner.ts +155 -0
  77. package/src/pipeline/stages/audioProcessor.ts +129 -0
  78. package/src/pipeline/stages/clipExporter.ts +80 -0
  79. package/src/pipeline/stages/segmentAnalyzer.ts +72 -0
  80. package/src/pipeline/stages/segmentSelector.ts +39 -0
  81. package/src/pipeline/stages/videoResolver.ts +47 -0
  82. package/src/services/audioAnalyzers/base.ts +32 -0
  83. package/src/services/audioAnalyzers/factory.ts +71 -0
  84. package/src/services/audioAnalyzers/gemini.ts +137 -0
  85. package/src/services/audioAnalyzers/index.ts +6 -0
  86. package/src/services/audioAnalyzers/whisper.ts +80 -0
  87. package/src/services/audioAnalyzers/yamnet.ts +54 -0
  88. package/src/services/audioDownloader/index.ts +102 -0
  89. package/src/services/chunkBuilder/index.ts +86 -0
  90. package/src/services/clipGenerator/index.ts +210 -0
  91. package/src/services/clipRefiner/index.ts +141 -0
  92. package/src/services/eventDetector/index.ts +68 -0
  93. package/src/services/llmAnalyzer/LLMAnalyzer.ts +114 -0
  94. package/src/services/llmAnalyzer/index.ts +231 -0
  95. package/src/services/metadataExtractor/index.ts +83 -0
  96. package/src/services/segmentRanker/index.ts +88 -0
  97. package/src/services/signalMerger/index.ts +53 -0
  98. package/src/services/transcriptAnalyzers/base.ts +26 -0
  99. package/src/services/transcriptAnalyzers/factory.ts +67 -0
  100. package/src/services/transcriptAnalyzers/gemini.ts +24 -0
  101. package/src/services/transcriptAnalyzers/index.ts +6 -0
  102. package/src/services/transcriptAnalyzers/whisper.ts +68 -0
  103. package/src/services/transcriptAnalyzers/ytdlp.ts +19 -0
  104. package/src/services/transcriptDetector/index.ts +128 -0
  105. package/src/services/transcriptFetcher/index.ts +151 -0
  106. package/src/services/urlParser/index.ts +53 -0
  107. package/src/services/videoDownloader/index.ts +282 -0
  108. package/src/types/audio.ts +19 -0
  109. package/src/types/cli.ts +22 -0
  110. package/src/types/config.ts +174 -0
  111. package/src/types/index.ts +26 -0
  112. package/src/types/pipeline.ts +93 -0
  113. package/src/types/segment.ts +43 -0
  114. package/src/types/transcript.ts +22 -0
  115. package/src/types/video.ts +18 -0
  116. package/src/utils/cache.ts +223 -0
  117. package/src/utils/chunker.ts +60 -0
  118. package/src/utils/dumper.ts +41 -0
  119. package/src/utils/format.ts +10 -0
  120. package/src/utils/logger.ts +17 -0
  121. package/src/utils/modelFactory.ts +71 -0
  122. package/src/utils/redactConfig.ts +23 -0
  123. package/src/utils/sliceAudio.ts +35 -0
  124. package/test-trigger.txt +1 -0
  125. package/tests/analyzerFactory.test.ts +146 -0
  126. package/tests/audioEventDetector.test.ts +69 -0
  127. package/tests/cache.test.ts +203 -0
  128. package/tests/chunkBuilder.test.ts +146 -0
  129. package/tests/chunker.test.ts +95 -0
  130. package/tests/eventDetector.test.ts +103 -0
  131. package/tests/llmAnalyzer.test.ts +283 -0
  132. package/tests/segmentRanker.test.ts +133 -0
  133. package/tests/setup.ts +48 -0
  134. package/tests/signalMerger.test.ts +197 -0
  135. package/tests/transcriptDetector.test.ts +150 -0
  136. package/tests/transcriptFetcher.test.ts +179 -0
  137. package/tests/urlParser.test.ts +70 -0
  138. package/tsconfig.json +16 -0
  139. package/tsconfig.test.json +8 -0
  140. package/vitest.config.ts +8 -0
@@ -0,0 +1,132 @@
1
+ import { promises as fs } from 'fs';
2
+ import { config } from '../config/index.js';
3
+ import { Cache } from '../utils/cache.js';
4
+ import { log } from '../utils/logger.js';
5
+ import { dumpAnalysis, dumpTranscript } from '../utils/dumper.js';
6
+ import { resolveVideo } from './stages/videoResolver.js';
7
+ import { processAudio } from './stages/audioProcessor.js';
8
+ import { analyzeSegments, refineRankedSegments } from './stages/segmentAnalyzer.js';
9
+ import { selectSegments } from './stages/segmentSelector.js';
10
+ import { exportClips } from './stages/clipExporter.js';
11
+ import { downloadAudio } from '../services/audioDownloader/index.js';
12
/**
 * Serialises the pipeline result as pretty-printed JSON and emits it.
 *
 * @param {object} result - Any JSON-serialisable result object.
 * @param {string} [outputJsonPath] - When truthy, the JSON is written to this
 *   file (UTF-8); otherwise it is printed to stdout preceded by a newline.
 * @returns {Promise<void>}
 */
async function outputResult(result, outputJsonPath) {
    const serialized = JSON.stringify(result, null, 2);
    // No destination file requested: print to stdout and stop.
    if (!outputJsonPath) {
        console.log('\n' + serialized);
        return;
    }
    await fs.writeFile(outputJsonPath, serialized, 'utf-8');
    log.info(`Output written to ${outputJsonPath}`);
}
22
/**
 * Orchestrates one end-to-end video-clipper run for the parsed CLI arguments.
 *
 * Execution order:
 *   1.  resolveVideo          — URL → video ID + metadata
 *   2.  downloadAudio         — fetch the WAV up front (skipped when audio is disabled)
 *   3.  processAudio          — per-window audio event detection (reuses the WAV)
 *   4a. analyzeSegments       — transcript + LLM pass 1, informed by audio events
 *   5.  selectSegments        — merge signals, rank, apply threshold / top-N
 *   4b. refineRankedSegments  — LLM pass 2 to tighten clip boundaries
 *   6.  exportClips           — only when `args.clip` is set
 *
 * The audio download happens before stage 4a so that `audioPath` can be handed
 * to the transcript stage; stage 3 receives the same path.
 *
 * Error behaviour visible in this function: a failed audio download is logged
 * as a warning and the run continues with `audioPath === null`. Errors thrown
 * by any other stage are not caught here and propagate to the caller.
 *
 * @param {object} args - Parsed CLI arguments (url, threshold, topN, gameProfile,
 *   maxParallel, noCache, noAudio, maxDuration, maxChunks, outputJson, clip,
 *   localVideo, downloadSections, videoPath).
 * @returns {Promise<void>}
 */
export async function runPipeline(args) {
    // CLI flags win over configured defaults.
    const scoreThreshold = args.threshold ?? config.SCORE_THRESHOLD;
    const segmentLimit = args.topN ?? config.TOP_N_SEGMENTS;
    const profile = args.gameProfile ?? config.GAME_PROFILE;
    const parallelism = args.maxParallel ?? config.LLM_CONCURRENCY;
    const cache = new Cache(config.CACHE_DIR, args.noCache);

    // ── Stage 1: video ID + metadata ─────────────────────────────────────────
    const { videoId, metadata } = await resolveVideo(args.url, args.maxDuration);

    // ── Stage 2: audio download ──────────────────────────────────────────────
    // Stays null when audio detection is disabled or the download fails.
    let audioPath = null;
    if (config.AUDIO_DETECTION_ENABLED && !args.noAudio) {
        try {
            audioPath = await downloadAudio(videoId, `${config.OUTPUT_DIR}/audio`);
        }
        catch (err) {
            const reason = err instanceof Error ? err.message : String(err);
            log.warn(`Audio download failed — continuing without audio: ${reason}`);
        }
    }

    // ── Stage 3: audio event detection ───────────────────────────────────────
    const audioEvents = await processAudio(videoId, metadata.duration, cache, {
        noAudio: args.noAudio,
        gameProfile: profile,
        maxParallel: parallelism,
        audioPath,
    });

    // ── Stage 4a: transcript + LLM pass 1 ────────────────────────────────────
    const analysis = await analyzeSegments(videoId, audioPath, audioEvents, cache, {
        maxChunks: args.maxChunks,
        maxParallel: parallelism,
        noCache: args.noCache,
    });
    const { lines, microBlocks, chunkEvals } = analysis;
    if (config.DUMP_OUTPUTS) {
        await dumpTranscript(videoId, lines);
    }

    // ── Stage 5: merge + rank ────────────────────────────────────────────────
    const rankedSegments = selectSegments(chunkEvals, audioEvents, {
        threshold: scoreThreshold,
        topN: segmentLimit,
    });

    // Builds the result payload for a given segment list, prints/writes it and
    // optionally dumps it. Used by both the early-exit and the normal path.
    const emitResult = async (segments) => {
        const payload = {
            video_id: videoId,
            title: metadata.title,
            duration: metadata.duration,
            chunk_evaluations: chunkEvals,
            segments,
        };
        await outputResult(payload, args.outputJson);
        if (config.DUMP_OUTPUTS) {
            await dumpAnalysis(videoId, payload);
        }
    };

    // Early exit: nothing scored above the threshold, so there is nothing to refine.
    if (rankedSegments.length === 0) {
        await emitResult(rankedSegments);
        return;
    }

    // ── Stage 4b: LLM pass 2 (boundary refinement) ───────────────────────────
    const refinedSegments = await refineRankedSegments(rankedSegments, microBlocks, cache, {
        maxParallel: parallelism,
        noCache: args.noCache,
    });

    // ── Output ───────────────────────────────────────────────────────────────
    await emitResult(refinedSegments);
    log.info('Done.');

    // ── Stage 6: clip generation (opt-in via --clip) ─────────────────────────
    if (!args.clip) {
        log.info('Tip: run with --clip to download the video and generate mp4 clips.');
        return;
    }
    const clipPaths = await exportClips(videoId, refinedSegments, {
        localVideo: args.localVideo,
        downloadSections: args.downloadSections,
        videoPath: args.videoPath,
    });
    if (clipPaths.length === 0) {
        log.warn('No clips were generated successfully.');
        return;
    }
    log.info(`Done — ${clipPaths.length} clip${clipPaths.length !== 1 ? 's' : ''} saved:`);
    for (const savedPath of clipPaths) {
        log.info(` ${savedPath}`);
    }
}
@@ -0,0 +1,75 @@
1
+ import { promises as fs } from 'fs';
2
+ import pLimit from 'p-limit';
3
+ import { downloadAudio } from '../../services/audioDownloader/index.js';
4
+ import { createAnalyzerChain } from '../../services/audioAnalyzers/index.js';
5
+ import { EventDetector } from '../../services/eventDetector/index.js';
6
+ import { sliceAudio } from '../../utils/sliceAudio.js';
7
+ import { buildWindows } from '../../utils/chunker.js';
8
+ import { log } from '../../utils/logger.js';
9
+ import { config } from '../../config/index.js';
10
/**
 * Stage 3 — Audio Processor
 *
 * Detects audio events for a video by slicing its WAV into windows (via
 * `buildWindows`) and running an `EventDetector` over each slice. The detector
 * is built once per call from the provider chain in `config.AUDIO_PROVIDER`
 * (via `createAnalyzerChain`).
 *
 * Caching: the aggregated event list is looked up first; on a miss, each
 * window is looked up individually before any slicing/detection happens, and
 * both per-window and aggregated results are written back.
 *
 * Failure handling:
 *   - A window whose processing rejects is logged as a warning and contributes
 *     no events; the remaining windows are still used.
 *   - Any other error inside the detection phase (e.g. download failure) is
 *     logged as a warning and an empty array is returned.
 *   - The temporary slice file is always removed, even when detection throws.
 *
 * @param {string} videoId - Video identifier used for cache keys and download.
 * @param {number} duration - Video duration passed to `buildWindows`.
 * @param {object} cache - Cache exposing readAudioEvents / readAudioChunk /
 *   writeAudioChunk / writeAudioEvents.
 * @param {object} opts - `{ noAudio, gameProfile, maxParallel, audioPath }`.
 *   When `audioPath` is null/undefined the audio is downloaded here.
 * @returns {Promise<Array>} Events sorted by ascending `time`; empty when audio
 *   detection is disabled (`--no-audio` or `AUDIO_DETECTION_ENABLED` off).
 */
export async function processAudio(videoId, duration, cache, opts) {
    const audioEnabled = config.AUDIO_DETECTION_ENABLED && !opts.noAudio;
    if (!audioEnabled)
        return [];
    // Cache-first: a full aggregated result short-circuits everything else.
    const cached = await cache.readAudioEvents(videoId, opts.gameProfile, config.AUDIO_PROVIDER);
    if (cached) {
        log.info(`[cache hit] Audio events loaded from cache (${cached.length} events)`);
        return cached;
    }
    try {
        // Reuse the WAV the runner already downloaded when available.
        const audioPath = opts.audioPath ?? (await downloadAudio(videoId, `${config.OUTPUT_DIR}/audio`));
        // Build the analyzer chain once per run from config
        const chain = createAnalyzerChain(config.AUDIO_PROVIDER);
        const detector = new EventDetector(chain);
        const providerNames = chain.map((a) => a.source).join(' → ');
        log.info(`Detecting audio events (chain: ${providerNames}, profile: ${opts.gameProfile}, max ${opts.maxParallel} parallel)...`);
        const windows = buildWindows(duration, config.CHUNK_LENGTH_SEC, config.CHUNK_OVERLAP_SEC);
        const limit = pLimit(opts.maxParallel);
        // allSettled so one failing window does not discard the others.
        const results = await Promise.allSettled(windows.map((window) => limit(async () => {
            log.info(` Processing audio chunk ${window.start}s - ${window.end}s...`);
            const cachedChunk = await cache.readAudioChunk(videoId, opts.gameProfile, config.AUDIO_PROVIDER, window.start, window.end);
            if (cachedChunk) {
                log.info(` [cache hit] Audio chunk ${window.start}s - ${window.end}s (${cachedChunk.length} events)`);
                return cachedChunk;
            }
            const slicePath = await sliceAudio(audioPath, window.start, window.end - window.start, config.OUTPUT_DIR);
            let events;
            try {
                events = await detector.detect(slicePath, opts.gameProfile, window.start, window.end - window.start);
            }
            finally {
                // Remove the temporary slice whether or not detection succeeded,
                // so failed windows do not leave WAV fragments behind.
                await fs.unlink(slicePath);
            }
            await cache.writeAudioChunk(videoId, opts.gameProfile, config.AUDIO_PROVIDER, window.start, window.end, events);
            return events;
        })));
        const audioEvents = results
            .flatMap((r, i) => {
            if (r.status === 'fulfilled')
                return r.value;
            const w = windows[i];
            log.warn(` Audio event detection failed for chunk ${w.start}s - ${w.end}s: ${String(r.reason)}`);
            return [];
        })
            .sort((a, b) => a.time - b.time);
        log.info(`Audio event detection complete: ${audioEvents.length} events found`);
        await cache.writeAudioEvents(videoId, opts.gameProfile, config.AUDIO_PROVIDER, audioEvents);
        return audioEvents;
    }
    catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        log.warn(`Audio event detection disabled due to error: ${message}`);
        return [];
    }
}
@@ -0,0 +1,44 @@
1
+ import { downloadVideo } from '../../services/videoDownloader/index.js';
2
+ import { generateClips, organizeClips } from '../../services/clipGenerator/index.js';
3
+ import { log } from '../../utils/logger.js';
4
+ import { config } from '../../config/index.js';
5
/**
 * Stage 6 — Clip Exporter
 *
 * Produces clip files using one of three strategies, checked in this order:
 *   1. Local video   — `opts.localVideo` is set; clips are cut from that file.
 *   2. Sections      — the effective download-sections setting is a number N;
 *                      only the first N segments are downloaded, then organised.
 *   3. Full video    — the whole video is downloaded, then clips are cut from it.
 *
 * @param {string} videoId - Video identifier.
 * @param {Array} segments - Ranked/refined segments, best first.
 * @param {object} opts - `{ localVideo, downloadSections, videoPath }`.
 * @returns {Promise<string[]>} Paths returned by the clip generator/organiser.
 * @throws {Error} When the downloader reports a mode other than the one requested.
 */
export async function exportClips(videoId, segments, opts) {
    const { localVideo, videoPath } = opts;

    // Strategy 1: a video is already on disk.
    if (localVideo) {
        log.info(`Using local video: ${localVideo}`);
        return generateClips(localVideo, segments, videoId, videoPath, config.CLIP_CONCURRENCY);
    }

    // CLI value wins; otherwise fall back to the configured mode.
    const sectionCount = opts.downloadSections ?? config.DOWNLOAD_SECTIONS_MODE;

    // Strategy 3: anything that is not a number means "download everything".
    if (typeof sectionCount !== 'number') {
        log.info('Downloading full video via yt-dlp...');
        const fullDownload = await downloadVideo(videoId, 'all', [], videoPath);
        if (fullDownload.mode !== 'all') {
            throw new Error('Expected full-video download result but got segments result.');
        }
        return generateClips(fullDownload.path, segments, videoId, videoPath, config.CLIP_CONCURRENCY);
    }

    // Strategy 2: download only the leading N segments.
    const wanted = segments.slice(0, sectionCount);
    if (wanted.length < sectionCount) {
        log.warn(`Requested ${sectionCount} segments, but only ${wanted.length} are available above threshold.`);
    }
    log.info(`Downloading ${wanted.length} segments via yt-dlp --download-sections...`);
    const sectionDownload = await downloadVideo(videoId, 'segments', wanted, videoPath);
    if (sectionDownload.mode !== 'segments') {
        throw new Error('Expected segments download result but got full-video result.');
    }
    return organizeClips(sectionDownload.paths, videoId, videoPath, config.CLIP_CONCURRENCY);
}
@@ -0,0 +1,46 @@
1
+ import { LLMAnalyzer } from '../../services/llmAnalyzer/LLMAnalyzer.js';
2
+ import { TranscriptDetector } from '../../services/transcriptDetector/index.js';
3
+ import { createTranscriptChain } from '../../services/transcriptAnalyzers/index.js';
4
+ import { refineSegments } from '../../services/clipRefiner/index.js';
5
+ import { log } from '../../utils/logger.js';
6
+ import { config } from '../../config/index.js';
7
/**
 * Stage 4a — Segment Analyzer (LLM pass 1)
 *
 * Wires a `TranscriptDetector` (built from the chain named by
 * `config.TRANSCRIPT_PROVIDER`) into an `LLMAnalyzer` together with the cache,
 * then delegates the whole transcript + chunk analysis to
 * `LLMAnalyzer.analyze()`.
 *
 * @param {string} videoId - Video identifier.
 * @param {string|null} audioPath - Path to the downloaded audio, if any.
 * @param {Array} audioEvents - Pre-computed audio events from stage 3.
 * @param {object} cache - Cache instance handed to the analyzer.
 * @param {object} opts - `{ maxChunks, maxParallel, noCache }`.
 * @returns {Promise<{lines, microBlocks, chunks, chunkEvals}>} The four fields
 *   picked from the analyzer's result.
 */
export async function analyzeSegments(videoId, audioPath, audioEvents, cache, opts) {
    log.info('Fetching transcript and analyzing segments...');
    const detector = new TranscriptDetector(createTranscriptChain(config.TRANSCRIPT_PROVIDER));
    const llm = new LLMAnalyzer(detector, cache);
    const request = {
        videoId,
        audioPath,
        audioEvents,
        maxChunks: opts.maxChunks,
        maxParallel: opts.maxParallel,
        noCache: opts.noCache,
    };
    const outcome = await llm.analyze(request);
    // Expose only the documented fields, not whatever else the analyzer returns.
    return {
        lines: outcome.lines,
        microBlocks: outcome.microBlocks,
        chunks: outcome.chunks,
        chunkEvals: outcome.chunkEvals,
    };
}
35
/**
 * Stage 4b — Segment Refiner (LLM pass 2)
 *
 * Thin wrapper around `refineSegments()`. It exists as its own stage because
 * ranking (stage 5) has to run between the two LLM passes.
 *
 * @param {Array} rankedSegments - Output of the ranking stage.
 * @param {Array} microBlocks - Transcript micro-blocks from pass 1.
 * @param {object} _cache - Accepted for signature compatibility; not used here.
 * @param {object} opts - `{ maxParallel, noCache }`.
 * @returns {Promise<Array>} Whatever `refineSegments` resolves to.
 */
export async function refineRankedSegments(rankedSegments, microBlocks, _cache, opts) {
    log.info('Refining clip boundaries...');
    const { maxParallel, noCache } = opts;
    const refined = await refineSegments(rankedSegments, microBlocks, maxParallel, noCache);
    return refined;
}
@@ -0,0 +1,23 @@
1
+ import { mergeSignals } from '../../services/signalMerger/index.js';
2
+ import { rankSegments } from '../../services/segmentRanker/index.js';
3
+ import { log } from '../../utils/logger.js';
4
/**
 * Stage 5 — Segment Selector
 *
 * Combines the LLM chunk evaluations with the audio events via `mergeSignals`,
 * then passes the merged candidates to `rankSegments` together with the score
 * threshold and top-N limit. Logs a warning when nothing survives, otherwise
 * an info line with the number of surviving segments.
 *
 * Runs between the two LLM passes: after `analyzeSegments`, before
 * `refineRankedSegments`.
 *
 * @param {Array} chunkEvals - Chunk evaluations from LLM pass 1.
 * @param {Array} audioEvents - Audio events (may be empty).
 * @param {object} opts - `{ threshold, topN }`.
 * @returns {Array} The ranked segments returned by `rankSegments`.
 */
export function selectSegments(chunkEvals, audioEvents, opts) {
    const candidates = mergeSignals(chunkEvals, audioEvents);
    const winners = rankSegments(candidates, opts.threshold, opts.topN);
    const total = winners.length;
    if (total !== 0) {
        const plural = total !== 1 ? 's' : '';
        log.info(`Analysis complete: ${total} segment${plural} above threshold ${opts.threshold}`);
    }
    else {
        log.warn(`No segments scored above threshold ${opts.threshold}. Try lowering --threshold.`);
    }
    return winners;
}
@@ -0,0 +1,34 @@
1
+ import { parseUrl } from '../../services/urlParser/index.js';
2
+ import { extractMetadata } from '../../services/metadataExtractor/index.js';
3
+ import { log } from '../../utils/logger.js';
4
+ import { formatSeconds } from '../../utils/format.js';
5
/**
 * Stage 1 — Video Resolver
 *
 * Turns a raw URL into a video ID via `parseUrl`, fetches its metadata via
 * `extractMetadata`, logs title and duration, and enforces the optional
 * maximum-duration guard.
 *
 * The guard is skipped when no limit is given (`undefined` or `null`) or when
 * the duration is unknown (`metadata.duration` is not greater than 0).
 *
 * @param {string} rawUrl - URL as supplied by the user.
 * @param {number} [maxDurationSec] - Upper bound for the video duration, in
 *   the same unit as `metadata.duration`.
 * @returns {Promise<{videoId: string, metadata: object}>}
 * @throws {Error} "Invalid YouTube URL: …" when `parseUrl` throws; the original
 *   error is attached as `cause`.
 * @throws {Error} When the known duration exceeds `maxDurationSec`.
 * @throws Whatever `extractMetadata` rejects with (not caught here).
 */
export async function resolveVideo(rawUrl, maxDurationSec) {
    // Parse URL → video ID
    let videoId;
    try {
        videoId = parseUrl(rawUrl);
    }
    catch (err) {
        // Keep the parser's own error reachable for debugging instead of dropping it.
        throw new Error(`Invalid YouTube URL: ${rawUrl}`, { cause: err });
    }
    // Fetch metadata
    log.info(`Fetching metadata for ${videoId}...`);
    const metadata = await extractMetadata(videoId);
    log.info(`Video: "${metadata.title}" (${metadata.duration > 0 ? formatSeconds(metadata.duration) : 'duration unknown'})`);
    // --max-duration guard. `!= null` treats both undefined and null as "no
    // limit"; a null limit would otherwise compare as 0 and reject every video.
    const hasLimit = maxDurationSec != null;
    if (hasLimit && metadata.duration > 0 && metadata.duration > maxDurationSec) {
        throw new Error(`Video duration exceeds --max-duration limit. ` +
            `(${formatSeconds(metadata.duration)} > ${formatSeconds(maxDurationSec)})`);
    }
    return { videoId, metadata };
}
@@ -0,0 +1,13 @@
1
/**
 * Common base class for audio analyzers.
 *
 * The class body is intentionally empty in this compiled output. Concrete
 * analyzers (Gemini, Whisper, YAMNet) extend it, provide a `source` tag and
 * implement `detect()`; the tag is attached to the events they return so
 * downstream code can tell which backend produced them.
 *
 * Example:
 *   const analyzer = new GeminiAudioAnalyzer();
 *   const events = await analyzer.detect(audioPath, gameProfile, offsetSec, durationSec);
 */
export class AudioAnalyzer {}
@@ -0,0 +1,56 @@
1
+ import { log } from '../../utils/logger.js';
2
+ import { GeminiAudioAnalyzer } from './gemini.js';
3
+ import { WhisperAudioAnalyzer } from './whisper.js';
4
+ import { YAMNetAudioAnalyzer } from './yamnet.js';
5
// Provider names accepted in AUDIO_PROVIDER (compared after lower-casing).
const KNOWN_PROVIDERS = new Set(['gemini', 'whisper', 'yamnet']);
/**
 * Parses the AUDIO_PROVIDER config string into an ordered list of provider names.
 *
 * Accepts a comma-separated list: "gemini,whisper" → ['gemini', 'whisper'].
 * Single values work too: "yamnet" → ['yamnet']. Entries are trimmed and
 * lower-cased; empty entries are dropped.
 *
 * Backward-compat: the legacy value "both" (in any letter case, with optional
 * surrounding whitespace) maps to ['gemini', 'whisper'] and logs a deprecation
 * warning.
 *
 * @param {string} providerString - Raw AUDIO_PROVIDER value.
 * @returns {string[]} Provider names in the order given.
 * @throws {Error} When no provider remains after parsing, or a name is unknown.
 */
export function parseProviderChain(providerString) {
    // Normalise the same way as individual names below, so "Both" / "BOTH" are
    // recognised as the legacy alias instead of failing as an unknown provider.
    if (providerString.trim().toLowerCase() === 'both') {
        log.warn('[audio] AUDIO_PROVIDER=both is deprecated. Use AUDIO_PROVIDER=gemini,whisper instead.');
        return ['gemini', 'whisper'];
    }
    const names = providerString
        .split(',')
        .map((s) => s.trim().toLowerCase())
        .filter(Boolean);
    if (names.length === 0) {
        throw new Error(`AUDIO_PROVIDER is empty. Provide at least one of: gemini, whisper, yamnet`);
    }
    for (const name of names) {
        if (!KNOWN_PROVIDERS.has(name)) {
            throw new Error(`Unknown audio provider "${name}". Valid options: gemini, whisper, yamnet (comma-separated for chain)`);
        }
    }
    return names;
}
34
/**
 * Turns a provider chain string into analyzer instances, preserving order.
 *
 * The string is validated and split by `parseProviderChain`; each resulting
 * name is then mapped to a fresh instance of the matching analyzer class.
 *
 * @example
 * // AUDIO_PROVIDER=gemini,whisper → [GeminiAudioAnalyzer, WhisperAudioAnalyzer]
 * const chain = createAnalyzerChain(config.AUDIO_PROVIDER);
 *
 * @param {string} providerString - Raw AUDIO_PROVIDER value.
 * @returns {Array} One analyzer instance per provider name.
 */
export function createAnalyzerChain(providerString) {
    // Factories are looked up lazily so only requested analyzers are constructed.
    const builders = new Map([
        ['gemini', () => new GeminiAudioAnalyzer()],
        ['whisper', () => new WhisperAudioAnalyzer()],
        ['yamnet', () => new YAMNetAudioAnalyzer()],
    ]);
    const chain = [];
    for (const providerName of parseProviderChain(providerString)) {
        const build = builders.get(providerName);
        chain.push(build ? build() : undefined);
    }
    return chain;
}
@@ -0,0 +1,109 @@
1
+ import { GoogleGenerativeAI } from '@google/generative-ai';
2
+ import * as fs from 'fs';
3
+ import { z } from 'zod';
4
+ import { config } from '../../config/index.js';
5
+ import { log } from '../../utils/logger.js';
6
+ import { AudioAnalyzer } from './base.js';
7
// Shape of a single event object in Gemini's JSON reply.
const GeminiEventItemSchema = z.object({
    // Gemini is inconsistent about this field. It may be either
    //   - MM.SS notation:        1.03   → 1 min 3 s → 63 s
    //   - true decimal seconds:  53.403 → 53.403 s
    // normalizeGeminiTime() decides which interpretation applies.
    time_sec: z.number(),
    event: z.string(),
    confidence: z.number().min(0).max(1),
});
// The reply as a whole is an array of such events.
const GeminiEventSchema = z.array(GeminiEventItemSchema);
16
// Opening instruction of the Gemini prompt, keyed by game profile.
// `general` doubles as the fallback for profiles not listed here.
const GAME_PROFILE_PROMPTS = {
    valorant:
        'You are analyzing audio from a Valorant gaming video. Identify ALL significant game events: kills, deaths, explosions, ability uses, spike plants/defuses, ace moments, clutch situations, crowd reactions, hype moments.',
    fps:
        'You are analyzing audio from an FPS gaming video. Identify ALL significant game events: kills, deaths, explosions, weapon fire, headshot sounds, kill streaks, crowd reactions, battle cries.',
    boss_fight:
        'You are analyzing audio from a boss fight video. Identify ALL significant game events: boss phase transitions, big hits, explosions, boss death, crowd cheering, epic moments, victory sounds.',
    general:
        'You are analyzing audio from a gaming video. Identify ALL significant audio events: explosions, gunshots, crowd reactions, cheering, epic moments, dramatic sounds.',
};
22
/**
 * Interprets a number written in MM.SS notation as a count of seconds.
 *
 * The integer part is taken as minutes and the first two decimal digits as
 * seconds, e.g. 1.03 → 63 and 1.40 → 100.
 *
 * @param {number} value - Number in MM.SS notation.
 * @returns {number} Equivalent number of seconds.
 */
function mmssToSeconds(value) {
    // Rounding absorbs floating-point noise in the fractional part (0.03 * 100 ≈ 3.0000000000000027).
    return Math.floor(value) * 60 + Math.round((value % 1) * 100);
}
31
/**
 * Converts a Gemini `time_sec` value into true decimal seconds.
 *
 * Gemini sometimes answers in MM.SS notation (1.03 meaning 63 s) and sometimes
 * in real decimal seconds (53.403). The chunk duration is used to tell them
 * apart:
 *
 *   1. If the two-digit fractional part is above 59 it cannot be a seconds
 *      component, so the value is already decimal seconds.
 *   2. Otherwise the value is converted as MM.SS; if the converted value is
 *      below `chunkDurationSec` it is accepted.
 *   3. Otherwise the value is returned unchanged as decimal seconds.
 *
 * @param {number} value - Raw `time_sec` from Gemini.
 * @param {number} chunkDurationSec - Length of the analysed chunk in seconds.
 * @returns {number} Offset within the chunk, in seconds.
 */
export function normalizeGeminiTime(value, chunkDurationSec) {
    const fractionAsSeconds = Math.round((value % 1) * 100);
    // Only values whose fraction could be a seconds component are MM.SS candidates.
    if (!(fractionAsSeconds > 59)) {
        const candidate = mmssToSeconds(value);
        if (candidate < chunkDurationSec) {
            return candidate;
        }
    }
    // Either the fraction rules MM.SS out, or the conversion overshoots the chunk.
    return value;
}
60
/**
 * Audio analyzer backed by Google Gemini's multimodal API.
 *
 * Sends a WAV slice (base64, inline) together with a game-profile-specific
 * prompt and expects a JSON array of `{ time_sec, event, confidence }` objects
 * in return.
 *
 * Reads `config.GOOGLE_GENERATIVE_AI_API_KEY`, `config.AUDIO_GEMINI_MODEL` and
 * the optional `config.AUDIO_EXTRA_INSTRUCTIONS`.
 */
export class GeminiAudioAnalyzer extends AudioAnalyzer {
    source = 'gemini';
    /**
     * Detects audio events in one slice.
     *
     * @param {string} audioPath - Path to the WAV slice.
     * @param {string} gameProfile - Key into GAME_PROFILE_PROMPTS; unknown
     *   profiles fall back to the `general` prompt.
     * @param {number} chunkOffsetSec - Start of the slice within the full
     *   audio; added to every returned timestamp.
     * @param {number} chunkDurationSec - Slice length, used to disambiguate
     *   Gemini's timestamp format.
     * @returns {Promise<Array<{time: number, event: string, confidence: number, source: string}>>}
     * @throws {Error} When the reply is not valid JSON or fails schema validation.
     */
    async detect(audioPath, gameProfile, chunkOffsetSec, chunkDurationSec) {
        const genai = new GoogleGenerativeAI(config.GOOGLE_GENERATIVE_AI_API_KEY);
        const model = genai.getGenerativeModel({ model: config.AUDIO_GEMINI_MODEL });
        // Async read: a synchronous read here would stall every other chunk
        // that is being processed concurrently.
        const audioData = await fs.promises.readFile(audioPath);
        const base64Audio = audioData.toString('base64');
        const extraInstructions = config.AUDIO_EXTRA_INSTRUCTIONS
            ? `\nAdditional instructions:\n${config.AUDIO_EXTRA_INSTRUCTIONS}\n`
            : '';
        const prompt = `${GAME_PROFILE_PROMPTS[gameProfile] ?? GAME_PROFILE_PROMPTS.general} ${extraInstructions}

For each event, return a JSON object with:
- time_sec: the time in seconds (be very precise with the timestamp, Gemini is good at this when the format is correct)
- event: a short description of the event (e.g., "gunshot", "explosion", "clutch moment")
- confidence: your confidence level (0.0 to 1.0)

Return ONLY a JSON array, no explanation. Format:
[
{"time_sec": 12.5, "event": "gunshot", "confidence": 0.8},
{"time_sec": 45.2, "event": "explosion", "confidence": 0.9}
]`;
        const result = await model.generateContent([
            { inlineData: { mimeType: 'audio/wav', data: base64Audio } },
            prompt,
        ]);
        const text = result.response.text();
        log.info(`[audio:gemini] response: ${text}`);
        // Strip an optional Markdown code fence around the JSON.
        const cleaned = text
            .replace(/^```(?:json)?\s*/i, '')
            .replace(/\s*```\s*$/i, '')
            .trim();
        let payload;
        try {
            payload = JSON.parse(cleaned);
        }
        catch (err) {
            // Give the failure context instead of a bare "Unexpected token …".
            const reason = err instanceof Error ? err.message : String(err);
            throw new Error(`Gemini response was not valid JSON: ${reason}`, { cause: err });
        }
        const parsed = GeminiEventSchema.safeParse(payload);
        if (!parsed.success) {
            throw new Error(`Gemini response failed validation: ${parsed.error.message}`);
        }
        // Convert chunk-relative timestamps to positions in the full audio.
        return parsed.data.map((e) => ({
            time: normalizeGeminiTime(e.time_sec, chunkDurationSec) + chunkOffsetSec,
            event: e.event,
            confidence: e.confidence,
            source: this.source,
        }));
    }
}
@@ -0,0 +1,5 @@
1
+ export { AudioAnalyzer } from './base.js';
2
+ export { GeminiAudioAnalyzer, normalizeGeminiTime } from './gemini.js';
3
+ export { WhisperAudioAnalyzer, getPythonBin } from './whisper.js';
4
+ export { YAMNetAudioAnalyzer } from './yamnet.js';
5
+ export { createAnalyzerChain, parseProviderChain } from './factory.js';
@@ -0,0 +1,62 @@
1
+ import { execa } from 'execa';
2
+ import { config } from '../../config/index.js';
3
+ import { log } from '../../utils/logger.js';
4
+ import { AudioAnalyzer } from './base.js';
5
/**
 * Name of the Python interpreter that answered `--version`, or null until one
 * has been found. Kept at module scope so the probe runs at most once per
 * successful lookup; both Python-based analyzers (Whisper, YAMNet) share it.
 */
let resolvedPythonBin = null;

/**
 * Finds a runnable Python interpreter by probing `python3` and then `python`
 * with `--version`, remembering the first one that succeeds.
 *
 * @returns {Promise<string>} The interpreter command name.
 * @throws {Error} When neither candidate can be executed.
 */
export async function getPythonBin() {
    if (resolvedPythonBin) {
        return resolvedPythonBin;
    }
    const candidates = ['python3', 'python'];
    for (let i = 0; i < candidates.length; i += 1) {
        const candidate = candidates[i];
        try {
            await execa(candidate, ['--version']);
        }
        catch {
            // Any failure of the probe is treated as "not available".
            log.warn(`[audio] ${candidate} not found, trying next binary...`);
            continue;
        }
        resolvedPythonBin = candidate;
        return candidate;
    }
    throw new Error('No Python interpreter found (tried python3, python). Install Python 3 to use YAMNet or Whisper.');
}
25
/**
 * Uses OpenAI Whisper (local) to transcribe the audio chunk and scan the
 * resulting transcript for hype keywords per game profile.
 *
 * The work is delegated to `scripts/detect_events_whisper.py`, run with the
 * interpreter returned by getPythonBin(). The script path is relative, so it
 * presumably resolves against the process working directory — confirm against
 * how the CLI is launched.
 *
 * Requires: pip install openai-whisper
 */
export class WhisperAudioAnalyzer extends AudioAnalyzer {
    /** Tag stamped on every event this analyzer returns. */
    source = 'whisper';

    /**
     * Runs the Whisper detection script on one audio chunk.
     *
     * @param {string} audioPath - Audio file handed to the script.
     * @param {string} gameProfile - Profile name forwarded to the script.
     * @param {number} chunkOffsetSec - Added to every event time reported by the script.
     * @param {number} _chunkDurationSec - Unused by this analyzer.
     * @returns {Promise<Array<{time: number, event: *, confidence: *, source: string}>>}
     * @throws {Error} When the script fails, its dependencies are missing, or
     *   its stdout is not a JSON array.
     */
    async detect(audioPath, gameProfile, chunkOffsetSec, _chunkDurationSec) {
        const python = await getPythonBin();
        let stdout;
        try {
            ({ stdout } = await execa(python, [
                'scripts/detect_events_whisper.py',
                audioPath,
                String(config.AUDIO_CONFIDENCE_THRESHOLD),
                gameProfile,
                config.AUDIO_WHISPER_MODEL,
            ]));
        }
        catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            // A missing Python module gets an actionable install hint; the
            // original error is kept as `cause` so its details are not lost.
            if (message.includes('ModuleNotFoundError') || message.includes('No module named')) {
                throw new Error('openai-whisper not installed. Run: pip install openai-whisper\n' +
                    'Or set AUDIO_PROVIDER=gemini in .env and configure GOOGLE_GENERATIVE_AI_API_KEY.', { cause: err });
            }
            throw new Error(`Whisper detection failed: ${message}`, { cause: err });
        }

        // Guard the parse: unexpected script output would otherwise surface as
        // a bare SyntaxError/TypeError with no mention of Whisper.
        let events;
        try {
            events = JSON.parse(stdout);
        }
        catch (err) {
            const reason = err instanceof Error ? err.message : String(err);
            throw new Error(`Whisper detection failed: script output was not valid JSON: ${reason}`, { cause: err });
        }
        if (!Array.isArray(events)) {
            throw new Error('Whisper detection failed: script output was not a JSON array of events.');
        }

        return events.map((e) => ({
            time: e.time + chunkOffsetSec,
            event: e.event,
            confidence: e.confidence,
            source: this.source,
        }));
    }
}
@@ -0,0 +1,40 @@
1
+ import { execa } from 'execa';
2
+ import { config } from '../../config/index.js';
3
+ import { AudioAnalyzer } from './base.js';
4
+ import { getPythonBin } from './whisper.js';
5
/**
 * Uses YAMNet (TensorFlow Hub) via a Python script to classify audio frames
 * against a fixed set of game-relevant sound classes (gunshot, explosion, etc.).
 *
 * The work is delegated to `scripts/detect_events.py`, run with the
 * interpreter returned by getPythonBin(). The script path is relative, so it
 * presumably resolves against the process working directory — confirm against
 * how the CLI is launched.
 *
 * Requires: pip install tensorflow-hub soundfile numpy
 */
export class YAMNetAudioAnalyzer extends AudioAnalyzer {
    /** Tag stamped on every event this analyzer returns. */
    source = 'yamnet';

    /**
     * Runs the YAMNet detection script on one audio chunk.
     *
     * @param {string} audioPath - Audio file handed to the script.
     * @param {string} _gameProfile - Unused by this analyzer.
     * @param {number} chunkOffsetSec - Added to every event time reported by the script.
     * @param {number} _chunkDurationSec - Unused by this analyzer.
     * @returns {Promise<Array<{time: number, event: *, confidence: *, source: string}>>}
     * @throws {Error} When the script fails, its dependencies are missing, or
     *   its stdout is not a JSON array.
     */
    async detect(audioPath, _gameProfile, chunkOffsetSec, _chunkDurationSec) {
        const python = await getPythonBin();
        let stdout;
        try {
            ({ stdout } = await execa(python, [
                'scripts/detect_events.py',
                audioPath,
                String(config.AUDIO_CONFIDENCE_THRESHOLD),
            ]));
        }
        catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            // A missing Python module gets an actionable install hint; the
            // original error is kept as `cause` so its details are not lost.
            if (message.includes('ModuleNotFoundError') || message.includes('No module named')) {
                throw new Error('YAMNet dependencies missing. Run: pip3 install tensorflow-hub soundfile numpy\n' +
                    'Or set AUDIO_PROVIDER=gemini in .env and configure GOOGLE_GENERATIVE_AI_API_KEY.', { cause: err });
            }
            throw new Error(`YAMNet detection failed: ${message}`, { cause: err });
        }

        // Guard the parse: unexpected script output would otherwise surface as
        // a bare SyntaxError/TypeError with no mention of YAMNet.
        let events;
        try {
            events = JSON.parse(stdout);
        }
        catch (err) {
            const reason = err instanceof Error ? err.message : String(err);
            throw new Error(`YAMNet detection failed: script output was not valid JSON: ${reason}`, { cause: err });
        }
        if (!Array.isArray(events)) {
            throw new Error('YAMNet detection failed: script output was not a JSON array of events.');
        }

        return events.map((e) => ({
            time: e.time + chunkOffsetSec,
            event: e.event,
            confidence: e.confidence,
            source: this.source,
        }));
    }
}