ctx-reels 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,96 @@
1
+ import fs from 'fs';
2
+ import path from 'path';
3
+ import chalk from 'chalk';
4
+ import { generateScript } from './script.js';
5
+ import { generateVoice } from './voice.js';
6
+ import { generateSubs } from './subs.js';
7
+ import { renderReel } from './engine.js';
8
+ import { slugify, DIRS } from '../utils/fileConfig.js';
9
+
10
/**
 * Runs the complete generation pipeline for a single topic.
 *
 * Stages: script -> voice -> subtitles (optional) -> render. Each stage is
 * reused from disk when a non-empty artifact already exists, unless
 * `options.force` is set.
 *
 * @param {string} topic The topic to generate a reel for.
 * @param {object} options Configuration options (e.g., skipSubs, force, scriptPath)
 * @returns {Promise<boolean>} true when every stage succeeded.
 */
export async function runPipeline(topic, options = {}) {
  const t0 = Date.now();
  const slug = slugify(topic);

  // A stage output is reusable only when the file exists and is non-empty.
  const hasContent = (p) => fs.existsSync(p) && fs.statSync(p).size > 0;

  console.log(chalk.magenta.bold(`\n=== Starting CTX Pipeline for topic: "${topic}" ===\n`));

  // 1. Script Generation
  let scriptPath = options.scriptPath || path.join(DIRS.SCRIPTS, `${slug}.txt`);

  if (options.scriptPath) {
    console.log(chalk.yellow(`[Pipeline] Using explicit script file: ${scriptPath}`));
  } else if (!options.force && hasContent(scriptPath)) {
    console.log(chalk.yellow(`[Pipeline] Skipping Script: Existing file found at ${scriptPath}`));
  } else {
    const generatedScript = await generateScript(topic);
    if (!generatedScript) {
      console.error(chalk.red('\n[Pipeline] Aborting: Failed at Script generation stage.'));
      return false;
    }
    scriptPath = generatedScript;
  }

  // 2. Voice Generation
  let audioPath = path.join(DIRS.AUDIO, `${slug}.mp3`);
  if (!options.force && hasContent(audioPath)) {
    console.log(chalk.yellow(`[Pipeline] Skipping Voice: Existing file found at ${audioPath}`));
  } else {
    const generatedAudio = await generateVoice(scriptPath);
    if (!generatedAudio) {
      console.error(chalk.red('\n[Pipeline] Aborting: Failed at Voice generation stage.'));
      return false;
    }
    audioPath = generatedAudio;
  }

  // 3. Subtitles Generation (Optional)
  let subsPath = path.join(DIRS.SUBTITLES, `${slug}.ass`);
  if (options.skipSubs) {
    console.log(chalk.yellow(`[Pipeline] Skipping subtitles as requested.`));
    subsPath = null;
  } else if (!options.force && hasContent(subsPath)) {
    console.log(chalk.yellow(`[Pipeline] Skipping Subtitles: Existing file found at ${subsPath}`));
  } else {
    const generatedSubs = await generateSubs(audioPath);
    if (!generatedSubs) {
      console.error(chalk.red('\n[Pipeline] Aborting: Failed at Subtitles generation stage.'));
      return false;
    }
    subsPath = generatedSubs;
  }

  // 4. Video Rendering
  const reelPath = await renderReel(audioPath, subsPath);
  if (!reelPath) {
    console.error(chalk.red('\n[Pipeline] Aborting: Failed at Engine rendering stage.'));
    return false;
  }

  const durationMs = Date.now() - t0;
  console.log(chalk.green.bold(`\n=== CTX Pipeline Completed in ${(durationMs / 1000).toFixed(1)}s ===`));
  console.log(chalk.green(`Final Output: ${reelPath}\n`));
  return true;
}
77
+
78
/**
 * Runs the pipeline for a batch array of topics, one topic at a time.
 * @param {string[]} topics Array of topics
 * @param {object} options Configuration options forwarded to runPipeline
 */
export async function runBatchPipeline(topics, options = {}) {
  console.log(chalk.magenta.bold(`\n=== Starting Batch Pipeline for ${topics.length} topics ===\n`));
  let successCount = 0;

  // Running sequentially for Phase 1 MVP stability
  let itemNumber = 0;
  for (const topic of topics) {
    itemNumber += 1;
    console.log(chalk.cyan(`\n--- Batch Item ${itemNumber}/${topics.length} ---`));
    if (await runPipeline(topic, options)) {
      successCount += 1;
    }
  }

  console.log(chalk.magenta.bold(`\n=== Batch Pipeline Finished ===`));
  console.log(chalk.magenta(`Successfully completed ${successCount} out of ${topics.length} reels.\n`));
}
@@ -0,0 +1,70 @@
1
+ import { GoogleGenAI } from '@google/genai';
2
+ import fs from 'fs';
3
+ import path from 'path';
4
+ import chalk from 'chalk';
5
+ import { slugify, DIRS, ensureDirectories } from '../utils/fileConfig.js';
6
+ import dotenv from 'dotenv';
7
+ dotenv.config();
8
+
9
/**
 * Generates a short-form video script based on a given topic using the Gemini API.
 * @param {string} topic The topic to write about.
 * @returns {Promise<string|null>} The path to the generated script, or null on error.
 */
export async function generateScript(topic) {
  try {
    ensureDirectories();

    const apiKey = process.env.GEMINI_API_KEY;
    if (!apiKey) {
      console.error(chalk.red('\n[Error] GEMINI_API_KEY is missing in .env. Please set it before running the script generator.'));
      return null;
    }

    const client = new GoogleGenAI({ apiKey });

    console.log(chalk.blue(`[Script] Generating script for topic: "${topic}"...`));

    const prompt = `You are a world-class viral scriptwriter for TikTok, Instagram Reels, and YouTube Shorts.
    Your specialty is high-retention, fast-paced storytelling that GRABS attention in the first 2 seconds.

    Write a compelling 45-60 second spoken script about: "${topic}"

    STRUCTURE & STYLE:
    1. THE HOOK: Start with an "I can't believe this" or "Did you know" style statement that creates immediate curiosity. No generic intros like "Welcome back" or "Today we're talking about". Jump straight into the action.
    2. THE PUNCHY FACT: Follow the hook with a shocking or counter-intuitive fact.
    3. THE NARRATIVE: Build the middle of the script using short, punchy sentences. Avoid flowery language. Use "power words" (e.g., 'Insane', 'Secret', 'Massive', 'Failure', 'Genius').
    4. THE LOOP/CTA: End with a thought-provoking question or a loop-ready statement that flows naturally back to the hook.

    CRITICAL CONSTRAINTS:
    - ONLY output the spoken text.
    - NO markdown, NO bolding, NO emojis, NO technical instructions like "(pause)" or "[Visual]".
    - NO labels like "Hook:" or "Outro:".
    - The output must be one single, clean block of naturally spoken text.
    - Approximately 130-150 words total.`;

    const response = await client.models.generateContent({
      model: process.env.GEMINI_MODEL || 'gemini-3-flash-preview',
      contents: prompt,
    });

    const text = response.text;
    if (!text || text.trim() === '') {
      console.error(chalk.red('[Script] Error: Gemini returned an empty response.'));
      return null;
    }

    const outputPath = path.join(DIRS.SCRIPTS, `${slugify(topic)}.txt`);
    fs.writeFileSync(outputPath, text.trim());
    console.log(chalk.green(`[Script] Script generated successfully and saved to ${outputPath}`));

    return outputPath;

  } catch (error) {
    console.error(chalk.red(`[Script] Error generating script: ${error.message}`));
    return null;
  }
}
@@ -0,0 +1,142 @@
1
+ import fs from 'fs';
2
+ import path from 'path';
3
+ import chalk from 'chalk';
4
+ import { exec } from 'child_process';
5
+ import util from 'util';
6
+ import { fileURLToPath } from 'url';
7
+ import { GoogleGenAI } from '@google/genai';
8
+ import { DIRS, ensureDirectories } from '../utils/fileConfig.js';
9
+ import dotenv from 'dotenv';
10
+ dotenv.config();
11
+
12
+ const execPromise = util.promisify(exec);
13
+
14
+ // Get __dirname equivalent in ES modules
15
+ const __filename = fileURLToPath(import.meta.url);
16
+ const __dirname = path.dirname(__filename);
17
+
18
/**
 * Generates a subtitle file (ASS) from a given audio file.
 *
 * Two backends exist: a local Python faster-whisper runner (currently
 * active) and a Gemini API transcriber (kept for machines without the
 * local Python dependencies).
 *
 * @param {string} audioPath The path to the generated audio (MP3).
 * @returns {Promise<string|null>} The path to the generated subtitles, or null on error.
 */
export async function generateSubs(audioPath) {
  try {
    ensureDirectories();

    if (!fs.existsSync(audioPath)) {
      console.error(chalk.red(`[Subs] Error: Audio file not found at ${audioPath}`));
      return null;
    }

    const audioBasename = path.basename(audioPath, '.mp3');
    const outputPath = path.join(DIRS.SUBTITLES, `${audioBasename}.ass`);

    // --- GEMINI IMPLEMENTATION (DISABLED) ---
    // To switch to the Gemini transcriber, uncomment the line below and
    // comment out the Python call underneath.
    // return await generateSubsGemini(audioPath, outputPath);

    // --- PYTHON IMPLEMENTATION (ACTIVE) ---
    return await generateSubsPython(audioPath, outputPath);

  } catch (error) {
    // Non-Error throws may have no .message; normalize before inspecting.
    const message = error?.message ?? String(error);
    if (message.includes('429') || message.includes('quota')) {
      console.error(chalk.yellow(`\n[Subs] Rate limit hit. Please wait ~1 minute and try again.`));
    }
    console.error(chalk.red(`[Subs] Error generating subtitles: ${message}`));
    return null;
  }
}
52
+
53
/**
 * Uses Gemini API for transcription.
 *
 * NOTE(review): the prompt requests SRT content but the caller supplies an
 * `.ass` output path — confirm the downstream renderer accepts SRT text
 * before re-enabling this backend.
 */
async function generateSubsGemini(audioPath, outputPath) {
  const apiKey = process.env.GEMINI_API_KEY;
  if (!apiKey) {
    console.error(chalk.red('\n[Subs] Error: GEMINI_API_KEY is missing in .env.'));
    return null;
  }

  const client = new GoogleGenAI({ apiKey });

  console.log(chalk.blue(`[Subs] Transcribing audio with Gemini: "${path.basename(audioPath)}"...`));

  const base64Audio = fs.readFileSync(audioPath).toString('base64');

  const prompt = `Transcribe this audio file and output it in a valid SRT (SubRip Subtitle) format.
  Ensure the timestamps are accurate and synchronized with the speech.
  Only output the SRT text content, no other explanations.`;

  const response = await client.models.generateContent({
    model: process.env.GEMINI_MODEL || 'gemini-3-flash-preview',
    contents: [
      { text: prompt },
      {
        inlineData: {
          data: base64Audio,
          mimeType: "audio/mp3"
        }
      }
    ]
  });

  // Strip markdown code fences the model sometimes wraps around the SRT body.
  const srtText = response.text.replace(/```srt\n?|```/g, '').trim();

  if (!srtText || srtText.length < 10) {
    console.error(chalk.red('[Subs] Error: Gemini returned an invalid SRT.'));
    return null;
  }

  fs.writeFileSync(outputPath, srtText);
  console.log(chalk.green(`[Subs] Subtitles generated via Gemini and saved to ${outputPath}`));

  return outputPath;
}
102
+
103
/**
 * Uses local Python script (faster-whisper) for transcription.
 * (Preserved for when dependencies are fixed)
 *
 * Interpreter resolution is portable: PYTHON_PATH env var, then a local
 * .venv interpreter, then `python3` on PATH.
 *
 * @param {string} audioPath Path to the input MP3.
 * @param {string} outputPath Path where the ASS subtitle file is written.
 * @returns {Promise<string|null>} outputPath on success, null on failure.
 */
async function generateSubsPython(audioPath, outputPath) {
  const pythonScriptPath = path.resolve(__dirname, '../python/whisper_runner.py');

  if (!fs.existsSync(pythonScriptPath)) {
    console.error(chalk.red(`[Subs] Error: Python script not found at ${pythonScriptPath}`));
    return null;
  }

  const audioBasename = path.basename(audioPath, '.mp3');
  console.log(chalk.blue(`[Subs] Generating subtitles for audio: "${audioBasename}" (Python)...`));

  // Portable Python detection: env var > local .venv > python3 on PATH
  let pythonBin = process.env.PYTHON_PATH;
  if (!pythonBin) {
    const venvPythonPath = path.resolve(__dirname, '../../.venv/bin/python');
    pythonBin = fs.existsSync(venvPythonPath) ? venvPythonPath : 'python3';
  }

  // Quote the interpreter path as well — it may live under a directory
  // containing spaces (e.g. a user home path), which would split the command.
  const command = `"${pythonBin}" "${pythonScriptPath}" --audio "${audioPath}" --output "${outputPath}" --format ass`;
  const { stderr } = await execPromise(command);

  if (stderr && stderr.trim().length > 0) {
    console.warn(chalk.yellow(`[Subs] Python Output (stderr): ${stderr}`));
  }

  if (!fs.existsSync(outputPath)) {
    // Output format is ASS, not SRT — keep the message accurate.
    console.error(chalk.red(`[Subs] Error: Python script finished but the subtitle file wasn't created.`));
    return null;
  }

  console.log(chalk.green(`[Subs] Subtitles generated successfully and saved to ${outputPath}`));
  return outputPath;
}
@@ -0,0 +1,211 @@
1
+ import fs from 'fs';
2
+ import path from 'path';
3
+ import chalk from 'chalk';
4
+ import { exec } from 'child_process';
5
+ import util from 'util';
6
+ import ffmpeg from 'fluent-ffmpeg';
7
+ import { DIRS, ensureDirectories } from '../utils/fileConfig.js';
8
+ import dotenv from 'dotenv';
9
+ dotenv.config();
10
+
11
+ const execPromise = util.promisify(exec);
12
+
13
/**
 * Generates an audio file from a given script text file.
 *
 * Engine selection via VOICE_ENGINE: 'elevenlabs' (requires
 * ELEVENLABS_API_KEY), 'kokoro', or the default 'edge-tts'. ElevenLabs and
 * Kokoro fall back to Edge-TTS on failure. The result is post-processed
 * with ffmpeg to trim long pauses.
 *
 * @param {string} scriptPath The path to the generated script text file.
 * @returns {Promise<string|null>} The path to the generated audio file, or null on error.
 */
export async function generateVoice(scriptPath) {
  try {
    ensureDirectories();

    if (!fs.existsSync(scriptPath)) {
      console.error(chalk.red(`[Voice] Error: Script file not found at ${scriptPath}`));
      return null;
    }

    const scriptBasename = path.basename(scriptPath, '.txt');
    const outputPath = path.join(DIRS.AUDIO, `${scriptBasename}.mp3`);

    console.log(chalk.blue(`[Voice] Generating audio for script: "${scriptBasename}"...`));

    // Read the script to log length or handle empty files
    const scriptContent = fs.readFileSync(scriptPath, 'utf-8');
    if (!scriptContent || scriptContent.trim() === '') {
      console.error(chalk.red('[Voice] Error: Script is empty.'));
      return null;
    }

    const engine = process.env.VOICE_ENGINE || 'edge-tts';

    if (engine === 'elevenlabs' && process.env.ELEVENLABS_API_KEY && process.env.ELEVENLABS_API_KEY.trim() !== '') {
      console.log(chalk.magenta(`[Voice] ELEVENLABS_API_KEY detected. Using ElevenLabs API...`));
      try {
        const { ElevenLabsClient } = await import('@elevenlabs/elevenlabs-js');
        const client = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY });

        // Using 'Adam' as a default reliable voice, configurable later
        const voiceId = process.env.ELEVENLABS_VOICE_ID || "pNInz6obpgDQGcFmaJcg";

        const audioStream = await client.textToSpeech.convert(voiceId, {
          text: scriptContent,
          model_id: "eleven_multilingual_v2",
          output_format: "mp3_44100_128",
        });

        const writeStream = fs.createWriteStream(outputPath);
        for await (const chunk of audioStream) {
          writeStream.write(chunk);
        }
        writeStream.end();

        // Reject on stream errors too — waiting only for 'finish' would hang
        // (or silently succeed) if the disk write fails.
        await new Promise((resolve, reject) => {
          writeStream.on('finish', resolve);
          writeStream.on('error', reject);
        });

      } catch (err) {
        console.error(chalk.red(`[Voice] ElevenLabs API Error: ${err.message}`));
        console.log(chalk.yellow(`[Voice] Falling back to Edge-TTS...`));
        await generateWithEdgeTTS(scriptPath, outputPath);
      }
    } else if (engine === 'kokoro') {
      try {
        await generateWithKokoro(scriptContent, outputPath);
      } catch (err) {
        console.error(chalk.red(`[Voice] Kokoro Generation Error: ${err.message}`));
        console.log(chalk.yellow(`[Voice] Falling back to Edge-TTS...`));
        await generateWithEdgeTTS(scriptPath, outputPath);
      }
    } else {
      await generateWithEdgeTTS(scriptPath, outputPath);
    }

    if (!fs.existsSync(outputPath)) {
      console.error(chalk.red(`[Voice] Error: Audio file wasn't created at ${outputPath}`));
      return null;
    }

    const stats = fs.statSync(outputPath);
    if (stats.size === 0) {
      console.error(chalk.red(`[Voice] Error: Generated audio file is 0 bytes.`));
      return null;
    }

    console.log(chalk.blue(`[Voice] Trimming silence to make narration snappier...`));
    // Anchor to the file extension so an '.mp3' substring elsewhere in the
    // path (e.g. a directory name) is never rewritten by accident.
    const snappyPath = outputPath.replace(/\.mp3$/, '-snappy.mp3');

    await new Promise((resolve, reject) => {
      if (process.env.FFMPEG_PATH) ffmpeg.setFfmpegPath(process.env.FFMPEG_PATH);

      ffmpeg(outputPath)
        // Filter: removes pauses > 0.3s (gentle trim, preserves natural speech)
        .audioFilters('silenceremove=stop_periods=-1:stop_duration=0.3:stop_threshold=-50dB')
        .save(snappyPath)
        .on('end', () => {
          fs.renameSync(snappyPath, outputPath);
          resolve();
        })
        .on('error', (err) => {
          // Best-effort step: keep the un-trimmed audio rather than failing.
          console.warn(chalk.yellow(`[Voice] Warning: Silence trimming failed: ${err.message}`));
          resolve();
        });
    });

    console.log(chalk.green(`[Voice] Audio generated & processed successfully: ${outputPath}`));
    return outputPath;

  } catch (error) {
    console.error(chalk.red(`[Voice] Error generating voice: ${error.message}`));
    return null;
  }
}
120
+
121
/**
 * Generates speech with the edge-tts CLI.
 *
 * @param {string} scriptPath Path to the text file to narrate.
 * @param {string} outputPath Path where the MP3 will be written.
 */
async function generateWithEdgeTTS(scriptPath, outputPath) {
  console.log(chalk.blue(`[Voice] Generating with Edge-TTS...`));
  const voice = process.env.EDGE_TTS_VOICE || 'en-US-ChristopherNeural';
  // Quote the voice argument: it comes from the environment, and an
  // unquoted space or shell metacharacter would break the command.
  const command = `edge-tts --voice "${voice}" --file "${scriptPath}" --write-media "${outputPath}"`;

  const { stderr } = await execPromise(command);

  if (stderr && stderr.trim().length > 0) {
    console.warn(chalk.yellow(`[Voice] Edge-TTS CLI Output: ${stderr}`));
  }
}
132
+
133
/**
 * Generates speech locally with Kokoro-JS, one sentence at a time, then
 * concatenates the per-sentence MP3s with the ffmpeg concat demuxer.
 *
 * @param {string} scriptContent Full script text to narrate.
 * @param {string} outputPath Path where the final MP3 will be written.
 */
async function generateWithKokoro(scriptContent, outputPath) {
  console.log(chalk.blue(`[Voice] Generating with Kokoro-JS...`));
  const { KokoroTTS } = await import('kokoro-js');

  const tts = await KokoroTTS.from_pretrained('onnx-community/Kokoro-82M-v1.0-ONNX', {
    dtype: 'q8',
  });
  const voice = process.env.KOKORO_VOICE || 'af_heart';

  // Split text into sentences to avoid Kokoro truncating long scripts
  const sentences = scriptContent
    .replace(/([.?!])\s+/g, '$1|SPLIT|')
    .split('|SPLIT|')
    .map((s) => s.trim())
    .filter((s) => s.length > 0);

  console.log(chalk.gray(`[Voice] Generating ${sentences.length} audio chunks...`));

  const workDir = path.dirname(outputPath);
  const mp3Chunks = [];

  for (const [index, sentence] of sentences.entries()) {
    const preview = sentence.length > 50 ? sentence.substring(0, 50) + '...' : sentence;
    console.log(chalk.gray(`[Voice] Chunk ${index + 1}/${sentences.length}: "${preview}"`));

    const audio = await tts.generate(sentence, { voice });

    const rawPath = path.join(workDir, `_chunk_${index}.raw`);
    const mp3Path = path.join(workDir, `_chunk_${index}.mp3`);

    // Dump the raw PCM buffer, then encode it to MP3. The ffmpeg input
    // options below treat the data as f32le, 24 kHz, mono.
    fs.writeFileSync(rawPath, Buffer.from(audio.audio.buffer));

    await new Promise((resolve, reject) => {
      if (process.env.FFMPEG_PATH) ffmpeg.setFfmpegPath(process.env.FFMPEG_PATH);

      ffmpeg()
        .input(rawPath)
        .inputOptions(['-f', 'f32le', '-ar', '24000', '-ac', '1'])
        .output(mp3Path)
        .audioBitrate('192k')
        .on('end', () => {
          fs.unlinkSync(rawPath);
          resolve();
        })
        .on('error', reject)
        .run();
    });

    mp3Chunks.push(mp3Path);
  }

  // Concatenate all chunks using FFmpeg concat demuxer
  const listPath = path.join(workDir, '_concat_list.txt');
  fs.writeFileSync(listPath, mp3Chunks.map((p) => `file '${p}'`).join('\n'));

  await new Promise((resolve, reject) => {
    if (process.env.FFMPEG_PATH) ffmpeg.setFfmpegPath(process.env.FFMPEG_PATH);

    ffmpeg()
      .input(listPath)
      .inputOptions(['-f', 'concat', '-safe', '0'])
      .outputOptions(['-c', 'copy'])
      .output(outputPath)
      .on('end', () => {
        // Cleanup temp files
        for (const p of mp3Chunks) {
          try { fs.unlinkSync(p); } catch (e) { }
        }
        try { fs.unlinkSync(listPath); } catch (e) { }
        resolve();
      })
      .on('error', reject)
      .run();
  });

  console.log(chalk.green(`[Voice] Kokoro: ${sentences.length} chunks concatenated successfully.`));
}
@@ -0,0 +1,139 @@
1
+ import sys
2
+ import os
3
+ import argparse
4
+ from typing import List
5
+ from faster_whisper import WhisperModel
6
+
7
def format_time_srt(seconds: float) -> str:
    """Format time in seconds to SRT format (HH:MM:SS,mmm).

    Works in integer milliseconds so that values just below a whole second
    (e.g. 59.9999) carry into the next second instead of formatting as an
    invalid ",60.000" field.
    """
    total_ms = round(seconds * 1000)
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    secs, millis = divmod(rem, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
13
+
14
def format_time_ass(seconds: float) -> str:
    """Format time in seconds to ASS format (H:MM:SS.cc).

    Hours are not zero-padded (ASS convention); centiseconds are truncated
    from the fractional part of the seconds field.
    """
    hrs = int(seconds // 3600)
    mins = int((seconds % 3600) // 60)
    secs_float = seconds % 60
    whole_secs = int(secs_float)
    centis = int((secs_float - whole_secs) * 100)
    return f"{hrs}:{mins:02d}:{whole_secs:02d}.{centis:02d}"
21
+
22
def chunk_words(words, max_words=4):
    """Split a list of words into consecutive chunks of at most max_words.

    The final chunk may be shorter; an empty input yields an empty list.
    """
    return [words[i:i + max_words] for i in range(0, len(words), max_words)]
34
+
35
def create_srt(segments: List, output_file: str):
    """Write Faster-Whisper segments to an SRT file using word-level timestamps.

    Words are grouped four to a cue; segments without word timestamps fall
    back to one cue covering the whole segment.
    """
    cue_index = 1
    with open(output_file, 'w', encoding='utf-8') as out:
        for segment in segments:
            if not segment.words:
                # Fallback: use segment-level timing if no word timestamps
                out.write(f"{cue_index}\n")
                out.write(f"{format_time_srt(segment.start)} --> {format_time_srt(segment.end)}\n")
                out.write(f"{segment.text.strip()}\n\n")
                cue_index += 1
                continue

            for chunk in chunk_words(segment.words, max_words=4):
                caption = " ".join(w.word.strip() for w in chunk)
                out.write(f"{cue_index}\n")
                out.write(f"{format_time_srt(chunk[0].start)} --> {format_time_srt(chunk[-1].end)}\n")
                out.write(f"{caption}\n\n")
                cue_index += 1
60
+
61
def create_ass(segments: List, output_file: str):
    """Write ASS subtitle file with karaoke word-level highlighting.

    Base color: Yellow. Active word highlight: White.
    Uses \\kf (smooth karaoke fill) tags for per-word timing.
    """
    header = """[Script Info]
Title: CTX Karaoke Subtitles
ScriptType: v4.00+
PlayResX: 1080
PlayResY: 1920
WrapStyle: 0

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,Impact,38,&H0000FFFF,&H00FFFFFF,&H00000000,&H80000000,-1,0,0,0,100,100,0,0,1,2,1,5,10,10,480,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""
    with open(output_file, 'w', encoding='utf-8') as out:
        out.write(header)

        for segment in segments:
            if not segment.words:
                # No word timestamps: emit one plain dialogue line per segment.
                seg_start = format_time_ass(segment.start)
                seg_end = format_time_ass(segment.end)
                out.write(f"Dialogue: 0,{seg_start},{seg_end},Default,,0,0,0,,{segment.text.strip()}\n")
                continue

            for chunk in chunk_words(segment.words, max_words=4):
                chunk_start = format_time_ass(chunk[0].start)
                chunk_end = format_time_ass(chunk[-1].end)

                # \kf<centiseconds> sweeps each word from the secondary to the
                # primary color over its spoken duration (minimum 1 cs).
                karaoke_parts = []
                for word in chunk:
                    duration_cs = max(1, int((word.end - word.start) * 100))
                    karaoke_parts.append(f"{{\\kf{duration_cs}}}{word.word.strip()}")

                line = " ".join(karaoke_parts)
                out.write(f"Dialogue: 0,{chunk_start},{chunk_end},Default,,0,0,0,,{line}\n")
110
+
111
def transcribe(audio_path: str, output_path: str, model_size="base", output_format="ass"):
    """Transcribe an audio file using faster-whisper and write subtitles.

    Exits the process with status 1 when the audio file is missing.
    Runs on CPU with int8 quantization and word-level timestamps enabled.
    """
    if not os.path.exists(audio_path):
        print(f"Error: Audio file not found at {audio_path}", file=sys.stderr)
        sys.exit(1)

    print(f"Loading faster-whisper model ({model_size})...")
    model = WhisperModel(model_size, device="cpu", compute_type="int8")

    print(f"Transcribing {audio_path}...")
    segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
    # Materialize the generator once so the writer can iterate freely.
    collected = list(segments)

    writer = create_ass if output_format == "ass" else create_srt
    writer(collected, output_path)

    print(f"Successfully generated {output_format.upper()} at {output_path}")
130
+
131
if __name__ == "__main__":
    # Command-line entry point.
    arg_parser = argparse.ArgumentParser(description="Generate subtitles from audio using Faster-Whisper")
    arg_parser.add_argument("--audio", required=True, help="Path to the input audio file")
    arg_parser.add_argument("--output", required=True, help="Path to save the output subtitle file")
    arg_parser.add_argument("--model", default="base", help="Model size for Faster-Whisper (e.g., tiny, base, small)")
    arg_parser.add_argument("--format", default="ass", choices=["srt", "ass"], help="Output format: srt or ass")

    cli = arg_parser.parse_args()
    transcribe(cli.audio, cli.output, cli.model, cli.format)