video_transcript_whisper 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,92 @@
1
+ # video_transcript_whisper
2
+
3
+ A CLI tool that transcribes the audio from a video file to text using [whisper.cpp](https://github.com/ggerganov/whisper.cpp) — no cloud API needed, everything runs locally.
4
+
5
+ ## How it works
6
+
7
+ 1. Extracts audio from the input video and converts it to 16 kHz mono WAV (the format whisper.cpp requires) via bundled `ffmpeg`.
8
+ 2. On first run, automatically downloads and compiles whisper.cpp and fetches the chosen Whisper model (default: `medium.en`).
9
+ 3. Runs transcription locally and prints the result to stdout, or saves it to a file.
10
+
11
+ ## Installation
12
+
13
+ ```bash
14
+ npm install -g video_transcript_whisper
15
+ ```
16
+
17
+ Or run directly with `npx`:
18
+
19
+ ```bash
20
+ npx video_transcript_whisper -i video.mp4
21
+ ```
22
+
23
+ ## Usage
24
+
25
+ ```
26
+ video_transcript_whisper [options]
27
+
28
+ Options:
29
+ -i, --input-video-path <path> Path to the video file to transcribe (required)
30
+ -o, --output-path <path> Path to save the transcription (default: print to stdout)
31
+ -v, --verbose Enable verbose logging (default: false)
32
+ -h, --help Display help
33
+ ```
34
+
35
+ ### Examples
36
+
37
+ Print transcription to stdout:
38
+
39
+ ```bash
40
+ video_transcript_whisper -i lecture.mp4
41
+ ```
42
+
43
+ Save transcription to a file:
44
+
45
+ ```bash
46
+ video_transcript_whisper -i interview.mp4 -o transcript.txt
47
+ ```
48
+
49
+ Watch download/compilation progress on first run:
50
+
51
+ ```bash
52
+ video_transcript_whisper -i demo.mp4 -v
53
+ ```
54
+
55
+ ## First run
56
+
57
+ The first time the tool runs it will:
58
+
59
 + 1. Compile whisper.cpp v1.5.5 into an `output/whisper.cpp/` directory inside the installed package (not your current working directory); this requires a C++ compiler.
60
+ 2. Download the `medium.en` Whisper model (~1.5 GB) into the same directory.
61
+
62
+ Subsequent runs skip both steps and go straight to transcription.
63
+
64
+ ## Supported video/audio formats
65
+
66
+ Any format supported by ffmpeg (MP4, MOV, MKV, AVI, MP3, WAV, …).
67
+
68
+ ## Requirements
69
+
70
+ - Node.js 18+
71
+ - A C++ compiler (`gcc` / `clang`) for the one-time whisper.cpp build step
72
+ - `make`
73
+
74
+ ## Development
75
+
76
+ ```bash
77
+ # Install dependencies
78
+ npm install
79
+
80
+ # Type-check
81
+ npm run typecheck
82
+
83
+ # Build
84
+ npm run build
85
+
86
+ # Run from source
87
+ node dist/cli.js -i video.mp4
88
+ ```
89
+
90
+ ## License
91
+
92
+ MIT
package/dist/cli.d.ts ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env node
2
+ export {};
3
+ //# sourceMappingURL=cli.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":""}
package/dist/cli.js ADDED
@@ -0,0 +1,49 @@
1
+ #!/usr/bin/env node
2
+ // node imports
3
+ import Path from 'node:path';
4
+ import Fs from 'node:fs';
5
+ // npm imports
6
+ import * as Commander from 'commander';
7
+ // local imports
8
+ import { TranscriptionWordHelper } from './libs/transcription_word.js';
9
+ import { AudioHelper } from './libs/audio_helper.js';
10
+ const __dirname = new URL('.', import.meta.url).pathname;
11
+ async function main() {
12
+ // parse command line arguments
13
+ const program = new Commander.Command();
14
+ program
15
+ .option('-i, --input-video-path <path>', 'Path to the video file to transcribe')
16
+ .option('-o, --output-path <path>', 'Path to save the transcribed captions')
17
+ .option('-v, --verbose', 'Enable verbose logging', false)
18
+ .parse(process.argv);
19
+ const options = program.opts();
20
+ // Extract audio from video and save as .wav file
21
+ // convert audio to wav 16KHz Mono - the required format for whisper.cpp
22
+ const extractedAudioPath = Path.join(__dirname, '../output/extracted_audio.wav');
23
+ await AudioHelper.convertToWav16kMono(options.inputVideoPath, extractedAudioPath);
24
+ if (options.verbose) {
25
+ console.log('Transcribing audio to captions...');
26
+ console.time('transcribeAudioToCaptions');
27
+ }
28
+ // transcribe audio to captions
29
+ const whisperPath = Path.join(__dirname, '../output', 'whisper.cpp');
30
+ const transcribedText = await TranscriptionWordHelper.transcribeVoice(whisperPath, extractedAudioPath, { verbose: options.verbose });
31
+ if (options.verbose) {
32
+ console.timeEnd('transcribeAudioToCaptions');
33
+ }
34
+ // print transcription result
35
+ if (options.outputPath === undefined) {
36
+ console.log(transcribedText);
37
+ }
38
+ else {
39
+ // save transcription result to file
40
+ await Fs.promises.writeFile(options.outputPath, transcribedText, 'utf-8');
41
+ }
42
+ }
43
+ ///////////////////////////////////////////////////////////////////////////////
44
+ ///////////////////////////////////////////////////////////////////////////////
45
+ //
46
+ ///////////////////////////////////////////////////////////////////////////////
47
+ ///////////////////////////////////////////////////////////////////////////////
48
+ void main();
49
+ //# sourceMappingURL=cli.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA,eAAe;AACf,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,MAAM,SAAS,CAAC;AAEzB,cAAc;AACd,OAAO,KAAK,SAAS,MAAM,WAAW,CAAC;AAEvC,gBAAgB;AAChB,OAAO,EAAE,uBAAuB,EAAE,MAAM,8BAA8B,CAAC;AACvE,OAAO,EAAE,WAAW,EAAE,MAAM,wBAAwB,CAAC;AAErD,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,GAAG,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;AAEzD,KAAK,UAAU,IAAI;IAClB,+BAA+B;IAC/B,MAAM,OAAO,GAAG,IAAI,SAAS,CAAC,OAAO,EAAE,CAAC;IACxC,OAAO;SACL,MAAM,CAAC,+BAA+B,EAAE,sCAAsC,CAAC;SAC/E,MAAM,CAAC,0BAA0B,EAAE,uCAAuC,CAAC;SAC3E,MAAM,CAAC,eAAe,EAAE,wBAAwB,EAAE,KAAK,CAAC;SACxD,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;IAGtB,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,EAIxB,CAAC;IAKL,iDAAiD;IACjD,wEAAwE;IACxE,MAAM,kBAAkB,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,+BAA+B,CAAC,CAAC;IACjF,MAAM,WAAW,CAAC,mBAAmB,CAAC,OAAO,CAAC,cAAc,EAAE,kBAAkB,CAAC,CAAC;IAElF,IAAI,OAAO,CAAC,OAAO,EAAE,CAAC;QACrB,OAAO,CAAC,GAAG,CAAC,mCAAmC,CAAC,CAAC;QACjD,OAAO,CAAC,IAAI,CAAC,2BAA2B,CAAC,CAAC;IAC3C,CAAC;IAED,+BAA+B;IAC/B,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,WAAW,EAAE,aAAa,CAAC,CAAC;IACrE,MAAM,eAAe,GAAG,MAAM,uBAAuB,CAAC,eAAe,CAAC,WAAW,EAAE,kBAAkB,EAAE,EAAE,OAAO,EAAE,OAAO,CAAC,OAAO,EAAE,CAAC,CAAC;IAGrI,IAAI,OAAO,CAAC,OAAO,EAAE,CAAC;QACrB,OAAO,CAAC,OAAO,CAAC,2BAA2B,CAAC,CAAC;IAC9C,CAAC;IAED,6BAA6B;IAC7B,IAAI,OAAO,CAAC,UAAU,KAAK,SAAS,EAAE,CAAC;QACtC,OAAO,CAAC,GAAG,CAAC,eAAe,CAAC,CAAC;IAC9B,CAAC;SAAM,CAAC;QACP,oCAAoC;QACpC,MAAM,EAAE,CAAC,QAAQ,CAAC,SAAS,CAAC,OAAO,CAAC,UAAU,EAAE,eAAe,EAAE,OAAO,CAAC,CAAC;IAC3E,CAAC;AACF,CAAC;AAED,+EAA+E;AAC/E,+EAA+E;AAC/E,GAAG;AACH,+EAA+E;AAC/E,+EAA+E;AAE/E,KAAK,IAAI,EAAE,CAAC"}
@@ -0,0 +1,23 @@
1
+ /**
2
+ * Generic helper class for audio/voice generation and processing using the OpenAI API and MusicMetadata library.
3
+ * - MUST NOT be specific to the dialog video use case, it should be reusable for other use cases as well.
4
+ */
5
+ export declare class AudioHelper {
6
+ /**
7
+ * Convert an audio file to an MP3 file using ffmpeg.
8
+ *
9
+ * @param inputAudioPath input audio path readable by 'ffmpeg-static' npm package
10
+ * @param outputAudioPath output audio path of the .wav
11
+ * @returns
12
+ */
13
+ static convertToMp3(inputAudioPath: string, outputAudioPath: string): Promise<void>;
14
+ /**
15
+ * Convert an audio file to a 16KHz mono wav file using ffmpeg. The output file is required for the whisper.cpp transcriber.
16
+ *
17
+ * @param inputAudioPath input audio path readable by 'ffmpeg-static' npm package
18
+ * @param outputAudioPath output audio path of the .wav
19
+ * @returns
20
+ */
21
+ static convertToWav16kMono(inputAudioPath: string, outputAudioPath: string): Promise<void>;
22
+ }
23
+ //# sourceMappingURL=audio_helper.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"audio_helper.d.ts","sourceRoot":"","sources":["../../src/libs/audio_helper.ts"],"names":[],"mappings":"AAMA;;;GAGG;AAGH,qBAAa,WAAW;IAEvB;;;;;;OAMG;WACU,YAAY,CAAC,cAAc,EAAE,MAAM,EAAE,eAAe,EAAE,MAAM;IAczE;;;;;;OAMG;WACU,mBAAmB,CAAC,cAAc,EAAE,MAAM,EAAE,eAAe,EAAE,MAAM;CAchF"}
@@ -0,0 +1,53 @@
1
+ // node import
2
+ import ChildProcess from "node:child_process";
3
+ // npm import
4
+ import ffmpegStaticPath from 'ffmpeg-static';
5
+ /**
6
+ * Generic helper class for audio/voice generation and processing using the OpenAI API and MusicMetadata library.
7
+ * - MUST NOT be specific to the dialog video use case, it should be reusable for other use cases as well.
8
+ */
9
+ export class AudioHelper {
10
+ /**
11
+ * Convert an audio file to an MP3 file using ffmpeg.
12
+ *
13
+ * @param inputAudioPath input audio path readable by 'ffmpeg-static' npm package
14
+ * @param outputAudioPath output audio path of the .wav
15
+ * @returns
16
+ */
17
+ static async convertToMp3(inputAudioPath, outputAudioPath) {
18
+ const ffmpegPath = ffmpegStaticPath;
19
+ const command = `${ffmpegPath} -i "${inputAudioPath}" "${outputAudioPath}" -y`;
20
+ return new Promise((resolve, reject) => {
21
+ ChildProcess.exec(command, (error, _stdout, _stderr) => {
22
+ if (error) {
23
+ reject(`Error converting audio: ${error.message}`);
24
+ }
25
+ else {
26
+ resolve();
27
+ }
28
+ });
29
+ });
30
+ }
31
+ /**
32
+ * Convert an audio file to a 16KHz mono wav file using ffmpeg. The output file is required for the whisper.cpp transcriber.
33
+ *
34
+ * @param inputAudioPath input audio path readable by 'ffmpeg-static' npm package
35
+ * @param outputAudioPath output audio path of the .wav
36
+ * @returns
37
+ */
38
+ static async convertToWav16kMono(inputAudioPath, outputAudioPath) {
39
+ const ffmpegPath = ffmpegStaticPath;
40
+ const command = `${ffmpegPath} -i "${inputAudioPath}" -ar 16000 -ac 1 -c:a pcm_s16le "${outputAudioPath}" -y`;
41
+ return new Promise((resolve, reject) => {
42
+ ChildProcess.exec(command, (error, _stdout, _stderr) => {
43
+ if (error) {
44
+ reject(`Error converting audio: ${error.message}`);
45
+ }
46
+ else {
47
+ resolve();
48
+ }
49
+ });
50
+ });
51
+ }
52
+ }
53
+ //# sourceMappingURL=audio_helper.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"audio_helper.js","sourceRoot":"","sources":["../../src/libs/audio_helper.ts"],"names":[],"mappings":"AAAA,cAAc;AACd,OAAO,YAAY,MAAM,oBAAoB,CAAC;AAE9C,aAAa;AACb,OAAO,gBAAgB,MAAM,eAAe,CAAC;AAE7C;;;GAGG;AAGH,MAAM,OAAO,WAAW;IAEvB;;;;;;OAMG;IACH,MAAM,CAAC,KAAK,CAAC,YAAY,CAAC,cAAsB,EAAE,eAAuB;QACxE,MAAM,UAAU,GAAG,gBAAgB,CAAC;QACpC,MAAM,OAAO,GAAG,GAAG,UAAU,QAAQ,cAAc,MAAM,eAAe,MAAM,CAAC;QAC/E,OAAO,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;YAC5C,YAAY,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC,KAAmB,EAAE,OAAe,EAAE,OAAe,EAAE,EAAE;gBACpF,IAAI,KAAK,EAAE,CAAC;oBACX,MAAM,CAAC,2BAA2B,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;gBACpD,CAAC;qBAAM,CAAC;oBACP,OAAO,EAAE,CAAC;gBACX,CAAC;YACF,CAAC,CAAC,CAAC;QACJ,CAAC,CAAC,CAAC;IACJ,CAAC;IAED;;;;;;OAMG;IACH,MAAM,CAAC,KAAK,CAAC,mBAAmB,CAAC,cAAsB,EAAE,eAAuB;QAC/E,MAAM,UAAU,GAAG,gBAAgB,CAAC;QACpC,MAAM,OAAO,GAAG,GAAG,UAAU,QAAQ,cAAc,qCAAqC,eAAe,MAAM,CAAC;QAC9G,OAAO,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;YAC5C,YAAY,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC,KAAmB,EAAE,OAAe,EAAE,OAAe,EAAE,EAAE;gBACpF,IAAI,KAAK,EAAE,CAAC;oBACX,MAAM,CAAC,2BAA2B,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;gBACpD,CAAC;qBAAM,CAAC;oBACP,OAAO,EAAE,CAAC;gBACX,CAAC;YACF,CAAC,CAAC,CAAC;QACJ,CAAC,CAAC,CAAC;IACJ,CAAC;CAED"}
@@ -0,0 +1,32 @@
1
+ import { Language } from '@remotion/install-whisper-cpp';
2
+ /**
3
+ * from https://www.remotion.dev/docs/install-whisper-cpp/
4
+ * from https://huggingface.co/openai/whisper-medium
5
+ */
6
+ export declare class TranscriptionWordHelper {
7
+ /**
8
+ * Transcribe an audio file to captions using whisper.cpp. The audio file MUST be a wav file in 16KHz PCM Mono format. You can use
9
+ * the convertToWav16kMono function in this class to convert an audio file to the required format.
10
+ *
11
+ * **NOTE: This function may be long as it may need to install whisper.cpp and download the whisper model the
12
+ * first time it is run.**
13
+ *
14
+ * @param whisperPath path to the whisper.cpp installation directory. This is where the whisper.cpp binary and models will be installed.
15
+ * @param audioWavPath path to .wav file - the file MUST be a wav file in 16KHz PCM Mono
16
+ * @param options.verbose if true, will print the output of the various steps (installing whisper.cpp, downloading model, transcribing). Default is false.
17
+ * @param options.modelName the whisper model to use. Default is 'medium.en'. See https://www.remotion.dev/docs/install-whisper-cpp/#models for available models. "medium.en" is a good default for English audio. If you are transcribing non-English audio, you may want to use "medium" or a model without the ".en" suffix.
18
+ * @param options.language the language of the audio. This is used to help whisper.cpp transcribe better. Default is 'en' (English). See https://www.remotion.dev/docs/install-whisper-cpp/#languages for available languages
19
+ */
20
+ static transcribeVoice(whisperPath: string, audioWavPath: string, { verbose, modelName, language }?: {
21
+ verbose?: boolean;
22
+ modelName?: "medium.en" | "base" | "base.en" | "large-v1" | "large-v2" | "large-v3" | "large-v3-turbo" | "medium" | "small" | "small.en" | "tiny" | "tiny.en";
23
+ language?: Language;
24
+ }): Promise<string>;
25
+ /**
26
+ * Install whisper.cpp and the whisper model if they are not already installed.
27
+ * @param whisperPath path to the whisper.cpp installation directory. This is where the whisper.cpp binary and models will be installed.
28
+ * @param verbose
29
+ */
30
+ private static _installIfNeeded;
31
+ }
32
+ //# sourceMappingURL=transcription_word.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"transcription_word.d.ts","sourceRoot":"","sources":["../../src/libs/transcription_word.ts"],"names":[],"mappings":"AACA,OAAO,EAAmE,QAAQ,EAAE,MAAM,+BAA+B,CAAC;AAS1H;;;GAGG;AACH,qBAAa,uBAAuB;IAEnC;;;;;;;;;;;;OAYG;WACU,eAAe,CAAC,WAAW,EAAE,MAAM,EAAE,YAAY,EAAE,MAAM,EAAE,EACvE,OAAe,EACf,SAAuB,EACvB,QAAe,EACf,GAAE;QACF,OAAO,CAAC,EAAE,OAAO,CAAC;QAClB,SAAS,CAAC,EAAE,WAAW,GAAG,MAAM,GAAG,SAAS,GAAG,UAAU,GAAG,UAAU,GAAG,UAAU,GAAG,gBAAgB,GAAG,QAAQ,GAAG,OAAO,GAAG,UAAU,GAAG,MAAM,GAAG,SAAS,CAAC;QAC9J,QAAQ,CAAC,EAAE,QAAQ,CAAC;KACf,GAAG,OAAO,CAAC,MAAM,CAAC;IAoCxB;;;;OAIG;mBACkB,gBAAgB;CAqBrC"}
@@ -0,0 +1,75 @@
1
+ // npm import
2
+ import { downloadWhisperModel, installWhisperCpp, transcribe, toCaptions } from '@remotion/install-whisper-cpp';
3
+ ///////////////////////////////////////////////////////////////////////////////
4
+ ///////////////////////////////////////////////////////////////////////////////
5
+ // Class
6
+ ///////////////////////////////////////////////////////////////////////////////
7
+ ///////////////////////////////////////////////////////////////////////////////
8
+ /**
9
+ * from https://www.remotion.dev/docs/install-whisper-cpp/
10
+ * from https://huggingface.co/openai/whisper-medium
11
+ */
12
+ export class TranscriptionWordHelper {
13
+ /**
14
+ * Transcribe an audio file to captions using whisper.cpp. The audio file MUST be a wav file in 16KHz PCM Mono format. You can use
15
+ * the convertToWav16kMono function in this class to convert an audio file to the required format.
16
+ *
17
+ * **NOTE: This function may be long as it may need to install whisper.cpp and download the whisper model the
18
+ * first time it is run.**
19
+ *
20
+ * @param whisperPath path to the whisper.cpp installation directory. This is where the whisper.cpp binary and models will be installed.
21
+ * @param audioWavPath path to .wav file - the file MUST be a wav file in 16KHz PCM Mono
22
+ * @param options.verbose if true, will print the output of the various steps (installing whisper.cpp, downloading model, transcribing). Default is false.
23
+ * @param options.modelName the whisper model to use. Default is 'medium.en'. See https://www.remotion.dev/docs/install-whisper-cpp/#models for available models. "medium.en" is a good default for English audio. If you are transcribing non-English audio, you may want to use "medium" or a model without the ".en" suffix.
24
+ * @param options.language the language of the audio. This is used to help whisper.cpp transcribe better. Default is 'en' (English). See https://www.remotion.dev/docs/install-whisper-cpp/#languages for available languages
25
+ */
26
+ static async transcribeVoice(whisperPath, audioWavPath, { verbose = false, modelName = 'medium.en', language = 'en' } = {}) {
27
+ // install if needed
28
+ await this._installIfNeeded(whisperPath, { verbose, modelName });
29
+ // transcribe
30
+ const whisperCppOutput = await transcribe({
31
+ model: modelName,
32
+ whisperPath: whisperPath,
33
+ whisperCppVersion: '1.5.5',
34
+ inputPath: audioWavPath,
35
+ language: language,
36
+ tokenLevelTimestamps: true,
37
+ splitOnWord: false,
38
+ printOutput: verbose ? true : false,
39
+ });
40
+ // Optional: Apply remotion recommended postprocessing
41
+ const { captions } = toCaptions({
42
+ whisperCppOutput,
43
+ });
44
+ let transcribedText = '';
45
+ for (const caption of captions) {
46
+ transcribedText = transcribedText.concat(caption.text);
47
+ }
48
+ return transcribedText;
49
+ }
50
+ ///////////////////////////////////////////////////////////////////////////////
51
+ ///////////////////////////////////////////////////////////////////////////////
52
+ // Private functions
53
+ ///////////////////////////////////////////////////////////////////////////////
54
+ ///////////////////////////////////////////////////////////////////////////////
55
+ /**
56
+ * Install whisper.cpp and the whisper model if they are not already installed.
57
+ * @param whisperPath path to the whisper.cpp installation directory. This is where the whisper.cpp binary and models will be installed.
58
+ * @param verbose
59
+ */
60
+ static async _installIfNeeded(whisperPath, { verbose = false, modelName = 'medium.en', } = {}) {
61
+ // Install whisper.cpp if needed
62
+ const whisperAlreadyExisted = await installWhisperCpp({
63
+ to: whisperPath,
64
+ version: '1.5.5',
65
+ printOutput: verbose ? true : false,
66
+ });
67
+ // Download the whisper model if needed
68
+ const modelAlreadyExisted = await downloadWhisperModel({
69
+ model: modelName,
70
+ folder: whisperPath,
71
+ printOutput: verbose ? true : false,
72
+ });
73
+ }
74
+ }
75
+ //# sourceMappingURL=transcription_word.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"transcription_word.js","sourceRoot":"","sources":["../../src/libs/transcription_word.ts"],"names":[],"mappings":"AAAA,aAAa;AACb,OAAO,EAAE,oBAAoB,EAAE,iBAAiB,EAAE,UAAU,EAAE,UAAU,EAAY,MAAM,+BAA+B,CAAC;AAG1H,+EAA+E;AAC/E,+EAA+E;AAC/E,QAAQ;AACR,+EAA+E;AAC/E,+EAA+E;AAE/E;;;GAGG;AACH,MAAM,OAAO,uBAAuB;IAEnC;;;;;;;;;;;;OAYG;IACH,MAAM,CAAC,KAAK,CAAC,eAAe,CAAC,WAAmB,EAAE,YAAoB,EAAE,EACvE,OAAO,GAAG,KAAK,EACf,SAAS,GAAG,WAAW,EACvB,QAAQ,GAAG,IAAI,KAKZ,EAAE;QACL,oBAAoB;QACpB,MAAM,IAAI,CAAC,gBAAgB,CAAC,WAAW,EAAE,EAAE,OAAO,EAAE,SAAS,EAAE,CAAC,CAAC;QAEjE,aAAa;QACb,MAAM,gBAAgB,GAAG,MAAM,UAAU,CAAC;YACzC,KAAK,EAAE,SAAS;YAChB,WAAW,EAAE,WAAW;YACxB,iBAAiB,EAAE,OAAO;YAC1B,SAAS,EAAE,YAAY;YACvB,QAAQ,EAAE,QAAQ;YAClB,oBAAoB,EAAE,IAAI;YAC1B,WAAW,EAAE,KAAK;YAClB,WAAW,EAAE,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK;SACnC,CAAC,CAAC;QAEH,sDAAsD;QACtD,MAAM,EAAE,QAAQ,EAAE,GAAG,UAAU,CAAC;YAC/B,gBAAgB;SAChB,CAAC,CAAC;QAEH,IAAI,eAAe,GAAW,EAAE,CAAA;QAChC,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAChC,eAAe,GAAG,eAAe,CAAC,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QACxD,CAAC;QAED,OAAO,eAAe,CAAA;IACvB,CAAC;IAGD,+EAA+E;IAC/E,+EAA+E;IAC/E,oBAAoB;IACpB,+EAA+E;IAC/E,+EAA+E;IAE/E;;;;OAIG;IACK,MAAM,CAAC,KAAK,CAAC,gBAAgB,CAAC,WAAmB,EAAE,EAC1D,OAAO,GAAG,KAAK,EACf,SAAS,GAAG,WAAW,MAIpB,EAAE;QACL,gCAAgC;QAChC,MAAM,qBAAqB,GAAG,MAAM,iBAAiB,CAAC;YACrD,EAAE,EAAE,WAAW;YACf,OAAO,EAAE,OAAO;YAChB,WAAW,EAAE,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK;SACnC,CAAC,CAAC;QAEH,uCAAuC;QACvC,MAAM,mBAAmB,GAAG,MAAM,oBAAoB,CAAC;YACtD,KAAK,EAAE,SAAS;YAChB,MAAM,EAAE,WAAW;YACnB,WAAW,EAAE,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK;SACnC,CAAC,CAAC;IACJ,CAAC;CACD"}
@@ -0,0 +1,3 @@
1
+ # ignore all files but .gitignore
2
+ *
3
+ !.gitignore
package/package.json ADDED
@@ -0,0 +1,28 @@
1
+ {
2
+ "name": "video_transcript_whisper",
3
+ "version": "1.0.1",
4
 +   "description": "CLI tool that transcribes the audio of a video file to text locally using whisper.cpp",
 5
 +   "type": "module",
 6
 +   "main": "./dist/cli.js",
7
+ "bin": {
8
+ "video_transcript_whisper": "./dist/cli.js"
9
+ },
10
+ "scripts": {
11
+ "build": "tsc",
12
+ "typecheck": "tsc --noEmit",
13
+ "publish:all": "npm run build && npm version patch && npm publish --access public",
14
+ "test": "echo \"Error: no test specified\" && exit 1"
15
+ },
16
+ "keywords": [],
17
+ "author": "",
18
+ "license": "MIT",
19
+ "dependencies": {
20
+ "@remotion/install-whisper-cpp": "^4.0.446",
21
+ "commander": "^14.0.3",
22
+ "ffmpeg-static": "^5.3.0",
23
+ "typescript": "^6.0.2"
24
+ },
25
+ "devDependencies": {
26
+ "@types/node": "^25.5.2"
27
+ }
28
+ }
package/src/cli.ts ADDED
@@ -0,0 +1,68 @@
1
#!/usr/bin/env node
// node imports
import Path from 'node:path';
import Fs from 'node:fs';
import { fileURLToPath } from 'node:url';

// npm imports
import * as Commander from 'commander';

// local imports
import { TranscriptionWordHelper } from './libs/transcription_word.js';
import { AudioHelper } from './libs/audio_helper.js';

// Resolve this script's directory. fileURLToPath (rather than URL.pathname) is
// required for correctness on Windows (URL.pathname yields "/C:/...") and for
// install paths containing percent-encoded characters such as spaces.
const __dirname = Path.dirname(fileURLToPath(import.meta.url));

/**
 * CLI entry point: extract the audio track from the input video, transcribe it
 * locally with whisper.cpp, then print the transcript to stdout or save it to
 * the file given by --output-path.
 */
async function main(): Promise<void> {
	// parse command line arguments
	const program = new Commander.Command();
	program
		// requiredOption: commander exits with a usage error when -i is missing,
		// instead of silently passing `undefined` down to ffmpeg.
		.requiredOption('-i, --input-video-path <path>', 'Path to the video file to transcribe')
		.option('-o, --output-path <path>', 'Path to save the transcribed captions')
		.option('-v, --verbose', 'Enable verbose logging', false)
		.parse(process.argv);

	const options = program.opts<{
		inputVideoPath: string;
		outputPath?: string;
		verbose: boolean;
	}>();

	// Work directory lives next to the installed package. It is gitignored
	// (and absent from the published tarball), so create it on first use.
	const outputDirPath = Path.join(__dirname, '../output');
	await Fs.promises.mkdir(outputDirPath, { recursive: true });

	// Extract audio from video and save as .wav file
	// convert audio to wav 16KHz Mono - the required format for whisper.cpp
	const extractedAudioPath = Path.join(outputDirPath, 'extracted_audio.wav');
	await AudioHelper.convertToWav16kMono(options.inputVideoPath, extractedAudioPath);

	if (options.verbose) {
		console.log('Transcribing audio to captions...');
		console.time('transcribeAudioToCaptions');
	}

	// transcribe audio to captions (first run installs whisper.cpp + model)
	const whisperPath = Path.join(outputDirPath, 'whisper.cpp');
	const transcribedText = await TranscriptionWordHelper.transcribeVoice(whisperPath, extractedAudioPath, { verbose: options.verbose });

	if (options.verbose) {
		console.timeEnd('transcribeAudioToCaptions');
	}

	// print transcription result
	if (options.outputPath === undefined) {
		console.log(transcribedText);
	} else {
		// save transcription result to file
		await Fs.promises.writeFile(options.outputPath, transcribedText, 'utf-8');
	}
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
//
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

// Surface rejections as a non-zero exit code instead of an unhandled-rejection
// crash (the original `void main()` discarded failures entirely).
main().catch((error: unknown) => {
	console.error(error instanceof Error ? error.message : error);
	process.exitCode = 1;
});
@@ -0,0 +1,57 @@
1
+ // node import
2
+ import ChildProcess from "node:child_process";
3
+
4
+ // npm import
5
+ import ffmpegStaticPath from 'ffmpeg-static';
6
+
7
+ /**
8
+ * Generic helper class for audio/voice generation and processing using the OpenAI API and MusicMetadata library.
9
+ * - MUST NOT be specific to the dialog video use case, it should be reusable for other use cases as well.
10
+ */
11
+
12
+
13
+ export class AudioHelper {
14
+
15
+ /**
16
+ * Convert an audio file to an MP3 file using ffmpeg.
17
+ *
18
+ * @param inputAudioPath input audio path readable by 'ffmpeg-static' npm package
19
+ * @param outputAudioPath output audio path of the .wav
20
+ * @returns
21
+ */
22
+ static async convertToMp3(inputAudioPath: string, outputAudioPath: string) {
23
+ const ffmpegPath = ffmpegStaticPath;
24
+ const command = `${ffmpegPath} -i "${inputAudioPath}" "${outputAudioPath}" -y`;
25
+ return new Promise<void>((resolve, reject) => {
26
+ ChildProcess.exec(command, (error: Error | null, _stdout: string, _stderr: string) => {
27
+ if (error) {
28
+ reject(`Error converting audio: ${error.message}`);
29
+ } else {
30
+ resolve();
31
+ }
32
+ });
33
+ });
34
+ }
35
+
36
+ /**
37
+ * Convert an audio file to a 16KHz mono wav file using ffmpeg. The output file is required for the whisper.cpp transcriber.
38
+ *
39
+ * @param inputAudioPath input audio path readable by 'ffmpeg-static' npm package
40
+ * @param outputAudioPath output audio path of the .wav
41
+ * @returns
42
+ */
43
+ static async convertToWav16kMono(inputAudioPath: string, outputAudioPath: string) {
44
+ const ffmpegPath = ffmpegStaticPath;
45
+ const command = `${ffmpegPath} -i "${inputAudioPath}" -ar 16000 -ac 1 -c:a pcm_s16le "${outputAudioPath}" -y`;
46
+ return new Promise<void>((resolve, reject) => {
47
+ ChildProcess.exec(command, (error: Error | null, _stdout: string, _stderr: string) => {
48
+ if (error) {
49
+ reject(`Error converting audio: ${error.message}`);
50
+ } else {
51
+ resolve();
52
+ }
53
+ });
54
+ });
55
+ }
56
+
57
+ }
@@ -0,0 +1,100 @@
1
+ // npm import
2
+ import { downloadWhisperModel, installWhisperCpp, transcribe, toCaptions, Language } from '@remotion/install-whisper-cpp';
3
+
4
+
5
+ ///////////////////////////////////////////////////////////////////////////////
6
+ ///////////////////////////////////////////////////////////////////////////////
7
+ // Class
8
+ ///////////////////////////////////////////////////////////////////////////////
9
+ ///////////////////////////////////////////////////////////////////////////////
10
+
11
+ /**
12
+ * from https://www.remotion.dev/docs/install-whisper-cpp/
13
+ * from https://huggingface.co/openai/whisper-medium
14
+ */
15
+ export class TranscriptionWordHelper {
16
+
17
+ /**
18
+ * Transcribe an audio file to captions using whisper.cpp. The audio file MUST be a wav file in 16KHz PCM Mono format. You can use
19
+ * the convertToWav16kMono function in this class to convert an audio file to the required format.
20
+ *
21
+ * **NOTE: This function may be long as it may need to install whisper.cpp and download the whisper model the
22
+ * first time it is run.**
23
+ *
24
+ * @param whisperPath path to the whisper.cpp installation directory. This is where the whisper.cpp binary and models will be installed.
25
+ * @param audioWavPath path to .wav file - the file MUST be a wav file in 16KHz PCM Mono
26
+ * @param options.verbose if true, will print the output of the various steps (installing whisper.cpp, downloading model, transcribing). Default is false.
27
+ * @param options.modelName the whisper model to use. Default is 'medium.en'. See https://www.remotion.dev/docs/install-whisper-cpp/#models for available models. "medium.en" is a good default for English audio. If you are transcribing non-English audio, you may want to use "medium" or a model without the ".en" suffix.
28
+ * @param options.language the language of the audio. This is used to help whisper.cpp transcribe better. Default is 'en' (English). See https://www.remotion.dev/docs/install-whisper-cpp/#languages for available languages
29
+ */
30
+ static async transcribeVoice(whisperPath: string, audioWavPath: string, {
31
+ verbose = false,
32
+ modelName = 'medium.en',
33
+ language = 'en'
34
+ }: {
35
+ verbose?: boolean;
36
+ modelName?: "medium.en" | "base" | "base.en" | "large-v1" | "large-v2" | "large-v3" | "large-v3-turbo" | "medium" | "small" | "small.en" | "tiny" | "tiny.en";
37
+ language?: Language;
38
+ } = {}): Promise<string> {
39
+ // install if needed
40
+ await this._installIfNeeded(whisperPath, { verbose, modelName });
41
+
42
+ // transcribe
43
+ const whisperCppOutput = await transcribe({
44
+ model: modelName,
45
+ whisperPath: whisperPath,
46
+ whisperCppVersion: '1.5.5',
47
+ inputPath: audioWavPath,
48
+ language: language,
49
+ tokenLevelTimestamps: true,
50
+ splitOnWord: false,
51
+ printOutput: verbose ? true : false,
52
+ });
53
+
54
+ // Optional: Apply remotion recommended postprocessing
55
+ const { captions } = toCaptions({
56
+ whisperCppOutput,
57
+ });
58
+
59
+ let transcribedText: string = ''
60
+ for (const caption of captions) {
61
+ transcribedText = transcribedText.concat(caption.text);
62
+ }
63
+
64
+ return transcribedText
65
+ }
66
+
67
+
68
+ ///////////////////////////////////////////////////////////////////////////////
69
+ ///////////////////////////////////////////////////////////////////////////////
70
+ // Private functions
71
+ ///////////////////////////////////////////////////////////////////////////////
72
+ ///////////////////////////////////////////////////////////////////////////////
73
+
74
+ /**
75
+ * Install whisper.cpp and the whisper model if they are not already installed.
76
+ * @param whisperPath path to the whisper.cpp installation directory. This is where the whisper.cpp binary and models will be installed.
77
+ * @param verbose
78
+ */
79
+ private static async _installIfNeeded(whisperPath: string, {
80
+ verbose = false,
81
+ modelName = 'medium.en',
82
+ }: {
83
+ verbose?: boolean;
84
+ modelName?: "medium.en" | "base" | "base.en" | "large-v1" | "large-v2" | "large-v3" | "large-v3-turbo" | "medium" | "small" | "small.en" | "tiny" | "tiny.en";
85
+ } = {}) {
86
+ // Install whisper.cpp if needed
87
+ const whisperAlreadyExisted = await installWhisperCpp({
88
+ to: whisperPath,
89
+ version: '1.5.5',
90
+ printOutput: verbose ? true : false,
91
+ });
92
+
93
+ // Download the whisper model if needed
94
+ const modelAlreadyExisted = await downloadWhisperModel({
95
+ model: modelName,
96
+ folder: whisperPath,
97
+ printOutput: verbose ? true : false,
98
+ });
99
+ }
100
+ }
package/tsconfig.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2020",
4
+ "module": "node16",
5
+ "lib": [
6
+ "ES2020",
7
+ "DOM"
8
+ ],
9
+ "types": [
10
+ "node"
11
+ ],
12
+ "outDir": "./dist",
13
+ "rootDir": "./src",
14
+ "strict": true,
15
+ "esModuleInterop": true,
16
+ "skipLibCheck": true,
17
+ "forceConsistentCasingInFileNames": true,
18
+ "resolveJsonModule": true,
19
+ "declaration": true,
20
+ "declarationMap": true,
21
+ "sourceMap": true
22
+ },
23
+ "include": [
24
+ "src/**/*"
25
+ ],
26
+ "exclude": [
27
+ "node_modules",
28
+ "dist"
29
+ ]
30
+ }