video_transcript_whisper 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,92 @@
1
+ # video_transcript_whisper
2
+
3
+ A CLI tool that transcribes the audio from a video file to text using [whisper.cpp](https://github.com/ggerganov/whisper.cpp) — no cloud API needed, everything runs locally.
4
+
5
+ ## How it works
6
+
7
+ 1. Extracts audio from the input video and converts it to 16 kHz mono WAV (the format whisper.cpp requires) via bundled `ffmpeg`.
8
+ 2. On first run, automatically downloads and compiles whisper.cpp and fetches the chosen Whisper model (default: `medium.en`).
9
+ 3. Runs transcription locally and prints the result to stdout, or saves it to a file.
10
+
11
+ ## Installation
12
+
13
+ ```bash
14
+ npm install -g video_transcript_whisper
15
+ ```
16
+
17
+ Or run directly with `npx`:
18
+
19
+ ```bash
20
+ npx video_transcript_whisper -i video.mp4
21
+ ```
22
+
23
+ ## Usage
24
+
25
+ ```
26
+ video_transcript_whisper [options]
27
+
28
+ Options:
29
+ -i, --input-video-path <path> Path to the video file to transcribe (required)
30
+ -o, --output-path <path> Path to save the transcription (default: print to stdout)
31
+ -v, --verbose Enable verbose logging (default: false)
32
+ -h, --help Display help
33
+ ```
34
+
35
+ ### Examples
36
+
37
+ Print transcription to stdout:
38
+
39
+ ```bash
40
+ video_transcript_whisper -i lecture.mp4
41
+ ```
42
+
43
+ Save transcription to a file:
44
+
45
+ ```bash
46
+ video_transcript_whisper -i interview.mp4 -o transcript.txt
47
+ ```
48
+
49
+ Watch download/compilation progress on first run:
50
+
51
+ ```bash
52
+ video_transcript_whisper -i demo.mp4 -v
53
+ ```
54
+
55
+ ## First run
56
+
57
+ The first time the tool runs it will:
58
+
59
 + 1. Compile whisper.cpp v1.5.5 into an `output/whisper.cpp/` directory inside the installed package (not your current working directory); this requires a C++ compiler.
60
+ 2. Download the `medium.en` Whisper model (~1.5 GB) into the same directory.
61
+
62
+ Subsequent runs skip both steps and go straight to transcription.
63
+
64
+ ## Supported video/audio formats
65
+
66
+ Any format supported by ffmpeg (MP4, MOV, MKV, AVI, MP3, WAV, …).
67
+
68
+ ## Requirements
69
+
70
+ - Node.js 18+
71
+ - A C++ compiler (`gcc` / `clang`) for the one-time whisper.cpp build step
72
+ - `make`
73
+
74
+ ## Development
75
+
76
+ ```bash
77
+ # Install dependencies
78
+ npm install
79
+
80
+ # Type-check
81
+ npm run typecheck
82
+
83
+ # Build
84
+ npm run build
85
+
86
+ # Run from source
87
+ node dist/cli.js -i video.mp4
88
+ ```
89
+
90
+ ## License
91
+
92
+ MIT
package/dist/cli.d.ts ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env node
2
+ export {};
3
+ //# sourceMappingURL=cli.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":""}
package/dist/cli.js ADDED
@@ -0,0 +1,49 @@
1
+ #!/usr/bin/env node
2
+ // node imports
3
+ import Path from 'node:path';
4
+ import Fs from 'node:fs';
5
+ // npm imports
6
+ import * as Commander from 'commander';
7
+ // local imports
8
+ import { TranscriptionWordHelper } from './libs/transcription_word.js';
9
+ import { AudioHelper } from './libs/audio_helper.js';
10
+ const __dirname = new URL('.', import.meta.url).pathname;
11
+ async function main() {
12
+ // parse command line arguments
13
+ const program = new Commander.Command();
14
+ program
15
+ .option('-i, --input-video-path <path>', 'Path to the video file to transcribe')
16
+ .option('-o, --output-path <path>', 'Path to save the transcribed captions')
17
+ .option('-v, --verbose', 'Enable verbose logging', false)
18
+ .parse(process.argv);
19
+ const options = program.opts();
20
+ // Extract audio from video and save as .wav file
21
+ // convert audio to wav 16KHz Mono - the required format for whisper.cpp
22
+ const extractedAudioPath = Path.join(__dirname, '../output/extracted_audio.wav');
23
+ await AudioHelper.convertToWav16kMono(options.inputVideoPath, extractedAudioPath);
24
+ if (options.verbose) {
25
+ console.log('Transcribing audio to captions...');
26
+ console.time('transcribeAudioToCaptions');
27
+ }
28
+ // transcribe audio to captions
29
+ const whisperPath = Path.join(__dirname, '../output', 'whisper.cpp');
30
+ const transcribedText = await TranscriptionWordHelper.transcribeVoice(whisperPath, extractedAudioPath, { verbose: options.verbose });
31
+ if (options.verbose) {
32
+ console.timeEnd('transcribeAudioToCaptions');
33
+ }
34
+ // print transcription result
35
+ if (options.outputPath === undefined) {
36
+ console.log(transcribedText);
37
+ }
38
+ else {
39
+ // save transcription result to file
40
+ await Fs.promises.writeFile(options.outputPath, transcribedText, 'utf-8');
41
+ }
42
+ }
43
+ ///////////////////////////////////////////////////////////////////////////////
44
+ ///////////////////////////////////////////////////////////////////////////////
45
+ //
46
+ ///////////////////////////////////////////////////////////////////////////////
47
+ ///////////////////////////////////////////////////////////////////////////////
48
+ void main();
49
+ //# sourceMappingURL=cli.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA,eAAe;AACf,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,MAAM,SAAS,CAAC;AAEzB,cAAc;AACd,OAAO,KAAK,SAAS,MAAM,WAAW,CAAC;AAEvC,gBAAgB;AAChB,OAAO,EAAE,uBAAuB,EAAE,MAAM,8BAA8B,CAAC;AACvE,OAAO,EAAE,WAAW,EAAE,MAAM,wBAAwB,CAAC;AAErD,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,GAAG,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;AAEzD,KAAK,UAAU,IAAI;IAClB,+BAA+B;IAC/B,MAAM,OAAO,GAAG,IAAI,SAAS,CAAC,OAAO,EAAE,CAAC;IACxC,OAAO;SACL,MAAM,CAAC,+BAA+B,EAAE,sCAAsC,CAAC;SAC/E,MAAM,CAAC,0BAA0B,EAAE,uCAAuC,CAAC;SAC3E,MAAM,CAAC,eAAe,EAAE,wBAAwB,EAAE,KAAK,CAAC;SACxD,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;IAGtB,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,EAIxB,CAAC;IAKL,iDAAiD;IACjD,wEAAwE;IACxE,MAAM,kBAAkB,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,+BAA+B,CAAC,CAAC;IACjF,MAAM,WAAW,CAAC,mBAAmB,CAAC,OAAO,CAAC,cAAc,EAAE,kBAAkB,CAAC,CAAC;IAElF,IAAI,OAAO,CAAC,OAAO,EAAE,CAAC;QACrB,OAAO,CAAC,GAAG,CAAC,mCAAmC,CAAC,CAAC;QACjD,OAAO,CAAC,IAAI,CAAC,2BAA2B,CAAC,CAAC;IAC3C,CAAC;IAED,+BAA+B;IAC/B,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,WAAW,EAAE,aAAa,CAAC,CAAC;IACrE,MAAM,eAAe,GAAG,MAAM,uBAAuB,CAAC,eAAe,CAAC,WAAW,EAAE,kBAAkB,EAAE,EAAE,OAAO,EAAE,OAAO,CAAC,OAAO,EAAE,CAAC,CAAC;IAGrI,IAAI,OAAO,CAAC,OAAO,EAAE,CAAC;QACrB,OAAO,CAAC,OAAO,CAAC,2BAA2B,CAAC,CAAC;IAC9C,CAAC;IAED,6BAA6B;IAC7B,IAAI,OAAO,CAAC,UAAU,KAAK,SAAS,EAAE,CAAC;QACtC,OAAO,CAAC,GAAG,CAAC,eAAe,CAAC,CAAC;IAC9B,CAAC;SAAM,CAAC;QACP,oCAAoC;QACpC,MAAM,EAAE,CAAC,QAAQ,CAAC,SAAS,CAAC,OAAO,CAAC,UAAU,EAAE,eAAe,EAAE,OAAO,CAAC,CAAC;IAC3E,CAAC;AACF,CAAC;AAED,+EAA+E;AAC/E,+EAA+E;AAC/E,GAAG;AACH,+EAA+E;AAC/E,+EAA+E;AAE/E,KAAK,IAAI,EAAE,CAAC"}
@@ -0,0 +1,23 @@
1
+ /**
2
+ * Generic helper class for audio/voice generation and processing using the OpenAI API and MusicMetadata library.
3
+ * - MUST NOT be specific to the dialog video use case, it should be reusable for other use cases as well.
4
+ */
5
+ export declare class AudioHelper {
6
+ /**
7
+ * Convert an audio file to an MP3 file using ffmpeg.
8
+ *
9
+ * @param inputAudioPath input audio path readable by 'ffmpeg-static' npm package
10
+ * @param outputAudioPath output audio path of the .wav
11
+ * @returns
12
+ */
13
+ static convertToMp3(inputAudioPath: string, outputAudioPath: string): Promise<void>;
14
+ /**
15
+ * Convert an audio file to a 16KHz mono wav file using ffmpeg. The output file is required for the whisper.cpp transcriber.
16
+ *
17
+ * @param inputAudioPath input audio path readable by 'ffmpeg-static' npm package
18
+ * @param outputAudioPath output audio path of the .wav
19
+ * @returns
20
+ */
21
+ static convertToWav16kMono(inputAudioPath: string, outputAudioPath: string): Promise<void>;
22
+ }
23
+ //# sourceMappingURL=audio_helper.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"audio_helper.d.ts","sourceRoot":"","sources":["../../src/libs/audio_helper.ts"],"names":[],"mappings":"AAMA;;;GAGG;AAGH,qBAAa,WAAW;IAEvB;;;;;;OAMG;WACU,YAAY,CAAC,cAAc,EAAE,MAAM,EAAE,eAAe,EAAE,MAAM;IAczE;;;;;;OAMG;WACU,mBAAmB,CAAC,cAAc,EAAE,MAAM,EAAE,eAAe,EAAE,MAAM;CAchF"}
@@ -0,0 +1,53 @@
1
+ // node import
2
+ import ChildProcess from "node:child_process";
3
+ // npm import
4
+ import ffmpegStaticPath from 'ffmpeg-static';
5
+ /**
6
+ * Generic helper class for audio/voice generation and processing using the OpenAI API and MusicMetadata library.
7
+ * - MUST NOT be specific to the dialog video use case, it should be reusable for other use cases as well.
8
+ */
9
+ export class AudioHelper {
10
+ /**
11
+ * Convert an audio file to an MP3 file using ffmpeg.
12
+ *
13
+ * @param inputAudioPath input audio path readable by 'ffmpeg-static' npm package
14
+ * @param outputAudioPath output audio path of the .wav
15
+ * @returns
16
+ */
17
+ static async convertToMp3(inputAudioPath, outputAudioPath) {
18
+ const ffmpegPath = ffmpegStaticPath;
19
+ const command = `${ffmpegPath} -i "${inputAudioPath}" "${outputAudioPath}" -y`;
20
+ return new Promise((resolve, reject) => {
21
+ ChildProcess.exec(command, (error, _stdout, _stderr) => {
22
+ if (error) {
23
+ reject(`Error converting audio: ${error.message}`);
24
+ }
25
+ else {
26
+ resolve();
27
+ }
28
+ });
29
+ });
30
+ }
31
+ /**
32
+ * Convert an audio file to a 16KHz mono wav file using ffmpeg. The output file is required for the whisper.cpp transcriber.
33
+ *
34
+ * @param inputAudioPath input audio path readable by 'ffmpeg-static' npm package
35
+ * @param outputAudioPath output audio path of the .wav
36
+ * @returns
37
+ */
38
+ static async convertToWav16kMono(inputAudioPath, outputAudioPath) {
39
+ const ffmpegPath = ffmpegStaticPath;
40
+ const command = `${ffmpegPath} -i "${inputAudioPath}" -ar 16000 -ac 1 -c:a pcm_s16le "${outputAudioPath}" -y`;
41
+ return new Promise((resolve, reject) => {
42
+ ChildProcess.exec(command, (error, _stdout, _stderr) => {
43
+ if (error) {
44
+ reject(`Error converting audio: ${error.message}`);
45
+ }
46
+ else {
47
+ resolve();
48
+ }
49
+ });
50
+ });
51
+ }
52
+ }
53
+ //# sourceMappingURL=audio_helper.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"audio_helper.js","sourceRoot":"","sources":["../../src/libs/audio_helper.ts"],"names":[],"mappings":"AAAA,cAAc;AACd,OAAO,YAAY,MAAM,oBAAoB,CAAC;AAE9C,aAAa;AACb,OAAO,gBAAgB,MAAM,eAAe,CAAC;AAE7C;;;GAGG;AAGH,MAAM,OAAO,WAAW;IAEvB;;;;;;OAMG;IACH,MAAM,CAAC,KAAK,CAAC,YAAY,CAAC,cAAsB,EAAE,eAAuB;QACxE,MAAM,UAAU,GAAG,gBAAgB,CAAC;QACpC,MAAM,OAAO,GAAG,GAAG,UAAU,QAAQ,cAAc,MAAM,eAAe,MAAM,CAAC;QAC/E,OAAO,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;YAC5C,YAAY,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC,KAAmB,EAAE,OAAe,EAAE,OAAe,EAAE,EAAE;gBACpF,IAAI,KAAK,EAAE,CAAC;oBACX,MAAM,CAAC,2BAA2B,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;gBACpD,CAAC;qBAAM,CAAC;oBACP,OAAO,EAAE,CAAC;gBACX,CAAC;YACF,CAAC,CAAC,CAAC;QACJ,CAAC,CAAC,CAAC;IACJ,CAAC;IAED;;;;;;OAMG;IACH,MAAM,CAAC,KAAK,CAAC,mBAAmB,CAAC,cAAsB,EAAE,eAAuB;QAC/E,MAAM,UAAU,GAAG,gBAAgB,CAAC;QACpC,MAAM,OAAO,GAAG,GAAG,UAAU,QAAQ,cAAc,qCAAqC,eAAe,MAAM,CAAC;QAC9G,OAAO,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;YAC5C,YAAY,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC,KAAmB,EAAE,OAAe,EAAE,OAAe,EAAE,EAAE;gBACpF,IAAI,KAAK,EAAE,CAAC;oBACX,MAAM,CAAC,2BAA2B,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;gBACpD,CAAC;qBAAM,CAAC;oBACP,OAAO,EAAE,CAAC;gBACX,CAAC;YACF,CAAC,CAAC,CAAC;QACJ,CAAC,CAAC,CAAC;IACJ,CAAC;CAED"}
@@ -0,0 +1,32 @@
1
+ import { Language } from '@remotion/install-whisper-cpp';
2
+ /**
3
+ * from https://www.remotion.dev/docs/install-whisper-cpp/
4
+ * from https://huggingface.co/openai/whisper-medium
5
+ */
6
+ export declare class TranscriptionWordHelper {
7
+ /**
8
+ * Transcribe an audio file to captions using whisper.cpp. The audio file MUST be a wav file in 16KHz PCM Mono format. You can use
9
+ * the convertToWav16kMono function in this class to convert an audio file to the required format.
10
+ *
11
+ * **NOTE: This function may be long as it may need to install whisper.cpp and download the whisper model the
12
+ * first time it is run.**
13
+ *
14
+ * @param whisperPath path to the whisper.cpp installation directory. This is where the whisper.cpp binary and models will be installed.
15
+ * @param audioWavPath path to .wav file - the file MUST be a wav file in 16KHz PCM Mono
16
+ * @param options.verbose if true, will print the output of the various steps (installing whisper.cpp, downloading model, transcribing). Default is false.
17
+ * @param options.modelName the whisper model to use. Default is 'medium.en'. See https://www.remotion.dev/docs/install-whisper-cpp/#models for available models. "medium.en" is a good default for English audio. If you are transcribing non-English audio, you may want to use "medium" or a model without the ".en" suffix.
18
+ * @param options.language the language of the audio. This is used to help whisper.cpp transcribe better. Default is 'en' (English). See https://www.remotion.dev/docs/install-whisper-cpp/#languages for available languages
19
+ */
20
+ static transcribeVoice(whisperPath: string, audioWavPath: string, { verbose, modelName, language }?: {
21
+ verbose?: boolean;
22
+ modelName?: "medium.en" | "base" | "base.en" | "large-v1" | "large-v2" | "large-v3" | "large-v3-turbo" | "medium" | "small" | "small.en" | "tiny" | "tiny.en";
23
+ language?: Language;
24
+ }): Promise<string>;
25
+ /**
26
+ * Install whisper.cpp and the whisper model if they are not already installed.
27
+ * @param whisperPath path to the whisper.cpp installation directory. This is where the whisper.cpp binary and models will be installed.
28
+ * @param verbose
29
+ */
30
+ private static _installIfNeeded;
31
+ }
32
+ //# sourceMappingURL=transcription_word.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"transcription_word.d.ts","sourceRoot":"","sources":["../../src/libs/transcription_word.ts"],"names":[],"mappings":"AACA,OAAO,EAAmE,QAAQ,EAAE,MAAM,+BAA+B,CAAC;AAS1H;;;GAGG;AACH,qBAAa,uBAAuB;IAEnC;;;;;;;;;;;;OAYG;WACU,eAAe,CAAC,WAAW,EAAE,MAAM,EAAE,YAAY,EAAE,MAAM,EAAE,EACvE,OAAe,EACf,SAAuB,EACvB,QAAe,EACf,GAAE;QACF,OAAO,CAAC,EAAE,OAAO,CAAC;QAClB,SAAS,CAAC,EAAE,WAAW,GAAG,MAAM,GAAG,SAAS,GAAG,UAAU,GAAG,UAAU,GAAG,UAAU,GAAG,gBAAgB,GAAG,QAAQ,GAAG,OAAO,GAAG,UAAU,GAAG,MAAM,GAAG,SAAS,CAAC;QAC9J,QAAQ,CAAC,EAAE,QAAQ,CAAC;KACf,GAAG,OAAO,CAAC,MAAM,CAAC;IAoCxB;;;;OAIG;mBACkB,gBAAgB;CAqBrC"}
@@ -0,0 +1,75 @@
1
+ // npm import
2
+ import { downloadWhisperModel, installWhisperCpp, transcribe, toCaptions } from '@remotion/install-whisper-cpp';
3
+ ///////////////////////////////////////////////////////////////////////////////
4
+ ///////////////////////////////////////////////////////////////////////////////
5
+ // Class
6
+ ///////////////////////////////////////////////////////////////////////////////
7
+ ///////////////////////////////////////////////////////////////////////////////
8
+ /**
9
+ * from https://www.remotion.dev/docs/install-whisper-cpp/
10
+ * from https://huggingface.co/openai/whisper-medium
11
+ */
12
+ export class TranscriptionWordHelper {
13
+ /**
14
+ * Transcribe an audio file to captions using whisper.cpp. The audio file MUST be a wav file in 16KHz PCM Mono format. You can use
15
+ * the convertToWav16kMono function in this class to convert an audio file to the required format.
16
+ *
17
+ * **NOTE: This function may be long as it may need to install whisper.cpp and download the whisper model the
18
+ * first time it is run.**
19
+ *
20
+ * @param whisperPath path to the whisper.cpp installation directory. This is where the whisper.cpp binary and models will be installed.
21
+ * @param audioWavPath path to .wav file - the file MUST be a wav file in 16KHz PCM Mono
22
+ * @param options.verbose if true, will print the output of the various steps (installing whisper.cpp, downloading model, transcribing). Default is false.
23
+ * @param options.modelName the whisper model to use. Default is 'medium.en'. See https://www.remotion.dev/docs/install-whisper-cpp/#models for available models. "medium.en" is a good default for English audio. If you are transcribing non-English audio, you may want to use "medium" or a model without the ".en" suffix.
24
+ * @param options.language the language of the audio. This is used to help whisper.cpp transcribe better. Default is 'en' (English). See https://www.remotion.dev/docs/install-whisper-cpp/#languages for available languages
25
+ */
26
+ static async transcribeVoice(whisperPath, audioWavPath, { verbose = false, modelName = 'medium.en', language = 'en' } = {}) {
27
+ // install if needed
28
+ await this._installIfNeeded(whisperPath, { verbose, modelName });
29
+ // transcribe
30
+ const whisperCppOutput = await transcribe({
31
+ model: modelName,
32
+ whisperPath: whisperPath,
33
+ whisperCppVersion: '1.5.5',
34
+ inputPath: audioWavPath,
35
+ language: language,
36
+ tokenLevelTimestamps: true,
37
+ splitOnWord: false,
38
+ printOutput: verbose ? true : false,
39
+ });
40
+ // Optional: Apply remotion recommended postprocessing
41
+ const { captions } = toCaptions({
42
+ whisperCppOutput,
43
+ });
44
+ let transcribedText = '';
45
+ for (const caption of captions) {
46
+ transcribedText = transcribedText.concat(caption.text);
47
+ }
48
+ return transcribedText;
49
+ }
50
+ ///////////////////////////////////////////////////////////////////////////////
51
+ ///////////////////////////////////////////////////////////////////////////////
52
+ // Private functions
53
+ ///////////////////////////////////////////////////////////////////////////////
54
+ ///////////////////////////////////////////////////////////////////////////////
55
+ /**
56
+ * Install whisper.cpp and the whisper model if they are not already installed.
57
+ * @param whisperPath path to the whisper.cpp installation directory. This is where the whisper.cpp binary and models will be installed.
58
+ * @param verbose
59
+ */
60
+ static async _installIfNeeded(whisperPath, { verbose = false, modelName = 'medium.en', } = {}) {
61
+ // Install whisper.cpp if needed
62
+ const whisperAlreadyExisted = await installWhisperCpp({
63
+ to: whisperPath,
64
+ version: '1.5.5',
65
+ printOutput: verbose ? true : false,
66
+ });
67
+ // Download the whisper model if needed
68
+ const modelAlreadyExisted = await downloadWhisperModel({
69
+ model: modelName,
70
+ folder: whisperPath,
71
+ printOutput: verbose ? true : false,
72
+ });
73
+ }
74
+ }
75
+ //# sourceMappingURL=transcription_word.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"transcription_word.js","sourceRoot":"","sources":["../../src/libs/transcription_word.ts"],"names":[],"mappings":"AAAA,aAAa;AACb,OAAO,EAAE,oBAAoB,EAAE,iBAAiB,EAAE,UAAU,EAAE,UAAU,EAAY,MAAM,+BAA+B,CAAC;AAG1H,+EAA+E;AAC/E,+EAA+E;AAC/E,QAAQ;AACR,+EAA+E;AAC/E,+EAA+E;AAE/E;;;GAGG;AACH,MAAM,OAAO,uBAAuB;IAEnC;;;;;;;;;;;;OAYG;IACH,MAAM,CAAC,KAAK,CAAC,eAAe,CAAC,WAAmB,EAAE,YAAoB,EAAE,EACvE,OAAO,GAAG,KAAK,EACf,SAAS,GAAG,WAAW,EACvB,QAAQ,GAAG,IAAI,KAKZ,EAAE;QACL,oBAAoB;QACpB,MAAM,IAAI,CAAC,gBAAgB,CAAC,WAAW,EAAE,EAAE,OAAO,EAAE,SAAS,EAAE,CAAC,CAAC;QAEjE,aAAa;QACb,MAAM,gBAAgB,GAAG,MAAM,UAAU,CAAC;YACzC,KAAK,EAAE,SAAS;YAChB,WAAW,EAAE,WAAW;YACxB,iBAAiB,EAAE,OAAO;YAC1B,SAAS,EAAE,YAAY;YACvB,QAAQ,EAAE,QAAQ;YAClB,oBAAoB,EAAE,IAAI;YAC1B,WAAW,EAAE,KAAK;YAClB,WAAW,EAAE,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK;SACnC,CAAC,CAAC;QAEH,sDAAsD;QACtD,MAAM,EAAE,QAAQ,EAAE,GAAG,UAAU,CAAC;YAC/B,gBAAgB;SAChB,CAAC,CAAC;QAEH,IAAI,eAAe,GAAW,EAAE,CAAA;QAChC,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAChC,eAAe,GAAG,eAAe,CAAC,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QACxD,CAAC;QAED,OAAO,eAAe,CAAA;IACvB,CAAC;IAGD,+EAA+E;IAC/E,+EAA+E;IAC/E,oBAAoB;IACpB,+EAA+E;IAC/E,+EAA+E;IAE/E;;;;OAIG;IACK,MAAM,CAAC,KAAK,CAAC,gBAAgB,CAAC,WAAmB,EAAE,EAC1D,OAAO,GAAG,KAAK,EACf,SAAS,GAAG,WAAW,MAIpB,EAAE;QACL,gCAAgC;QAChC,MAAM,qBAAqB,GAAG,MAAM,iBAAiB,CAAC;YACrD,EAAE,EAAE,WAAW;YACf,OAAO,EAAE,OAAO;YAChB,WAAW,EAAE,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK;SACnC,CAAC,CAAC;QAEH,uCAAuC;QACvC,MAAM,mBAAmB,GAAG,MAAM,oBAAoB,CAAC;YACtD,KAAK,EAAE,SAAS;YAChB,MAAM,EAAE,WAAW;YACnB,WAAW,EAAE,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK;SACnC,CAAC,CAAC;IACJ,CAAC;CACD"}
@@ -0,0 +1,3 @@
1
+ # ignore all files but .gitignore
2
+ *
3
+ !.gitignore
package/package.json ADDED
@@ -0,0 +1,28 @@
1
+ {
2
+ "name": "video_transcript_whisper",
3
+ "version": "1.0.1",
4
 +   "description": "CLI tool that transcribes the audio of a video file to text locally using whisper.cpp",
 5
 +   "type": "module",
 6
 +   "main": "./dist/cli.js",
7
+ "bin": {
8
+ "video_transcript_whisper": "./dist/cli.js"
9
+ },
10
+ "scripts": {
11
+ "build": "tsc",
12
+ "typecheck": "tsc --noEmit",
13
+ "publish:all": "npm run build && npm version patch && npm publish --access public",
14
+ "test": "echo \"Error: no test specified\" && exit 1"
15
+ },
16
+ "keywords": [],
17
+ "author": "",
18
+ "license": "MIT",
19
+ "dependencies": {
20
+ "@remotion/install-whisper-cpp": "^4.0.446",
21
+ "commander": "^14.0.3",
22
+ "ffmpeg-static": "^5.3.0",
23
+ "typescript": "^6.0.2"
24
+ },
25
+ "devDependencies": {
26
+ "@types/node": "^25.5.2"
27
+ }
28
+ }
package/src/cli.ts ADDED
@@ -0,0 +1,68 @@
1
#!/usr/bin/env node
// node imports
import Path from 'node:path';
import Fs from 'node:fs';
import { fileURLToPath } from 'node:url';

// npm imports
import * as Commander from 'commander';

// local imports
import { TranscriptionWordHelper } from './libs/transcription_word.js';
import { AudioHelper } from './libs/audio_helper.js';

// Resolve this script's directory. fileURLToPath (rather than URL.pathname) is
// required for correctness on Windows (URL.pathname yields "/C:/...") and for
// install paths containing percent-encoded characters such as spaces.
const __dirname = Path.dirname(fileURLToPath(import.meta.url));

/**
 * CLI entry point: extract the audio track from the input video, transcribe it
 * locally with whisper.cpp, then print the transcript to stdout or save it to
 * the file given by --output-path.
 */
async function main(): Promise<void> {
	// parse command line arguments
	const program = new Commander.Command();
	program
		// requiredOption: commander exits with a usage error when -i is missing,
		// instead of silently passing `undefined` down to ffmpeg.
		.requiredOption('-i, --input-video-path <path>', 'Path to the video file to transcribe')
		.option('-o, --output-path <path>', 'Path to save the transcribed captions')
		.option('-v, --verbose', 'Enable verbose logging', false)
		.parse(process.argv);

	const options = program.opts<{
		inputVideoPath: string;
		outputPath?: string;
		verbose: boolean;
	}>();

	// Work directory lives next to the installed package. It is gitignored
	// (and absent from the published tarball), so create it on first use.
	const outputDirPath = Path.join(__dirname, '../output');
	await Fs.promises.mkdir(outputDirPath, { recursive: true });

	// Extract audio from video and save as .wav file
	// convert audio to wav 16KHz Mono - the required format for whisper.cpp
	const extractedAudioPath = Path.join(outputDirPath, 'extracted_audio.wav');
	await AudioHelper.convertToWav16kMono(options.inputVideoPath, extractedAudioPath);

	if (options.verbose) {
		console.log('Transcribing audio to captions...');
		console.time('transcribeAudioToCaptions');
	}

	// transcribe audio to captions (first run installs whisper.cpp + model)
	const whisperPath = Path.join(outputDirPath, 'whisper.cpp');
	const transcribedText = await TranscriptionWordHelper.transcribeVoice(whisperPath, extractedAudioPath, { verbose: options.verbose });

	if (options.verbose) {
		console.timeEnd('transcribeAudioToCaptions');
	}

	// print transcription result
	if (options.outputPath === undefined) {
		console.log(transcribedText);
	} else {
		// save transcription result to file
		await Fs.promises.writeFile(options.outputPath, transcribedText, 'utf-8');
	}
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
//
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

// Surface rejections as a non-zero exit code instead of an unhandled-rejection
// crash (the original `void main()` discarded failures entirely).
main().catch((error: unknown) => {
	console.error(error instanceof Error ? error.message : error);
	process.exitCode = 1;
});
@@ -0,0 +1,57 @@
1
+ // node import
2
+ import ChildProcess from "node:child_process";
3
+
4
+ // npm import
5
+ import ffmpegStaticPath from 'ffmpeg-static';
6
+
7
+ /**
8
+ * Generic helper class for audio/voice generation and processing using the OpenAI API and MusicMetadata library.
9
+ * - MUST NOT be specific to the dialog video use case, it should be reusable for other use cases as well.
10
+ */
11
+
12
+
13
+ export class AudioHelper {
14
+
15
+ /**
16
+ * Convert an audio file to an MP3 file using ffmpeg.
17
+ *
18
+ * @param inputAudioPath input audio path readable by 'ffmpeg-static' npm package
19
+ * @param outputAudioPath output audio path of the .wav
20
+ * @returns
21
+ */
22
+ static async convertToMp3(inputAudioPath: string, outputAudioPath: string) {
23
+ const ffmpegPath = ffmpegStaticPath;
24
+ const command = `${ffmpegPath} -i "${inputAudioPath}" "${outputAudioPath}" -y`;
25
+ return new Promise<void>((resolve, reject) => {
26
+ ChildProcess.exec(command, (error: Error | null, _stdout: string, _stderr: string) => {
27
+ if (error) {
28
+ reject(`Error converting audio: ${error.message}`);
29
+ } else {
30
+ resolve();
31
+ }
32
+ });
33
+ });
34
+ }
35
+
36
+ /**
37
+ * Convert an audio file to a 16KHz mono wav file using ffmpeg. The output file is required for the whisper.cpp transcriber.
38
+ *
39
+ * @param inputAudioPath input audio path readable by 'ffmpeg-static' npm package
40
+ * @param outputAudioPath output audio path of the .wav
41
+ * @returns
42
+ */
43
+ static async convertToWav16kMono(inputAudioPath: string, outputAudioPath: string) {
44
+ const ffmpegPath = ffmpegStaticPath;
45
+ const command = `${ffmpegPath} -i "${inputAudioPath}" -ar 16000 -ac 1 -c:a pcm_s16le "${outputAudioPath}" -y`;
46
+ return new Promise<void>((resolve, reject) => {
47
+ ChildProcess.exec(command, (error: Error | null, _stdout: string, _stderr: string) => {
48
+ if (error) {
49
+ reject(`Error converting audio: ${error.message}`);
50
+ } else {
51
+ resolve();
52
+ }
53
+ });
54
+ });
55
+ }
56
+
57
+ }
@@ -0,0 +1,100 @@
1
+ // npm import
2
+ import { downloadWhisperModel, installWhisperCpp, transcribe, toCaptions, Language } from '@remotion/install-whisper-cpp';
3
+
4
+
5
+ ///////////////////////////////////////////////////////////////////////////////
6
+ ///////////////////////////////////////////////////////////////////////////////
7
+ // Class
8
+ ///////////////////////////////////////////////////////////////////////////////
9
+ ///////////////////////////////////////////////////////////////////////////////
10
+
11
+ /**
12
+ * from https://www.remotion.dev/docs/install-whisper-cpp/
13
+ * from https://huggingface.co/openai/whisper-medium
14
+ */
15
+ export class TranscriptionWordHelper {
16
+
17
+ /**
18
+ * Transcribe an audio file to captions using whisper.cpp. The audio file MUST be a wav file in 16KHz PCM Mono format. You can use
19
+ * the convertToWav16kMono function in this class to convert an audio file to the required format.
20
+ *
21
+ * **NOTE: This function may be long as it may need to install whisper.cpp and download the whisper model the
22
+ * first time it is run.**
23
+ *
24
+ * @param whisperPath path to the whisper.cpp installation directory. This is where the whisper.cpp binary and models will be installed.
25
+ * @param audioWavPath path to .wav file - the file MUST be a wav file in 16KHz PCM Mono
26
+ * @param options.verbose if true, will print the output of the various steps (installing whisper.cpp, downloading model, transcribing). Default is false.
27
+ * @param options.modelName the whisper model to use. Default is 'medium.en'. See https://www.remotion.dev/docs/install-whisper-cpp/#models for available models. "medium.en" is a good default for English audio. If you are transcribing non-English audio, you may want to use "medium" or a model without the ".en" suffix.
28
+ * @param options.language the language of the audio. This is used to help whisper.cpp transcribe better. Default is 'en' (English). See https://www.remotion.dev/docs/install-whisper-cpp/#languages for available languages
29
+ */
30
+ static async transcribeVoice(whisperPath: string, audioWavPath: string, {
31
+ verbose = false,
32
+ modelName = 'medium.en',
33
+ language = 'en'
34
+ }: {
35
+ verbose?: boolean;
36
+ modelName?: "medium.en" | "base" | "base.en" | "large-v1" | "large-v2" | "large-v3" | "large-v3-turbo" | "medium" | "small" | "small.en" | "tiny" | "tiny.en";
37
+ language?: Language;
38
+ } = {}): Promise<string> {
39
+ // install if needed
40
+ await this._installIfNeeded(whisperPath, { verbose, modelName });
41
+
42
+ // transcribe
43
+ const whisperCppOutput = await transcribe({
44
+ model: modelName,
45
+ whisperPath: whisperPath,
46
+ whisperCppVersion: '1.5.5',
47
+ inputPath: audioWavPath,
48
+ language: language,
49
+ tokenLevelTimestamps: true,
50
+ splitOnWord: false,
51
+ printOutput: verbose ? true : false,
52
+ });
53
+
54
+ // Optional: Apply remotion recommended postprocessing
55
+ const { captions } = toCaptions({
56
+ whisperCppOutput,
57
+ });
58
+
59
+ let transcribedText: string = ''
60
+ for (const caption of captions) {
61
+ transcribedText = transcribedText.concat(caption.text);
62
+ }
63
+
64
+ return transcribedText
65
+ }
66
+
67
+
68
+ ///////////////////////////////////////////////////////////////////////////////
69
+ ///////////////////////////////////////////////////////////////////////////////
70
+ // Private functions
71
+ ///////////////////////////////////////////////////////////////////////////////
72
+ ///////////////////////////////////////////////////////////////////////////////
73
+
74
+ /**
75
+ * Install whisper.cpp and the whisper model if they are not already installed.
76
+ * @param whisperPath path to the whisper.cpp installation directory. This is where the whisper.cpp binary and models will be installed.
77
+ * @param verbose
78
+ */
79
+ private static async _installIfNeeded(whisperPath: string, {
80
+ verbose = false,
81
+ modelName = 'medium.en',
82
+ }: {
83
+ verbose?: boolean;
84
+ modelName?: "medium.en" | "base" | "base.en" | "large-v1" | "large-v2" | "large-v3" | "large-v3-turbo" | "medium" | "small" | "small.en" | "tiny" | "tiny.en";
85
+ } = {}) {
86
+ // Install whisper.cpp if needed
87
+ const whisperAlreadyExisted = await installWhisperCpp({
88
+ to: whisperPath,
89
+ version: '1.5.5',
90
+ printOutput: verbose ? true : false,
91
+ });
92
+
93
+ // Download the whisper model if needed
94
+ const modelAlreadyExisted = await downloadWhisperModel({
95
+ model: modelName,
96
+ folder: whisperPath,
97
+ printOutput: verbose ? true : false,
98
+ });
99
+ }
100
+ }
package/tsconfig.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2020",
4
+ "module": "node16",
5
+ "lib": [
6
+ "ES2020",
7
+ "DOM"
8
+ ],
9
+ "types": [
10
+ "node"
11
+ ],
12
+ "outDir": "./dist",
13
+ "rootDir": "./src",
14
+ "strict": true,
15
+ "esModuleInterop": true,
16
+ "skipLibCheck": true,
17
+ "forceConsistentCasingInFileNames": true,
18
+ "resolveJsonModule": true,
19
+ "declaration": true,
20
+ "declarationMap": true,
21
+ "sourceMap": true
22
+ },
23
+ "include": [
24
+ "src/**/*"
25
+ ],
26
+ "exclude": [
27
+ "node_modules",
28
+ "dist"
29
+ ]
30
+ }