@wovin/tranz-cli 0.0.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,177 @@
+ var _a;
+ import { Command, Flags, CliUx } from '@oclif/core';
+ import ffmpeg from 'fluent-ffmpeg';
+ import * as fs from 'node:fs';
+ import { getFileInfo, getName, getNameWithExt } from "../../utils/file-utils.js";
+ // @ts-expect-error
+ import normalize from '@dharmendrasha/ffmpeg-normalize';
+ import path from 'node:path';
+ const doSilenceRemoval = async (inputPath, silence, sildur, output = `${inputPath}-silenceRemoved.mp3`) => {
+     // Wrap the fluent-ffmpeg chain in a promise so callers can actually await completion
+     return new Promise((resolveFx, rejectFx) => {
+         ffmpeg(inputPath, { logger: console })
+             .audioFilters([
+                 {
+                     filter: 'silenceremove',
+                     options: { stop_threshold: silence, stop_periods: -1, stop_duration: sildur /* , stop_silence: 0.25, start_silence: 0.25 */ }
+                 }
+             ])
+             .on('stderr', function (stderrLine) {
+                 console.log('Stderr output: ' + stderrLine);
+             })
+             .on('error', function (error) {
+                 console.error('Error:', error, silence);
+                 rejectFx(error);
+             })
+             .on('end', function () {
+                 console.log('Silence removal succeeded, output path:', output);
+                 resolveFx(output);
+             })
+             .save(output);
+     });
+ };
+ const doNormalization = async (inputPath, outputDir) => {
+     const { name, ext } = getFileInfo(inputPath);
+     const output = `${outputDir}/${name}_normalized${ext}`;
+     const normalized = await normalize({
+         input: inputPath,
+         output,
+         loudness: {
+             normalization: 'ebuR128',
+             target: {
+                 input_i: -23,
+                 input_lra: 7.0,
+                 input_tp: -2.0
+             }
+         }
+     }).catch((error) => {
+         console.error(error);
+     });
+     console.log(normalized);
+     const finalOutputPath = `${outputDir}/${getName(output)}-prepared.wav`;
+     return new Promise(resolveFx => {
+         ffmpeg(output, { logger: console })
+             .audioFrequency(16000) // required for whisper
+             .outputOptions(['-ac 1', '-c:a pcm_s16le'])
+             .save(finalOutputPath)
+             .on('end', () => {
+                 resolveFx(finalOutputPath);
+             });
+     });
+ };
+ class Prep extends Command {
+     /**
+      * doPrep
+      */
+     async doPrep(props) {
+         const { input, outdir: outdirFlag, silthr, sildur, norm, verbose } = props;
+         // HACK: ESM imports get screwed in oclif, so stub the logger locally
+         const { DEBUG, ERROR } = { DEBUG: console.log, ERROR: (...Any) => new Error(JSON.stringify(Any)) }; // Logger.setup(verbose ? Logger.DEBUG : Logger.INFO) // eslint-disable-line no-unused-vars
+         DEBUG('Starting Prep with:', props);
+         const silenceInfo = [];
+         const info = { traces: [], silenceInfo };
+         const sourceFileName = getNameWithExt(input);
+         const cachedRnnnPath = await _a.ensureRnnnIsCached(this.config);
+         // const isCreateInterimArtifacts = true // ? consider implementing chaining if this is false, and add a param
+         const phase1input = input;
+         const phase1Name = getName(phase1input);
+         const outDir = `${outdirFlag}/${phase1Name}`;
+         fs.mkdirSync(path.normalize(outDir), { recursive: true });
+         const phase1OutputPath = `${outDir}/${phase1Name}-denoised.wav`;
+         const ffmpegChain = ffmpeg(phase1input, { logger: console });
+         // Wrap the ffmpeg run in a promise so callers can await the whole pipeline
+         return new Promise((resolveFx, rejectFx) => {
+             ffmpegChain
+                 .audioFilters([
+                     {
+                         filter: 'arnndn',
+                         options: { m: cachedRnnnPath, mix: 1 } // mix: -1 would keep only the noise, see https://ffmpeg.org/ffmpeg-filters.html#arnndn
+                     },
+                     {
+                         filter: 'silencedetect',
+                         options: { n: silthr, d: sildur }
+                     },
+                 ])
+                 .audioFrequency(16000) // required for whisper
+                 .outputOptions(['-ac 1', '-c:a pcm_s16le'])
+                 .on('codecData', function (data) {
+                     const codecInfo = 'Input is ' + data.audio + ' audio with ' + data.video + ' video';
+                     info.traces.push([codecInfo]);
+                     DEBUG(codecInfo);
+                 })
+                 .on('stderr', function (stderrLine) {
+                     if (stderrLine.includes(' silence_end: ')) {
+                         const endInfo = stderrLine.split(' silence_end: ')[1].split(' | silence_duration: ');
+                         const [endSec, duration] = endInfo.map(s => +s);
+                         silenceInfo.push({ stSec: endSec - duration, endSec, duration });
+                     }
+                 })
+                 .on('error', function (error) {
+                     rejectFx(ERROR('Error:', error, silthr));
+                 })
+                 .on('end', async function () {
+                     const successTrace = ['Phase 1 succeeded: ', { silenceInfo, phase1OutputPath }];
+                     info.traces.push(successTrace);
+                     DEBUG(successTrace);
+                     CliUx.ux.table(silenceInfo, { stSec: {}, endSec: {}, duration: {} });
+                     const outputPath = norm ? await _a.normalize(phase1OutputPath, outDir) : phase1OutputPath;
+                     const results = { ffmpegChain, outputPath };
+                     resolveFx({ props, info, results });
+                 })
+                 .save(phase1OutputPath);
+         });
+     }
+     /**
+      * only for cli parsing
+      */
+     async run() {
+         const { args: { input }, flags: { outdir, silthr, sildur, norm, verbose } } = await this.parse(_a);
+         const report = await this.doPrep({ input, outdir, silthr, sildur, norm, verbose });
+         verbose && console.log(report);
+     }
+ }
+ _a = Prep;
+ Prep.description = 'Prepare audio file - normalize, noise reduce, split on silence';
+ Prep.DEFAULTS = {
+     SILDUR: '1.3',
+     SILBUF: 0.2,
+     SILTHR: '-35dB',
+     RNNN: 'https://raw.githubusercontent.com/GregorR/rnnoise-models/master/somnolent-hogwash-2018-09-01/sh.rnnn'
+ };
+ Prep.args = [{ name: 'input', description: 'input file', required: true }];
+ Prep.flags = {
+     norm: Flags.boolean({ char: 'n', description: 'do normalization?', required: false, default: false }),
+     verbose: Flags.boolean({ char: 'v', description: 'trace more?', required: false, default: false }),
+     outdir: Flags.string({ char: 'o', description: 'output directory', required: false, default: './out' }),
+     silthr: Flags.string({ char: 's', description: 'silence threshold', required: false, default: _a.DEFAULTS.SILTHR }),
+     sildur: Flags.string({ char: 'd', description: 'silence duration', required: false, default: _a.DEFAULTS.SILDUR }),
+ };
+ /**
+  * subroutines
+  */
+ Prep.normalize = doNormalization;
+ Prep.removeSilence = doSilenceRemoval;
+ /**
+  * cli only (depends on config and fs situation)
+  */
+ Prep.ensureRnnnIsCached = async (config) => {
+     const cachedRnnnPath = `${config.cacheDir}/sh.rnnn`;
+     const isRnnnExisting = fs.existsSync(cachedRnnnPath);
+     if (!isRnnnExisting) {
+         console.log(`
+ rnnn for noise removal missing
+ Fetching ${_a.DEFAULTS.RNNN} into ${cachedRnnnPath}
+ `);
+         const data = await fetch(_a.DEFAULTS.RNNN);
+         fs.writeFileSync(cachedRnnnPath, await data.text());
+     }
+     else {
+         console.log(`Found ${cachedRnnnPath} \n`);
+     }
+     return cachedRnnnPath;
+ };
+ Prep.examples = [
+     `$ tranz prep 'AI_Could_Be_The_End_Of_Democracy.m4a' -n -s -35dB -d 1.5
+ --> sets silence threshold to -35dB and min silence duration to 1.5s
+ --> runs normalization and noise removal
+ `,
+ ];
+ export default Prep;
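
A minimal usage sketch of the two static helpers the Prep command exports above (not part of the package: the import path is an assumption to be adjusted to the installed layout, './talk.mp3' is a hypothetical input, and it relies on the promise-wrapped removeSilence shown above):

    // hedged sketch: drive Prep's static helpers from a plain ESM module
    import Prep from './commands/prep/index.js'; // assumed path to the compiled command

    // resolves with the silence-stripped output path once ffmpeg emits 'end'
    const cleaned = await Prep.removeSilence('./talk.mp3', '-35dB', '1.3');
    // normalizes loudness (EBU R128), then resamples to 16 kHz mono pcm_s16le for whisper
    const prepared = await Prep.normalize(cleaned, './out');
    console.log('ready for transcription:', prepared);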
@@ -0,0 +1,43 @@
+ import { Command } from "@oclif/core";
+ export default class Scribe extends Command {
+     static DEFAULTS: {
+         DIARIZE: boolean;
+         SILDUR: string;
+         SILBUF: number;
+         SILTHR: string;
+         MODEL_KEYS: {
+             tinyd: string;
+             small: string;
+             medium: string;
+         };
+         MODELS: {
+             tinyd: string;
+             small: string;
+             medium: string;
+         };
+     };
+     static description: string;
+     static examples: string[];
+     static flags: {
+         prep: import("@oclif/core/lib/interfaces/parser").BooleanFlag<boolean>;
+         withGPU: import("@oclif/core/lib/interfaces/parser").BooleanFlag<boolean>;
+         norm: import("@oclif/core/lib/interfaces/parser").BooleanFlag<boolean>;
+         output: import("@oclif/core/lib/interfaces/parser").OptionFlag<string>;
+         diarization: import("@oclif/core/lib/interfaces/parser").BooleanFlag<boolean>;
+         separate_speakers: import("@oclif/core/lib/interfaces/parser").BooleanFlag<boolean>;
+         provider: import("@oclif/core/lib/interfaces/parser").OptionFlag<string>;
+         model: import("@oclif/core/lib/interfaces/parser").OptionFlag<string | undefined>;
+         language: import("@oclif/core/lib/interfaces/parser").OptionFlag<string | undefined>;
+         timestamps: import("@oclif/core/lib/interfaces/parser").OptionFlag<string | undefined>;
+         "no-pauses": import("@oclif/core/lib/interfaces/parser").BooleanFlag<boolean>;
+         "auto-split": import("@oclif/core/lib/interfaces/parser").BooleanFlag<boolean>;
+         "max-segment": import("@oclif/core/lib/interfaces/parser").OptionFlag<number>;
+         "split-silence-dur": import("@oclif/core/lib/interfaces/parser").OptionFlag<string>;
+     };
+     static args: {
+         name: string;
+         description: string;
+         required: boolean;
+     }[];
+     run(): Promise<void>;
+ }
@@ -0,0 +1,383 @@
+ import { Command, Flags } from "@oclif/core";
+ import * as fs from "node:fs";
+ import path from "node:path";
+ import { getName } from "../../utils/file-utils.js";
+ import { loadSecretOrThrow } from "../../utils/secrets.js";
+ import { formatBytes } from "../../utils/formatBytes.js";
+ import Prep from "../prep/index.js";
+ import { createProvider, VOXTRAL_LIMITS, formatTranscriptWithPauses, autoSplitAudio, getAudioDuration, mergeTranscriptionResults, } from "@wovin/tranz";
+ class Scribe extends Command {
+     async run() {
+         const { args: { input }, flags: {
+             output: outputFlag, diarization, separate_speakers, prep, norm,
+             provider: providerFlag, model, language, timestamps, withGPU,
+             "no-pauses": noPauses, "auto-split": autoSplit,
+             "max-segment": maxSegment, "split-silence-dur": splitSilenceDur,
+         } } = await this.parse(Scribe);
+         // Use the new diarization flag, falling back to the deprecated separate_speakers
+         const enableDiarization = diarization || separate_speakers;
+         let inputPath = input;
+         const sourceFileName = getName(inputPath);
+         // 🎙️ Header
+         this.log("\n🎙️ Transcribing Audio\n");
+         // 🎵 Loading audio file with size
+         const fileStats = fs.statSync(inputPath);
+         const fileSize = formatBytes(fileStats.size);
+         this.log(`🎵 Loading audio file: ${inputPath}`);
+         this.log(`   Size: ${fileSize}`);
+         if (prep) {
+             const prepCommand = new Prep([], this.config);
+             const prepResult = await prepCommand.doPrep({
+                 input: inputPath,
+                 outdir: outputFlag,
+                 norm,
+                 verbose: true,
+                 silthr: "-35dB",
+                 sildur: "1.2",
+             });
+             inputPath = prepResult.results.outputPath;
+         }
+         const outputDir = `${outputFlag}/${sourceFileName}`;
+         fs.mkdirSync(path.normalize(outputDir), { recursive: true });
+         // Normalize the provider name (voxtral -> mistral)
+         const providerName = providerFlag === "voxtral" ? "mistral" : providerFlag;
+         // Handle API-based providers
+         let apiKey;
+         if (providerName === "mistral" || providerName === "greenpt") {
+             if (withGPU) {
+                 this.warn(`--withGPU flag is not applicable to ${providerName} provider (API-based)`);
+             }
+             if (providerName === "mistral" && timestamps && language) {
+                 this.error(`--timestamps and --language cannot be used together (Mistral API limitation)\nSee: https://docs.mistral.ai/capabilities/audio_transcription`);
+             }
+             // Load the API key
+             const envVarMap = {
+                 mistral: { envVar: "MISTRAL_API_KEY", secretPath: "secret/mistral" },
+                 greenpt: { envVar: "GREENPT_API_KEY", secretPath: "secret/greenpt" },
+             };
+             const { envVar, secretPath } = envVarMap[providerName];
+             apiKey = await loadSecretOrThrow(envVar, secretPath, `${providerName} API key`);
+         }
+         // ⚙️ Configuration summary
+         const actualModel = model || (providerName === "mistral" ? "voxtral-mini-latest" : providerName === "greenpt" ? "whisper-large-v3" : "medium");
+         this.log(`\n⚙️ Configuration:`);
+         this.log(`   Provider: ${providerName}`);
+         this.log(`   Model: ${actualModel}`);
+         if (language) {
+             this.log(`   Language: ${language}`);
+         }
+         if (timestamps) {
+             this.log(`   Timestamps: ${timestamps}`);
+         }
+         if (enableDiarization) {
+             this.log(`   Diarization: enabled`);
+         }
+         if (autoSplit) {
+             this.log(`   Auto-split: enabled (max ${maxSegment}s segments)`);
+         }
+         // Create the provider
+         const provider = createProvider(providerName, this.config);
+         // Check whether the provider has limits that require auto-split
+         let effectiveAutoSplit = autoSplit;
+         let effectiveMaxSegment = maxSegment;
+         // For the Mistral/Voxtral provider, auto-enable splitting if the audio exceeds the limit
+         if (providerName === "mistral") {
+             const audioDuration = await getAudioDuration(inputPath);
+             if (audioDuration > VOXTRAL_LIMITS.recommendedMaxDurationSec && !autoSplit) {
+                 this.log(`\n⚠️ Audio duration (${Math.floor(audioDuration / 60)}m) exceeds Voxtral limit (${VOXTRAL_LIMITS.recommendedMaxDurationSec / 60}m)`);
+                 this.log(`   Auto-enabling split mode...`);
+                 effectiveAutoSplit = true;
+                 effectiveMaxSegment = VOXTRAL_LIMITS.recommendedMaxDurationSec;
+             }
+         }
+         // Determine whether we need to auto-split
+         let result;
+         let segments;
+         if (effectiveAutoSplit) {
+             // Get the audio duration to check whether splitting is needed
+             const audioDuration = await getAudioDuration(inputPath);
+             this.log(`\n📏 Audio duration: ${Math.floor(audioDuration / 60)}m ${(audioDuration % 60).toFixed(0)}s`);
+             if (audioDuration > effectiveMaxSegment) {
+                 // Split the audio at silence boundaries
+                 this.log(`\n✂️ Splitting audio (exceeds ${effectiveMaxSegment}s max)...`);
+                 const splitConfig = {
+                     maxDurationSec: effectiveMaxSegment,
+                     minSilenceDurSec: parseFloat(splitSilenceDur),
+                     preferLongerSilence: true,
+                 };
+                 const segmentsDir = path.join(outputDir, "segments");
+                 segments = await autoSplitAudio(inputPath, segmentsDir, splitConfig);
+                 this.log(`   Created ${segments.length} segments`);
+                 for (const seg of segments) {
+                     this.log(`   - Segment ${seg.index}: ${seg.startSec.toFixed(1)}s - ${seg.endSec.toFixed(1)}s (${seg.durationSec.toFixed(1)}s)`);
+                 }
+                 // Transcribe each segment
+                 this.log(`\n⏳ Transcribing ${segments.length} segments...`);
+                 const segmentResults = [];
+                 for (const seg of segments) {
+                     this.log(`   Transcribing segment ${seg.index + 1}/${segments.length}...`);
+                     const segResult = await provider.transcribe({
+                         audioPath: seg.outputPath,
+                         outputDir,
+                         model,
+                         language,
+                         timestampGranularity: timestamps,
+                         diarize: enableDiarization,
+                         apiKey,
+                     });
+                     segmentResults.push(segResult);
+                     if (segResult.error) {
+                         this.warn(`   Segment ${seg.index} error: ${segResult.error}`);
+                     }
+                 }
+                 // Merge the results
+                 result = mergeTranscriptionResults(segmentResults, segments);
+                 this.log(`\n🔗 Merged ${segments.length} segment transcriptions`);
+             }
+             else {
+                 this.log(`\n⏳ Transcribing (no split needed)...`);
+                 result = await provider.transcribe({
+                     audioPath: inputPath,
+                     outputDir,
+                     model,
+                     language,
+                     timestampGranularity: timestamps,
+                     diarize: enableDiarization,
+                     apiKey,
+                 });
+             }
+         }
+         else {
+             // ⏳ Progress indicator
+             this.log(`\n⏳ Transcribing...`);
+             result = await provider.transcribe({
+                 audioPath: inputPath,
+                 outputDir,
+                 model,
+                 language,
+                 timestampGranularity: timestamps,
+                 diarize: enableDiarization,
+                 apiKey,
+             });
+         }
+         if (result.error) {
+             throw new Error(`Transcription failed: ${result.error}`);
+         }
+         // Determine the output filename based on the provider
+         let outputFileName;
+         switch (providerName) {
+             case "mistral":
+                 outputFileName = `${sourceFileName}-transcript-mistral.json`;
+                 break;
+             case "greenpt":
+                 outputFileName = `${sourceFileName}-transcript-greenpt.json`;
+                 break;
+             case "whisper":
+             default:
+                 outputFileName = `${sourceFileName}-transcript.json`;
+         }
+         const outputPath = `${outputDir}/${outputFileName}`;
+         // For Whisper, the file is already saved by the provider (unless we auto-split);
+         // for API providers, or after auto-split, save the result here
+         if (providerName !== "whisper" || segments) {
+             fs.writeFileSync(outputPath, JSON.stringify(result.rawResponse || result, null, 2));
+         }
+         // Show segment info if we split
+         const mergedResult = result;
+         if (mergedResult.totalSegments && mergedResult.totalSegments > 1) {
+             this.log(`\n📊 Split Summary:`);
+             this.log(`   Total segments: ${mergedResult.totalSegments}`);
+         }
+         // 📝 Transcript preview
+         this.log(`\n📝 Transcript preview:`);
+         let previewText = result.text;
+         // Apply pause formatting if word-level data is available and not disabled
+         const words = result.rawResponse?.words || result.words;
+         if (!noPauses && words && words.length > 0) {
+             previewText = formatTranscriptWithPauses(result.text, words);
+         }
+         // Show the first 200 chars of the preview
+         const previewLimit = 200;
+         const truncatedPreview = previewText.length > previewLimit
+             ? previewText.substring(0, previewLimit) + "..."
+             : previewText;
+         this.log(`   ${truncatedPreview.replace(/\n/g, "\n   ")}`);
+         // 📊 Metadata summary
+         this.log(`\n📊 Metadata:`);
+         const duration = result.rawResponse?.duration || result.duration;
+         if (duration) {
+             const minutes = Math.floor(duration / 60);
+             const seconds = (duration % 60).toFixed(1);
+             this.log(`   Duration: ${minutes}m ${seconds}s (${duration.toFixed(1)}s)`);
+         }
+         // Show the actual model if returned by the API (may differ from the requested one)
+         if (result.model) {
+             this.log(`   Model (actual): ${result.model}`);
+         }
+         const wordCount = result.text.split(/\s+/).filter((w) => w.length > 0).length;
+         const charCount = result.text.length;
+         this.log(`   Words: ${wordCount}`);
+         this.log(`   Characters: ${charCount}`);
+         // 💰 Detailed cost breakdown
+         this.log(`\n💰 Cost:`);
+         if (duration) {
+             const durationMinutes = duration / 60;
+             this.log(`   Duration: ${durationMinutes.toFixed(2)} minutes`);
+             // Provider-specific cost rates
+             if (providerName === "mistral") {
+                 // $0.003 per minute for Voxtral Transcribe 2
+                 // Source: https://docs.mistral.ai/models/voxtral-mini-transcribe-26-02
+                 const audioRate = 0.003;
+                 const audioCost = durationMinutes * audioRate;
+                 this.log(`   Rate: $${audioRate}/min`);
+                 this.log(`   Audio cost: $${audioCost.toFixed(4)}`);
+                 // Token cost if available
+                 const usage = result.rawResponse?.usage;
+                 if (usage?.total_tokens) {
+                     const tokenRate = 0.000003; // ~$3/1M tokens estimate for Mistral
+                     const tokenCost = usage.total_tokens * tokenRate;
+                     this.log(`   Token cost: $${tokenCost.toFixed(4)} (${usage.total_tokens} tokens)`);
+                     this.log(`   Total: $${(audioCost + tokenCost).toFixed(4)}`);
+                 }
+                 else {
+                     this.log(`   Total: $${audioCost.toFixed(4)}`);
+                 }
+             }
+             else if (providerName === "greenpt") {
+                 const audioRate = 0.006; // $0.006 per minute estimate
+                 const audioCost = durationMinutes * audioRate;
+                 this.log(`   Rate: $${audioRate}/min`);
+                 this.log(`   Total: $${audioCost.toFixed(4)}`);
+             }
+             else {
+                 this.log(`   Local whisper: no API cost`);
+             }
+         }
+         else {
+             this.log(`   Duration data not available for cost calculation`);
+         }
+         // 💾 Save notification
+         this.log(`\n💾 Saved to: ${outputPath}`);
+         this.log(`\n✅ Transcription completed successfully!\n`);
+     }
+ }
+ Scribe.DEFAULTS = {
+     DIARIZE: false,
+     SILDUR: "1.3",
+     SILBUF: 0.2,
+     SILTHR: "-35dB",
+     MODEL_KEYS: {
+         tinyd: "ggml-small.en-tdrz.bin",
+         small: "ggml-small.bin",
+         medium: "ggml-medium.bin",
+     },
+     MODELS: {
+         tinyd: "https://huggingface.co/akashmjn/tinydiarize-whisper.cpp/resolve/main/ggml-small.en-tdrz.bin",
+         small: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin",
+         medium: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.bin",
+     },
+ };
+ Scribe.description = "Transcribe audio file - optionally prepare first";
+ Scribe.examples = [
+     `$ tranz scribe 'path/to/16khz-audiofile.wav'
+ runs whisper and saves a transcription json
+
+ $ tranz scribe 'path/to/whatever-audiofile.mp3' -p
+ first runs prep, then whisper, and saves a transcription json
+
+ $ tranz scribe 'path/to/audiofile.mp3' -r mistral -m voxtral-mini-latest
+ transcribe using the Mistral provider with the specified model
+
+ $ tranz scribe 'path/to/audiofile.mp3' -r mistral --timestamps segment
+ transcribe using Mistral with segment-level timestamps
+
+ $ tranz scribe 'path/to/audiofile.mp3' -r mistral --diarization --timestamps word
+ transcribe using Mistral with speaker diarization and word-level timestamps
+
+ $ tranz scribe 'path/to/audiofile.mp3' -r greenpt -l en
+ transcribe using the GreenPT provider with a language specification
+
+ $ tranz scribe 'path/to/long-audio.mp3' --auto-split --max-segment 300
+ automatically split long audio at silence boundaries (max 5 min segments)
+ `,
+ ];
+ Scribe.flags = {
+     prep: Flags.boolean({
+         char: "p",
+         description: "do prep?",
+         required: false,
+         default: false,
+     }),
+     withGPU: Flags.boolean({
+         char: "g",
+         description: "use gpu?",
+         required: false,
+         default: false,
+     }),
+     norm: Flags.boolean({
+         char: "n",
+         description: "do normalization?",
+         required: false,
+         default: false,
+     }),
+     output: Flags.string({
+         char: "o",
+         description: "output directory",
+         required: false,
+         default: "./out",
+     }),
+     diarization: Flags.boolean({
+         char: "d",
+         description: "enable speaker diarization",
+         required: false,
+         default: Scribe.DEFAULTS.DIARIZE,
+     }),
+     separate_speakers: Flags.boolean({
+         description: "separate via diarization (deprecated: use --diarization)",
+         required: false,
+         deprecated: {
+             message: "use --diarization instead",
+         },
+     }),
+     provider: Flags.string({
+         char: "r",
+         description: "transcription provider (whisper, mistral, greenpt)",
+         options: ["whisper", "mistral", "voxtral", "greenpt"],
+         default: "whisper",
+     }),
+     model: Flags.string({
+         char: "m",
+         description: "model name (provider-specific)",
+     }),
+     language: Flags.string({
+         char: "l",
+         description: "language code (e.g., en, de, fr)",
+     }),
+     timestamps: Flags.string({
+         description: "enable timestamp granularity (segment, word) - Mistral only",
+         options: ["segment", "word"],
+     }),
+     "no-pauses": Flags.boolean({
+         description: "disable pause-based line break formatting in transcript preview",
+         default: false,
+     }),
+     "auto-split": Flags.boolean({
+         description: "automatically split long audio files at silence boundaries",
+         default: false,
+     }),
+     "max-segment": Flags.integer({
+         description: "maximum segment duration in seconds for auto-split (default: 600)",
+         default: 600,
+     }),
+     "split-silence-dur": Flags.string({
+         description: "minimum silence duration for split points (default: 1.0)",
+         default: "1.0",
+     }),
+ };
+ Scribe.args = [
+     {
+         name: "input",
+         description: "input file",
+         required: true,
+     },
+ ];
+ export default Scribe;
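
A minimal programmatic sketch of the provider flow that run() above implements (not part of the package: it assumes createProvider accepts an oclif Config as its second argument, exactly as run() passes this.config, that MISTRAL_API_KEY is set, and that './interview.wav' is a hypothetical input):

    // hedged sketch: the same transcribe() call shape as run() above
    import { createProvider, getAudioDuration, VOXTRAL_LIMITS } from '@wovin/tranz';

    // config: the oclif Config instance a command would pass as this.config
    export async function transcribeOnce(config, audioPath = './interview.wav') {
        const durationSec = await getAudioDuration(audioPath);
        if (durationSec > VOXTRAL_LIMITS.recommendedMaxDurationSec) {
            console.warn('over the Voxtral limit - run() above would auto-split here');
        }
        const provider = createProvider('mistral', config);
        const result = await provider.transcribe({
            audioPath,
            outputDir: './out',
            model: 'voxtral-mini-latest',
            timestampGranularity: 'segment',
            diarize: false,
            apiKey: process.env.MISTRAL_API_KEY,
        });
        if (result.error) throw new Error(`Transcription failed: ${result.error}`);
        return result.text;
    }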
@@ -0,0 +1,23 @@
+ import { Command } from '@oclif/core';
+ export default class Split extends Command {
+     static description: string;
+     static DEFAULTS: {
+         OUTDIR: string;
+         SILDUR: string;
+         SILBUF: number;
+         SILTHR: string;
+     };
+     static flags: {
+         output: import("@oclif/core/lib/interfaces/parser").OptionFlag<string>;
+         silthr: import("@oclif/core/lib/interfaces/parser").OptionFlag<string>;
+         sildur: import("@oclif/core/lib/interfaces/parser").OptionFlag<string>;
+     };
+     static args: {
+         name: string;
+         description: string;
+         required: boolean;
+     }[];
+     static split: () => void;
+     run(): Promise<void>;
+     static examples: string[];
+ }