@wovin/tranz-cli 0.0.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +447 -0
- package/bin/dev.cmd +3 -0
- package/bin/dev.js +10 -0
- package/bin/run.cmd +3 -0
- package/bin/run.js +6 -0
- package/dist/commands/prep/index.d.ts +59 -0
- package/dist/commands/prep/index.js +177 -0
- package/dist/commands/scribe/index.d.ts +43 -0
- package/dist/commands/scribe/index.js +383 -0
- package/dist/commands/split/index.d.ts +23 -0
- package/dist/commands/split/index.js +95 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +3 -0
- package/dist/utils/file-utils.d.ts +5 -0
- package/dist/utils/file-utils.js +15 -0
- package/dist/utils/formatBytes.d.ts +14 -0
- package/dist/utils/formatBytes.js +22 -0
- package/dist/utils/secrets.d.ts +18 -0
- package/dist/utils/secrets.js +40 -0
- package/oclif.manifest.json +284 -0
- package/package.json +90 -0
package/dist/commands/prep/index.js
@@ -0,0 +1,177 @@
+var _a;
+import { Command, Flags, CliUx } from '@oclif/core';
+import ffmpeg from 'fluent-ffmpeg';
+import * as fs from 'node:fs';
+import { getFileInfo, getName, getNameWithExt } from "../../utils/file-utils.js";
+// @ts-expect-error
+import normalize from '@dharmendrasha/ffmpeg-normalize';
+import path from 'node:path';
+const doSilenceRemoval = async (inputPath, silence, sildur, output = `${inputPath}-silenceRemoved.mp3`) => {
+    ffmpeg(inputPath, { logger: console })
+        .audioFilters([
+            {
+                filter: 'silenceremove',
+                options: { stop_threshold: silence, stop_periods: -1, stop_duration: sildur, /* stop_silence: 0.25, start_silence: 0.25 */ }
+            }
+        ])
+        .on('codecData', function (data) {
+            // console.log('Input is ' + data.audio + ' audio with ' + data.video + ' video');
+        })
+        .on('stderr', function (stderrLine) {
+            console.log('Stderr output: ' + stderrLine);
+        })
+        .on('error', function (error) {
+            console.error('Error:', error, silence);
+        })
+        .on('end', function (stdout, stderr) {
+            console.log('Silence Removal succeeded, output path:', output);
+        })
+        .save(output);
+};
+const doNormalization = async (path, outputDir) => {
+    // const [name, ext] = getExt(path) as string[]
+    const { name, ext } = getFileInfo(path);
+    const output = `${outputDir}/${name}_normalized${ext}`;
+    const normalized = await normalize({
+        input: path,
+        output,
+        loudness: {
+            normalization: 'ebuR128',
+            target: {
+                input_i: -23,
+                input_lra: 7.0,
+                input_tp: -2.0
+            }
+        }
+    }).catch((error) => {
+        // Some error happened
+        console.error(error);
+    });
+    console.log(normalized);
+    const finalOutputPath = `${outputDir}/${getName(output)}-prepared.wav`;
+    return new Promise(resolveFx => {
+        ffmpeg(output, { logger: console })
+            .audioFrequency(16000) // required for whisper
+            .outputOptions(['-ac 1', '-c:a pcm_s16le'])
+            .save(finalOutputPath)
+            .on('end', () => {
+                resolveFx(finalOutputPath);
+            });
+    });
+};
+class Prep extends Command {
+    /**
+     * doPrep
+     */
+    async doPrep(props) {
+        const { input, outdir: outdirFlag, silthr, sildur, norm, verbose } = props;
+        // HACK: ESM imports get screwed in oclif
+        const { DEBUG, ERROR } = { DEBUG: console.log, ERROR: (...Any) => new Error(JSON.stringify(Any)) }; // Logger.setup(verbose ? Logger.DEBUG : Logger.INFO) // eslint-disable-line no-unused-vars
+        DEBUG('Starting Prep with:', props);
+        return new Promise(async (resolveFx, rejectFx) => {
+            const silenceInfo = [];
+            const info = { traces: [], silenceInfo };
+            const sourceFileName = getNameWithExt(input);
+            const cachedRnnnPath = await _a.ensureRnnnIsCached(this.config);
+            // const isCreateInterimArtifacts = true // ? consider implementing chaining if this is false and add a param
+            const phase1input = input;
+            const phase1Name = getName(phase1input);
+            const outDir = `${outdirFlag}/${phase1Name}`;
+            fs.mkdirSync(path.normalize(outDir), { recursive: true });
+            const phase1OutputPath = `${outDir}/${phase1Name}-denoised.wav`;
+            const ffmpegChain = ffmpeg(phase1input, { logger: console });
+            ffmpegChain
+                .audioFilters([
+                    {
+                        filter: 'arnndn',
+                        options: { m: cachedRnnnPath, mix: 1 } // mix: -1 means keep only noise, see https://ffmpeg.org/ffmpeg-filters.html#arnndn
+                    },
+                    {
+                        filter: 'silencedetect',
+                        options: { n: silthr, d: sildur }
+                    },
+                ])
+                .audioFrequency(16000) // required for whisper
+                .outputOptions(['-ac 1', '-c:a pcm_s16le'])
+                .on('codecData', function (data) {
+                    const codecInfo = 'Input is ' + data.audio + ' audio with ' + data.video + ' video';
+                    info.traces.push([codecInfo]);
+                    DEBUG(codecInfo);
+                })
+                .on('stderr', function (stderrLine) {
+                    if (stderrLine.includes(' silence_end: ')) {
+                        const endInfo = stderrLine.split(' silence_end: ')[1].split(' | silence_duration: ');
+                        const [endSec, duration] = endInfo.map(s => +s);
+                        silenceInfo.push({ stSec: endSec - duration, endSec, duration });
+                    }
+                })
+                .on('error', function (error) {
+                    rejectFx(ERROR('Error:', error, silthr));
+                })
+                .on('end', async function (stdout, stderr) {
+                    const successTrace = ['Phase2 succeeded: ', { silenceInfo, phase2OutputPath: phase1OutputPath }];
+                    info.traces.push(successTrace);
+                    DEBUG(successTrace);
+                    CliUx.ux.table(silenceInfo, { stSec: {}, endSec: {}, duration: {} });
+                    const outputPath = norm ? await _a.normalize(phase1OutputPath, outDir) : phase1OutputPath;
+                    const results = { ffmpegChain, outputPath };
+                    resolveFx({ props, info, results });
+                })
+                .save(phase1OutputPath);
+        });
+    }
+    /**
+     * only for cli parsing
+     */
+    async run() {
+        const { args: { input }, flags, flags: { outdir, silthr, sildur, norm, verbose } } = await this.parse(_a);
+        const report = await this.doPrep({ input, outdir, silthr, sildur, norm, verbose });
+        verbose && console.log(report);
+    }
+}
+_a = Prep;
+Prep.description = 'Prepare audio file - normalize, noise reduce, split on silence';
+Prep.DEFAULTS = {
+    SILDUR: '1.3',
+    SILBUF: 0.2,
+    SILTHR: '-35dB',
+    RNNN: 'https://raw.githubusercontent.com/GregorR/rnnoise-models/master/somnolent-hogwash-2018-09-01/sh.rnnn'
+};
+Prep.args = [{ name: 'input', description: 'input file', required: true }];
+Prep.flags = {
+    norm: Flags.boolean({ char: 'n', description: 'do normalization?', required: false, default: false }),
+    verbose: Flags.boolean({ char: 'v', description: 'trace more?', required: false, default: false }),
+    outdir: Flags.string({ char: 'o', description: 'output directory', required: false, default: './out' }),
+    silthr: Flags.string({ char: 's', description: 'silence threshold', required: false, default: _a.DEFAULTS.SILTHR }),
+    sildur: Flags.string({ char: 'd', description: 'silence duration', required: false, default: _a.DEFAULTS.SILDUR }),
+};
+/**
+ * subroutines
+ */
+Prep.normalize = doNormalization;
+Prep.removeSilence = doSilenceRemoval;
+/**
+ * cli only (depends on config and fs situation)
+ */
+Prep.ensureRnnnIsCached = async (config) => {
+    const cachedRnnnPath = `${config.cacheDir}/sh.rnnn`;
+    const isRnnnExisting = fs.existsSync(cachedRnnnPath);
+    if (!isRnnnExisting) {
+        console.log(`
+rnnn for noise removal missing
+Fetching ${_a.DEFAULTS.RNNN} into ${cachedRnnnPath}
+`);
+        const data = await fetch(_a.DEFAULTS.RNNN);
+        fs.writeFileSync(cachedRnnnPath, await data.text());
+    }
+    else
+        console.log(`Found ${cachedRnnnPath} \n`);
+    return cachedRnnnPath;
+};
+Prep.examples = [
+    `$ tranz prep 'AI_Could_Be_The_End_Of_Democracy.m4a' -s -35dB -d 1.5
+--> sets silence threshold to -35dB and min silence duration to 1.5s
+--> runs noise removal (pass -n to also normalize)
+`,
+];
+export default Prep;
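The heart of doPrep's silence bookkeeping is the 'stderr' handler above: ffmpeg's silencedetect filter reports each silence only as an end time plus a duration, so the start has to be derived. A minimal standalone sketch of that parsing (parseSilenceLine is an illustrative name, not something this package exports):

// ffmpeg's silencedetect filter emits stderr lines like:
//   [silencedetect @ 0x...] silence_end: 12.48 | silence_duration: 1.52
const parseSilenceLine = (stderrLine) => {
  if (!stderrLine.includes(' silence_end: ')) return undefined;
  const [endSec, duration] = stderrLine
    .split(' silence_end: ')[1]
    .split(' | silence_duration: ')
    .map(s => +s);
  // the filter reports only end + duration; the start is derived
  return { stSec: endSec - duration, endSec, duration };
};

// parseSilenceLine('[silencedetect @ 0x0] silence_end: 12.48 | silence_duration: 1.52')
// -> { stSec: 10.96, endSec: 12.48, duration: 1.52 }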
package/dist/commands/scribe/index.d.ts
@@ -0,0 +1,43 @@
+import { Command } from "@oclif/core";
+export default class Scribe extends Command {
+    static DEFAULTS: {
+        DIARIZE: boolean;
+        SILDUR: string;
+        SILBUF: number;
+        SILTHR: string;
+        MODEL_KEYS: {
+            tinyd: string;
+            small: string;
+            medium: string;
+        };
+        MODELS: {
+            tinyd: string;
+            small: string;
+            medium: string;
+        };
+    };
+    static description: string;
+    static examples: string[];
+    static flags: {
+        prep: import("@oclif/core/lib/interfaces/parser").BooleanFlag<boolean>;
+        withGPU: import("@oclif/core/lib/interfaces/parser").BooleanFlag<boolean>;
+        norm: import("@oclif/core/lib/interfaces/parser").BooleanFlag<boolean>;
+        output: import("@oclif/core/lib/interfaces/parser").OptionFlag<string>;
+        diarization: import("@oclif/core/lib/interfaces/parser").BooleanFlag<boolean>;
+        separate_speakers: import("@oclif/core/lib/interfaces/parser").BooleanFlag<boolean>;
+        provider: import("@oclif/core/lib/interfaces/parser").OptionFlag<string>;
+        model: import("@oclif/core/lib/interfaces/parser").OptionFlag<string | undefined>;
+        language: import("@oclif/core/lib/interfaces/parser").OptionFlag<string | undefined>;
+        timestamps: import("@oclif/core/lib/interfaces/parser").OptionFlag<string | undefined>;
+        "no-pauses": import("@oclif/core/lib/interfaces/parser").BooleanFlag<boolean>;
+        "auto-split": import("@oclif/core/lib/interfaces/parser").BooleanFlag<boolean>;
+        "max-segment": import("@oclif/core/lib/interfaces/parser").OptionFlag<number>;
+        "split-silence-dur": import("@oclif/core/lib/interfaces/parser").OptionFlag<string>;
+    };
+    static args: {
+        name: string;
+        description: string;
+        required: boolean;
+    }[];
+    run(): Promise<void>;
+}
package/dist/commands/scribe/index.js
@@ -0,0 +1,383 @@
+import { Command, Flags } from "@oclif/core";
+import * as fs from "node:fs";
+import path from "node:path";
+import { getName } from "../../utils/file-utils.js";
+import { loadSecretOrThrow } from "../../utils/secrets.js";
+import { formatBytes } from "../../utils/formatBytes.js";
+import Prep from "../prep/index.js";
+import { createProvider, VOXTRAL_LIMITS, formatTranscriptWithPauses, autoSplitAudio, getAudioDuration, mergeTranscriptionResults, } from "@wovin/tranz";
+class Scribe extends Command {
+    async run() {
+        const { args: { input }, flags, flags: { output: outputFlag, diarization, separate_speakers, prep, norm, provider: providerFlag, model, language, timestamps, withGPU, "no-pauses": noPauses, "auto-split": autoSplit, "max-segment": maxSegment, "split-silence-dur": splitSilenceDur, }, } = await this.parse(Scribe);
+        // Use the new diarization flag, falling back to the deprecated separate_speakers
+        const enableDiarization = diarization || separate_speakers;
+        let inputPath = input;
+        const sourceFileName = getName(inputPath);
+        // 🎙️ Header
+        this.log("\n🎙️ Transcribing Audio\n");
+        // 🎵 Loading audio file with size
+        const fileStats = fs.statSync(inputPath);
+        const fileSize = formatBytes(fileStats.size);
+        this.log(`🎵 Loading audio file: ${inputPath}`);
+        this.log(` Size: ${fileSize}`);
+        if (prep) {
+            const prepCommand = new Prep([], this.config);
+            const prepResult = prepCommand.doPrep({
+                input: inputPath,
+                outdir: outputFlag,
+                norm,
+                verbose: true,
+                silthr: "-35dB",
+                sildur: "1.2",
+            });
+            inputPath = (await prepResult).results.outputPath;
+        }
+        const outputDir = `${outputFlag}/${sourceFileName}`;
+        fs.mkdirSync(path.normalize(outputDir), { recursive: true });
+        // Normalize provider name (voxtral -> mistral)
+        let providerName = (providerFlag === "voxtral" ? "mistral" : providerFlag);
+        // Handle API-based providers
+        let apiKey;
+        if (providerName === "mistral" || providerName === "greenpt") {
+            if (withGPU) {
+                this.warn(`--withGPU flag is not applicable to ${providerName} provider (API-based)`);
+            }
+            if (providerName === "mistral") {
+                if (timestamps && language) {
+                    this.error(`--timestamps and --language cannot be used together (Mistral API limitation)\nSee: https://docs.mistral.ai/capabilities/audio_transcription`);
+                }
+            }
+            // Load API key
+            const envVarMap = {
+                mistral: { envVar: "MISTRAL_API_KEY", secretPath: "secret/mistral" },
+                greenpt: { envVar: "GREENPT_API_KEY", secretPath: "secret/greenpt" },
+            };
+            const { envVar, secretPath } = envVarMap[providerName];
+            apiKey = await loadSecretOrThrow(envVar, secretPath, `${providerName} API key`);
+        }
+        // ⚙️ Configuration summary
+        const actualModel = model || (providerName === "mistral" ? "voxtral-mini-latest" : providerName === "greenpt" ? "whisper-large-v3" : "medium");
+        this.log(`\n⚙️ Configuration:`);
+        this.log(` Provider: ${providerName}`);
+        this.log(` Model: ${actualModel}`);
+        if (language) {
+            this.log(` Language: ${language}`);
+        }
+        if (timestamps) {
+            this.log(` Timestamps: ${timestamps}`);
+        }
+        if (enableDiarization) {
+            this.log(` Diarization: enabled`);
+        }
+        if (autoSplit) {
+            this.log(` Auto-split: enabled (max ${maxSegment}s segments)`);
+        }
+        // Create provider
+        const provider = createProvider(providerName, this.config);
+        // Check if provider has limits that require auto-split
+        let effectiveAutoSplit = autoSplit;
+        let effectiveMaxSegment = maxSegment;
+        // For the Mistral/Voxtral provider, auto-enable split if audio exceeds the limit
+        if (providerName === "mistral") {
+            const audioDuration = await getAudioDuration(inputPath);
+            if (audioDuration > VOXTRAL_LIMITS.recommendedMaxDurationSec) {
+                if (!autoSplit) {
+                    this.log(`\n⚠️ Audio duration (${Math.floor(audioDuration / 60)}m) exceeds Voxtral limit (${VOXTRAL_LIMITS.recommendedMaxDurationSec / 60}m)`);
+                    this.log(` Auto-enabling split mode...`);
+                    effectiveAutoSplit = true;
+                    effectiveMaxSegment = VOXTRAL_LIMITS.recommendedMaxDurationSec;
+                }
+            }
+        }
+        // Determine if we need to auto-split
+        let result;
+        let segments;
+        if (effectiveAutoSplit) {
+            // Get audio duration to check if splitting is needed
+            const audioDuration = await getAudioDuration(inputPath);
+            this.log(`\n📏 Audio duration: ${Math.floor(audioDuration / 60)}m ${(audioDuration % 60).toFixed(0)}s`);
+            if (audioDuration > effectiveMaxSegment) {
+                // Split audio at silence boundaries
+                this.log(`\n✂️ Splitting audio (exceeds ${effectiveMaxSegment}s max)...`);
+                const splitConfig = {
+                    maxDurationSec: effectiveMaxSegment,
+                    minSilenceDurSec: parseFloat(splitSilenceDur),
+                    preferLongerSilence: true,
+                };
+                const segmentsDir = path.join(outputDir, 'segments');
+                segments = await autoSplitAudio(inputPath, segmentsDir, splitConfig);
+                this.log(` Created ${segments.length} segments`);
+                for (const seg of segments) {
+                    this.log(` - Segment ${seg.index}: ${seg.startSec.toFixed(1)}s - ${seg.endSec.toFixed(1)}s (${seg.durationSec.toFixed(1)}s)`);
+                }
+                // Transcribe each segment
+                this.log(`\n⏳ Transcribing ${segments.length} segments...`);
+                const segmentResults = [];
+                for (const seg of segments) {
+                    this.log(` Transcribing segment ${seg.index + 1}/${segments.length}...`);
+                    const segResult = await provider.transcribe({
+                        audioPath: seg.outputPath,
+                        outputDir,
+                        model,
+                        language,
+                        timestampGranularity: timestamps,
+                        diarize: enableDiarization,
+                        apiKey,
+                    });
+                    segmentResults.push(segResult);
+                    if (segResult.error) {
+                        this.warn(` Segment ${seg.index} error: ${segResult.error}`);
+                    }
+                }
+                // Merge results
+                result = mergeTranscriptionResults(segmentResults, segments);
+                this.log(`\n🔗 Merged ${segments.length} segment transcriptions`);
+            }
+            else {
+                this.log(`\n⏳ Transcribing (no split needed)...`);
+                result = await provider.transcribe({
+                    audioPath: inputPath,
+                    outputDir,
+                    model,
+                    language,
+                    timestampGranularity: timestamps,
+                    diarize: enableDiarization,
+                    apiKey,
+                });
+            }
+        }
+        else {
+            // ⏳ Progress indicator
+            this.log(`\n⏳ Transcribing...`);
+            result = await provider.transcribe({
+                audioPath: inputPath,
+                outputDir,
+                model,
+                language,
+                timestampGranularity: timestamps,
+                diarize: enableDiarization,
+                apiKey,
+            });
+        }
+        if (result.error) {
+            throw new Error(`Transcription failed: ${result.error}`);
+        }
+        // Determine output filename based on provider
+        let outputFileName;
+        switch (providerName) {
+            case "mistral":
+                outputFileName = `${sourceFileName}-transcript-mistral.json`;
+                break;
+            case "greenpt":
+                outputFileName = `${sourceFileName}-transcript-greenpt.json`;
+                break;
+            case "whisper":
+            default:
+                outputFileName = `${sourceFileName}-transcript.json`;
+        }
+        const outputPath = `${outputDir}/${outputFileName}`;
+        // For Whisper, the file is already saved by the provider (unless we did auto-split)
+        // For API providers or auto-split, save the result
+        if (providerName !== "whisper" || segments) {
+            fs.writeFileSync(outputPath, JSON.stringify(result.rawResponse || result, null, 2));
+        }
+        // Show segment info if we split
+        const mergedResult = result;
+        if (mergedResult.totalSegments && mergedResult.totalSegments > 1) {
+            this.log(`\n📊 Split Summary:`);
+            this.log(` Total segments: ${mergedResult.totalSegments}`);
+        }
+        // 📝 Transcript preview
+        this.log(`\n📝 Transcript preview:`);
+        let previewText = result.text;
+        // Apply pause formatting if words data is available and not disabled
+        const words = result.rawResponse?.words || result.words;
+        if (!noPauses && words && words.length > 0) {
+            previewText = formatTranscriptWithPauses(result.text, words);
+        }
+        // Show first 200 chars of preview
+        const previewLimit = 200;
+        const truncatedPreview = previewText.length > previewLimit
+            ? previewText.substring(0, previewLimit) + "..."
+            : previewText;
+        this.log(` ${truncatedPreview.replace(/\n/g, "\n ")}`);
+        // 📊 Metadata summary
+        this.log(`\n📊 Metadata:`);
+        const duration = result.rawResponse?.duration || result.duration;
+        if (duration) {
+            const minutes = Math.floor(duration / 60);
+            const seconds = (duration % 60).toFixed(1);
+            this.log(` Duration: ${minutes}m ${seconds}s (${duration.toFixed(1)}s)`);
+        }
+        // Show the actual model if returned by the API (may differ from requested)
+        if (result.model) {
+            this.log(` Model (actual): ${result.model}`);
+        }
+        const wordCount = result.text.split(/\s+/).filter((w) => w.length > 0).length;
+        const charCount = result.text.length;
+        this.log(` Words: ${wordCount}`);
+        this.log(` Characters: ${charCount}`);
+        // 💰 Detailed cost breakdown
+        this.log(`\n💰 Cost:`);
+        if (duration) {
+            const durationMinutes = duration / 60;
+            this.log(` Duration: ${durationMinutes.toFixed(2)} minutes`);
+            // Provider-specific cost rates
+            if (providerName === "mistral") {
+                // $0.003 per minute for Voxtral Transcribe 2
+                // Source: https://docs.mistral.ai/models/voxtral-mini-transcribe-26-02
+                const audioRate = 0.003;
+                const audioCost = durationMinutes * audioRate;
+                this.log(` Rate: $${audioRate}/min`);
+                this.log(` Audio cost: $${audioCost.toFixed(4)}`);
+                // Token cost if available
+                const usage = result.rawResponse?.usage;
+                if (usage?.total_tokens) {
+                    const tokenRate = 0.000003; // ~$3/1M tokens estimate for Mistral
+                    const tokenCost = usage.total_tokens * tokenRate;
+                    this.log(` Token cost: $${tokenCost.toFixed(4)} (${usage.total_tokens} tokens)`);
+                    this.log(` Total: $${(audioCost + tokenCost).toFixed(4)}`);
+                }
+                else {
+                    this.log(` Total: $${audioCost.toFixed(4)}`);
+                }
+            }
+            else if (providerName === "greenpt") {
+                const audioRate = 0.006; // $0.006 per minute estimate
+                const audioCost = durationMinutes * audioRate;
+                this.log(` Rate: $${audioRate}/min`);
+                this.log(` Total: $${audioCost.toFixed(4)}`);
+            }
+            else {
+                this.log(` Local whisper: no API cost`);
+            }
+        }
+        else {
+            this.log(` Duration data not available for cost calculation`);
+        }
+        // 💾 Save notification
+        this.log(`\n💾 Saved to: ${outputPath}`);
+        this.log(`\n✅ Transcription completed successfully!\n`);
+    }
+}
+Scribe.DEFAULTS = {
+    DIARIZE: false,
+    SILDUR: "1.3",
+    SILBUF: 0.2,
+    SILTHR: "-35dB",
+    MODEL_KEYS: {
+        tinyd: "ggml-small.en-tdrz.bin",
+        small: "ggml-small.bin",
+        medium: "ggml-medium.bin",
+    },
+    MODELS: {
+        tinyd: "https://huggingface.co/akashmjn/tinydiarize-whisper.cpp/resolve/main/ggml-small.en-tdrz.bin",
+        small: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin",
+        medium: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.bin",
+    },
+};
+Scribe.description = "Transcribe audio file - optionally prepare first";
+Scribe.examples = [
+    `$ tranz scribe 'path/to/16khz-audiofile.wav'
+runs whisper and saves a transcription json
+
+$ tranz scribe 'path/to/whatever-audiofile.mp3' -p
+first prepares the audio, then runs whisper and saves a transcription json
+
+$ tranz scribe 'path/to/audiofile.mp3' -r mistral -m voxtral-mini-latest
+transcribe using the Mistral provider with the specified model
+
+$ tranz scribe 'path/to/audiofile.mp3' -r mistral --timestamps segment
+transcribe using Mistral with segment-level timestamps
+
+$ tranz scribe 'path/to/audiofile.mp3' -r mistral --diarization --timestamps word
+transcribe using Mistral with speaker diarization and word-level timestamps
+
+$ tranz scribe 'path/to/audiofile.mp3' -r greenpt -l en
+transcribe using the GreenPT provider with a language specified
+
+$ tranz scribe 'path/to/long-audio.mp3' --auto-split --max-segment 300
+automatically split long audio at silence boundaries (max 5 min segments)
+`,
+];
+Scribe.flags = {
+    prep: Flags.boolean({
+        char: "p",
+        description: "do prep?",
+        required: false,
+        default: false,
+    }),
+    withGPU: Flags.boolean({
+        char: "g",
+        description: "use gpu?",
+        required: false,
+        default: false,
+    }),
+    norm: Flags.boolean({
+        char: "n",
+        description: "do normalization?",
+        required: false,
+        default: false,
+    }),
+    output: Flags.string({
+        char: "o",
+        description: "output directory",
+        required: false,
+        default: "./out",
+    }),
+    diarization: Flags.boolean({
+        char: "d",
+        description: "enable speaker diarization",
+        required: false,
+        default: Scribe.DEFAULTS.DIARIZE,
+    }),
+    separate_speakers: Flags.boolean({
+        description: "separate via diarization (deprecated: use --diarization)",
+        required: false,
+        deprecated: {
+            message: "use --diarization instead",
+        },
+    }),
+    provider: Flags.string({
+        char: "r",
+        description: "transcription provider (whisper, mistral, greenpt)",
+        options: ["whisper", "mistral", "voxtral", "greenpt"],
+        default: "whisper",
+    }),
+    model: Flags.string({
+        char: "m",
+        description: "model name (provider-specific)",
+    }),
+    language: Flags.string({
+        char: "l",
+        description: "language code (e.g., en, de, fr)",
+    }),
+    timestamps: Flags.string({
+        description: "enable timestamp granularity (segment, word) - Mistral only",
+        options: ["segment", "word"],
+    }),
+    "no-pauses": Flags.boolean({
+        description: "disable pause-based line break formatting in transcript preview",
+        default: false,
+    }),
+    "auto-split": Flags.boolean({
+        description: "automatically split long audio files at silence boundaries",
+        default: false,
+    }),
+    "max-segment": Flags.integer({
+        description: "maximum segment duration in seconds for auto-split (default: 600)",
+        default: 600,
+    }),
+    "split-silence-dur": Flags.string({
+        description: "minimum silence duration for split points (default: 1.0)",
+        default: "1.0",
+    }),
+};
+Scribe.args = [
+    {
+        name: 'input',
+        description: 'input file',
+        required: true,
+    },
+];
+export default Scribe;
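Stripped of logging and flag plumbing, the auto-split path in Scribe.run above is: measure duration, cut at silence boundaries once a segment limit would be exceeded, transcribe each piece, merge. A condensed sketch using the '@wovin/tranz' helpers imported in this file; their signatures are inferred from the call sites above, not from published typings:

import path from 'node:path';
import { createProvider, autoSplitAudio, getAudioDuration, mergeTranscriptionResults } from '@wovin/tranz';

// maxSegmentSec mirrors the --max-segment flag; config is the oclif config object.
const transcribeWithAutoSplit = async (inputPath, outputDir, maxSegmentSec, config) => {
  const provider = createProvider('mistral', config);
  // Short enough for a single request: no splitting needed.
  if (await getAudioDuration(inputPath) <= maxSegmentSec) {
    return provider.transcribe({ audioPath: inputPath, outputDir });
  }
  // Cut at silence boundaries so no piece exceeds the provider's limit.
  const segments = await autoSplitAudio(inputPath, path.join(outputDir, 'segments'), {
    maxDurationSec: maxSegmentSec,
    minSilenceDurSec: 1.0,
    preferLongerSilence: true,
  });
  // Transcribe each piece sequentially, then stitch the results back together.
  const segmentResults = [];
  for (const seg of segments) {
    segmentResults.push(await provider.transcribe({ audioPath: seg.outputPath, outputDir }));
  }
  return mergeTranscriptionResults(segmentResults, segments);
};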
package/dist/commands/split/index.d.ts
@@ -0,0 +1,23 @@
+import { Command } from '@oclif/core';
+export default class Split extends Command {
+    static description: string;
+    static DEFAULTS: {
+        OUTDIR: string;
+        SILDUR: string;
+        SILBUF: number;
+        SILTHR: string;
+    };
+    static flags: {
+        output: import("@oclif/core/lib/interfaces/parser").OptionFlag<string>;
+        silthr: import("@oclif/core/lib/interfaces/parser").OptionFlag<string>;
+        sildur: import("@oclif/core/lib/interfaces/parser").OptionFlag<string>;
+    };
+    static args: {
+        name: string;
+        description: string;
+        required: boolean;
+    }[];
+    static split: () => void;
+    run(): Promise<void>;
+    static examples: string[];
+}