@wovin/tranz 0.0.26
This diff shows the content of publicly available package versions as released to one of the supported public registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in those registries.
- package/LICENSE +661 -0
- package/dist/audio.d.ts +6 -0
- package/dist/audio.d.ts.map +1 -0
- package/dist/audio.min.js +302 -0
- package/dist/index.d.ts +9 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.min.js +769 -0
- package/dist/providers.d.ts +6 -0
- package/dist/providers.d.ts.map +1 -0
- package/dist/providers.min.js +681 -0
- package/dist/utils/audio/index.d.ts +6 -0
- package/dist/utils/audio/index.d.ts.map +1 -0
- package/dist/utils/audio/merge-results.d.ts +47 -0
- package/dist/utils/audio/merge-results.d.ts.map +1 -0
- package/dist/utils/audio/split.d.ts +106 -0
- package/dist/utils/audio/split.d.ts.map +1 -0
- package/dist/utils/file-utils.d.ts +6 -0
- package/dist/utils/file-utils.d.ts.map +1 -0
- package/dist/utils/transcription/format.d.ts +14 -0
- package/dist/utils/transcription/format.d.ts.map +1 -0
- package/dist/utils/transcription/mime-detection.d.ts +25 -0
- package/dist/utils/transcription/mime-detection.d.ts.map +1 -0
- package/dist/utils/transcription/providers.d.ts +146 -0
- package/dist/utils/transcription/providers.d.ts.map +1 -0
- package/dist/utils/transcription/transcribe.d.ts +59 -0
- package/dist/utils/transcription/transcribe.d.ts.map +1 -0
- package/package.json +62 -0

package/dist/providers.d.ts
@@ -0,0 +1,6 @@
+/**
+ * @wovin/tranz/providers - Transcription provider implementations
+ */
+export { createProvider, MistralProvider, WhisperProvider, GreenPTProvider, VOXTRAL_LIMITS, type ProviderName, type TranscribeParams, type TranscriptionResult, type TranscriptionProvider, } from './utils/transcription/providers.js';
+export { createMistralTranscriber, transcribe, type TranscribeOptions, type MistralTranscriberConfig, } from './utils/transcription/transcribe.js';
+//# sourceMappingURL=providers.d.ts.map
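The hunk above is the type surface of the `providers` entry point. As a quick orientation, here is a minimal usage sketch built only from the names it exports; the import specifier and the exact fields accepted by `transcribe` (e.g. `audioPath`, `apiKey`, `diarize`) are assumptions taken from the bundled implementation further down, not documented API.

```ts
// Hypothetical sketch based on the exports declared above. The option fields
// (audioPath, apiKey, diarize) mirror the bundled implementation below and are
// assumptions, not documented API.
import { createProvider, type TranscriptionResult } from '@wovin/tranz/providers'

async function transcribeMeeting(): Promise<void> {
  // "whisper", "mistral" and "greenpt" are the names createProvider handles.
  const provider = createProvider('mistral')

  const result: TranscriptionResult = await provider.transcribe({
    audioPath: './meeting.ogg',                // local file input (assumed field)
    apiKey: process.env.MISTRAL_API_KEY ?? '', // hosted-provider key (assumed field)
    diarize: true,                             // speaker labels on (assumed field)
  })

  if (result.error) {
    console.error(result.error)
    return
  }
  console.log(result.text)
}

transcribeMeeting().catch(console.error)
```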

package/dist/providers.d.ts.map
@@ -0,0 +1 @@
+{"version":3,"file":"providers.d.ts","sourceRoot":"","sources":["../src/providers.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EACL,cAAc,EACd,eAAe,EACf,eAAe,EACf,eAAe,EACf,cAAc,EACd,KAAK,YAAY,EACjB,KAAK,gBAAgB,EACrB,KAAK,mBAAmB,EACxB,KAAK,qBAAqB,GAC3B,MAAM,oCAAoC,CAAA;AAE3C,OAAO,EACL,wBAAwB,EACxB,UAAU,EACV,KAAK,iBAAiB,EACtB,KAAK,wBAAwB,GAC9B,MAAM,qCAAqC,CAAA"}

package/dist/providers.min.js
@@ -0,0 +1,681 @@
+// src/utils/transcription/providers.ts
+import { spawn } from "child_process";
+import * as fs from "fs";
+
+// src/utils/file-utils.ts
+import path from "path";
+var getName = (filePath) => {
+  return getFileInfo(filePath).name;
+};
+var getFileInfo = (filePath) => {
+  const normed = path.normalize(filePath);
+  return path.parse(normed);
+};
+
+// src/utils/transcription/providers.ts
+import { pipeline } from "stream";
+import { promisify } from "util";
+
+// src/utils/transcription/mime-detection.ts
+function detectAudioMimeType(buffer) {
+  if (buffer.length < 4) return "audio/ogg";
+  let offset = 0;
+  if (buffer[0] === 73 && buffer[1] === 68 && buffer[2] === 51) {
+    if (buffer.length >= 10) {
+      const size = (buffer[6] & 127) << 21 | (buffer[7] & 127) << 14 | (buffer[8] & 127) << 7 | buffer[9] & 127;
+      offset = 10 + size;
+      if (offset >= buffer.length) offset = 0;
+    }
+  }
+  if (buffer.length - offset >= 4) {
+    if (buffer[offset] === 255 && (buffer[offset + 1] === 251 || buffer[offset + 1] === 250) || buffer[offset] === 255 && buffer[offset + 1] === 243 || buffer[offset] === 255 && buffer[offset + 1] === 242) {
+      return "audio/mpeg";
+    }
+    if (buffer[offset] === 79 && buffer[offset + 1] === 103 && buffer[offset + 2] === 103 && buffer[offset + 3] === 83) {
+      return "audio/ogg";
+    }
+    if (buffer[offset] === 82 && buffer[offset + 1] === 73 && buffer[offset + 2] === 70 && buffer[offset + 3] === 70) {
+      return "audio/wav";
+    }
+    if (buffer[offset] === 102 && buffer[offset + 1] === 76 && buffer[offset + 2] === 97 && buffer[offset + 3] === 67) {
+      return "audio/flac";
+    }
+  }
+  return "audio/ogg";
+}
+
+// src/utils/transcription/providers.ts
+function createProvider(providerName, config) {
+  switch (providerName) {
+    case "whisper":
+      return new WhisperProvider(config);
+    case "mistral":
+      return new MistralProvider();
+    case "greenpt":
+      return new GreenPTProvider();
+    default:
+      throw new Error(`Unknown provider: ${providerName}`);
+  }
+}
+var WhisperProvider = class _WhisperProvider {
+  name = "whisper";
+  cacheDir;
+  static DEFAULTS = {
+    DIARIZE: false,
+    SILDUR: "1.3",
+    SILBUF: 0.2,
+    SILTHR: "-35dB",
+    MODEL_KEYS: {
+      tinyd: "ggml-small.en-tdrz.bin",
+      small: "ggml-small.bin",
+      medium: "ggml-medium.bin"
+    },
+    MODELS: {
+      tinyd: "https://huggingface.co/akashmjn/tinydiarize-whisper.cpp/resolve/main/ggml-small.en-tdrz.bin",
+      small: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin",
+      medium: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.bin"
+    }
+  };
+  constructor(config) {
+    this.cacheDir = config?.cacheDir || `${process.env.HOME}/.cache/whisper-models`;
+  }
+  async transcribe(params) {
+    const {
+      audioPath,
+      outputDir = "./out",
+      diarize = _WhisperProvider.DEFAULTS.DIARIZE,
+      modelPath: providedModelPath
+    } = params;
+    let modelPath = providedModelPath;
+    const modelKey = diarize ? "tinyd" : "small";
+    if (!modelPath) {
+      modelPath = await this.ensureRequestedModelIsCached(modelKey);
+    }
+    const sourceFileName = getName(audioPath);
+    const outTransPath = `${outputDir}/${sourceFileName}-transcript`;
+    const tdrz = diarize ? "-tdrz" : "";
+    const args = [
+      tdrz,
+      "-t",
+      "8",
+      "-oj",
+      "-ng",
+      // TODO: consider withGPU option
+      "-f",
+      audioPath,
+      "-m",
+      modelPath,
+      "-of",
+      outTransPath
+    ].filter((arg) => arg !== "");
+    const cmd = `whisper-cli ${args.join(" ")}`;
+    console.log("spawning ", cmd);
+    const whisperThread = spawn(`whisper-cli`, args);
+    return new Promise((resolveFx) => {
+      let whisperOutput = "";
+      const handleOut = (data) => {
+        const str = data.toString();
+        for (const match of ["[", "main:"]) {
+          if (str.startsWith(match) || str.includes("total time"))
+            console.log(str);
+        }
+        whisperOutput += data;
+      };
+      whisperThread.stdout.on("data", handleOut);
+      whisperThread.stderr.on("data", handleOut);
+      whisperThread.on("close", (code) => {
+        try {
+          const trans = JSON.parse(
+            fs.readFileSync(`${outTransPath}.json`).toString()
+          );
+          const transcriptionArray = trans.result?.transcription || [];
+          const text = transcriptionArray.map((entry) => entry.text).filter((t) => t).join(" ").trim();
+          resolveFx({
+            text: text || "",
+            rawResponse: trans
+          });
+        } catch (error) {
+          const errorMessage = error instanceof Error ? error.message : String(error);
+          resolveFx({
+            text: "",
+            error: `Failed to parse transcription result: ${errorMessage}`,
+            rawResponse: void 0
+          });
+        }
+      });
+      whisperThread.on("error", (err) => {
+        console.error("Whisper Error", { err, outTransPath, args });
+        resolveFx({
+          text: "",
+          error: `Whisper process error: ${err.message}`
+        });
+      });
+    }).catch((whisperError) => {
+      console.error("Uncaught Whisper Error", whisperError);
+      return {
+        text: "",
+        error: `Uncaught error: ${whisperError instanceof Error ? whisperError.message : String(whisperError)}`
+      };
+    });
+  }
+  async ensureRequestedModelIsCached(modelKey) {
+    if (!_WhisperProvider.DEFAULTS.MODEL_KEYS[modelKey])
+      throw new Error(`${modelKey} not known`);
+    const cachedModelsDirPath = `${this.cacheDir}/models`;
+    if (!fs.existsSync(cachedModelsDirPath)) {
+      fs.mkdirSync(cachedModelsDirPath, { recursive: true });
+    }
+    const modelPath = `${cachedModelsDirPath}/${_WhisperProvider.DEFAULTS.MODEL_KEYS[modelKey]}`;
+    const isModelExisting = fs.existsSync(modelPath);
+    if (!isModelExisting) {
+      const srcURL = _WhisperProvider.DEFAULTS.MODELS[modelKey];
+      console.log(`
+requested model is missing
+Fetching ${srcURL} into ${modelPath}
+`);
+      const data = await fetch(srcURL);
+      if (!data?.body) throw new Error("fetch failed");
+      const streamPipeline = promisify(pipeline);
+      await streamPipeline(
+        data.body,
+        fs.createWriteStream(modelPath)
+      );
+    } else {
+      console.log(`Found ${modelPath}
+`);
+    }
+    return modelPath;
+  }
+};
+var VOXTRAL_LIMITS = {
+  /** Maximum audio duration in seconds (3 hours for Voxtral Transcribe 2) */
+  maxAudioDurationSec: 3 * 60 * 60,
+  // 10800 seconds = 3 hours
+  /** Recommended max duration before splitting (for reliability) */
+  recommendedMaxDurationSec: 30 * 60,
+  // 30 minutes
+  /** Maximum context biasing words/phrases */
+  maxContextBiasingTerms: 100,
+  /** Maximum file size in bytes (1GB) */
+  maxFileSizeBytes: 1024 * 1024 * 1024
+};
+var MistralProvider = class {
+  name = "mistral";
+  maxAudioDurationSec = VOXTRAL_LIMITS.maxAudioDurationSec;
+  /**
+   * Check if audio duration exceeds recommended limits
+   */
+  static shouldSplit(durationSec) {
+    return durationSec > VOXTRAL_LIMITS.recommendedMaxDurationSec;
+  }
+  /**
+   * Get the recommended max segment duration for splitting
+   */
+  static getRecommendedMaxSegment() {
+    return VOXTRAL_LIMITS.recommendedMaxDurationSec;
+  }
+  async transcribe(params) {
+    const formData = new FormData();
+    if (params.audioUrl) {
+      formData.append("file_url", params.audioUrl);
+    } else {
+      let audioBuffer;
+      let mimeType;
+      if (params.audioBuffer) {
+        audioBuffer = params.audioBuffer;
+        mimeType = params.mimeType || detectAudioMimeType(audioBuffer);
+      } else if (params.audioPath) {
+        audioBuffer = fs.readFileSync(params.audioPath);
+        mimeType = detectAudioMimeType(audioBuffer);
+      } else {
+        return { text: "", error: "No audio input provided (audioPath, audioBuffer, or audioUrl required)" };
+      }
+      const extension = mimeType === "audio/mpeg" ? "mp3" : mimeType === "audio/wav" ? "wav" : mimeType === "audio/flac" ? "flac" : "ogg";
+      const audioBlob = new Blob([new Uint8Array(audioBuffer)], { type: mimeType });
+      formData.append("file", audioBlob, `audio.${extension}`);
+    }
+    const model = params.model || "voxtral-mini-latest";
+    formData.append("model", model);
+    if (params.language) {
+      formData.append("language", params.language);
+    }
+    const timestampGranularity = params.timestampGranularity ?? (params.language ? void 0 : "word");
+    if (timestampGranularity) {
+      formData.append("timestamp_granularities", timestampGranularity);
+    }
+    const diarize = params.diarize ?? true;
+    if (diarize) {
+      formData.append("diarize", "true");
+    }
+    const response = await fetch("https://api.mistral.ai/v1/audio/transcriptions", {
+      method: "POST",
+      headers: {
+        "Authorization": `Bearer ${params.apiKey}`
+      },
+      body: formData
+    });
+    if (!response.ok) {
+      const errorText = await response.text();
+      return { text: "", error: `API returned ${response.status}: ${errorText}` };
+    }
+    const result = await response.json();
+    if (!result?.text) {
+      return { text: "", error: "No transcription returned", rawResponse: result };
+    }
+    const words = result.words || result.segments?.flatMap((seg) => seg.words || []);
+    const duration = result.usage?.prompt_audio_seconds;
+    return {
+      text: result.text,
+      language: result.language ?? params.language,
+      model: result.model,
+      duration,
+      words,
+      rawResponse: result
+    };
+  }
+};
+var GreenPTProvider = class {
+  name = "greenpt";
+  async transcribe(params) {
+    if (!params.apiKey) {
+      return { text: "", error: "API key is required for GreenPT provider" };
+    }
+    if (!params.audioPath) {
+      return { text: "", error: "Audio path is required" };
+    }
+    try {
+      const audioBuffer = fs.readFileSync(params.audioPath);
+      const mimeType = detectAudioMimeType(audioBuffer);
+      const formData = new FormData();
+      const extension = mimeType === "audio/mpeg" ? "mp3" : mimeType === "audio/wav" ? "wav" : mimeType === "audio/flac" ? "flac" : "ogg";
+      const audioBlob = new Blob([new Uint8Array(audioBuffer)], { type: mimeType });
+      formData.append("file", audioBlob, `audio.${extension}`);
+      const queryParams = new URLSearchParams();
+      const model = params.model || "green-s-pro";
+      queryParams.append("model", model);
+      if (params.language) {
+        queryParams.append("language", params.language);
+      }
+      if (params.diarize !== void 0) {
+        queryParams.append("diarize", String(params.diarize));
+      }
+      queryParams.append("punctuate", "true");
+      const url = `https://api.greenpt.ai/v1/listen?${queryParams.toString()}`;
+      const response = await fetch(url, {
+        method: "POST",
+        headers: {
+          "Authorization": `Bearer ${params.apiKey}`
+        },
+        body: formData
+      });
+      if (!response.ok) {
+        const errorText = await response.text();
+        return {
+          text: "",
+          error: `API returned ${response.status}: ${errorText}`
+        };
+      }
+      const result = await response.json();
+      const transcript = result?.results?.channels?.[0]?.alternatives?.[0];
+      if (!transcript) {
+        return {
+          text: "",
+          error: "No transcription returned",
+          rawResponse: result
+        };
+      }
+      let text = transcript.transcript;
+      if (!text && transcript.words && transcript.words.length > 0) {
+        text = transcript.words.map((w) => w.word).join(" ");
+      }
+      return {
+        text: text || "",
+        confidence: transcript.confidence,
+        words: transcript.words,
+        duration: result?.metadata?.duration,
+        rawResponse: result
+      };
+    } catch (error) {
+      const errorMessage = error instanceof Error ? error.message : String(error);
+      return {
+        text: "",
+        error: `Transcription failed: ${errorMessage}`
+      };
+    }
+  }
+};
+
+// src/utils/transcription/transcribe.ts
+import * as fs3 from "fs";
+import * as os from "os";
+import * as path3 from "path";
+
+// src/utils/audio/split.ts
+import ffmpeg from "fluent-ffmpeg";
+import * as fs2 from "fs";
+import path2 from "path";
+import { spawn as spawn2 } from "child_process";
+var DEFAULT_SPLIT_CONFIG = {
+  maxDurationSec: 600,
+  // 10 minutes
+  minSilenceDurSec: 1,
+  silenceThreshold: "-35dB",
+  preferLongerSilence: true,
+  silenceBuffer: 0.2
+};
+async function getAudioDuration(audioPath) {
+  return new Promise((resolve, reject) => {
+    ffmpeg.ffprobe(audioPath, (err, metadata) => {
+      if (err) {
+        reject(new Error(`Failed to probe audio: ${err.message}`));
+        return;
+      }
+      const duration = metadata.format.duration;
+      if (typeof duration !== "number") {
+        reject(new Error("Could not determine audio duration"));
+        return;
+      }
+      resolve(duration);
+    });
+  });
+}
+async function detectSilenceRegions(audioPath, config = {}) {
+  const { minSilenceDurSec, silenceThreshold } = { ...DEFAULT_SPLIT_CONFIG, ...config };
+  return new Promise((resolve, reject) => {
+    const silenceRegions = [];
+    const args = [
+      "-i",
+      audioPath,
+      "-af",
+      `silencedetect=n=${silenceThreshold}:d=${minSilenceDurSec}`,
+      "-f",
+      "wav",
+      "-ac",
+      "1",
+      "-ar",
+      "8000",
+      "pipe:1"
+    ];
+    const proc = spawn2("ffmpeg", args);
+    proc.stdout.on("data", () => {
+    });
+    proc.stderr.on("data", (data) => {
+      const lines = data.toString().split("\n");
+      for (const line of lines) {
+        if (line.includes("silence_end:")) {
+          const match = line.match(/silence_end:\s*([\d.]+)\s*\|\s*silence_duration:\s*([\d.]+)/);
+          if (match) {
+            const endSec = parseFloat(match[1]);
+            const durationSec = parseFloat(match[2]);
+            if (!isNaN(endSec) && !isNaN(durationSec)) {
+              silenceRegions.push({
+                startSec: endSec - durationSec,
+                endSec,
+                durationSec
+              });
+            }
+          }
+        }
+      }
+    });
+    proc.on("close", (code) => {
+      if (code === 0 || silenceRegions.length > 0) {
+        resolve(silenceRegions);
+      } else {
+        reject(new Error(`FFmpeg exited with code ${code}`));
+      }
+    });
+    proc.on("error", (err) => {
+      reject(new Error(`Silence detection failed: ${err.message}`));
+    });
+  });
+}
+function findOptimalSplitPoints(silenceRegions, totalDuration, config = {}) {
+  const { maxDurationSec, preferLongerSilence, silenceBuffer } = {
+    ...DEFAULT_SPLIT_CONFIG,
+    ...config
+  };
+  if (totalDuration <= maxDurationSec) {
+    return [];
+  }
+  const numSegments = Math.ceil(totalDuration / maxDurationSec);
+  const idealSegmentDuration = totalDuration / numSegments;
+  const splitPoints = [];
+  for (let i = 1; i < numSegments; i++) {
+    const idealSplitTime = idealSegmentDuration * i;
+    const windowSize = idealSegmentDuration * 0.3;
+    const windowStart = idealSplitTime - windowSize;
+    const windowEnd = idealSplitTime + windowSize;
+    const candidateSilences = silenceRegions.filter((silence) => {
+      const silenceMid = (silence.startSec + silence.endSec) / 2;
+      return silenceMid >= windowStart && silenceMid <= windowEnd;
+    });
+    let bestSplitPoint;
+    if (candidateSilences.length > 0) {
+      let bestScore = -Infinity;
+      let bestSilence = candidateSilences[0];
+      for (const silence of candidateSilences) {
+        const silenceMid = (silence.startSec + silence.endSec) / 2;
+        const proximityScore = 1 - Math.abs(silenceMid - idealSplitTime) / windowSize;
+        const score = preferLongerSilence ? silence.durationSec * proximityScore : proximityScore;
+        if (score > bestScore) {
+          bestScore = score;
+          bestSilence = silence;
+        }
+      }
+      bestSplitPoint = {
+        timeSec: (bestSilence.startSec + bestSilence.endSec) / 2,
+        silenceDuration: bestSilence.durationSec
+      };
+    } else {
+      bestSplitPoint = {
+        timeSec: idealSplitTime,
+        silenceDuration: 0
+      };
+    }
+    splitPoints.push(bestSplitPoint);
+  }
+  return splitPoints.sort((a, b) => a.timeSec - b.timeSec);
+}
+async function splitAudioAtPoints(audioPath, splitPoints, totalDuration, outputDir, baseName) {
+  fs2.mkdirSync(outputDir, { recursive: true });
+  const segments = [];
+  const boundaries = [0, ...splitPoints.map((sp) => sp.timeSec), totalDuration];
+  const splitPromises = [];
+  for (let i = 0; i < boundaries.length - 1; i++) {
+    const startSec = boundaries[i];
+    const endSec = boundaries[i + 1];
+    const durationSec = endSec - startSec;
+    const outputPath = path2.join(outputDir, `${baseName}-segment-${i.toString().padStart(3, "0")}.wav`);
+    const segment = {
+      index: i,
+      startSec,
+      endSec,
+      durationSec,
+      outputPath
+    };
+    segments.push(segment);
+    const extractPromise = new Promise((resolve, reject) => {
+      ffmpeg(audioPath).setStartTime(startSec).setDuration(durationSec).audioFrequency(16e3).outputOptions(["-ac 1", "-c:a pcm_s16le"]).output(outputPath).on("error", (err) => reject(new Error(`Failed to extract segment ${i}: ${err.message}`))).on("end", () => resolve()).run();
+    });
+    splitPromises.push(extractPromise);
+  }
+  await Promise.all(splitPromises);
+  return segments;
+}
+async function autoSplitAudio(audioPath, outputDir, config = {}) {
+  const mergedConfig = { ...DEFAULT_SPLIT_CONFIG, ...config };
+  const totalDuration = await getAudioDuration(audioPath);
+  if (totalDuration <= mergedConfig.maxDurationSec) {
+    return [
+      {
+        index: 0,
+        startSec: 0,
+        endSec: totalDuration,
+        durationSec: totalDuration,
+        outputPath: audioPath
+      }
+    ];
+  }
+  const silenceRegions = await detectSilenceRegions(audioPath, mergedConfig);
+  const splitPoints = findOptimalSplitPoints(silenceRegions, totalDuration, mergedConfig);
+  const baseName = path2.basename(audioPath, path2.extname(audioPath));
+  const segments = await splitAudioAtPoints(
+    audioPath,
+    splitPoints,
+    totalDuration,
+    outputDir,
+    baseName
+  );
+  return segments;
+}
+
+// src/utils/audio/merge-results.ts
+function mergeTranscriptionResults(results, segments) {
+  if (results.length === 0) {
+    return {
+      text: "",
+      error: "No results to merge"
+    };
+  }
+  if (results.length === 1) {
+    return {
+      ...results[0],
+      totalSegments: 1
+    };
+  }
+  const errors = results.map((r, i) => r.error ? `Segment ${i}: ${r.error}` : null).filter(Boolean);
+  if (errors.length > 0) {
+    return {
+      text: "",
+      error: `Errors in segments: ${errors.join("; ")}`
+    };
+  }
+  const mergedText = results.map((r) => r.text.trim()).join("\n\n");
+  const mergedWords = [];
+  for (let i = 0; i < results.length; i++) {
+    const result = results[i];
+    const segment = segments[i];
+    const words = result.words || result.rawResponse?.words || [];
+    for (const word of words) {
+      mergedWords.push({
+        word: word.word || word.text,
+        start: (word.start || 0) + segment.startSec,
+        end: (word.end || 0) + segment.startSec,
+        confidence: word.confidence,
+        speaker: word.speaker
+      });
+    }
+  }
+  const totalDuration = segments.reduce((sum, seg) => sum + seg.durationSec, 0);
+  const segmentMeta = results.map((r, i) => ({
+    index: i,
+    startSec: segments[i].startSec,
+    endSec: segments[i].endSec,
+    text: r.text.trim()
+  }));
+  const mergedRawResponse = {
+    merged: true,
+    segmentCount: results.length,
+    segments: results.map((r, i) => ({
+      index: i,
+      startSec: segments[i].startSec,
+      rawResponse: r.rawResponse
+    })),
+    words: mergedWords
+  };
+  const firstResult = results[0];
+  return {
+    text: mergedText,
+    words: mergedWords,
+    duration: totalDuration,
+    language: firstResult.language,
+    model: firstResult.model,
+    rawResponse: mergedRawResponse,
+    segments: segmentMeta,
+    totalSegments: results.length
+  };
+}
+
+// src/utils/transcription/transcribe.ts
+function createMistralTranscriber(config) {
+  const provider = new MistralProvider();
+  const defaultModel = config.model || "voxtral-mini-latest";
+  return {
+    /**
+     * Transcribe audio with auto-splitting for long files (file path only)
+     * Diarization and word timestamps enabled by default
+     */
+    async transcribe(options) {
+      const {
+        audioPath,
+        audioBuffer,
+        mimeType,
+        audioUrl,
+        language,
+        model = defaultModel,
+        diarize = true,
+        timestamps = language ? void 0 : "word",
+        autoSplit = true,
+        splitOutputDir
+      } = options;
+      if (audioUrl || audioBuffer) {
+        const result = await provider.transcribe({
+          audioUrl,
+          audioBuffer,
+          mimeType,
+          apiKey: config.apiKey,
+          model,
+          language,
+          diarize,
+          timestampGranularity: timestamps
+        });
+        return { ...result, totalSegments: 1 };
+      }
+      if (!audioPath) {
+        return { text: "", error: "No audio input provided (audioPath, audioBuffer, or audioUrl required)" };
+      }
+      const duration = await getAudioDuration(audioPath);
+      const needsSplit = autoSplit && duration > VOXTRAL_LIMITS.recommendedMaxDurationSec;
+      if (!needsSplit) {
+        const result = await provider.transcribe({
+          audioPath,
+          apiKey: config.apiKey,
+          model,
+          language,
+          diarize,
+          timestampGranularity: timestamps
+        });
+        return { ...result, totalSegments: 1 };
+      }
+      const outDir = splitOutputDir || path3.join(os.tmpdir(), `tranz-split-${Date.now()}`);
+      fs3.mkdirSync(outDir, { recursive: true });
+      const segments = await autoSplitAudio(audioPath, outDir, {
+        maxDurationSec: VOXTRAL_LIMITS.recommendedMaxDurationSec
+      });
+      const results = [];
+      for (const segment of segments) {
+        const result = await provider.transcribe({
+          audioPath: segment.outputPath,
+          apiKey: config.apiKey,
+          model,
+          language,
+          diarize,
+          timestampGranularity: timestamps
+        });
+        results.push(result);
+      }
+      return mergeTranscriptionResults(results, segments);
+    }
+  };
+}
+var transcribe = createMistralTranscriber;
+export {
+  GreenPTProvider,
+  MistralProvider,
+  VOXTRAL_LIMITS,
+  WhisperProvider,
+  createMistralTranscriber,
+  createProvider,
+  transcribe
+};
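The bundle above wires the whole pipeline together: `createMistralTranscriber` probes the file with ffprobe, auto-splits anything longer than `VOXTRAL_LIMITS.recommendedMaxDurationSec` (30 minutes) at detected silences, transcribes each segment via the Mistral API, and merges the per-segment results. A minimal sketch of calling that entry point, assuming the `@wovin/tranz/providers` subpath export and an environment variable name chosen here for illustration:

```ts
// Sketch of the auto-split flow implemented above. The import specifier and the
// MISTRAL_API_KEY variable name are illustrative assumptions; the option names
// (audioPath, autoSplit, splitOutputDir) mirror the destructuring in
// createMistralTranscriber.
import { createMistralTranscriber } from '@wovin/tranz/providers'

async function main(): Promise<void> {
  const transcriber = createMistralTranscriber({
    apiKey: process.env.MISTRAL_API_KEY ?? '',
    model: 'voxtral-mini-latest', // same default the bundle falls back to
  })

  // Anything longer than 30 minutes is split at silences, transcribed segment by
  // segment, then merged with word timestamps re-offset to the original timeline.
  const result = await transcriber.transcribe({
    audioPath: './long-interview.wav',
    autoSplit: true,                  // default in the bundle
    splitOutputDir: './tmp/segments', // optional; defaults to a temp dir
  })

  console.log(`segments: ${result.totalSegments}`)
  console.log(result.text)
}

main().catch(console.error)
```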

package/dist/utils/audio/index.d.ts.map
@@ -0,0 +1 @@
+{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/utils/audio/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,cAAc,YAAY,CAAA;AAC1B,cAAc,oBAAoB,CAAA"}
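The source map above covers the `utils/audio` index, which re-exports the split and merge helpers seen in the bundle. A sketch of driving those helpers directly, mirroring what `createMistralTranscriber` does internally; the deep import specifier is an assumption based on the dist layout listed at the top of this diff:

```ts
// Sketch of using the lower-level audio helpers directly. The import paths are
// assumptions based on the dist layout listed at the top of this diff; the data
// flow mirrors what createMistralTranscriber does internally.
import { autoSplitAudio, mergeTranscriptionResults } from '@wovin/tranz/dist/utils/audio/index.js'
import { MistralProvider, type TranscriptionResult } from '@wovin/tranz/providers'

async function transcribeInSegments(audioPath: string, apiKey: string) {
  // Split at silences into segments of at most 10 minutes (the DEFAULT_SPLIT_CONFIG value).
  const segments = await autoSplitAudio(audioPath, './tmp/segments', { maxDurationSec: 600 })

  const provider = new MistralProvider()
  const results: TranscriptionResult[] = []
  for (const segment of segments) {
    results.push(await provider.transcribe({ audioPath: segment.outputPath, apiKey }))
  }

  // Concatenates per-segment text and re-offsets each word's start/end by the
  // segment's startSec so timestamps are relative to the original file.
  return mergeTranscriptionResults(results, segments)
}
```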