@wovin/tranz 0.0.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
+ /**
+  * @wovin/tranz/providers - Transcription provider implementations
+  */
+ export { createProvider, MistralProvider, WhisperProvider, GreenPTProvider, VOXTRAL_LIMITS, type ProviderName, type TranscribeParams, type TranscriptionResult, type TranscriptionProvider, } from './utils/transcription/providers.js';
+ export { createMistralTranscriber, transcribe, type TranscribeOptions, type MistralTranscriberConfig, } from './utils/transcription/transcribe.js';
+ //# sourceMappingURL=providers.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"providers.d.ts","sourceRoot":"","sources":["../src/providers.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EACL,cAAc,EACd,eAAe,EACf,eAAe,EACf,eAAe,EACf,cAAc,EACd,KAAK,YAAY,EACjB,KAAK,gBAAgB,EACrB,KAAK,mBAAmB,EACxB,KAAK,qBAAqB,GAC3B,MAAM,oCAAoC,CAAA;AAE3C,OAAO,EACL,wBAAwB,EACxB,UAAU,EACV,KAAK,iBAAiB,EACtB,KAAK,wBAAwB,GAC9B,MAAM,qCAAqC,CAAA"}
@@ -0,0 +1,681 @@
+ // src/utils/transcription/providers.ts
+ import { spawn } from "child_process";
+ import * as fs from "fs";
+
+ // src/utils/file-utils.ts
+ import path from "path";
+ var getName = (filePath) => {
+   return getFileInfo(filePath).name;
+ };
+ var getFileInfo = (filePath) => {
+   const normed = path.normalize(filePath);
+   return path.parse(normed);
+ };
+
+ // src/utils/transcription/providers.ts
+ import { pipeline } from "stream";
+ import { promisify } from "util";
+
+ // src/utils/transcription/mime-detection.ts
+ function detectAudioMimeType(buffer) {
+   if (buffer.length < 4) return "audio/ogg";
+   let offset = 0;
+   if (buffer[0] === 73 && buffer[1] === 68 && buffer[2] === 51) {
+     if (buffer.length >= 10) {
+       const size = (buffer[6] & 127) << 21 | (buffer[7] & 127) << 14 | (buffer[8] & 127) << 7 | buffer[9] & 127;
+       offset = 10 + size;
+       if (offset >= buffer.length) offset = 0;
+     }
+   }
+   if (buffer.length - offset >= 4) {
+     if (buffer[offset] === 255 && (buffer[offset + 1] === 251 || buffer[offset + 1] === 250) || buffer[offset] === 255 && buffer[offset + 1] === 243 || buffer[offset] === 255 && buffer[offset + 1] === 242) {
+       return "audio/mpeg";
+     }
+     if (buffer[offset] === 79 && buffer[offset + 1] === 103 && buffer[offset + 2] === 103 && buffer[offset + 3] === 83) {
+       return "audio/ogg";
+     }
+     if (buffer[offset] === 82 && buffer[offset + 1] === 73 && buffer[offset + 2] === 70 && buffer[offset + 3] === 70) {
+       return "audio/wav";
+     }
+     if (buffer[offset] === 102 && buffer[offset + 1] === 76 && buffer[offset + 2] === 97 && buffer[offset + 3] === 67) {
+       return "audio/flac";
+     }
+   }
+   return "audio/ogg";
+ }
+
+ // src/utils/transcription/providers.ts
+ function createProvider(providerName, config) {
+   switch (providerName) {
+     case "whisper":
+       return new WhisperProvider(config);
+     case "mistral":
+       return new MistralProvider();
+     case "greenpt":
+       return new GreenPTProvider();
+     default:
+       throw new Error(`Unknown provider: ${providerName}`);
+   }
+ }
+ var WhisperProvider = class _WhisperProvider {
+   name = "whisper";
+   cacheDir;
+   static DEFAULTS = {
+     DIARIZE: false,
+     SILDUR: "1.3",
+     SILBUF: 0.2,
+     SILTHR: "-35dB",
+     MODEL_KEYS: {
+       tinyd: "ggml-small.en-tdrz.bin",
+       small: "ggml-small.bin",
+       medium: "ggml-medium.bin"
+     },
+     MODELS: {
+       tinyd: "https://huggingface.co/akashmjn/tinydiarize-whisper.cpp/resolve/main/ggml-small.en-tdrz.bin",
+       small: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin",
+       medium: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.bin"
+     }
+   };
+   constructor(config) {
+     this.cacheDir = config?.cacheDir || `${process.env.HOME}/.cache/whisper-models`;
+   }
+   async transcribe(params) {
+     const {
+       audioPath,
+       outputDir = "./out",
+       diarize = _WhisperProvider.DEFAULTS.DIARIZE,
+       modelPath: providedModelPath
+     } = params;
+     let modelPath = providedModelPath;
+     const modelKey = diarize ? "tinyd" : "small";
+     if (!modelPath) {
+       modelPath = await this.ensureRequestedModelIsCached(modelKey);
+     }
+     const sourceFileName = getName(audioPath);
+     const outTransPath = `${outputDir}/${sourceFileName}-transcript`;
+     const tdrz = diarize ? "-tdrz" : "";
+     const args = [
+       tdrz,
+       "-t",
+       "8",
+       "-oj",
+       "-ng",
+       // TODO: consider withGPU option
+       "-f",
+       audioPath,
+       "-m",
+       modelPath,
+       "-of",
+       outTransPath
+     ].filter((arg) => arg !== "");
+     const cmd = `whisper-cli ${args.join(" ")}`;
+     console.log("spawning ", cmd);
+     const whisperThread = spawn(`whisper-cli`, args);
+     return new Promise((resolveFx) => {
+       let whisperOutput = "";
+       const handleOut = (data) => {
+         const str = data.toString();
+         for (const match of ["[", "main:"]) {
+           if (str.startsWith(match) || str.includes("total time"))
+             console.log(str);
+         }
+         whisperOutput += data;
+       };
+       whisperThread.stdout.on("data", handleOut);
+       whisperThread.stderr.on("data", handleOut);
+       whisperThread.on("close", (code) => {
+         try {
+           const trans = JSON.parse(
+             fs.readFileSync(`${outTransPath}.json`).toString()
+           );
+           const transcriptionArray = trans.result?.transcription || [];
+           const text = transcriptionArray.map((entry) => entry.text).filter((t) => t).join(" ").trim();
+           resolveFx({
+             text: text || "",
+             rawResponse: trans
+           });
+         } catch (error) {
+           const errorMessage = error instanceof Error ? error.message : String(error);
+           resolveFx({
+             text: "",
+             error: `Failed to parse transcription result: ${errorMessage}`,
+             rawResponse: void 0
+           });
+         }
+       });
+       whisperThread.on("error", (err) => {
+         console.error("Whisper Error", { err, outTransPath, args });
+         resolveFx({
+           text: "",
+           error: `Whisper process error: ${err.message}`
+         });
+       });
+     }).catch((whisperError) => {
+       console.error("Uncaught Whisper Error", whisperError);
+       return {
+         text: "",
+         error: `Uncaught error: ${whisperError instanceof Error ? whisperError.message : String(whisperError)}`
+       };
+     });
+   }
+   async ensureRequestedModelIsCached(modelKey) {
+     if (!_WhisperProvider.DEFAULTS.MODEL_KEYS[modelKey])
+       throw new Error(`${modelKey} not known`);
+     const cachedModelsDirPath = `${this.cacheDir}/models`;
+     if (!fs.existsSync(cachedModelsDirPath)) {
+       fs.mkdirSync(cachedModelsDirPath, { recursive: true });
+     }
+     const modelPath = `${cachedModelsDirPath}/${_WhisperProvider.DEFAULTS.MODEL_KEYS[modelKey]}`;
+     const isModelExisting = fs.existsSync(modelPath);
+     if (!isModelExisting) {
+       const srcURL = _WhisperProvider.DEFAULTS.MODELS[modelKey];
+       console.log(`
+ requested model is missing
+ Fetching ${srcURL} into ${modelPath}
+ `);
+       const data = await fetch(srcURL);
+       if (!data?.body) throw new Error("fetch failed");
+       const streamPipeline = promisify(pipeline);
+       await streamPipeline(
+         data.body,
+         fs.createWriteStream(modelPath)
+       );
+     } else {
+       console.log(`Found ${modelPath}
+ `);
+     }
+     return modelPath;
+   }
+ };
+ var VOXTRAL_LIMITS = {
+   /** Maximum audio duration in seconds (3 hours for Voxtral Transcribe 2) */
+   maxAudioDurationSec: 3 * 60 * 60,
+   // 10800 seconds = 3 hours
+   /** Recommended max duration before splitting (for reliability) */
+   recommendedMaxDurationSec: 30 * 60,
+   // 30 minutes
+   /** Maximum context biasing words/phrases */
+   maxContextBiasingTerms: 100,
+   /** Maximum file size in bytes (1GB) */
+   maxFileSizeBytes: 1024 * 1024 * 1024
+ };
+ var MistralProvider = class {
+   name = "mistral";
+   maxAudioDurationSec = VOXTRAL_LIMITS.maxAudioDurationSec;
+   /**
+    * Check if audio duration exceeds recommended limits
+    */
+   static shouldSplit(durationSec) {
+     return durationSec > VOXTRAL_LIMITS.recommendedMaxDurationSec;
+   }
+   /**
+    * Get the recommended max segment duration for splitting
+    */
+   static getRecommendedMaxSegment() {
+     return VOXTRAL_LIMITS.recommendedMaxDurationSec;
+   }
+   async transcribe(params) {
+     const formData = new FormData();
+     if (params.audioUrl) {
+       formData.append("file_url", params.audioUrl);
+     } else {
+       let audioBuffer;
+       let mimeType;
+       if (params.audioBuffer) {
+         audioBuffer = params.audioBuffer;
+         mimeType = params.mimeType || detectAudioMimeType(audioBuffer);
+       } else if (params.audioPath) {
+         audioBuffer = fs.readFileSync(params.audioPath);
+         mimeType = detectAudioMimeType(audioBuffer);
+       } else {
+         return { text: "", error: "No audio input provided (audioPath, audioBuffer, or audioUrl required)" };
+       }
+       const extension = mimeType === "audio/mpeg" ? "mp3" : mimeType === "audio/wav" ? "wav" : mimeType === "audio/flac" ? "flac" : "ogg";
+       const audioBlob = new Blob([new Uint8Array(audioBuffer)], { type: mimeType });
+       formData.append("file", audioBlob, `audio.${extension}`);
+     }
+     const model = params.model || "voxtral-mini-latest";
+     formData.append("model", model);
+     if (params.language) {
+       formData.append("language", params.language);
+     }
+     const timestampGranularity = params.timestampGranularity ?? (params.language ? void 0 : "word");
+     if (timestampGranularity) {
+       formData.append("timestamp_granularities", timestampGranularity);
+     }
+     const diarize = params.diarize ?? true;
+     if (diarize) {
+       formData.append("diarize", "true");
+     }
+     const response = await fetch("https://api.mistral.ai/v1/audio/transcriptions", {
+       method: "POST",
+       headers: {
+         "Authorization": `Bearer ${params.apiKey}`
+       },
+       body: formData
+     });
+     if (!response.ok) {
+       const errorText = await response.text();
+       return { text: "", error: `API returned ${response.status}: ${errorText}` };
+     }
+     const result = await response.json();
+     if (!result?.text) {
+       return { text: "", error: "No transcription returned", rawResponse: result };
+     }
+     const words = result.words || result.segments?.flatMap((seg) => seg.words || []);
+     const duration = result.usage?.prompt_audio_seconds;
+     return {
+       text: result.text,
+       language: result.language ?? params.language,
+       model: result.model,
+       duration,
+       words,
+       rawResponse: result
+     };
+   }
+ };
+ var GreenPTProvider = class {
+   name = "greenpt";
+   async transcribe(params) {
+     if (!params.apiKey) {
+       return { text: "", error: "API key is required for GreenPT provider" };
+     }
+     if (!params.audioPath) {
+       return { text: "", error: "Audio path is required" };
+     }
+     try {
+       const audioBuffer = fs.readFileSync(params.audioPath);
+       const mimeType = detectAudioMimeType(audioBuffer);
+       const formData = new FormData();
+       const extension = mimeType === "audio/mpeg" ? "mp3" : mimeType === "audio/wav" ? "wav" : mimeType === "audio/flac" ? "flac" : "ogg";
+       const audioBlob = new Blob([new Uint8Array(audioBuffer)], { type: mimeType });
+       formData.append("file", audioBlob, `audio.${extension}`);
+       const queryParams = new URLSearchParams();
+       const model = params.model || "green-s-pro";
+       queryParams.append("model", model);
+       if (params.language) {
+         queryParams.append("language", params.language);
+       }
+       if (params.diarize !== void 0) {
+         queryParams.append("diarize", String(params.diarize));
+       }
+       queryParams.append("punctuate", "true");
+       const url = `https://api.greenpt.ai/v1/listen?${queryParams.toString()}`;
+       const response = await fetch(url, {
+         method: "POST",
+         headers: {
+           "Authorization": `Bearer ${params.apiKey}`
+         },
+         body: formData
+       });
+       if (!response.ok) {
+         const errorText = await response.text();
+         return {
+           text: "",
+           error: `API returned ${response.status}: ${errorText}`
+         };
+       }
+       const result = await response.json();
+       const transcript = result?.results?.channels?.[0]?.alternatives?.[0];
+       if (!transcript) {
+         return {
+           text: "",
+           error: "No transcription returned",
+           rawResponse: result
+         };
+       }
+       let text = transcript.transcript;
+       if (!text && transcript.words && transcript.words.length > 0) {
+         text = transcript.words.map((w) => w.word).join(" ");
+       }
+       return {
+         text: text || "",
+         confidence: transcript.confidence,
+         words: transcript.words,
+         duration: result?.metadata?.duration,
+         rawResponse: result
+       };
+     } catch (error) {
+       const errorMessage = error instanceof Error ? error.message : String(error);
+       return {
+         text: "",
+         error: `Transcription failed: ${errorMessage}`
+       };
+     }
+   }
+ };
+
+ // src/utils/transcription/transcribe.ts
+ import * as fs3 from "fs";
+ import * as os from "os";
+ import * as path3 from "path";
+
+ // src/utils/audio/split.ts
+ import ffmpeg from "fluent-ffmpeg";
+ import * as fs2 from "fs";
+ import path2 from "path";
+ import { spawn as spawn2 } from "child_process";
+ var DEFAULT_SPLIT_CONFIG = {
+   maxDurationSec: 600,
+   // 10 minutes
+   minSilenceDurSec: 1,
+   silenceThreshold: "-35dB",
+   preferLongerSilence: true,
+   silenceBuffer: 0.2
+ };
+ async function getAudioDuration(audioPath) {
+   return new Promise((resolve, reject) => {
+     ffmpeg.ffprobe(audioPath, (err, metadata) => {
+       if (err) {
+         reject(new Error(`Failed to probe audio: ${err.message}`));
+         return;
+       }
+       const duration = metadata.format.duration;
+       if (typeof duration !== "number") {
+         reject(new Error("Could not determine audio duration"));
+         return;
+       }
+       resolve(duration);
+     });
+   });
+ }
+ async function detectSilenceRegions(audioPath, config = {}) {
+   const { minSilenceDurSec, silenceThreshold } = { ...DEFAULT_SPLIT_CONFIG, ...config };
+   return new Promise((resolve, reject) => {
+     const silenceRegions = [];
+     const args = [
+       "-i",
+       audioPath,
+       "-af",
+       `silencedetect=n=${silenceThreshold}:d=${minSilenceDurSec}`,
+       "-f",
+       "wav",
+       "-ac",
+       "1",
+       "-ar",
+       "8000",
+       "pipe:1"
+     ];
+     const proc = spawn2("ffmpeg", args);
+     proc.stdout.on("data", () => {
+     });
+     proc.stderr.on("data", (data) => {
+       const lines = data.toString().split("\n");
+       for (const line of lines) {
+         if (line.includes("silence_end:")) {
+           const match = line.match(/silence_end:\s*([\d.]+)\s*\|\s*silence_duration:\s*([\d.]+)/);
+           if (match) {
+             const endSec = parseFloat(match[1]);
+             const durationSec = parseFloat(match[2]);
+             if (!isNaN(endSec) && !isNaN(durationSec)) {
+               silenceRegions.push({
+                 startSec: endSec - durationSec,
+                 endSec,
+                 durationSec
+               });
+             }
+           }
+         }
+       }
+     });
+     proc.on("close", (code) => {
+       if (code === 0 || silenceRegions.length > 0) {
+         resolve(silenceRegions);
+       } else {
+         reject(new Error(`FFmpeg exited with code ${code}`));
+       }
+     });
+     proc.on("error", (err) => {
+       reject(new Error(`Silence detection failed: ${err.message}`));
+     });
+   });
+ }
+ function findOptimalSplitPoints(silenceRegions, totalDuration, config = {}) {
+   const { maxDurationSec, preferLongerSilence, silenceBuffer } = {
+     ...DEFAULT_SPLIT_CONFIG,
+     ...config
+   };
+   if (totalDuration <= maxDurationSec) {
+     return [];
+   }
+   const numSegments = Math.ceil(totalDuration / maxDurationSec);
+   const idealSegmentDuration = totalDuration / numSegments;
+   const splitPoints = [];
+   for (let i = 1; i < numSegments; i++) {
+     const idealSplitTime = idealSegmentDuration * i;
+     const windowSize = idealSegmentDuration * 0.3;
+     const windowStart = idealSplitTime - windowSize;
+     const windowEnd = idealSplitTime + windowSize;
+     const candidateSilences = silenceRegions.filter((silence) => {
+       const silenceMid = (silence.startSec + silence.endSec) / 2;
+       return silenceMid >= windowStart && silenceMid <= windowEnd;
+     });
+     let bestSplitPoint;
+     if (candidateSilences.length > 0) {
+       let bestScore = -Infinity;
+       let bestSilence = candidateSilences[0];
+       for (const silence of candidateSilences) {
+         const silenceMid = (silence.startSec + silence.endSec) / 2;
+         const proximityScore = 1 - Math.abs(silenceMid - idealSplitTime) / windowSize;
+         const score = preferLongerSilence ? silence.durationSec * proximityScore : proximityScore;
+         if (score > bestScore) {
+           bestScore = score;
+           bestSilence = silence;
+         }
+       }
+       bestSplitPoint = {
+         timeSec: (bestSilence.startSec + bestSilence.endSec) / 2,
+         silenceDuration: bestSilence.durationSec
+       };
+     } else {
+       bestSplitPoint = {
+         timeSec: idealSplitTime,
+         silenceDuration: 0
+       };
+     }
+     splitPoints.push(bestSplitPoint);
+   }
+   return splitPoints.sort((a, b) => a.timeSec - b.timeSec);
+ }
+ async function splitAudioAtPoints(audioPath, splitPoints, totalDuration, outputDir, baseName) {
+   fs2.mkdirSync(outputDir, { recursive: true });
+   const segments = [];
+   const boundaries = [0, ...splitPoints.map((sp) => sp.timeSec), totalDuration];
+   const splitPromises = [];
+   for (let i = 0; i < boundaries.length - 1; i++) {
+     const startSec = boundaries[i];
+     const endSec = boundaries[i + 1];
+     const durationSec = endSec - startSec;
+     const outputPath = path2.join(outputDir, `${baseName}-segment-${i.toString().padStart(3, "0")}.wav`);
+     const segment = {
+       index: i,
+       startSec,
+       endSec,
+       durationSec,
+       outputPath
+     };
+     segments.push(segment);
+     const extractPromise = new Promise((resolve, reject) => {
+       ffmpeg(audioPath).setStartTime(startSec).setDuration(durationSec).audioFrequency(16e3).outputOptions(["-ac 1", "-c:a pcm_s16le"]).output(outputPath).on("error", (err) => reject(new Error(`Failed to extract segment ${i}: ${err.message}`))).on("end", () => resolve()).run();
+     });
+     splitPromises.push(extractPromise);
+   }
+   await Promise.all(splitPromises);
+   return segments;
+ }
+ async function autoSplitAudio(audioPath, outputDir, config = {}) {
+   const mergedConfig = { ...DEFAULT_SPLIT_CONFIG, ...config };
+   const totalDuration = await getAudioDuration(audioPath);
+   if (totalDuration <= mergedConfig.maxDurationSec) {
+     return [
+       {
+         index: 0,
+         startSec: 0,
+         endSec: totalDuration,
+         durationSec: totalDuration,
+         outputPath: audioPath
+       }
+     ];
+   }
+   const silenceRegions = await detectSilenceRegions(audioPath, mergedConfig);
+   const splitPoints = findOptimalSplitPoints(silenceRegions, totalDuration, mergedConfig);
+   const baseName = path2.basename(audioPath, path2.extname(audioPath));
+   const segments = await splitAudioAtPoints(
+     audioPath,
+     splitPoints,
+     totalDuration,
+     outputDir,
+     baseName
+   );
+   return segments;
+ }
+
+ // src/utils/audio/merge-results.ts
+ function mergeTranscriptionResults(results, segments) {
+   if (results.length === 0) {
+     return {
+       text: "",
+       error: "No results to merge"
+     };
+   }
+   if (results.length === 1) {
+     return {
+       ...results[0],
+       totalSegments: 1
+     };
+   }
+   const errors = results.map((r, i) => r.error ? `Segment ${i}: ${r.error}` : null).filter(Boolean);
+   if (errors.length > 0) {
+     return {
+       text: "",
+       error: `Errors in segments: ${errors.join("; ")}`
+     };
+   }
+   const mergedText = results.map((r) => r.text.trim()).join("\n\n");
+   const mergedWords = [];
+   for (let i = 0; i < results.length; i++) {
+     const result = results[i];
+     const segment = segments[i];
+     const words = result.words || result.rawResponse?.words || [];
+     for (const word of words) {
+       mergedWords.push({
+         word: word.word || word.text,
+         start: (word.start || 0) + segment.startSec,
+         end: (word.end || 0) + segment.startSec,
+         confidence: word.confidence,
+         speaker: word.speaker
+       });
+     }
+   }
+   const totalDuration = segments.reduce((sum, seg) => sum + seg.durationSec, 0);
+   const segmentMeta = results.map((r, i) => ({
+     index: i,
+     startSec: segments[i].startSec,
+     endSec: segments[i].endSec,
+     text: r.text.trim()
+   }));
+   const mergedRawResponse = {
+     merged: true,
+     segmentCount: results.length,
+     segments: results.map((r, i) => ({
+       index: i,
+       startSec: segments[i].startSec,
+       rawResponse: r.rawResponse
+     })),
+     words: mergedWords
+   };
+   const firstResult = results[0];
+   return {
+     text: mergedText,
+     words: mergedWords,
+     duration: totalDuration,
+     language: firstResult.language,
+     model: firstResult.model,
+     rawResponse: mergedRawResponse,
+     segments: segmentMeta,
+     totalSegments: results.length
+   };
+ }
+
+ // src/utils/transcription/transcribe.ts
+ function createMistralTranscriber(config) {
+   const provider = new MistralProvider();
+   const defaultModel = config.model || "voxtral-mini-latest";
+   return {
+     /**
+      * Transcribe audio with auto-splitting for long files (file path only)
+      * Diarization and word timestamps enabled by default
+      */
+     async transcribe(options) {
+       const {
+         audioPath,
+         audioBuffer,
+         mimeType,
+         audioUrl,
+         language,
+         model = defaultModel,
+         diarize = true,
+         timestamps = language ? void 0 : "word",
+         autoSplit = true,
+         splitOutputDir
+       } = options;
+       if (audioUrl || audioBuffer) {
+         const result = await provider.transcribe({
+           audioUrl,
+           audioBuffer,
+           mimeType,
+           apiKey: config.apiKey,
+           model,
+           language,
+           diarize,
+           timestampGranularity: timestamps
+         });
+         return { ...result, totalSegments: 1 };
+       }
+       if (!audioPath) {
+         return { text: "", error: "No audio input provided (audioPath, audioBuffer, or audioUrl required)" };
+       }
+       const duration = await getAudioDuration(audioPath);
+       const needsSplit = autoSplit && duration > VOXTRAL_LIMITS.recommendedMaxDurationSec;
+       if (!needsSplit) {
+         const result = await provider.transcribe({
+           audioPath,
+           apiKey: config.apiKey,
+           model,
+           language,
+           diarize,
+           timestampGranularity: timestamps
+         });
+         return { ...result, totalSegments: 1 };
+       }
+       const outDir = splitOutputDir || path3.join(os.tmpdir(), `tranz-split-${Date.now()}`);
+       fs3.mkdirSync(outDir, { recursive: true });
+       const segments = await autoSplitAudio(audioPath, outDir, {
+         maxDurationSec: VOXTRAL_LIMITS.recommendedMaxDurationSec
+       });
+       const results = [];
+       for (const segment of segments) {
+         const result = await provider.transcribe({
+           audioPath: segment.outputPath,
+           apiKey: config.apiKey,
+           model,
+           language,
+           diarize,
+           timestampGranularity: timestamps
+         });
+         results.push(result);
+       }
+       return mergeTranscriptionResults(results, segments);
+     }
+   };
+ }
+ var transcribe = createMistralTranscriber;
+ export {
+   GreenPTProvider,
+   MistralProvider,
+   VOXTRAL_LIMITS,
+   WhisperProvider,
+   createMistralTranscriber,
+   createProvider,
+   transcribe
+ };
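
The numeric comparisons in detectAudioMimeType above correspond to well-known container signatures. Below is a minimal TypeScript sketch restating those checks with the magic numbers written out; the helper name sniffAudioMime is illustrative only and, unlike the bundled function, it assumes any leading ID3v2 tag has already been skipped.

// Restates the signature checks from detectAudioMimeType:
// 73 68 51              -> "ID3"  (ID3v2 tag before MP3 frames; its syncsafe size is skipped)
// 255 + 251/250/243/242 -> 0xFF 0xFB/0xFA/0xF3/0xF2 (MPEG audio frame sync)
// 79 103 103 83         -> "OggS" (Ogg container)
// 82 73 70 70           -> "RIFF" (WAV container)
// 102 76 97 67          -> "fLaC" (FLAC stream marker)
const SIGNATURES: Array<{ bytes: number[]; mime: string }> = [
  { bytes: [0x4f, 0x67, 0x67, 0x53], mime: "audio/ogg" },  // "OggS"
  { bytes: [0x52, 0x49, 0x46, 0x46], mime: "audio/wav" },  // "RIFF"
  { bytes: [0x66, 0x4c, 0x61, 0x43], mime: "audio/flac" }, // "fLaC"
];

function sniffAudioMime(buffer: Uint8Array, offset = 0): string {
  // MPEG frame sync: 0xFF followed by 0xFB, 0xFA, 0xF3 or 0xF2
  if (buffer[offset] === 0xff && [0xfb, 0xfa, 0xf3, 0xf2].includes(buffer[offset + 1])) {
    return "audio/mpeg";
  }
  for (const { bytes, mime } of SIGNATURES) {
    if (bytes.every((value, i) => buffer[offset + i] === value)) return mime;
  }
  return "audio/ogg"; // same fallback the package uses
}
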
@@ -0,0 +1,6 @@
+ /**
+  * Audio utilities for tranz-cli
+  */
+ export * from './split.js';
+ export * from './merge-results.js';
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/utils/audio/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,cAAc,YAAY,CAAA;AAC1B,cAAc,oBAAoB,CAAA"}