@wovin/tranz 0.0.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,302 @@
+ // src/utils/audio/split.ts
+ import ffmpeg from "fluent-ffmpeg";
+ import * as fs from "fs";
+ import path from "path";
+ import { spawn } from "child_process";
+ var DEFAULT_SPLIT_CONFIG = {
+   maxDurationSec: 600,
+   // 10 minutes
+   minSilenceDurSec: 1,
+   silenceThreshold: "-35dB",
+   preferLongerSilence: true,
+   silenceBuffer: 0.2
+ };
+ async function getAudioDuration(audioPath) {
+   return new Promise((resolve, reject) => {
+     ffmpeg.ffprobe(audioPath, (err, metadata) => {
+       if (err) {
+         reject(new Error(`Failed to probe audio: ${err.message}`));
+         return;
+       }
+       const duration = metadata.format.duration;
+       if (typeof duration !== "number") {
+         reject(new Error("Could not determine audio duration"));
+         return;
+       }
+       resolve(duration);
+     });
+   });
+ }
+ async function detectSilenceRegions(audioPath, config = {}) {
+   const { minSilenceDurSec, silenceThreshold } = { ...DEFAULT_SPLIT_CONFIG, ...config };
+   return new Promise((resolve, reject) => {
+     const silenceRegions = [];
+     const args = [
+       "-i",
+       audioPath,
+       "-af",
+       `silencedetect=n=${silenceThreshold}:d=${minSilenceDurSec}`,
+       "-f",
+       "wav",
+       "-ac",
+       "1",
+       "-ar",
+       "8000",
+       "pipe:1"
+     ];
+     const proc = spawn("ffmpeg", args);
+     proc.stdout.on("data", () => {
+     });
+     proc.stderr.on("data", (data) => {
+       const lines = data.toString().split("\n");
+       for (const line of lines) {
+         if (line.includes("silence_end:")) {
+           const match = line.match(/silence_end:\s*([\d.]+)\s*\|\s*silence_duration:\s*([\d.]+)/);
+           if (match) {
+             const endSec = parseFloat(match[1]);
+             const durationSec = parseFloat(match[2]);
+             if (!isNaN(endSec) && !isNaN(durationSec)) {
+               silenceRegions.push({
+                 startSec: endSec - durationSec,
+                 endSec,
+                 durationSec
+               });
+             }
+           }
+         }
+       }
+     });
+     proc.on("close", (code) => {
+       if (code === 0 || silenceRegions.length > 0) {
+         resolve(silenceRegions);
+       } else {
+         reject(new Error(`FFmpeg exited with code ${code}`));
+       }
+     });
+     proc.on("error", (err) => {
+       reject(new Error(`Silence detection failed: ${err.message}`));
+     });
+   });
+ }
+ function findOptimalSplitPoints(silenceRegions, totalDuration, config = {}) {
+   const { maxDurationSec, preferLongerSilence, silenceBuffer } = {
+     ...DEFAULT_SPLIT_CONFIG,
+     ...config
+   };
+   if (totalDuration <= maxDurationSec) {
+     return [];
+   }
+   const numSegments = Math.ceil(totalDuration / maxDurationSec);
+   const idealSegmentDuration = totalDuration / numSegments;
+   const splitPoints = [];
+   for (let i = 1; i < numSegments; i++) {
+     const idealSplitTime = idealSegmentDuration * i;
+     const windowSize = idealSegmentDuration * 0.3;
+     const windowStart = idealSplitTime - windowSize;
+     const windowEnd = idealSplitTime + windowSize;
+     const candidateSilences = silenceRegions.filter((silence) => {
+       const silenceMid = (silence.startSec + silence.endSec) / 2;
+       return silenceMid >= windowStart && silenceMid <= windowEnd;
+     });
+     let bestSplitPoint;
+     if (candidateSilences.length > 0) {
+       let bestScore = -Infinity;
+       let bestSilence = candidateSilences[0];
+       for (const silence of candidateSilences) {
+         const silenceMid = (silence.startSec + silence.endSec) / 2;
+         const proximityScore = 1 - Math.abs(silenceMid - idealSplitTime) / windowSize;
+         const score = preferLongerSilence ? silence.durationSec * proximityScore : proximityScore;
+         if (score > bestScore) {
+           bestScore = score;
+           bestSilence = silence;
+         }
+       }
+       bestSplitPoint = {
+         timeSec: (bestSilence.startSec + bestSilence.endSec) / 2,
+         silenceDuration: bestSilence.durationSec
+       };
+     } else {
+       bestSplitPoint = {
+         timeSec: idealSplitTime,
+         silenceDuration: 0
+       };
+     }
+     splitPoints.push(bestSplitPoint);
+   }
+   return splitPoints.sort((a, b) => a.timeSec - b.timeSec);
+ }
+ async function splitAudioAtPoints(audioPath, splitPoints, totalDuration, outputDir, baseName) {
+   fs.mkdirSync(outputDir, { recursive: true });
+   const segments = [];
+   const boundaries = [0, ...splitPoints.map((sp) => sp.timeSec), totalDuration];
+   const splitPromises = [];
+   for (let i = 0; i < boundaries.length - 1; i++) {
+     const startSec = boundaries[i];
+     const endSec = boundaries[i + 1];
+     const durationSec = endSec - startSec;
+     const outputPath = path.join(outputDir, `${baseName}-segment-${i.toString().padStart(3, "0")}.wav`);
+     const segment = {
+       index: i,
+       startSec,
+       endSec,
+       durationSec,
+       outputPath
+     };
+     segments.push(segment);
+     const extractPromise = new Promise((resolve, reject) => {
+       ffmpeg(audioPath).setStartTime(startSec).setDuration(durationSec).audioFrequency(16e3).outputOptions(["-ac 1", "-c:a pcm_s16le"]).output(outputPath).on("error", (err) => reject(new Error(`Failed to extract segment ${i}: ${err.message}`))).on("end", () => resolve()).run();
+     });
+     splitPromises.push(extractPromise);
+   }
+   await Promise.all(splitPromises);
+   return segments;
+ }
+ async function autoSplitAudio(audioPath, outputDir, config = {}) {
+   const mergedConfig = { ...DEFAULT_SPLIT_CONFIG, ...config };
+   const totalDuration = await getAudioDuration(audioPath);
+   if (totalDuration <= mergedConfig.maxDurationSec) {
+     return [
+       {
+         index: 0,
+         startSec: 0,
+         endSec: totalDuration,
+         durationSec: totalDuration,
+         outputPath: audioPath
+       }
+     ];
+   }
+   const silenceRegions = await detectSilenceRegions(audioPath, mergedConfig);
+   const splitPoints = findOptimalSplitPoints(silenceRegions, totalDuration, mergedConfig);
+   const baseName = path.basename(audioPath, path.extname(audioPath));
+   const segments = await splitAudioAtPoints(
+     audioPath,
+     splitPoints,
+     totalDuration,
+     outputDir,
+     baseName
+   );
+   return segments;
+ }
+ async function analyzeSplitPoints(audioPath, config = {}) {
+   const mergedConfig = { ...DEFAULT_SPLIT_CONFIG, ...config };
+   const totalDuration = await getAudioDuration(audioPath);
+   const needsSplit = totalDuration > mergedConfig.maxDurationSec;
+   if (!needsSplit) {
+     return {
+       totalDuration,
+       numSegments: 1,
+       splitPoints: [],
+       silenceRegions: [],
+       needsSplit: false
+     };
+   }
+   const silenceRegions = await detectSilenceRegions(audioPath, mergedConfig);
+   const splitPoints = findOptimalSplitPoints(silenceRegions, totalDuration, mergedConfig);
+   return {
+     totalDuration,
+     numSegments: splitPoints.length + 1,
+     splitPoints,
+     silenceRegions,
+     needsSplit: true
+   };
+ }
+
+ // src/utils/audio/merge-results.ts
+ function mergeTranscriptionResults(results, segments) {
+   if (results.length === 0) {
+     return {
+       text: "",
+       error: "No results to merge"
+     };
+   }
+   if (results.length === 1) {
+     return {
+       ...results[0],
+       totalSegments: 1
+     };
+   }
+   const errors = results.map((r, i) => r.error ? `Segment ${i}: ${r.error}` : null).filter(Boolean);
+   if (errors.length > 0) {
+     return {
+       text: "",
+       error: `Errors in segments: ${errors.join("; ")}`
+     };
+   }
+   const mergedText = results.map((r) => r.text.trim()).join("\n\n");
+   const mergedWords = [];
+   for (let i = 0; i < results.length; i++) {
+     const result = results[i];
+     const segment = segments[i];
+     const words = result.words || result.rawResponse?.words || [];
+     for (const word of words) {
+       mergedWords.push({
+         word: word.word || word.text,
+         start: (word.start || 0) + segment.startSec,
+         end: (word.end || 0) + segment.startSec,
+         confidence: word.confidence,
+         speaker: word.speaker
+       });
+     }
+   }
+   const totalDuration = segments.reduce((sum, seg) => sum + seg.durationSec, 0);
+   const segmentMeta = results.map((r, i) => ({
+     index: i,
+     startSec: segments[i].startSec,
+     endSec: segments[i].endSec,
+     text: r.text.trim()
+   }));
+   const mergedRawResponse = {
+     merged: true,
+     segmentCount: results.length,
+     segments: results.map((r, i) => ({
+       index: i,
+       startSec: segments[i].startSec,
+       rawResponse: r.rawResponse
+     })),
+     words: mergedWords
+   };
+   const firstResult = results[0];
+   return {
+     text: mergedText,
+     words: mergedWords,
+     duration: totalDuration,
+     language: firstResult.language,
+     model: firstResult.model,
+     rawResponse: mergedRawResponse,
+     segments: segmentMeta,
+     totalSegments: results.length
+   };
+ }
+ function formatMergedText(result, includeMarkers = false) {
+   if (!result.segments || result.segments.length <= 1) {
+     return result.text;
+   }
+   if (!includeMarkers) {
+     return result.text;
+   }
+   return result.segments.map((seg, i) => {
+     const timeStr = formatTimestamp(seg.startSec);
+     return `[Segment ${i + 1} @ ${timeStr}]
+ ${seg.text}`;
+   }).join("\n\n");
+ }
+ function formatTimestamp(seconds) {
+   const hours = Math.floor(seconds / 3600);
+   const minutes = Math.floor(seconds % 3600 / 60);
+   const secs = Math.floor(seconds % 60);
+   if (hours > 0) {
+     return `${hours}:${minutes.toString().padStart(2, "0")}:${secs.toString().padStart(2, "0")}`;
+   }
+   return `${minutes}:${secs.toString().padStart(2, "0")}`;
+ }
+ export {
+   DEFAULT_SPLIT_CONFIG,
+   analyzeSplitPoints,
+   autoSplitAudio,
+   detectSilenceRegions,
+   findOptimalSplitPoints,
+   formatMergedText,
+   getAudioDuration,
+   mergeTranscriptionResults,
+   splitAudioAtPoints
+ };
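
Taken together, the two modules above form a split/transcribe/merge pipeline: `autoSplitAudio` cuts long recordings at detected silences, each segment is transcribed on its own, and `mergeTranscriptionResults` re-bases word timestamps onto the original timeline. A minimal sketch of that wiring, assuming a caller-supplied `transcribeSegment` callback (hypothetical; any per-segment provider call, such as the package's `transcribe` export, could fill that role):

```ts
import {
  autoSplitAudio,
  mergeTranscriptionResults,
  formatMergedText,
  type TranscriptionResult,
} from "@wovin/tranz";

// Hypothetical per-segment transcriber; stands in for any provider call.
type SegmentTranscriber = (wavPath: string) => Promise<TranscriptionResult>;

async function transcribeLongAudio(audioPath: string, transcribeSegment: SegmentTranscriber) {
  // Split into <= 10-minute segments at detected silences (DEFAULT_SPLIT_CONFIG).
  const segments = await autoSplitAudio(audioPath, "./segments");
  // Transcribe every segment; order is preserved, which the merge step relies on.
  const results = await Promise.all(segments.map((s) => transcribeSegment(s.outputPath)));
  // Join segment texts and shift each word's start/end by its segment's startSec.
  const merged = mergeTranscriptionResults(results, segments);
  // Optionally prefix each segment with a "[Segment N @ M:SS]" marker.
  return formatMergedText(merged, true);
}
```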
@@ -0,0 +1,9 @@
+ /**
+  * @wovin/tranz - Audio transcription library
+  */
+ export { createProvider, MistralProvider, WhisperProvider, GreenPTProvider, VOXTRAL_LIMITS, type ProviderName, type TranscribeParams, type TranscriptionResult, type TranscriptionProvider, } from './utils/transcription/providers.js';
+ export { autoSplitAudio, analyzeSplitPoints, detectSilenceRegions, getAudioDuration, findOptimalSplitPoints, splitAudioAtPoints, DEFAULT_SPLIT_CONFIG, type SplitConfig, type SilenceRegion, type SplitPoint, type AudioSegment, type SplitAnalysis, } from './utils/audio/split.js';
+ export { mergeTranscriptionResults, formatMergedText, type MergedTranscriptionResult, type WordData, } from './utils/audio/merge-results.js';
+ export { formatTranscriptWithPauses } from './utils/transcription/format.js';
+ export { createMistralTranscriber, transcribe, type TranscribeOptions, type MistralTranscriberConfig, } from './utils/transcription/transcribe.js';
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAGH,OAAO,EACL,cAAc,EACd,eAAe,EACf,eAAe,EACf,eAAe,EACf,cAAc,EACd,KAAK,YAAY,EACjB,KAAK,gBAAgB,EACrB,KAAK,mBAAmB,EACxB,KAAK,qBAAqB,GAC3B,MAAM,oCAAoC,CAAA;AAG3C,OAAO,EACL,cAAc,EACd,kBAAkB,EAClB,oBAAoB,EACpB,gBAAgB,EAChB,sBAAsB,EACtB,kBAAkB,EAClB,oBAAoB,EACpB,KAAK,WAAW,EAChB,KAAK,aAAa,EAClB,KAAK,UAAU,EACf,KAAK,YAAY,EACjB,KAAK,aAAa,GACnB,MAAM,wBAAwB,CAAA;AAG/B,OAAO,EACL,yBAAyB,EACzB,gBAAgB,EAChB,KAAK,yBAAyB,EAC9B,KAAK,QAAQ,GACd,MAAM,gCAAgC,CAAA;AAGvC,OAAO,EAAE,0BAA0B,EAAE,MAAM,iCAAiC,CAAA;AAG5E,OAAO,EACL,wBAAwB,EACxB,UAAU,EACV,KAAK,iBAAiB,EACtB,KAAK,wBAAwB,GAC9B,MAAM,qCAAqC,CAAA"}