@wovin/tranz 0.1.36 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -5
- package/dist/{audio.min.js → audio.js} +32 -18
- package/dist/index.d.ts +3 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/{index.min.js → index.js} +161 -29
- package/dist/providers.d.ts +1 -1
- package/dist/providers.d.ts.map +1 -1
- package/dist/{providers.min.js → providers.js} +68 -24
- package/dist/utils/audio/merge-results.d.ts +14 -12
- package/dist/utils/audio/merge-results.d.ts.map +1 -1
- package/dist/utils/transcription/format.d.ts +27 -0
- package/dist/utils/transcription/format.d.ts.map +1 -1
- package/dist/utils/transcription/providers.d.ts +30 -1
- package/dist/utils/transcription/providers.d.ts.map +1 -1
- package/dist/utils/transcription/transcribe.d.ts +5 -0
- package/dist/utils/transcription/transcribe.d.ts.map +1 -1
- package/package.json +10 -8
- package/src/audio.ts +25 -0
- package/src/index.ts +61 -0
- package/src/providers.ts +23 -0
- package/src/realtime.ts +58 -0
- package/src/utils/audio/index.ts +6 -0
- package/src/utils/audio/merge-results.ts +198 -0
- package/src/utils/audio/split.ts +504 -0
- package/src/utils/file-utils.ts +16 -0
- package/src/utils/transcription/format.ts +208 -0
- package/src/utils/transcription/mime-detection.ts +80 -0
- package/src/utils/transcription/providers.ts +572 -0
- package/src/utils/transcription/realtime.ts +821 -0
- package/src/utils/transcription/runtime.ts +40 -0
- package/src/utils/transcription/transcribe.ts +366 -0
- /package/dist/{realtime.min.js → realtime.js} +0 -0
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Utilities for merging transcription results from split audio segments
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import type { TranscriptionResult, TranscriptSegment } from '../transcription/providers.ts'
|
|
6
|
+
import type { AudioSegment } from './split.ts'
|
|
7
|
+
|
|
8
|
+
/**
 * One transcribed word together with its timing.
 *
 * `mergeTranscriptionResults` emits these with `start`/`end` in seconds.
 */
export interface WordData {
  /** The word text. */
  word: string
  /** Time at which the word starts. */
  start: number
  /** Time at which the word ends. */
  end: number
  /** Optional confidence value for the word. */
  confidence?: number
  /** Optional speaker label (string or numeric). */
  speaker?: string | number
}
|
|
18
|
+
|
|
19
|
+
/**
 * Describes a single audio chunk of an auto-split + merge run.
 */
export interface AudioChunk {
  /** Zero-based position of the chunk among the merged results. */
  index: number
  /** Where the chunk starts in the source audio, in seconds. */
  startSec: number
  /** Where the chunk ends in the source audio, in seconds. */
  endSec: number
  /** Transcribed text of this chunk. */
  text: string
}
|
|
28
|
+
|
|
29
|
+
/**
 * A transcription result that may additionally carry per-chunk metadata.
 */
export interface MergedTranscriptionResult extends TranscriptionResult {
  /**
   * The chunks that were transcribed separately and then merged.
   * Left out when the audio was not split.
   */
  audioChunks?: AudioChunk[]
}
|
|
36
|
+
|
|
37
|
+
/**
 * Namespace a diarization label with the index of the chunk it came from.
 *
 * Labels are scoped to a single chunk (e.g. Mistral's `speaker_1`, Deepgram's `0`), so the
 * same label in two chunks almost always refers to different people. Prefixing keeps
 * `chunk0/speaker_1` and `chunk1/speaker_1` distinct after merging.
 *
 * @param chunkIndex - Index of the chunk the label belongs to
 * @param value - Original label, or `undefined` when there is none
 * @returns `chunk<index>/<label>`, or `undefined` when no label was given
 */
function prefixChunkLabel(chunkIndex: number, value: string | number | undefined): string | number | undefined {
  return value === undefined ? undefined : `chunk${chunkIndex}/${String(value)}`
}
|
|
46
|
+
|
|
47
|
+
/**
 * Merge the transcription results of consecutive audio segments into one result.
 *
 * - Text of each segment is trimmed and joined with a blank line.
 * - Word timestamps (seconds) and transcript-segment timestamps (integer ms) are shifted
 *   by the start offset of the audio segment they came from, so they are relative to the
 *   original audio.
 * - Speaker / diarization labels are prefixed with their chunk index, because labels from
 *   different chunks are not comparable.
 * - `language` and `model` are taken from the first result.
 *
 * Failures are reported through the returned `error` field rather than thrown:
 * an empty `results` array, any per-segment `error`, or fewer `segments` than `results`.
 *
 * @param results - Transcription results, one per audio segment, in order
 * @param segments - Audio segment metadata; `segments[i]` must describe `results[i]`
 * @returns The merged result. When `results` has exactly one entry that entry is returned
 *   unchanged (no `audioChunks`, native diarization labels preserved).
 */
export function mergeTranscriptionResults(
  results: TranscriptionResult[],
  segments: AudioSegment[]
): MergedTranscriptionResult {
  if (results.length === 0) {
    return {
      text: '',
      error: 'No results to merge',
    }
  }

  if (results.length === 1) {
    // Single result, no merging needed — preserve native diarization types verbatim.
    return results[0]
  }

  // Any failed segment invalidates the merge; report all of them at once.
  const errors = results.flatMap((r, i) => (r.error ? [`Segment ${i}: ${r.error}`] : []))
  if (errors.length > 0) {
    return {
      text: '',
      error: `Errors in segments: ${errors.join('; ')}`,
    }
  }

  // Every result needs its segment metadata for the time offset. Without this guard a
  // short `segments` array would surface as a TypeError instead of an error result.
  if (segments.length < results.length) {
    return {
      text: '',
      error: `Cannot merge ${results.length} results with only ${segments.length} audio segments`,
    }
  }

  const mergedWords: WordData[] = []
  const mergedSegments: TranscriptSegment[] = []
  const audioChunks: AudioChunk[] = []

  results.forEach((result, i) => {
    const { startSec, endSec } = segments[i]

    // Words use seconds (legacy WordData shape). Fall back to the provider's raw payload.
    const words = result.words || result.rawResponse?.words || []
    for (const word of words) {
      mergedWords.push({
        word: word.word || word.text,
        start: (word.start || 0) + startSec,
        end: (word.end || 0) + startSec,
        confidence: word.confidence,
        speaker: prefixChunkLabel(i, word.speaker),
      })
    }

    // Transcript segments use integer ms (TranscriptSegment shape).
    const chunkOffsetMs = Math.round(startSec * 1000)
    for (const seg of result.segments || []) {
      mergedSegments.push({
        startMs: seg.startMs + chunkOffsetMs,
        endMs: seg.endMs + chunkOffsetMs,
        text: seg.text,
        ...(seg.diarization !== undefined
          ? { diarization: prefixChunkLabel(i, seg.diarization) as string | number }
          : {}),
      })
    }

    audioChunks.push({
      index: i,
      startSec,
      endSec,
      text: result.text.trim(),
    })
  })

  // Segment markers in the text: a blank line between chunks.
  const mergedText = results.map((r) => r.text.trim()).join('\n\n')

  const totalDuration = segments.reduce((sum, seg) => sum + seg.durationSec, 0)

  // Keep every provider payload reachable, tagged with where its chunk started.
  const mergedRawResponse = {
    merged: true,
    chunkCount: results.length,
    chunks: results.map((r, i) => ({
      index: i,
      startSec: segments[i].startSec,
      rawResponse: r.rawResponse,
    })),
  }

  const firstResult = results[0]

  return {
    text: mergedText,
    duration: totalDuration,
    language: firstResult.language,
    model: firstResult.model,
    rawResponse: mergedRawResponse,
    audioChunks,
    ...(mergedWords.length > 0 ? { words: mergedWords } : {}),
    ...(mergedSegments.length > 0 ? { segments: mergedSegments } : {}),
  }
}
|
|
158
|
+
|
|
159
|
+
/**
 * Render the text of a merged result, optionally labelling each chunk.
 *
 * The plain `result.text` is returned unless markers were requested AND the result carries
 * more than one audio chunk. With markers, every chunk is rendered as
 * `[Chunk N @ time]` followed by the chunk text, and chunks are separated by a blank line.
 *
 * @param result - Merged transcription result
 * @param includeMarkers - Whether to include [Chunk N] markers
 * @returns Formatted text
 */
export function formatMergedText(
  result: MergedTranscriptionResult,
  includeMarkers: boolean = false
): string {
  const chunks = result.audioChunks
  if (!chunks || chunks.length <= 1 || !includeMarkers) {
    return result.text
  }

  const parts: string[] = []
  chunks.forEach((chunk, position) => {
    // Markers are numbered from 1 by position in the list.
    parts.push(`[Chunk ${position + 1} @ ${formatTimestamp(chunk.startSec)}]\n${chunk.text}`)
  })
  return parts.join('\n\n')
}
|
|
185
|
+
|
|
186
|
+
/**
 * Render a number of seconds as `H:MM:SS` when at least one hour long, otherwise `M:SS`.
 * Fractions of a second are dropped; the leading field is not zero-padded.
 */
function formatTimestamp(seconds: number): string {
  const twoDigits = (value: number): string => value.toString().padStart(2, '0')

  const hh = Math.floor(seconds / 3600)
  const mm = Math.floor((seconds % 3600) / 60)
  const ss = Math.floor(seconds % 60)

  return hh > 0 ? `${hh}:${twoDigits(mm)}:${twoDigits(ss)}` : `${mm}:${twoDigits(ss)}`
}
|
|
@@ -0,0 +1,504 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Audio splitting utilities for tranz-cli
|
|
3
|
+
* Provides silence detection and optimal split point calculation
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { execa } from 'execa'
|
|
7
|
+
import * as fs from 'node:fs'
|
|
8
|
+
import path from 'node:path'
|
|
9
|
+
import { spawn } from 'node:child_process'
|
|
10
|
+
|
|
11
|
+
/**
 * Tunable parameters for splitting audio. Defaults live in `DEFAULT_SPLIT_CONFIG`.
 */
export interface SplitConfig {
  /** Longest allowed segment, in seconds (default: 600 = 10min) */
  maxDurationSec: number
  /** Shortest silence, in seconds, that counts as a split candidate (default: 1.0s) */
  minSilenceDurSec: number
  /** Noise threshold handed to FFmpeg's silence detection (default: '-35dB') */
  silenceThreshold: string
  /** Favour longer silences when choosing a split (default: true) */
  preferLongerSilence: boolean
  /**
   * Buffer to leave at silence edges (default: 0.2s).
   * NOTE(review): no function in this module currently applies this value.
   */
  silenceBuffer: number
}
|
|
26
|
+
|
|
27
|
+
/**
 * A stretch of silence found in the audio.
 */
export interface SilenceRegion {
  /** Where the silence begins, in seconds. */
  startSec: number
  /** Where the silence ends, in seconds. */
  endSec: number
  /** Length of the silence, in seconds. */
  durationSec: number
}
|
|
35
|
+
|
|
36
|
+
/**
 * A position at which the audio is to be cut.
 */
export interface SplitPoint {
  /** Cut position in seconds (the middle of the chosen silence, when one was found) */
  timeSec: number
  /** Length of the silence at the cut; 0 when the cut does not fall in a silence */
  silenceDuration: number
}
|
|
45
|
+
|
|
46
|
+
/**
 * One piece of the source audio produced by splitting.
 */
export interface AudioSegment {
  /** Zero-based position of the segment. */
  index: number
  /** Start of the segment within the source audio, in seconds. */
  startSec: number
  /** End of the segment within the source audio, in seconds. */
  endSec: number
  /** Length of the segment, in seconds. */
  durationSec: number
  /** Path of the audio file holding this segment. */
  outputPath: string
}
|
|
56
|
+
|
|
57
|
+
/**
 * Values used for any `SplitConfig` field the caller does not supply.
 */
export const DEFAULT_SPLIT_CONFIG: SplitConfig = {
  // ten minutes per segment at most
  maxDurationSec: 10 * 60,
  minSilenceDurSec: 1,
  silenceThreshold: '-35dB',
  preferLongerSilence: true,
  silenceBuffer: 0.2,
}
|
|
67
|
+
|
|
68
|
+
/**
 * Run `ffprobe` on a file and return its parsed JSON report.
 *
 * Both `-show_format` and `-show_streams` are requested so a duration can be read from
 * either section.
 *
 * @param audioPath - File to inspect
 * @returns The parsed ffprobe output (not validated beyond JSON parsing)
 * @throws Error prefixed with "Failed to probe audio" if ffprobe fails or prints invalid JSON
 */
async function execFFprobe(audioPath: string): Promise<{
  format?: { duration?: string | number }
  streams?: Array<{ duration?: string | number }>
}> {
  const probeArgs = [
    '-v', 'error',
    '-print_format', 'json',
    '-show_format',
    '-show_streams',
    audioPath,
  ]

  try {
    const probe = await execa('ffprobe', probeArgs)
    return JSON.parse(probe.stdout)
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err)
    throw new Error(`Failed to probe audio: ${reason}`)
  }
}
|
|
89
|
+
|
|
90
|
+
/**
 * Cut one time range out of an audio file with `ffmpeg`.
 *
 * The output is written as 16 kHz, mono, 16-bit PCM and replaces any existing file at
 * `outputPath`.
 *
 * @param inputPath - Source audio file
 * @param outputPath - Destination file
 * @param startSec - Offset into the source at which to start, in seconds
 * @param durationSec - Length to extract, in seconds
 * @throws Error prefixed with "Failed to extract segment" if ffmpeg fails
 */
async function extractAudioSegment(
  inputPath: string,
  outputPath: string,
  startSec: number,
  durationSec: number
): Promise<void> {
  const ffmpegArgs = [
    // seek and length are given ahead of the input
    '-ss', startSec.toString(),
    '-t', durationSec.toString(),
    '-i', inputPath,
    '-ar', '16000', // 16kHz sample rate (Whisper-compatible)
    '-ac', '1', // mono
    '-c:a', 'pcm_s16le', // 16-bit PCM codec
    '-y', // overwrite output
    outputPath,
  ]

  try {
    await execa('ffmpeg', ffmpegArgs)
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err)
    throw new Error(`Failed to extract segment: ${reason}`)
  }
}
|
|
114
|
+
|
|
115
|
+
/**
 * Determine a file's duration by running it through `ffmpeg` with a null output.
 *
 * Slower than ffprobe; used as the fallback when ffprobe reports no duration. The
 * duration is read from ffmpeg's stderr: the `Duration: HH:MM:SS.ss` header if present,
 * otherwise the last `time=HH:MM:SS.ss` progress value.
 *
 * @param audioPath - File to measure
 * @returns Duration in seconds, or `undefined` when nothing could be parsed or ffmpeg
 *   could not be run
 */
async function getDurationViaFfmpeg(audioPath: string): Promise<number | undefined> {
  // Convert the three capture groups (hours, minutes, seconds) into seconds.
  const hmsToSeconds = (m: RegExpMatchArray): number =>
    parseFloat(m[1]) * 3600 + parseFloat(m[2]) * 60 + parseFloat(m[3])

  try {
    // `reject: false` — a non-zero exit still leaves stderr available for parsing.
    const run = await execa('ffmpeg', ['-i', audioPath, '-f', 'null', '-'], { reject: false })
    const log = run.stderr

    const header = log.match(/Duration:\s*(\d+):(\d+):(\d+(?:\.\d+)?)/)
    if (header) {
      return hmsToSeconds(header)
    }

    // No usable header: take the final progress timestamp instead.
    let lastProgress: RegExpMatchArray | undefined
    for (const progress of log.matchAll(/time=(\d+):(\d+):(\d+(?:\.\d+)?)/g)) {
      lastProgress = progress
    }
    if (lastProgress) {
      return hmsToSeconds(lastProgress)
    }
  } catch {
    // Best effort only: any failure means "unknown".
  }
  return undefined
}
|
|
151
|
+
|
|
152
|
+
/**
 * Return the duration of an audio file in seconds.
 *
 * Sources are tried in order and the first positive number wins:
 * 1. `format.duration` from ffprobe,
 * 2. the `duration` of the first stream that reports one,
 * 3. a full ffmpeg decode.
 *
 * @param audioPath - File to measure
 * @returns Duration in seconds (always > 0)
 * @throws Error if ffprobe fails, or if no source yields a positive duration
 */
export async function getAudioDuration(audioPath: string): Promise<number> {
  // ffprobe may report durations as strings or numbers; accept only positive values.
  const toPositive = (raw: string | number | undefined): number | undefined => {
    if (!raw) return undefined
    const value = parseFloat(String(raw))
    return !isNaN(value) && value > 0 ? value : undefined
  }

  const metadata = await execFFprobe(audioPath)

  // Container-level duration first.
  const fromFormat = toPositive(metadata.format?.duration)
  if (fromFormat !== undefined) {
    return fromFormat
  }

  // Then per-stream durations (works for raw audio formats).
  for (const stream of metadata.streams ?? []) {
    const fromStream = toPositive(stream.duration)
    if (fromStream !== undefined) {
      return fromStream
    }
  }

  // Last resort: decode with ffmpeg.
  const decoded = await getDurationViaFfmpeg(audioPath)
  if (decoded !== undefined && decoded > 0) {
    return decoded
  }

  // Say which ffprobe sections were present to help diagnose the failure.
  const hasFormat = !!metadata.format
  const hasStreams = !!metadata.streams?.length
  throw new Error(
    `Could not determine audio duration (format: ${hasFormat}, streams: ${hasStreams}). ` +
    `File may be corrupted or in an unsupported format.`
  )
}
|
|
194
|
+
|
|
195
|
+
/**
 * Detect silence regions in an audio file using FFmpeg's `silencedetect` filter.
 *
 * FFmpeg is started with `spawn`; its audio output (8 kHz mono WAV on stdout) is discarded
 * and the `silence_end: <t> | silence_duration: <d>` lines on stderr are collected.
 * stderr arrives in arbitrary chunks, so incomplete trailing lines are buffered until the
 * rest of the line arrives — otherwise a report split across two chunks would be lost.
 *
 * @param audioPath - File to analyse
 * @param config - Overrides for `minSilenceDurSec` / `silenceThreshold`; other fields are
 *   ignored here and missing ones fall back to `DEFAULT_SPLIT_CONFIG`
 * @returns The silences found, in the order FFmpeg reported them
 * @throws Rejects if FFmpeg cannot be started, or if it exits unsuccessfully without
 *   having reported any silence
 */
export async function detectSilenceRegions(
  audioPath: string,
  config: Partial<SplitConfig> = {}
): Promise<SilenceRegion[]> {
  const { minSilenceDurSec, silenceThreshold } = { ...DEFAULT_SPLIT_CONFIG, ...config }

  return new Promise((resolve, reject) => {
    const silenceRegions: SilenceRegion[] = []

    // Record one stderr line if it is a silencedetect "end of silence" report.
    const recordLine = (line: string): void => {
      const match = line.match(/silence_end:\s*([\d.]+)\s*\|\s*silence_duration:\s*([\d.]+)/)
      if (!match) return
      const endSec = parseFloat(match[1])
      const durationSec = parseFloat(match[2])
      if (isNaN(endSec) || isNaN(durationSec)) return
      silenceRegions.push({
        startSec: endSec - durationSec,
        endSec,
        durationSec,
      })
    }

    // Use spawn directly for better ffmpeg compatibility
    const args = [
      '-i', audioPath,
      '-af', `silencedetect=n=${silenceThreshold}:d=${minSilenceDurSec}`,
      '-f', 'wav',
      '-ac', '1',
      '-ar', '8000',
      'pipe:1'
    ]

    const proc = spawn('ffmpeg', args)

    // Discard stdout (audio data); it must still be consumed so ffmpeg is not blocked.
    proc.stdout.on('data', () => {})

    // Decode as UTF-8 at the stream level so multi-byte characters split across chunks
    // are reassembled, then carry any unfinished line over to the next chunk.
    let pending = ''
    proc.stderr.setEncoding('utf8')
    proc.stderr.on('data', (chunk: string) => {
      pending += chunk
      // ffmpeg ends log lines with \n and progress updates with \r.
      const lines = pending.split(/\r\n|\r|\n/)
      pending = lines.pop() ?? ''
      lines.forEach(recordLine)
    })

    proc.on('close', (code: number | null) => {
      // Whatever is left in the buffer is the final, unterminated line.
      if (pending) {
        recordLine(pending)
        pending = ''
      }
      // A failing exit is tolerated as long as some silence was reported.
      if (code === 0 || silenceRegions.length > 0) {
        resolve(silenceRegions)
      } else {
        reject(new Error(`FFmpeg exited with code ${code}`))
      }
    })

    proc.on('error', (err: Error) => {
      reject(new Error(`Silence detection failed: ${err.message}`))
    })
  })
}
|
|
257
|
+
|
|
258
|
+
/**
 * Choose where to cut a long recording so the pieces are roughly equal in length,
 * preferring cuts that land inside detected silences.
 *
 * The audio is divided into `ceil(totalDuration / maxDurationSec)` equal "ideal" segments.
 * For every ideal boundary, silences whose midpoint lies within ±30% of the ideal segment
 * length are scored (proximity to the boundary, multiplied by silence length when
 * `preferLongerSilence` is set) and the cut is placed at the midpoint of the best one.
 * When no silence falls in that window the cut is placed at the ideal boundary itself,
 * which may be mid-speech; `silenceDuration` is then 0.
 *
 * Note: because a cut may sit up to 30% of a segment length away from its ideal position,
 * an individual resulting segment can be longer than `maxDurationSec`.
 *
 * @param silenceRegions - Detected silence regions
 * @param totalDuration - Total audio duration in seconds
 * @param config - Split configuration; missing fields fall back to `DEFAULT_SPLIT_CONFIG`
 * @returns Split points sorted by ascending time; empty when
 *   `totalDuration <= maxDurationSec`
 * @throws RangeError if a split is required but `maxDurationSec` is not a positive number
 *   or `totalDuration` is not finite (the segment count would otherwise be unbounded)
 */
export function findOptimalSplitPoints(
  silenceRegions: SilenceRegion[],
  totalDuration: number,
  config: Partial<SplitConfig> = {}
): SplitPoint[] {
  const { maxDurationSec, preferLongerSilence } = {
    ...DEFAULT_SPLIT_CONFIG,
    ...config,
  }

  // No splits needed if audio is short enough
  if (totalDuration <= maxDurationSec) {
    return []
  }

  // A zero limit or infinite duration makes the segment count infinite and the loop
  // below would never finish; fail loudly instead.
  if (!(maxDurationSec > 0) || !Number.isFinite(totalDuration)) {
    throw new RangeError(
      `Cannot compute split points: maxDurationSec (${maxDurationSec}) must be > 0 ` +
      `and totalDuration (${totalDuration}) must be finite`
    )
  }

  // Calculate how many segments we need
  const numSegments = Math.ceil(totalDuration / maxDurationSec)
  const idealSegmentDuration = totalDuration / numSegments

  // Search window: ±30% of segment duration around each ideal point
  const windowSize = idealSegmentDuration * 0.3

  const midpointOf = (silence: SilenceRegion): number => (silence.startSec + silence.endSec) / 2

  const splitPoints: SplitPoint[] = []

  // Find split points for each required split (numSegments - 1 splits)
  for (let i = 1; i < numSegments; i++) {
    const idealSplitTime = idealSegmentDuration * i
    const windowStart = idealSplitTime - windowSize
    const windowEnd = idealSplitTime + windowSize

    // Find all silences within the window
    const candidateSilences = silenceRegions.filter((silence) => {
      const silenceMid = midpointOf(silence)
      return silenceMid >= windowStart && silenceMid <= windowEnd
    })

    if (candidateSilences.length === 0) {
      // No silence found in window - split at ideal point
      // This is a fallback; may split mid-speech
      splitPoints.push({
        timeSec: idealSplitTime,
        silenceDuration: 0,
      })
      continue
    }

    // Score each silence and pick the best
    let bestScore = -Infinity
    let bestSilence = candidateSilences[0]

    for (const silence of candidateSilences) {
      // 1 at the ideal point, falling linearly to 0 at the window edge
      const proximityScore = 1 - Math.abs(midpointOf(silence) - idealSplitTime) / windowSize

      // Score formula: prefer longer silences and closer to ideal point
      const score = preferLongerSilence
        ? silence.durationSec * proximityScore
        : proximityScore

      if (score > bestScore) {
        bestScore = score
        bestSilence = silence
      }
    }

    // Split at the middle of the best silence
    splitPoints.push({
      timeSec: midpointOf(bestSilence),
      silenceDuration: bestSilence.durationSec,
    })
  }

  return splitPoints.sort((a, b) => a.timeSec - b.timeSec)
}
|
|
344
|
+
|
|
345
|
+
/**
 * Cut an audio file into consecutive WAV segments at the given split points.
 *
 * Segment boundaries are `0`, each split point in the order given, and `totalDuration`.
 * Files are named `<baseName>-segment-NNN.wav` (zero-padded index) inside `outputDir`,
 * which is created if missing. All extractions are started together and awaited as a
 * group.
 *
 * @param audioPath - Path to source audio file
 * @param splitPoints - Where to split the audio
 * @param totalDuration - Total duration of source audio
 * @param outputDir - Directory to write segments
 * @param baseName - Base name for output files
 * @returns Metadata for every segment, in order
 * @throws Error naming the failing segment if any extraction fails
 */
export async function splitAudioAtPoints(
  audioPath: string,
  splitPoints: SplitPoint[],
  totalDuration: number,
  outputDir: string,
  baseName: string
): Promise<AudioSegment[]> {
  // Ensure output directory exists
  fs.mkdirSync(outputDir, { recursive: true })

  // Consecutive pairs of cut times delimit the segments.
  const cutTimes = [0, ...splitPoints.map((sp) => sp.timeSec), totalDuration]

  const segments: AudioSegment[] = cutTimes.slice(0, -1).map((startSec, index) => {
    const endSec = cutTimes[index + 1]
    return {
      index,
      startSec,
      endSec,
      durationSec: endSec - startSec,
      outputPath: path.join(outputDir, `${baseName}-segment-${String(index).padStart(3, '0')}.wav`),
    }
  })

  // Kick off every extraction at once and wait for the whole batch.
  await Promise.all(
    segments.map(async (segment) => {
      try {
        await extractAudioSegment(audioPath, segment.outputPath, segment.startSec, segment.durationSec)
      } catch (err) {
        throw new Error(`Failed to extract segment ${segment.index}: ${err instanceof Error ? err.message : String(err)}`)
      }
    })
  )

  return segments
}
|
|
405
|
+
|
|
406
|
+
/**
 * Split an audio file into segments when it is longer than the configured maximum.
 *
 * A file that is short enough is not touched: a single segment describing the whole file
 * is returned, with `outputPath` pointing at the original. Otherwise silences are
 * detected, split points are chosen, and the file is cut into `outputDir`.
 *
 * @param audioPath - Path to source audio file
 * @param outputDir - Directory for split segments
 * @param config - Split configuration; missing fields fall back to `DEFAULT_SPLIT_CONFIG`
 * @returns The audio segments (a single element if no split was needed)
 */
export async function autoSplitAudio(
  audioPath: string,
  outputDir: string,
  config: Partial<SplitConfig> = {}
): Promise<AudioSegment[]> {
  const settings = { ...DEFAULT_SPLIT_CONFIG, ...config }
  const totalDuration = await getAudioDuration(audioPath)

  if (totalDuration <= settings.maxDurationSec) {
    // Short enough: describe the original file as the one and only segment.
    const whole: AudioSegment = {
      index: 0,
      startSec: 0,
      endSec: totalDuration,
      durationSec: totalDuration,
      outputPath: audioPath,
    }
    return [whole]
  }

  const silences = await detectSilenceRegions(audioPath, settings)
  const cuts = findOptimalSplitPoints(silences, totalDuration, settings)

  // Segment files are named after the source file, minus its extension.
  const stem = path.basename(audioPath, path.extname(audioPath))

  return splitAudioAtPoints(audioPath, cuts, totalDuration, outputDir, stem)
}
|
|
459
|
+
|
|
460
|
+
/**
 * Result of a split dry run, intended for logging and debugging.
 */
export interface SplitAnalysis {
  /** Duration of the analysed audio, in seconds. */
  totalDuration: number
  /** Number of segments a split would produce (1 when no split is needed). */
  numSegments: number
  /** Chosen split points; empty when no split is needed. */
  splitPoints: SplitPoint[]
  /** Silences that were detected; empty when no split is needed. */
  silenceRegions: SilenceRegion[]
  /** Whether the audio is longer than the configured maximum. */
  needsSplit: boolean
}
|
|
470
|
+
|
|
471
|
+
/**
 * Work out how an audio file would be split, without writing any segment files.
 * Useful for preview/dry-run functionality.
 *
 * Silence detection only runs when the file is longer than `maxDurationSec`; otherwise the
 * analysis reports a single segment with no split points and no silence regions.
 *
 * @param audioPath - File to analyse
 * @param config - Split configuration; missing fields fall back to `DEFAULT_SPLIT_CONFIG`
 * @returns Duration, whether a split is needed, and the silences / split points found
 */
export async function analyzeSplitPoints(
  audioPath: string,
  config: Partial<SplitConfig> = {}
): Promise<SplitAnalysis> {
  const settings = { ...DEFAULT_SPLIT_CONFIG, ...config }
  const totalDuration = await getAudioDuration(audioPath)

  // Start from the "no split" answer and fill in the rest only if required.
  const analysis: SplitAnalysis = {
    totalDuration,
    numSegments: 1,
    splitPoints: [],
    silenceRegions: [],
    needsSplit: totalDuration > settings.maxDurationSec,
  }
  if (!analysis.needsSplit) {
    return analysis
  }

  analysis.silenceRegions = await detectSilenceRegions(audioPath, settings)
  analysis.splitPoints = findOptimalSplitPoints(analysis.silenceRegions, totalDuration, settings)
  analysis.numSegments = analysis.splitPoints.length + 1
  return analysis
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import path from 'node:path'
|
|
2
|
+
|
|
3
|
+
/** Extension of `filePath` including its leading dot (e.g. `.mp3`); `''` if there is none. */
export const getExt = (filePath: string): string => {
  return getFileInfo(filePath).ext
}

/** File name of `filePath` without directory and without extension. */
export const getName = (filePath: string): string => {
  return getFileInfo(filePath).name
}

/** File name of `filePath` with its extension but without directory (e.g. `talk.mp3`). */
export const getNameWithExt = (filePath: string): string => {
  const { name, ext } = getFileInfo(filePath)
  // `ext` already starts with a dot (or is empty), so no separator is added here.
  return `${name}${ext}`
}

/** Normalize `filePath` and split it into its components (`dir`, `base`, `name`, `ext`, ...). */
export const getFileInfo = (filePath: string) => {
  const normed = path.normalize(filePath)
  return path.parse(normed)
}
|