eprec 1.10.2 → 1.11.0
package/package.json
CHANGED
@@ -7,14 +7,18 @@ import {
   PromptCancelled,
   createInquirerPrompter,
   createPathPicker,
+  createStepProgressReporter,
   isInteractive,
+  pauseActiveSpinner,
   resolveOptionalString,
+  resumeActiveSpinner,
   type PathPicker,
   type Prompter,
   withSpinner,
 } from '../../cli-ux'
 import { editVideo, buildEditedOutputPath } from './video-editor'
 import { combineVideos } from './combined-video-editor'
+import { setLogHooks } from '../logging'

 export type EditVideoCommandArgs = {
   input: string
@@ -176,21 +180,33 @@ function resolvePaddingMs(value: unknown) {
 export function createEditVideoHandler(options: CliUxOptions): CommandHandler {
   return async (argv) => {
     const args = await resolveEditVideoArgs(argv, options)
+    const progress = options.interactive
+      ? createStepProgressReporter({ action: 'Editing video' })
+      : undefined
     await withSpinner(
       'Editing video',
       async () => {
-        const result = await editVideo({
-          inputPath: String(args.input),
-          transcriptJsonPath: String(args.transcript),
-          editedTextPath: String(args.edited),
-          outputPath: String(args.output),
-          paddingMs: args['padding-ms'],
+        setLogHooks({
+          beforeLog: pauseActiveSpinner,
+          afterLog: resumeActiveSpinner,
         })
-        if (!result.success) {
-          throw new Error(result.error ?? 'Edit failed.')
+        try {
+          const result = await editVideo({
+            inputPath: String(args.input),
+            transcriptJsonPath: String(args.transcript),
+            editedTextPath: String(args.edited),
+            outputPath: String(args.output),
+            paddingMs: args['padding-ms'],
+            progress,
+          })
+          if (!result.success) {
+            throw new Error(result.error ?? 'Edit failed.')
+          }
+        } finally {
+          setLogHooks({})
         }
       },
-      { successText: 'Edit complete' },
+      { successText: 'Edit complete', enabled: options.interactive },
     )
     console.log(`Edited video written to ${args.output}`)
   }
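The pattern established in this hunk repeats throughout the release: while a spinner owns the terminal line, every log write must pause it first and resume it afterwards, or the output interleaves with the spinner's redraws. The handler installs pauseActiveSpinner/resumeActiveSpinner as global log hooks for the duration of the operation and clears them in a finally block; the spinner itself is also gated on options.interactive via the new enabled flag. A minimal sketch of the logging seam this implies follows; the hook shape comes from the calls above, while the log wrapper itself is an assumption about what '../logging' exports:

// Hypothetical sketch of '../logging', inferred from the setLogHooks calls above.
type LogHooks = {
  beforeLog?: () => void
  afterLog?: () => void
}

let hooks: LogHooks = {}

// setLogHooks({ beforeLog, afterLog }) installs the hooks;
// setLogHooks({}) clears them again, as the finally blocks above do.
export function setLogHooks(next: LogHooks): void {
  hooks = next
}

// Each log line is bracketed by the hooks, so a live spinner is
// paused before the write and resumed after it.
export function log(message: string): void {
  hooks.beforeLog?.()
  console.log(message)
  hooks.afterLog?.()
}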
@@ -201,26 +217,47 @@ export function createCombineVideosHandler(
 ): CommandHandler {
   return async (argv) => {
     const args = await resolveCombineVideosArgs(argv, options)
+    const progress = options.interactive
+      ? createStepProgressReporter({ action: 'Combining videos' })
+      : undefined
+    const editProgressFactory = options.interactive
+      ? (detail: string) =>
+          createStepProgressReporter({
+            action: 'Combining videos',
+            detail,
+            maxLabelLength: 28,
+          })
+      : undefined
     let outputPath = ''
     await withSpinner(
       'Combining videos',
       async () => {
-        const result = await combineVideos({
-          video1Path: String(args.video1),
-          video1TranscriptJsonPath: args.transcript1,
-          video1EditedTextPath: args.edited1,
-          video2Path: String(args.video2),
-          video2TranscriptJsonPath: args.transcript2,
-          video2EditedTextPath: args.edited2,
-          outputPath: String(args.output),
-          overlapPaddingMs: args['padding-ms'],
+        setLogHooks({
+          beforeLog: pauseActiveSpinner,
+          afterLog: resumeActiveSpinner,
         })
-        if (!result.success) {
-          throw new Error(result.error ?? 'Combine failed.')
+        try {
+          const result = await combineVideos({
+            video1Path: String(args.video1),
+            video1TranscriptJsonPath: args.transcript1,
+            video1EditedTextPath: args.edited1,
+            video2Path: String(args.video2),
+            video2TranscriptJsonPath: args.transcript2,
+            video2EditedTextPath: args.edited2,
+            outputPath: String(args.output),
+            overlapPaddingMs: args['padding-ms'],
+            progress,
+            editProgressFactory,
+          })
+          if (!result.success) {
+            throw new Error(result.error ?? 'Combine failed.')
+          }
+          outputPath = result.outputPath
+        } finally {
+          setLogHooks({})
         }
-        outputPath = result.outputPath
       },
-      { successText: 'Combine complete' },
+      { successText: 'Combine complete', enabled: options.interactive },
     )
     console.log(`Combined video written to ${outputPath}`)
   }
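The combine handler adds one wrinkle on top of the same pattern: besides the top-level reporter it builds an editProgressFactory, so each optional editVideo pass inside combineVideos can get its own short-lived reporter labelled with the video it is processing (see the applyOptionalEdits hunks below); maxLabelLength: 28 is presumably there to keep those longer detail labels from overflowing the spinner line.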
@@ -14,6 +14,7 @@ import {
   findSpeechStartWithRmsFallback,
 } from '../utils/audio-analysis'
 import { allocateJoinPadding } from '../utils/video-editing'
+import type { StepProgressReporter } from '../../progress-reporter'

 export interface CombineVideosOptions {
   video1Path: string
@@ -26,6 +27,8 @@ export interface CombineVideosOptions {
   video2Duration?: number
   outputPath: string
   overlapPaddingMs?: number
+  progress?: StepProgressReporter
+  editProgressFactory?: (detail: string) => StepProgressReporter | undefined
 }

 export interface CombineVideosResult {
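Everything this diff does with a StepProgressReporter goes through five call sites: createStepProgressReporter({ action, detail?, maxLabelLength? }), then start({ stepCount, label }), step(label), setLabel(label), and finish(label). A sketch of the interface those call sites imply, with a toy console-backed factory for illustration; the signatures are inferred from usage in this diff, not taken from the actual progress-reporter module:

// Inferred from the call sites in this diff; not the real module.
export interface StepProgressReporter {
  // Begin reporting; renders the first step immediately (the diff's
  // stepCount arithmetic only balances if start() counts as step 1).
  start(options: { stepCount: number; label: string }): void
  // Advance to the next step with a new label.
  step(label: string): void
  // Relabel the current step without advancing the count.
  setLabel(label: string): void
  // Close out the reporter with a final status label.
  finish(label: string): void
}

export function createStepProgressReporter(options: {
  action: string // e.g. 'Combining videos'
  detail?: string // e.g. 'Edit first video'
  maxLabelLength?: number // assumed: a label truncation width
}): StepProgressReporter {
  let current = 0
  let total = 0
  const prefix = options.detail
    ? `${options.action} (${options.detail})`
    : options.action
  const clip = (label: string) =>
    options.maxLabelLength ? label.slice(0, options.maxLabelLength) : label
  const render = (label: string) =>
    console.log(`${prefix}: step ${current}/${total}: ${clip(label)}`)
  return {
    start({ stepCount, label }) {
      total = stepCount
      current = 1
      render(label)
    },
    step(label) {
      current += 1
      render(label)
    },
    setLabel(label) {
      render(label)
    },
    finish(label) {
      current = total
      render(label)
    },
  }
}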
@@ -39,12 +42,17 @@ export interface CombineVideosResult {
 export async function combineVideos(
   options: CombineVideosOptions,
 ): Promise<CombineVideosResult> {
+  const progress = options.progress
+  const totalSteps = 5
+  progress?.start({ stepCount: totalSteps, label: 'Preparing edits' })
+
   const tempDir = await mkdtemp(path.join(os.tmpdir(), 'video-combine-'))
   try {
     const { video1Path, video2Path } = await applyOptionalEdits(
       options,
       tempDir,
     )
+    progress?.step('Measuring durations')
     const editsApplied =
       options.video1EditedTextPath || options.video2EditedTextPath
     const video1Duration = editsApplied
@@ -54,6 +62,8 @@ export async function combineVideos(
       ? await getMediaDurationSeconds(video2Path)
       : (options.video2Duration ?? (await getMediaDurationSeconds(video2Path)))

+    progress?.step('Detecting speech')
+    progress?.setLabel('Checking first video')
     const video1HasSpeech = await checkSegmentHasSpeech(
       video1Path,
       video1Duration,
@@ -67,6 +77,7 @@ export async function combineVideos(
       }
     }

+    progress?.setLabel('Finding first video speech end')
     const paddingSeconds =
       (options.overlapPaddingMs ?? EDIT_CONFIG.speechBoundaryPaddingMs) / 1000

@@ -74,6 +85,7 @@ export async function combineVideos(
       inputPath: video1Path,
       duration: video1Duration,
     })
+    progress?.setLabel('Finding second video speech bounds')
     const { speechStart: video2SpeechStart, speechEnd: video2SpeechEnd } =
       await findVideo2SpeechBounds({
         inputPath: video2Path,
@@ -103,8 +115,10 @@ export async function combineVideos(
       video2Duration,
     )

+    progress?.step('Trimming segments')
     const segment1Path = path.join(tempDir, 'segment-1.mp4')
     const segment2Path = path.join(tempDir, 'segment-2.mp4')
+    progress?.setLabel('Extracting segment 1/2')
     await extractChapterSegmentAccurate({
       inputPath: video1Path,
       outputPath: segment1Path,
@@ -119,6 +133,7 @@ export async function combineVideos(
         video2TrimStart,
       }
     }
+    progress?.setLabel('Extracting segment 2/2')
     await extractChapterSegmentAccurate({
       inputPath: video2Path,
       outputPath: segment2Path,
@@ -126,6 +141,7 @@ export async function combineVideos(
       end: video2TrimEnd,
     })

+    progress?.setLabel('Validating trimmed speech')
     const segment2HasSpeech = await checkSegmentHasSpeech(
       segment2Path,
       video2TrimEnd - video2TrimStart,
@@ -139,6 +155,7 @@ export async function combineVideos(
       }
     }

+    progress?.step('Combining output')
     const resolvedOutputPath = await resolveOutputPath(
       options.outputPath,
       video1Path,
@@ -151,6 +168,7 @@ export async function combineVideos(
       outputPath: resolvedOutputPath,
     })
     await finalizeOutput(resolvedOutputPath, options.outputPath)
+    progress?.finish('Complete')

     return {
       success: true,
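A note on the step arithmetic in combineVideos above: stepCount is 5 but only four step() calls follow ('Measuring durations', 'Detecting speech', 'Trimming segments', 'Combining output'), which balances only if start() itself renders the first step ('Preparing edits'); the interleaved setLabel() calls relabel whichever step is current without advancing the count. The same accounting holds for editVideo below ('Loading transcript' plus four steps for its stepCount of 5).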
@@ -176,18 +194,21 @@ async function applyOptionalEdits(
 ): Promise<{ video1Path: string; video2Path: string }> {
   let video1Path = options.video1Path
   let video2Path = options.video2Path
+  const editProgressFactory = options.editProgressFactory

   if (options.video1EditedTextPath) {
     if (!options.video1TranscriptJsonPath) {
       throw new Error('Missing transcript JSON for first video edits.')
     }
     const editedPath = path.join(tempDir, 'video1-edited.mp4')
+    const progress = editProgressFactory?.('Edit first video')
    const result = await editVideo({
       inputPath: options.video1Path,
       transcriptJsonPath: options.video1TranscriptJsonPath,
       editedTextPath: options.video1EditedTextPath,
       outputPath: editedPath,
       paddingMs: options.overlapPaddingMs,
+      progress,
     })
     if (!result.success) {
       throw new Error(result.error ?? 'Failed to edit first video.')
@@ -200,12 +221,14 @@ async function applyOptionalEdits(
       throw new Error('Missing transcript JSON for second video edits.')
     }
     const editedPath = path.join(tempDir, 'video2-edited.mp4')
+    const progress = editProgressFactory?.('Edit second video')
     const result = await editVideo({
       inputPath: options.video2Path,
       transcriptJsonPath: options.video2TranscriptJsonPath,
       editedTextPath: options.video2EditedTextPath,
       outputPath: editedPath,
       paddingMs: options.overlapPaddingMs,
+      progress,
     })
     if (!result.success) {
       throw new Error(result.error ?? 'Failed to edit second video.')
@@ -11,6 +11,7 @@ import {
 } from './timestamp-refinement'
 import type { TimeRange } from '../types'
 import type { TranscriptJson, TranscriptWordWithIndex } from './types'
+import type { StepProgressReporter } from '../../progress-reporter'

 export interface EditVideoOptions {
   inputPath: string
@@ -18,6 +19,7 @@ export interface EditVideoOptions {
   editedTextPath: string
   outputPath: string
   paddingMs?: number
+  progress?: StepProgressReporter
 }

 export interface EditVideoResult {
@@ -37,8 +39,13 @@ export async function editVideo(
   options: EditVideoOptions,
 ): Promise<EditVideoResult> {
   try {
+    const progress = options.progress
+    const totalSteps = 5
+    progress?.start({ stepCount: totalSteps, label: 'Loading transcript' })
+
     const transcript = await readTranscriptJson(options.transcriptJsonPath)
     const editedText = await Bun.file(options.editedTextPath).text()
+    progress?.step('Validating edits')
     const validation = validateEditedTranscript({
       originalWords: transcript.words,
       editedText,
@@ -51,6 +58,7 @@ export async function editVideo(
         removedRanges: [],
       }
     }
+    progress?.step('Diffing transcript')
     const diffResult = diffTranscripts({
       originalWords: transcript.words,
       editedText,
@@ -64,9 +72,12 @@ export async function editVideo(
       }
     }

+    progress?.step('Planning edits')
     const removedWords = diffResult.removedWords
     if (removedWords.length === 0) {
+      progress?.step('Rendering output')
       await ensureOutputCopy(options.inputPath, options.outputPath)
+      progress?.finish('No edits')
       return {
         success: true,
         outputPath: options.outputPath,
@@ -77,7 +88,9 @@ export async function editVideo(

     const removalRanges = wordsToTimeRanges(removedWords)
     if (removalRanges.length === 0) {
+      progress?.step('Rendering output')
       await ensureOutputCopy(options.inputPath, options.outputPath)
+      progress?.finish('No ranges')
       return {
         success: true,
         outputPath: options.outputPath,
@@ -86,6 +99,7 @@ export async function editVideo(
       }
     }

+    progress?.setLabel('Refining ranges')
     const refinedRanges = await refineAllRemovalRanges({
       inputPath: options.inputPath,
       duration: transcript.source_duration,
@@ -111,6 +125,7 @@ export async function editVideo(
       }
     }

+    progress?.step('Rendering output')
     await mkdir(path.dirname(options.outputPath), { recursive: true })

     const isFullRange =
@@ -120,6 +135,7 @@ export async function editVideo(
       keepRanges[0].end >= transcript.source_duration - 0.001
     if (isFullRange) {
       await ensureOutputCopy(options.inputPath, options.outputPath)
+      progress?.finish('Complete')
       return {
         success: true,
         outputPath: options.outputPath,
@@ -129,12 +145,14 @@ export async function editVideo(
     }

     if (keepRanges.length === 1 && keepRanges[0]) {
+      progress?.setLabel('Extracting segment')
       await extractChapterSegmentAccurate({
         inputPath: options.inputPath,
         outputPath: options.outputPath,
         start: keepRanges[0].start,
         end: keepRanges[0].end,
       })
+      progress?.finish('Complete')
       return {
         success: true,
         outputPath: options.outputPath,
@@ -147,6 +165,9 @@ export async function editVideo(
     try {
       const segmentPaths: string[] = []
       for (const [index, range] of keepRanges.entries()) {
+        progress?.setLabel(
+          `Extracting segment ${index + 1}/${keepRanges.length}`,
+        )
         const segmentPath = path.join(tempDir, `segment-${index + 1}.mp4`)
         await extractChapterSegmentAccurate({
           inputPath: options.inputPath,
@@ -156,10 +177,12 @@ export async function editVideo(
         })
         segmentPaths.push(segmentPath)
       }
+      progress?.setLabel('Concatenating segments')
       await concatSegments({
         segmentPaths,
         outputPath: options.outputPath,
       })
+      progress?.finish('Complete')
       return {
         success: true,
         outputPath: options.outputPath,
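Two details worth noticing across the editVideo hunks: the reporter is threaded through as an optional and only ever invoked with progress?., so non-interactive callers simply pass undefined and no null-object is needed; and every return path (no removed words, no removal ranges, a full-range keep, a single-segment keep, and the multi-segment concat) ends by calling progress?.finish(...), so the reporter is closed no matter which branch produces the output file.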
package/src/cli.ts
CHANGED
@@ -27,6 +27,7 @@ import {
   PromptCancelled,
   createInquirerPrompter,
   createPathPicker,
+  createStepProgressReporter,
   isInteractive,
   pauseActiveSpinner,
   resumeActiveSpinner,
@@ -156,18 +157,33 @@ async function main(rawArgs = hideBin(process.argv)) {
         }),
         async (argv) => {
           const transcribeArgs = await resolveTranscribeArgs(argv, context)
+          const progress = context.interactive
+            ? createStepProgressReporter({ action: 'Transcribing audio' })
+            : undefined
           let resultText = ''
           await withSpinner(
             'Transcribing audio',
             async () => {
-              const result = await transcribeAudio(transcribeArgs.inputPath, {
-                modelPath: transcribeArgs.modelPath,
-                language: transcribeArgs.language,
-                threads: transcribeArgs.threads,
-                binaryPath: transcribeArgs.binaryPath,
-                outputBasePath: transcribeArgs.outputBasePath,
+              setLogHooks({
+                beforeLog: pauseActiveSpinner,
+                afterLog: resumeActiveSpinner,
               })
-              resultText = result.text
+              try {
+                const result = await transcribeAudio(
+                  transcribeArgs.inputPath,
+                  {
+                    modelPath: transcribeArgs.modelPath,
+                    language: transcribeArgs.language,
+                    threads: transcribeArgs.threads,
+                    binaryPath: transcribeArgs.binaryPath,
+                    outputBasePath: transcribeArgs.outputBasePath,
+                    progress,
+                  },
+                )
+                resultText = result.text
+              } finally {
+                setLogHooks({})
+              }
             },
             {
               successText: 'Transcription complete',
@@ -203,16 +219,28 @@ async function main(rawArgs = hideBin(process.argv)) {
             argv,
             context,
           )
+          const progress = context.interactive
+            ? createStepProgressReporter({ action: 'Detecting speech' })
+            : undefined
          let segments: unknown = []
           await withSpinner(
             'Detecting speech',
             async () => {
-              await ensureFfmpegAvailable()
-              segments = await detectSpeechSegmentsForFile({
-                inputPath,
-                start,
-                end,
+              setLogHooks({
+                beforeLog: pauseActiveSpinner,
+                afterLog: resumeActiveSpinner,
               })
+              try {
+                await ensureFfmpegAvailable()
+                segments = await detectSpeechSegmentsForFile({
+                  inputPath,
+                  start,
+                  end,
+                  progress,
+                })
+              } finally {
+                setLogHooks({})
+              }
             },
             {
               successText: 'Speech detection complete',
package/src/speech-detection.ts
CHANGED
@@ -6,6 +6,7 @@ import { CONFIG } from '../process-course/config'
 import { formatSeconds, getMediaDurationSeconds } from './utils'
 import { speechFallback } from '../process-course/utils/audio-analysis'
 import type { SpeechBounds } from '../process-course/types'
+import type { StepProgressReporter } from '../progress-reporter'

 export type VadConfig = {
   vadWindowSamples: number
@@ -27,6 +28,7 @@ export async function detectSpeechSegmentsWithVad(
   samples: Float32Array,
   sampleRate: number,
   config: VadConfig,
+  options?: { onProgress?: () => void; updateStride?: number },
 ): Promise<VadSegment[]> {
   const vadSession = await getVadSession(config)
   const probabilities = await getVadProbabilities(
@@ -34,6 +36,7 @@ export async function detectSpeechSegmentsWithVad(
     sampleRate,
     config,
     vadSession,
+    options,
   )
   return probabilitiesToSegments(
     samples.length,
@@ -47,7 +50,10 @@ export async function detectSpeechSegmentsForFile(options: {
   inputPath: string
   start?: number
   end?: number
+  progress?: StepProgressReporter
 }): Promise<SpeechSegment[]> {
+  const progress = options.progress
+  progress?.start({ stepCount: 1, label: 'Loading audio' })
   const start = options.start ?? 0
   if (!Number.isFinite(start) || start < 0) {
     throw new Error('Start time must be a non-negative number.')
@@ -66,13 +72,31 @@ export async function detectSpeechSegmentsForFile(options: {
     sampleRate: CONFIG.vadSampleRate,
   })
   if (samples.length === 0) {
+    progress?.finish('No audio')
     return []
   }
+  const windowSamples = CONFIG.vadWindowSamples
+  const totalWindows = Math.ceil(samples.length / windowSamples)
+  const updateStride = Math.max(1, Math.floor(totalWindows / 50))
+  const updateCount = Math.max(1, Math.ceil(totalWindows / updateStride))
+  progress?.start({ stepCount: updateCount, label: 'Running VAD' })
+  let progressUpdates = 0
   const segments = await detectSpeechSegmentsWithVad(
     samples,
     CONFIG.vadSampleRate,
     CONFIG,
+    {
+      onProgress: () => {
+        progressUpdates += 1
+        if (progressUpdates <= updateCount) {
+          progress?.step('Running VAD')
+        }
+      },
+      updateStride,
+    },
   )
+  progress?.setLabel('Building segments')
+  progress?.finish('Complete')
   return segments.map((segment) => ({
     start: segment.start + start,
     end: segment.end + start,
@@ -116,6 +140,7 @@ async function getVadProbabilities(
   sampleRate: number,
   config: VadConfig,
   session: ort.InferenceSession,
+  options?: { onProgress?: () => void; updateStride?: number },
 ) {
   const windowSamples = config.vadWindowSamples
   const srTensor = new ort.Tensor(
@@ -126,6 +151,8 @@ async function getVadProbabilities(
   const probabilities: number[] = []
   let stateH = new Float32Array(2 * 1 * 64)
   let stateC = new Float32Array(2 * 1 * 64)
+  const updateStride = Math.max(1, Math.floor(options?.updateStride ?? 1))
+  let updateIndex = 0

   for (let offset = 0; offset < samples.length; offset += windowSamples) {
     const chunk = samples.subarray(offset, offset + windowSamples)
@@ -154,6 +181,10 @@ async function getVadProbabilities(
     probabilities.push((probTensor.data as Float32Array)[0] ?? 0)
     stateH = new Float32Array(nextH.data as Float32Array)
     stateC = new Float32Array(nextC.data as Float32Array)
+    if (updateIndex % updateStride === 0) {
+      options?.onProgress?.()
+    }
+    updateIndex += 1
   }

   return probabilities
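The stride arithmetic in detectSpeechSegmentsForFile caps progress updates at roughly fifty regardless of input length. As a worked example, assuming 16 kHz audio and 512-sample VAD windows (typical Silero-style values; the real numbers come from CONFIG), ten minutes of audio is 9,600,000 samples, so totalWindows = ceil(9,600,000 / 512) = 18,750, updateStride = floor(18,750 / 50) = 375, and updateCount = ceil(18,750 / 375) = 50. getVadProbabilities then fires onProgress on every 375th window (exactly 50 times here), and the progressUpdates <= updateCount guard drops any extras so the reporter never steps past its declared stepCount.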
@@ -1,6 +1,7 @@
 import path from 'node:path'
 import { mkdir } from 'node:fs/promises'
 import { runCommand } from './utils'
+import type { StepProgressReporter } from '../progress-reporter'

 const DEFAULT_MODEL_FILENAME = 'ggml-small.en.bin'
 const DEFAULT_MODEL_URL =
@@ -14,6 +15,7 @@ type TranscribeOptions = {
   threads?: number
   binaryPath?: string
   outputBasePath?: string
+  progress?: StepProgressReporter
 }

 export type TranscriptSegment = {
@@ -36,6 +38,7 @@ export async function transcribeAudio(
   audioPath: string,
   options: TranscribeOptions = {},
 ): Promise<TranscriptionResult> {
+  const progress = options.progress
   const resolvedAudioPath = path.resolve(audioPath)
   const resolvedModelPath = path.resolve(
     options.modelPath ?? getDefaultWhisperModelPath(),
@@ -49,7 +52,9 @@ export async function transcribeAudio(
     `${path.parse(resolvedAudioPath).name}-transcript`,
   )

-  await ensureModelFile(resolvedModelPath)
+  const totalSteps = 3
+  progress?.start({ stepCount: totalSteps, label: 'Checking model' })
+  await ensureModelFile(resolvedModelPath, progress)

   const args = [
     binaryPath,
@@ -69,17 +74,23 @@ export async function transcribeAudio(
     args.push('-t', String(options.threads))
   }

+  progress?.step('Transcribing audio')
   const result = await runCommand(args)
+  progress?.step('Reading output')
   const transcriptPath = `${outputBasePath}.txt`
   const transcript = await readTranscriptText(transcriptPath, result.stdout)
   const { segments, source } = await readTranscriptSegments(
     `${outputBasePath}.json`,
   )
   const normalized = normalizeTranscriptText(transcript)
+  progress?.finish('Complete')
   return { text: normalized, segments, segmentsSource: source }
 }

-async function ensureModelFile(modelPath: string) {
+async function ensureModelFile(
+  modelPath: string,
+  progress?: StepProgressReporter,
+) {
   const file = Bun.file(modelPath)
   if (await file.exists()) {
     return
@@ -90,6 +101,7 @@ async function ensureModelFile(modelPath: string) {
     throw new Error(`Whisper model not found at ${modelPath}.`)
   }

+  progress?.setLabel('Downloading model')
   await mkdir(path.dirname(modelPath), { recursive: true })
   const response = await fetch(DEFAULT_MODEL_URL)
   if (!response.ok) {