describealign 1.0.8__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
describealign.py CHANGED
@@ -1,1221 +1,1292 @@
- # combines videos with matching audio files (e.g. audio descriptions)
- # input: video or folder of videos and an audio file or folder of audio files
- # output: videos in a folder "videos_with_ad", with aligned segments of the audio replaced
- # this script aligns the new audio to the video using the video's old audio
- # first, the video's sound and the audio file are both converted to spectrograms
- # second, the two spectrograms are roughly aligned by finding their longest common subsequence
- # third, the rough alignment is denoised through L1-Minimization
- # fourth, the spectrogram alignments determine where the new audio replaces the old
-
- '''
- Copyright (C) 2023 Julian Brown
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <https://www.gnu.org/licenses/>.
- '''
-
- VIDEO_EXTENSIONS = set(['mp4', 'mkv', 'avi', 'mov', 'webm', 'm4v', 'flv', 'vob'])
- AUDIO_EXTENSIONS = set(['mp3', 'm4a', 'opus', 'wav', 'aac', 'flac', 'ac3', 'mka'])
- PLOT_ALIGNMENT_TO_FILE = True
-
- TIMESTEP_SIZE_SECONDS = .16
- TIMESTEP_OVERLAP_RATIO = .5
- AUDIO_SAMPLE_RATE = 44100
- MEL_COEFFS_PER_TIMESTEP = 25
- DITHER_PERIOD_STEPS = 60
- MIN_CORR_FOR_TOKEN_MATCH = .6
- GAP_START_COST = 1.0
- GAP_EXTEND_COST = -.01
- GAP_EXTEND_DIAG_BONUS = -.01
- SKIP_MATCH_COST = .1
- MAX_RATE_RATIO_DIFF_ALIGN = .1
- PREF_CUT_AT_GAPS_FACTOR = 5
- MIN_DURATION_TO_REPLACE_SECONDS = 2
- MIN_START_END_SYNC_TIME_SECONDS = 2
- MAX_START_END_SYNC_ERR_SECONDS = .2
- MAX_RATE_RATIO_DIFF_BOOST = .003
- MIN_DESC_DURATION = .5
- MAX_GAP_IN_DESC_SEC = 1.5
- JUST_NOTICEABLE_DIFF_IN_FREQ_RATIO = .005
- CATCHUP_RATE = 5
-
- if PLOT_ALIGNMENT_TO_FILE:
-   import matplotlib.pyplot as plt
- import argparse
- import os
- import glob
- import itertools
- import datetime
- import numpy as np
- import ffmpeg
- import static_ffmpeg
- import python_speech_features as psf
- import scipy.signal
- import scipy.optimize
- import scipy.interpolate
- import scipy.ndimage as nd
- import scipy.sparse
- import pytsmod
- import configparser
- import traceback
- import multiprocessing
- import platform
-
- IS_RUNNING_WINDOWS = platform.system() == 'Windows'
- if IS_RUNNING_WINDOWS:
-   import PySimpleGUIWx as sg
-   default_output_dir = 'videos_with_ad'
-   default_alignment_dir = 'alignment_plots'
- else:
-   import PySimpleGUIQt as sg
-   default_output_dir = os.path.expanduser('~') + '/videos_with_ad'
-   default_alignment_dir = os.path.expanduser('~') + '/alignment_plots'
-
- def display(text, func=None):
-   if func:
-     func(text)
-   print(text)
-
- def throw_runtime_error(text, func=None):
-   if func:
-     func(text)
-   raise RuntimeError(text)
-
- def ensure_folders_exist(dirs, display_func=None):
-   for dir in dirs:
-     if not os.path.isdir(dir):
-       display("Directory not found, creating it: " + dir, display_func)
-       os.makedirs(dir)
-
- def get_sorted_filenames(path, extensions, alt_extensions=set([])):
-   # path could be three different things: a file, a directory, a list of files
-   if type(path) is list:
-     files = [os.path.abspath(file) for file in path]
-     for file in files:
-       if not os.path.isfile(file):
-         raise RuntimeError(f"No file found at input path:\n {file}")
-   else:
-     path = os.path.abspath(path)
-     if os.path.isdir(path):
-       files = glob.glob(glob.escape(path) + "/*")
-       if len(files) == 0:
-         raise RuntimeError(f"Empty input directory:\n {path}")
-     else:
-       if not os.path.isfile(path):
-         raise RuntimeError(f"No file or directory found at input path:\n {path}")
-       files = [path]
-   files = [file for file in files if os.path.splitext(file)[1][1:] in extensions | alt_extensions]
-   if len(files) == 0:
-     error_msg = [f"No files with valid extensions found at input path:\n {path}",
-                  "Did you accidentally put the audio filepath before the video filepath?",
-                  "The video path should be the first positional input, audio second.",
-                  "Or maybe you need to add a new extension to this script's regex?",
-                  f"valid extensions for this input are:\n {extensions}"]
-     raise RuntimeError("\n".join(error_msg))
-   files = sorted(files)
-   file_types = [0 if os.path.splitext(file)[1][1:] in extensions else 1 for file in files]
-   return files, file_types
-
- # read audio from file with ffmpeg and convert to numpy array
- def parse_audio_from_file(media_file):
-   media_stream, _ = (ffmpeg
-                      .input(media_file)
-                      .output('-', format='s16le', acodec='pcm_s16le', ac=2, ar=AUDIO_SAMPLE_RATE, loglevel='fatal')
-                      .run(capture_stdout=True, cmd=get_ffmpeg())
-                      )
-   media_arr = np.frombuffer(media_stream, np.int16).astype(np.float32).reshape((-1,2)).T
-   return media_arr
-
- # tokenize audio by transforming with a mel-frequency cepstrum (MFC)
- def tokenize_audio(media_arr, rate=1):
-   step_size_samples = psf.sigproc.round_half_up(TIMESTEP_SIZE_SECONDS * rate * AUDIO_SAMPLE_RATE)
-   window_size_seconds = TIMESTEP_SIZE_SECONDS / TIMESTEP_OVERLAP_RATIO
-   window_size_samples = psf.sigproc.round_half_up(window_size_seconds * AUDIO_SAMPLE_RATE)
-   fft_size_samples = 2**int(np.ceil(np.log2(window_size_samples)))
-   get_mfcc = lambda arr: psf.mfcc(np.mean(arr, axis=0),
-                                   samplerate=AUDIO_SAMPLE_RATE,
-                                   winlen=window_size_seconds,
-                                   winstep=TIMESTEP_SIZE_SECONDS * rate,
-                                   numcep=MEL_COEFFS_PER_TIMESTEP,
-                                   nfilt=MEL_COEFFS_PER_TIMESTEP * 2,
-                                   nfft=fft_size_samples,
-                                   winfunc=scipy.signal.windows.hann)
-   num_timesteps = max(1, ((media_arr.shape[1] - window_size_samples - 1) // step_size_samples) + 2)
-   media_spec = np.zeros((num_timesteps, MEL_COEFFS_PER_TIMESTEP))
-   chunk_size = 1000
-   for chunk_index in np.arange(0, num_timesteps, chunk_size):
-     chunk_bounds_samples = ((chunk_index             ) * step_size_samples,
-                             (chunk_index + chunk_size - 1) * step_size_samples + window_size_samples)
-     media_spec[chunk_index:chunk_index+chunk_size] = get_mfcc(media_arr[:,slice(*chunk_bounds_samples)])
-   '''
-   # alternate python library's MFC implementation
-   import librosa
-   media_spec = librosa.feature.mfcc(y=np.mean(media_arr, axis=0),
-                                     sr=AUDIO_SAMPLE_RATE,
-                                     n_mfcc=MEL_COEFFS_PER_TIMESTEP,
-                                     lifter=22,
-                                     n_fft=fft_size_samples,
-                                     hop_length=step_size_samples,
-                                     win_length=window_size_samples,
-                                     window=scipy.signal.windows.hann).T
-   num_timesteps = media_spec.shape[0]
-   '''
-   timings_samples = window_size_samples/2. + step_size_samples * np.arange(num_timesteps)
-   timings_seconds = timings_samples / AUDIO_SAMPLE_RATE
-   return media_spec, timings_seconds
-
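The MFC tokenization above turns samples into one 25-coefficient token per 0.16 s hop, using 0.32 s windows. A minimal sketch of the same psf.mfcc call on synthetic input (illustrative values, not part of the package; assumes python_speech_features and scipy are installed):

    import numpy as np
    import python_speech_features as psf
    import scipy.signal

    sr = 44100                                               # AUDIO_SAMPLE_RATE
    signal = np.random.default_rng(0).standard_normal(sr)    # 1 s of mono noise
    tokens = psf.mfcc(signal, samplerate=sr,
                      winlen=.32,            # TIMESTEP_SIZE_SECONDS / TIMESTEP_OVERLAP_RATIO
                      winstep=.16,           # TIMESTEP_SIZE_SECONDS
                      numcep=25, nfilt=50,   # MEL_COEFFS_PER_TIMESTEP and twice that
                      nfft=16384,            # next power of two above the 14112-sample window
                      winfunc=scipy.signal.windows.hann)
    print(tokens.shape)                      # (6, 25): six tokens for one second of audio
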
- # same as tokenize_audio, but dithering the MFC window timings
- # this allows for finer alignment by ameliorating discretization error
- def tokenize_audio_dither(media_arr, slow_timings):
-   # choose a relative step size slightly less than 1 to ameliorate quantization error
-   # maximize alignment accuracy by using least approximable number with desired period
-   # this is the continued fraction [0;1,N-2,1,1,1,...], where the trailing ones give phi
-   fast_rate = 1. / (1 + 1. / (DITHER_PERIOD_STEPS - 2 + (np.sqrt(5) + 1) / 2.))
-   fast_spec, fast_timings = tokenize_audio(media_arr, fast_rate)
-
-   # prevent drift in difficult to align segments (e.g. describer speaking or quiet/droning segments)
-   # by approximately equalizing the number of tokens per unit time between dithered and undithered
-   # the dithered audio will have ~(1 + 1 / DITHER_PERIOD_STEPS) times as many tokens, so
-   # this can be accomplished by simply deleting a token every DITHER_PERIOD_STEPS tokens
-   fast_spec = np.delete(fast_spec, slice(DITHER_PERIOD_STEPS // 2, None, DITHER_PERIOD_STEPS), axis=0)
-   fast_timings = np.delete(fast_timings, slice(DITHER_PERIOD_STEPS // 2, None, DITHER_PERIOD_STEPS))
-   return fast_spec, fast_timings
-
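The dither rate above is easy to sanity-check numerically: with DITHER_PERIOD_STEPS = 60, the continued fraction gives a hop about 1.7% shorter than normal, and deleting one token per period restores the original token rate almost exactly (a quick check, not part of the package):

    import numpy as np

    N = 60                                    # DITHER_PERIOD_STEPS
    phi = (np.sqrt(5) + 1) / 2
    fast_rate = 1. / (1 + 1. / (N - 2 + phi))
    print(fast_rate)                          # ~0.98350: dithered hops are ~1.7% shorter
    print((1 / fast_rate) * (N - 1) / N)      # ~0.99983: dropping 1 token in 60 rebalances
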
- # normalize along both time and frequency axes to allow comparing tokens by correlation
- def normalize_spec(media_spec_raw, axes=(0,1)):
-   media_spec = media_spec_raw.copy()
-   for axis in axes:
-     norm_func = np.std if axis == 0 else np.linalg.norm
-     media_spec = media_spec - np.mean(media_spec, axis=axis, keepdims=True)
-     media_spec = media_spec/(norm_func(media_spec,axis=axis,keepdims=True)+1e-10)
-   return media_spec
-
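Because normalize_spec leaves each token zero-mean with unit norm along the frequency axis, a plain dot product between two tokens is their correlation. A quick check, assuming the function above is in scope:

    import numpy as np

    spec = normalize_spec(np.random.default_rng(0).standard_normal((100, 25)))
    print(np.allclose(np.linalg.norm(spec, axis=1), 1))   # True: unit-norm tokens
    print(float(spec[3] @ spec[7]))                       # a correlation in [-1, 1]
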
- # vectorized implementation of the Wagner–Fischer (Longest Common Subsequence) algorithm
- # modified to include affine gap penalties and skip+match options (i.e. knight's moves)
- # gaps are necessary when parts are cut out of the audio description (e.g. cut credits)
- # or when the audio description includes a commercial break or an extra scene
- # the skip+match option allows for micro-adjustments without eating the full gap penalty
- # skip+match is primarily useful in maintaining alignment when the rates differ slightly
- def rough_align(video_spec, audio_desc_spec, video_timings, audio_desc_timings):
-   pred_map = {0:lambda node: (0, node[1]-1, node[2]-1),
-               1:lambda node: (0, node[1]-2, node[2]-1),
-               2:lambda node: (0, node[1]-1, node[2]-2),
-               3:lambda node: (1, node[1]-1, node[2]-1),
-               4:lambda node: (0, node[1]  , node[2]  ),
-               5:lambda node: (1, node[1]-1, node[2]  ),
-               6:lambda node: (1, node[1]-1, node[2]-1),
-               7:lambda node: (1, node[1]  , node[2]-1)}
-   pred_matrix = np.zeros((2, audio_desc_spec.shape[0], video_spec.shape[0]), dtype=np.uint8)
-   pred_matrix[0,1:,:2] = 0
-   pred_matrix[1,1:,:2] = 4
-   pred_matrix[:,0,:2] = [0,5]
-   path_corrs_match = np.zeros((3, video_spec.shape[0]))
-   path_corrs_gap = np.zeros((3, video_spec.shape[0]))
-   corrs = np.zeros((3, video_spec.shape[0]))
-   corrs[:,:] = np.roll(np.dot(video_spec, audio_desc_spec[0]), 1)[None,:]
-   for i in range(audio_desc_spec.shape[0]):
-     i_mod = i % 3
-     match_pred_corrs = np.hstack([path_corrs_match[i_mod-1][1:-1][:,None],
-                                   path_corrs_match[i_mod-2][1:-1][:,None] - SKIP_MATCH_COST,
-                                   path_corrs_match[i_mod-1][0:-2][:,None] - SKIP_MATCH_COST,
-                                   path_corrs_gap[  i_mod-1][1:-1][:,None]])
-     pred_matrix[0][i][2:] = np.argmax(match_pred_corrs, axis=1)
-     path_corrs_match[i_mod][2:] = np.take_along_axis(match_pred_corrs, pred_matrix[0][i][2:,None], axis=1).T
-     corrs = np.roll(corrs, -1, axis=1)
-     corrs[(i_mod+1)%3,:] = np.roll(np.dot(video_spec, audio_desc_spec[min(audio_desc_spec.shape[0]-1,i+1)]), 1)
-     fisher_infos = (2 * corrs[i_mod] - corrs[i_mod-1] - corrs[(i_mod+1)%3]) / min(.2, TIMESTEP_SIZE_SECONDS)
-     fisher_infos[fisher_infos < 0] = 0
-     fisher_infos[fisher_infos > 10] = 10
-     row_corrs = np.maximum(0, corrs[i_mod][2:] - MIN_CORR_FOR_TOKEN_MATCH)
-     path_corrs_match[i_mod][2:] += row_corrs * (fisher_infos[2:] / 5)
-     gap_pred_corrs = np.hstack([path_corrs_match[i_mod][2:  ][:,None] - GAP_START_COST,
-                                 path_corrs_gap[i_mod-1][2:  ][:,None],
-                                 path_corrs_gap[i_mod-1][1:-1][:,None] - GAP_EXTEND_DIAG_BONUS - \
-                                                                         GAP_EXTEND_COST])
-     pred_matrix[1][i][2:] = np.argmax(gap_pred_corrs, axis=1)
-     path_corrs_gap_no_col_skip = np.take_along_axis(gap_pred_corrs, pred_matrix[1][i][2:,None], axis=1).flat
-     pred_matrix[1][i][2:] += 4
-     path_corrs_gap[i_mod][2:] = np.maximum.accumulate(path_corrs_gap_no_col_skip + \
-                                                       GAP_EXTEND_COST * np.arange(video_spec.shape[0]-2)) - \
-                                 GAP_EXTEND_COST * np.arange(video_spec.shape[0]-2)
-     pred_matrix[1][i][2:][path_corrs_gap[i_mod][2:] > path_corrs_gap_no_col_skip] = 7
-     path_corrs_gap[i_mod][2:] -= GAP_EXTEND_COST
-
-   # reconstruct optimal path by following predecessors backwards through the table
-   end_node_layer = np.argmax([path_corrs_match[i_mod,-1],
-                               path_corrs_gap[  i_mod,-1]])
-   cur_node = (end_node_layer, audio_desc_spec.shape[0]-1, video_spec.shape[0]-1)
-   get_predecessor = lambda node: pred_map[pred_matrix[node]](node)
-   path = []
-   visited = set()
-   while min(cur_node[1:]) >= 0:
-     cur_node, last_node = get_predecessor(cur_node), cur_node
-     # failsafe to prevent an infinite loop that should never happen anyways
-     if cur_node in visited:
-       break
-     visited.add(cur_node)
-     if last_node[0] == 0:
-       path.append(last_node[1:])
-   path = path[::-1]
-
-   # determine how much information this node gives about the alignment
-   # a larger double derivative means more precise timing information
-   # sudden noises give more timing information than droning sounds
-   def get_fisher_info(node):
-     i,j = node
-     if node[0] >= audio_desc_spec.shape[0]-1 or \
-        node[1] >= video_spec.shape[0]-1 or \
-        min(node) <= 0:
-       return 0
-     info = 2*np.dot(audio_desc_spec[i ],video_spec[j ]) - \
-            np.dot(audio_desc_spec[i-1],video_spec[j+1]) - \
-            np.dot(audio_desc_spec[i+1],video_spec[j-1])
-     info /= min(.2, TIMESTEP_SIZE_SECONDS)
-     return info
-
-   # the quality of a node combines the correlation of its tokens
-   # with how precisely the match is localized in time
-   def get_match_quality(node):
-     # correlations are between -1 and 1, as all tokens have unit norm
-     token_correlation = np.dot(audio_desc_spec[node[0]],video_spec[node[1]])
-     fisher_info = min(max(0, get_fisher_info(node)), 10)
-     return max(0, token_correlation - MIN_CORR_FOR_TOKEN_MATCH) * (fisher_info / 5)
-
-   # filter out low match quality nodes from LCS path
-   quals = [get_match_quality(node) for node in path]
-   if len(quals) == 0 or max(quals) <= 0:
-     raise RuntimeError("Rough alignment failed, are the input files mismatched?")
-   path, quals = zip(*[(path, qual) for (path, qual) in zip(path, quals) if qual > 0])
-
-   # convert units of path nodes from timesteps to seconds
-   path = [(audio_desc_timings[i], video_timings[j]) for (i,j) in path]
-
-   return path, quals
-
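rough_align is a heavily vectorized variant of that dynamic program. Stripped of the affine gap penalties, skip+match moves, and Fisher-information weighting, the underlying recurrence is the familiar LCS table plus a backtrace (an illustrative toy, not the package's implementation):

    import numpy as np

    def lcs_align(matches):
        # matches[i, j] is True where audio token i and video token j correlate
        n, m = matches.shape
        score = np.zeros((n + 1, m + 1))
        for i in range(n):
            for j in range(m):
                score[i + 1, j + 1] = max(score[i, j] + matches[i, j],  # diagonal: match
                                          score[i, j + 1],              # gap in one stream
                                          score[i + 1, j])              # gap in the other
        path, i, j = [], n, m
        while i > 0 and j > 0:  # walk predecessors back from the bottom-right corner
            if matches[i - 1, j - 1] and score[i, j] == score[i - 1, j - 1] + 1:
                path.append((i - 1, j - 1)); i -= 1; j -= 1
            elif score[i, j] == score[i - 1, j]:
                i -= 1
            else:
                j -= 1
        return path[::-1]

    toy = np.equal.outer([ord(c) for c in 'ABCD'], [ord(c) for c in 'ABXCD'])
    print(lcs_align(toy))   # [(0, 0), (1, 1), (2, 3), (3, 4)]: 'X' is skipped via a gap
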
- # chunk path segments of similar slope into clips
- # a clip has the form: (start_index, end_index)
- def chunk_path(smooth_path, tol):
-   x,y = zip(*smooth_path)
-   slopes = np.diff(y) / np.diff(x)
-   median_slope = np.median(slopes)
-   slope_changes = np.diff(slopes)
-   breaks = np.where(np.abs(slope_changes) > tol)[0] + 1
-   breaks = [0] + list(breaks) + [len(x)-1]
-   clips = list(zip(breaks[:-1], breaks[1:]))
-   return clips, median_slope, slopes
-
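A toy invocation of chunk_path (hypothetical numbers), showing how a rate change splits the path into constant-slope clips:

    path = [(0, 0), (1, 1), (2, 2), (3, 3), (4, 3.5), (5, 4)]   # rate 1.0, then 0.5
    clips, median_slope, slopes = chunk_path(path, tol=1e-7)
    print(clips)          # [(0, 3), (3, 5)]: one (start_index, end_index) pair per clip
    print(median_slope)   # 1.0
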
- # find piece-wise linear alignment that minimizes the weighted combination of
- # total absolute error at each node and total absolute slope change of the fit
- # distance between nodes and the fit (i.e. errors) are weighted by node quality
- # absolute slope changes are differences between the slopes of adjacent fit lines
- # slope changes are weighted much more than node errors to smooth out noise
- # the main source of noise is rough alignment drift while the describer is speaking
- def smooth_align(path, quals, smoothness):
-   # rotate basis to make vertical and horizontal slopes "cost" the same
-   # the new horizontal axis is x+y and the new vertical is -x+y
-   # Wagner–Fischer gives monotonically increasing nodes, so 0 <= slope < inf
-   # after this transformation, we instead have -1 <= slope < 1
-   # perfectly matching audio has pre-transformation slope = 1
-   # after this transformation, it instead has slope = 0
-   rotated_path = [(x+y,-x+y) for x,y in path]
-
-   # stretch the x axis to make all slopes "cost" nearly the same
-   # without this, small changes to the slope at slope = +/-1
-   # cost sqrt(2) times as much as small changes at slope = 0
-   # by stretching, we limit the range of slopes to within +/- 1/x_stretch_factor
-   # the small angle approximation means these slopes all cost roughly the same
-   x_stretch_factor = 10.
-   rotated_stretched_path = [(x_stretch_factor*x,y) for x,y in rotated_path]
-
-   # L1-Minimization to solve the alignment problem using a linear program
-   # the absolute value functions needed for "absolute error" can be represented
-   # in a linear program by splitting variables into positive and negative pieces
-   # and constraining each to be positive (done by default in scipy's linprog)
-   # x is fit_err_pos, fit_err_neg, slope_change_pos, slope_change_neg
-   # fit_err[i] = path[i][1] - y_fit[i]
-   # slope_change[i] = (y_fit[i+2] - y_fit[i+1])/(path[i+2][0] - path[i+1][0]) - \
-   #                   (y_fit[i+1] - y_fit[i  ])/(path[i+1][0] - path[i  ][0])
-   # this can be rewritten in terms of fit_err by re-arranging the 1st equation:
-   # y_fit[i] = path[i][1] - fit_err[i]
-   # this gives:
-   # slope_change[i] = path_half[i] - fit_err_half[i]
-   # where each half is just the original equation but y_fit is swapped out
-   # the slope_change variables can then be set using equality constraints
-   num_fit_points = len(rotated_stretched_path)
-   x,y = [np.array(arr) for arr in zip(*rotated_stretched_path)]
-   x_diffs = np.diff(x, prepend=[-10**10], append=[10**10])
-   y_diffs = np.diff(y, prepend=[ 0 ], append=[ 0 ])
-   slope_change_magnitudes = np.abs(np.diff(y_diffs/x_diffs)) * x_stretch_factor
-   slope_change_locations = (slope_change_magnitudes > MAX_RATE_RATIO_DIFF_ALIGN)
-   slope_change_locations[1:-1] *= (np.abs(y[2:] - y[:-2]) > 5)
-   slope_change_costs = np.full(num_fit_points, smoothness / float(TIMESTEP_SIZE_SECONDS))
-   slope_change_costs[slope_change_locations] /= PREF_CUT_AT_GAPS_FACTOR
-   c = np.hstack([quals,
-                  quals,
-                  slope_change_costs * x_stretch_factor,
-                  slope_change_costs * x_stretch_factor])
-   fit_err_coeffs = scipy.sparse.diags([ 1. / x_diffs[:-1],
-                                        -1. / x_diffs[:-1] - 1. / x_diffs[1:],
-                                         1. / x_diffs[1:]],
-                                       offsets=[0,1,2],
-                                       shape=(num_fit_points, num_fit_points + 2)).tocsc()[:,1:-1]
-   A_eq = scipy.sparse.hstack([ fit_err_coeffs,
-                               -fit_err_coeffs,
-                                scipy.sparse.eye(num_fit_points),
-                               -scipy.sparse.eye(num_fit_points)])
-   b_eq = y_diffs[1:  ] / x_diffs[1:  ] - \
-          y_diffs[ :-1] / x_diffs[ :-1]
-   fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq)
-   if not fit.success:
-     print(fit)
-     raise RuntimeError("Smooth Alignment L1-Min Optimization Failed!")
-
-   # combine fit_err_pos and fit_err_neg
-   fit_err = fit.x[:num_fit_points] - fit.x[num_fit_points:2*num_fit_points]
-
-   # subtract fit errors from nodes to retrieve the smooth fit's coordinates
-   # also, unstretch x axis and rotate basis back, reversing the affine pre-processing
-   smooth_path = [(((x / x_stretch_factor) - y) / 2.,
-                   ((x / x_stretch_factor) + y) / 2.) for x,y in zip(x, y - fit_err)]
-
-   # clip off start/end of replacement audio if it doesn't match or isn't aligned
-   # without this, describer intro/outro skips can cause mismatches at the start/end
-   # the problem would be localized and just means audio might not match video at the start/end
-   # instead we just keep the original video's audio in those segments if mismatches are detected
-   # if instead the first few or last few nodes are well-aligned, that edge is marked as synced
-   # during audio replacement, synced edges will be extended backwards/forwards as far as possible
-   # this is useful when the describer begins talking immediately (or before any alignable audio)
-   # or when the describer continues speaking until the end (or no more alignable audio remains)
-   # otherwise, the mismatch would result in the describer's voice not replacing audio in that part
-   max_sync_err = MAX_START_END_SYNC_ERR_SECONDS
-   smoothing_std = MIN_START_END_SYNC_TIME_SECONDS / (2. * TIMESTEP_SIZE_SECONDS)
-   smoothed_fit_err = nd.gaussian_filter(np.abs(fit_err), sigma=smoothing_std)
-   smooth_err_path = zip(smoothed_fit_err, smooth_path)
-   old_length = num_fit_points
-   smooth_err_path = list(itertools.dropwhile(lambda x: x[0] > max_sync_err, smooth_err_path))[::-1]
-   is_synced_at_start = len(smooth_err_path) == old_length
-   old_length = len(smooth_err_path)
-   smooth_err_path = list(itertools.dropwhile(lambda x: x[0] > max_sync_err, smooth_err_path))[::-1]
-   is_synced_at_end = len(smooth_err_path) == old_length
-   _, smooth_path = zip(*smooth_err_path)
-   smooth_path = list(smooth_path)
-   if is_synced_at_start:
-     slope = (smooth_path[1][1] - smooth_path[0][1]) / (smooth_path[1][0] - smooth_path[0][0])
-     smooth_path.insert(0, (-10e10, -10e10 * slope))
-   if is_synced_at_end:
-     slope = (smooth_path[-1][1] - smooth_path[-2][1]) / (smooth_path[-1][0] - smooth_path[-2][0])
-     smooth_path.append((10e10, 10e10 * slope))
-
-   clips, median_slope, slopes = chunk_path(smooth_path, tol=1e-7)
-
-   # assemble clips with slopes within the rate tolerance into runs
-   runs, run = [], []
-   bad_clips = []
-   for clip in clips:
-     if np.abs(median_slope-slopes[clip[0]]) > MAX_RATE_RATIO_DIFF_ALIGN:
-       if len(run) > 0:
-         runs.append(run)
-         run = []
-       bad_clips.append(clip)
-       continue
-     run.append(clip)
-   if len(run) > 0:
-     runs.append(run)
-
-   return smooth_path, runs, bad_clips, clips
-
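The absolute-value trick the comments describe is standard: each |quantity| is split into a non-negative positive part and a non-negative negative part whose sum is minimized. A self-contained miniature (hypothetical data, much simpler than the alignment LP above) that fits a line under L1 error with scipy.optimize.linprog:

    import numpy as np
    import scipy.optimize

    x = np.arange(6.)
    y = 2 * x + 1 + np.array([0, .1, -.1, 5., 0, .1])    # one outlier at x = 3
    n = len(x)
    # variables: [slope, intercept, err_pos (n of them), err_neg (n of them)]
    c = np.hstack([0, 0, np.ones(n), np.ones(n)])        # minimize sum of |residuals|
    A_eq = np.hstack([x[:, None], np.ones((n, 1)), np.eye(n), -np.eye(n)])
    b_eq = y                                             # slope*x + b + err_pos - err_neg = y
    bounds = [(None, None)] * 2 + [(0, None)] * (2 * n)  # error parts stay non-negative
    fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq, bounds=bounds)
    print(fit.x[:2])                                     # ~[2, 1]: the L1 fit shrugs off the outlier
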
- # if the start or end were marked as synced during smooth alignment then
- # extend that alignment to the edge (i.e. to the start/end of the audio)
- def cap_synced_end_points(smooth_path, video_arr, audio_desc_arr):
-   if smooth_path[0][0] < -10e9:
-     slope = smooth_path[0][1] / smooth_path[0][0]
-     new_start_point = (0, smooth_path[1][1] - smooth_path[1][0] * slope)
-     if new_start_point[1] < 0:
-       new_start_point = (smooth_path[1][0] - smooth_path[1][1] / slope, 0)
-     smooth_path[0] = new_start_point
-   if smooth_path[-1][0] > 10e9:
-     video_runtime = (video_arr.shape[1] - 2.) / AUDIO_SAMPLE_RATE
-     audio_runtime = (audio_desc_arr.shape[1] - 2.) / AUDIO_SAMPLE_RATE
-     slope = smooth_path[-1][1] / smooth_path[-1][0]
-     new_end_point = (audio_runtime, smooth_path[-2][1] + (audio_runtime - smooth_path[-2][0]) * slope)
-     if new_end_point[1] > video_runtime:
-       new_end_point = (smooth_path[-2][0] + (video_runtime - smooth_path[-2][1]) / slope, video_runtime)
-     smooth_path[-1] = new_end_point
-
- # visualize both the rough and smooth alignments
- def plot_alignment(plot_filename_no_ext, path, smooth_path, quals, runs, bad_clips, ad_timings):
-   scatter_color = [.2,.4,.8]
-   lcs_rgba = np.zeros((len(quals),4))
-   lcs_rgba[:,:3] = np.array(scatter_color)[None,:]
-   lcs_rgba[:,3] = np.minimum(1, np.array(quals) * 500. / len(quals))
-   audio_times, video_times = np.array(path).T.reshape((2,-1))
-   audio_offsets = audio_times - video_times
-   def expand_limits(start, end, ratio=.01):
-     average = (end + start) / 2.
-     half_diff = (end - start) / 2.
-     half_diff *= (1 + ratio)
-     return (average - half_diff, average + half_diff)
-   plt.xlim(expand_limits(*(0, np.max(video_times) / 60.)))
-   plt.ylim(expand_limits(*(np.min(audio_offsets) - TIMESTEP_SIZE_SECONDS / 2.,
-                            np.max(audio_offsets) + TIMESTEP_SIZE_SECONDS / 2.)))
-   plt.scatter(video_times / 60., audio_offsets, s=3, c=lcs_rgba, label='LCS Matches')
-   audio_times, video_times = np.array(smooth_path).T.reshape((2,-1))
-   audio_offsets = audio_times - video_times
-   if ad_timings is None:
-     plt.plot(video_times / 60., audio_offsets, 'r-', lw=.5, label='Replaced Audio')
-     bad_path = []
-     for clip in bad_clips:
-       bad_path.extend(smooth_path[clip[0]:clip[1]+1])
-       bad_path.append((smooth_path[clip[1]][0] + 1e-10, np.nan))
-     audio_times, video_times = np.array(bad_path).T.reshape((2,-1))
-     audio_offsets = audio_times - video_times
-     if len(audio_offsets) > 0:
-       plt.plot(video_times / 60., audio_offsets, 'c-', lw=1, label='Original Audio')
-   else:
-     interp = scipy.interpolate.interp1d(video_times, audio_offsets,
-                                         fill_value = np.inf,
-                                         bounds_error = False, assume_sorted = True)
-     plt.plot(video_times / 60., audio_offsets, 'c-', lw=.5, label='Original Audio')
-     video_times = ad_timings
-     audio_offsets = interp(ad_timings)
-     if len(audio_offsets) > 0:
-       plt.plot(video_times / 60., audio_offsets, 'r-', lw=1, label='Replaced Audio')
-   plt.xlabel('Video Time (minutes)')
-   plt.ylabel('Audio Description Offset (seconds)')
-   plt.title('Alignment')
-   plt.legend().legendHandles[0].set_color(scatter_color)
-   plt.tight_layout()
-   plt.savefig(plot_filename_no_ext + '.png', dpi=400)
-   plt.clf()
-
-   with open(plot_filename_no_ext + '.txt', 'w') as file:
-     rough_clips, median_slope, _ = chunk_path(smooth_path, tol=2e-2)
-     video_offset = np.diff(smooth_path[rough_clips[0][0]])[0]
-     print("Main changes needed to video to align it to audio input:", file=file)
-     print(f"Start Offset: {-video_offset:.2f} seconds", file=file)
-     print(f"Median Rate Change: {(median_slope-1.)*100:.2f}%", file=file)
-     for clip_start, clip_end in rough_clips:
-       audio_desc_start, video_start = smooth_path[clip_start]
-       audio_desc_end, video_end = smooth_path[clip_end]
-       slope = (video_end - video_start) / (audio_desc_end - audio_desc_start)
-       def str_from_time(seconds):
-         minutes, seconds = divmod(seconds, 60)
-         hours, minutes = divmod(minutes, 60)
-         return f"{hours:2.0f}:{minutes:02.0f}:{seconds:05.2f}"
-       print(f"Rate change of {(slope-1.)*100:6.1f}% from {str_from_time(video_start)} to " + \
-             f"{str_from_time(video_end)} aligning with audio from " + \
-             f"{str_from_time(audio_desc_start)} to {str_from_time(audio_desc_end)}", file=file)
-
- # use the smooth alignment to replace runs of video sound with corresponding described audio
- def replace_aligned_segments(video_arr, audio_desc_arr, smooth_path, runs, no_pitch_correction=False):
-   # perform quadratic interpolation of the audio description's waveform
-   # this allows it to be stretched to match the corresponding video segment
-   def audio_desc_arr_interp(samples):
-     chunk_size = 10**7
-     interpolated_chunks = []
-     for chunk in (samples[i:i+chunk_size] for i in range(0, len(samples), chunk_size)):
-       interp_bounds = (max(int(chunk[0]-2), 0),
-                        min(int(chunk[-1]+2), audio_desc_arr.shape[1]))
-       interp = scipy.interpolate.interp1d(np.arange(*interp_bounds),
-                                           audio_desc_arr[:,slice(*interp_bounds)],
-                                           copy=False, bounds_error=False, fill_value=0,
-                                           kind='quadratic', assume_sorted=True)
-       interpolated_chunks.append(interp(chunk).astype(np.float32))
-     return np.hstack(interpolated_chunks)
-
-   # construct a stretched audio description waveform using the quadratic interpolator
-   def get_interped_segment(run, interp):
-     segment = []
-     for clip in run:
-       num_samples = int(y[clip[1]] * AUDIO_SAMPLE_RATE) - \
-                     int(y[clip[0]] * AUDIO_SAMPLE_RATE)
-       clip_bounds = np.array((x[clip[0]], x[clip[1]])) * AUDIO_SAMPLE_RATE
-       sample_points = np.linspace(*clip_bounds, num=num_samples, endpoint=False)
-       segment.append(interp(sample_points))
-     segment = np.hstack(segment)
-     return segment
-
-   x,y = zip(*smooth_path)
-   for run in runs:
-     run_length_seconds = y[run[-1][1]] - y[run[0][0]]
-     if run_length_seconds < MIN_DURATION_TO_REPLACE_SECONDS:
-       continue
-     anchor_point_path_indices = [clip[0] for clip in run]
-     anchor_point_path_indices.append(run[-1][1])
-     anchor_points = (np.array((np.array(x)[anchor_point_path_indices],
-                                np.array(y)[anchor_point_path_indices])) * AUDIO_SAMPLE_RATE).astype(int)
-     slopes = np.diff(anchor_points[1]) / np.diff(anchor_points[0])
-     for clip_index, (clip, slope) in enumerate(zip(run, slopes)):
-       # only apply pitch correction if the difference would be noticeable
-       if no_pitch_correction or np.abs(1 - slope) <= JUST_NOTICEABLE_DIFF_IN_FREQ_RATIO:
-         stretched_audio = get_interped_segment([clip], audio_desc_arr_interp)
-       else:
-         anchor_point_pair = anchor_points[:,clip_index:clip_index+2].copy()
-         # account for quirks of pytsmod's wsola anchor point implementation
-         anchor_point_pair[1][-1] -= 1
-         anchor_y_offset = anchor_point_pair[1][0]
-         anchor_point_pair[1,:] -= anchor_y_offset
-         stretched_audio = pytsmod.wsola(audio_desc_arr, anchor_point_pair)
-       video_arr[:,slice(*anchor_points[1,clip_index:clip_index+2])] = stretched_audio
-
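pytsmod.wsola, called above with per-clip anchor-point pairs, time-stretches audio while preserving pitch. A hedged usage sketch with a scalar stretch factor instead of anchor points (assumes pytsmod is installed; the output length is approximate):

    import numpy as np
    import pytsmod

    sr = 44100
    t = np.arange(sr) / sr
    tone = np.sin(2 * np.pi * 440 * t)    # one second of A4
    slowed = pytsmod.wsola(tone, 1.1)     # stretch ~10% longer without shifting pitch
    print(len(slowed) / len(tone))        # ~1.1
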
- # identify which segments of the replaced audio actually have the describer speaking
- # uses a Naive Bayes classifier smoothed with L1-Minimization to identify the describer
- def detect_describer(video_arr, video_spec, video_spec_raw, video_timings,
-                      smooth_path, detect_sensitivity, boost_sensitivity):
-   # retokenize the audio description, which has been stretched to match the video
-   audio_desc_spec_raw, audio_timings = tokenize_audio(video_arr)
-   audio_desc_spec = normalize_spec(audio_desc_spec_raw)
-
-   # avoid boosting or training on mismatched segments, like those close to skips
-   # assumes matching segments all have the same, constant play rate
-   # could be modified to handle a multi-modal distribution of rates
-   aligned_audio_times, aligned_video_times = zip(*smooth_path)
-   interp = scipy.interpolate.interp1d(aligned_video_times, aligned_audio_times,
-                                       fill_value = 'extrapolate',
-                                       bounds_error = False, assume_sorted = True)
-   slopes = (interp(video_timings + 1e-5) - \
-             interp(video_timings - 1e-5)) / 2e-5
-   median_slope = np.median(slopes)
-   aligned_mask = np.abs(slopes - median_slope) < MAX_RATE_RATIO_DIFF_ALIGN
-   well_aligned_mask = np.abs(slopes - median_slope) < MAX_RATE_RATIO_DIFF_BOOST
-
-   # first pass identification by assuming poorly matched tokens are describer speech
-   # also assumes the describer doesn't speak very quietly
-   corrs = np.sum(audio_desc_spec * video_spec, axis=-1)
-   smooth_volume = nd.gaussian_filter(audio_desc_spec[:,0], sigma=1)
-   audio_desc_loud = smooth_volume > np.percentile(smooth_volume, 30)
-   speech_mask = (corrs < .2) * audio_desc_loud
-
-   # normalize spectrogram coefficients along time axis to prep for conversion to PDFs
-   audio_desc_spec = normalize_spec(audio_desc_spec_raw, axes=(0,))
-   audio_desc_spec = np.clip(audio_desc_spec / 6., -1, 1)
-   video_spec = normalize_spec(video_spec_raw, axes=(0,))
-   video_spec = np.clip(video_spec / 6., -1, 1)
-
-   # convert sampled features (e.g. spectrogram) to probability densities of each feature
-   # when given a spectrogram, finds the distributions of the MFC coefficients
-   def make_log_pdfs(arr):
-     resolution = 100
-     bins_per_spot = 4
-     num_bins = int(resolution * bins_per_spot)
-     uniform_prior_strength_per_spot = 1
-     uniform_prior_strength_per_bin = uniform_prior_strength_per_spot / float(bins_per_spot)
-     bin_range = (-1 - 1e-10, 1 + 1e-10)
-     get_hist = lambda x: np.histogram(x, bins=num_bins, range=bin_range)[0]
-     pdfs = np.apply_along_axis(get_hist, 1, arr.T)
-     pdfs = pdfs + uniform_prior_strength_per_bin
-     smooth = lambda x: nd.gaussian_filter(x, sigma=bins_per_spot)
-     pdfs = np.apply_along_axis(smooth, 1, pdfs)
-     pdfs = pdfs / np.sum(pdfs[0,:])
-     log_pdfs = np.log(pdfs)
-     bin_edges = np.histogram([], bins=num_bins, range=bin_range)[1]
-     return log_pdfs, bin_edges
-
-   diff_spec = audio_desc_spec - video_spec
-   diff_spec = np.clip(diff_spec, -1, 1)
-
-   # Naive Bayes classifier to roughly estimate whether each token is describer speech
-   desc_log_pdfs, _ = make_log_pdfs(diff_spec[speech_mask * well_aligned_mask])
-   nondesc_log_pdfs, bin_edges = make_log_pdfs(diff_spec[(~speech_mask) * well_aligned_mask])
-   lratio_lookup = desc_log_pdfs - nondesc_log_pdfs
-   lratios = lratio_lookup[np.fromfunction(lambda i,j: j, diff_spec.shape, dtype=int),
-                           np.digitize(diff_spec, bin_edges, right=True)-1]
-   ratio_desc_to_nondesc = np.sum(speech_mask * well_aligned_mask) /\
-                           (np.sum((~speech_mask) * well_aligned_mask) + 1.)
-   relative_probs = np.sum(lratios, axis=1)
-   relative_probs /= np.std(relative_probs)
-   relative_probs -= np.mean(relative_probs)
-
-   # L1-Minimization to smoothly identify audio descriptions using a linear program
-   # x is fit_err_pos, fit_err_neg, delta_fit_pos, delta_fit_neg
-   # fit_err[i] = relative_probs[i] - y_fit[i]
-   # delta_fit[i] = y_fit[i] - y_fit[i-1]
-   # this can be rewritten in terms of fit_err by re-arranging the 1st equation:
-   # y_fit[i] = relative_probs[i] - fit_err[i]
-   # this gives:
-   # delta_fit[i] = (relative_probs[i] - relative_probs[i-1]) -\
-   #                (fit_err[i] - fit_err[i-1])
-   # the delta_fit variables can then be set using equality constraints
-   num_fit_points = len(relative_probs)
-   y_diffs = np.diff(relative_probs)
-   pos_err_cost_factor = MIN_DESC_DURATION / float(TIMESTEP_SIZE_SECONDS)
-   neg_err_cost_factor = MAX_GAP_IN_DESC_SEC / float(TIMESTEP_SIZE_SECONDS)
-   c = np.hstack([np.ones(num_fit_points) / pos_err_cost_factor,
-                  np.ones(num_fit_points) / neg_err_cost_factor,
-                  np.ones(num_fit_points - 1) / 2.,
-                  np.ones(num_fit_points - 1) / 2.])
-   fit_err_coeffs = scipy.sparse.diags([-np.ones(num_fit_points),
-                                        np.ones(num_fit_points)],
-                                       offsets=[0,1],
-                                       shape=(num_fit_points - 1, num_fit_points)).tocsc()
-   A_eq = scipy.sparse.hstack([ fit_err_coeffs,
-                               -fit_err_coeffs,
-                                scipy.sparse.eye(num_fit_points-1),
-                               -scipy.sparse.eye(num_fit_points-1)])
-   b_eq = y_diffs
-   fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq)
-   if not fit.success:
-     print(fit)
-     raise RuntimeError("Describer Voice Detection L1-Min Optimization Failed!")
-
-   # combine fit_err_pos and fit_err_neg
-   fit_err = fit.x[:num_fit_points] - fit.x[num_fit_points:2*num_fit_points]
-
-   # subtract fit errors from nodes to retrieve the smoothed fit
-   smooth_desc_locations = relative_probs - fit_err
-
-   # hard threshold to classify each token as describer speech or not
-   speech_mask = smooth_desc_locations > 1. - 1.5 * detect_sensitivity
-   speech_mask *= aligned_mask
-
-   # a separate mask is created for describer volume boosting
-   # as losing the describer's voice entirely is usually worse than it just being quiet
-   # and imperfectly aligned segments may have descriptions, but shouldn't be boosted
-   boost_mask = smooth_desc_locations > 1. - 1.5 * boost_sensitivity
-   boost_mask *= well_aligned_mask
-
-   # convert a token classification into a mask that can be applied directly to samples
-   # unlike the input, the output isn't a boolean array but an array of floats
-   def token_mask_to_sample_mask(token_mask):
-     description_timings = video_timings[1:-1][token_mask[1:-1]]
-     sample_mask = np.zeros(video_arr.shape[1], dtype=np.float32)
-     window_radius = int(AUDIO_SAMPLE_RATE * TIMESTEP_SIZE_SECONDS)
-     window_size_seconds = 2 * window_radius + 1
-     bump = scipy.signal.windows.hann(window_size_seconds)
-     for description_timing in description_timings:
-       window_center = int(description_timing * AUDIO_SAMPLE_RATE)
-       sample_mask[window_center-window_radius:window_center+window_radius+1] += bump
-     return sample_mask
-
-   speech_sample_mask = token_mask_to_sample_mask(speech_mask)
-   boost_sample_mask = token_mask_to_sample_mask(boost_mask)
-   ad_timings = video_timings.copy()
-   ad_timings[~speech_mask] = np.inf
-
-   return speech_sample_mask, boost_sample_mask, ad_timings
-
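The Naive Bayes step above amounts to histogramming each feature under the two classes and scoring a token by the sum of its per-feature log-likelihood ratios. A simplified, self-contained illustration on synthetic data (hypothetical shapes; the real code additionally smooths the histograms and works on the clipped diff_spec features):

    import numpy as np

    rng = np.random.default_rng(0)
    desc = rng.normal(.5, 1, (500, 4))       # tokens labeled "describer speaking"
    nondesc = rng.normal(0., 1, (2000, 4))   # background tokens
    bins = np.linspace(-4, 4, 41)

    def log_pdfs(data):                      # per-feature histograms, Laplace smoothed
        hists = np.array([np.histogram(col, bins=bins)[0] + 1 for col in data.T])
        return np.log(hists / hists.sum(axis=1, keepdims=True))

    lratio = log_pdfs(desc) - log_pdfs(nondesc)     # per-feature, per-bin lookup table
    token = np.array([.6, .4, .7, .5])
    idx = np.clip(np.digitize(token, bins) - 1, 0, 39)
    print(lratio[np.arange(4), idx].sum())          # positive total favors "describer"
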
- # Convert piece-wise linear fit to ffmpeg expression for editing video frame timestamps
- def encode_fit_as_ffmpeg_expr(smooth_path, clips, video_offset, start_key_frame):
-   # PTS is the input frame's presentation timestamp, which is when frames are displayed
-   # TB is the timebase, which is how many seconds each unit of PTS corresponds to
-   # the output value of the expression will be the frame's new PTS
-   setts_cmd = ['TS']
-   start_skip = max(0, video_offset - start_key_frame)
-   if start_skip > 0:
-     # lossless cutting can only happen at key frames, so we cut the video before the audio starts
-     # but that means the video is behind the audio and needs to catch up by playing quicker
-     # catchup_spread is the ratio of time to spend catching up to the amount of catching up needed
-     catchup_spread = 1./CATCHUP_RATE
-     setts_cmd.append(f'+clip(TS-STARTPTS,0,{start_skip*(1+catchup_spread)}/TB)*{-1./(1+catchup_spread)}')
-   elif video_offset < 0:
-     # if the audio starts before the video, stretch the first frame of the video back to meet it
-     setts_cmd.append(f'+clip(TS-STARTPTS,0,{-video_offset/10000.}/TB)*10000')
-   # each segment of the linear fit can be encoded as a single clip function
-   setts_cmd.append('+(0')
-   for clip_start, clip_end in clips:
-     audio_desc_start, video_start = smooth_path[clip_start]
-     audio_desc_end, video_end = smooth_path[clip_end]
-     video_start -= start_key_frame
-     video_end -= start_key_frame
-     audio_desc_length = audio_desc_end - audio_desc_start
-     video_length = video_end - video_start
-     slope = audio_desc_length / video_length
-     setts_cmd.append(f'+clip(TS-STARTPTS-{video_start:.4f}/TB,0,{max(0,video_length):.4f}/TB)*{slope-1:.9f}')
-   setts_cmd.append(')')
-   setts_cmd = ''.join(setts_cmd)
-   return setts_cmd
-
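A toy invocation of the builder above (hypothetical numbers): a fit whose video starts 1.5 s after the audio and whose single segment plays about 0.5% slower. The result is an expression for ffmpeg's setts bitstream filter:

    smooth_path = [(0.0, 1.5), (600.0, 604.5)]   # (audio description time, video time) pairs
    clips = [(0, 1)]                             # one constant-slope segment
    expr = encode_fit_as_ffmpeg_expr(smooth_path, clips, video_offset=1.5, start_key_frame=0.0)
    print(expr)   # TS+clip(TS-STARTPTS,0,...)*...: a new-PTS formula built from clip() terms
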
- def get_ffmpeg():
-   return static_ffmpeg.run._get_or_fetch_platform_executables_else_raise_no_lock()[0]
-
- def get_ffprobe():
-   return static_ffmpeg.run._get_or_fetch_platform_executables_else_raise_no_lock()[1]
-
- def get_closest_key_frame_time(video_file, time):
-   if time <= 0:
-     return 0
-   key_frames = ffmpeg.probe(video_file, cmd=get_ffprobe(), select_streams='v',
-                             show_frames=None, skip_frame='nokey')['frames']
-   key_frame_times = np.array([float(frame['pts_time']) for frame in key_frames] + [0])
-   return np.max(key_frame_times[key_frame_times <= time])
-
- # outputs a new media file with the replaced audio (which includes audio descriptions)
- def write_replaced_media_to_disk(output_filename, media_arr, video_file=None, audio_desc_file=None,
-                                  setts_cmd=None, start_key_frame=None):
-   if audio_desc_file is None:
-     media_input = ffmpeg.input('pipe:', format='s16le', acodec='pcm_s16le',
-                                ac=2, ar=AUDIO_SAMPLE_RATE)
-     if video_file is None or os.path.splitext(output_filename)[1][1:] in AUDIO_EXTENSIONS:
-       write_command = ffmpeg.output(media_input, output_filename, loglevel='fatal').overwrite_output()
-     else:
-       original_video = ffmpeg.input(video_file)
-       # "-max_interleave_delta 0" is sometimes necessary to fix an .mkv bug that freezes audio/video:
-       # ffmpeg bug warning: [matroska @ 0000000002c814c0] Starting new cluster due to timestamp
-       # more info about the bug and fix: https://reddit.com/r/ffmpeg/comments/efddfs/
-       write_command = ffmpeg.output(media_input, original_video, output_filename,
-                                     acodec='copy', vcodec='copy', scodec='copy',
-                                     max_interleave_delta='0', loglevel='fatal',
-                                     **{"c:a:0": "aac", "disposition:a:0": "default"}).overwrite_output()
-     ffmpeg_caller = write_command.run_async(pipe_stdin=True, cmd=get_ffmpeg())
-     ffmpeg_caller.stdin.write(media_arr.astype(np.int16).T.tobytes())
-     ffmpeg_caller.stdin.close()
-     ffmpeg_caller.wait()
-   else:
-     media_input = ffmpeg.input(audio_desc_file)
-     audio_desc_streams = ffmpeg.probe(audio_desc_file, cmd=get_ffprobe(), select_streams='a',
-                                       show_entries='format=duration')['streams']
-     audio_desc_duration = max([float(stream['duration']) for stream in audio_desc_streams])
-     original_video = ffmpeg.input(video_file, an=None, ss=start_key_frame)
-     if os.path.splitext(output_filename)[1] == os.path.splitext(video_file)[1]:
-       # wav files don't have codecs compatible with most video containers, so we convert to aac
-       audio_codec = 'copy' if os.path.splitext(audio_desc_file)[1] != '.wav' else 'aac'
-       write_command = ffmpeg.output(media_input, original_video, output_filename,
-                                     acodec=audio_codec, vcodec='copy', scodec='copy',
-                                     max_interleave_delta='0', loglevel='fatal',
-                                     **{'bsf:v': 'setts=ts=\'' + setts_cmd + '\'',
-                                        'bsf:s': 'setts=ts=\'' + setts_cmd + '\''}).overwrite_output()
-       write_command.run(cmd=get_ffmpeg())
-     else:
-       # work around for bug that sometimes breaks setts when output and input formats differ
-       # the trick is separating the input and output by piping from one ffmpeg process into another
-       # mkv files break if 'nut' is used, while other files break when 'matroska' is used
-       format = 'matroska' if os.path.splitext(output_filename)[1] == '.mkv' else 'nut'
-       write_command = ffmpeg.output(original_video, 'pipe:', format=format, vsync='passthrough',
-                                     c='copy', loglevel='fatal')
-       ffmpeg_caller = write_command.run_async(pipe_stdout=True, cmd=get_ffmpeg())
-       pipe_input = ffmpeg.input('pipe:', format=format, thread_queue_size='512')
-       write_command2 = ffmpeg.output(media_input, pipe_input, output_filename, c='copy',
-                                      max_interleave_delta='0', loglevel='fatal', vsync='passthrough',
-                                      **{'bsf:v': 'setts=ts=\'' + setts_cmd + '\'',
-                                         'bsf:s': 'setts=ts=\'' + setts_cmd + '\''}).overwrite_output()
-       ffmpeg_caller2 = write_command2.run_async(pipe_stdin=True, cmd=get_ffmpeg())
-       while True:
-         in_bytes = ffmpeg_caller.stdout.read(100000)
-         if not in_bytes:
-           break
-         ffmpeg_caller2.stdin.write(in_bytes)
-       ffmpeg_caller2.stdin.close()
-       ffmpeg_caller.wait()
-       ffmpeg_caller2.wait()
-
-
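The raw-PCM branch above streams interleaved 16-bit samples straight into ffmpeg's stdin. A minimal sketch of that piping pattern with ffmpeg-python (hypothetical filename; assumes an ffmpeg binary on PATH rather than the static_ffmpeg one returned by get_ffmpeg):

    import numpy as np
    import ffmpeg

    sr = 44100
    stereo = np.random.default_rng(0).standard_normal((2, sr)) * 3000   # 1 s of quiet noise
    writer = (ffmpeg
              .input('pipe:', format='s16le', acodec='pcm_s16le', ac=2, ar=sr)
              .output('noise.flac', loglevel='fatal')
              .overwrite_output()
              .run_async(pipe_stdin=True))
    writer.stdin.write(stereo.astype(np.int16).T.tobytes())   # transposing interleaves L/R
    writer.stdin.close()
    writer.wait()
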
812
- # check whether static_ffmpeg has already installed ffmpeg and ffprobe
813
- def is_ffmpeg_installed():
814
- ffmpeg_dir = static_ffmpeg.run.get_platform_dir()
815
- indicator_file = os.path.join(ffmpeg_dir, "installed.crumb")
816
- return os.path.exists(indicator_file)
817
-
818
- # combines videos with matching audio files (e.g. audio descriptions)
819
- # this is the main function of this script, it calls the other functions in order
820
- def combine(video, audio, smoothness=50, stretch_audio=False, keep_non_ad=False,
821
- boost=0, ad_detect_sensitivity=.6, boost_sensitivity=.4, yes=False,
822
- prepend="ad_", no_pitch_correction=False, output_dir=default_output_dir,
823
- alignment_dir=default_alignment_dir, extension="copy", display_func=None):
824
- video_files, video_file_types = get_sorted_filenames(video, VIDEO_EXTENSIONS, AUDIO_EXTENSIONS)
825
-
826
- if yes == False and sum(video_file_types) > 0:
827
- print("")
828
- print("One or more audio files found in video input. Was this intentional?")
829
- print("If not, press ctrl+c to kill this script.")
830
- input("If this was intended, press Enter to continue...")
831
- print("")
832
- audio_desc_files, _ = get_sorted_filenames(audio, AUDIO_EXTENSIONS)
833
- if len(video_files) != len(audio_desc_files):
834
- error_msg = ["Number of valid files in input paths are not the same.",
835
- f"The video path has {len(video_files)} files",
836
- f"The audio path has {len(audio_desc_files)} files"]
837
- raise RuntimeError("\n".join(error_msg))
838
-
839
- ensure_folders_exist([output_dir], display_func)
840
- if PLOT_ALIGNMENT_TO_FILE:
841
- ensure_folders_exist([alignment_dir], display_func)
842
-
1
+ # combines videos with matching audio files (e.g. audio descriptions)
2
+ # input: video or folder of videos and an audio file or folder of audio files
3
+ # output: videos in a folder "videos_with_ad", with aligned segments of the audio replaced
4
+ # this script aligns the new audio to the video using the video's old audio
5
+ # first, the video's sound and the audio file are both converted to spectrograms
6
+ # second, the two spectrograms are roughly aligned by finding their longest common subsequence
7
+ # third, the rough alignment is denoised through L1-Minimization
8
+ # fourth, the spectrogram alignments determine where the new audio replaces the old
9
+
10
+ '''
11
+ Copyright (C) 2023 Julian Brown
12
+
13
+ This program is free software: you can redistribute it and/or modify
14
+ it under the terms of the GNU General Public License as published by
15
+ the Free Software Foundation, either version 3 of the License, or
16
+ (at your option) any later version.
17
+
18
+ This program is distributed in the hope that it will be useful,
19
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21
+ GNU General Public License for more details.
22
+
23
+ You should have received a copy of the GNU General Public License
24
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
25
+ '''
26
+
27
+ VIDEO_EXTENSIONS = set(['mp4', 'mkv', 'avi', 'mov', 'webm', 'm4v', 'flv', 'vob'])
28
+ AUDIO_EXTENSIONS = set(['mp3', 'm4a', 'opus', 'wav', 'aac', 'flac', 'ac3', 'mka'])
29
+ PLOT_ALIGNMENT_TO_FILE = True
30
+
31
+ TIMESTEP_SIZE_SECONDS = .16
32
+ TIMESTEP_OVERLAP_RATIO = .5
33
+ AUDIO_SAMPLE_RATE = 44100
34
+ MEL_COEFFS_PER_TIMESTEP = 25
35
+ DITHER_PERIOD_STEPS = 60
36
+ MIN_CORR_FOR_TOKEN_MATCH = .6
37
+ GAP_START_COST = 1.0
38
+ GAP_EXTEND_COST = -.01
39
+ GAP_EXTEND_DIAG_BONUS = -.01
40
+ SKIP_MATCH_COST = .1
41
+ MAX_RATE_RATIO_DIFF_ALIGN = .1
42
+ PREF_CUT_AT_GAPS_FACTOR = 5
43
+ MIN_DURATION_TO_REPLACE_SECONDS = 2
44
+ MIN_START_END_SYNC_TIME_SECONDS = 2
45
+ MAX_START_END_SYNC_ERR_SECONDS = .2
46
+ MAX_RATE_RATIO_DIFF_BOOST = .003
47
+ MIN_DESC_DURATION = .5
48
+ MAX_GAP_IN_DESC_SEC = 1.5
49
+ JUST_NOTICEABLE_DIFF_IN_FREQ_RATIO = .005
50
+ CATCHUP_RATE = 5
51
+
52
+ if PLOT_ALIGNMENT_TO_FILE:
53
+ import matplotlib.pyplot as plt
54
+ import argparse
55
+ from contextlib import redirect_stderr, redirect_stdout
56
+ import io
57
+ import os
58
+ import glob
59
+ import itertools
60
+ from pathlib import Path
61
+ import sys
62
+ from typing import Optional
63
+ import numpy as np
64
+ import ffmpeg
65
+ import platformdirs
66
+ import static_ffmpeg
67
+ import python_speech_features as psf
68
+ import scipy.signal
69
+ import scipy.optimize
70
+ import scipy.interpolate
71
+ import scipy.ndimage as nd
72
+ import scipy.sparse
73
+ import pytsmod
74
+ import configparser
75
+ import traceback
76
+ import multiprocessing
77
+ import platform
78
+
79
+ IS_RUNNING_WINDOWS = platform.system() == 'Windows'
80
+ if IS_RUNNING_WINDOWS:
81
+ import PySimpleGUIWx as sg
82
+ default_output_dir = 'videos_with_ad'
83
+ default_alignment_dir = 'alignment_plots'
84
+ else:
85
+ import PySimpleGUIQt as sg
86
+ default_output_dir = os.path.expanduser('~') + '/videos_with_ad'
87
+ default_alignment_dir = os.path.expanduser('~') + '/alignment_plots'
88
+
89
+ def display(text, func=None):
90
+ if func:
91
+ func(text)
92
+ print(text)
93
+
94
+ def throw_runtime_error(text, func=None):
95
+ if func:
96
+ func(text)
97
+ raise RuntimeError(text)
98
+
99
+ def ensure_folders_exist(dirs, display_func=None):
100
+ for dir in dirs:
101
+ if not os.path.isdir(dir):
102
+ display(f"Directory not found, creating it: {dir}", display_func)
103
+ os.makedirs(dir)
104
+
105
+ def get_sorted_filenames(path, extensions, alt_extensions=set([])):
106
+ # path could be three different things: a file, a directory, a list of files
107
+ if type(path) is list:
108
+ files = [os.path.abspath(file) for file in path]
109
+ for file in files:
110
+ if not os.path.isfile(file):
111
+ raise RuntimeError(f"No file found at input path:\n {file}")
112
+ else:
113
+ path = os.path.abspath(path)
114
+ if os.path.isdir(path):
115
+ files = glob.glob(glob.escape(path) + "/*")
116
+ if len(files) == 0:
117
+ raise RuntimeError(f"Empty input directory:\n {path}")
118
+ else:
119
+ if not os.path.isfile(path):
120
+ raise RuntimeError(f"No file or directory found at input path:\n {path}")
121
+ files = [path]
122
+ files = [file for file in files if os.path.splitext(file)[1][1:] in extensions | alt_extensions]
123
+ if len(files) == 0:
124
+ error_msg = [f"No files with valid extensions found at input path:\n {path}",
125
+ "Did you accidentally put the audio filepath before the video filepath?",
126
+ "The video path should be the first positional input, audio second.",
127
+ "Or maybe you need to add a new extension to this script's regex?",
128
+ f"valid extensions for this input are:\n {extensions}"]
129
+ raise RuntimeError("\n".join(error_msg))
130
+ files = sorted(files)
131
+ file_types = [0 if os.path.splitext(file)[1][1:] in extensions else 1 for file in files]
132
+ return files, file_types
133
+
134
+ # read audio from file with ffmpeg and convert to numpy array
135
+ def parse_audio_from_file(media_file):
136
+ media_stream, _ = (ffmpeg
137
+ .input(media_file)
138
+ .output('-', format='s16le', acodec='pcm_s16le', ac=2, ar=AUDIO_SAMPLE_RATE, loglevel='fatal')
139
+ .run(capture_stdout=True, cmd=get_ffmpeg())
140
+ )
141
+ media_arr = np.frombuffer(media_stream, np.int16).astype(np.float32).reshape((-1,2)).T
142
+ return media_arr
143
+
144
+ # tokenize audio by transforming with a mel-frequency cepstrum (MFC)
145
+ def tokenize_audio(media_arr, rate=1):
146
+ step_size_samples = psf.sigproc.round_half_up(TIMESTEP_SIZE_SECONDS * rate * AUDIO_SAMPLE_RATE)
147
+ window_size_seconds = TIMESTEP_SIZE_SECONDS / TIMESTEP_OVERLAP_RATIO
148
+ window_size_samples = psf.sigproc.round_half_up(window_size_seconds * AUDIO_SAMPLE_RATE)
149
+ fft_size_samples = 2**int(np.ceil(np.log2(window_size_samples)))
150
+ get_mfcc = lambda arr: psf.mfcc(np.mean(arr, axis=0),
151
+ samplerate=AUDIO_SAMPLE_RATE,
152
+ winlen=window_size_seconds,
153
+ winstep=TIMESTEP_SIZE_SECONDS * rate,
154
+ numcep=MEL_COEFFS_PER_TIMESTEP,
155
+ nfilt=MEL_COEFFS_PER_TIMESTEP * 2,
156
+ nfft=fft_size_samples,
157
+ winfunc=scipy.signal.windows.hann)
158
+ num_timesteps = max(1, ((media_arr.shape[1] - window_size_samples - 1) // step_size_samples) + 2)
159
+ media_spec = np.zeros((num_timesteps, MEL_COEFFS_PER_TIMESTEP))
160
+ chunk_size = 1000
161
+ for chunk_index in np.arange(0, num_timesteps, chunk_size):
162
+ chunk_bounds_samples = ((chunk_index ) * step_size_samples,
163
+ (chunk_index + chunk_size - 1) * step_size_samples + window_size_samples)
164
+ media_spec[chunk_index:chunk_index+chunk_size] = get_mfcc(media_arr[:,slice(*chunk_bounds_samples)])
165
+ '''
166
+ # alternate python library's MFC implementation
167
+ import librosa
168
+ media_spec = librosa.feature.mfcc(y=np.mean(media_arr, axis=0),
169
+ sr=AUDIO_SAMPLE_RATE,
170
+ n_mfcc=MEL_COEFFS_PER_TIMESTEP,
171
+ lifter=22,
172
+ n_fft=fft_size_samples,
173
+ hop_length=step_size_samples,
174
+ win_length=window_size_samples,
175
+ window=scipy.signal.windows.hann).T
176
+ num_timesteps = media_spec.shape[0]
177
+ '''
178
+ timings_samples = window_size_samples/2. + step_size_samples * np.arange(num_timesteps)
179
+ timings_seconds = timings_samples / AUDIO_SAMPLE_RATE
180
+ return media_spec, timings_seconds
181
+
182
+ # same as tokenize_audio, but dithering the MFC window timings
183
+ # this allows for finer alignment by ameliorating discretization error
184
+ def tokenize_audio_dither(media_arr, slow_timings):
185
+ # choose a relative step size slightly less than 1 to ameliorate quantization error
186
+ # maximize alignment accuracy by using least approximable number with desired period
187
+ # this is the continued fraction [0;1,N-2,1,1,1,...], where the trailing ones give phi
188
+ fast_rate = 1. / (1 + 1. / (DITHER_PERIOD_STEPS - 2 + (np.sqrt(5) + 1) / 2.))
189
+ fast_spec, fast_timings = tokenize_audio(media_arr, fast_rate)
190
+
191
+ # prevent drift in difficult to align segments (e.g. describer speaking or quiet/droning segments)
192
+ # by approximately equalizing the number of tokens per unit time between dithered and undithered
193
+ # the dithered audio will have ~(1 + 1 / DITHER_PERIOD_STEPS) times as many tokens, so
194
+ # this can be accomplished by simply deleting a token every DITHER_PERIOD_STEPS tokens
195
+ fast_spec = np.delete(fast_spec, slice(DITHER_PERIOD_STEPS // 2, None, DITHER_PERIOD_STEPS), axis=0)
196
+ fast_timings = np.delete(fast_timings, slice(DITHER_PERIOD_STEPS // 2, None, DITHER_PERIOD_STEPS))
197
+ return fast_spec, fast_timings
198
+
199
+ # normalize along both time and frequency axes to allow comparing tokens by correlation
200
+ def normalize_spec(media_spec_raw, axes=(0,1)):
201
+ media_spec = media_spec_raw.copy()
202
+ for axis in axes:
203
+ norm_func = np.std if axis == 0 else np.linalg.norm
204
+ media_spec = media_spec - np.mean(media_spec, axis=axis, keepdims=True)
205
+ media_spec = media_spec/(norm_func(media_spec,axis=axis,keepdims=True)+1e-10)
206
+ return media_spec
207
+
208
+ # vectorized implementation of the Wagner–Fischer (Longest Common Subsequence) algorithm
209
+ # modified to include affine gap penalties and skip+match options (i.e. knight's moves)
210
+ # gaps are necessary when parts are cut out of the audio description (e.g. cut credits)
211
+ # or when the audio description includes a commercial break or an extra scene
212
+ # the skip+match option allows for micro-adjustments without eating the full gap penalty
213
+ # skip+match is primarily useful in maintaining alignment when the rates differ slightly
214
+ def rough_align(video_spec, audio_desc_spec, video_timings, audio_desc_timings):
215
+ pred_map = {0:lambda node: (0, node[1]-1, node[2]-1),
216
+ 1:lambda node: (0, node[1]-2, node[2]-1),
217
+ 2:lambda node: (0, node[1]-1, node[2]-2),
218
+ 3:lambda node: (1, node[1]-1, node[2]-1),
219
+ 4:lambda node: (0, node[1] , node[2] ),
220
+ 5:lambda node: (1, node[1]-1, node[2] ),
221
+ 6:lambda node: (1, node[1]-1, node[2]-1),
222
+ 7:lambda node: (1, node[1] , node[2]-1)}
223
+ pred_matrix = np.zeros((2, audio_desc_spec.shape[0], video_spec.shape[0]), dtype=np.uint8)
224
+ pred_matrix[0,1:,:2] = 0
225
+ pred_matrix[1,1:,:2] = 4
226
+ pred_matrix[:,0,:2] = [0,5]
227
+ path_corrs_match = np.zeros((3, video_spec.shape[0]))
228
+ path_corrs_gap = np.zeros((3, video_spec.shape[0]))
229
+ corrs = np.zeros((3, video_spec.shape[0]))
230
+ corrs[:,:] = np.roll(np.dot(video_spec, audio_desc_spec[0]), 1)[None,:]
231
+ for i in range(audio_desc_spec.shape[0]):
232
+ i_mod = i % 3
233
+ match_pred_corrs = np.hstack([path_corrs_match[i_mod-1][1:-1][:,None],
234
+ path_corrs_match[i_mod-2][1:-1][:,None] - SKIP_MATCH_COST,
235
+ path_corrs_match[i_mod-1][0:-2][:,None] - SKIP_MATCH_COST,
236
+ path_corrs_gap[ i_mod-1][1:-1][:,None]])
237
+ pred_matrix[0][i][2:] = np.argmax(match_pred_corrs, axis=1)
238
+ path_corrs_match[i_mod][2:] = np.take_along_axis(match_pred_corrs, pred_matrix[0][i][2:,None], axis=1).T
239
+ corrs = np.roll(corrs, -1, axis=1)
240
+ corrs[(i_mod+1)%3,:] = np.roll(np.dot(video_spec, audio_desc_spec[min(audio_desc_spec.shape[0]-1,i+1)]), 1)
241
+ fisher_infos = (2 * corrs[i_mod] - corrs[i_mod-1] - corrs[(i_mod+1)%3]) / min(.2, TIMESTEP_SIZE_SECONDS)
242
+ fisher_infos[fisher_infos < 0] = 0
243
+ fisher_infos[fisher_infos > 10] = 10
244
+ row_corrs = np.maximum(0, corrs[i_mod][2:] - MIN_CORR_FOR_TOKEN_MATCH)
245
+ path_corrs_match[i_mod][2:] += row_corrs * (fisher_infos[2:] / 5)
246
+ gap_pred_corrs = np.hstack([path_corrs_match[i_mod][2: ][:,None] - GAP_START_COST,
247
+ path_corrs_gap[i_mod-1][2: ][:,None],
248
+ path_corrs_gap[i_mod-1][1:-1][:,None] - GAP_EXTEND_DIAG_BONUS - \
249
+ GAP_EXTEND_COST])
250
+ pred_matrix[1][i][2:] = np.argmax(gap_pred_corrs, axis=1)
251
+ path_corrs_gap_no_col_skip = np.take_along_axis(gap_pred_corrs, pred_matrix[1][i][2:,None], axis=1).flat
252
+ pred_matrix[1][i][2:] += 4
253
+ path_corrs_gap[i_mod][2:] = np.maximum.accumulate(path_corrs_gap_no_col_skip + \
254
+ GAP_EXTEND_COST * np.arange(video_spec.shape[0]-2)) - \
255
+ GAP_EXTEND_COST * np.arange(video_spec.shape[0]-2)
256
+ pred_matrix[1][i][2:][path_corrs_gap[i_mod][2:] > path_corrs_gap_no_col_skip] = 7
257
+ path_corrs_gap[i_mod][2:] -= GAP_EXTEND_COST
258
+
259
+ # reconstruct optimal path by following predecessors backwards through the table
260
+ end_node_layer = np.argmax([path_corrs_match[i_mod,-1],
261
+ path_corrs_gap[ i_mod,-1]])
262
+ cur_node = (end_node_layer, audio_desc_spec.shape[0]-1, video_spec.shape[0]-1)
263
+ get_predecessor = lambda node: pred_map[pred_matrix[node]](node)
264
+ path = []
265
+ visited = set()
266
+ while min(cur_node[1:]) >= 0:
267
+ cur_node, last_node = get_predecessor(cur_node), cur_node
268
+ # failsafe to prevent an infinite loop that should never happen anyway
269
+ if cur_node in visited:
270
+ break
271
+ visited.add(cur_node)
272
+ if last_node[0] == 0:
273
+ path.append(last_node[1:])
274
+ path = path[::-1]
275
+
276
+ # determine how much information this node gives about the alignment
277
+ # a larger double derivative means more precise timing information
278
+ # sudden noises give more timing information than droning sounds
279
+ def get_fisher_info(node):
280
+ i,j = node
281
+ if node[0] >= audio_desc_spec.shape[0]-1 or \
282
+ node[1] >= video_spec.shape[0]-1 or \
283
+ min(node) <= 0:
284
+ return 0
285
+ info = 2*np.dot(audio_desc_spec[i ],video_spec[j ]) - \
286
+ np.dot(audio_desc_spec[i-1],video_spec[j+1]) - \
287
+ np.dot(audio_desc_spec[i+1],video_spec[j-1])
288
+ info /= min(.2, TIMESTEP_SIZE_SECONDS)
289
+ return info
290
+
291
+ # the quality of a node combines the correlation of its tokens
292
+ # with how precisely the match is localized in time
293
+ def get_match_quality(node):
294
+ # correlations are between -1 and 1, as all tokens have unit norm
295
+ token_correlation = np.dot(audio_desc_spec[node[0]],video_spec[node[1]])
296
+ fisher_info = min(max(0, get_fisher_info(node)), 10)
297
+ return max(0, token_correlation - MIN_CORR_FOR_TOKEN_MATCH) * (fisher_info / 5)
298
+
299
+ # filter out low match quality nodes from LCS path
300
+ quals = [get_match_quality(node) for node in path]
301
+ if len(quals) == 0 or max(quals) <= 0:
302
+ raise RuntimeError("Rough alignment failed, are the input files mismatched?")
303
+ path, quals = zip(*[(node, qual) for (node, qual) in zip(path, quals) if qual > 0])
304
+
305
+ # convert units of path nodes from timesteps to seconds
306
+ path = [(audio_desc_timings[i], video_timings[j]) for (i,j) in path]
307
+
308
+ return path, quals
309
+
310
+ # chunk path segments of similar slope into clips
311
+ # a clip has the form: (start_index, end_index)
312
+ def chunk_path(smooth_path, tol):
313
+ x,y = zip(*smooth_path)
314
+ slopes = np.diff(y) / np.diff(x)
315
+ median_slope = np.median(slopes)
316
+ slope_changes = np.diff(slopes)
317
+ breaks = np.where(np.abs(slope_changes) > tol)[0] + 1
318
+ breaks = [0] + list(breaks) + [len(x)-1]
319
+ clips = list(zip(breaks[:-1], breaks[1:]))
320
+ return clips, median_slope, slopes
321
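+ # worked example (assumed toy numbers): smooth_path [(0,0),(1,1),(2,2),(3,2.5)]
+ # gives slopes [1, 1, .5]; the slope change of .5 at index 2 exceeds tol,
+ # so breaks = [0, 2, 3], clips = [(0, 2), (2, 3)], and median_slope = 1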
+
322
+ # find piece-wise linear alignment that minimizes the weighted combination of
323
+ # total absolute error at each node and total absolute slope change of the fit
324
+ # distances between nodes and the fit (i.e. errors) are weighted by node quality
325
+ # absolute slope changes are differences between the slopes of adjacent fit lines
326
+ # slope changes are weighted much more than node errors to smooth out noise
327
+ # the main source of noise is rough alignment drift while the describer is speaking
328
+ def smooth_align(path, quals, smoothness):
329
+ # rotate basis to make vertical and horizontal slopes "cost" the same
330
+ # the new horizontal axis is x+y and the new vertical is -x+y
331
+ # Wagner–Fischer gives monotonically increasing nodes, so 0 <= slope < inf
332
+ # after this transformation, we instead have -1 <= slope < 1
333
+ # perfectly matching audio has pre-transformation slope = 1
334
+ # after this transformation, it instead has slope = 0
335
+ rotated_path = [(x+y,-x+y) for x,y in path]
336
+
337
+ # stretch the x axis to make all slopes "cost" nearly the same
338
+ # without this, small changes to the slope at slope = +/-1
339
+ # cost sqrt(2) times as much as small changes at slope = 0
340
+ # by stretching, we limit the range of slopes to within +/- 1/x_stretch_factor
341
+ # the small angle approximation means these slopes all cost roughly the same
342
+ x_stretch_factor = 10.
343
+ rotated_stretched_path = [(x_stretch_factor*x,y) for x,y in rotated_path]
344
+
345
+ # L1-Minimization to solve the alignment problem using a linear program
346
+ # the absolute value functions needed for "absolute error" can be represented
347
+ # in a linear program by splitting variables into positive and negative pieces
348
+ # and constraining each to be positive (done by default in scipy's linprog)
349
+ # x is fit_err_pos, fit_err_neg, slope_change_pos, slope_change_neg
350
+ # fit_err[i] = path[i][1] - y_fit[i]
351
+ # slope_change[i] = (y_fit[i+2] - y_fit[i+1])/(path[i+2][0] - path[i+1][0]) - \
352
+ # (y_fit[i+1] - y_fit[i ])/(path[i+1][0] - path[i ][0])
353
+ # this can be rewritten in terms of fit_err by re-arranging the 1st equation:
354
+ # y_fit[i] = path[i][1] - fit_err[i]
355
+ # this gives:
356
+ # slope_change[i] = path_half[i] - fit_err_half[i]
357
+ # where each half is just the original equation but y_fit is swapped out
358
+ # the slope_change variables can then be set using equality constraints
359
+ num_fit_points = len(rotated_stretched_path)
360
+ x,y = [np.array(arr) for arr in zip(*rotated_stretched_path)]
361
+ x_diffs = np.diff(x, prepend=[-10**10], append=[10**10])
362
+ y_diffs = np.diff(y, prepend=[ 0 ], append=[ 0 ])
363
+ slope_change_magnitudes = np.abs(np.diff(y_diffs/x_diffs)) * x_stretch_factor
364
+ slope_change_locations = (slope_change_magnitudes > MAX_RATE_RATIO_DIFF_ALIGN)
365
+ slope_change_locations[1:-1] *= (np.abs(y[2:] - y[:-2]) > 5)
366
+ slope_change_costs = np.full(num_fit_points, smoothness / float(TIMESTEP_SIZE_SECONDS))
367
+ slope_change_costs[slope_change_locations] /= PREF_CUT_AT_GAPS_FACTOR
368
+ c = np.hstack([quals,
369
+ quals,
370
+ slope_change_costs * x_stretch_factor,
371
+ slope_change_costs * x_stretch_factor])
372
+ fit_err_coeffs = scipy.sparse.diags([ 1. / x_diffs[:-1],
373
+ -1. / x_diffs[:-1] - 1. / x_diffs[1:],
374
+ 1. / x_diffs[1:]],
375
+ offsets=[0,1,2],
376
+ shape=(num_fit_points, num_fit_points + 2)).tocsc()[:,1:-1]
377
+ A_eq = scipy.sparse.hstack([ fit_err_coeffs,
378
+ -fit_err_coeffs,
379
+ scipy.sparse.eye(num_fit_points),
380
+ -scipy.sparse.eye(num_fit_points)])
381
+ b_eq = y_diffs[1: ] / x_diffs[1: ] - \
382
+ y_diffs[ :-1] / x_diffs[ :-1]
383
+ fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq)
384
+ if not fit.success:
385
+ print(fit)
386
+ raise RuntimeError("Smooth Alignment L1-Min Optimization Failed!")
387
+
388
+ # combine fit_err_pos and fit_err_neg
389
+ fit_err = fit.x[:num_fit_points] - fit.x[num_fit_points:2*num_fit_points]
390
+
391
+ # subtract fit errors from nodes to retrieve the smooth fit's coordinates
392
+ # also, unstretch x axis and rotate basis back, reversing the affine pre-processing
393
+ smooth_path = [(((x / x_stretch_factor) - y) / 2.,
394
+ ((x / x_stretch_factor) + y) / 2.) for x,y in zip(x, y - fit_err)]
395
+
396
+ # clip off start/end of replacement audio if it doesn't match or isn't aligned
397
+ # without this, describer intro/outro skips can cause mismatches at the start/end
398
+ # the problem is localized: it just means the audio might not match the video at the start/end
399
+ # instead we just keep the original video's audio in those segments if mismatches are detected
400
+ # if instead the first few or last few nodes are well-aligned, that edge is marked as synced
401
+ # during audio replacement, synced edges will be extended backwards/forwards as far as possible
402
+ # this is useful when the describer begins talking immediately (or before any alignable audio)
403
+ # or when the describer continues speaking until the end (or no more alignable audio remains)
404
+ # otherwise, the mismatch would result in the describer's voice not replacing audio in that part
405
+ max_sync_err = MAX_START_END_SYNC_ERR_SECONDS
406
+ smoothing_std = MIN_START_END_SYNC_TIME_SECONDS / (2. * TIMESTEP_SIZE_SECONDS)
407
+ smoothed_fit_err = nd.gaussian_filter(np.abs(fit_err), sigma=smoothing_std)
408
+ smooth_err_path = zip(smoothed_fit_err, smooth_path)
409
+ old_length = num_fit_points
410
+ smooth_err_path = list(itertools.dropwhile(lambda x: x[0] > max_sync_err, smooth_err_path))[::-1]
411
+ is_synced_at_start = len(smooth_err_path) == old_length
412
+ old_length = len(smooth_err_path)
413
+ smooth_err_path = list(itertools.dropwhile(lambda x: x[0] > max_sync_err, smooth_err_path))[::-1]
414
+ is_synced_at_end = len(smooth_err_path) == old_length
415
+ _, smooth_path = zip(*smooth_err_path)
416
+ smooth_path = list(smooth_path)
417
+ if is_synced_at_start:
418
+ slope = (smooth_path[1][1] - smooth_path[0][1]) / (smooth_path[1][0] - smooth_path[0][0])
419
+ smooth_path.insert(0, (-10e10, -10e10 * slope))
420
+ if is_synced_at_end:
421
+ slope = (smooth_path[-1][1] - smooth_path[-2][1]) / (smooth_path[-1][0] - smooth_path[-2][0])
422
+ smooth_path.append((10e10, 10e10 * slope))
423
+
424
+ clips, median_slope, slopes = chunk_path(smooth_path, tol=1e-7)
425
+
426
+ # assemble clips with slopes within the rate tolerance into runs
427
+ runs, run = [], []
428
+ bad_clips = []
429
+ for clip in clips:
430
+ if np.abs(median_slope-slopes[clip[0]]) > MAX_RATE_RATIO_DIFF_ALIGN:
431
+ if len(run) > 0:
432
+ runs.append(run)
433
+ run = []
434
+ bad_clips.append(clip)
435
+ continue
436
+ run.append(clip)
437
+ if len(run) > 0:
438
+ runs.append(run)
439
+
440
+ return smooth_path, runs, bad_clips, clips
441
+
442
+ # if the start or end were marked as synced during smooth alignment then
443
+ # extend that alignment to the edge (i.e. to the start/end of the audio)
444
+ def cap_synced_end_points(smooth_path, video_arr, audio_desc_arr):
445
+ if smooth_path[0][0] < -10e9:
446
+ slope = smooth_path[0][1] / smooth_path[0][0]
447
+ new_start_point = (0, smooth_path[1][1] - smooth_path[1][0] * slope)
448
+ if new_start_point[1] < 0:
449
+ new_start_point = (smooth_path[1][0] - smooth_path[1][1] / slope, 0)
450
+ smooth_path[0] = new_start_point
451
+ if smooth_path[-1][0] > 10e9:
452
+ video_runtime = (video_arr.shape[1] - 2.) / AUDIO_SAMPLE_RATE
453
+ audio_runtime = (audio_desc_arr.shape[1] - 2.) / AUDIO_SAMPLE_RATE
454
+ slope = smooth_path[-1][1] / smooth_path[-1][0]
455
+ new_end_point = (audio_runtime, smooth_path[-2][1] + (audio_runtime - smooth_path[-2][0]) * slope)
456
+ if new_end_point[1] > video_runtime:
457
+ new_end_point = (smooth_path[-2][0] + (video_runtime - smooth_path[-2][1]) / slope, video_runtime)
458
+ smooth_path[-1] = new_end_point
459
+
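+ # --- illustrative sketch, not called anywhere in the package: a minimal L1
+ # trend filter on a uniform grid, mirroring the split-variable linprog
+ # construction used by smooth_align above and detect_describer below;
+ # the function name and the lam value are assumptions for illustration only
+ def _l1_trend_fit_sketch(y, lam=10.):
+ y = np.asarray(y, dtype=float)
+ n = len(y)
+ # second-difference operator: (D2 @ f)[i] = f[i] - 2*f[i+1] + f[i+2]
+ D2 = scipy.sparse.diags([np.ones(n-2), -2. * np.ones(n-2), np.ones(n-2)],
+ offsets=[0, 1, 2], shape=(n-2, n)).tocsc()
+ # x = [fit_err_pos, fit_err_neg, slope_change_pos, slope_change_neg], all >= 0
+ c = np.hstack([np.ones(2 * n), lam * np.ones(2 * (n - 2))])
+ # equality constraints encode slope_change = D2 @ (y - fit_err)
+ A_eq = scipy.sparse.hstack([D2, -D2,
+ scipy.sparse.eye(n - 2), -scipy.sparse.eye(n - 2)])
+ fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=D2 @ y)
+ fit_err = fit.x[:n] - fit.x[n:2*n]
+ # e.g. _l1_trend_fit_sketch(np.arange(50.) + np.random.randn(50))
+ # recovers a nearly straight line through the noisy ramp
+ return y - fit_err
+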
460
+ # visualize both the rough and smooth alignments
461
+ def plot_alignment(plot_filename_no_ext, path, smooth_path, quals, runs, bad_clips, ad_timings):
462
+ scatter_color = [.2,.4,.8]
463
+ lcs_rgba = np.zeros((len(quals),4))
464
+ lcs_rgba[:,:3] = np.array(scatter_color)[None,:]
465
+ lcs_rgba[:,3] = np.minimum(1, np.array(quals) * 500. / len(quals))
466
+ audio_times, video_times = np.array(path).T.reshape((2,-1))
467
+ audio_offsets = audio_times - video_times
468
+ def expand_limits(start, end, ratio=.01):
469
+ average = (end + start) / 2.
470
+ half_diff = (end - start) / 2.
471
+ half_diff *= (1 + ratio)
472
+ return (average - half_diff, average + half_diff)
473
+ plt.xlim(expand_limits(*(0, np.max(video_times) / 60.)))
474
+ plt.ylim(expand_limits(*(np.min(audio_offsets) - TIMESTEP_SIZE_SECONDS / 2.,
475
+ np.max(audio_offsets) + TIMESTEP_SIZE_SECONDS / 2.)))
476
+ plt.scatter(video_times / 60., audio_offsets, s=3, c=lcs_rgba, label='LCS Matches')
477
+ audio_times, video_times = np.array(smooth_path).T.reshape((2,-1))
478
+ audio_offsets = audio_times - video_times
479
+ if ad_timings is None:
480
+ plt.plot(video_times / 60., audio_offsets, 'r-', lw=.5, label='Replaced Audio')
481
+ bad_path = []
482
+ for clip in bad_clips:
483
+ bad_path.extend(smooth_path[clip[0]:clip[1]+1])
484
+ bad_path.append((smooth_path[clip[1]][0] + 1e-10, np.nan))
485
+ audio_times, video_times = np.array(bad_path).T.reshape((2,-1))
486
+ audio_offsets = audio_times - video_times
487
+ if len(audio_offsets) > 0:
488
+ plt.plot(video_times / 60., audio_offsets, 'c-', lw=1, label='Original Audio')
489
+ else:
490
+ interp = scipy.interpolate.interp1d(video_times, audio_offsets,
491
+ fill_value = np.inf,
492
+ bounds_error = False, assume_sorted = True)
493
+ plt.plot(video_times / 60., audio_offsets, 'c-', lw=.5, label='Original Audio')
494
+ video_times = ad_timings
495
+ audio_offsets = interp(ad_timings)
496
+ if len(audio_offsets) > 0:
497
+ plt.plot(video_times / 60., audio_offsets, 'r-', lw=1, label='Replaced Audio')
498
+ plt.xlabel('Video Time (minutes)')
499
+ plt.ylabel('Audio Description Offset (seconds)')
500
+ plt.title('Alignment')
501
+ plt.legend().legend_handles[0].set_color(scatter_color)
502
+ plt.tight_layout()
503
+ plt.savefig(plot_filename_no_ext + '.png', dpi=400)
504
+ plt.clf()
505
+
506
+ with open(plot_filename_no_ext + '.txt', 'w') as file:
507
+ rough_clips, median_slope, _ = chunk_path(smooth_path, tol=2e-2)
508
+ video_offset = np.diff(smooth_path[rough_clips[0][0]])[0]
509
+ print("Main changes needed to video to align it to audio input:", file=file)
510
+ print(f"Start Offset: {-video_offset:.2f} seconds", file=file)
511
+ print(f"Median Rate Change: {(median_slope-1.)*100:.2f}%", file=file)
512
+ for clip_start, clip_end in rough_clips:
513
+ audio_desc_start, video_start = smooth_path[clip_start]
514
+ audio_desc_end, video_end = smooth_path[clip_end]
515
+ slope = (video_end - video_start) / (audio_desc_end - audio_desc_start)
516
+ def str_from_time(seconds):
517
+ minutes, seconds = divmod(seconds, 60)
518
+ hours, minutes = divmod(minutes, 60)
519
+ return f"{hours:2.0f}:{minutes:02.0f}:{seconds:05.2f}"
520
+ print(f"Rate change of {(slope-1.)*100:6.1f}% from {str_from_time(video_start)} to " + \
521
+ f"{str_from_time(video_end)} aligning with audio from " + \
522
+ f"{str_from_time(audio_desc_start)} to {str_from_time(audio_desc_end)}", file=file)
523
+
524
+ # use the smooth alignment to replace runs of video sound with corresponding described audio
525
+ def replace_aligned_segments(video_arr, audio_desc_arr, smooth_path, runs, no_pitch_correction=False):
526
+ # perform quadratic interpolation of the audio description's waveform
527
+ # this allows it to be stretched to match the corresponding video segment
528
+ def audio_desc_arr_interp(samples):
529
+ chunk_size = 10**7
530
+ interpolated_chunks = []
531
+ for chunk in (samples[i:i+chunk_size] for i in range(0, len(samples), chunk_size)):
532
+ interp_bounds = (max(int(chunk[0]-2), 0),
533
+ min(int(chunk[-1]+2), audio_desc_arr.shape[1]))
534
+ interp = scipy.interpolate.interp1d(np.arange(*interp_bounds),
535
+ audio_desc_arr[:,slice(*interp_bounds)],
536
+ copy=False, bounds_error=False, fill_value=0,
537
+ kind='quadratic', assume_sorted=True)
538
+ interpolated_chunks.append(interp(chunk).astype(np.float32))
539
+ return np.hstack(interpolated_chunks)
540
+
541
+ # construct a stretched audio description waveform using the quadratic interpolator
542
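+ # e.g. mapping audio clip [x0, x1] onto video span [y0, y1]: the segment is
+ # built from (y1 - y0) * AUDIO_SAMPLE_RATE samples read at positions
+ # linspace(x0, x1) * AUDIO_SAMPLE_RATE, so reading faster than real time
+ # compresses the audio and reading slower stretches it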
+ def get_interped_segment(run, interp):
543
+ segment = []
544
+ for clip in run:
545
+ num_samples = int(y[clip[1]] * AUDIO_SAMPLE_RATE) - \
546
+ int(y[clip[0]] * AUDIO_SAMPLE_RATE)
547
+ clip_bounds = np.array((x[clip[0]], x[clip[1]])) * AUDIO_SAMPLE_RATE
548
+ sample_points = np.linspace(*clip_bounds, num=num_samples, endpoint=False)
549
+ segment.append(interp(sample_points))
550
+ segment = np.hstack(segment)
551
+ return segment
552
+
553
+ x,y = zip(*smooth_path)
554
+ for run in runs:
555
+ run_length_seconds = y[run[-1][1]] - y[run[0][0]]
556
+ if run_length_seconds < MIN_DURATION_TO_REPLACE_SECONDS:
557
+ continue
558
+ anchor_point_path_indices = [clip[0] for clip in run]
559
+ anchor_point_path_indices.append(run[-1][1])
560
+ anchor_points = (np.array((np.array(x)[anchor_point_path_indices],
561
+ np.array(y)[anchor_point_path_indices])) * AUDIO_SAMPLE_RATE).astype(int)
562
+ slopes = np.diff(anchor_points[1]) / np.diff(anchor_points[0])
563
+ for clip_index, (clip, slope) in enumerate(zip(run, slopes)):
564
+ # only apply pitch correction if the difference would be noticeable
565
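+ # rate changes within JUST_NOTICEABLE_DIFF_IN_FREQ_RATIO (0.5%) are plain
+ # resamples, shifting pitch imperceptibly; larger changes go through
+ # pytsmod's WSOLA time-scale modification, which preserves pitch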
+ if no_pitch_correction or np.abs(1 - slope) <= JUST_NOTICEABLE_DIFF_IN_FREQ_RATIO:
566
+ stretched_audio = get_interped_segment([clip], audio_desc_arr_interp)
567
+ else:
568
+ anchor_point_pair = anchor_points[:,clip_index:clip_index+2].copy()
569
+ # account for quirks of pytsmod's wsola anchor point implementation
570
+ anchor_point_pair[1][-1] -= 1
571
+ anchor_y_offset = anchor_point_pair[1][0]
572
+ anchor_point_pair[1,:] -= anchor_y_offset
573
+ stretched_audio = pytsmod.wsola(audio_desc_arr, anchor_point_pair)
574
+ video_arr[:,slice(*anchor_points[1,clip_index:clip_index+2])] = stretched_audio
575
+
576
+ # identify which segments of the replaced audio actually have the describer speaking
577
+ # uses a Naive Bayes classifier smoothed with L1-Minimization to identify the describer
578
+ def detect_describer(video_arr, video_spec, video_spec_raw, video_timings,
579
+ smooth_path, detect_sensitivity, boost_sensitivity):
580
+ # retokenize the audio description, which has been stretched to match the video
581
+ audio_desc_spec_raw, audio_timings = tokenize_audio(video_arr)
582
+ audio_desc_spec = normalize_spec(audio_desc_spec_raw)
583
+
584
+ # avoid boosting or training on mismatched segments, like those close to skips
585
+ # assumes matching segments all have the same, constant play rate
586
+ # could be modified to handle a multi-modal distribution of rates
587
+ aligned_audio_times, aligned_video_times = zip(*smooth_path)
588
+ interp = scipy.interpolate.interp1d(aligned_video_times, aligned_audio_times,
589
+ fill_value = 'extrapolate',
590
+ bounds_error = False, assume_sorted = True)
591
+ slopes = (interp(video_timings + 1e-5) - \
592
+ interp(video_timings - 1e-5)) / 2e-5
593
+ median_slope = np.median(slopes)
594
+ aligned_mask = np.abs(slopes - median_slope) < MAX_RATE_RATIO_DIFF_ALIGN
595
+ well_aligned_mask = np.abs(slopes - median_slope) < MAX_RATE_RATIO_DIFF_BOOST
596
+
597
+ # first-pass identification: assume poorly matched tokens are describer speech
598
+ # also assumes the describer doesn't speak very quietly
599
+ corrs = np.sum(audio_desc_spec * video_spec, axis=-1)
600
+ smooth_volume = nd.gaussian_filter(audio_desc_spec[:,0], sigma=1)
601
+ audio_desc_loud = smooth_volume > np.percentile(smooth_volume, 30)
602
+ speech_mask = (corrs < .2) * audio_desc_loud
603
+
604
+ # normalize spectrogram coefficients along time axis to prep for conversion to PDFs
605
+ audio_desc_spec = normalize_spec(audio_desc_spec_raw, axes=(0,))
606
+ audio_desc_spec = np.clip(audio_desc_spec / 6., -1, 1)
607
+ video_spec = normalize_spec(video_spec_raw, axes=(0,))
608
+ video_spec = np.clip(video_spec / 6., -1, 1)
609
+
610
+ # convert sampled features (e.g. spectrogram) to probability densities of each feature
611
+ # when given a spectrogram, finds the distributions of the MFC coefficients
612
+ def make_log_pdfs(arr):
613
+ resolution = 100
614
+ bins_per_spot = 4
615
+ num_bins = int(resolution * bins_per_spot)
616
+ uniform_prior_strength_per_spot = 1
617
+ uniform_prior_strength_per_bin = uniform_prior_strength_per_spot / float(bins_per_spot)
618
+ bin_range = (-1 - 1e-10, 1 + 1e-10)
619
+ get_hist = lambda x: np.histogram(x, bins=num_bins, range=bin_range)[0]
620
+ pdfs = np.apply_along_axis(get_hist, 1, arr.T)
621
+ pdfs = pdfs + uniform_prior_strength_per_bin
622
+ smooth = lambda x: nd.gaussian_filter(x, sigma=bins_per_spot)
623
+ pdfs = np.apply_along_axis(smooth, 1, pdfs)
624
+ pdfs = pdfs / np.sum(pdfs[0,:])
625
+ log_pdfs = np.log(pdfs)
626
+ bin_edges = np.histogram([], bins=num_bins, range=bin_range)[1]
627
+ return log_pdfs, bin_edges
628
+
629
+ diff_spec = audio_desc_spec - video_spec
630
+ diff_spec = np.clip(diff_spec, -1, 1)
631
+
632
+ # Naive Bayes classifier to roughly estimate whether each token is describer speech
633
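+ # per token t this computes a naive-Bayes log-likelihood ratio, treating
+ # the MFC coefficients k as independent:
+ # relative_probs[t] = sum_k [log p(diff[t,k] | desc) - log p(diff[t,k] | non-desc)]
+ # larger values mean the token looks more like describer speech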
+ desc_log_pdfs, _ = make_log_pdfs(diff_spec[speech_mask * well_aligned_mask])
634
+ nondesc_log_pdfs, bin_edges = make_log_pdfs(diff_spec[(~speech_mask) * well_aligned_mask])
635
+ lratio_lookup = desc_log_pdfs - nondesc_log_pdfs
636
+ lratios = lratio_lookup[np.fromfunction(lambda i,j: j, diff_spec.shape, dtype=int),
637
+ np.digitize(diff_spec, bin_edges, right=True)-1]
638
+ ratio_desc_to_nondesc = np.sum(speech_mask * well_aligned_mask) /\
639
+ (np.sum((~speech_mask) * well_aligned_mask) + 1.)
640
+ relative_probs = np.sum(lratios, axis=1)
641
+ relative_probs /= np.std(relative_probs)
642
+ relative_probs -= np.mean(relative_probs)
643
+
644
+ # L1-Minimization to smoothly identify audio descriptions using a linear program
645
+ # x is fit_err_pos, fit_err_neg, delta_fit_pos, delta_fit_neg
646
+ # fit_err[i] = relative_probs[i] - y_fit[i]
647
+ # delta_fit[i] = y_fit[i] - y_fit[i-1]
648
+ # this can be rewritten in terms of fit_err by re-arranging the 1st equation:
649
+ # y_fit[i] = relative_probs[i] - fit_err[i]
650
+ # this gives:
651
+ # delta_fit[i] = (relative_probs[i] - relative_probs[i-1]) -\
652
+ # (fit_err[i] - fit_err[i-1])
653
+ # the delta_fit variables can then be set using equality constraints
654
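+ # same split-variable trick as in smooth_align, but penalizing first
+ # differences of the fit (total variation) instead of slope changes,
+ # which favors piece-wise constant rather than piece-wise linear fits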
+ num_fit_points = len(relative_probs)
655
+ y_diffs = np.diff(relative_probs)
656
+ pos_err_cost_factor = MIN_DESC_DURATION / float(TIMESTEP_SIZE_SECONDS)
657
+ neg_err_cost_factor = MAX_GAP_IN_DESC_SEC / float(TIMESTEP_SIZE_SECONDS)
658
+ c = np.hstack([np.ones(num_fit_points) / pos_err_cost_factor,
659
+ np.ones(num_fit_points) / neg_err_cost_factor,
660
+ np.ones(num_fit_points - 1) / 2.,
661
+ np.ones(num_fit_points - 1) / 2.])
662
+ fit_err_coeffs = scipy.sparse.diags([-np.ones(num_fit_points),
663
+ np.ones(num_fit_points)],
664
+ offsets=[0,1],
665
+ shape=(num_fit_points - 1, num_fit_points)).tocsc()
666
+ A_eq = scipy.sparse.hstack([ fit_err_coeffs,
667
+ -fit_err_coeffs,
668
+ scipy.sparse.eye(num_fit_points-1),
669
+ -scipy.sparse.eye(num_fit_points-1)])
670
+ b_eq = y_diffs
671
+ fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq)
672
+ if not fit.success:
673
+ print(fit)
674
+ raise RuntimeError("Describer Voice Detection L1-Min Optimization Failed!")
675
+
676
+ # combine fit_err_pos and fit_err_neg
677
+ fit_err = fit.x[:num_fit_points] - fit.x[num_fit_points:2*num_fit_points]
678
+
679
+ # subtract fit errors from nodes to retrieve the smoothed fit
680
+ smooth_desc_locations = relative_probs - fit_err
681
+
682
+ # hard threshold to classify each token as describer speech or not
683
+ speech_mask = smooth_desc_locations > 1. - 1.5 * detect_sensitivity
684
+ speech_mask *= aligned_mask
685
+
686
+ # a separate mask is created for describer volume boosting
687
+ # as losing the describer's voice entirely is usually worse than it just being quiet
688
+ # and imperfectly aligned segments may have descriptions, but shouldn't be boosted
689
+ boost_mask = smooth_desc_locations > 1. - 1.5 * boost_sensitivity
690
+ boost_mask *= well_aligned_mask
691
+
692
+ # convert a token classification into a mask that can be applied directly to samples
693
+ # unlike the input, the output isn't a boolean array but an array of floats
694
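+ # each kept token contributes a Hann window bump (one timestep of radius)
+ # centered on its timestamp; neighboring bumps overlap and sum, so hard
+ # token-level decisions become a smooth per-sample envelope with no clicks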
+ def token_mask_to_sample_mask(token_mask):
695
+ description_timings = video_timings[1:-1][token_mask[1:-1]]
696
+ sample_mask = np.zeros(video_arr.shape[1], dtype=np.float32)
697
+ window_radius = int(AUDIO_SAMPLE_RATE * TIMESTEP_SIZE_SECONDS)
698
+ window_size_samples = 2 * window_radius + 1
699
+ bump = scipy.signal.windows.hann(window_size_samples)
700
+ for description_timing in description_timings:
701
+ window_center = int(description_timing * AUDIO_SAMPLE_RATE)
702
+ sample_mask[window_center-window_radius:window_center+window_radius+1] += bump
703
+ return sample_mask
704
+
705
+ speech_sample_mask = token_mask_to_sample_mask(speech_mask)
706
+ boost_sample_mask = token_mask_to_sample_mask(boost_mask)
707
+ ad_timings = video_timings.copy()
708
+ ad_timings[~speech_mask] = np.inf
709
+
710
+ return speech_sample_mask, boost_sample_mask, ad_timings
711
+
712
+ # Convert piece-wise linear fit to ffmpeg expression for editing video frame timestamps
713
+ def encode_fit_as_ffmpeg_expr(smooth_path, clips, video_offset, start_key_frame):
714
+ # PTS is the input frame's presentation timestamp, which is when frames are displayed
715
+ # TB is the timebase, which is how many seconds each unit of PTS corresponds to
716
+ # the output value of the expression will be the frame's new PTS
717
+ setts_cmd = ['TS']
718
+ start_skip = max(0, video_offset - start_key_frame)
719
+ if start_skip > 0:
720
+ # lossless cutting can only happen at key frames, so we cut the video before the audio starts
721
+ # but that means the video is behind the audio and needs to catch up by playing quicker
722
+ # catchup_spread is the ratio of time to spend catching up to the amount of catching up needed
723
+ catchup_spread = 1./CATCHUP_RATE
724
+ setts_cmd.append(f'+clip(TS-{start_key_frame},0,{start_skip*(1+catchup_spread)}/TB)*{-1./(1+catchup_spread)}')
725
+ elif video_offset < 0:
726
+ # if the audio starts before the video, stretch the first frame of the video back to meet it
727
+ setts_cmd.append(f'+clip(TS-{start_key_frame},0,{-video_offset/10000.}/TB)*10000')
728
+ # each segment of the linear fit can be encoded as a single clip function
729
+ setts_cmd.append('+(0')
730
+ for clip_start, clip_end in clips:
731
+ audio_desc_start, video_start = smooth_path[clip_start]
732
+ audio_desc_end, video_end = smooth_path[clip_end]
733
+ video_start -= start_key_frame
734
+ video_end -= start_key_frame
735
+ audio_desc_length = audio_desc_end - audio_desc_start
736
+ video_length = video_end - video_start
737
+ slope = audio_desc_length / video_length
738
+ setts_cmd.append(f'+clip(TS-{start_key_frame}-{video_start:.4f}/TB,0,{max(0,video_length):.4f}/TB)*{slope-1:.9f}')
739
+ setts_cmd.append(')')
740
+ setts_cmd = ''.join(setts_cmd)
741
+ return setts_cmd
742
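+ # example output (assumed toy numbers, video_offset = 0, start_key_frame = 0):
+ # a single clip stretching video 10s-20s onto audio 10s-20.2s (slope 1.02)
+ # yields: TS+(0+clip(TS-0-10.0000/TB,0,10.0000/TB)*0.020000000)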
+
743
+ def get_ffmpeg():
744
+ return static_ffmpeg.run._get_or_fetch_platform_executables_else_raise_no_lock()[0]
745
+
746
+ def get_ffprobe():
747
+ return static_ffmpeg.run._get_or_fetch_platform_executables_else_raise_no_lock()[1]
748
+
749
+ def get_closest_key_frame_time(video_file, time):
750
+ if time <= 0:
751
+ return 0
752
+ key_frames = ffmpeg.probe(video_file, cmd=get_ffprobe(), select_streams='v',
753
+ show_frames=None, skip_frame='nokey')['frames']
754
+ key_frame_times = np.array([float(frame['pts_time']) for frame in key_frames] + [0])
755
+ return np.max(key_frame_times[key_frame_times <= time])
756
+
757
+ # outputs a new media file with the replaced audio (which includes audio descriptions)
758
+ def write_replaced_media_to_disk(output_filename, media_arr, video_file=None, audio_desc_file=None,
759
+ setts_cmd=None, start_key_frame=None):
760
+ if audio_desc_file is None:
761
+ media_input = ffmpeg.input('pipe:', format='s16le', acodec='pcm_s16le',
762
+ ac=2, ar=AUDIO_SAMPLE_RATE)
763
+ if video_file is None or os.path.splitext(output_filename)[1][1:] in AUDIO_EXTENSIONS:
764
+ write_command = ffmpeg.output(media_input, output_filename, loglevel='fatal').overwrite_output()
765
+ else:
766
+ original_video = ffmpeg.input(video_file)
767
+ # "-max_interleave_delta 0" is sometimes necessary to fix an .mkv bug that freezes audio/video:
768
+ # ffmpeg bug warning: [matroska @ 0000000002c814c0] Starting new cluster due to timestamp
769
+ # more info about the bug and fix: https://reddit.com/r/ffmpeg/comments/efddfs/
770
+ write_command = ffmpeg.output(media_input, original_video, output_filename,
771
+ acodec='copy', vcodec='copy', scodec='copy',
772
+ max_interleave_delta='0', loglevel='fatal',
773
+ **{"c:a:0": "aac", "disposition:a:0": "default"}).overwrite_output()
774
+ ffmpeg_caller = write_command.run_async(pipe_stdin=True, cmd=get_ffmpeg())
775
+ ffmpeg_caller.stdin.write(media_arr.astype(np.int16).T.tobytes())
776
+ ffmpeg_caller.stdin.close()
777
+ ffmpeg_caller.wait()
778
+ else:
779
+ media_input = ffmpeg.input(audio_desc_file)
780
+ audio_desc_streams = ffmpeg.probe(audio_desc_file, cmd=get_ffprobe(), select_streams='a',
781
+ show_entries='format=duration')['streams']
782
+ audio_desc_duration = max([float(stream['duration']) for stream in audio_desc_streams])
783
+ original_video = ffmpeg.input(video_file, an=None, ss=start_key_frame)
784
+ if os.path.splitext(output_filename)[1] == os.path.splitext(video_file)[1]:
785
+ # wav files don't have codecs compatible with most video containers, so we convert to aac
786
+ audio_codec = 'copy' if os.path.splitext(audio_desc_file)[1] != '.wav' else 'aac'
787
+ # flac audio may only have experimental support in some video containers (e.g. mp4)
788
+ standards = 'normal' if os.path.splitext(audio_desc_file)[1] != '.flac' else 'experimental'
789
+ write_command = ffmpeg.output(media_input, original_video, output_filename,
790
+ acodec=audio_codec, vcodec='copy', scodec='copy',
791
+ max_interleave_delta='0', loglevel='fatal', strict=standards,
792
+ **{'bsf:v': f'setts=ts=\'{setts_cmd}\'',
793
+ 'bsf:s': f'setts=ts=\'{setts_cmd}\''}).overwrite_output()
794
+ write_command.run(cmd=get_ffmpeg())
795
+ else:
796
+ # workaround for a bug that sometimes breaks setts when the output and input formats differ
797
+ # the trick is separating the input and output by piping from one ffmpeg process into another
798
+ # mkv files break if 'nut' is used, while other files break when 'matroska' is used
799
+ format = 'matroska' if os.path.splitext(output_filename)[1] == '.mkv' else 'nut'
800
+ write_command = ffmpeg.output(original_video, 'pipe:', format=format, vsync='passthrough',
801
+ c='copy', loglevel='fatal')
802
+ ffmpeg_caller = write_command.run_async(pipe_stdout=True, cmd=get_ffmpeg())
803
+ pipe_input = ffmpeg.input('pipe:', format=format, thread_queue_size='512')
804
+ write_command2 = ffmpeg.output(media_input, pipe_input, output_filename, c='copy',
805
+ max_interleave_delta='0', loglevel='fatal', vsync='passthrough',
806
+ **{'bsf:v': f'setts=ts=\'{setts_cmd}\'',
807
+ 'bsf:s': f'setts=ts=\'{setts_cmd}\''}).overwrite_output()
808
+ ffmpeg_caller2 = write_command2.run_async(pipe_stdin=True, cmd=get_ffmpeg())
809
+ while True:
810
+ in_bytes = ffmpeg_caller.stdout.read(100000)
811
+ if not in_bytes:
812
+ break
813
+ ffmpeg_caller2.stdin.write(in_bytes)
814
+ ffmpeg_caller2.stdin.close()
815
+ ffmpeg_caller.wait()
816
+ ffmpeg_caller2.wait()
817
+
818
+
819
+ # check whether static_ffmpeg has already installed ffmpeg and ffprobe
820
+ def is_ffmpeg_installed():
821
+ ffmpeg_dir = static_ffmpeg.run.get_platform_dir()
822
+ indicator_file = os.path.join(ffmpeg_dir, "installed.crumb")
823
+ return os.path.exists(indicator_file)
824
+
825
+ # combines videos with matching audio files (e.g. audio descriptions)
826
+ # this is the main function of this script; it calls the other functions in order
827
+ def combine(video, audio, smoothness=50, stretch_audio=False, keep_non_ad=False,
828
+ boost=0, ad_detect_sensitivity=.6, boost_sensitivity=.4, yes=False,
829
+ prepend="ad_", no_pitch_correction=False, output_dir=default_output_dir,
830
+ alignment_dir=default_alignment_dir, extension="copy", display_func=None):
831
+ video_files, video_file_types = get_sorted_filenames(video, VIDEO_EXTENSIONS, AUDIO_EXTENSIONS)
832
+
833
+ if not yes and sum(video_file_types) > 0:
834
+ print("")
835
+ print("One or more audio files found in video input. Was this intentional?")
836
+ print("If not, press ctrl+c to kill this script.")
837
+ input("If this was intended, press Enter to continue...")
838
+ print("")
839
+ audio_desc_files, _ = get_sorted_filenames(audio, AUDIO_EXTENSIONS)
840
+ if len(video_files) != len(audio_desc_files):
841
+ error_msg = ["Number of valid files in input paths are not the same.",
842
+ f"The video path has {len(video_files)} files",
843
+ f"The audio path has {len(audio_desc_files)} files"]
844
+ raise RuntimeError("\n".join(error_msg))
845
+
843
846
  display("", display_func)
844
- for (video_file, audio_desc_file) in zip(video_files, audio_desc_files):
845
- display(os.path.split(video_file)[1], display_func)
846
- display(os.path.split(audio_desc_file)[1], display_func)
847
- display("", display_func)
848
- if yes == False:
849
- print("Are the above input file pairings correct?")
850
- print("If not, press ctrl+c to kill this script.")
851
- input("If they are correct, press Enter to continue...")
852
- print("")
853
-
854
- # if ffmpeg isn't installed, install it
855
- if not is_ffmpeg_installed():
856
- display("Downloading and installing ffmpeg (media editor, 50 MB download)...", display_func)
857
- get_ffmpeg()
858
- if not is_ffmpeg_installed():
859
- RuntimeError("Failed to install ffmpeg.")
860
- display("Successfully installed ffmpeg.", display_func)
861
-
862
- display("Processing files:", display_func)
863
-
864
- for (video_file, audio_desc_file, video_filetype) in zip(video_files, audio_desc_files,
865
- video_file_types):
866
- # Default is to use the input video's extension for the output video
867
- if extension is None or extension in ["", "copy"]:
868
- ext = os.path.splitext(video_file)[1]
869
- else:
870
- # add a dot to the extension if it's missing
871
- ext = ('' if extension[0] == '.' else '.') + extension
872
- output_filename = prepend + os.path.splitext(os.path.split(video_file)[1])[0] + ext
873
- output_filename = os.path.join(output_dir, output_filename)
874
- display(" " + output_filename, display_func)
875
-
876
- if os.path.exists(output_filename) and os.path.getsize(output_filename) > 0:
877
- display(" output file already exists, skipping...", display_func)
878
- continue
879
-
880
- video_arr = parse_audio_from_file(video_file)
881
- audio_desc_arr = parse_audio_from_file(audio_desc_file)
882
- video_spec_raw, video_timings = tokenize_audio(video_arr)
883
- video_spec = normalize_spec(video_spec_raw)
884
- audio_desc_spec_raw, audio_desc_timings = tokenize_audio_dither(audio_desc_arr, video_timings)
885
- audio_desc_spec = normalize_spec(audio_desc_spec_raw)
886
-
887
- # rescale RMS intensity of audio to match video
888
- audio_desc_arr *= (np.std(video_arr) / np.std(audio_desc_arr))
889
-
890
- path, quals = rough_align(video_spec, audio_desc_spec, video_timings, audio_desc_timings)
891
-
892
- smooth_path, runs, bad_clips, clips = smooth_align(path, quals, smoothness)
893
-
894
- cap_synced_end_points(smooth_path, video_arr, audio_desc_arr)
895
-
896
- ad_timings = None
897
- if stretch_audio:
898
- if keep_non_ad:
899
- video_arr_original = video_arr.copy()
900
-
901
- replace_aligned_segments(video_arr, audio_desc_arr, smooth_path, runs, no_pitch_correction)
902
- del audio_desc_arr
903
-
904
- if keep_non_ad or boost != 0:
905
- outputs = detect_describer(video_arr, video_spec, video_spec_raw, video_timings,
906
- smooth_path, ad_detect_sensitivity, boost_sensitivity)
907
- speech_sample_mask, boost_sample_mask, ad_timings = outputs
908
- if keep_non_ad:
909
- video_arr *= speech_sample_mask
910
- video_arr += video_arr_original * (1 - speech_sample_mask)
911
- del video_arr_original
912
- del speech_sample_mask
913
- else:
914
- ad_timings = None
915
- if boost != 0:
916
- video_arr = video_arr * (1. + (10**(boost / 10.) - 1.) * boost_sample_mask)
917
- del boost_sample_mask
918
-
919
- # prevent peaking by rescaling to within +/- 16,382
920
- video_arr *= (2**15 - 2.) / np.max(np.abs(video_arr))
921
-
922
- if video_filetype == 0:
923
- write_replaced_media_to_disk(output_filename, video_arr, video_file)
924
- else:
925
- write_replaced_media_to_disk(output_filename, video_arr)
926
- else:
927
- if video_filetype == 1:
928
- raise RuntimeError("Argument --stretch_audio is required when both inputs are audio files.")
929
- if os.path.splitext(output_filename)[1][1:] in AUDIO_EXTENSIONS:
930
- raise RuntimeError("Argument --stretch_audio is required when output file extension is an audio filetype.")
931
- video_offset = np.diff(smooth_path[clips[0][0]])[0]
932
- start_key_frame = get_closest_key_frame_time(video_file, video_offset)
933
- setts_cmd = encode_fit_as_ffmpeg_expr(smooth_path, clips, video_offset, start_key_frame)
934
- write_replaced_media_to_disk(output_filename, None, video_file, audio_desc_file,
935
- setts_cmd, start_key_frame)
936
-
937
- del video_arr
938
- if PLOT_ALIGNMENT_TO_FILE:
939
- plot_filename_no_ext = os.path.join(alignment_dir, os.path.splitext(os.path.split(video_file)[1])[0])
940
- plot_alignment(plot_filename_no_ext, path, smooth_path, quals, runs, bad_clips, ad_timings)
941
- display("All files processed.", display_func)
942
-
943
- def write_config_file(config_path, settings):
944
- config = configparser.ConfigParser()
945
- config.add_section('alignment')
946
- config['alignment'] = {}
947
- for key, value in settings.items():
948
- config['alignment'][key] = str(value)
949
- with open(config_path, 'w') as f:
950
- config.write(f)
951
-
952
- def read_config_file(config_path):
953
- config = configparser.ConfigParser()
954
- config.read(config_path)
955
- settings = {'smoothness': config.getfloat('alignment', 'smoothness', fallback=50),
956
- 'stretch_audio': config.getboolean('alignment', 'stretch_audio', fallback=False),
957
- 'keep_non_ad': config.getboolean('alignment', 'keep_non_ad', fallback=False),
958
- 'boost': config.getfloat('alignment', 'boost', fallback=0),
959
- 'ad_detect_sensitivity':config.getfloat('alignment', 'ad_detect_sensitivity', fallback=.6),
960
- 'boost_sensitivity': config.getfloat('alignment', 'boost_sensitivity', fallback=.4),
961
- 'prepend': config.get('alignment', 'prepend', fallback='ad_'),
962
- 'no_pitch_correction': config.getboolean('alignment', 'no_pitch_correction', fallback=False),
963
- 'output_dir': config.get('alignment', 'output_dir', fallback=default_output_dir),
964
- 'alignment_dir': config.get('alignment', 'alignment_dir', fallback=default_alignment_dir),
965
- 'extension': config.get('alignment', 'extension', fallback='copy')}
966
- if not config.has_section('alignment'):
967
- write_config_file(config_path, settings)
968
- return settings
969
-
970
- def settings_gui(config_path):
971
- settings = read_config_file(config_path)
972
- layout = [[sg.Text('Check tooltips (i.e. mouse-over text) for descriptions:')],
973
- [sg.Column([[sg.Text('extension:', size=(10, 1.2), pad=(1,5)),
974
- sg.Input(default_text=str(settings['extension']), size=(8, 1.2), pad=(10,5), key='extension',
975
- tooltip='File type of output video (e.g. mkv). When set to "copy", copies the ' + \
976
- 'file type of the corresponding input video. Default is "copy".')]])],
977
- [sg.Column([[sg.Text('prepend:', size=(8, 1.2), pad=(1,5)),
978
- sg.Input(default_text=str(settings['prepend']), size=(8, 1.2), pad=(10,5), key='prepend',
979
- tooltip='Output file name prepend text. Default is "ad_"')]])],
980
- [sg.Column([[sg.Text('output_dir:', size=(10, 1.2), pad=(1,5)),
981
- sg.Input(default_text=str(settings['output_dir']), size=(22, 1.2), pad=(10,5), key='output_dir',
982
- tooltip='Directory combined output media is saved to. Default is "videos_with_ad"'),
983
- sg.FolderBrowse(button_text="Browse Folder", key='output_browse')]])],
984
- [sg.Column([[sg.Text('alignment_dir:', size=(13, 1.2), pad=(1,5)),
985
- sg.Input(default_text=str(settings['alignment_dir']), size=(22, 1.2), pad=(10,5), key='alignment_dir',
986
- tooltip='Directory alignment data and plots are saved to. Default is "alignment_plots"'),
987
- sg.FolderBrowse(button_text="Browse Folder", key='alignment_browse')]], pad=(2,7))],
988
- [sg.Column([[sg.Text('smoothness:', size=(12, 1), pad=(1,5)),
989
- sg.Input(default_text=str(settings['smoothness']), size=(8, 1.2), pad=(10,5), key='smoothness',
990
- tooltip='Lower values make the alignment more accurate when there are skips ' + \
991
- '(e.g. describer pauses), but also make it more likely to misalign. ' + \
992
- 'Default is 50.')]])],
993
- [sg.Checkbox('stretch_audio', default=settings['stretch_audio'], key='stretch_audio', change_submits=True,
994
- tooltip='Stretches the input audio to fit the input video. ' + \
995
- 'Default is to stretch the video to fit the audio.')],
996
- [sg.Checkbox('keep_non_ad', default=settings['keep_non_ad'], key='keep_non_ad',
997
- disabled=not settings['stretch_audio'],
998
- tooltip='Tries to only replace segments with audio description. Useful if ' + \
999
- 'video\'s audio quality is better. Default is to replace all aligned audio. ' + \
1000
- 'Requires --stretch_audio to be set, otherwise does nothing.')],
1001
- [sg.Column([[sg.Text('boost:', size=(6, 1), pad=(1,5)),
1002
- sg.Input(default_text=str(settings['boost']), size=(8, 1.2), pad=(10,5),
1003
- key='boost', disabled=not settings['stretch_audio'],
1004
- tooltip='Boost (or quieten) description volume. Units are decibels (dB), so ' + \
1005
- '-3 makes the describer about 2x quieter, while 3 makes them 2x louder. ' + \
1006
- 'Requires --stretch_audio to be set, otherwise does nothing.')]])],
1007
- [sg.Column([[sg.Text('ad_detect_sensitivity:', size=(21, 1.2), pad=(2,5)),
1008
- sg.Input(default_text=str(settings['ad_detect_sensitivity']), size=(8, 1.2), pad=(10,5),
1009
- key='ad_detect_sensitivity', disabled=not settings['stretch_audio'],
1010
- tooltip='Audio description detection sensitivity ratio. Higher values make ' + \
1011
- '--keep_non_ad more likely to replace aligned audio. Default is 0.6')]])],
1012
- [sg.Column([[sg.Text('boost_sensitivity:', size=(17, 1.2), pad=(1,5)),
1013
- sg.Input(default_text=str(settings['boost_sensitivity']), size=(8, 1.2), pad=(10,5),
1014
- key='boost_sensitivity', disabled=not settings['stretch_audio'],
1015
- tooltip='Higher values make --boost less likely to miss a description, but ' + \
1016
- 'also make it more likely to boost non-description audio. Default is 0.4')]])],
1017
- [sg.Checkbox('no_pitch_correction', default=settings['no_pitch_correction'], key='no_pitch_correction',
1018
- disabled=not settings['stretch_audio'],
1019
- tooltip='Skips pitch correction step when stretching audio. ' + \
1020
- 'Requires --stretch_audio to be set, otherwise does nothing.')],
1021
- [sg.Column([[sg.Submit('Save', pad=(40,3)),
1022
- sg.Button('Cancel')]], pad=((135,3),10))]]
1023
- settings_window = sg.Window('Settings - describealign', layout, font=('Arial', 16), finalize=True)
1024
- settings_window['extension'].set_focus()
1025
- while True:
1026
- event, values = settings_window.read()
1027
- if event in (sg.WIN_CLOSED, 'Cancel') or settings_window.TKrootDestroyed:
1028
- break
1029
- if event == 'stretch_audio':
1030
- # work around bug in PySimpleGUIWx's InputText Update function where enabling/disabling are flipped
1031
- if IS_RUNNING_WINDOWS:
1032
- settings_window['boost'].Update(disabled = values['stretch_audio'])
1033
- settings_window['ad_detect_sensitivity'].Update(disabled = values['stretch_audio'])
1034
- settings_window['boost_sensitivity'].Update(disabled = values['stretch_audio'])
1035
- else:
1036
- settings_window['boost'].Update(disabled = not values['stretch_audio'])
1037
- settings_window['ad_detect_sensitivity'].Update(disabled = not values['stretch_audio'])
1038
- settings_window['boost_sensitivity'].Update(disabled = not values['stretch_audio'])
1039
- settings_window['keep_non_ad'].Update(disabled = not values['stretch_audio'])
1040
- settings_window['no_pitch_correction'].Update(disabled = not values['stretch_audio'])
1041
- if event == 'Save':
1042
- settings = values.copy()
1043
- del settings['output_browse']
1044
- del settings['alignment_browse']
1045
- write_config_file(config_path, settings)
1046
- break
1047
- settings_window.close()
1048
-
1049
- def combine_print_exceptions(print_queue, *args, **kwargs):
1050
- try:
1051
- combine(*args, **kwargs)
1052
- except:
1053
- print_queue.put(traceback.format_exc())
1054
- # raise
1055
-
1056
- def combine_gui(video_files, audio_files, config_path):
1057
- output_textbox = sg.Multiline(size=(80,30), key='-OUTPUT-')
1058
- layout = [[output_textbox],
1059
- [sg.Button('Close', pad=(360,5))]]
1060
- combine_window = sg.Window('Combining - describealign', layout, font=('Arial', 16),
1061
- disable_close=True, finalize=True)
1062
- output_textbox.update('Combining media files:', append=True)
1063
- print_queue = multiprocessing.Queue()
1064
-
1065
- settings = read_config_file(config_path)
1066
- settings.update({'display_func':print_queue.put, 'yes':True})
1067
- proc = multiprocessing.Process(target=combine_print_exceptions,
1068
- args=(print_queue, video_files, audio_files),
1069
- kwargs=settings, daemon=True)
1070
- proc.start()
1071
- while True:
1072
- # if the script isn't running anymore, re-enable the default close window button
1073
- if not proc.is_alive():
1074
- combine_window.DisableClose = False
1075
- if not print_queue.empty():
1076
- if IS_RUNNING_WINDOWS:
1077
- cursor_position = output_textbox.WxTextCtrl.GetInsertionPoint()
1078
- output_textbox.update('\n' + print_queue.get(), append=True)
1079
- if IS_RUNNING_WINDOWS:
1080
- output_textbox.WxTextCtrl.SetInsertionPoint(cursor_position)
1081
- event, values = combine_window.read(timeout=100)
1082
- # window closed event isn't always emitted, so also manually check window status
1083
- if event == sg.WIN_CLOSED or combine_window.TKrootDestroyed:
1084
- if proc.is_alive():
1085
- proc.terminate()
1086
- break
1087
- if event == 'Close':
1088
- if not proc.is_alive():
1089
- combine_window.DisableClose = False
1090
- break
1091
- selection = sg.PopupYesNo('Combiner is still running, stop it and close anyway?')
1092
- if selection != 'Yes':
1093
- continue
1094
- proc.terminate()
1095
- combine_window.DisableClose = False
1096
- break
1097
- combine_window.close()
1098
-
1099
- def main_gui():
1100
- config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini')
1101
- sg.theme('Light Blue 2')
1102
-
1103
- all_audio_file_types = [('All Audio File Types', '*.' + ';*.'.join(AUDIO_EXTENSIONS)),]
1104
- all_video_file_types = [('All Video File Types', '*.' + ';*.'.join(VIDEO_EXTENSIONS)),]
1105
- all_video_and_audio_file_types = [('All Video and Audio File Types',
1106
- '*.' + ';*.'.join(VIDEO_EXTENSIONS | AUDIO_EXTENSIONS)),]
1107
- audio_file_types = [(ext, "*." + ext) for ext in AUDIO_EXTENSIONS]
1108
- video_and_audio_file_types = [(ext, "*." + ext) for ext in VIDEO_EXTENSIONS] + audio_file_types
1109
- audio_file_types = all_audio_file_types + audio_file_types
1110
- video_and_audio_file_types = all_video_file_types + all_video_and_audio_file_types + video_and_audio_file_types
1111
- # work around bug in PySimpleGUIWx's convert_tkinter_filetypes_to_wx function
1112
- if IS_RUNNING_WINDOWS:
1113
- file_fix = lambda file_types: file_types[:1] + [('|' + type[0], type[1]) for type in file_types[1:]]
1114
- audio_file_types = file_fix(audio_file_types)
1115
- video_and_audio_file_types = file_fix(video_and_audio_file_types)
1116
-
1117
- layout = [[sg.Text('Select media files to combine:', size=(40, 2), font=('Arial', 20), pad=(3,15))],
1118
- [sg.Column([[sg.Text('Video Input:', size=(11, 2), pad=(1,5)),
1119
- sg.Input(size=(35, 1.2), pad=(10,5), key='-VIDEO_FILES-',
1120
- tooltip='List video filenames here, in order, separated by semicolons'),
1121
- sg.FilesBrowse(button_text="Browse Video",
1122
- file_types=video_and_audio_file_types,
1123
- tooltip='Select one or more video files')]], pad=(2,7))],
1124
- [sg.Column([[sg.Text('Audio Input:', size=(11, 2), pad=(1,5)),
1125
- sg.Input(size=(35, 1.2), pad=(10,5), key='-AUDIO_FILES-',
1126
- tooltip='List audio filenames here, in order, separated by semicolons'),
1127
- sg.FilesBrowse(button_text="Browse Audio",
1128
- file_types=audio_file_types,
1129
- tooltip='Select one or more audio files')]], pad=(2,7))],
1130
- [sg.Column([[sg.Submit('Combine', pad=(40,3), tooltip='Combine selected video and audio files'),
1131
- sg.Button('Settings', tooltip='Edit settings for the GUI and algorithm.')]],
1132
- pad=((135,3),10))]]
1133
- window = sg.Window('describealign', layout, font=('Arial', 16), resizable=False, finalize=True)
1134
- window['-VIDEO_FILES-'].set_focus()
1135
- while True:
1136
- event, values = window.read()
1137
- if event == 'Combine':
1138
- if len(values['-VIDEO_FILES-']) == 0 or \
1139
- len(values['-AUDIO_FILES-']) == 0:
1140
- window.disable()
1141
- sg.Popup('Error: empty input field.', font=('Arial', 20))
1142
- window.enable()
1143
- continue
1144
- video_files = values['-VIDEO_FILES-'].split(';')
1145
- audio_files = values['-AUDIO_FILES-'].split(';')
1146
- combine_gui(video_files, audio_files, config_path)
1147
- if event == 'Settings':
1148
- window.disable()
1149
- settings_gui(config_path)
1150
- window.enable()
1151
- if event == sg.WIN_CLOSED:
1152
- break
1153
- window.close()
1154
-
1155
- # Entry point for command line interaction, for example:
1156
- # > describealign video.mp4 audio_desc.mp3
1157
- def command_line_interface():
1158
- # override command line argument parser's error handler to make it pause before exiting
1159
- # this allows users to see the error message when accidentally not running from command line
1160
- class ArgumentParser(argparse.ArgumentParser):
1161
- def error(self, message):
1162
- if 'required: video, audio' in message:
1163
- print('No input arguments detected, starting GUI...')
1164
- main_gui()
1165
- self.exit()
1166
- else:
1167
- self.exit(2, f'{self.prog}: error: {message}\n')
1168
- parser = ArgumentParser(description="Replaces a video's sound with an audio description.",
1169
- usage="describealign video_file.mp4 audio_file.mp3")
1170
- parser.add_argument("video", help='A video file or directory containing video files.')
1171
- parser.add_argument("audio", help='An audio file or directory containing audio files.')
1172
- parser.add_argument('--smoothness', type=float, default=50,
1173
- help='Lower values make the alignment more accurate when there are skips ' + \
1174
- '(e.g. describer pauses), but also make it more likely to misalign. ' + \
1175
- 'Default is 50.')
1176
- parser.add_argument('--stretch_audio', action='store_true',
1177
- help='Stretches the input audio to fit the input video. ' + \
1178
- 'Default is to stretch the video to fit the audio.')
1179
- parser.add_argument('--keep_non_ad', action='store_true',
1180
- help='Tries to only replace segments with audio description. Useful if ' + \
1181
- 'video\'s audio quality is better. Default is to replace all aligned audio. ' + \
1182
- 'Requires --stretch_audio to be set, otherwise does nothing.')
1183
- parser.add_argument('--boost', type=float, default=0,
1184
- help='Boost (or quieten) description volume. Units are decibels (dB), so ' + \
1185
- '-3 makes the describer about 2x quieter, while 3 makes them 2x louder. ' + \
1186
- 'Requires --stretch_audio to be set, otherwise does nothing.')
1187
- parser.add_argument('--ad_detect_sensitivity', type=float, default=.6,
1188
- help='Audio description detection sensitivity ratio. Higher values make ' + \
1189
- '--keep_non_ad more likely to replace aligned audio. Default is 0.6')
1190
- parser.add_argument('--boost_sensitivity', type=float, default=.4,
1191
- help='Higher values make --boost less likely to miss a description, but ' + \
1192
- 'also make it more likely to boost non-description audio. Default is 0.4')
1193
- parser.add_argument('--yes', action='store_true',
1194
- help='Auto-skips user prompts asking to verify information.')
1195
- parser.add_argument("--prepend", default="ad_", help='Output file name prepend text. Default is "ad_"')
1196
- parser.add_argument('--no_pitch_correction', action='store_true',
1197
- help='Skips pitch correction step when stretching audio. ' + \
1198
- 'Requires --stretch_audio to be set, otherwise does nothing.')
1199
- parser.add_argument("--output_dir", default=default_output_dir,
1200
- help='Directory combined output media is saved to. Default is "videos_with_ad"')
1201
- parser.add_argument("--alignment_dir", default=default_alignment_dir,
1202
- help='Directory alignment data and plots are saved to. Default is "alignment_plots"')
1203
- parser.add_argument("--extension", default="copy",
1204
- help='File type of output video (e.g. mkv). When set to "copy", copies the ' + \
1205
- 'file type of the corresponding input video. Default is "copy".')
1206
- args = parser.parse_args()
1207
-
1208
- combine(args.video, args.audio, args.smoothness, args.stretch_audio, args.keep_non_ad,
1209
- args.boost, args.ad_detect_sensitivity, args.boost_sensitivity, args.yes,
1210
- args.prepend, args.no_pitch_correction, args.output_dir, args.alignment_dir,
1211
- args.extension)
1212
-
1213
- # allows the script to be run on its own, rather than through the package, for example:
1214
- # python3 describealign.py video.mp4 audio_desc.mp3
1215
- if __name__ == "__main__":
1216
- multiprocessing.freeze_support()
1217
- command_line_interface()
1218
-
1219
-
1220
-
1221
-
847
+ ensure_folders_exist([output_dir], display_func)
848
+ if PLOT_ALIGNMENT_TO_FILE:
849
+ ensure_folders_exist([alignment_dir], display_func)
850
+
851
+ display("", display_func)
852
+ for (video_file, audio_desc_file) in zip(video_files, audio_desc_files):
853
+ display(os.path.split(video_file)[1], display_func)
854
+ display(os.path.split(audio_desc_file)[1], display_func)
855
+ display("", display_func)
856
+ if not yes:
857
+ print("Are the above input file pairings correct?")
858
+ print("If not, press ctrl+c to kill this script.")
859
+ input("If they are correct, press Enter to continue...")
860
+ print("")
861
+
862
+ # if ffmpeg isn't installed, install it
863
+ if not is_ffmpeg_installed():
864
+ display("Downloading and installing ffmpeg (media editor, 50 MB download)...", display_func)
865
+ get_ffmpeg()
866
+ if not is_ffmpeg_installed():
867
+ RuntimeError("Failed to install ffmpeg.")
868
+ display("Successfully installed ffmpeg.", display_func)
869
+
870
+ display("Processing files:", display_func)
871
+
872
+ for (video_file, audio_desc_file, video_filetype) in zip(video_files, audio_desc_files,
873
+ video_file_types):
874
+ # Default is to use the input video's extension for the output video
875
+ if extension is None or extension in ["", "copy"]:
876
+ ext = os.path.splitext(video_file)[1]
877
+ else:
878
+ # add a dot to the extension if it's missing
879
+ ext = ('' if extension[0] == '.' else '.') + extension
880
+ output_filename = prepend + os.path.splitext(os.path.split(video_file)[1])[0] + ext
881
+ output_filename = os.path.join(output_dir, output_filename)
882
+ display(f" {output_filename}", display_func)
883
+
884
+ if os.path.exists(output_filename) and os.path.getsize(output_filename) > 0:
885
+ display(" output file already exists, skipping...", display_func)
886
+ continue
887
+
888
+ video_arr = parse_audio_from_file(video_file)
889
+ audio_desc_arr = parse_audio_from_file(audio_desc_file)
890
+ video_spec_raw, video_timings = tokenize_audio(video_arr)
891
+ video_spec = normalize_spec(video_spec_raw)
892
+ audio_desc_spec_raw, audio_desc_timings = tokenize_audio_dither(audio_desc_arr, video_timings)
893
+ audio_desc_spec = normalize_spec(audio_desc_spec_raw)
894
+
895
+ # rescale RMS intensity of audio to match video
896
+ audio_desc_arr *= (np.std(video_arr) / np.std(audio_desc_arr))
897
+
898
+ path, quals = rough_align(video_spec, audio_desc_spec, video_timings, audio_desc_timings)
899
+
900
+ smooth_path, runs, bad_clips, clips = smooth_align(path, quals, smoothness)
901
+
902
+ cap_synced_end_points(smooth_path, video_arr, audio_desc_arr)
903
+
904
+ ad_timings = None
905
+ if stretch_audio:
906
+ if keep_non_ad:
907
+ video_arr_original = video_arr.copy()
908
+
909
+ replace_aligned_segments(video_arr, audio_desc_arr, smooth_path, runs, no_pitch_correction)
910
+ del audio_desc_arr
911
+
912
+ if keep_non_ad or boost != 0:
913
+ outputs = detect_describer(video_arr, video_spec, video_spec_raw, video_timings,
914
+ smooth_path, ad_detect_sensitivity, boost_sensitivity)
915
+ speech_sample_mask, boost_sample_mask, ad_timings = outputs
916
+ if keep_non_ad:
917
+ video_arr *= speech_sample_mask
918
+ video_arr += video_arr_original * (1 - speech_sample_mask)
919
+ del video_arr_original
920
+ del speech_sample_mask
921
+ else:
922
+ ad_timings = None
923
+ if boost != 0:
924
+ video_arr = video_arr * (1. + (10**(boost / 10.) - 1.) * boost_sample_mask)
925
+ del boost_sample_mask
926
+
927
+ # prevent peaking by rescaling to within +/- 32,766 (just inside the int16 range)
928
+ video_arr *= (2**15 - 2.) / np.max(np.abs(video_arr))
929
+
930
+ if video_filetype == 0:
931
+ write_replaced_media_to_disk(output_filename, video_arr, video_file)
932
+ else:
933
+ write_replaced_media_to_disk(output_filename, video_arr)
934
+ else:
935
+ if video_filetype == 1:
936
+ raise RuntimeError("Argument --stretch_audio is required when both inputs are audio files.")
937
+ if os.path.splitext(output_filename)[1][1:] in AUDIO_EXTENSIONS:
938
+ raise RuntimeError("Argument --stretch_audio is required when output file extension is an audio filetype.")
939
+ video_offset = np.diff(smooth_path[clips[0][0]])[0]
940
+ start_key_frame = get_closest_key_frame_time(video_file, video_offset)
941
+ setts_cmd = encode_fit_as_ffmpeg_expr(smooth_path, clips, video_offset, start_key_frame)
942
+ write_replaced_media_to_disk(output_filename, None, video_file, audio_desc_file,
943
+ setts_cmd, start_key_frame)
944
+
945
+ del video_arr
946
+ if PLOT_ALIGNMENT_TO_FILE:
947
+ plot_filename_no_ext = os.path.join(alignment_dir, os.path.splitext(os.path.split(video_file)[1])[0])
948
+ plot_alignment(plot_filename_no_ext, path, smooth_path, quals, runs, bad_clips, ad_timings)
949
+ display("All files processed.", display_func)
950
+
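The loop above is the core of the `combine` entry point: each video/audio pair is decoded (`parse_audio_from_file`), tokenized into spectrograms (`tokenize_audio`), rough-aligned (`rough_align`), smoothed (`smooth_align`), and then written out with either the audio stretched onto the video or the video retimed to the audio. A minimal sketch of calling it programmatically, assuming `combine` accepts the same keyword arguments the GUI passes through (see `combine_gui` below) and that unlisted keywords keep the CLI defaults; the file paths are placeholders:

    # Sketch only: paths are hypothetical, and keyword defaults are assumed
    # to match the CLI defaults defined in command_line_interface below.
    import describealign

    describealign.combine(
        "movie.mkv",               # video file, or a directory of videos
        "movie_described.mp3",     # matching audio-description file or directory
        smoothness=50,             # same default as the --smoothness flag
        stretch_audio=True,        # stretch the audio onto the video
        yes=True,                  # skip the interactive pairing confirmation
    )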
951
+ def write_config_file(config_path, settings):
952
+ config = configparser.ConfigParser()
953
+ config.add_section('alignment')
954
+ config['alignment'] = {}
955
+ for key, value in settings.items():
956
+ config['alignment'][key] = str(value)
957
+ with open(config_path, 'w') as f:
958
+ config.write(f)
959
+
960
+ def read_config_file(config_path: Path):
961
+ config = configparser.ConfigParser()
962
+ config.read(config_path)
963
+ settings = {'smoothness': config.getfloat('alignment', 'smoothness', fallback=50),
964
+ 'stretch_audio': config.getboolean('alignment', 'stretch_audio', fallback=False),
965
+ 'keep_non_ad': config.getboolean('alignment', 'keep_non_ad', fallback=False),
966
+ 'boost': config.getfloat('alignment', 'boost', fallback=0),
967
+ 'ad_detect_sensitivity':config.getfloat('alignment', 'ad_detect_sensitivity', fallback=.6),
968
+ 'boost_sensitivity': config.getfloat('alignment', 'boost_sensitivity', fallback=.4),
969
+ 'prepend': config.get('alignment', 'prepend', fallback='ad_'),
970
+ 'no_pitch_correction': config.getboolean('alignment', 'no_pitch_correction', fallback=False),
971
+ 'output_dir': config.get('alignment', 'output_dir', fallback=default_output_dir),
972
+ 'alignment_dir': config.get('alignment', 'alignment_dir', fallback=default_alignment_dir),
973
+ 'extension': config.get('alignment', 'extension', fallback='copy')}
974
+ if not config.has_section('alignment'):
975
+ write_config_file(config_path, settings)
976
+ return settings
977
+
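These two helpers form the GUI's persistence layer: `read_config_file` supplies every setting through `fallback=` values and writes a fresh file whenever the `[alignment]` section is missing, so a first read doubles as initialization. A round-trip sketch with the two helpers above in scope (the temp path is illustrative only):

    # Round-trip sketch for write_config_file / read_config_file above.
    import tempfile
    from pathlib import Path

    demo_path = Path(tempfile.gettempdir()) / "describealign_demo.ini"
    settings = read_config_file(demo_path)   # file absent: fallbacks used, file created
    settings['smoothness'] = 25
    write_config_file(demo_path, settings)
    assert read_config_file(demo_path)['smoothness'] == 25.0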
978
+ def settings_gui(config_path: Path):
979
+ settings = read_config_file(config_path)
980
+ layout = [[sg.Text('Check tooltips (i.e. mouse-over text) for descriptions:')],
981
+ [sg.Column([[sg.Text('extension:', size=(10, 1.2), pad=(1,5)),
982
+ sg.Input(default_text=str(settings['extension']), size=(8, 1.2), pad=(10,5), key='extension',
983
+ tooltip='File type of output video (e.g. mkv). When set to "copy", copies the ' + \
984
+ 'file type of the corresponding input video. Default is "copy".')]])],
985
+ [sg.Column([[sg.Text('prepend:', size=(8, 1.2), pad=(1,5)),
986
+ sg.Input(default_text=str(settings['prepend']), size=(8, 1.2), pad=(10,5), key='prepend',
987
+ tooltip='Output file name prepend text. Default is "ad_"')]])],
988
+ [sg.Column([[sg.Text('output_dir:', size=(10, 1.2), pad=(1,5)),
989
+ sg.Input(default_text=str(settings['output_dir']), size=(22, 1.2), pad=(10,5), key='output_dir',
990
+ tooltip='Directory combined output media is saved to. Default is "videos_with_ad"'),
991
+ sg.FolderBrowse(button_text="Browse Folder", key='output_browse')]])],
992
+ [sg.Column([[sg.Text('alignment_dir:', size=(13, 1.2), pad=(1,5)),
993
+ sg.Input(default_text=str(settings['alignment_dir']), size=(22, 1.2), pad=(10,5), key='alignment_dir',
994
+ tooltip='Directory alignment data and plots are saved to. Default is "alignment_plots"'),
995
+ sg.FolderBrowse(button_text="Browse Folder", key='alignment_browse')]], pad=(2,7))],
996
+ [sg.Column([[sg.Text('smoothness:', size=(12, 1), pad=(1,5)),
997
+ sg.Input(default_text=str(settings['smoothness']), size=(8, 1.2), pad=(10,5), key='smoothness',
998
+ tooltip='Lower values make the alignment more accurate when there are skips ' + \
999
+ '(e.g. describer pauses), but also make it more likely to misalign. ' + \
1000
+ 'Default is 50.')]])],
1001
+ [sg.Checkbox('stretch_audio', default=settings['stretch_audio'], key='stretch_audio', change_submits=True,
1002
+ tooltip='Stretches the input audio to fit the input video. ' + \
1003
+ 'Default is to stretch the video to fit the audio.')],
1004
+ [sg.Checkbox('keep_non_ad', default=settings['keep_non_ad'], key='keep_non_ad',
1005
+ disabled=not settings['stretch_audio'],
1006
+ tooltip='Tries to only replace segments with audio description. Useful if ' + \
1007
+ 'video\'s audio quality is better. Default is to replace all aligned audio. ' + \
1008
+ 'Requires --stretch_audio to be set, otherwise does nothing.')],
1009
+ [sg.Column([[sg.Text('boost:', size=(6, 1), pad=(1,5)),
1010
+ sg.Input(default_text=str(settings['boost']), size=(8, 1.2), pad=(10,5),
1011
+ key='boost', disabled=not settings['stretch_audio'],
1012
+ tooltip='Boost (or quieten) description volume. Units are decibels (dB), so ' + \
1013
+ '-3 makes the describer about 2x quieter, while 3 makes them 2x louder. ' + \
1014
+ 'Requires --stretch_audio to be set, otherwise does nothing.')]])],
1015
+ [sg.Column([[sg.Text('ad_detect_sensitivity:', size=(21, 1.2), pad=(2,5)),
1016
+ sg.Input(default_text=str(settings['ad_detect_sensitivity']), size=(8, 1.2), pad=(10,5),
1017
+ key='ad_detect_sensitivity', disabled=not settings['stretch_audio'],
1018
+ tooltip='Audio description detection sensitivity ratio. Higher values make ' + \
1019
+ '--keep_non_ad more likely to replace aligned audio. Default is 0.6')]])],
1020
+ [sg.Column([[sg.Text('boost_sensitivity:', size=(17, 1.2), pad=(1,5)),
1021
+ sg.Input(default_text=str(settings['boost_sensitivity']), size=(8, 1.2), pad=(10,5),
1022
+ key='boost_sensitivity', disabled=not settings['stretch_audio'],
1023
+ tooltip='Higher values make --boost less likely to miss a description, but ' + \
1024
+ 'also make it more likely to boost non-description audio. Default is 0.4')]])],
1025
+ [sg.Checkbox('no_pitch_correction', default=settings['no_pitch_correction'], key='no_pitch_correction',
1026
+ disabled=not settings['stretch_audio'],
1027
+ tooltip='Skips pitch correction step when stretching audio. ' + \
1028
+ 'Requires --stretch_audio to be set, otherwise does nothing.')],
1029
+ [sg.Column([[sg.Submit('Save', pad=(40,3)),
1030
+ sg.Button('Cancel')]], pad=((135,3),10))]]
1031
+ settings_window = sg.Window('Settings - describealign', layout, font=('Arial', 16), finalize=True)
1032
+ settings_window['extension'].set_focus()
1033
+ while True:
1034
+ event, values = settings_window.read()
1035
+ if event in (sg.WIN_CLOSED, 'Cancel') or settings_window.TKrootDestroyed:
1036
+ break
1037
+ if event == 'stretch_audio':
1038
+ # work around bug in PySimpleGUIWx's InputText Update function where enabling/disabling are flipped
1039
+ if IS_RUNNING_WINDOWS:
1040
+ settings_window['boost'].Update(disabled = values['stretch_audio'])
1041
+ settings_window['ad_detect_sensitivity'].Update(disabled = values['stretch_audio'])
1042
+ settings_window['boost_sensitivity'].Update(disabled = values['stretch_audio'])
1043
+ else:
1044
+ settings_window['boost'].Update(disabled = not values['stretch_audio'])
1045
+ settings_window['ad_detect_sensitivity'].Update(disabled = not values['stretch_audio'])
1046
+ settings_window['boost_sensitivity'].Update(disabled = not values['stretch_audio'])
1047
+ settings_window['keep_non_ad'].Update(disabled = not values['stretch_audio'])
1048
+ settings_window['no_pitch_correction'].Update(disabled = not values['stretch_audio'])
1049
+ if event == 'Save':
1050
+ settings = values.copy()
1051
+ del settings['output_browse']
1052
+ del settings['alignment_browse']
1053
+ write_config_file(config_path, settings)
1054
+ break
1055
+ settings_window.close()
1056
+
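The window above follows the usual PySimpleGUI read-loop: `window.read()` blocks until an event, closing the window or pressing 'Cancel' breaks out, and 'Save' persists the `values` dict. The skeleton of that pattern, as a sketch against the PySimpleGUI API rather than describealign code:

    # Skeleton of the event loop used by settings_gui (sketch only).
    import PySimpleGUIQt as sg   # PySimpleGUIWx on Windows, as at the top of this module

    window = sg.Window('demo', [[sg.Input(key='field')],
                                [sg.Submit('Save'), sg.Button('Cancel')]])
    while True:
        event, values = window.read()          # blocks until a button press or close
        if event in (sg.WIN_CLOSED, 'Cancel'):
            break
        if event == 'Save':
            print('would persist:', values)    # settings_gui calls write_config_file here
            break
    window.close()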
1057
+ class QueueWriter(io.TextIOWrapper):
1058
+ def __init__(self, queue) -> None:
1059
+ super().__init__(buffer=io.BytesIO())
1060
+ self._queue = queue
1061
+
1062
+ def write(self, s: str) -> int:
1063
+ self._queue.put(s)
1064
+ return len(s)
1065
+
1066
+ def combine_print_exceptions(print_queue, *args, **kwargs):
1067
+ writer = QueueWriter(print_queue)
1068
+ with redirect_stdout(writer), redirect_stderr(writer):
1069
+ try:
1070
+ combine(*args, **kwargs)
1071
+ except Exception:
1072
+ traceback.print_exc()
1073
+
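`QueueWriter` combined with `redirect_stdout`/`redirect_stderr` is how the worker process streams its print output, including tracebacks, back to the GUI: every `write` call lands on a `multiprocessing.Queue` that the window loop drains. The same pattern in isolation, a sketch assuming the `QueueWriter` class above is defined at module level so child processes can import it:

    # Standalone sketch of the print-to-queue pattern used above.
    import multiprocessing
    from contextlib import redirect_stdout

    def worker(q):
        # Child process: everything print()ed goes onto the queue via QueueWriter.
        with redirect_stdout(QueueWriter(q)):
            print("hello from the worker")

    if __name__ == "__main__":
        q = multiprocessing.Queue()
        p = multiprocessing.Process(target=worker, args=(q,), daemon=True)
        p.start()
        p.join()
        while not q.empty():                   # the GUI loop below polls like this
            print("drained:", repr(q.get()))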
1074
+ def combine_gui(video_files, audio_files, config_path):
1075
+ output_textbox = sg.Multiline(size=(80,30), key='-OUTPUT-')
1076
+ layout = [[output_textbox],
1077
+ [sg.Button('Close', pad=(360,5))]]
1078
+ combine_window = sg.Window('Combining - describealign', layout, font=('Arial', 16),
1079
+ disable_close=True, finalize=True)
1080
+ output_textbox.update('Combining media files:', append=True)
1081
+ print_queue = multiprocessing.Queue()
1082
+
1083
+ settings = read_config_file(config_path)
1084
+ settings.update({'yes':True})
1085
+ proc = multiprocessing.Process(target=combine_print_exceptions,
1086
+ args=(print_queue, video_files, audio_files),
1087
+ kwargs=settings, daemon=True)
1088
+ proc.start()
1089
+ while True:
1090
+ # if the script isn't running anymore, re-enable the default close window button
1091
+ if not proc.is_alive():
1092
+ combine_window.DisableClose = False
1093
+ if not print_queue.empty():
1094
+ if IS_RUNNING_WINDOWS:
1095
+ cursor_position = output_textbox.WxTextCtrl.GetInsertionPoint()
1096
+ output_textbox.update(print_queue.get(), append=True)
1097
+ if IS_RUNNING_WINDOWS:
1098
+ output_textbox.WxTextCtrl.SetInsertionPoint(cursor_position)
1099
+ event, values = combine_window.read(timeout=100)
1100
+ # window closed event isn't always emitted, so also manually check window status
1101
+ if event == sg.WIN_CLOSED or combine_window.TKrootDestroyed:
1102
+ if proc.is_alive():
1103
+ proc.terminate()
1104
+ break
1105
+ if event == 'Close':
1106
+ if not proc.is_alive():
1107
+ combine_window.DisableClose = False
1108
+ break
1109
+ selection = sg.PopupYesNo('Combiner is still running, stop it and close anyway?')
1110
+ if selection != 'Yes':
1111
+ continue
1112
+ proc.terminate()
1113
+ combine_window.DisableClose = False
1114
+ break
1115
+ combine_window.close()
1116
+
1117
+ def migrate_config(old_path: Optional[Path], new_path: Path) -> None:
1118
+ """
1119
+ Migrate configuration from old location.
1120
+
1121
+ Only runs if the old_path exists but new_path does not.
1122
+ """
1123
+ if new_path.exists() or not old_path or not old_path.exists():
1124
+ return
1125
+
1126
+ old_data = old_path.read_text(encoding='utf-8')
1127
+ new_path.write_text(old_data, encoding='utf-8')
1128
+ print(f"Configuration migrated to {new_path}")
1129
+ try:
1130
+ old_path.unlink()
1131
+ except OSError as exc:
1132
+ print("Failed to remove old config:", *traceback.format_exception_only(exc))
1133
+ else:
1134
+ print("Successfully removed old config file.")
1135
+
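Because the copy only fires when the old file exists and the new one does not, repeated launches are idempotent. A hypothetical invocation; both paths here are placeholders, not real describealign locations:

    # Hypothetical invocation of migrate_config; paths are placeholders.
    from pathlib import Path

    old = Path.home() / ".describealign" / "config.ini"             # made-up legacy spot
    new = Path.home() / ".config" / "describealign" / "config.ini"  # made-up new spot
    new.parent.mkdir(parents=True, exist_ok=True)
    migrate_config(old, new)   # copies old -> new only if old exists and new does not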
1136
+ def main_gui():
1137
+ config_path = platformdirs.user_config_path(appname='describealign', appauthor=False, ensure_exists=True) / 'config.ini'
1138
+ old_paths = [
1139
+ # Place in chronological order (oldest -> newest)
1140
+ Path(__file__).resolve().parent / 'config.ini',
1141
+ platformdirs.user_config_path(appname='describealign', ensure_exists=True) / 'config.ini',
1142
+ ]
1143
+
1144
+ # Get newest existent path
1145
+ old_config = next(
1146
+ (
1147
+ file
1148
+ for file in reversed(old_paths)
1149
+ if file.exists()
1150
+ ),
1151
+ None,
1152
+ )
1153
+
1154
+ try:
1155
+ migrate_config(old_config, config_path)
1156
+ except OSError as exc:
1157
+ print(f"Error migrating old config:", *traceback.format_exception_only(exc))
1158
+ print(f"Old config left in place at {old_config}")
1159
+
1160
+ sg.theme('Light Blue 2')
1161
+
1162
+ filetype_sep = ';' if IS_RUNNING_WINDOWS else ' '
1163
+ all_audio_file_types = [('All Audio File Types', '*.' + f'{filetype_sep}*.'.join(AUDIO_EXTENSIONS)),]
1164
+ all_video_file_types = [('All Video File Types', '*.' + f'{filetype_sep}*.'.join(VIDEO_EXTENSIONS)),]
1165
+ all_video_and_audio_file_types = [('All Video and Audio File Types',
1166
+ '*.' + f'{filetype_sep}*.'.join(VIDEO_EXTENSIONS | AUDIO_EXTENSIONS)),]
1167
+ audio_file_types = [(ext, f"*.{ext}") for ext in AUDIO_EXTENSIONS]
1168
+ video_and_audio_file_types = [(ext, f"*.{ext}") for ext in VIDEO_EXTENSIONS] + audio_file_types
1169
+ audio_file_types = all_audio_file_types + audio_file_types
1170
+ video_and_audio_file_types = all_video_file_types + all_video_and_audio_file_types + video_and_audio_file_types
1171
+ # work around bug in PySimpleGUIWx's convert_tkinter_filetypes_to_wx function
1172
+ if IS_RUNNING_WINDOWS:
1173
+ file_fix = lambda file_types: file_types[:1] + [(f'|{type[0]}', type[1]) for type in file_types[1:]]
1174
+ audio_file_types = file_fix(audio_file_types)
1175
+ video_and_audio_file_types = file_fix(video_and_audio_file_types)
1176
+
1177
+ layout = [[sg.Text('Select media files to combine:', size=(40, 2), font=('Arial', 20), pad=(3,15))],
1178
+ [sg.Column([[sg.Text('Video Input:', size=(11, 2), pad=(1,5)),
1179
+ sg.Input(size=(35, 1.2), pad=(10,5), key='-VIDEO_FILES-',
1180
+ tooltip='List video filenames here, in order, separated by semicolons'),
1181
+ sg.FilesBrowse(button_text="Browse Video",
1182
+ file_types=video_and_audio_file_types,
1183
+ tooltip='Select one or more video files')]], pad=(2,7))],
1184
+ [sg.Column([[sg.Text('Audio Input:', size=(11, 2), pad=(1,5)),
1185
+ sg.Input(size=(35, 1.2), pad=(10,5), key='-AUDIO_FILES-',
1186
+ tooltip='List audio filenames here, in order, separated by semicolons'),
1187
+ sg.FilesBrowse(button_text="Browse Audio",
1188
+ file_types=audio_file_types,
1189
+ tooltip='Select one or more audio files')]], pad=(2,7))],
1190
+ [sg.Column([[sg.Submit('Combine', pad=(40,3), tooltip='Combine selected video and audio files'),
1191
+ sg.Button('Settings', tooltip='Edit settings for the GUI and algorithm.')]],
1192
+ pad=((135,3),10))]]
1193
+ window = sg.Window('describealign', layout, font=('Arial', 16), resizable=False, finalize=True)
1194
+ window['-VIDEO_FILES-'].set_focus()
1195
+ while True:
1196
+ event, values = window.read()
1197
+ if event == 'Combine':
1198
+ if len(values['-VIDEO_FILES-']) == 0 or \
1199
+ len(values['-AUDIO_FILES-']) == 0:
1200
+ window.disable()
1201
+ sg.Popup('Error: empty input field.', font=('Arial', 20))
1202
+ window.enable()
1203
+ continue
1204
+ video_files = values['-VIDEO_FILES-'].split(';')
1205
+ if len(video_files) == 1:
1206
+ video_files = video_files[0]
1207
+ audio_files = values['-AUDIO_FILES-'].split(';')
1208
+ if len(audio_files) == 1:
1209
+ audio_files = audio_files[0]
1210
+ combine_gui(video_files, audio_files, config_path)
1211
+ if event == 'Settings':
1212
+ window.disable()
1213
+ settings_gui(config_path)
1214
+ window.enable()
1215
+ if event == sg.WIN_CLOSED:
1216
+ break
1217
+ window.close()
1218
+
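A note on the filetype lists built at the top of `main_gui`: the joins produce the `(label, glob-pattern)` pairs the `sg.FilesBrowse` dialogs expect, with `;` separating globs on Windows and spaces elsewhere. A sketch of the resulting shape, using a shortened extension set:

    # Sketch of the (label, glob) tuples main_gui builds, with fewer extensions.
    exts = {'mp3', 'wav', 'flac'}
    sep = ';'    # Windows file dialogs want ';' between globs; Qt accepts spaces
    all_types = [('All Audio File Types', '*.' + f'{sep}*.'.join(exts))]
    per_type = [(ext, f"*.{ext}") for ext in sorted(exts)]
    print(all_types)  # e.g. [('All Audio File Types', '*.flac;*.mp3;*.wav')]
    print(per_type)   # [('flac', '*.flac'), ('mp3', '*.mp3'), ('wav', '*.wav')]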
1219
+ # Entry point for command line interaction, for example:
1220
+ # > describealign video.mp4 audio_desc.mp3
1221
+ def command_line_interface():
1222
+ if len(sys.argv) < 2:
1223
+ # No args, run gui
1224
+ print('No input arguments detected, starting GUI...')
1225
+ main_gui()
1226
+ sys.exit(0)
1227
+
1228
+ parser = argparse.ArgumentParser(
1229
+ description="Replaces a video's sound with an audio description.",
1230
+ usage="describealign video_file.mp4 audio_file.mp3")
1231
+ parser.add_argument("video", help='A video file or directory containing video files.', nargs='?', default=None)
1232
+ parser.add_argument("audio", help='An audio file or directory containing audio files.', nargs='?', default=None)
1233
+ parser.add_argument('--smoothness', type=float, default=50,
1234
+ help='Lower values make the alignment more accurate when there are skips ' + \
1235
+ '(e.g. describer pauses), but also make it more likely to misalign. ' + \
1236
+ 'Default is 50.')
1237
+ parser.add_argument('--stretch_audio', action='store_true',
1238
+ help='Stretches the input audio to fit the input video. ' + \
1239
+ 'Default is to stretch the video to fit the audio.')
1240
+ parser.add_argument('--keep_non_ad', action='store_true',
1241
+ help='Tries to only replace segments with audio description. Useful if ' + \
1242
+ 'video\'s audio quality is better. Default is to replace all aligned audio. ' + \
1243
+ 'Requires --stretch_audio to be set, otherwise does nothing.')
1244
+ parser.add_argument('--boost', type=float, default=0,
1245
+ help='Boost (or quieten) description volume. Units are decibels (dB), so ' + \
1246
+ '-3 makes the describer about 2x quieter, while 3 makes them 2x louder. ' + \
1247
+ 'Requires --stretch_audio to be set, otherwise does nothing.')
1248
+ parser.add_argument('--ad_detect_sensitivity', type=float, default=.6,
1249
+ help='Audio description detection sensitivity ratio. Higher values make ' + \
1250
+ '--keep_non_ad more likely to replace aligned audio. Default is 0.6')
1251
+ parser.add_argument('--boost_sensitivity', type=float, default=.4,
1252
+ help='Higher values make --boost less likely to miss a description, but ' + \
1253
+ 'also make it more likely to boost non-description audio. Default is 0.4')
1254
+ parser.add_argument('--yes', action='store_true',
1255
+ help='Auto-skips user prompts asking to verify information.')
1256
+ parser.add_argument("--prepend", default="ad_", help='Output file name prepend text. Default is "ad_"')
1257
+ parser.add_argument('--no_pitch_correction', action='store_true',
1258
+ help='Skips pitch correction step when stretching audio. ' + \
1259
+ 'Requires --stretch_audio to be set, otherwise does nothing.')
1260
+ parser.add_argument("--output_dir", default=default_output_dir,
1261
+ help='Directory combined output media is saved to. Default is "videos_with_ad"')
1262
+ parser.add_argument("--alignment_dir", default=default_alignment_dir,
1263
+ help='Directory alignment data and plots are saved to. Default is "alignment_plots"')
1264
+ parser.add_argument("--extension", default="copy",
1265
+ help='File type of output video (e.g. mkv). When set to "copy", copies the ' + \
1266
+ 'file type of the corresponding input video. Default is "copy".')
1267
+ parser.add_argument("--install-ffmpeg", action="store_true",
1268
+ help="Install the required ffmpeg binaries and then exit. This is meant to be" + \
1269
+ "run from a privileged installer process (e.g. OS X Installer)")
1270
+ args = parser.parse_args()
1271
+
1272
+ if args.install_ffmpeg:
1273
+ # Make sure the file is world executable
1274
+ os.chmod(get_ffmpeg(), 0o755)
1275
+ os.chmod(get_ffprobe(), 0o755)
1276
+ elif args.video or args.audio:
1277
+ combine(args.video, args.audio, args.smoothness, args.stretch_audio, args.keep_non_ad,
1278
+ args.boost, args.ad_detect_sensitivity, args.boost_sensitivity, args.yes,
1279
+ args.prepend, args.no_pitch_correction, args.output_dir, args.alignment_dir,
1280
+ args.extension)
1281
+ else:
1282
+ parser.print_usage()
1283
+
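The dispatch above relies on both positionals being declared with `nargs='?'`: an empty argv starts the GUI, `--install-ffmpeg` short-circuits to the binary setup, any supplied video/audio falls through to `combine`, and otherwise usage is printed. The same three-way dispatch in miniature (a sketch, not the shipped parser):

    # Miniature of the CLI dispatch pattern above (sketch only).
    import argparse

    parser = argparse.ArgumentParser(usage="demo [video] [audio] [--install-ffmpeg]")
    parser.add_argument("video", nargs='?', default=None)
    parser.add_argument("audio", nargs='?', default=None)
    parser.add_argument("--install-ffmpeg", action="store_true")
    args = parser.parse_args(["video.mp4", "audio_desc.mp3"])   # simulated argv

    if args.install_ffmpeg:
        print("would install ffmpeg and exit")
    elif args.video or args.audio:
        print("would combine:", args.video, args.audio)
    else:
        parser.print_usage()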
1284
+ # allows the script to be run on its own, rather than through the package, for example:
1285
+ # python3 describealign.py video.mp4 audio_desc.mp3
1286
+ if __name__ == "__main__":
1287
+ multiprocessing.freeze_support()
1288
+ command_line_interface()
1289
+
1290
+
1291
+
1292
+