describealign 1.2.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
describealign.py CHANGED
@@ -1,11 +1,9 @@
+ __version__ = '2.0.1'
+
  # combines videos with matching audio files (e.g. audio descriptions)
  # input: video or folder of videos and an audio file or folder of audio files
  # output: videos in a folder "videos_with_ad", with aligned segments of the audio replaced
  # this script aligns the new audio to the video using the video's old audio
- # first, the video's sound and the audio file are both converted to spectrograms
- # second, the two spectrograms are roughly aligned by finding their longest common subsequence
- # third, the rough alignment is denoised through L1-Minimization
- # fourth, the spectrogram alignments determine where the new audio replaces the old

  '''
  Copyright (C) 2023 Julian Brown
@@ -28,26 +26,14 @@ VIDEO_EXTENSIONS = set(['mp4', 'mkv', 'avi', 'mov', 'webm', 'm4v', 'flv', 'vob']
  AUDIO_EXTENSIONS = set(['mp3', 'm4a', 'opus', 'wav', 'aac', 'flac', 'ac3', 'mka'])
  PLOT_ALIGNMENT_TO_FILE = True

- TIMESTEP_SIZE_SECONDS = .16
- TIMESTEP_OVERLAP_RATIO = .5
+ TIMESTEPS_PER_SECOND = 10 # factors must be subset of (2, 3, 5, 7)
+ TIMESTEP_SIZE_SECONDS = 1. / TIMESTEPS_PER_SECOND
  AUDIO_SAMPLE_RATE = 44100
- MEL_COEFFS_PER_TIMESTEP = 25
- DITHER_PERIOD_STEPS = 60
- MIN_CORR_FOR_TOKEN_MATCH = .6
- GAP_START_COST = 1.0
- GAP_EXTEND_COST = -.01
- GAP_EXTEND_DIAG_BONUS = -.01
- SKIP_MATCH_COST = .1
+ DITHER_PERIOD_STEPS = 10
  MAX_RATE_RATIO_DIFF_ALIGN = .1
- PREF_CUT_AT_GAPS_FACTOR = 5
  MIN_DURATION_TO_REPLACE_SECONDS = 2
- MIN_START_END_SYNC_TIME_SECONDS = 2
- MAX_START_END_SYNC_ERR_SECONDS = .2
- MAX_RATE_RATIO_DIFF_BOOST = .003
- MIN_DESC_DURATION = .5
- MAX_GAP_IN_DESC_SEC = 1.5
  JUST_NOTICEABLE_DIFF_IN_FREQ_RATIO = .005
- CATCHUP_RATE = 5
+ MIN_STRETCH_OFFSET = 30

  if PLOT_ALIGNMENT_TO_FILE:
  import matplotlib.pyplot as plt
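With the new constants above, TIMESTEP_SIZE_SECONDS works out to 1/10 = 0.1 seconds, and 10 = 2 * 5 satisfies the comment's factor constraint, presumably so that sizes derived from it stay products of small primes. A minimal check of that constraint (an illustrative helper, not part of the package):

    def has_allowed_factors(n, allowed=(2, 3, 5, 7)):
        # divide out the allowed primes; n satisfies the constraint if nothing remains
        for p in allowed:
            while n % p == 0:
                n //= p
        return n == 1

    assert has_allowed_factors(10)      # 10 = 2 * 5, allowed
    assert not has_allowed_factors(11)  # 11 is prime, not allowed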
@@ -64,42 +50,41 @@ import numpy as np
  import ffmpeg
  import platformdirs
  import static_ffmpeg
- import python_speech_features as psf
  import scipy.signal
  import scipy.optimize
  import scipy.interpolate
- import scipy.ndimage as nd
  import scipy.sparse
- import pytsmod
  import configparser
  import traceback
  import multiprocessing
  import platform
+ import natsort
+ from collections import defaultdict
+ from sortedcontainers import SortedList
+ import hashlib
+
+ try:
+ import wx
+ gui_font = (11, wx.FONTFAMILY_SWISS, wx.FONTSTYLE_NORMAL, wx.FONTWEIGHT_NORMAL, False, "Arial")
+ except ImportError:
+ wx = None
+
+ gui_update_interval_ms = 100
+ gui_background_color_dark = (28, 30, 35)
+ gui_background_color_light = (170, 182, 211)

  IS_RUNNING_WINDOWS = platform.system() == 'Windows'
  if IS_RUNNING_WINDOWS:
- import PySimpleGUIWx as sg
  default_output_dir = 'videos_with_ad'
  default_alignment_dir = 'alignment_plots'
  else:
- import PySimpleGUIQt as sg
  default_output_dir = os.path.expanduser('~') + '/videos_with_ad'
  default_alignment_dir = os.path.expanduser('~') + '/alignment_plots'

- def display(text, func=None):
- if func:
- func(text)
- print(text)
-
- def throw_runtime_error(text, func=None):
- if func:
- func(text)
- raise RuntimeError(text)
-
- def ensure_folders_exist(dirs, display_func=None):
+ def ensure_folders_exist(dirs):
  for dir in dirs:
  if not os.path.isdir(dir):
- display(f"Directory not found, creating it: {dir}", display_func)
+ print(f"Directory not found, creating it: {dir}")
  os.makedirs(dir)

  def get_sorted_filenames(path, extensions, alt_extensions=set([])):
@@ -127,347 +112,59 @@ def get_sorted_filenames(path, extensions, alt_extensions=set([])):
  "Or maybe you need to add a new extension to this script's regex?",
  f"valid extensions for this input are:\n {extensions}"]
  raise RuntimeError("\n".join(error_msg))
- files = sorted(files)
- file_types = [0 if os.path.splitext(file)[1][1:] in extensions else 1 for file in files]
- return files, file_types
-
- # read audio from file with ffmpeg and convert to numpy array
- def parse_audio_from_file(media_file):
- media_stream, _ = (ffmpeg
- .input(media_file)
- .output('-', format='s16le', acodec='pcm_s16le', ac=2, ar=AUDIO_SAMPLE_RATE, loglevel='fatal')
- .run(capture_stdout=True, cmd=get_ffmpeg())
- )
- media_arr = np.frombuffer(media_stream, np.int16).astype(np.float32).reshape((-1,2)).T
- return media_arr
-
- # tokenize audio by transforming with a mel-frequency cepstrum (MFC)
- def tokenize_audio(media_arr, rate=1):
- step_size_samples = psf.sigproc.round_half_up(TIMESTEP_SIZE_SECONDS * rate * AUDIO_SAMPLE_RATE)
- window_size_seconds = TIMESTEP_SIZE_SECONDS / TIMESTEP_OVERLAP_RATIO
- window_size_samples = psf.sigproc.round_half_up(window_size_seconds * AUDIO_SAMPLE_RATE)
- fft_size_samples = 2**int(np.ceil(np.log2(window_size_samples)))
- get_mfcc = lambda arr: psf.mfcc(np.mean(arr, axis=0),
- samplerate=AUDIO_SAMPLE_RATE,
- winlen=window_size_seconds,
- winstep=TIMESTEP_SIZE_SECONDS * rate,
- numcep=MEL_COEFFS_PER_TIMESTEP,
- nfilt=MEL_COEFFS_PER_TIMESTEP * 2,
- nfft=fft_size_samples,
- winfunc=scipy.signal.windows.hann)
- num_timesteps = max(1, ((media_arr.shape[1] - window_size_samples - 1) // step_size_samples) + 2)
- media_spec = np.zeros((num_timesteps, MEL_COEFFS_PER_TIMESTEP))
- chunk_size = 1000
- for chunk_index in np.arange(0, num_timesteps, chunk_size):
- chunk_bounds_samples = ((chunk_index ) * step_size_samples,
- (chunk_index + chunk_size - 1) * step_size_samples + window_size_samples)
- media_spec[chunk_index:chunk_index+chunk_size] = get_mfcc(media_arr[:,slice(*chunk_bounds_samples)])
- '''
- # alternate python library's MFC implementation
- import librosa
- media_spec = librosa.feature.mfcc(y=np.mean(media_arr, axis=0),
- sr=AUDIO_SAMPLE_RATE,
- n_mfcc=MEL_COEFFS_PER_TIMESTEP,
- lifter=22,
- n_fft=fft_size_samples,
- hop_length=step_size_samples,
- win_length=window_size_samples,
- window=scipy.signal.windows.hann).T
- num_timesteps = media_spec.shape[0]
- '''
- timings_samples = window_size_samples/2. + step_size_samples * np.arange(num_timesteps)
- timings_seconds = timings_samples / AUDIO_SAMPLE_RATE
- return media_spec, timings_seconds
-
- # same as tokenize_audio, but dithering the MFC window timings
- # this allows for finer alignment by ameliorating discretization error
- def tokenize_audio_dither(media_arr, slow_timings):
- # choose a relative step size slightly less than 1 to ameliorate quantization error
- # maximize alignment accuracy by using least approximable number with desired period
- # this is the continued fraction [0;1,N-2,1,1,1,...], where the trailing ones give phi
- fast_rate = 1. / (1 + 1. / (DITHER_PERIOD_STEPS - 2 + (np.sqrt(5) + 1) / 2.))
- fast_spec, fast_timings = tokenize_audio(media_arr, fast_rate)
-
- # prevent drift in difficult to align segments (e.g. describer speaking or quiet/droning segments)
- # by approximately equalizing the number of tokens per unit time between dithered and undithered
- # the dithered audio will have ~(1 + 1 / DITHER_PERIOD_STEPS) times as many tokens, so
- # this can be accomplished by simply deleting a token every DITHER_PERIOD_STEPS tokens
- fast_spec = np.delete(fast_spec, slice(DITHER_PERIOD_STEPS // 2, None, DITHER_PERIOD_STEPS), axis=0)
- fast_timings = np.delete(fast_timings, slice(DITHER_PERIOD_STEPS // 2, None, DITHER_PERIOD_STEPS))
- return fast_spec, fast_timings
-
- # normalize along both time and frequency axes to allow comparing tokens by correlation
- def normalize_spec(media_spec_raw, axes=(0,1)):
- media_spec = media_spec_raw.copy()
- for axis in axes:
- norm_func = np.std if axis == 0 else np.linalg.norm
- media_spec = media_spec - np.mean(media_spec, axis=axis, keepdims=True)
- media_spec = media_spec/(norm_func(media_spec,axis=axis,keepdims=True)+1e-10)
- return media_spec
-
- # vectorized implementation of the Wagner–Fischer (Longest Common Subsequence) algorithm
- # modified to include affine gap penalties and skip+match options (i.e. knight's moves)
- # gaps are necessary when parts are cut out of the audio description (e.g. cut credits)
- # or when the audio description includes a commercial break or an extra scene
- # the skip+match option allows for micro-adjustments without eating the full gap penalty
- # skip+match is primarily useful in maintaining alignment when the rates differ slightly
- def rough_align(video_spec, audio_desc_spec, video_timings, audio_desc_timings):
- pred_map = {0:lambda node: (0, node[1]-1, node[2]-1),
- 1:lambda node: (0, node[1]-2, node[2]-1),
- 2:lambda node: (0, node[1]-1, node[2]-2),
- 3:lambda node: (1, node[1]-1, node[2]-1),
- 4:lambda node: (0, node[1] , node[2] ),
- 5:lambda node: (1, node[1]-1, node[2] ),
- 6:lambda node: (1, node[1]-1, node[2]-1),
- 7:lambda node: (1, node[1] , node[2]-1)}
- pred_matrix = np.zeros((2, audio_desc_spec.shape[0], video_spec.shape[0]), dtype=np.uint8)
- pred_matrix[0,1:,:2] = 0
- pred_matrix[1,1:,:2] = 4
- pred_matrix[:,0,:2] = [0,5]
- path_corrs_match = np.zeros((3, video_spec.shape[0]))
- path_corrs_gap = np.zeros((3, video_spec.shape[0]))
- corrs = np.zeros((3, video_spec.shape[0]))
- corrs[:,:] = np.roll(np.dot(video_spec, audio_desc_spec[0]), 1)[None,:]
- for i in range(audio_desc_spec.shape[0]):
- i_mod = i % 3
- match_pred_corrs = np.hstack([path_corrs_match[i_mod-1][1:-1][:,None],
- path_corrs_match[i_mod-2][1:-1][:,None] - SKIP_MATCH_COST,
- path_corrs_match[i_mod-1][0:-2][:,None] - SKIP_MATCH_COST,
- path_corrs_gap[ i_mod-1][1:-1][:,None]])
- pred_matrix[0][i][2:] = np.argmax(match_pred_corrs, axis=1)
- path_corrs_match[i_mod][2:] = np.take_along_axis(match_pred_corrs, pred_matrix[0][i][2:,None], axis=1).T
- corrs = np.roll(corrs, -1, axis=1)
- corrs[(i_mod+1)%3,:] = np.roll(np.dot(video_spec, audio_desc_spec[min(audio_desc_spec.shape[0]-1,i+1)]), 1)
- fisher_infos = (2 * corrs[i_mod] - corrs[i_mod-1] - corrs[(i_mod+1)%3]) / min(.2, TIMESTEP_SIZE_SECONDS)
- fisher_infos[fisher_infos < 0] = 0
- fisher_infos[fisher_infos > 10] = 10
- row_corrs = np.maximum(0, corrs[i_mod][2:] - MIN_CORR_FOR_TOKEN_MATCH)
- path_corrs_match[i_mod][2:] += row_corrs * (fisher_infos[2:] / 5)
- gap_pred_corrs = np.hstack([path_corrs_match[i_mod][2: ][:,None] - GAP_START_COST,
- path_corrs_gap[i_mod-1][2: ][:,None],
- path_corrs_gap[i_mod-1][1:-1][:,None] - GAP_EXTEND_DIAG_BONUS - \
- GAP_EXTEND_COST])
- pred_matrix[1][i][2:] = np.argmax(gap_pred_corrs, axis=1)
- path_corrs_gap_no_col_skip = np.take_along_axis(gap_pred_corrs, pred_matrix[1][i][2:,None], axis=1).flat
- pred_matrix[1][i][2:] += 4
- path_corrs_gap[i_mod][2:] = np.maximum.accumulate(path_corrs_gap_no_col_skip + \
- GAP_EXTEND_COST * np.arange(video_spec.shape[0]-2)) - \
- GAP_EXTEND_COST * np.arange(video_spec.shape[0]-2)
- pred_matrix[1][i][2:][path_corrs_gap[i_mod][2:] > path_corrs_gap_no_col_skip] = 7
- path_corrs_gap[i_mod][2:] -= GAP_EXTEND_COST
-
- # reconstruct optimal path by following predecessors backwards through the table
- end_node_layer = np.argmax([path_corrs_match[i_mod,-1],
- path_corrs_gap[ i_mod,-1]])
- cur_node = (end_node_layer, audio_desc_spec.shape[0]-1, video_spec.shape[0]-1)
- get_predecessor = lambda node: pred_map[pred_matrix[node]](node)
- path = []
- visited = set()
- while min(cur_node[1:]) >= 0:
- cur_node, last_node = get_predecessor(cur_node), cur_node
- # failsafe to prevent an infinite loop that should never happen anyways
- if cur_node in visited:
- break
- visited.add(cur_node)
- if last_node[0] == 0:
- path.append(last_node[1:])
- path = path[::-1]
-
- # determine how much information this node gives about the alignment
- # a larger double derivative means more precise timing information
- # sudden noises give more timing information than droning sounds
- def get_fisher_info(node):
- i,j = node
- if node[0] >= audio_desc_spec.shape[0]-1 or \
- node[1] >= video_spec.shape[0]-1 or \
- min(node) <= 0:
- return 0
- info = 2*np.dot(audio_desc_spec[i ],video_spec[j ]) - \
- np.dot(audio_desc_spec[i-1],video_spec[j+1]) - \
- np.dot(audio_desc_spec[i+1],video_spec[j-1])
- info /= min(.2, TIMESTEP_SIZE_SECONDS)
- return info
-
- # the quality of a node combines the correlation of its tokens
- # with how precisely the match is localized in time
- def get_match_quality(node):
- # correlations are between -1 and 1, as all tokens have unit norm
- token_correlation = np.dot(audio_desc_spec[node[0]],video_spec[node[1]])
- fisher_info = min(max(0, get_fisher_info(node)), 10)
- return max(0, token_correlation - MIN_CORR_FOR_TOKEN_MATCH) * (fisher_info / 5)
-
- # filter out low match quality nodes from LCS path
- quals = [get_match_quality(node) for node in path]
- if len(quals) == 0 or max(quals) <= 0:
- raise RuntimeError("Rough alignment failed, are the input files mismatched?")
- path, quals = zip(*[(path, qual) for (path, qual) in zip(path, quals) if qual > 0])
-
- # convert units of path nodes from timesteps to seconds
- path = [(audio_desc_timings[i], video_timings[j]) for (i,j) in path]
-
- return path, quals
+ files = natsort.os_sorted(files)
+ has_alt_extensions = [0 if os.path.splitext(file)[1][1:] in extensions else 1 for file in files]
+ return files, has_alt_extensions

- # chunk path segments of similar slope into clips
- # a clip has the form: (start_index, end_index)
- def chunk_path(smooth_path, tol):
- x,y = zip(*smooth_path)
- slopes = np.diff(y) / np.diff(x)
- median_slope = np.median(slopes)
- slope_changes = np.diff(slopes)
- breaks = np.where(np.abs(slope_changes) > tol)[0] + 1
- breaks = [0] + list(breaks) + [len(x)-1]
- clips = list(zip(breaks[:-1], breaks[1:]))
- return clips, median_slope, slopes
+ # ffmpeg command error handler
+ def run_ffmpeg_command(command, err_msg):
+ try:
+ return command.run(capture_stdout=True, capture_stderr=True, cmd=get_ffmpeg())
+ except ffmpeg.Error as e:
+ print(" ERROR: ffmpeg failed to " + err_msg)
+ print("FFmpeg error:")
+ print(e.stderr.decode('utf-8'))
+ raise
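The new run_ffmpeg_command wrapper centralizes error reporting: ffmpeg-python builds commands lazily, so nothing executes until .run() is called inside the wrapper, and any failure is printed together with ffmpeg's stderr before re-raising. A usage sketch (hypothetical filename, assuming the ffmpeg-python package imported above):

    # build a command lazily, then run it through the error handler
    decode_command = ffmpeg.input('movie.mkv').output('-', format='null')
    out, err = run_ffmpeg_command(decode_command, "decode input file: movie.mkv")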
 
- # find piece-wise linear alignment that minimizes the weighted combination of
- # total absolute error at each node and total absolute slope change of the fit
- # distance between nodes and the fit (i.e. errors) are weighted by node quality
- # absolute slope changes are differences between the slopes of adjacent fit lines
- # slope changes are weighted much more than node errors to smooth out noise
- # the main source of noise is rough alignment drift while the describer is speaking
- def smooth_align(path, quals, smoothness):
- # rotate basis to make vertical and horizontal slopes "cost" the same
- # the new horizontal axis is x+y and the new vertical is -x+y
- # Wagner–Fischer gives monotonically increasing nodes, so 0 <= slope < inf
- # after this transformation, we instead have -1 <= slope < 1
- # perfectly matching audio has pre-transformation slope = 1
- # after this transformation, it instead has slope = 0
- rotated_path = [(x+y,-x+y) for x,y in path]
-
- # stretch the x axis to make all slopes "cost" nearly the same
- # without this, small changes to the slope at slope = +/-1
- # cost sqrt(2) times as much as small changes at slope = 0
- # by stretching, we limit the range of slopes to within +/- 1/x_stretch_factor
- # the small angle approximation means these slopes all cost roughly the same
- x_stretch_factor = 10.
- rotated_stretched_path = [(x_stretch_factor*x,y) for x,y in rotated_path]
-
- # L1-Minimization to solve the alignment problem using a linear program
- # the absolute value functions needed for "absolute error" can be represented
- # in a linear program by splitting variables into positive and negative pieces
- # and constraining each to be positive (done by default in scipy's linprog)
- # x is fit_err_pos, fit_err_neg, slope_change_pos, slope_change_neg
- # fit_err[i] = path[i][1] - y_fit[i]
- # slope_change[i] = (y_fit[i+2] - y_fit[i+1])/(path[i+2][0] - path[i+1][0]) - \
- # (y_fit[i+1] - y_fit[i ])/(path[i+1][0] - path[i ][0])
- # this can be rewritten in terms of fit_err by re-arranging the 1st equation:
- # y_fit[i] = path[i][1] - fit_err[i]
- # this gives:
- # slope_change[i] = path_half[i] - fit_err_half[i]
- # where each half is just the original equation but y_fit is swapped out
- # the slope_change variables can then be set using equality constraints
- num_fit_points = len(rotated_stretched_path)
- x,y = [np.array(arr) for arr in zip(*rotated_stretched_path)]
- x_diffs = np.diff(x, prepend=[-10**10], append=[10**10])
- y_diffs = np.diff(y, prepend=[ 0 ], append=[ 0 ])
- slope_change_magnitudes = np.abs(np.diff(y_diffs/x_diffs)) * x_stretch_factor
- slope_change_locations = (slope_change_magnitudes > MAX_RATE_RATIO_DIFF_ALIGN)
- slope_change_locations[1:-1] *= (np.abs(y[2:] - y[:-2]) > 5)
- slope_change_costs = np.full(num_fit_points, smoothness / float(TIMESTEP_SIZE_SECONDS))
- slope_change_costs[slope_change_locations] /= PREF_CUT_AT_GAPS_FACTOR
- c = np.hstack([quals,
- quals,
- slope_change_costs * x_stretch_factor,
- slope_change_costs * x_stretch_factor])
- fit_err_coeffs = scipy.sparse.diags([ 1. / x_diffs[:-1],
- -1. / x_diffs[:-1] - 1. / x_diffs[1:],
- 1. / x_diffs[1:]],
- offsets=[0,1,2],
- shape=(num_fit_points, num_fit_points + 2)).tocsc()[:,1:-1]
- A_eq = scipy.sparse.hstack([ fit_err_coeffs,
- -fit_err_coeffs,
- scipy.sparse.eye(num_fit_points),
- -scipy.sparse.eye(num_fit_points)])
- b_eq = y_diffs[1: ] / x_diffs[1: ] - \
- y_diffs[ :-1] / x_diffs[ :-1]
- fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq, method='highs-ds')
- # if dual simplex solver encounters numerical problems, retry with interior point solver
- if not fit.success and fit.status == 4:
- fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq, method='highs-ipm')
- if not fit.success:
- print(fit)
- raise RuntimeError("Smooth Alignment L1-Min Optimization Failed!")
-
- # combine fit_err_pos and fit_err_neg
- fit_err = fit.x[:num_fit_points] - fit.x[num_fit_points:2*num_fit_points]
-
- # subtract fit errors from nodes to retrieve the smooth fit's coordinates
- # also, unstretch x axis and rotate basis back, reversing the affine pre-processing
- smooth_path = [(((x / x_stretch_factor) - y) / 2.,
- ((x / x_stretch_factor) + y) / 2.) for x,y in zip(x, y - fit_err)]
-
- # clip off start/end of replacement audio if it doesn't match or isn't aligned
- # without this, describer intro/outro skips can cause mismatches at the start/end
- # the problem would be localized and just means audio might not match video at the start/end
- # instead we just keep the original video's audio in those segments if mismatches are detected
- # if instead the first few or last few nodes are well-aligned, that edge is marked as synced
- # during audio replacement, synced edges will be extended backwards/forwards as far as possible
- # this is useful when the describer begins talking immediately (or before any alignable audio)
- # or when the describer continues speaking until the end (or no more alignable audio remains)
- # otherwise, the mismatch would result in the describer's voice not replacing audio in that part
- max_sync_err = MAX_START_END_SYNC_ERR_SECONDS
- smoothing_std = MIN_START_END_SYNC_TIME_SECONDS / (2. * TIMESTEP_SIZE_SECONDS)
- smoothed_fit_err = nd.gaussian_filter(np.abs(fit_err), sigma=smoothing_std)
- smooth_err_path = zip(smoothed_fit_err, smooth_path)
- old_length = num_fit_points
- smooth_err_path = list(itertools.dropwhile(lambda x: x[0] > max_sync_err, smooth_err_path))[::-1]
- is_synced_at_start = len(smooth_err_path) == old_length
- old_length = len(smooth_err_path)
- smooth_err_path = list(itertools.dropwhile(lambda x: x[0] > max_sync_err, smooth_err_path))[::-1]
- is_synced_at_end = len(smooth_err_path) == old_length
- _, smooth_path = zip(*smooth_err_path)
- smooth_path = list(smooth_path)
- if is_synced_at_start:
- slope = (smooth_path[1][1] - smooth_path[0][1]) / (smooth_path[1][0] - smooth_path[0][0])
- smooth_path.insert(0, (-10e10, -10e10 * slope))
- if is_synced_at_end:
- slope = (smooth_path[-1][1] - smooth_path[-2][1]) / (smooth_path[-1][0] - smooth_path[-2][0])
- smooth_path.append((10e10, 10e10 * slope))
-
- clips, median_slope, slopes = chunk_path(smooth_path, tol=1e-7)
-
- # assemble clips with slopes within the rate tolerance into runs
- runs, run = [], []
- bad_clips = []
- for clip in clips:
- if np.abs(median_slope-slopes[clip[0]]) > MAX_RATE_RATIO_DIFF_ALIGN:
- if len(run) > 0:
- runs.append(run)
- run = []
- bad_clips.append(clip)
- continue
- run.append(clip)
- if len(run) > 0:
- runs.append(run)
-
- return smooth_path, runs, bad_clips, clips
+ def run_async_ffmpeg_command(command, media_arr, err_msg):
+ try:
+ ffmpeg_caller = command.run_async(pipe_stdin=True, quiet=True, cmd=get_ffmpeg())
+ out, err = ffmpeg_caller.communicate(media_arr.astype(np.int16).T.tobytes())
+ if len(err) > 0:
+ print(" ERROR: ffmpeg failed to " + err_msg)
+ print("FFmpeg error:")
+ print(err.decode('utf-8'))
+ raise ChildProcessError('FFmpeg error.')
+ except ffmpeg.Error as e:
+ print(" ERROR: ffmpeg failed to " + err_msg)
+ print("FFmpeg error:")
+ print(e.stderr.decode('utf-8'))
+ raise

- # if the start or end were marked as synced during smooth alignment then
- # extend that alignment to the edge (i.e. to the start/end of the audio)
- def cap_synced_end_points(smooth_path, video_arr, audio_desc_arr):
- if smooth_path[0][0] < -10e9:
- slope = smooth_path[0][1] / smooth_path[0][0]
- new_start_point = (0, smooth_path[1][1] - smooth_path[1][0] * slope)
- if new_start_point[1] < 0:
- new_start_point = (smooth_path[1][0] - smooth_path[1][1] / slope, 0)
- smooth_path[0] = new_start_point
- if smooth_path[-1][0] > 10e9:
- video_runtime = (video_arr.shape[1] - 2.) / AUDIO_SAMPLE_RATE
- audio_runtime = (audio_desc_arr.shape[1] - 2.) / AUDIO_SAMPLE_RATE
- slope = smooth_path[-1][1] / smooth_path[-1][0]
- new_end_point = (audio_runtime, smooth_path[-2][1] + (audio_runtime - smooth_path[-2][0]) * slope)
- if new_end_point[1] > video_runtime:
- new_end_point = (smooth_path[-2][0] + (video_runtime - smooth_path[-2][1]) / slope, video_runtime)
- smooth_path[-1] = new_end_point
+ # read audio from file with ffmpeg and convert to numpy array
+ def parse_audio_from_file(media_file, num_channels=2):
+ # retrieve only the first audio track, injecting silence/trimming to force timestamps to match up
+ # for example, when the video starts before the audio this fills that starting gap with silence
+ ffmpeg_command = ffmpeg.input(media_file).output('-', format='s16le', acodec='pcm_s16le',
+ af='aresample=async=1:first_pts=0', map='0:a:0',
+ ac=num_channels, ar=AUDIO_SAMPLE_RATE, loglevel='error')
+ media_stream, _ = run_ffmpeg_command(ffmpeg_command, f"parse audio from input file: {media_file}")
+ # media_arr = np.frombuffer(media_stream, np.int16).astype(np.float32).reshape((-1, num_channels)).T
+ media_arr = np.frombuffer(media_stream, np.int16).astype(np.float16).reshape((-1, num_channels)).T
+ return media_arr
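The rewritten parse_audio_from_file pins the decoded track to the container timeline: aresample=async=1:first_pts=0 inserts silence or trims samples so the audio starts at timestamp zero, and the switch to float16 (the float32 version is left commented out) presumably halves memory on long inputs at some cost in precision. The arguments the builder produces can be inspected without running anything, as in this sketch with a hypothetical input name:

    import ffmpeg
    command = ffmpeg.input('movie.mkv').output('-', format='s16le', acodec='pcm_s16le',
                                               af='aresample=async=1:first_pts=0', map='0:a:0',
                                               ac=2, ar=44100, loglevel='error')
    print(' '.join(command.compile()))  # the ffmpeg argv this would execute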
 
- # visualize both the rough and smooth alignments
- def plot_alignment(plot_filename_no_ext, path, smooth_path, quals, runs,
- bad_clips, ad_timings, similarity_percent):
+ def plot_alignment(plot_filename_no_ext, path, audio_times, video_times, similarity_percent,
+ median_slope, stretch_audio, no_pitch_correction):
+ downsample = 20
+ path = path[::downsample]
+ video_times_full, audio_times_full, cluster_indices, quals, cum_quals = path.T
  scatter_color = [.2,.4,.8]
  lcs_rgba = np.zeros((len(quals),4))
  lcs_rgba[:,:3] = np.array(scatter_color)[None,:]
- lcs_rgba[:,3] = np.minimum(1, np.array(quals) * 500. / len(quals))
- audio_times, video_times = np.array(path).T.reshape((2,-1))
+ lcs_rgba[:,3] = np.clip(quals * 400. / len(quals), 0, 1)
+ audio_offsets = audio_times_full - video_times_full
+ plt.switch_backend('Agg')
+ plt.scatter(video_times_full / 60., audio_offsets, s=3, c=lcs_rgba, label='Matches')
  audio_offsets = audio_times - video_times
  def expand_limits(start, end, ratio=.01):
  average = (end + start) / 2.
@@ -475,63 +172,59 @@ def plot_alignment(plot_filename_no_ext, path, smooth_path, quals, runs,
  half_diff *= (1 + ratio)
  return (average - half_diff, average + half_diff)
  plt.xlim(expand_limits(*(0, np.max(video_times) / 60.)))
- plt.ylim(expand_limits(*(np.min(audio_offsets) - TIMESTEP_SIZE_SECONDS / 2.,
- np.max(audio_offsets) + TIMESTEP_SIZE_SECONDS / 2.)))
- plt.scatter(video_times / 60., audio_offsets, s=3, c=lcs_rgba, label='LCS Matches')
- audio_times, video_times = np.array(smooth_path).T.reshape((2,-1))
- audio_offsets = audio_times - video_times
- if ad_timings is None:
+ plt.ylim(expand_limits(*(np.min(audio_offsets) - 10 * TIMESTEP_SIZE_SECONDS,
+ np.max(audio_offsets) + 10 * TIMESTEP_SIZE_SECONDS), .05))
+ if stretch_audio:
  plt.plot(video_times / 60., audio_offsets, 'r-', lw=.5, label='Replaced Audio')
- bad_path = []
- for clip in bad_clips:
- bad_path.extend(smooth_path[clip[0]:clip[1]+1])
- bad_path.append((smooth_path[clip[1]][0] + 1e-10, np.nan))
- audio_times, video_times = np.array(bad_path).T.reshape((2,-1))
- audio_offsets = audio_times - video_times
- if len(audio_offsets) > 0:
- plt.plot(video_times / 60., audio_offsets, 'c-', lw=1, label='Original Audio')
+ audio_times_unreplaced = []
+ video_times_unreplaced = []
+ for i in range(len(video_times) - 1):
+ slope = (audio_times[i+1] - audio_times[i]) / (video_times[i+1] - video_times[i])
+ if abs(1 - slope) > MAX_RATE_RATIO_DIFF_ALIGN:
+ video_times_unreplaced.extend(video_times[i:i+2])
+ audio_times_unreplaced.extend(audio_times[i:i+2])
+ video_times_unreplaced.append(video_times[i+1])
+ audio_times_unreplaced.append(np.nan)
+ if len(video_times_unreplaced) > 0:
+ video_times_unreplaced = np.array(video_times_unreplaced)
+ audio_times_unreplaced = np.array(audio_times_unreplaced)
+ audio_offsets = audio_times_unreplaced - video_times_unreplaced
+ plt.plot(video_times_unreplaced / 60., audio_offsets, 'c-', lw=1, label='Original Audio')
  else:
- interp = scipy.interpolate.interp1d(video_times, audio_offsets,
- fill_value = np.inf,
- bounds_error = False, assume_sorted = True)
- plt.plot(video_times / 60., audio_offsets, 'c-', lw=.5, label='Original Audio')
- video_times = ad_timings
- audio_offsets = interp(ad_timings)
- if len(audio_offsets) > 0:
- plt.plot(video_times / 60., audio_offsets, 'r-', lw=1, label='Replaced Audio')
- plt.xlabel('Video Time (minutes)')
- plt.ylabel('Audio Description Offset (seconds)')
+ plt.plot(video_times / 60., audio_offsets, 'r-', lw=1, label='Combined Media')
+ plt.xlabel('Original Video Time (minutes)')
+ plt.ylabel('Original Audio Description Offset (seconds behind video)')
  plt.title(f"Alignment - Media Similarity {similarity_percent:.2f}%")
  plt.legend().legend_handles[0].set_color(scatter_color)
  plt.tight_layout()
  plt.savefig(plot_filename_no_ext + '.png', dpi=400)
  plt.clf()
-
  with open(plot_filename_no_ext + '.txt', 'w') as file:
- rough_clips, median_slope, _ = chunk_path(smooth_path, tol=2e-2)
- video_offset = np.diff(smooth_path[rough_clips[0][0]])[0]
+ parameters = {'stretch_audio':stretch_audio, 'no_pitch_correction':no_pitch_correction}
+ print(f"Parameters: {parameters}", file=file)
+ this_script_path = os.path.abspath(__file__)
+ print(f"Version Hash: {get_version_hash(this_script_path)}", file=file)
+ video_offset = video_times[0] - audio_times[0]
  print(f"Input file similarity: {similarity_percent:.2f}%", file=file)
  print("Main changes needed to video to align it to audio input:", file=file)
  print(f"Start Offset: {-video_offset:.2f} seconds", file=file)
  print(f"Median Rate Change: {(median_slope-1.)*100:.2f}%", file=file)
- for clip_start, clip_end in rough_clips:
- audio_desc_start, video_start = smooth_path[clip_start]
- audio_desc_end, video_end = smooth_path[clip_end]
- slope = (video_end - video_start) / (audio_desc_end - audio_desc_start)
+ for i in range(len(video_times) - 1):
+ slope = (video_times[i+1] - video_times[i]) / (audio_times[i+1] - audio_times[i])
  def str_from_time(seconds):
  minutes, seconds = divmod(seconds, 60)
  hours, minutes = divmod(minutes, 60)
- return f"{hours:2.0f}:{minutes:02.0f}:{seconds:05.2f}"
- print(f"Rate change of {(slope-1.)*100:6.1f}% from {str_from_time(video_start)} to " + \
- f"{str_from_time(video_end)} aligning with audio from " + \
- f"{str_from_time(audio_desc_start)} to {str_from_time(audio_desc_end)}", file=file)
+ return f"{hours:2.0f}:{minutes:02.0f}:{seconds:06.3f}"
+ print(f"Rate change of {(slope-1.)*100:8.1f}% from {str_from_time(video_times[i])} to " + \
+ f"{str_from_time(video_times[i+1])} aligning with audio from " + \
+ f"{str_from_time(audio_times[i])} to {str_from_time(audio_times[i+1])}", file=file)

  # use the smooth alignment to replace runs of video sound with corresponding described audio
- def replace_aligned_segments(video_arr, audio_desc_arr, smooth_path, runs, no_pitch_correction=False):
+ def replace_aligned_segments(video_arr, audio_desc_arr, audio_desc_times, video_times, no_pitch_correction):
  # perform quadratic interpolation of the audio description's waveform
  # this allows it to be stretched to match the corresponding video segment
  def audio_desc_arr_interp(samples):
- chunk_size = 10**7
+ chunk_size = 10**5
  interpolated_chunks = []
  for chunk in (samples[i:i+chunk_size] for i in range(0, len(samples), chunk_size)):
  interp_bounds = (max(int(chunk[0]-2), 0),
@@ -540,215 +233,196 @@ def replace_aligned_segments(video_arr, audio_desc_arr, smooth_path, runs, no_pi
  audio_desc_arr[:,slice(*interp_bounds)],
  copy=False, bounds_error=False, fill_value=0,
  kind='quadratic', assume_sorted=True)
- interpolated_chunks.append(interp(chunk).astype(np.float32))
+ interpolated_chunks.append(interp(chunk).astype(np.float16))
  return np.hstack(interpolated_chunks)

- # construct a stretched audio description waveform using the quadratic interpolator
- def get_interped_segment(run, interp):
- segment = []
- for clip in run:
- num_samples = int(y[clip[1]] * AUDIO_SAMPLE_RATE) - \
- int(y[clip[0]] * AUDIO_SAMPLE_RATE)
- clip_bounds = np.array((x[clip[0]], x[clip[1]])) * AUDIO_SAMPLE_RATE
- sample_points = np.linspace(*clip_bounds, num=num_samples, endpoint=False)
- segment.append(interp(sample_points))
- segment = np.hstack(segment)
- return segment
-
- # compress dual channel audio to mono for use in pitch corrected stretching
- # pytsmod's wsola treats channels separately, so without this it sounds weird
- if not no_pitch_correction:
- audio_desc_arr_mono = np.mean(audio_desc_arr, axis=0)
-
- x,y = zip(*smooth_path)
- for run in runs:
- run_length_seconds = y[run[-1][1]] - y[run[0][0]]
- if run_length_seconds < MIN_DURATION_TO_REPLACE_SECONDS:
- continue
- anchor_point_path_indices = [clip[0] for clip in run]
- anchor_point_path_indices.append(run[-1][1])
- anchor_points = (np.array((np.array(x)[anchor_point_path_indices],
- np.array(y)[anchor_point_path_indices])) * AUDIO_SAMPLE_RATE).astype(int)
- slopes = np.diff(anchor_points[1]) / np.diff(anchor_points[0])
- for clip_index, (clip, slope) in enumerate(zip(run, slopes)):
- # only apply pitch correction if the difference would be noticeable
- if no_pitch_correction or np.abs(1 - slope) <= JUST_NOTICEABLE_DIFF_IN_FREQ_RATIO:
- stretched_audio = get_interped_segment([clip], audio_desc_arr_interp)
+ # yields matrices of pearson correlations indexed by the first window's start and
+ # the second window's offset from the first window
+ # the output matrix is truncated to the valid square with positive offsets
+ # if negative=True, it is truncated to the valid square with negative offsets
+ # subsequent yields are the adjacent square following the previously yielded one
+ def get_pearson_corrs_generator(input, negative, jumps, window_size=512):
+ # processing the entire vector at once is faster, but uses too much memory
+ # instead, parse the input vector in pieces with a recursive call
+ max_cached_chunks = 50
+ cut = max_cached_chunks * window_size
+ if input.shape[1] > (max_cached_chunks + 2) * 1.1 * window_size:
+ is_first_iter = True
+ while True:
+ output_start = 0 if is_first_iter else 1
+ is_last_iter = (input.shape[1] <= (max_cached_chunks + 2) * 1.1 * window_size)
+ output_end = None if is_last_iter else max_cached_chunks
+ input_end = None if is_last_iter else (cut + window_size)
+ yield from itertools.islice(get_pearson_corrs_generator(input[:,:input_end], negative, jumps),
+ output_start, output_end)
+ if is_last_iter:
+ return
+ input = input[:,cut-window_size:]
+ is_first_iter = False
+ if input.shape[1] < 3 * window_size - 1:
+ raise RuntimeError("Invalid state in Pearson generator.")
+ pearson_corrs = np.zeros((len(jumps), input.shape[1] - window_size + 1)) - np.inf
+ # calculate dot products of pairs of windows (i.e. autocorrelation)
+ # avoids redundant calculations by substituting differences in the cumulative sum of products
+ self_corr = np.sum(input.astype(np.float32)**2, axis=0)
+ corr_cumsum = np.cumsum(self_corr, dtype=np.float64)
+ corr_cumsum[window_size:] -= corr_cumsum[:-window_size]
+ window_rms = corr_cumsum[window_size-1:]
+ epsilon = 1e-4 * max(1, np.max(window_rms))
+ window_rms = np.sqrt(window_rms + epsilon)
+ for jump_index, jump in enumerate(jumps):
+ autocorrelation = np.sum(input[:,jump:].astype(np.float32) * input[:,:input.shape[1]-jump], axis=0)
+ autocorr_cumsum = np.cumsum(autocorrelation, dtype=np.float64)
+ autocorr_cumsum[window_size:] -= autocorr_cumsum[:-window_size]
+ if negative:
+ pearson_corrs[jump_index, jump:] = autocorr_cumsum[window_size-1:] + epsilon
+ pearson_corrs[jump_index, jump:] /= window_rms[:len(window_rms)-jump]
  else:
- anchor_point_pair = anchor_points[:,clip_index:clip_index+2].copy()
- # account for quirks of pytsmod's wsola anchor point implementation
- anchor_point_pair[1][-1] -= 1
- anchor_y_offset = anchor_point_pair[1][0]
- anchor_point_pair[1,:] -= anchor_y_offset
- stretched_audio = pytsmod.wsola(audio_desc_arr_mono, anchor_point_pair)
- video_arr[:,slice(*anchor_points[1,clip_index:clip_index+2])] = stretched_audio
-
- # identify which segments of the replaced audio actually have the describer speaking
- # uses a Naive Bayes classifier smoothed with L1-Minimization to identify the describer
- def detect_describer(video_arr, video_spec, video_spec_raw, video_timings,
- smooth_path, detect_sensitivity, boost_sensitivity):
- # retokenize the audio description, which has been stretched to match the video
- audio_desc_spec_raw, audio_timings = tokenize_audio(video_arr)
- audio_desc_spec = normalize_spec(audio_desc_spec_raw)
-
- # avoid boosting or training on mismatched segments, like those close to skips
- # assumes matching segments all have the same, constant play rate
- # could be modified to handle a multi-modal distribution of rates
- aligned_audio_times, aligned_video_times = zip(*smooth_path)
- interp = scipy.interpolate.interp1d(aligned_video_times, aligned_audio_times,
- fill_value = 'extrapolate',
- bounds_error = False, assume_sorted = True)
- slopes = (interp(video_timings + 1e-5) - \
- interp(video_timings - 1e-5)) / 2e-5
- median_slope = np.median(slopes)
- aligned_mask = np.abs(slopes - median_slope) < MAX_RATE_RATIO_DIFF_ALIGN
- well_aligned_mask = np.abs(slopes - median_slope) < MAX_RATE_RATIO_DIFF_BOOST
-
- # first pass identification by assuming poorly matched tokens are describer speech
- # also assumes the describer doesn't speak very quietly
- corrs = np.sum(audio_desc_spec * video_spec, axis=-1)
- smooth_volume = nd.gaussian_filter(audio_desc_spec[:,0], sigma=1)
- audio_desc_loud = smooth_volume > np.percentile(smooth_volume, 30)
- speech_mask = (corrs < .2) * audio_desc_loud
-
- # normalize spectrogram coefficients along time axis to prep for conversion to PDFs
- audio_desc_spec = normalize_spec(audio_desc_spec_raw, axes=(0,))
- audio_desc_spec = np.clip(audio_desc_spec / 6., -1, 1)
- video_spec = normalize_spec(video_spec_raw, axes=(0,))
- video_spec = np.clip(video_spec / 6., -1, 1)
-
- # convert sampled features (e.g. spectrogram) to probability densities of each feature
- # when given a spectrogram, finds the distributions of the MFC coefficients
- def make_log_pdfs(arr):
- resolution = 100
- bins_per_spot = 4
- num_bins = int(resolution * bins_per_spot)
- uniform_prior_strength_per_spot = 1
- uniform_prior_strength_per_bin = uniform_prior_strength_per_spot / float(bins_per_spot)
- bin_range = (-1 - 1e-10, 1 + 1e-10)
- get_hist = lambda x: np.histogram(x, bins=num_bins, range=bin_range)[0]
- pdfs = np.apply_along_axis(get_hist, 1, arr.T)
- pdfs = pdfs + uniform_prior_strength_per_bin
- smooth = lambda x: nd.gaussian_filter(x, sigma=bins_per_spot)
- pdfs = np.apply_along_axis(smooth, 1, pdfs)
- pdfs = pdfs / np.sum(pdfs[0,:])
- log_pdfs = np.log(pdfs)
- bin_edges = np.histogram([], bins=num_bins, range=bin_range)[1]
- return log_pdfs, bin_edges
-
- diff_spec = audio_desc_spec - video_spec
- diff_spec = np.clip(diff_spec, -1, 1)
+ pearson_corrs[jump_index, :pearson_corrs.shape[1]-jump] = autocorr_cumsum[window_size-1:] + epsilon
+ pearson_corrs[jump_index, :pearson_corrs.shape[1]-jump] /= window_rms[jump:]
+ # divide by RMS of constituent windows to get Pearson correlations
+ pearson_corrs = pearson_corrs / window_rms[None,:]
+ pearson_corrs = pearson_corrs.T
+ for chunk_index in range(0, input.shape[1] // window_size):
+ yield pearson_corrs[chunk_index*window_size:(chunk_index+1)*window_size]
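The cumulative-sum substitution above is what keeps the windowed correlations cheap: the sum over any length-window_size window of products is the difference of two entries of one running total, so each jump distance costs O(n) instead of O(n * window_size). The same trick in isolation (an illustrative sketch, not the package's code):

    import numpy as np

    def windowed_sums(values, window_size):
        # cumsum[i] holds sum(values[:i+1]); differencing entries window_size
        # apart yields every contiguous window sum in a single pass
        cumsum = np.cumsum(values, dtype=np.float64)
        cumsum[window_size:] -= cumsum[:-window_size]
        return cumsum[window_size - 1:]

    x = np.arange(10, dtype=np.float64)
    print(windowed_sums(x, 3))  # [ 3.  6.  9. 12. 15. 18. 21. 24.]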
 
- # Naive Bayes classifier to roughly estimate whether each token is describer speech
- desc_log_pdfs, _ = make_log_pdfs(diff_spec[speech_mask * well_aligned_mask])
- nondesc_log_pdfs, bin_edges = make_log_pdfs(diff_spec[(~speech_mask) * well_aligned_mask])
- lratio_lookup = desc_log_pdfs - nondesc_log_pdfs
- lratios = lratio_lookup[np.fromfunction(lambda i,j: j, diff_spec.shape, dtype=int),
- np.digitize(diff_spec, bin_edges, right=True)-1]
- ratio_desc_to_nondesc = np.sum(speech_mask * well_aligned_mask) /\
- (np.sum((~speech_mask) * well_aligned_mask) + 1.)
- relative_probs = np.sum(lratios, axis=1)
- relative_probs /= np.std(relative_probs)
- relative_probs -= np.mean(relative_probs)
-
- # L1-Minimization to smoothly identify audio descriptions using a linear program
- # x is fit_err_pos, fit_err_neg, delta_fit_pos, delta_fit_neg
- # fit_err[i] = relative_probs[i] - y_fit[i]
- # delta_fit[i] = y_fit[i] - y_fit[i-1]
- # this can be rewritten in terms of fit_err by re-arranging the 1st equation:
- # y_fit[i] = relative_probs[i] - fit_err[i]
- # this gives:
- # delta_fit[i] = (relative_probs[i] - relative_probs[i-1]) -\
- # (fit_err[i] - fit_err[i-1])
- # the delta_fit variables can then be set using equality constraints
- num_fit_points = len(relative_probs)
- y_diffs = np.diff(relative_probs)
- pos_err_cost_factor = MIN_DESC_DURATION / float(TIMESTEP_SIZE_SECONDS)
- neg_err_cost_factor = MAX_GAP_IN_DESC_SEC / float(TIMESTEP_SIZE_SECONDS)
- c = np.hstack([np.ones(num_fit_points) / pos_err_cost_factor,
- np.ones(num_fit_points) / neg_err_cost_factor,
- np.ones(num_fit_points - 1) / 2.,
- np.ones(num_fit_points - 1) / 2.])
- fit_err_coeffs = scipy.sparse.diags([-np.ones(num_fit_points),
- np.ones(num_fit_points)],
- offsets=[0,1],
- shape=(num_fit_points - 1, num_fit_points)).tocsc()
- A_eq = scipy.sparse.hstack([ fit_err_coeffs,
- -fit_err_coeffs,
- scipy.sparse.eye(num_fit_points-1),
- -scipy.sparse.eye(num_fit_points-1)])
- b_eq = y_diffs
- fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq, method='highs-ds')
- # if dual simplex solver encounters numerical problems, retry with interior point solver
- if not fit.success and fit.status == 4:
- fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq, method='highs-ipm')
- if not fit.success:
- print(fit)
- raise RuntimeError("Describer Voice Detection L1-Min Optimization Failed!")
-
- # combine fit_err_pos and fit_err_neg
- fit_err = fit.x[:num_fit_points] - fit.x[num_fit_points:2*num_fit_points]
-
- # subtract fit errors from nodes to retrieve the smoothed fit
- smooth_desc_locations = relative_probs - fit_err
-
- # hard threshold to classify each token as describer speech or not
- speech_mask = smooth_desc_locations > 1. - 1.5 * detect_sensitivity
- speech_mask *= aligned_mask
-
- # a separate mask is created for describer volume boosting
- # as losing the describer's voice entirely is usually worse than it just being quiet
- # and imperfectly aligned segments may have descriptions, but shouldn't be boosted
- boost_mask = smooth_desc_locations > 1. - 1.5 * boost_sensitivity
- boost_mask *= well_aligned_mask
-
- # convert a token classification into a mask that can be applied directly to samples
- # unlike the input, the output isn't a boolean array but an array of floats
- def token_mask_to_sample_mask(token_mask):
- description_timings = video_timings[1:-1][token_mask[1:-1]]
- sample_mask = np.zeros(video_arr.shape[1], dtype=np.float32)
- window_radius = int(AUDIO_SAMPLE_RATE * TIMESTEP_SIZE_SECONDS)
- window_size_seconds = 2 * window_radius + 1
- bump = scipy.signal.windows.hann(window_size_seconds)
- for description_timing in description_timings:
- window_center = int(description_timing * AUDIO_SAMPLE_RATE)
- sample_mask[window_center-window_radius:window_center+window_radius+1] += bump
- return sample_mask
-
- speech_sample_mask = token_mask_to_sample_mask(speech_mask)
- boost_sample_mask = token_mask_to_sample_mask(boost_mask)
- ad_timings = video_timings.copy()
- ad_timings[~speech_mask] = np.inf
+ def stretch(input, output, window_size=512, max_drift=512*3):
+ drift_window_size = max_drift * 2 + 1
+ num_input_samples = input.shape[1]
+ num_output_samples = output.shape[1]
+ total_offset_samples = num_output_samples - num_input_samples
+ jumps = [506, 451, 284, 410, 480, 379, 308, 430, 265, 494]
+ # use all jumps when given unreachable or difficult to reach offsets (i.e. Frobenius coin problem)
+ # otherwise, skip most jumps to trade off a little performance for a lot of speed
+ if abs(total_offset_samples) < 10000:
+ if abs(total_offset_samples) > 1000:
+ jumps.extend([MIN_STRETCH_OFFSET + offset for offset in (2**np.arange(8))-1])
+ else:
+ jumps = range(MIN_STRETCH_OFFSET, window_size)
+ num_windows = (num_input_samples // window_size)
+ window_to_offset = lambda window_index: (total_offset_samples * \
+ min((num_windows - 1), max(0, window_index))) // (num_windows - 1)
+ # note the absolute value in the drift
+ # the following calculations also use the absolute value of the jumps
+ # their signs flip together, so this saves on casework in the code
+ # after the optimal route is determined, the sign of the jumps will be reintroduced
+ window_to_offset_diff = lambda window_index: abs(window_to_offset(window_index) - \
+ window_to_offset(window_index - 1))
+ backpointers = np.zeros((num_windows, drift_window_size), dtype=np.int16)
+ best_jump_locations = np.zeros((num_windows, len(jumps)), dtype=np.int16)
+ cum_loss = np.zeros((3, drift_window_size)) + np.inf
+ cum_loss[1:, max_drift] = 0
+ last_offset_diff = 0
+ # if the output needs to be longer than the input, we need to jump backwards in the input
+ pearson_corrs_generator = get_pearson_corrs_generator(input, (total_offset_samples > 0), jumps)
+ for window_index in range(num_windows):
+ corrs = next(pearson_corrs_generator)
+ # for each jump distance, determine the best input index in the window to make that jump
+ best_jump_locations[window_index] = np.argmax(corrs, axis=0)
+ best_jump_losses = 1 - corrs[best_jump_locations[window_index], np.arange(corrs.shape[1])]
+ offset_diff = window_to_offset_diff(window_index)
+ offset_diff2 = offset_diff + last_offset_diff
+ offset_jump_losses = np.zeros((len(jumps)+1, drift_window_size)) + np.inf
+ # consider not jumping at all, copying the loss from the corresponding offset one window back
+ offset_jump_slice = slice(None, offset_jump_losses.shape[1] - offset_diff)
+ offset_jump_losses[0,offset_jump_slice] = cum_loss[(window_index-1)%3,offset_diff:]
+ for jump_index, jump in enumerate(jumps):
+ truncation_amount = offset_diff2 - jump
+ offset_jump_slice = slice(jump, drift_window_size - max(0, truncation_amount))
+ cum_loss_slice = slice(offset_diff2, drift_window_size + min(0, truncation_amount))
+ # consider jumping the given distance from two windows back
+ # a window is skipped when jumping to prevent overlapping crossfades
+ offset_jump_losses[jump_index+1, offset_jump_slice] = cum_loss[(window_index-2)%3, cum_loss_slice] + \
+ best_jump_losses[jump_index]
+ best_jumps = np.argmin(offset_jump_losses, axis=0)
+ backpointers[window_index] = best_jumps
+ cum_loss[window_index%3] = offset_jump_losses[best_jumps, np.arange(offset_jump_losses.shape[1])]
+ last_offset_diff = offset_diff
+ drift = max_drift
+ best_jumps = []
+ skip_window = False
+ for window_index in range(num_windows - 1, -1, -1):
+ drift += window_to_offset_diff(window_index + 1)
+ if skip_window:
+ skip_window = False
+ continue
+ best_jump_index = backpointers[window_index, drift] - 1
+ if best_jump_index == -1:
+ continue
+ best_jump = jumps[best_jump_index]
+ jump_input_index = window_index * window_size + \
+ best_jump_locations[window_index, best_jump_index].item()
+ drift -= best_jump
+ skip_window = True
+ best_jumps.append((jump_input_index, best_jump))
+ best_jumps = best_jumps[::-1]
+ best_jumps = np.array(best_jumps)
+ # reintroduce the sign of the jump distances
+ # if the output is longer, use backwards jumps in the input to duplicate samples
+ # if the output is shorter, use forwards jumps in the input to remove samples
+ if total_offset_samples > 0:
+ best_jumps[:,1] *= -1
+ jump_input_indices = best_jumps[:,0]
+ jump_distances = best_jumps[:,1]
+ # calculate starts and ends of segments that will be copied from input to output
+ input_starts = np.concatenate(([0], jump_input_indices + jump_distances))
+ input_ends = np.concatenate((jump_input_indices, [input.shape[1]]))
+ chunk_lengths = input_ends - input_starts
+ output_ends = np.cumsum(chunk_lengths)
+ output_starts = np.concatenate(([0], output_ends[:-1]))
+ bump = scipy.signal.windows.hann(2 * window_size + 1)
+ bump_head = bump[:window_size]
+ bump_tail = bump[window_size:-1]
+ output[:,:window_size] = input[:,:window_size]
+ for in_start, in_end, out_start, out_end in zip(input_starts, input_ends, output_starts, output_ends):
+ output[:,out_start:out_start+window_size] *= bump_tail
+ output[:,out_start:out_start+window_size] += input[:,in_start:in_start+window_size] * bump_head
+ output[:,out_start+window_size:out_end+window_size] = input[:,in_start+window_size:in_end+window_size]

- return speech_sample_mask, boost_sample_mask, ad_timings
+ x = audio_desc_times
+ y = video_times
+ x_samples = (x * AUDIO_SAMPLE_RATE).astype(int)
+ y_samples = (y * AUDIO_SAMPLE_RATE).astype(int)
+ diff_x_samples = np.diff(x_samples)
+ diff_y_samples = np.diff(y_samples)
+ slopes = diff_x_samples / diff_y_samples
+ total_offset_samples = diff_y_samples - diff_x_samples
+ y_midpoint_samples = (y_samples[:-1] + y_samples[1:]) // 2
+ progress_update_interval = (video_arr.shape[1] // 100) + 1
+ last_progress_update = -1
+ for i in range(len(x) - 1):
+ if diff_y_samples[i] < (MIN_DURATION_TO_REPLACE_SECONDS * AUDIO_SAMPLE_RATE) or \
+ np.abs(1 - slopes[i]) > MAX_RATE_RATIO_DIFF_ALIGN:
+ continue
+ video_arr_slice = video_arr[:,slice(*y_samples[i:i+2])]
+ progress = int(y_midpoint_samples[i] // progress_update_interval)
+ if progress > last_progress_update:
+ last_progress_update = progress
+ print(f" stretching audio:{progress:3d}% \r", end='')
+ # only apply pitch correction if the difference would be noticeable
+ if no_pitch_correction or np.abs(1 - slopes[i]) <= JUST_NOTICEABLE_DIFF_IN_FREQ_RATIO or \
+ abs(total_offset_samples[i]) < MIN_STRETCH_OFFSET:
+ # construct a stretched audio description waveform using the quadratic interpolator
+ sample_points = np.linspace(*x_samples[i:i+2], num=diff_y_samples[i], endpoint=False)
+ video_arr_slice[:] = audio_desc_arr_interp(sample_points)
+ else:
+ stretch(audio_desc_arr[:,slice(*x_samples[i:i+2])], video_arr_slice)
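The splice loop at the end of stretch hides each jump under a crossfade: the tail already written is faded out by the second half of a Hann window while the incoming segment is faded in by the first half, and with hann(2 * window_size + 1) the two halves sum to exactly one at every sample, so levels are preserved across the seam. A standalone sketch of that blend (illustrative, single channel, not the package's code):

    import numpy as np
    import scipy.signal

    def crossfade(tail, head):
        # tail and head are equal-length 1-D arrays; the complementary Hann
        # halves sum to 1.0 everywhere, so the blend has no dip or click
        n = len(tail)
        bump = scipy.signal.windows.hann(2 * n + 1)
        return tail * bump[n:-1] + head * bump[:n]

    seam = crossfade(np.ones(512), np.zeros(512))  # decays smoothly from 1 toward 0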
 
  # Convert piece-wise linear fit to ffmpeg expression for editing video frame timestamps
- def encode_fit_as_ffmpeg_expr(smooth_path, clips, video_offset, start_key_frame):
+ def encode_fit_as_ffmpeg_expr(audio_desc_times, video_times, video_offset):
  # PTS is the input frame's presentation timestamp, which is when frames are displayed
  # TB is the timebase, which is how many seconds each unit of PTS corresponds to
  # the output value of the expression will be the frame's new PTS
  setts_cmd = ['TS']
- start_skip = max(0, video_offset - start_key_frame)
- if start_skip > 0:
- # lossless cutting can only happen at key frames, so we cut the video before the audio starts
- # but that means the video is behind the audio and needs to catch up by playing quicker
- # catchup_spread is the ratio of time to spend catching up to the amount of catching up needed
- catchup_spread = 1./CATCHUP_RATE
- setts_cmd.append(f'+clip(TS-{start_key_frame},0,{start_skip*(1+catchup_spread)}/TB)*{-1./(1+catchup_spread)}')
- elif video_offset < 0:
- # if the audio starts before the video, stretch the first frame of the video back to meet it
- setts_cmd.append(f'+clip(TS-{start_key_frame},0,{-video_offset/10000.}/TB)*10000')
  # each segment of the linear fit can be encoded as a single clip function
  setts_cmd.append('+(0')
- for clip_start, clip_end in clips:
- audio_desc_start, video_start = smooth_path[clip_start]
- audio_desc_end, video_end = smooth_path[clip_end]
- video_start -= start_key_frame
- video_end -= start_key_frame
- audio_desc_length = audio_desc_end - audio_desc_start
- video_length = video_end - video_start
- slope = audio_desc_length / video_length
- setts_cmd.append(f'+clip(TS-{start_key_frame}-{video_start:.4f}/TB,0,{max(0,video_length):.4f}/TB)*{slope-1:.9f}')
+ x = audio_desc_times
+ y = video_times
+ diff_x = np.diff(x)
+ diff_y = np.diff(y)
+ slopes = diff_x / diff_y
+ for i in range(len(audio_desc_times) - 1):
+ setts_cmd.append(f'+clip(TS-{y[i]-video_offset:.4f}/TB,0,{max(0,diff_y[i]):.4f}/TB)*{slopes[i]-1:.9f}')
  setts_cmd.append(')')
  setts_cmd = ''.join(setts_cmd)
  return setts_cmd
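For intuition, the expression this builds is TS plus a sum of clamped ramps, one per linear segment of the fit; with two segments it would look roughly like the following (illustrative values only):

    TS+(0+clip(TS-0.0000/TB,0,600.0000/TB)*-0.002000000+clip(TS-600.0000/TB,0,620.0000/TB)*0.001500000)

Each clip term is zero before its segment begins, grows linearly across it, and saturates afterwards, so the per-segment rate changes of slope-1 accumulate into a continuous piece-wise linear remapping of the timestamps.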
@@ -759,75 +433,65 @@ def get_ffmpeg():
  def get_ffprobe():
  return static_ffmpeg.run._get_or_fetch_platform_executables_else_raise_no_lock()[1]

+ def get_key_frame_data(video_file, time=None, entry='pts_time'):
+ interval = f'%+{max(60,time+40)}' if time != None else '%'
+ key_frames = ffmpeg.probe(video_file, cmd=get_ffprobe(), select_streams='V', show_frames=None,
+ skip_frame='nokey', read_intervals=interval,
+ show_entries='frame='+entry)['frames']
+ return np.array([float(frame[entry]) for frame in key_frames if entry in frame])
+
+ # finds the average timestamp of (i.e. midpoint between) the key frames on either side of input time
  def get_closest_key_frame_time(video_file, time):
- if time <= 0:
- return 0
- key_frames = ffmpeg.probe(video_file, cmd=get_ffprobe(), select_streams='V',
- show_frames=None, skip_frame='nokey')['frames']
- key_frame_times = np.array([float(frame['pts_time']) for frame in key_frames] + [0])
- return np.max(key_frame_times[key_frame_times <= time])
+ key_frame_times = get_key_frame_data(video_file, time)
+ key_frame_times = key_frame_times if len(key_frame_times) > 0 else np.array([0])
+ prev_key_frame_times = key_frame_times[key_frame_times <= time]
+ prev_key_frame = np.max(prev_key_frame_times) if len(prev_key_frame_times) > 0 else time
+ next_key_frame_times = key_frame_times[key_frame_times > time]
+ next_key_frame = np.min(next_key_frame_times) if len(next_key_frame_times) > 0 else time
+ return (prev_key_frame + next_key_frame) / 2.
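As a worked example of the midpoint rule: with key frames at 10.0 and 14.0 seconds and a requested time of 11.0 seconds, get_closest_key_frame_time now returns (10.0 + 14.0) / 2 = 12.0, whereas the old version snapped back to 10.0; when no key frame exists on one side, the requested time itself stands in for the missing neighbor.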
 
770
453
  # outputs a new media file with the replaced audio (which includes audio descriptions)
771
454
  def write_replaced_media_to_disk(output_filename, media_arr, video_file=None, audio_desc_file=None,
772
- setts_cmd=None, start_key_frame=None):
773
- if audio_desc_file is None:
774
- media_input = ffmpeg.input('pipe:', format='s16le', acodec='pcm_s16le',
775
- ac=2, ar=AUDIO_SAMPLE_RATE)
776
- if video_file is None or os.path.splitext(output_filename)[1][1:] in AUDIO_EXTENSIONS:
777
- write_command = ffmpeg.output(media_input, output_filename, loglevel='fatal').overwrite_output()
455
+ setts_cmd=None, video_offset=None, after_start_key_frame=None):
456
+ # if a media array is given, stretch_audio is enabled and media_arr holds the replacement audio to mux with the video
457
+ if media_arr is not None:
458
+ media_input = ffmpeg.input('pipe:', format='s16le', acodec='pcm_s16le', ac=2, ar=AUDIO_SAMPLE_RATE)
459
+ # if no video file is given, the input "video" was an audio file and the output should be too
460
+ if video_file is None:
461
+ write_command = ffmpeg.output(media_input, output_filename, loglevel='error').overwrite_output()
778
462
  else:
779
- original_video = ffmpeg.input(video_file)
463
+ original_video = ffmpeg.input(video_file, dn=None)
780
464
  # "-max_interleave_delta 0" is sometimes necessary to fix an .mkv bug that freezes audio/video:
781
465
  # ffmpeg bug warning: [matroska @ 0000000002c814c0] Starting new cluster due to timestamp
782
466
  # more info about the bug and fix: https://reddit.com/r/ffmpeg/comments/efddfs/
783
467
  write_command = ffmpeg.output(media_input, original_video, output_filename,
784
468
  acodec='copy', vcodec='copy', scodec='copy',
785
- max_interleave_delta='0', loglevel='fatal',
786
- **{"c:a:0": "aac", "disposition:a:0": "default"}).overwrite_output()
787
- ffmpeg_caller = write_command.run_async(pipe_stdin=True, cmd=get_ffmpeg())
788
- ffmpeg_caller.stdin.write(media_arr.astype(np.int16).T.tobytes())
789
- ffmpeg_caller.stdin.close()
790
- ffmpeg_caller.wait()
469
+ max_interleave_delta='0', loglevel='error',
470
+ **{"c:a:0": "aac", "disposition:a:1": "original",
471
+ "metadata:s:a:1": "title=original",
472
+ "disposition:a:0": "default+visual_impaired+descriptions",
473
+ "metadata:s:a:0": "title=AD"}).overwrite_output()
474
+ run_async_ffmpeg_command(write_command, media_arr, f"write output file: {output_filename}")
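# run_async_ffmpeg_command is presumably defined earlier in this module; a
# sketch of the piping it wraps, mirroring the code removed above: the stereo
# int16 samples are interleaved and streamed into ffmpeg's stdin.
proc = write_command.run_async(pipe_stdin=True, cmd=get_ffmpeg())
proc.stdin.write(media_arr.astype(np.int16).T.tobytes())
proc.stdin.close()
proc.wait()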
791
475
  else:
792
- media_input = ffmpeg.input(audio_desc_file)
793
- audio_desc_streams = ffmpeg.probe(audio_desc_file, cmd=get_ffprobe(), select_streams='a',
794
- show_entries='format=duration')['streams']
795
- audio_desc_duration = max([float(stream['duration']) for stream in audio_desc_streams])
796
- original_video = ffmpeg.input(video_file, an=None, ss=start_key_frame)
797
- if os.path.splitext(output_filename)[1] == os.path.splitext(video_file)[1]:
798
- # wav files don't have codecs compatible with most video containers, so we convert to aac
799
- audio_codec = 'copy' if os.path.splitext(audio_desc_file)[1] != '.wav' else 'aac'
800
- # flac audio may only have experimental support in some video containers (e.g. mp4)
801
- standards = 'normal' if os.path.splitext(audio_desc_file)[1] != '.flac' else 'experimental'
802
- write_command = ffmpeg.output(media_input, original_video, output_filename,
803
- acodec=audio_codec, vcodec='copy', scodec='copy',
804
- max_interleave_delta='0', loglevel='fatal', strict=standards,
805
- **{'bsf:v': f'setts=ts=\'{setts_cmd}\'',
806
- 'bsf:s': f'setts=ts=\'{setts_cmd}\''}).overwrite_output()
807
- write_command.run(cmd=get_ffmpeg())
808
- else:
809
- # work around for bug that sometimes breaks setts when output and input formats differ
810
- # the trick is separating the input and output by piping from one ffmpeg process into another
811
- # mkv files break if 'nut' is used, while other files break when 'matroska' is used
812
- format = 'matroska' if os.path.splitext(output_filename)[1] == '.mkv' else 'nut'
813
- write_command = ffmpeg.output(original_video, 'pipe:', format=format, vsync='passthrough',
814
- c='copy', loglevel='fatal')
815
- ffmpeg_caller = write_command.run_async(pipe_stdout=True, cmd=get_ffmpeg())
816
- pipe_input = ffmpeg.input('pipe:', format=format, thread_queue_size='512')
817
- write_command2 = ffmpeg.output(media_input, pipe_input, output_filename, c='copy',
818
- max_interleave_delta='0', loglevel='fatal', vsync='passthrough',
819
- **{'bsf:v': f'setts=ts=\'{setts_cmd}\'',
820
- 'bsf:s': f'setts=ts=\'{setts_cmd}\''}).overwrite_output()
821
- ffmpeg_caller2 = write_command2.run_async(pipe_stdin=True, cmd=get_ffmpeg())
822
- while True:
823
- in_bytes = ffmpeg_caller.stdout.read(100000)
824
- if not in_bytes:
825
- break
826
- ffmpeg_caller2.stdin.write(in_bytes)
827
- ffmpeg_caller2.stdin.close()
828
- ffmpeg_caller.wait()
829
- ffmpeg_caller2.wait()
830
-
476
+ start_offset = video_offset - after_start_key_frame
477
+ media_input = ffmpeg.input(audio_desc_file, itsoffset=f'{max(0, start_offset):.6f}')
478
+ original_video = ffmpeg.input(video_file, an=None, ss=f'{after_start_key_frame:.6f}',
479
+ itsoffset=f'{max(0, -start_offset):.6f}', dn=None)
480
+ # wav files don't have codecs compatible with most video containers, so we convert to aac
481
+ audio_codec = 'copy' if os.path.splitext(audio_desc_file)[1] != '.wav' else 'aac'
482
+ # flac audio may only have experimental support in some video containers (e.g. mp4)
483
+ standards = 'normal' if os.path.splitext(audio_desc_file)[1] != '.flac' else 'experimental'
484
+ # add frag_keyframe flag to prevent some players from ignoring audio/video start offsets
485
+ # set the video's pts and dts explicitly, since setts's combined ts= option does not update both the same way
486
+ write_command = ffmpeg.output(media_input, original_video, output_filename,
487
+ acodec=audio_codec, vcodec='copy', scodec='copy',
488
+ max_interleave_delta='0', loglevel='error',
489
+ strict=standards, movflags='frag_keyframe',
490
+ **{'bsf:v': f'setts=pts=\'{setts_cmd}\':dts=\'{setts_cmd}\'',
491
+ 'bsf:s': f'setts=ts=\'{setts_cmd}\'',
492
+ "disposition:a:0": "default+visual_impaired+descriptions",
493
+ "metadata:s:a:0": "title=AD"}).overwrite_output()
494
+ run_ffmpeg_command(write_command, f"write output file: {output_filename}")
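# The offset bookkeeping above with toy numbers: if the audio begins
# video_offset seconds into the video and the video is cut at the key-frame
# midpoint after_start_key_frame, exactly one input is delayed via itsoffset.
video_offset, after_start_key_frame = 12.80, 12.48
start_offset = video_offset - after_start_key_frame
audio_delay = max(0, start_offset)    # 0.32 s: delay the audio if the cut lands early
video_delay = max(0, -start_offset)   # 0.00 s: otherwise delay the trimmed video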
831
495
 
832
496
  # check whether static_ffmpeg has already installed ffmpeg and ffprobe
833
497
  def is_ffmpeg_installed():
@@ -835,15 +499,482 @@ def is_ffmpeg_installed():
835
499
  indicator_file = os.path.join(ffmpeg_dir, "installed.crumb")
836
500
  return os.path.exists(indicator_file)
837
501
 
502
+ def get_energy(arr):
503
+ # decimate by 105 (44100 -> 420 samples/s), smooth with a size-15 Hann window, then decimate by 2 -> 210 samples/s (~65 half-windows per second)
504
+ decimation = 105
505
+ decimation2 = 2
506
+ arr_clip = arr[:,:(arr.shape[1] - (arr.shape[1] % decimation))].reshape(arr.shape[0], -1, decimation)
507
+ energy = np.einsum('ijk,ijk->j', arr_clip, arr_clip, dtype=np.float32) / (decimation * arr.shape[0])
508
+ hann_window = scipy.signal.windows.hann(15)[1:-1].astype(np.float32)
509
+ hann_window /= np.sum(hann_window)
510
+ energy_smooth = np.convolve(energy, hann_window, mode='same')
511
+ energy_smooth = np.log10(1 + energy_smooth) / 2.
512
+ return energy_smooth[::decimation2]
513
+
514
+ def get_zero_crossings(arr):
515
+ xings = np.diff(np.signbit(arr), prepend=False, axis=-1)
516
+ xings_clip = xings[:,:(xings.shape[1] - (xings.shape[1] % 210))].reshape(xings.shape[0], -1, 210)
517
+ zero_crossings = np.sum(np.abs(xings_clip), axis=(0,2)).astype(np.float32)
518
+ if xings.shape[0] == 1:
519
+ zero_crossings *= 2
520
+ hann_window = scipy.signal.windows.hann(15)[1:-1].astype(np.float32)
521
+ hann_window = hann_window / np.sum(hann_window)
522
+ zero_crossings_smooth = np.convolve(zero_crossings, hann_window, mode='same')
523
+ return zero_crossings_smooth
524
+
525
+ def downsample_blur(arr, downsample, blur):
526
+ hann_window = scipy.signal.windows.hann(downsample*blur+2)[1:-1].astype(np.float32)
527
+ hann_window = hann_window / np.sum(hann_window)
528
+ arr = arr[:len(arr)-(len(arr)%downsample)]
529
+ return sum((np.convolve(arr[i::downsample], hann_window[i::downsample],
530
+ mode='same') for i in range(downsample)))
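# Usage sketch of downsample_blur: it is a decimating Hann smoother implemented
# one polyphase branch at a time, so the output has one sample per `downsample`
# input samples (values illustrative).
import numpy as np
sig = np.random.default_rng(1).random(420)
out = downsample_blur(sig, downsample=5, blur=3)
assert out.shape == (84,)   # floor(len(sig) / downsample)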
531
+
532
+ def get_freq_bands(arr):
533
+ arr = np.mean(arr, axis=0) if arr.shape[0] > 1 else arr[0]
534
+ arr = arr[:len(arr)-(len(arr)%210)]
535
+ downsamples = [5, 7, 6]
536
+ decimation = 1
537
+ freq_bands = []
538
+ for downsample in downsamples:
539
+ if downsample == downsamples[-1]:
540
+ band_bottom = np.array(0).reshape(1)
541
+ else:
542
+ band_bottom = downsample_blur(arr, downsample, 3)
543
+ decimation *= downsample
544
+ arr = arr.reshape(-1, downsample)
545
+ band_energy = sum(((arr[:,i] - band_bottom) ** 2 for i in range(downsample)))
546
+ freq_band = downsample_blur(band_energy, (210 // decimation), 15) / 210
547
+ freq_band = np.log10(1 + freq_band) / 2.
548
+ freq_bands.append(freq_band)
549
+ arr = band_bottom
550
+ return freq_bands
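# Sanity check on the rates used throughout these feature extractors: the band
# cascade decimates by 5, 7, then 6, matching get_energy's 210 samples/second.
assert 5 * 7 * 6 == 210
assert 44100 // 105 // 2 == 210   # AUDIO_SAMPLE_RATE -> feature sample rate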
551
+
552
+ def align(video_features, audio_desc_features, video_energy, audio_desc_energy):
553
+ samples_per_node = 210 // TIMESTEPS_PER_SECOND
554
+ hann_window_unnormed = scipy.signal.windows.hann(2*samples_per_node+1)[1:-1]
555
+ hann_window = hann_window_unnormed / np.sum(hann_window_unnormed)
556
+ get_mean = lambda arr: np.convolve(hann_window, arr, mode='same')
557
+ get_uniform_norm = lambda arr: np.convolve(np.ones(hann_window.shape), arr ** 2, mode='valid') ** .5
558
+ def get_uniform_norms(features):
559
+ return [np.clip(get_uniform_norm(feature), .001, None) for feature in features]
560
+
561
+ print(" memorizing video... \r", end='')
562
+ video_features_mean_sub = [feature - get_mean(feature) for feature in video_features]
563
+ audio_desc_features_mean_sub = [feature - get_mean(feature) for feature in audio_desc_features]
564
+ video_uniform_norms = get_uniform_norms(video_features_mean_sub)
565
+ audio_desc_uniform_norms = get_uniform_norms(audio_desc_features_mean_sub)
566
+
567
+ num_bins = 7
568
+ bin_spacing = 6
569
+ bins_width = (num_bins - 1) * bin_spacing + 1
570
+ bins_start = samples_per_node - 1 - (bins_width // 2)
571
+ bins_end = bins_start + bins_width
572
+ video_dicts = [defaultdict(set) for feature in video_features_mean_sub]
573
+ edges = np.array(np.meshgrid(*([np.arange(2)]*num_bins), indexing='ij')).reshape(num_bins,-1).T
574
+ bin_offsets = []
575
+ for edge in edges:
576
+ bin_offset = np.array(np.meshgrid(*[np.arange(x+1) for x in edge], indexing='ij'))
577
+ bin_offsets.append(np.dot(bin_offset.reshape(num_bins,-1)[::-1].T, 7**np.arange(num_bins)))
578
+
579
+ for video_dict, feature, norm in zip(video_dicts, video_features_mean_sub, video_uniform_norms):
580
+ bins = np.hstack([feature[bins_start+i:-bins_end+i+1, None] for i in bin_spacing * np.arange(num_bins)])
581
+ bins /= norm[:,None]
582
+ bins = 8 * bins + 3.3
583
+ np.clip(bins, 0, 6, out=bins)
584
+ bin_offset_indices = np.dot(((bins % 1) > .6), 2**np.arange(num_bins))
585
+ bins = np.dot(np.floor(bins).astype(int), 7**np.arange(num_bins)).tolist()
586
+ not_quiet = (video_energy[:-len(hann_window)] > .5)
587
+ for i in np.arange(len(video_energy) - len(hann_window))[not_quiet].tolist()[::4]:
588
+ bin = bins[i]
589
+ for bin_offset in bin_offsets[bin_offset_indices[i]].tolist():
590
+ video_dict[bin + bin_offset].add(i)
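# Illustration of the hashing above (feature values hypothetical): a window of
# 7 normalized feature samples is quantized to integer levels 0..6 and packed
# into one base-7 integer, so similar windows in video and audio share a bucket.
import numpy as np
window = np.array([-.31, -.12, .05, .22, .01, -.40, .33])        # mean-subtracted, norm-scaled
levels = np.clip(np.floor(8 * window + 3.5), 0, 6).astype(int)   # 7 levels per sample
key = int(np.dot(levels, 7 ** np.arange(7)))                     # base-7 packing
# the video side also inserts neighboring keys (bin_offsets) so that values
# near a quantization boundary still match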
591
+
592
+ print(" matching audio... \r", end='')
593
+ audio_desc_bins = []
594
+ audio_desc_bin_offset_indices = []
595
+ for feature, norm in zip(audio_desc_features_mean_sub, audio_desc_uniform_norms):
596
+ bins = np.hstack([feature[bins_start+i:-bins_end+i+1, None] for i in bin_spacing * np.arange(num_bins)])
597
+ bins /= norm[:,None]
598
+ bins = 8 * bins + 3.5
599
+ bins = np.floor(bins).astype(int)
600
+ np.clip(bins, 0, 6, out=bins)
601
+ audio_desc_bins.append(np.dot(bins, 7**np.arange(num_bins)).tolist())
602
+
603
+ def pairwise_intersection(set1, set2, set3):
604
+ return (set1 & set2).union((set1 & set3), (set2 & set3))
605
+ def triwise_intersection(set1, set2, set3, set4, set5):
606
+ set123 = pairwise_intersection(set1, set2, set3)
607
+ return (set123 & set4) | (set123 & set5)
608
+ best_so_far = SortedList(key=lambda x:x[0])
609
+ best_so_far.add((-1,-1,0))
610
+ backpointers = {}
611
+ not_quiet = (audio_desc_energy[:-len(hann_window)] > .5)
612
+ for i in np.arange(len(audio_desc_energy) - len(hann_window))[not_quiet].tolist():
613
+ match_sets = [video_dict[bins[i]] for bins, video_dict in zip(audio_desc_bins, video_dicts)]
614
+ common = triwise_intersection(*match_sets)
615
+ match_points = []
616
+ for video_index in common:
617
+ prob = 1
618
+ for j in range(3):
619
+ corr = np.dot(audio_desc_features_mean_sub[j][i:i+2*samples_per_node-1],
620
+ video_features_mean_sub[j][video_index:video_index+2*samples_per_node-1])
621
+ corr /= audio_desc_uniform_norms[j][i] * video_uniform_norms[j][video_index]
622
+ prob *= max(1e-8, (1 - corr)) # Naive Bayes probability
623
+ prob = prob ** 2.9 # empirically determined, ranges from 2.5-3.4
624
+ if prob > 1e-8:
625
+ continue
626
+ qual = min(50, (prob / 1e-12) ** (-1. / 3)) # remove Naive Bayes assumption
627
+ match_points.append((video_index, qual))
628
+ audio_desc_index = i
629
+ for video_index, qual in sorted(match_points):
630
+ cur_index = best_so_far.bisect_right((video_index,))
631
+ prev_video_index, prev_audio_desc_index, prev_cum_qual = best_so_far[cur_index-1]
632
+ cum_qual = prev_cum_qual + qual
633
+ while (cur_index < len(best_so_far)) and (best_so_far[cur_index][2] <= cum_qual):
634
+ del best_so_far[cur_index]
635
+ best_so_far.add((video_index, audio_desc_index, cum_qual))
636
+ backpointers[(video_index, audio_desc_index)] = (prev_video_index, prev_audio_desc_index)
637
+ del video_dicts
638
+ path = [best_so_far[-1][:2]]
639
+ while path[-1][:2] in backpointers:
640
+ # failsafe to prevent an infinite loop that should never happen anyway
641
+ if len(path) > 10**8:
642
+ raise RuntimeError("Infinite Loop Encountered!")
643
+ path.append(backpointers[path[-1][:2]])
644
+ path.pop()
645
+ path.reverse()
646
+ if len(path) < max(min(len(video_energy), len(audio_desc_energy)) / 500., 5 * 210):
647
+ raise RuntimeError("Alignment failed, are the input files mismatched?")
648
+ y, x = np.array(path).T
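# A minimal sketch of the monotone-matching DP above: a SortedList keyed on
# video index holds a Pareto frontier of (video_index, cumulative_quality), so
# each new match extends the best chain ending at or before its video index and
# evicts entries it dominates (match values illustrative).
from sortedcontainers import SortedList
matches = [(3, 1.0), (5, 2.0), (4, 1.5)]                # (video_index, quality), in audio order
frontier = SortedList([(-1, 0.0)], key=lambda m: m[0])  # sentinel before everything
for video_index, qual in matches:
    i = frontier.bisect_right((video_index,))
    cum_qual = frontier[i - 1][1] + qual
    while i < len(frontier) and frontier[i][1] <= cum_qual:
        del frontier[i]                                  # dominated: later index, no better total
    frontier.add((video_index, cum_qual))
print(frontier[-1])                                      # endpoint of the best chain: (5, 3.0)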
649
+
650
+ half_hann_window = hann_window[:samples_per_node-1] / np.sum(hann_window[:samples_per_node-1])
651
+ half_samples_per_node = samples_per_node // 2
652
+ fit_delay = samples_per_node + half_samples_per_node - 2
653
+ diff_by = lambda arr, offset=half_samples_per_node: arr[offset:] - arr[:-offset]
654
+ def get_continuity_err(x, y, deriv=False):
655
+ x_smooth_future = np.convolve(x, half_hann_window, mode='valid')
656
+ y_smooth_future = np.convolve(y, half_hann_window, mode='valid')
657
+ slopes_future = diff_by(y_smooth_future) / diff_by(x_smooth_future)
658
+ offsets_future = y_smooth_future[:-half_samples_per_node] - \
659
+ x_smooth_future[:-half_samples_per_node] * slopes_future
660
+ x_smooth_past = np.convolve(x, half_hann_window[::-1], mode='valid')
661
+ y_smooth_past = np.convolve(y, half_hann_window[::-1], mode='valid')
662
+ slopes_past = diff_by(y_smooth_past) / diff_by(x_smooth_past)
663
+ offsets_past = y_smooth_past[half_samples_per_node:] - \
664
+ x_smooth_past[half_samples_per_node:] * slopes_past
665
+ continuity_err = np.full(len(x) - (1 if deriv else 0), np.inf)
666
+ fit_delay_offset = fit_delay - (1 if deriv else 0)
667
+ continuity_err[:-fit_delay_offset] = np.abs(slopes_future * x[:-fit_delay] + \
668
+ offsets_future - y[:-fit_delay])
669
+ continuity_err[fit_delay_offset:] = np.minimum(continuity_err[fit_delay_offset:],
670
+ np.abs(slopes_past * x[fit_delay:] + \
671
+ offsets_past - y[fit_delay:]))
672
+ return continuity_err
673
+
674
+ print(" refining match: pass 1 of 2...\r", end='')
675
+ continuity_err = get_continuity_err(x, y)
676
+ errs = (continuity_err < 3)
677
+ x = x[errs]
678
+ y = y[errs]
679
+
680
+ audio_desc_features_scaled = []
681
+ video_features_scaled = []
682
+ for video_feature, audio_desc_feature in zip(video_features, audio_desc_features):
683
+ audio_desc_feature_std = np.std(audio_desc_feature)
684
+ scale_factor = np.linalg.lstsq(video_feature[y][:,None], audio_desc_feature[x], rcond=None)[0]
685
+ audio_desc_features_scaled.append(audio_desc_feature / audio_desc_feature_std)
686
+ video_features_scaled.append(video_feature * scale_factor / audio_desc_feature_std)
687
+ audio_desc_features_scaled = np.array(list(zip(*(audio_desc_features_scaled[:3]))))
688
+ video_features_scaled = np.array(list(zip(*(video_features_scaled[:3]))))
689
+
690
+ smooth_x = get_mean(x)
691
+ smooth_y = get_mean(y)
692
+ slopes = np.diff(smooth_y) / np.diff(smooth_x)
693
+ offsets = smooth_y[:-1] - smooth_x[:-1] * slopes
694
+ err_y = slopes * x[:-1] + offsets - y[:-1]
695
+ compressed_x, compressed_y = [], []
696
+ def extend_all(index, compress=False, num=70):
697
+ compressed_x.extend([np.mean(x[index:index+num])] if compress else x[index:index+num])
698
+ compressed_y.extend([np.mean(y[index:index+num])] if compress else y[index:index+num])
699
+ extend_all(0, num=10)
700
+ for i in range(10, len(x) - 80, 70):
701
+ extend_all(i, compress=np.all(np.abs(err_y[i:i+70]) < 3))
702
+ extend_all(i+70)
703
+
704
+ x = compressed_x
705
+ y = compressed_y
706
+
707
+ match_dict = defaultdict(list)
708
+ x_unique = [-1]
709
+ for audio_desc_index, video_index in zip(x, y):
710
+ match_dict[audio_desc_index].append(video_index)
711
+ if audio_desc_index != x_unique[-1]:
712
+ x_unique.append(audio_desc_index)
713
+ x = np.array(x_unique[1:])
714
+ y = np.array([np.mean(match_dict[audio_desc_index]) for audio_desc_index in x])
715
+
716
+ # L1-Minimization to solve the alignment problem using a linear program
717
+ # the absolute value functions needed for "absolute error" can be represented
718
+ # in a linear program by splitting variables into positive and negative pieces
719
+ # and constraining each to be positive (done by default in scipy's linprog)
720
+ num_fit_points = len(x)
721
+ x_diffs = np.diff(x)
722
+ y_diffs = np.diff(y)
723
+ jump_cost_base = 10.
724
+ jump_costs = np.full(num_fit_points - 1, jump_cost_base)
725
+ continuity_err = get_continuity_err(x, y, deriv=True)
726
+ jump_costs /= np.maximum(1, np.sqrt(continuity_err / 3.))
727
+ rate_change_jump_costs = np.full(num_fit_points - 1, .001)
728
+ rate_change_costs = np.full(num_fit_points - 2, jump_cost_base * 4000)
729
+ shot_noise_costs = np.full(num_fit_points, .01)
730
+ shot_noise_jump_costs = np.full(num_fit_points - 1, 3)
731
+ shot_noise_bound = 2.
732
+ c = np.hstack([np.ones(2 * num_fit_points),
733
+ jump_costs,
734
+ jump_costs,
735
+ shot_noise_costs,
736
+ shot_noise_costs,
737
+ shot_noise_jump_costs,
738
+ shot_noise_jump_costs,
739
+ rate_change_jump_costs,
740
+ rate_change_jump_costs,
741
+ rate_change_costs,
742
+ rate_change_costs,
743
+ [0,]])
744
+ fit_err_coeffs = scipy.sparse.diags([-1. / x_diffs,
745
+ 1. / x_diffs],
746
+ offsets=[0,1],
747
+ shape=(num_fit_points - 1, num_fit_points)).tocsc()
748
+ jump_coeffs = scipy.sparse.diags([ 1. / x_diffs],
749
+ offsets=[0],
750
+ shape=(num_fit_points - 1, num_fit_points - 1)).tocsc()
751
+ A_eq1 = scipy.sparse.hstack([ fit_err_coeffs,
752
+ -fit_err_coeffs,
753
+ jump_coeffs,
754
+ -jump_coeffs,
755
+ scipy.sparse.csc_matrix((num_fit_points - 1, 2 * num_fit_points)),
756
+ jump_coeffs,
757
+ -jump_coeffs,
758
+ jump_coeffs,
759
+ -jump_coeffs,
760
+ scipy.sparse.csc_matrix((num_fit_points - 1, 2 * num_fit_points - 4)),
761
+ np.ones((num_fit_points - 1, 1))])
762
+ A_eq2 = scipy.sparse.hstack([ scipy.sparse.csc_matrix((num_fit_points - 1, 4 * num_fit_points - 2)),
763
+ scipy.sparse.diags([-1., 1.], offsets=[0, 1],
764
+ shape=(num_fit_points - 1, num_fit_points)).tocsc(),
765
+ scipy.sparse.diags([1., -1.], offsets=[0, 1],
766
+ shape=(num_fit_points - 1, num_fit_points)).tocsc(),
767
+ -scipy.sparse.eye(num_fit_points - 1),
768
+ scipy.sparse.eye(num_fit_points - 1),
769
+ scipy.sparse.csc_matrix((num_fit_points - 1, 4 * num_fit_points - 6)),
770
+ scipy.sparse.csc_matrix((num_fit_points - 1, 1))])
771
+ slope_change_coeffs = scipy.sparse.diags([-1. / x_diffs[:-1],
772
+ 1. / x_diffs[1:]],
773
+ offsets=[0,1],
774
+ shape=(num_fit_points - 2, num_fit_points - 1)).tocsc()
775
+ A_eq3 = scipy.sparse.hstack([scipy.sparse.csc_matrix((num_fit_points - 2, 8 * num_fit_points - 4)),
776
+ slope_change_coeffs,
777
+ -slope_change_coeffs,
778
+ -scipy.sparse.eye(num_fit_points - 2),
779
+ scipy.sparse.eye(num_fit_points - 2),
780
+ scipy.sparse.csc_matrix((num_fit_points - 2, 1))])
781
+ A_eq = scipy.sparse.vstack([A_eq1, A_eq2, A_eq3])
782
+ b_eq = y_diffs / x_diffs
783
+ b_eq = np.hstack((b_eq, np.zeros(2 * num_fit_points - 3)))
784
+ bounds = [[0, None]] * (4 * num_fit_points - 2) + \
785
+ [[0, shot_noise_bound]] * (2 * num_fit_points) + \
786
+ [[0, None]] * (6 * num_fit_points - 8) + \
787
+ [[None, None]]
788
+ fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq, bounds=bounds, method='highs-ds')
789
+ # if dual simplex solver encounters numerical problems, retry with interior point solver
790
+ if not fit.success and fit.status == 4:
791
+ fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq, bounds=bounds, method='highs-ipm')
792
+ if not fit.success:
793
+ print(fit)
794
+ raise RuntimeError("Smooth Alignment L1-Min Optimization Failed!")
795
+
796
+ # combine positive and negative components of variables
797
+ fit_err = fit.x[ : num_fit_points ] - \
798
+ fit.x[ num_fit_points :2*num_fit_points ]
799
+ slope_jumps = fit.x[8*num_fit_points-4: 9*num_fit_points-5] - \
800
+ fit.x[9*num_fit_points-5:10*num_fit_points-6]
801
+ median_slope = fit.x[-1]
802
+ slopes = median_slope + (slope_jumps / x_diffs)
803
+
804
+ # subtract fit errors from nodes to retrieve the smooth fit's coordinates
805
+ smooth_path = [(x, y) for x,y in zip(x, y - fit_err)]
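# The split-variable trick used in the linear program above, on a toy least-
# absolute-deviation line fit: each residual r_i is written as r_i+ - r_i- with
# both parts nonnegative, so minimizing sum(r_i+ + r_i-) minimizes sum(|r_i|).
import numpy as np
import scipy.optimize, scipy.sparse
x_toy = np.array([0., 1., 2., 3.])
y_toy = np.array([0., 1.1, 1.9, 3.2])
n = len(x_toy)
c_toy = np.hstack([np.ones(2 * n), [0., 0.]])           # variables: r+, r-, slope, intercept
A_toy = scipy.sparse.hstack([scipy.sparse.eye(n), -scipy.sparse.eye(n),
                             x_toy[:, None], np.ones((n, 1))])
res = scipy.optimize.linprog(c_toy, A_eq=A_toy, b_eq=y_toy,
                             bounds=[(0, None)] * (2 * n) + [(None, None)] * 2,
                             method='highs')
slope_toy, intercept_toy = res.x[-2:]                   # L1-optimal line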
806
+
807
+ print(" refining match: pass 2 of 2...\r", end='')
808
+ slopes_plus_ends = np.hstack((slopes[:1], slopes, slopes[-1:]))
809
+ extensions = []
810
+ extend_radius = 210 * 30 # +/- 30 seconds
811
+ video_interp = scipy.interpolate.make_interp_spline(np.arange(len(video_features_scaled)),
812
+ video_features_scaled, k=1)
813
+ colinear_dict = defaultdict(list)
814
+ for i, (x, y) in enumerate(smooth_path):
815
+ for slope in slopes_plus_ends[i:i+2]:
816
+ if (slope < .1) or (slope > 10):
817
+ continue
818
+ offset = y - slope * x
819
+ colinear_dict[(round(slope, 6), int(round(offset, 0)))].append((x, y))
820
+ line_clusters = []
821
+ added_keys = set()
822
+ for (slope, offset), indices in sorted(colinear_dict.items(), key=lambda x: -len(x[1])):
823
+ if (slope, offset) in added_keys:
824
+ continue
825
+ line_clusters.append(indices)
826
+ added_keys.add((slope, offset))
827
+ del colinear_dict[(slope, offset)]
828
+ for (slope2, offset2), indices2 in list(colinear_dict.items()):
829
+ if (abs(indices2[ 0][1] - (indices2[ 0][0] * slope + offset)) < 3) and \
830
+ (abs(indices2[-1][1] - (indices2[-1][0] * slope + offset)) < 3):
831
+ line_clusters[-1].extend(colinear_dict[(slope2, offset2)])
832
+ added_keys.add((slope2, offset2))
833
+ del colinear_dict[(slope2, offset2)]
834
+ line_clusters = [sorted(cluster) for cluster in line_clusters]
835
+ line_clusters = [x for x in line_clusters if (abs(x[0][0] - x[-1][0]) > 10) and len(x) > 5]
836
+
837
+ for i, cluster in enumerate(line_clusters):
838
+ x, y = np.array(cluster).T
839
+ linear_fit = np.linalg.lstsq(np.hstack((np.ones((len(x), 1)), x[:, None])), y, rcond=None)[0]
840
+ line_clusters[i] = (x, linear_fit[0], linear_fit[1])
841
+
842
+ def get_x_limits(x, offset, slope, extend_horiz=extend_radius, buffer_vert=4):
843
+ limits = (max(int(x[0]) - extend_horiz, 0),
844
+ min(int(x[-1]) + extend_horiz, len(audio_desc_features_scaled) - 1))
845
+ limits = (max(limits[0], int(np.ceil((buffer_vert - offset) / slope))),
846
+ min(limits[1], int(np.floor((len(video_features_scaled) - buffer_vert - offset) / slope))))
847
+ return limits
848
+ def get_audio_video_matches(limits, slope, offset):
849
+ x = np.arange(*limits)
850
+ y = slope * x + offset
851
+ audio_match = audio_desc_features_scaled[slice(*limits)]
852
+ video_match = video_interp(y)
853
+ return x, y, audio_match, video_match
854
+
855
+ audio_desc_max_energy = np.max(audio_desc_features_scaled[:,0])
856
+ video_max_energy = np.max(video_features_scaled[:,0])
857
+ points = [[] for i in range(len(audio_desc_features_scaled))]
858
+ seen_points = set()
859
+ for cluster_index, (x, offset, slope) in enumerate(line_clusters):
860
+ limits = get_x_limits(x, offset, slope, extend_horiz=0)
861
+ if limits[1] < limits[0] + 5:
862
+ continue
863
+ if limits[1] > limits[0] + 100:
864
+ x, y, audio_match, video_match = get_audio_video_matches(limits, slope, offset)
865
+ video_match_err = audio_match[1:-1] - video_match[1:-1]
866
+ valid_matches = np.mean(video_match_err, axis=-1) < 0.1
867
+ if np.count_nonzero(valid_matches) > 50:
868
+ video_match_diff = (video_match[2:] - video_match[:-2]) / 2.
869
+ video_match_err = video_match_err[valid_matches]
870
+ video_match_diff = video_match_diff[valid_matches]
871
+ x_valid = x[1:-1][valid_matches][:,None]
872
+ A = video_match_diff.reshape(-1,1)
873
+ linear_fit, residual, _, _ = np.linalg.lstsq(A, video_match_err.flat, rcond=None)
874
+ explained_err_ratio = 1 - (residual / np.sum(video_match_err ** 2))
875
+ stds_above_noise_mean = np.sqrt(explained_err_ratio * np.prod(video_match_err.shape)) - 1.
876
+ if stds_above_noise_mean > 8 and abs(linear_fit[0]) < 2:
877
+ offset += linear_fit[0]
878
+ limits = get_x_limits(x, offset, slope)
879
+ x, y, audio_match, video_match = get_audio_video_matches(limits, slope, offset)
880
+ quals = np.sum(-.5 - np.log10(1e-4 + np.abs(audio_match - video_match)), axis=1)
881
+ quals *= np.clip(video_match[:,0] + 2.5 - video_max_energy, 0, 1)
882
+ quals += np.clip(audio_match[:,0] + 2.5 - audio_desc_max_energy, 0, 1) * .1
883
+ energy_diffs = audio_match[:,0] - video_match[:,0]
884
+ for i, j, qual in zip(x.tolist(), y.tolist(), quals.tolist()):
885
+ point = (i, int(j))
886
+ if point not in seen_points:
887
+ seen_points.add(point)
888
+ points[i].append((j, cluster_index, qual))
889
+ del seen_points
890
+ points = [sorted(point) for point in points]
891
+
892
+ best_so_far = SortedList(key=lambda x:x[0])
893
+ best_so_far.add((0, 0, -1, 0, 0)) # video_index, audio_desc_index, cluster_index, qual, cum_qual
894
+ clusters_best_so_far = [(0, 0, 0, -1000) for cluster in line_clusters]
895
+ backpointers = {}
896
+ prev_cache = np.full((len(video_features_scaled), 5), -np.inf)
897
+ prev_cache[0] = (0, 0, -1, 0, 0) # video_index, audio_desc_index, cluster_index, qual, cum_qual
898
+ reversed_min_points = [min(x)[0] if len(x) > 0 else np.inf for x in points[::-1]]
899
+ forward_min = list(itertools.accumulate(reversed_min_points, min))[::-1]
900
+ for i in range(len(audio_desc_features_scaled)):
901
+ for j, cluster_index, qual in points[i]:
902
+ cur_index = best_so_far.bisect_right((j,))
903
+ prev_j, prev_i, prev_cluster_index, prev_qual, best_prev_cum_qual = best_so_far[cur_index-1]
904
+ cluster_last = clusters_best_so_far[cluster_index]
905
+ if cluster_last[3] >= best_prev_cum_qual:
906
+ prev_j, prev_i, prev_qual, best_prev_cum_qual = cluster_last
907
+ prev_cluster_index = cluster_index
908
+ for prev_j_temp in range(max(0, int(j) - 2), int(j) + 1):
909
+ prev_node = prev_cache[prev_j_temp].tolist()
910
+ if cluster_index != prev_node[2]:
911
+ prev_node[4] -= 100 + 100 * ((j - prev_node[0]) - (i - prev_node[1])) ** 2
912
+ if prev_node[1] >= (i - 2) and \
913
+ prev_node[0] <= j and \
914
+ prev_node[4] >= best_prev_cum_qual:
915
+ prev_j, prev_i, prev_cluster_index, prev_qual, best_prev_cum_qual = prev_node
916
+ cum_qual = best_prev_cum_qual + qual
917
+ prev_cache[int(j)] = (j, i, cluster_index, qual, cum_qual)
918
+ cum_qual_jump = cum_qual - 1000
919
+ if best_so_far[cur_index-1][4] < cum_qual_jump:
920
+ while (cur_index < len(best_so_far)) and (best_so_far[cur_index][4] <= cum_qual_jump):
921
+ del best_so_far[cur_index]
922
+ best_so_far.add((j, i, cluster_index, qual, cum_qual_jump))
923
+ if forward_min[i] == j and cur_index > 1:
924
+ del best_so_far[:cur_index-1]
925
+ cum_qual_cluster_jump = cum_qual - 50
926
+ if cluster_last[3] < cum_qual_cluster_jump:
927
+ clusters_best_so_far[cluster_index] = (j, i, qual, cum_qual_cluster_jump)
928
+ backpointers[(j, i)] = (prev_j, prev_i, prev_cluster_index, prev_qual, best_prev_cum_qual)
929
+ path = [best_so_far[-1]]
930
+ while path[-1][:2] in backpointers:
931
+ path.append(backpointers[path[-1][:2]])
932
+ path.pop()
933
+ path.reverse()
934
+ path = np.array(path)
935
+ y, x, cluster_indices, quals, cum_quals = path.T
936
+
937
+ nondescription = ((quals == 0) | (quals > .3))
938
+ similarity_ratio_x = float(len(set(x[nondescription]))) / len(audio_desc_features_scaled)
939
+ similarity_ratio_y = float(len(set(y[nondescription]))) / len(video_features_scaled)
940
+ similarity_percent = 100 * max(similarity_ratio_x, similarity_ratio_y)
941
+
942
+ nodes = []
943
+ if cluster_indices[0] == cluster_indices[1]:
944
+ nodes.append((x[0], y[0]))
945
+ for i in range(len(x) - 1):
946
+ if cluster_indices[i] != cluster_indices[i+1]:
947
+ nodes.append((x[i] - .1, y[i] - .1))
948
+ nodes.append((x[i+1] + .1, y[i+1] + .1))
949
+ if cluster_indices[-2] == cluster_indices[-1]:
950
+ nodes.append((x[-1], y[-1]))
951
+ x, y = np.array(nodes).T / 210.
952
+
953
+ if (x[1] - x[0]) > 2:
954
+ slope_start = (y[1] - y[0]) / (x[1] - x[0])
955
+ x[0] = 0
956
+ y[0] = y[1] - (x[1] * slope_start)
957
+ if y[0] < 0:
958
+ x[0] = x[1] - (y[1] / slope_start)
959
+ y[0] = 0
960
+ if (x[-1] - x[-2]) > 2:
961
+ slope_end = (y[-1] - y[-2]) / (x[-1] - x[-2])
962
+ x[-1] = ((len(audio_desc_energy) - 1) / 210.)
963
+ y[-1] = y[-2] + ((x[-1] - x[-2]) * slope_end)
964
+ if y[-1] > ((len(video_energy) - 1) / 210.):
965
+ y[-1] = ((len(video_energy) - 1) / 210.)
966
+ x[-1] = x[-2] + ((y[-1] - y[-2]) / slope_end)
967
+
968
+ path[:,:2] /= 210.
969
+ return x, y, similarity_percent, path, median_slope
970
+
838
971
  # combines videos with matching audio files (e.g. audio descriptions)
839
972
  # this is the main function of this script, it calls the other functions in order
840
- def combine(video, audio, smoothness=50, stretch_audio=False, keep_non_ad=False,
841
- boost=0, ad_detect_sensitivity=.6, boost_sensitivity=.4, yes=False,
842
- prepend="ad_", no_pitch_correction=False, output_dir=default_output_dir,
843
- alignment_dir=default_alignment_dir, extension="copy", display_func=None):
844
- video_files, video_file_types = get_sorted_filenames(video, VIDEO_EXTENSIONS, AUDIO_EXTENSIONS)
973
+ def combine(video, audio, stretch_audio=False, yes=False, prepend="ad_", no_pitch_correction=False,
974
+ output_dir=default_output_dir, alignment_dir=default_alignment_dir):
975
+ video_files, has_audio_extensions = get_sorted_filenames(video, VIDEO_EXTENSIONS, AUDIO_EXTENSIONS)
845
976
 
846
- if yes == False and sum(video_file_types) > 0:
977
+ if yes == False and sum(has_audio_extensions) > 0:
847
978
  print("")
848
979
  print("One or more audio files found in video input. Was this intentional?")
849
980
  print("If not, press ctrl+c to kill this script.")
@@ -856,16 +987,16 @@ def combine(video, audio, smoothness=50, stretch_audio=False, keep_non_ad=False,
856
987
  f"The audio path has {len(audio_desc_files)} files"]
857
988
  raise RuntimeError("\n".join(error_msg))
858
989
 
859
- display("", display_func)
860
- ensure_folders_exist([output_dir], display_func)
990
+ print("")
991
+ ensure_folders_exist([output_dir])
861
992
  if PLOT_ALIGNMENT_TO_FILE:
862
- ensure_folders_exist([alignment_dir], display_func)
993
+ ensure_folders_exist([alignment_dir])
863
994
 
864
- display("", display_func)
995
+ print("")
865
996
  for (video_file, audio_desc_file) in zip(video_files, audio_desc_files):
866
- display(os.path.split(video_file)[1], display_func)
867
- display(os.path.split(audio_desc_file)[1], display_func)
868
- display("", display_func)
997
+ print(os.path.split(video_file)[1])
998
+ print(os.path.split(audio_desc_file)[1])
999
+ print("")
869
1000
  if yes == False:
870
1001
  print("Are the above input file pairings correct?")
871
1002
  print("If not, press ctrl+c to kill this script.")
@@ -874,414 +1005,728 @@ def combine(video, audio, smoothness=50, stretch_audio=False, keep_non_ad=False,
874
1005
 
875
1006
  # if ffmpeg isn't installed, install it
876
1007
  if not is_ffmpeg_installed():
877
- display("Downloading and installing ffmpeg (media editor, 50 MB download)...", display_func)
1008
+ print("Downloading and installing ffmpeg (media editor, 50 MB download)...")
878
1009
  get_ffmpeg()
879
1010
  if not is_ffmpeg_installed():
880
1011
  RuntimeError("Failed to install ffmpeg.")
881
- display("Successfully installed ffmpeg.", display_func)
1012
+ print("Successfully installed ffmpeg.")
882
1013
 
883
- display("Processing files:", display_func)
1014
+ print("Processing files:")
884
1015
 
885
- for (video_file, audio_desc_file, video_filetype) in zip(video_files, audio_desc_files,
886
- video_file_types):
887
- # Default is to use the input video's extension for the output video
888
- if extension is None or extension in ["", "copy"]:
889
- ext = os.path.splitext(video_file)[1]
890
- else:
891
- # add a dot to the extension if it's missing
892
- ext = ('' if extension[0] == '.' else '.') + extension
893
- output_filename = prepend + os.path.splitext(os.path.split(video_file)[1])[0] + ext
1016
+ for (video_file, audio_desc_file, has_audio_extension) in zip(video_files, audio_desc_files,
1017
+ has_audio_extensions):
1018
+ # Output filename (and extension) is the same as input, except the prepend and directory
1019
+ output_filename = prepend + os.path.split(video_file)[1]
894
1020
  output_filename = os.path.join(output_dir, output_filename)
895
- display(f" {output_filename}", display_func)
1021
+ print(f" {output_filename}")
896
1022
 
897
- if os.path.exists(output_filename) and os.path.getsize(output_filename) > 0:
898
- display(" output file already exists, skipping...", display_func)
1023
+ if (not stretch_audio) and has_audio_extension:
1024
+ raise RuntimeError("Argument --stretch_audio is required when both inputs are audio files.")
1025
+
1026
+ if os.path.exists(output_filename) and os.path.getsize(output_filename) > 1e5:
1027
+ print(" output file already exists, skipping...")
899
1028
  continue
900
1029
 
901
1030
  # print warning if output file's full path is longer than Windows MAX_PATH (260)
902
1031
  full_output_filename = os.path.abspath(output_filename)
903
1032
  if IS_RUNNING_WINDOWS and len(full_output_filename) >= 260:
904
- display(" WARNING: very long output path, ffmpeg may fail...", display_func)
1033
+ print(" WARNING: very long output path, ffmpeg may fail...")
905
1034
 
906
- video_arr = parse_audio_from_file(video_file)
907
- audio_desc_arr = parse_audio_from_file(audio_desc_file)
908
- video_spec_raw, video_timings = tokenize_audio(video_arr)
909
- video_spec = normalize_spec(video_spec_raw)
910
- audio_desc_spec_raw, audio_desc_timings = tokenize_audio_dither(audio_desc_arr, video_timings)
911
- audio_desc_spec = normalize_spec(audio_desc_spec_raw)
1035
+ num_channels = 2 if stretch_audio else 1
1036
+ print(" reading video file...\r", end='')
1037
+ video_arr = parse_audio_from_file(video_file, num_channels)
912
1038
 
913
- # rescale RMS intensity of audio to match video
914
- audio_desc_arr *= (np.std(video_arr) / np.std(audio_desc_arr))
1039
+ print(" computing video features... \r", end='')
1040
+ video_energy = get_energy(video_arr)
1041
+ video_zero_crossings = get_zero_crossings(video_arr)
1042
+ video_freq_bands = get_freq_bands(video_arr)
1043
+ video_features = [video_energy, video_zero_crossings] + video_freq_bands
915
1044
 
916
- path, quals = rough_align(video_spec, audio_desc_spec, video_timings, audio_desc_timings)
1045
+ if not stretch_audio:
1046
+ del video_arr
917
1047
 
918
- similarity_ratio = float(len(quals)) / max(video_spec.shape[0], audio_desc_spec.shape[0])
919
- similarity_percent = min(100, 100 * similarity_ratio)
920
- if similarity_percent < 10:
921
- display(f" WARNING: similarity {similarity_percent:.1f}%, likely mismatched files", display_func)
922
- if similarity_percent > 90:
923
- display(f" WARNING: similarity {similarity_percent:.1f}%, likely undescribed media", display_func)
1048
+ print(" reading audio file... \r", end='')
1049
+ audio_desc_arr = parse_audio_from_file(audio_desc_file, num_channels)
1050
+
1051
+ print(" computing audio features...\r", end='')
1052
+ audio_desc_energy = get_energy(audio_desc_arr)
1053
+ audio_desc_zero_crossings = get_zero_crossings(audio_desc_arr)
1054
+ audio_desc_freq_bands = get_freq_bands(audio_desc_arr)
1055
+ audio_desc_features = [audio_desc_energy, audio_desc_zero_crossings] + audio_desc_freq_bands
1056
+
1057
+ if not stretch_audio:
1058
+ del audio_desc_arr
924
1059
 
925
- smooth_path, runs, bad_clips, clips = smooth_align(path, quals, smoothness)
1060
+ outputs = align(video_features, audio_desc_features, video_energy, audio_desc_energy)
1061
+ audio_desc_times, video_times, similarity_percent, path, median_slope = outputs
926
1062
 
927
- cap_synced_end_points(smooth_path, video_arr, audio_desc_arr)
1063
+ del video_energy, video_zero_crossings, video_freq_bands, video_features
1064
+ del audio_desc_energy, audio_desc_zero_crossings, audio_desc_freq_bands, audio_desc_features
1065
+
1066
+ if similarity_percent < 20:
1067
+ print(f" WARNING: similarity {similarity_percent:.1f}%, likely mismatched files")
1068
+ if similarity_percent > 90:
1069
+ print(f" WARNING: similarity {similarity_percent:.1f}%, likely undescribed media")
928
1070
 
929
- ad_timings = None
930
1071
  if stretch_audio:
931
- if keep_non_ad:
932
- video_arr_original = video_arr.copy()
1072
+ # lower memory usage version of np.std for large arrays
1073
+ def low_ram_std(arr):
1074
+ avg = np.mean(arr, dtype=np.float64)
1075
+ return np.sqrt(np.einsum('ij,ij->i', arr, arr, dtype=np.float64)/np.prod(arr.shape) - (avg**2))
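# The identity behind low_ram_std: Var[x] = E[x^2] - E[x]^2, so the standard
# deviation follows from a sum of squares without a mean-centered copy of the
# array. Toy check against np.std:
import numpy as np
samples = np.random.default_rng(0).normal(3., 2., size=100000)
assert abs(np.std(samples) - np.sqrt(np.mean(samples * samples) - np.mean(samples) ** 2)) < 1e-8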
933
1076
 
934
- replace_aligned_segments(video_arr, audio_desc_arr, smooth_path, runs, no_pitch_correction)
935
- del audio_desc_arr
1077
+ # rescale RMS intensity of audio to match video
1078
+ audio_desc_arr *= (low_ram_std(video_arr) / low_ram_std(audio_desc_arr))[:, None]
936
1079
 
937
- if keep_non_ad or boost != 0:
938
- outputs = detect_describer(video_arr, video_spec, video_spec_raw, video_timings,
939
- smooth_path, ad_detect_sensitivity, boost_sensitivity)
940
- speech_sample_mask, boost_sample_mask, ad_timings = outputs
941
- if keep_non_ad:
942
- video_arr *= speech_sample_mask
943
- video_arr += video_arr_original * (1 - speech_sample_mask)
944
- del video_arr_original
945
- del speech_sample_mask
946
- else:
947
- ad_timings = None
948
- if boost != 0:
949
- video_arr = video_arr * (1. + (10**(boost / 10.) - 1.) * boost_sample_mask)
950
- del boost_sample_mask
1080
+ replace_aligned_segments(video_arr, audio_desc_arr, audio_desc_times, video_times, no_pitch_correction)
1081
+ del audio_desc_arr
951
1082
 
952
- # prevent peaking by rescaling to within +/- 16,382
1083
+ # prevent peaking by rescaling to within +/- 32,766
953
1084
  video_arr *= (2**15 - 2.) / np.max(np.abs(video_arr))
954
1085
 
955
- if video_filetype == 0:
956
- write_replaced_media_to_disk(output_filename, video_arr, video_file)
957
- else:
958
- write_replaced_media_to_disk(output_filename, video_arr)
1086
+ print(" processing output file... \r", end='')
1087
+ write_replaced_media_to_disk(output_filename, video_arr, None if has_audio_extension else video_file)
1088
+ del video_arr
959
1089
  else:
960
- if video_filetype == 1:
961
- raise RuntimeError("Argument --stretch_audio is required when both inputs are audio files.")
962
- if os.path.splitext(output_filename)[1][1:] in AUDIO_EXTENSIONS:
963
- raise RuntimeError("Argument --stretch_audio is required when output file extension is an audio filetype.")
964
- video_offset = np.diff(smooth_path[clips[0][0]])[0]
965
- start_key_frame = get_closest_key_frame_time(video_file, video_offset)
966
- setts_cmd = encode_fit_as_ffmpeg_expr(smooth_path, clips, video_offset, start_key_frame)
1090
+ video_offset = video_times[0] - audio_desc_times[0]
1091
+ # to make ffmpeg cut at the last keyframe before the audio starts, use a timestamp after it
1092
+ after_start_key_frame = get_closest_key_frame_time(video_file, video_offset)
1093
+ print(" processing output file... \r", end='')
1094
+ setts_cmd = encode_fit_as_ffmpeg_expr(audio_desc_times, video_times, video_offset)
967
1095
  write_replaced_media_to_disk(output_filename, None, video_file, audio_desc_file,
968
- setts_cmd, start_key_frame)
1096
+ setts_cmd, video_offset, after_start_key_frame)
969
1097
 
970
- del video_arr
971
1098
  if PLOT_ALIGNMENT_TO_FILE:
972
1099
  plot_filename_no_ext = os.path.join(alignment_dir, os.path.splitext(os.path.split(video_file)[1])[0])
973
- plot_alignment(plot_filename_no_ext, path, smooth_path, quals,
974
- runs, bad_clips, ad_timings, similarity_percent)
975
- display("All files processed.", display_func)
1100
+ plot_alignment(plot_filename_no_ext, path, audio_desc_times, video_times, similarity_percent,
1101
+ median_slope, stretch_audio, no_pitch_correction)
1102
+ print("All files processed. ")
976
1103
 
977
- def write_config_file(config_path, settings):
978
- config = configparser.ConfigParser()
979
- config.add_section('alignment')
980
- config['alignment'] = {}
981
- for key, value in settings.items():
982
- config['alignment'][key] = str(value)
983
- with open(config_path, 'w') as f:
984
- config.write(f)
1104
+ if wx is not None:
1105
+ def write_config_file(config_path, settings):
1106
+ config = configparser.ConfigParser()
1107
+ config.add_section('alignment')
1108
+ config['alignment'] = {}
1109
+ for key, value in settings.items():
1110
+ config['alignment'][key] = str(value)
1111
+ with open(config_path, 'w') as f:
1112
+ config.write(f)
985
1113
 
986
- def read_config_file(config_path: Path):
987
- config = configparser.ConfigParser()
988
- config.read(config_path)
989
- settings = {'smoothness': config.getfloat('alignment', 'smoothness', fallback=50),
990
- 'stretch_audio': config.getboolean('alignment', 'stretch_audio', fallback=False),
991
- 'keep_non_ad': config.getboolean('alignment', 'keep_non_ad', fallback=False),
992
- 'boost': config.getfloat('alignment', 'boost', fallback=0),
993
- 'ad_detect_sensitivity':config.getfloat('alignment', 'ad_detect_sensitivity', fallback=.6),
994
- 'boost_sensitivity': config.getfloat('alignment', 'boost_sensitivity', fallback=.4),
995
- 'prepend': config.get('alignment', 'prepend', fallback='ad_'),
996
- 'no_pitch_correction': config.getboolean('alignment', 'no_pitch_correction', fallback=False),
997
- 'output_dir': config.get('alignment', 'output_dir', fallback=default_output_dir),
998
- 'alignment_dir': config.get('alignment', 'alignment_dir', fallback=default_alignment_dir),
999
- 'extension': config.get('alignment', 'extension', fallback='copy')}
1000
- if not config.has_section('alignment'):
1001
- write_config_file(config_path, settings)
1002
- return settings
1114
+ def read_config_file(config_path: Path):
1115
+ config = configparser.ConfigParser()
1116
+ config.read(config_path)
1117
+ settings = {'stretch_audio': config.getboolean('alignment', 'stretch_audio', fallback=False),
1118
+ 'prepend': config.get('alignment', 'prepend', fallback='ad_'),
1119
+ 'no_pitch_correction': config.getboolean('alignment', 'no_pitch_correction', fallback=False),
1120
+ 'output_dir': config.get('alignment', 'output_dir', fallback=default_output_dir),
1121
+ 'alignment_dir': config.get('alignment', 'alignment_dir', fallback=default_alignment_dir)}
1122
+ if not config.has_section('alignment'):
1123
+ write_config_file(config_path, settings)
1124
+ return settings
1125
+
1126
+ def set_tooltip(element, tip):
1127
+ element.SetToolTip(tip)
1128
+ # prevent tooltip from disappearing for 30 seconds
1129
+ tooltip_object = element.GetToolTip()
1130
+ if tooltip_object is not None:
1131
+ tooltip_object.SetAutoPop(30000)
1132
+
1133
+ class DialogSettings(wx.Dialog):
1134
+ def __init__(self, parent, config_path, is_dark):
1135
+ wx.Dialog.__init__(self, parent, title="Settings - describealign", size=wx.Size(450,330),
1136
+ style=wx.DEFAULT_DIALOG_STYLE|wx.TAB_TRAVERSAL)
1137
+ # setting the GUI dialog's font causes all contained elements to inherit that font by default
1138
+ self.SetFont(wx.Font(*gui_font))
1139
+ self.SetBackgroundColour(gui_background_color_dark if is_dark else gui_background_color_light)
1140
+
1141
+ self.text_header = wx.StaticText(self, label="Check tooltips (i.e. mouse-over text) for descriptions:")
1142
+
1143
+ self.static_box_sizer_output = wx.StaticBoxSizer(wx.VERTICAL, self, "output_dir")
1144
+ self.dir_picker_output = wx.DirPickerCtrl(self, message="Select a folder", name="output_dir")
1145
+ set_tooltip(self.dir_picker_output, "Directory combined output media is saved to. " + \
1146
+ "Default is \"videos_with_ad\"")
1147
+
1148
+ self.static_box_sizer_alignment = wx.StaticBoxSizer(wx.VERTICAL, self, "alignment_dir")
1149
+ self.dir_picker_alignment = wx.DirPickerCtrl(self, message="Select a folder", name="alignment_dir")
1150
+ set_tooltip(self.dir_picker_alignment, "Directory alignment data and plots are saved to. " + \
1151
+ "Default is \"alignment_plots\"")
1152
+
1153
+ self.text_prepend = wx.StaticText(self, label="prepend:")
1154
+ self.text_ctrl_prepend = wx.TextCtrl(self, name="prepend")
1155
+ set_tooltip(self.text_ctrl_prepend, "Output file name prepend text. Default is \"ad_\"")
1156
+
1157
+ panel_stretch_audio_no_pitch_correction = wx.Panel(self)
1158
+
1159
+ self.checkbox_stretch_audio = wx.CheckBox(panel_stretch_audio_no_pitch_correction,
1160
+ label="stretch_audio", name="stretch_audio")
1161
+ set_tooltip(self.checkbox_stretch_audio, "Stretches the input audio to fit the input video. " + \
1162
+ "Default is to stretch the video to fit the audio. " + \
1163
+ "Keeps original video audio as secondary tracks. Slower " + \
1164
+ "and uses more RAM when enabled, long videos may cause " + \
1165
+ "paging or Out of Memory errors on low-RAM systems.")
1166
+ self.checkbox_stretch_audio.Bind(wx.EVT_CHECKBOX, self.update_stretch_audio_subsettings)
1167
+
1168
+ self.checkbox_no_pitch_correction = wx.CheckBox(panel_stretch_audio_no_pitch_correction,
1169
+ label="no_pitch_correction", name="no_pitch_correction")
1170
+ set_tooltip(self.checkbox_no_pitch_correction, "Skips the pitch correction step when stretching audio. " + \
1171
+ "Requires --stretch_audio to be set, otherwise " + \
1172
+ "does nothing.")
1173
+
1174
+ self.button_save = wx.Button(self, label="Save")
1175
+ self.button_save.Bind(wx.EVT_BUTTON, self.save_settings)
1176
+ self.button_cancel = wx.Button(self, label="Cancel")
1177
+ self.button_cancel.Bind(wx.EVT_BUTTON, lambda event: self.EndModal(0))
1178
+
1179
+ sizer_dialog = wx.BoxSizer(wx.VERTICAL)
1180
+ sizer_output_dir = wx.BoxSizer(wx.HORIZONTAL)
1181
+ sizer_alignment_dir = wx.BoxSizer(wx.HORIZONTAL)
1182
+ sizer_prepend = wx.BoxSizer(wx.HORIZONTAL)
1183
+ sizer_stretch_audio_no_pitch_correction_outer = wx.BoxSizer(wx.HORIZONTAL)
1184
+ sizer_stretch_audio_no_pitch_correction_inner = wx.BoxSizer(wx.VERTICAL)
1185
+ sizer_save_cancel = wx.BoxSizer(wx.HORIZONTAL)
1186
+
1187
+ # Configure layout with nested Box Sizers:
1188
+ #
1189
+ # Frame
1190
+ # sizer_dialog
1191
+ # text_header
1192
+ # sizer_output_dir
1193
+ # static_box_sizer_output
1194
+ # dir_picker_output
1195
+ # sizer_alignment_dir
1196
+ # static_box_sizer_alignment
1197
+ # dir_picker_alignment
1198
+ # sizer_prepend
1199
+ # text_prepend
1200
+ # text_ctrl_prepend
1201
+ # sizer_stretch_audio_no_pitch_correction_outer
1202
+ # panel_stretch_audio_no_pitch_correction
1203
+ # sizer_stretch_audio_no_pitch_correction_inner
1204
+ # checkbox_stretch_audio
1205
+ # checkbox_no_pitch_correction
1206
+ # sizer_save_cancel
1207
+ # button_save
1208
+ # button_cancel
1209
+ #
1210
+ self.SetSizer(sizer_dialog)
1211
+ sizer_dialog.Add(self.text_header, 0, wx.ALL, 5)
1212
+ sizer_dialog.Add(sizer_output_dir, 1, wx.LEFT|wx.RIGHT|wx.EXPAND, 2)
1213
+ sizer_dialog.Add(sizer_alignment_dir, 1, wx.LEFT|wx.RIGHT|wx.EXPAND, 2)
1214
+ sizer_dialog.Add(sizer_prepend, 1, wx.LEFT|wx.EXPAND, 5)
1215
+ sizer_dialog.Add(sizer_stretch_audio_no_pitch_correction_outer, 1, wx.LEFT|wx.EXPAND, 5)
1216
+ sizer_stretch_audio_no_pitch_correction_outer.Add(panel_stretch_audio_no_pitch_correction,
1217
+ 1, wx.LEFT|wx.EXPAND, 5)
1218
+ sizer_stretch_audio_no_pitch_correction_outer.Add((0, 0), 2, wx.EXPAND, 5) # spacer
1219
+ sizer_dialog.Add(sizer_save_cancel, 2, wx.BOTTOM|wx.EXPAND, 5)
1220
+ sizer_prepend.Add(self.text_prepend, 0, wx.ALL|wx.ALIGN_CENTER_VERTICAL, 5)
1221
+ sizer_prepend.Add(self.text_ctrl_prepend, 0, wx.ALIGN_CENTER_VERTICAL, 5)
1222
+ sizer_output_dir.Add(self.static_box_sizer_output, 1, wx.LEFT|wx.RIGHT|wx.ALIGN_CENTER_VERTICAL, 5)
1223
+ self.static_box_sizer_output.Add(self.dir_picker_output, 1, wx.EXPAND)
1224
+ sizer_alignment_dir.Add(self.static_box_sizer_alignment, 1, wx.LEFT|wx.RIGHT|wx.ALIGN_CENTER_VERTICAL, 5)
1225
+ self.static_box_sizer_alignment.Add(self.dir_picker_alignment, 1, wx.EXPAND)
1226
+ panel_stretch_audio_no_pitch_correction.SetSizer(sizer_stretch_audio_no_pitch_correction_inner)
1227
+ sizer_stretch_audio_no_pitch_correction_inner.Add(self.checkbox_stretch_audio, 0, wx.ALL, 5)
1228
+ sizer_stretch_audio_no_pitch_correction_inner.Add(self.checkbox_no_pitch_correction, 0, wx.ALL, 5)
1229
+ sizer_save_cancel.Add((0, 0), 3, wx.EXPAND, 5) # spacer
1230
+ sizer_save_cancel.Add(self.button_save, 0, wx.ALL|wx.ALIGN_CENTER_VERTICAL, 5)
1231
+ sizer_save_cancel.Add((0, 0), 2, wx.EXPAND, 5) # spacer
1232
+ sizer_save_cancel.Add(self.button_cancel, 0, wx.ALL|wx.ALIGN_CENTER_VERTICAL, 5)
1233
+ sizer_save_cancel.Add((0, 0), 3, wx.EXPAND, 5) # spacer
1234
+
1235
+ # centers GUI on the screen
1236
+ self.Centre(wx.BOTH)
1237
+
1238
+ # cache dictionaries mapping setting names to widget setter and getter functions
1239
+ self.setting_getters = {}
1240
+ self.setting_setters = {}
1241
+ for child in itertools.chain(self.GetChildren(),
1242
+ panel_stretch_audio_no_pitch_correction.GetChildren()):
1243
+ child_class_name = child.GetClassName()
1244
+ child_name = child.GetName()
1245
+ if child_class_name == "wxDirPickerCtrl":
1246
+ self.setting_getters[child_name] = child.GetPath
1247
+ self.setting_setters[child_name] = child.SetPath
1248
+ if child_class_name in ["wxCheckBox"]:
1249
+ self.setting_getters[child_name] = child.GetValue
1250
+ self.setting_setters[child_name] = child.SetValue
1251
+ if child_class_name in ["wxTextCtrl"]:
1252
+ self.setting_getters[child_name] = child.GetValue
1253
+ self.setting_setters[child_name] = lambda value, child=child: child.SetValue(str(value))
1254
+ self.setting_names = self.setting_getters.keys()
1255
+
1256
+ # initialize setting widgets to saved config values
1257
+ self.config_path = config_path
1258
+ config_file_settings = read_config_file(self.config_path)
1259
+ for setting_name in self.setting_names:
1260
+ self.setting_setters[setting_name](config_file_settings[setting_name])
1003
1261
 
1004
- def settings_gui(config_path: Path):
1005
- settings = read_config_file(config_path)
1006
- layout = [[sg.Text('Check tooltips (i.e. mouse-over text) for descriptions:')],
1007
- [sg.Column([[sg.Text('extension:', size=(10, 1.2), pad=(1,5)),
1008
- sg.Input(default_text=str(settings['extension']), size=(8, 1.2), pad=(10,5), key='extension',
1009
- tooltip='File type of output video (e.g. mkv). When set to "copy", copies the ' + \
1010
- 'file type of the corresponding input video. Default is "copy".')]])],
1011
- [sg.Column([[sg.Text('prepend:', size=(8, 1.2), pad=(1,5)),
1012
- sg.Input(default_text=str(settings['prepend']), size=(8, 1.2), pad=(10,5), key='prepend',
1013
- tooltip='Output file name prepend text. Default is "ad_"')]])],
1014
- [sg.Column([[sg.Text('output_dir:', size=(10, 1.2), pad=(1,5)),
1015
- sg.Input(default_text=str(settings['output_dir']), size=(22, 1.2), pad=(10,5), key='output_dir',
1016
- tooltip='Directory combined output media is saved to. Default is "videos_with_ad"'),
1017
- sg.FolderBrowse(button_text="Browse Folder", key='output_browse')]])],
1018
- [sg.Column([[sg.Text('alignment_dir:', size=(13, 1.2), pad=(1,5)),
1019
- sg.Input(default_text=str(settings['alignment_dir']), size=(22, 1.2), pad=(10,5), key='alignment_dir',
1020
- tooltip='Directory alignment data and plots are saved to. Default is "alignment_plots"'),
1021
- sg.FolderBrowse(button_text="Browse Folder", key='alignment_browse')]], pad=(2,7))],
1022
- [sg.Column([[sg.Text('smoothness:', size=(12, 1), pad=(1,5)),
1023
- sg.Input(default_text=str(settings['smoothness']), size=(8, 1.2), pad=(10,5), key='smoothness',
1024
- tooltip='Lower values make the alignment more accurate when there are skips ' + \
1025
- '(e.g. describer pauses), but also make it more likely to misalign. ' + \
1026
- 'Default is 50.')]])],
1027
- [sg.Checkbox('stretch_audio', default=settings['stretch_audio'], key='stretch_audio', change_submits=True,
1028
- tooltip='Stretches the input audio to fit the input video. ' + \
1029
- 'Default is to stretch the video to fit the audio.')],
1030
- [sg.Checkbox('keep_non_ad', default=settings['keep_non_ad'], key='keep_non_ad',
1031
- disabled=not settings['stretch_audio'],
1032
- tooltip='Tries to only replace segments with audio description. Useful if ' + \
1033
- 'video\'s audio quality is better. Default is to replace all aligned audio. ' + \
1034
- 'Requires --stretch_audio to be set, otherwise does nothing.')],
1035
- [sg.Column([[sg.Text('boost:', size=(6, 1), pad=(1,5)),
1036
- sg.Input(default_text=str(settings['boost']), size=(8, 1.2), pad=(10,5),
1037
- key='boost', disabled=not settings['stretch_audio'],
1038
- tooltip='Boost (or quieten) description volume. Units are decibels (dB), so ' + \
1039
- '-3 makes the describer about 2x quieter, while 3 makes them 2x louder. ' + \
1040
- 'Requires --stretch_audio to be set, otherwise does nothing.')]])],
1041
- [sg.Column([[sg.Text('ad_detect_sensitivity:', size=(21, 1.2), pad=(2,5)),
1042
- sg.Input(default_text=str(settings['ad_detect_sensitivity']), size=(8, 1.2), pad=(10,5),
1043
- key='ad_detect_sensitivity', disabled=not settings['stretch_audio'],
1044
- tooltip='Audio description detection sensitivity ratio. Higher values make ' + \
1045
- '--keep_non_ad more likely to replace aligned audio. Default is 0.6')]])],
1046
- [sg.Column([[sg.Text('boost_sensitivity:', size=(17, 1.2), pad=(1,5)),
1047
- sg.Input(default_text=str(settings['boost_sensitivity']), size=(8, 1.2), pad=(10,5),
1048
- key='boost_sensitivity', disabled=not settings['stretch_audio'],
1049
- tooltip='Higher values make --boost less likely to miss a description, but ' + \
1050
- 'also make it more likely to boost non-description audio. Default is 0.4')]])],
1051
- [sg.Checkbox('no_pitch_correction', default=settings['no_pitch_correction'], key='no_pitch_correction',
1052
- disabled=not settings['stretch_audio'],
1053
- tooltip='Skips pitch correction step when stretching audio. ' + \
1054
- 'Requires --stretch_audio to be set, otherwise does nothing.')],
1055
- [sg.Column([[sg.Submit('Save', pad=(40,3)),
1056
- sg.Button('Cancel')]], pad=((135,3),10))]]
1057
- settings_window = sg.Window('Settings - describealign', layout, font=('Arial', 16), finalize=True)
1058
- settings_window['extension'].set_focus()
1059
- while True:
1060
- event, values = settings_window.read()
1061
- if event in (sg.WIN_CLOSED, 'Cancel') or settings_window.TKrootDestroyed:
1062
- break
1063
- if event == 'stretch_audio':
1064
- # work around bug in PySimpleGUIWx's InputText Update function where enabling/disabling are flipped
1065
- if IS_RUNNING_WINDOWS:
1066
- settings_window['boost'].Update(disabled = values['stretch_audio'])
1067
- settings_window['ad_detect_sensitivity'].Update(disabled = values['stretch_audio'])
1068
- settings_window['boost_sensitivity'].Update(disabled = values['stretch_audio'])
1262
+ # initialize stretch_audio subsettings to be disabled/enabled
1263
+ self.update_stretch_audio_subsettings()
1264
+
1265
+ set_background_color(self, is_dark)
1266
+ if sum(self.checkbox_stretch_audio.GetForegroundColour()[:3]) < 350:
1267
+ panel_stretch_audio_no_pitch_correction.SetBackgroundColour(gui_background_color_light)
1268
+
1269
+ def update_stretch_audio_subsettings(self, event=None):
1270
+ subsettings = [self.checkbox_no_pitch_correction]
1271
+ if self.checkbox_stretch_audio.IsChecked():
1272
+ for subsetting in subsettings:
1273
+ subsetting.Enable()
1069
1274
  else:
1070
- settings_window['boost'].Update(disabled = not values['stretch_audio'])
1071
- settings_window['ad_detect_sensitivity'].Update(disabled = not values['stretch_audio'])
1072
- settings_window['boost_sensitivity'].Update(disabled = not values['stretch_audio'])
1073
- settings_window['keep_non_ad'].Update(disabled = not values['stretch_audio'])
1074
- settings_window['no_pitch_correction'].Update(disabled = not values['stretch_audio'])
1075
- if event == 'Save':
1076
- settings = values.copy()
1077
- del settings['output_browse']
1078
- del settings['alignment_browse']
1079
- write_config_file(config_path, settings)
1080
- break
1081
- settings_window.close()
1275
+ for subsetting in subsettings:
1276
+ subsetting.Disable()
1277
+
1278
+ def save_settings(self, event):
1279
+ settings = {}
1280
+ for setting_name in self.setting_names:
1281
+ settings[setting_name] = self.setting_getters[setting_name]()
1282
+ write_config_file(self.config_path, settings)
1283
+ self.EndModal(0)
1284
+
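The save_settings method above reads every widget through a name-to-getter map instead of querying each control by hand. A minimal sketch of that pattern with stand-in callables (the real dialog maps names to wx accessors such as IsChecked and GetValue):

setting_getters = {
    'stretch_audio': lambda: True,        # stands in for checkbox.IsChecked
    'no_pitch_correction': lambda: False,
    'prepend': lambda: 'ad_',             # stands in for text_ctrl.GetValue
}
settings = {name: getter() for name, getter in setting_getters.items()}
print(settings)  # {'stretch_audio': True, 'no_pitch_correction': False, 'prepend': 'ad_'}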
1285
+ class QueueWriter(io.TextIOWrapper):
1286
+ def __init__(self, queue) -> None:
1287
+ super().__init__(buffer=io.BytesIO())
1288
+ self._queue = queue
1289
+
1290
+ def write(self, s: str) -> int:
1291
+ self._queue.put(s)
1292
+ return len(s)
1082
1293
 
1083
- class QueueWriter(io.TextIOWrapper):
1084
- def __init__(self, queue) -> None:
1085
- super().__init__(buffer=io.BytesIO())
1086
- self._queue = queue
1294
+ def combine_print_exceptions(print_queue, *args, **kwargs):
1295
+ writer = QueueWriter(print_queue)
1296
+ with redirect_stdout(writer), redirect_stderr(writer):
1297
+ try:
1298
+ combine(*args, **kwargs)
1299
+ except Exception:
1300
+ print(" ERROR: exception raised")
1301
+ traceback.print_exc()
1302
+
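QueueWriter and combine_print_exceptions form the bridge between the combiner subprocess and the GUI: the child's stdout/stderr are redirected into a multiprocessing.Queue that the parent drains. A self-contained sketch of the round trip, with a hypothetical worker standing in for combine():

import io
import multiprocessing
from contextlib import redirect_stdout

class QueueWriter(io.TextIOWrapper):
    # file-like object whose writes land on a multiprocessing.Queue
    def __init__(self, queue):
        super().__init__(buffer=io.BytesIO())
        self._queue = queue
    def write(self, s):
        self._queue.put(s)
        return len(s)

def worker(queue):  # stand-in for combine(); not part of the package
    with redirect_stdout(QueueWriter(queue)):
        print("aligning...")

if __name__ == '__main__':
    q = multiprocessing.Queue()
    p = multiprocessing.Process(target=worker, args=(q,), daemon=True)
    p.start()
    p.join()
    while not q.empty():
        print(q.get(), end='')  # the parent replays the child's output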
1303
+ class FrameCombine(wx.Frame):
1304
+ def __init__(self, parent, config_path, video_files, audio_files, is_dark):
1305
+ wx.Frame.__init__(self, parent, title="Combining - describealign", size=wx.Size(800,600))
1306
+ # setting the GUI frame's font causes all contained elements to inherit that font by default
1307
+ self.SetFont(wx.Font(*gui_font))
1308
+ self.SetBackgroundColour(gui_background_color_dark if is_dark else gui_background_color_light)
1309
+ # wrap all widgets within a panel to enable tab traversal (i.e. pressing tab to swap GUI focus)
1310
+ self.panel0 = wx.Panel(self, style=wx.TAB_TRAVERSAL)
1311
+
1312
+ self.text_ctrl_output = wx.TextCtrl(self.panel0, style=wx.TE_MULTILINE|wx.TE_READONLY|wx.TE_RICH)
1313
+
1314
+ self.button_close = wx.Button(self.panel0, label="Close")
1315
+ self.button_close.Bind(wx.EVT_BUTTON, self.attempt_close)
1316
+ # also capture other close events such as alt+f4 or clicking the X in the top corner of the frame
1317
+ self.Bind(wx.EVT_CLOSE, self.attempt_close)
1318
+
1319
+ self.update_timer = wx.Timer(self)
1320
+ self.Bind(wx.EVT_TIMER, self.update_gui, self.update_timer)
1321
+
1322
+ sizer_panel_outer = wx.BoxSizer(wx.VERTICAL)
1323
+ sizer_panel_inner = wx.BoxSizer(wx.VERTICAL)
1324
+ sizer_close_button = wx.BoxSizer(wx.HORIZONTAL)
1325
+
1326
+ # Configure layout with nested Box Sizers:
1327
+ #
1328
+ # Frame
1329
+ # sizer_panel_outer
1330
+ # panel0
1331
+ # sizer_panel_inner
1332
+ # text_ctrl_output
1333
+ # sizer_close_button
1334
+ # button_close
1335
+ #
1336
+ self.SetSizer(sizer_panel_outer)
1337
+ sizer_panel_outer.Add(self.panel0, 1, wx.EXPAND|wx.ALL, 5)
1338
+ self.panel0.SetSizer(sizer_panel_inner)
1339
+ sizer_panel_inner.Add(self.text_ctrl_output, 1, wx.ALL|wx.EXPAND, 5)
1340
+ sizer_panel_inner.Add(sizer_close_button, 0, wx.EXPAND, 5)
1341
+ sizer_close_button.Add((0, 0), 1, wx.EXPAND, 5) # spacer
1342
+ sizer_close_button.Add(self.button_close, 0, wx.ALL, 5)
1343
+ sizer_close_button.Add((0, 0), 1, wx.EXPAND, 5) # spacer
1344
+
1345
+ # centers GUI on the screen
1346
+ self.Centre(wx.BOTH)
1347
+
1348
+ set_background_color(self, is_dark)
1349
+
1350
+ self.config_path = config_path
1351
+ self.overwrite_last_line = False
1352
+ self.display_line('Combining media files:')
1353
+ self.text_ctrl_output.SetInsertionPoint(0)
1354
+
1355
+ # launch combiner using settings from config file, redirecting its output to a queue
1356
+ self.print_queue = multiprocessing.Queue()
1357
+ settings = read_config_file(self.config_path)
1358
+ settings.update({'yes':True})
1359
+ self.combine_process = multiprocessing.Process(target=combine_print_exceptions,
1360
+ args=(self.print_queue, video_files, audio_files),
1361
+ kwargs=settings, daemon=True)
1362
+ self.combine_process.start()
1363
+ self.update_gui()
1364
+
1365
+ def attempt_close(self, event):
1366
+ if self.combine_process.is_alive():
1367
+ dialog = wx.MessageDialog(self, "Warning: combiner is still running, stop it and close anyway?",
1368
+ "Warning", wx.YES_NO|wx.ICON_WARNING)
1369
+ response = dialog.ShowModal()
1370
+ if (response == wx.ID_YES):
1371
+ self.combine_process.terminate()
1372
+ self.Destroy()
1373
+ elif (response == wx.ID_NO):
1374
+ # If the EVT_CLOSE came from the OS, let the OS know it didn't succeed
1375
+ if event.GetEventType() == wx.EVT_CLOSE.evtType[0]:
1376
+ event.Veto(True)
1377
+ else:
1378
+ self.Destroy()
1379
+
1380
+ def set_last_line_color(self, color, line_start):
1381
+ num_lines = self.text_ctrl_output.GetNumberOfLines()
1382
+ end = self.text_ctrl_output.GetLastPosition()
1383
+ self.text_ctrl_output.SetStyle(line_start, end, wx.TextAttr("black", color))
1087
1384
 
1088
- def write(self, s: str) -> int:
1089
- self._queue.put(s)
1090
- return len(s)
1385
+ def display_line(self, line):
1386
+ if self.overwrite_last_line:
1387
+ # skip the empty line following lines ending in "\r"
1388
+ if line == "":
1389
+ return
1390
+ num_lines = self.text_ctrl_output.GetNumberOfLines()
1391
+ start = self.text_ctrl_output.XYToPosition(0,num_lines-2)
1392
+ end = self.text_ctrl_output.GetLastPosition()
1393
+ self.text_ctrl_output.Remove(start, end)
1394
+ self.overwrite_last_line = False
1395
+ if line[-1:] == "\r":
1396
+ self.overwrite_last_line = True
1397
+ line = line[:-1].rstrip(' ') + "\r"
1398
+ line_start = self.text_ctrl_output.GetLastPosition()
1399
+ self.text_ctrl_output.AppendText(line)
1400
+ # highlight warnings by changing their background color to light orange
1401
+ if line[:10] == " WARNING:":
1402
+ self.set_last_line_color(wx.Colour(255, 188, 64), line_start)
1403
+ # highlight errors by changing their background color to red
1404
+ if line[:8] == " ERROR:":
1405
+ self.set_last_line_color(wx.Colour(255, 128, 128), line_start)
1406
+
1407
+ def update_gui(self, event=None):
1408
+ lines = []
1409
+ while not self.print_queue.empty():
1410
+ lines.append(self.print_queue.get())
1411
+ if len(lines) > 0:
1412
+ cursor_position = self.text_ctrl_output.GetInsertionPoint()
1413
+ self.text_ctrl_output.Freeze()
1414
+ for line in lines:
1415
+ self.display_line(line)
1416
+ self.text_ctrl_output.SetInsertionPoint(cursor_position)
1417
+ self.text_ctrl_output.Thaw()
1418
+ self.update_timer.StartOnce(gui_update_interval_ms)
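FrameCombine never blocks the event loop waiting on the queue; update_gui drains it on a wx.Timer tick and re-arms the timer with StartOnce so ticks cannot overlap. A reduced sketch of that polling loop (assumes wxPython is installed; the appended text is illustrative):

import wx

class PollingFrame(wx.Frame):
    def __init__(self):
        super().__init__(None, title="polling demo")
        self.text = wx.TextCtrl(self, style=wx.TE_MULTILINE | wx.TE_READONLY)
        self.timer = wx.Timer(self)
        self.Bind(wx.EVT_TIMER, self.on_tick, self.timer)
        self.ticks = 0
        self.on_tick()  # kick off the first poll, as update_gui does

    def on_tick(self, event=None):
        self.ticks += 1
        self.text.AppendText(f"poll #{self.ticks}\n")  # stand-in for draining the queue
        self.timer.StartOnce(100)  # one-shot re-arm, mirroring gui_update_interval_ms

if __name__ == '__main__':
    app = wx.App()
    PollingFrame().Show()
    app.MainLoop()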
1091
1419
 
1092
- def combine_print_exceptions(print_queue, *args, **kwargs):
1093
- writer = QueueWriter(print_queue)
1094
- with redirect_stdout(writer), redirect_stderr(writer):
1420
+ def migrate_config(old_path: Optional[Path], new_path: Path) -> None:
1421
+ """
1422
+ Migrate configuration from old location.
1423
+
1424
+ Only runs if the old_path exists but new_path does not.
1425
+ """
1426
+ if new_path.exists() or not old_path or not old_path.exists():
1427
+ return
1428
+
1429
+ old_data = old_path.read_text(encoding='utf-8')
1430
+ new_path.write_text(old_data, encoding='utf-8')
1431
+ print(f"Configuration migrated to {new_path}")
1095
1432
  try:
1096
- combine(*args, **kwargs)
1097
- except Exception:
1098
- traceback.print_exc()
1433
+ old_path.unlink()
1434
+ except OSError as exc:
1435
+ print("Failed to remove old config:", *traceback.format_exception_only(exc))
1436
+ else:
1437
+ print("Successfully removed old config file.")
1099
1438
 
1100
- def combine_gui(video_files, audio_files, config_path):
1101
- output_textbox = sg.Multiline(size=(80,30), key='-OUTPUT-')
1102
- layout = [[output_textbox],
1103
- [sg.Button('Close', pad=(360,5))]]
1104
- combine_window = sg.Window('Combining - describealign', layout, font=('Arial', 16),
1105
- disable_close=True, finalize=True)
1106
- output_textbox.update('Combining media files:', append=True)
1107
- print_queue = multiprocessing.Queue()
1108
-
1109
- settings = read_config_file(config_path)
1110
- settings.update({'yes':True})
1111
- proc = multiprocessing.Process(target=combine_print_exceptions,
1112
- args=(print_queue, video_files, audio_files),
1113
- kwargs=settings, daemon=True)
1114
- proc.start()
1115
- while True:
1116
- # if the script isn't running anymore, re-enable the default close window button
1117
- if not proc.is_alive():
1118
- combine_window.DisableClose = False
1119
- if not print_queue.empty():
1120
- if IS_RUNNING_WINDOWS:
1121
- cursor_position = output_textbox.WxTextCtrl.GetInsertionPoint()
1122
- output_textbox.update(print_queue.get(), append=True)
1123
- if IS_RUNNING_WINDOWS:
1124
- output_textbox.WxTextCtrl.SetInsertionPoint(cursor_position)
1125
- event, values = combine_window.read(timeout=100)
1126
- # window closed event isn't always emitted, so also manually check window status
1127
- if event == sg.WIN_CLOSED or combine_window.TKrootDestroyed:
1128
- if proc.is_alive():
1129
- proc.terminate()
1130
- break
1131
- if event == 'Close':
1132
- if not proc.is_alive():
1133
- combine_window.DisableClose = False
1134
- break
1135
- selection = sg.PopupYesNo('Combiner is still running, stop it and close anyway?')
1136
- if selection != 'Yes':
1439
+ class ListCtrlDropTarget(wx.FileDropTarget):
1440
+ def __init__(self, list_ctrl, parent_frame):
1441
+ super().__init__()
1442
+ self.list_ctrl = list_ctrl
1443
+ self.parent_frame = parent_frame
1444
+
1445
+ def expand_folders(self, files):
1446
+ expanded_files = []
1447
+ for file in files:
1448
+ if os.path.isdir(file):
1449
+ for dir, subdirs, dir_files in os.walk(file):
1450
+ for dir_file in dir_files:
1451
+ expanded_files.append(os.path.join(dir, dir_file))
1452
+ else:
1453
+ expanded_files.append(file)
1454
+ return expanded_files
1455
+
1456
+ def OnDropFiles(self, x, y, files):
1457
+ files = self.expand_folders(files)
1458
+ valid_file_types = self.parent_frame.list_ctrl_file_types_drop[self.list_ctrl]
1459
+ files = [file for file in files if os.path.splitext(file)[-1][1:] in valid_file_types]
1460
+ self.parent_frame.populate_list_ctrl(self.list_ctrl, natsort.os_sorted(files))
1461
+ return True
1462
+
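OnDropFiles accepts whole folders: expand_folders walks them recursively, and a dropped path survives only if its extension appears in the target's accepted types. The same logic reduced to the stdlib (example extensions; the real allow-lists come from VIDEO_EXTENSIONS and AUDIO_EXTENSIONS):

import os

def expand_and_filter(paths, valid_exts):
    expanded = []
    for path in paths:
        if os.path.isdir(path):
            # recurse into dropped folders, collecting every file inside
            for root, _subdirs, names in os.walk(path):
                expanded.extend(os.path.join(root, name) for name in names)
        else:
            expanded.append(path)
    # keep files whose extension (without the dot) is allowed
    return [p for p in expanded if os.path.splitext(p)[-1][1:] in valid_exts]

print(expand_and_filter(['movie.mp4', 'notes.txt'], {'mp4', 'mkv'}))  # ['movie.mp4']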
1463
+ def get_children(window):
1464
+ children = list(window.GetChildren())
1465
+ subchildren = [subchild for child in children for subchild in get_children(child)]
1466
+ return children + subchildren
1467
+
1468
+ def set_background_color(window, is_dark):
1469
+ children = get_children(window)
1470
+ for window in children + [window]:
1471
+ # modifying a CheckBox converts it into a Button, which would mess with screen readers
1472
+ if isinstance(window, wx.CheckBox):
1137
1473
  continue
1138
- proc.terminate()
1139
- combine_window.DisableClose = False
1140
- break
1141
- combine_window.close()
1474
+ if is_dark:
1475
+ if isinstance(window, (wx.ListCtrl, wx.TextCtrl)):
1476
+ window.SetBackgroundColour("Black")
1477
+ elif isinstance(window, wx.Button):
1478
+ window.SetBackgroundColour(tuple(x // 2 for x in gui_background_color_dark))
1479
+ else:
1480
+ window.SetBackgroundColour(gui_background_color_dark)
1481
+ window.SetForegroundColour("White" if is_dark else "Black")
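set_background_color restyles every descendant by first flattening the widget tree with a recursive GetChildren walk. The traversal itself, demonstrated on a minimal stand-in (any object exposing GetChildren behaves the same):

class Node:
    # stand-in for a wx window; only GetChildren matters to the walk
    def __init__(self, children=()):
        self._children = list(children)
    def GetChildren(self):
        return self._children

def get_children(window):
    children = list(window.GetChildren())
    return children + [sub for child in children for sub in get_children(child)]

root = Node([Node([Node(), Node()]), Node()])
print(len(get_children(root)))  # 4 descendants in total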
1142
1482
 
1143
- def migrate_config(old_path: Optional[Path], new_path: Path) -> None:
1144
- """
1145
- Migrate configuration from old location.
1146
-
1147
- Only runs if the old_path exists but new_path does not
1148
- """
1149
- if new_path.exists() or not old_path or not old_path.exists():
1150
- return
1151
-
1152
- old_data = old_path.read_text(encoding='utf-8')
1153
- new_path.write_text(old_data, encoding='utf-8')
1154
- print(f"Configuration migrated to {new_path}")
1155
- try:
1156
- old_path.unlink()
1157
- except OSError as exc:
1158
- print("Failed to remove old config:", *traceback.format_exception_only(exc))
1159
- else:
1160
- print("Successfully removed old config file.")
1483
+ class FrameMain(wx.Frame):
1484
+ def __init__(self, parent):
1485
+ wx.Frame.__init__(self, parent, title=f"describealign v{__version__}", size=wx.Size(800, 500))
1486
+ # setting the GUI frame's font causes all contained elements to inherit that font by default
1487
+ self.SetFont(wx.Font(*gui_font))
1488
+ appearance = wx.SystemSettings.GetAppearance()
1489
+ self.is_dark = appearance.IsDark() or appearance.IsUsingDarkBackground()
1490
+ self.SetBackgroundColour(gui_background_color_dark if self.is_dark else gui_background_color_light)
1491
+
1492
+ # wrap all widgets within a panel to enable tab traversal (i.e. pressing tab to swap GUI focus)
1493
+ self.panel0 = wx.Panel(self, style=wx.TAB_TRAVERSAL)
1494
+
1495
+ self.text_header = wx.StaticText(self.panel0, label="Select media files to combine:")
1496
+ self.text_header.SetFont(self.text_header.GetFont().Scale(1.7))
1497
+
1498
+ # Video Input selection and display row of GUI
1499
+ self.static_box_sizer_video = wx.StaticBoxSizer(wx.HORIZONTAL, self.panel0, "Video Input")
1500
+ self.list_ctrl_video = self.init_list_ctrl(self.static_box_sizer_video.GetStaticBox(),
1501
+ "Drag and Drop Videos Here or Press Browse Video")
1502
+ set_tooltip(self.list_ctrl_video, "Video filenames are listed here in the sorted order they will " + \
1503
+ "be used as input. Drag and Drop or press Browse to overwrite.")
1504
+ self.button_browse_video = wx.Button(self.static_box_sizer_video.GetStaticBox(), label="Browse Video")
1505
+ set_tooltip(self.button_browse_video, "Select one or more video files as input.")
1506
+ self.button_browse_video.Bind(wx.EVT_BUTTON, lambda event: self.browse_files(self.list_ctrl_video))
1507
+
1508
+ # Audio Input selection and display row of GUI
1509
+ self.static_box_sizer_audio = wx.StaticBoxSizer(wx.HORIZONTAL, self.panel0, "Audio Input")
1510
+ self.list_ctrl_audio = self.init_list_ctrl(self.static_box_sizer_audio.GetStaticBox(),
1511
+ "Drag and Drop Audio Here or Press Browse Audio")
1512
+ set_tooltip(self.list_ctrl_audio, "Audio filenames are listed here in the sorted order they will " + \
1513
+ "be used as input. Drag and Drop or press Browse to overwrite.")
1514
+ self.button_browse_audio = wx.Button(self.static_box_sizer_audio.GetStaticBox(), label="Browse Audio")
1515
+ set_tooltip(self.button_browse_audio, "Select one or more audio files as input.")
1516
+ self.button_browse_audio.Bind(wx.EVT_BUTTON, lambda event: self.browse_files(self.list_ctrl_audio))
1517
+
1518
+ self.button_combine = wx.Button(self.panel0, label="Combine")
1519
+ set_tooltip(self.button_combine, "Combine selected video and audio files.")
1520
+ self.button_combine.Bind(wx.EVT_BUTTON, self.open_combine)
1521
+ self.button_settings = wx.Button(self.panel0, label="Settings")
1522
+ set_tooltip(self.button_settings, "Edit settings for the GUI and algorithm.")
1523
+ self.button_settings.Bind(wx.EVT_BUTTON, self.open_settings)
1524
+
1525
+ sizer_panel_outer = wx.BoxSizer(wx.VERTICAL)
1526
+ sizer_panel_inner = wx.BoxSizer(wx.VERTICAL)
1527
+ sizer_header = wx.BoxSizer(wx.HORIZONTAL)
1528
+ sizer_video = wx.BoxSizer(wx.HORIZONTAL)
1529
+ sizer_audio = wx.BoxSizer(wx.HORIZONTAL)
1530
+ sizer_combine_settings = wx.BoxSizer(wx.HORIZONTAL)
1531
+
1532
+ # Configure layout with nested Box Sizers:
1533
+ #
1534
+ # Frame
1535
+ # sizer_panel_outer
1536
+ # panel0
1537
+ # sizer_panel_inner
1538
+ # sizer_header
1539
+ # text_header
1540
+ # sizer_video
1541
+ # list_ctrl_video
1542
+ # button_browse_video
1543
+ # sizer_audio
1544
+ # list_ctrl_audio
1545
+ # button_browse_audio
1546
+ # sizer_combine_settings
1547
+ # button_combine
1548
+ # button_settings
1549
+ #
1550
+ self.SetSizer(sizer_panel_outer)
1551
+ sizer_panel_outer.Add(self.panel0, 1, wx.EXPAND|wx.ALL, 5)
1552
+ self.panel0.SetSizer(sizer_panel_inner)
1553
+ sizer_panel_inner.Add(sizer_header, 3, wx.EXPAND, 5)
1554
+ sizer_panel_inner.Add(sizer_video, 9, wx.EXPAND, 5)
1555
+ sizer_panel_inner.Add(sizer_audio, 9, wx.TOP|wx.EXPAND, 3)
1556
+ sizer_panel_inner.Add(sizer_combine_settings, 3, wx.EXPAND, 5)
1557
+ sizer_header.Add(self.text_header, 0, wx.ALL|wx.ALIGN_CENTER_VERTICAL, 5)
1558
+ sizer_video.Add(self.static_box_sizer_video, 1, wx.LEFT|wx.RIGHT|wx.EXPAND, 3)
1559
+ self.static_box_sizer_video.Add(self.list_ctrl_video, 1, wx.BOTTOM|wx.EXPAND, 2)
1560
+ self.static_box_sizer_video.Add(self.button_browse_video, 0,
1561
+ wx.LEFT|wx.BOTTOM|wx.RIGHT|wx.ALIGN_CENTER_VERTICAL, 10)
1562
+ sizer_audio.Add(self.static_box_sizer_audio, 1, wx.LEFT|wx.RIGHT|wx.EXPAND, 3)
1563
+ self.static_box_sizer_audio.Add(self.list_ctrl_audio, 1, wx.BOTTOM|wx.EXPAND, 2)
1564
+ self.static_box_sizer_audio.Add(self.button_browse_audio, 0,
1565
+ wx.LEFT|wx.BOTTOM|wx.RIGHT|wx.ALIGN_CENTER_VERTICAL, 10)
1566
+ sizer_combine_settings.Add((0, 0), 7, wx.EXPAND, 5) # spacer
1567
+ sizer_combine_settings.Add(self.button_combine, 0, wx.ALL|wx.ALIGN_CENTER_VERTICAL, 5)
1568
+ sizer_combine_settings.Add((0, 0), 2, wx.EXPAND, 5) # spacer
1569
+ sizer_combine_settings.Add(self.button_settings, 0, wx.ALL|wx.ALIGN_CENTER_VERTICAL, 5)
1570
+ sizer_combine_settings.Add((0, 0), 7, wx.EXPAND, 5) # spacer
1571
+
1572
+ # centers GUI on the screen
1573
+ self.Centre(wx.BOTH)
1574
+
1575
+ all_video_file_types = [('All Video File Types', '*.' + ';*.'.join(VIDEO_EXTENSIONS)),]
1576
+ all_audio_file_types = [('All Audio File Types', '*.' + ';*.'.join(AUDIO_EXTENSIONS)),]
1577
+ all_video_and_audio_file_types = [('All Video and Audio File Types',
1578
+ '*.' + ';*.'.join(VIDEO_EXTENSIONS | AUDIO_EXTENSIONS)),]
1579
+ self.video_file_types = [(ext, f"*.{ext}") for ext in VIDEO_EXTENSIONS]
1580
+ self.audio_file_types = [(ext, f"*.{ext}") for ext in AUDIO_EXTENSIONS]
1581
+ self.video_and_audio_file_types = self.video_file_types + self.audio_file_types
1582
+ self.video_file_types = all_video_file_types + self.video_file_types
1583
+ self.audio_file_types = all_audio_file_types + self.audio_file_types
1584
+ self.video_and_audio_file_types = all_video_file_types + all_video_and_audio_file_types + \
1585
+ self.video_and_audio_file_types
1586
+ self.video_file_types = '|'.join([f'{type[0]} ({type[1]})|{type[1]}' for type in self.video_file_types])
1587
+ self.audio_file_types = '|'.join([f'{type[0]} ({type[1]})|{type[1]}' for type in self.audio_file_types])
1588
+ self.video_and_audio_file_types = '|'.join([f'{type[0]} ({type[1]})|{type[1]}' for type \
1589
+ in self.video_and_audio_file_types])
1590
+
1591
+ # track the allowed file types and selected files' full paths for each List Ctrl
1592
+ self.list_ctrl_file_types_browse = {self.list_ctrl_video: self.video_and_audio_file_types,
1593
+ self.list_ctrl_audio: self.audio_file_types}
1594
+ self.list_ctrl_file_types_drop = {self.list_ctrl_video: self.video_file_types,
1595
+ self.list_ctrl_audio: self.audio_file_types}
1596
+ self.list_ctrl_files_selected = {self.list_ctrl_video: [],
1597
+ self.list_ctrl_audio: []}
1598
+
1599
+ self.config_path = self.get_config()
1600
+
1601
+ set_background_color(self, self.is_dark)
1602
+
1603
+ def init_list_ctrl(self, parent_panel, default_text):
1604
+ list_ctrl = wx.ListCtrl(parent_panel, style=wx.LC_NO_HEADER|wx.LC_REPORT|wx.BORDER_SUNKEN|wx.HSCROLL)
1605
+ list_ctrl.EnableSystemTheme(False) # get rid of vertical grid lines on Windows
1606
+ list_ctrl.SetMinSize(wx.Size(-1,80))
1607
+ list_ctrl.SetDropTarget(ListCtrlDropTarget(list_ctrl, self))
1608
+ list_ctrl.InsertColumn(0, "")
1609
+ list_ctrl.InsertItem(0, default_text)
1610
+ list_ctrl.SetColumnWidth(0, wx.LIST_AUTOSIZE)
1611
+ list_ctrl.Bind(wx.EVT_CHAR, self.delete_from_list_ctrl)
1612
+ return list_ctrl
1613
+
1614
+ def populate_list_ctrl(self, list_ctrl, files):
1615
+ self.list_ctrl_files_selected[list_ctrl] = files
1616
+ if len(files) == 0:
1617
+ files = ["No files with valid file types found"]
1618
+ list_ctrl.DeleteAllItems()
1619
+ list_ctrl.DeleteAllColumns()
1620
+ list_ctrl.InsertColumn(0, "")
1621
+ for i, file in enumerate(files):
1622
+ list_ctrl.InsertItem(i, os.path.basename(file))
1623
+ list_ctrl.SetColumnWidth(0, wx.LIST_AUTOSIZE)
1624
+
1625
+ def browse_files(self, list_ctrl):
1626
+ dialog = wx.FileDialog(self, wildcard=self.list_ctrl_file_types_browse[list_ctrl], style=wx.FD_MULTIPLE)
1627
+ if dialog.ShowModal() == wx.ID_OK:
1628
+ files = dialog.GetPaths()
1629
+ self.populate_list_ctrl(list_ctrl, files)
1630
+
1631
+ def delete_from_list_ctrl(self, event):
1632
+ if event.GetKeyCode() == wx.WXK_DELETE:
1633
+ list_ctrl = event.GetEventObject()
1634
+ item_index = list_ctrl.GetFirstSelected()
1635
+ if item_index == -1:
1636
+ item_index = list_ctrl.GetFocusedItem()
1637
+ items_to_delete = []
1638
+ while item_index != -1:
1639
+ items_to_delete.append(item_index)
1640
+ item_index = list_ctrl.GetNextSelected(item_index)
1641
+ for item_index in items_to_delete[::-1]:
1642
+ if len(self.list_ctrl_files_selected[list_ctrl]) != 0:
1643
+ list_ctrl.DeleteItem(item_index)
1644
+ del self.list_ctrl_files_selected[list_ctrl][item_index]
1645
+ else:
1646
+ event.Skip()
1647
+
1648
+ def open_combine(self, event):
1649
+ video_files = self.list_ctrl_files_selected[self.list_ctrl_video]
1650
+ audio_files = self.list_ctrl_files_selected[self.list_ctrl_audio]
1651
+ if len(video_files) == 0:
1652
+ error_dialog = wx.MessageDialog(self, "Error: no video input selected.", "Error", wx.OK|wx.ICON_ERROR)
1653
+ error_dialog.ShowModal()
1654
+ elif len(audio_files) == 0:
1655
+ error_dialog = wx.MessageDialog(self, "Error: no audio input selected.", "Error", wx.OK|wx.ICON_ERROR)
1656
+ error_dialog.ShowModal()
1657
+ elif len(video_files) != len(audio_files):
1658
+ error_dialog = wx.MessageDialog(self, f"Error: different numbers of video ({len(video_files)}) " + \
1659
+ f"and audio ({len(audio_files)}) inputs.",
1660
+ "Error", wx.OK|wx.ICON_ERROR)
1661
+ error_dialog.ShowModal()
1662
+ else:
1663
+ frame_combine = FrameCombine(None, self.config_path, video_files, audio_files, self.is_dark)
1664
+ self.list_ctrl_video.SetFocus()
1665
+ frame_combine.Show()
1666
+
1667
+ def open_settings(self, event):
1668
+ dialog_settings = DialogSettings(None, self.config_path, self.is_dark)
1669
+ dialog_settings.ShowModal()
1670
+ dialog_settings.Destroy()
1671
+
1672
+ def get_config(self):
1673
+ config_path = platformdirs.user_config_path(appname='describealign', appauthor=False,
1674
+ ensure_exists=True) / 'config.ini'
1675
+ old_paths = [
1676
+ # Place in chronological order (oldest -> newest)
1677
+ Path(__file__).resolve().parent / 'config.ini',
1678
+ platformdirs.user_config_path(appname='describealign', ensure_exists=True) / 'config.ini',
1679
+ ]
1680
+ # Get the newest old path that still exists
1681
+ old_config = next((file for file in reversed(old_paths) if file.exists()), None)
1682
+ try:
1683
+ migrate_config(old_config, config_path)
1684
+ except OSError as exc:
1685
+ print(f"Error migrating old config:", *traceback.format_exception_only(exc))
1686
+ print(f"Old config left in place at {old_config}")
1687
+ return config_path
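The wildcard strings assembled in FrameMain.__init__ follow wx.FileDialog's 'description (pattern)|pattern' convention, with entries joined by '|'. A reduced sketch with two extensions shows the resulting string:

exts = ['mp4', 'mkv']  # example subset; the real code uses VIDEO_EXTENSIONS
file_types = [('All Video File Types', '*.' + ';*.'.join(exts))]
file_types += [(ext, f'*.{ext}') for ext in exts]
wildcard = '|'.join(f'{name} ({pattern})|{pattern}' for name, pattern in file_types)
print(wildcard)
# All Video File Types (*.mp4;*.mkv)|*.mp4;*.mkv|mp4 (*.mp4)|*.mp4|mkv (*.mkv)|*.mkv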
1161
1688
 
1162
- def main_gui():
1163
- config_path = platformdirs.user_config_path(appname='describealign', appauthor=False, ensure_exists=True) / 'config.ini'
1164
- old_paths = [
1165
- # Place in chronological order (oldest -> newest)
1166
- Path(__file__).resolve().parent / 'config.ini',
1167
- platformdirs.user_config_path(appname='describealign', ensure_exists=True) / 'config.ini',
1168
- ]
1169
-
1170
- # Get newest existent path
1171
- old_config = next(
1172
- (
1173
- file
1174
- for file in reversed(old_paths)
1175
- if file.exists()
1176
- ),
1177
- None,
1178
- )
1179
-
1689
+ def get_version_hash(filename):
1180
1690
  try:
1181
- migrate_config(old_config, config_path)
1182
- except OSError as exc:
1183
- print(f"Error migrating old config:", *traceback.format_exception_only(exc))
1184
- print(f"Old config left in place at {old_config}")
1185
-
1186
- sg.theme('Light Blue 2')
1187
-
1188
- filetype_sep = ';' if IS_RUNNING_WINDOWS else ' '
1189
- all_audio_file_types = [('All Audio File Types', '*.' + f'{filetype_sep}*.'.join(AUDIO_EXTENSIONS)),]
1190
- all_video_file_types = [('All Video File Types', '*.' + f'{filetype_sep}*.'.join(VIDEO_EXTENSIONS)),]
1191
- all_video_and_audio_file_types = [('All Video and Audio File Types',
1192
- '*.' + f'{filetype_sep}*.'.join(VIDEO_EXTENSIONS | AUDIO_EXTENSIONS)),]
1193
- audio_file_types = [(ext, f"*.{ext}") for ext in AUDIO_EXTENSIONS]
1194
- video_and_audio_file_types = [(ext, f"*.{ext}") for ext in VIDEO_EXTENSIONS] + audio_file_types
1195
- audio_file_types = all_audio_file_types + audio_file_types
1196
- video_and_audio_file_types = all_video_file_types + all_video_and_audio_file_types + video_and_audio_file_types
1197
- # work around bug in PySimpleGUIWx's convert_tkinter_filetypes_to_wx function
1198
- if IS_RUNNING_WINDOWS:
1199
- file_fix = lambda file_types: file_types[:1] + [(f'|{type[0]}', type[1]) for type in file_types[1:]]
1200
- audio_file_types = file_fix(audio_file_types)
1201
- video_and_audio_file_types = file_fix(video_and_audio_file_types)
1202
-
1203
- layout = [[sg.Text('Select media files to combine:', size=(40, 2), font=('Arial', 20), pad=(3,15))],
1204
- [sg.Column([[sg.Text('Video Input:', size=(11, 2), pad=(1,5)),
1205
- sg.Input(size=(35, 1.2), pad=(10,5), key='-VIDEO_FILES-',
1206
- tooltip='List video filenames here, in order, separated by semicolons'),
1207
- sg.FilesBrowse(button_text="Browse Video",
1208
- file_types=video_and_audio_file_types,
1209
- tooltip='Select one or more video files')]], pad=(2,7))],
1210
- [sg.Column([[sg.Text('Audio Input:', size=(11, 2), pad=(1,5)),
1211
- sg.Input(size=(35, 1.2), pad=(10,5), key='-AUDIO_FILES-',
1212
- tooltip='List audio filenames here, in order, separated by semicolons'),
1213
- sg.FilesBrowse(button_text="Browse Audio",
1214
- file_types=audio_file_types,
1215
- tooltip='Select one or more audio files')]], pad=(2,7))],
1216
- [sg.Column([[sg.Submit('Combine', pad=(40,3), tooltip='Combine selected video and audio files'),
1217
- sg.Button('Settings', tooltip='Edit settings for the GUI and algorithm.')]],
1218
- pad=((135,3),10))]]
1219
- window = sg.Window('describealign', layout, font=('Arial', 16), resizable=False, finalize=True)
1220
- window['-VIDEO_FILES-'].set_focus()
1221
- while True:
1222
- event, values = window.read()
1223
- if event == 'Combine':
1224
- if len(values['-VIDEO_FILES-']) == 0 or \
1225
- len(values['-AUDIO_FILES-']) == 0:
1226
- window.disable()
1227
- sg.Popup('Error: empty input field.', font=('Arial', 20))
1228
- window.enable()
1229
- window['-VIDEO_FILES-'].set_focus()
1230
- continue
1231
- video_files = values['-VIDEO_FILES-'].split(';')
1232
- if len(video_files) == 1:
1233
- video_files = video_files[0]
1234
- audio_files = values['-AUDIO_FILES-'].split(';')
1235
- if len(audio_files) == 1:
1236
- audio_files = audio_files[0]
1237
- window.disable()
1238
- combine_gui(video_files, audio_files, config_path)
1239
- window.enable()
1240
- window['-VIDEO_FILES-'].set_focus()
1241
- if event == 'Settings':
1242
- window.disable()
1243
- settings_gui(config_path)
1244
- window.enable()
1245
- window['-VIDEO_FILES-'].set_focus()
1246
- if event == sg.WIN_CLOSED:
1247
- break
1248
- window.close()
1691
+ with open(filename, 'rb') as f:
1692
+ data = f.read()
1693
+ sha_hash = hashlib.sha1(data).hexdigest()
1694
+ return sha_hash[:8]
1695
+ except OSError:
1696
+ return "None"
1249
1697
 
1250
1698
  # Entry point for command line interaction, for example:
1251
1699
  # > describealign video.mp4 audio_desc.mp3
1252
1700
  def command_line_interface():
1253
1701
  if len(sys.argv) < 2:
1254
- # No args, run gui
1255
- print('No input arguments detected, starting GUI...')
1256
- main_gui()
1257
- sys.exit(0)
1702
+ if wx is not None:
1703
+ # No args, run gui
1704
+ print('No input arguments detected, starting GUI...')
1705
+ # the following line is necessary on macOS to make wx.FileDialog show its file type picker
1706
+ # https://docs.wxpython.org/wx.FileDialog.html#wx-filedialog
1707
+ # https://github.com/wxWidgets/Phoenix/issues/2368
1708
+ if platform.system() == 'Darwin':
1709
+ wx.SystemOptions.SetOption('osx.openfiledialog.always-show-types', 1)
1710
+ app = wx.App()
1711
+ main_gui = FrameMain(None)
1712
+ main_gui.Show()
1713
+ app.MainLoop()
1714
+ sys.exit(0)
1715
+ else:
1716
+ print("Can't launch GUI and arguments missing.\nGUI dependencies missing.")
1258
1717
 
1259
- parser = argparse.ArgumentParser(
1260
- description="Replaces a video's sound with an audio description.",
1261
- usage="describealign video_file.mp4 audio_file.mp3")
1262
- parser.add_argument("video", help='A video file or directory containing video files.', nargs='?', default=None)
1263
- parser.add_argument("audio", help='An audio file or directory containing audio files.', nargs='?', default=None)
1264
- parser.add_argument('--smoothness', type=float, default=50,
1265
- help='Lower values make the alignment more accurate when there are skips ' + \
1266
- '(e.g. describer pauses), but also make it more likely to misalign. ' + \
1267
- 'Default is 50.')
1718
+ parser = argparse.ArgumentParser(description="Replaces a video's sound with an audio description.",
1719
+ usage="describealign video_file.mp4 audio_file.mp3")
1720
+ parser.add_argument("video", help='A video file or directory containing video files.',
1721
+ nargs='?', default=None)
1722
+ parser.add_argument("audio", help='An audio file or directory containing audio files.',
1723
+ nargs='?', default=None)
1268
1724
  parser.add_argument('--stretch_audio', action='store_true',
1269
1725
  help='Stretches the input audio to fit the input video. ' + \
1270
- 'Default is to stretch the video to fit the audio.')
1271
- parser.add_argument('--keep_non_ad', action='store_true',
1272
- help='Tries to only replace segments with audio description. Useful if ' + \
1273
- 'video\'s audio quality is better. Default is to replace all aligned audio. ' + \
1274
- 'Requires --stretch_audio to be set, otherwise does nothing.')
1275
- parser.add_argument('--boost', type=float, default=0,
1276
- help='Boost (or quieten) description volume. Units are decibels (dB), so ' + \
1277
- '-3 makes the describer about 2x quieter, while 3 makes them 2x louder. ' + \
1278
- 'Requires --stretch_audio to be set, otherwise does nothing.')
1279
- parser.add_argument('--ad_detect_sensitivity', type=float, default=.6,
1280
- help='Audio description detection sensitivity ratio. Higher values make ' + \
1281
- '--keep_non_ad more likely to replace aligned audio. Default is 0.6')
1282
- parser.add_argument('--boost_sensitivity', type=float, default=.4,
1283
- help='Higher values make --boost less likely to miss a description, but ' + \
1284
- 'also make it more likely to boost non-description audio. Default is 0.4')
1726
+ 'Default is to stretch the video to fit the audio. ' + \
1727
+ 'Keeps original video audio as secondary tracks. Slower ' + \
1728
+ 'and uses more RAM when enabled, long videos may cause ' + \
1729
+ 'paging or Out of Memory errors on low-RAM systems.')
1285
1730
  parser.add_argument('--yes', action='store_true',
1286
1731
  help='Auto-skips user prompts asking to verify information.')
1287
1732
  parser.add_argument("--prepend", default="ad_", help='Output file name prepend text. Default is "ad_"')
@@ -1292,23 +1737,41 @@ def command_line_interface():
1292
1737
  help='Directory combined output media is saved to. Default is "videos_with_ad"')
1293
1738
  parser.add_argument("--alignment_dir", default=default_alignment_dir,
1294
1739
  help='Directory alignment data and plots are saved to. Default is "alignment_plots"')
1295
- parser.add_argument("--extension", default="copy",
1296
- help='File type of output video (e.g. mkv). When set to "copy", copies the ' + \
1297
- 'file type of the corresponding input video. Default is "copy".')
1298
1740
  parser.add_argument("--install-ffmpeg", action="store_true",
1299
- help="Install the required ffmpeg binaries and then exit. This is meant to be" + \
1741
+ help="Install the required ffmpeg binaries and then exit. This is meant to be " + \
1300
1742
  "run from a privileged installer process (e.g. OS X Installer)")
1743
+ parser.add_argument('--version', action='store_true',
1744
+ help='Checks and prints the installed version of describealign.')
1301
1745
  args = parser.parse_args()
1302
1746
 
1303
- if args.install_ffmpeg:
1747
+ if args.version:
1748
+ print(f"version: {__version__}")
1749
+ if "__compiled__" in globals() or getattr(sys, 'frozen', False):
1750
+ print("running from compiled binary")
1751
+ else:
1752
+ import importlib
1753
+ cur_dir = os.getcwd()
1754
+ if sys.path[0] == cur_dir:
1755
+ # ignore describealign.py in current directory
1756
+ del sys.path[0]
1757
+ installed_spec = importlib.util.find_spec('describealign')
1758
+ sys.path = [cur_dir] + sys.path
1759
+ else:
1760
+ installed_spec = importlib.util.find_spec('describealign')
1761
+ this_script_path = os.path.abspath(__file__)
1762
+ if installed_spec is None or (this_script_path != os.path.abspath(installed_spec.origin)):
1763
+ print("running from downloaded .py file")
1764
+ else:
1765
+ print("running from installed package")
1766
+ print(f"path: {this_script_path}")
1767
+ print(f"content hash: {get_version_hash(this_script_path)}")
1768
+ elif args.install_ffmpeg:
1304
1769
  # Make sure the file is world executable
1305
1770
  os.chmod(get_ffmpeg(), 0o755)
1306
1771
  os.chmod(get_ffprobe(), 0o755)
1307
- elif args.video or args.audio:
1308
- combine(args.video, args.audio, args.smoothness, args.stretch_audio, args.keep_non_ad,
1309
- args.boost, args.ad_detect_sensitivity, args.boost_sensitivity, args.yes,
1310
- args.prepend, args.no_pitch_correction, args.output_dir, args.alignment_dir,
1311
- args.extension)
1772
+ elif args.video and args.audio:
1773
+ combine(args.video, args.audio, args.stretch_audio, args.yes, args.prepend,
1774
+ args.no_pitch_correction, args.output_dir, args.alignment_dir)
1312
1775
  else:
1313
1776
  parser.print_usage()
1314
1777
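The --version branch distinguishes a compiled binary (the __compiled__ global or sys.frozen), a loose describealign.py, and an installed package, the latter two by comparing this file's path with the one importlib resolves. A simplified sketch of that comparison, without the sys.path[0] juggling the real code does to ignore a copy in the current directory:

import importlib.util
import os

spec = importlib.util.find_spec('describealign')
here = os.path.abspath(__file__)
if spec is None or os.path.abspath(spec.origin) != here:
    print('running from a downloaded .py file')
else:
    print('running from the installed package')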