describealign 1.2.0__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {describealign-1.2.0.dist-info → describealign-2.0.1.dist-info}/METADATA +8 -9
- describealign-2.0.1.dist-info/RECORD +7 -0
- {describealign-1.2.0.dist-info → describealign-2.0.1.dist-info}/WHEEL +1 -1
- describealign.py +1505 -1042
- describealign-1.2.0.dist-info/RECORD +0 -7
- {describealign-1.2.0.dist-info → describealign-2.0.1.dist-info}/entry_points.txt +0 -0
- {describealign-1.2.0.dist-info → describealign-2.0.1.dist-info}/licenses/LICENSE +0 -0
- {describealign-1.2.0.dist-info → describealign-2.0.1.dist-info}/top_level.txt +0 -0
describealign.py
CHANGED
(Reconstructed view: "- …" marks removed lines whose content is not preserved here; a trailing "…" marks a line truncated in this view.)

@@ -1,11 +1,9 @@
+__version__ = '2.0.1'
+
 # combines videos with matching audio files (e.g. audio descriptions)
 # input: video or folder of videos and an audio file or folder of audio files
 # output: videos in a folder "videos_with_ad", with aligned segments of the audio replaced
 # this script aligns the new audio to the video using the video's old audio
-# first, the video's sound and the audio file are both converted to spectrograms
-# second, the two spectrograms are roughly aligned by finding their longest common subsequence
-# third, the rough alignment is denoised through L1-Minimization
-# fourth, the spectrogram alignments determine where the new audio replaces the old
 
 '''
 Copyright (C) 2023 Julian Brown
@@ -28,26 +26,14 @@ VIDEO_EXTENSIONS = set(['mp4', 'mkv', 'avi', 'mov', 'webm', 'm4v', 'flv', 'vob']
 AUDIO_EXTENSIONS = set(['mp3', 'm4a', 'opus', 'wav', 'aac', 'flac', 'ac3', 'mka'])
 PLOT_ALIGNMENT_TO_FILE = True
 
- …
+TIMESTEPS_PER_SECOND = 10 # factors must be subset of (2, 3, 5, 7)
+TIMESTEP_SIZE_SECONDS = 1. / TIMESTEPS_PER_SECOND
 AUDIO_SAMPLE_RATE = 44100
-
-DITHER_PERIOD_STEPS = 60
-MIN_CORR_FOR_TOKEN_MATCH = .6
-GAP_START_COST = 1.0
-GAP_EXTEND_COST = -.01
-GAP_EXTEND_DIAG_BONUS = -.01
-SKIP_MATCH_COST = .1
+DITHER_PERIOD_STEPS = 10
 MAX_RATE_RATIO_DIFF_ALIGN = .1
-PREF_CUT_AT_GAPS_FACTOR = 5
 MIN_DURATION_TO_REPLACE_SECONDS = 2
-MIN_START_END_SYNC_TIME_SECONDS = 2
-MAX_START_END_SYNC_ERR_SECONDS = .2
-MAX_RATE_RATIO_DIFF_BOOST = .003
-MIN_DESC_DURATION = .5
-MAX_GAP_IN_DESC_SEC = 1.5
 JUST_NOTICEABLE_DIFF_IN_FREQ_RATIO = .005
-
+MIN_STRETCH_OFFSET = 30
 
 if PLOT_ALIGNMENT_TO_FILE:
   import matplotlib.pyplot as plt
@@ -64,42 +50,41 @@ import numpy as np
 import ffmpeg
 import platformdirs
 import static_ffmpeg
-import python_speech_features as psf
 import scipy.signal
 import scipy.optimize
 import scipy.interpolate
-import scipy.ndimage as nd
 import scipy.sparse
-import pytsmod
 import configparser
 import traceback
 import multiprocessing
 import platform
+import natsort
+from collections import defaultdict
+from sortedcontainers import SortedList
+import hashlib
+
+try:
+  import wx
+  gui_font = (11, wx.FONTFAMILY_SWISS, wx.FONTSTYLE_NORMAL, wx.FONTWEIGHT_NORMAL, False, "Arial")
+except ImportError:
+  wx = None
+
+gui_update_interval_ms = 100
+gui_background_color_dark = (28, 30, 35)
+gui_background_color_light = (170, 182, 211)
 
 IS_RUNNING_WINDOWS = platform.system() == 'Windows'
 if IS_RUNNING_WINDOWS:
-  import PySimpleGUIWx as sg
   default_output_dir = 'videos_with_ad'
   default_alignment_dir = 'alignment_plots'
 else:
-  import PySimpleGUIQt as sg
   default_output_dir = os.path.expanduser('~') + '/videos_with_ad'
   default_alignment_dir = os.path.expanduser('~') + '/alignment_plots'
 
-def …
-  if func:
-    func(text)
-  print(text)
-
-def throw_runtime_error(text, func=None):
-  if func:
-    func(text)
-  raise RuntimeError(text)
-
-def ensure_folders_exist(dirs, display_func=None):
+def ensure_folders_exist(dirs):
   for dir in dirs:
     if not os.path.isdir(dir):
- …
+      print(f"Directory not found, creating it: {dir}")
       os.makedirs(dir)
 
 def get_sorted_filenames(path, extensions, alt_extensions=set([])):
@@ -127,347 +112,59 @@ def get_sorted_filenames(path, extensions, alt_extensions=set([])
                  "Or maybe you need to add a new extension to this script's regex?",
                  f"valid extensions for this input are:\n {extensions}"]
     raise RuntimeError("\n".join(error_msg))
-  files = …
-
-  return files, …
-
-# read audio from file with ffmpeg and convert to numpy array
-def parse_audio_from_file(media_file):
-  media_stream, _ = (ffmpeg
-                     .input(media_file)
-                     .output('-', format='s16le', acodec='pcm_s16le', ac=2, ar=AUDIO_SAMPLE_RATE, loglevel='fatal')
-                     .run(capture_stdout=True, cmd=get_ffmpeg())
-                     )
-  media_arr = np.frombuffer(media_stream, np.int16).astype(np.float32).reshape((-1,2)).T
-  return media_arr
-
-# tokenize audio by transforming with a mel-frequency cepstrum (MFC)
-def tokenize_audio(media_arr, rate=1):
-  step_size_samples = psf.sigproc.round_half_up(TIMESTEP_SIZE_SECONDS * rate * AUDIO_SAMPLE_RATE)
-  window_size_seconds = TIMESTEP_SIZE_SECONDS / TIMESTEP_OVERLAP_RATIO
-  window_size_samples = psf.sigproc.round_half_up(window_size_seconds * AUDIO_SAMPLE_RATE)
-  fft_size_samples = 2**int(np.ceil(np.log2(window_size_samples)))
-  get_mfcc = lambda arr: psf.mfcc(np.mean(arr, axis=0),
-                                  samplerate=AUDIO_SAMPLE_RATE,
-                                  winlen=window_size_seconds,
-                                  winstep=TIMESTEP_SIZE_SECONDS * rate,
-                                  numcep=MEL_COEFFS_PER_TIMESTEP,
-                                  nfilt=MEL_COEFFS_PER_TIMESTEP * 2,
-                                  nfft=fft_size_samples,
-                                  winfunc=scipy.signal.windows.hann)
-  num_timesteps = max(1, ((media_arr.shape[1] - window_size_samples - 1) // step_size_samples) + 2)
-  media_spec = np.zeros((num_timesteps, MEL_COEFFS_PER_TIMESTEP))
-  chunk_size = 1000
-  for chunk_index in np.arange(0, num_timesteps, chunk_size):
-    chunk_bounds_samples = ((chunk_index                 ) * step_size_samples,
-                            (chunk_index + chunk_size - 1) * step_size_samples + window_size_samples)
-    media_spec[chunk_index:chunk_index+chunk_size] = get_mfcc(media_arr[:,slice(*chunk_bounds_samples)])
-  '''
-  # alternate python library's MFC implementation
-  import librosa
-  media_spec = librosa.feature.mfcc(y=np.mean(media_arr, axis=0),
-                                    sr=AUDIO_SAMPLE_RATE,
-                                    n_mfcc=MEL_COEFFS_PER_TIMESTEP,
-                                    lifter=22,
-                                    n_fft=fft_size_samples,
-                                    hop_length=step_size_samples,
-                                    win_length=window_size_samples,
-                                    window=scipy.signal.windows.hann).T
-  num_timesteps = media_spec.shape[0]
-  '''
-  timings_samples = window_size_samples/2. + step_size_samples * np.arange(num_timesteps)
-  timings_seconds = timings_samples / AUDIO_SAMPLE_RATE
-  return media_spec, timings_seconds
-
-# same as tokenize_audio, but dithering the MFC window timings
-# this allows for finer alignment by ameliorating discretization error
-def tokenize_audio_dither(media_arr, slow_timings):
-  # choose a relative step size slightly less than 1 to ameliorate quantization error
-  # maximize alignment accuracy by using least approximable number with desired period
-  # this is the continued fraction [0;1,N-2,1,1,1,...], where the trailing ones give phi
-  fast_rate = 1. / (1 + 1. / (DITHER_PERIOD_STEPS - 2 + (np.sqrt(5) + 1) / 2.))
-  fast_spec, fast_timings = tokenize_audio(media_arr, fast_rate)
-
-  # prevent drift in difficult to align segments (e.g. describer speaking or quiet/droning segments)
-  # by approximately equalizing the number of tokens per unit time between dithered and undithered
-  # the dithered audio will have ~(1 + 1 / DITHER_PERIOD_STEPS) times as many tokens, so
-  # this can be accomplished by simply deleting a token every DITHER_PERIOD_STEPS tokens
-  fast_spec = np.delete(fast_spec, slice(DITHER_PERIOD_STEPS // 2, None, DITHER_PERIOD_STEPS), axis=0)
-  fast_timings = np.delete(fast_timings, slice(DITHER_PERIOD_STEPS // 2, None, DITHER_PERIOD_STEPS))
-  return fast_spec, fast_timings
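
Aside: a minimal sketch (not code from either version) of what the removed dither rate works out to. With the 1.2.0 value DITHER_PERIOD_STEPS = 60, fast_rate sits just below 1, so the dithered pass yields roughly one extra token per period, which the np.delete calls above then strip back out:

    import numpy as np

    DITHER_PERIOD_STEPS = 60  # the 1.2.0 value; 2.0.1 lowers it to 10
    # continued fraction [0;1,N-2,1,1,1,...]: the trailing ones contribute the golden ratio
    fast_rate = 1. / (1 + 1. / (DITHER_PERIOD_STEPS - 2 + (np.sqrt(5) + 1) / 2.))
    print(fast_rate)            # ~0.9835, slightly under 1
    print(1. / fast_rate - 1.)  # ~1/59.6, about one extra token per DITHER_PERIOD_STEPS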
-
-# normalize along both time and frequency axes to allow comparing tokens by correlation
-def normalize_spec(media_spec_raw, axes=(0,1)):
-  media_spec = media_spec_raw.copy()
-  for axis in axes:
-    norm_func = np.std if axis == 0 else np.linalg.norm
-    media_spec = media_spec - np.mean(media_spec, axis=axis, keepdims=True)
-    media_spec = media_spec/(norm_func(media_spec,axis=axis,keepdims=True)+1e-10)
-  return media_spec
-
-# vectorized implementation of the Wagner–Fischer (Longest Common Subsequence) algorithm
-# modified to include affine gap penalties and skip+match options (i.e. knight's moves)
-# gaps are necessary when parts are cut out of the audio description (e.g. cut credits)
-# or when the audio description includes a commercial break or an extra scene
-# the skip+match option allows for micro-adjustments without eating the full gap penalty
-# skip+match is primarily useful in maintaining alignment when the rates differ slightly
-def rough_align(video_spec, audio_desc_spec, video_timings, audio_desc_timings):
-  pred_map = {0:lambda node: (0, node[1]-1, node[2]-1),
-              1:lambda node: (0, node[1]-2, node[2]-1),
-              2:lambda node: (0, node[1]-1, node[2]-2),
-              3:lambda node: (1, node[1]-1, node[2]-1),
-              4:lambda node: (0, node[1]  , node[2]  ),
-              5:lambda node: (1, node[1]-1, node[2]  ),
-              6:lambda node: (1, node[1]-1, node[2]-1),
-              7:lambda node: (1, node[1]  , node[2]-1)}
-  pred_matrix = np.zeros((2, audio_desc_spec.shape[0], video_spec.shape[0]), dtype=np.uint8)
-  pred_matrix[0,1:,:2] = 0
-  pred_matrix[1,1:,:2] = 4
-  pred_matrix[:,0,:2] = [0,5]
-  path_corrs_match = np.zeros((3, video_spec.shape[0]))
-  path_corrs_gap = np.zeros((3, video_spec.shape[0]))
-  corrs = np.zeros((3, video_spec.shape[0]))
-  corrs[:,:] = np.roll(np.dot(video_spec, audio_desc_spec[0]), 1)[None,:]
-  for i in range(audio_desc_spec.shape[0]):
-    i_mod = i % 3
-    match_pred_corrs = np.hstack([path_corrs_match[i_mod-1][1:-1][:,None],
-                                  path_corrs_match[i_mod-2][1:-1][:,None] - SKIP_MATCH_COST,
-                                  path_corrs_match[i_mod-1][0:-2][:,None] - SKIP_MATCH_COST,
-                                  path_corrs_gap[  i_mod-1][1:-1][:,None]])
-    pred_matrix[0][i][2:] = np.argmax(match_pred_corrs, axis=1)
-    path_corrs_match[i_mod][2:] = np.take_along_axis(match_pred_corrs, pred_matrix[0][i][2:,None], axis=1).T
-    corrs = np.roll(corrs, -1, axis=1)
-    corrs[(i_mod+1)%3,:] = np.roll(np.dot(video_spec, audio_desc_spec[min(audio_desc_spec.shape[0]-1,i+1)]), 1)
-    fisher_infos = (2 * corrs[i_mod] - corrs[i_mod-1] - corrs[(i_mod+1)%3]) / min(.2, TIMESTEP_SIZE_SECONDS)
-    fisher_infos[fisher_infos < 0] = 0
-    fisher_infos[fisher_infos > 10] = 10
-    row_corrs = np.maximum(0, corrs[i_mod][2:] - MIN_CORR_FOR_TOKEN_MATCH)
-    path_corrs_match[i_mod][2:] += row_corrs * (fisher_infos[2:] / 5)
-    gap_pred_corrs = np.hstack([path_corrs_match[i_mod][2:  ][:,None] - GAP_START_COST,
-                                path_corrs_gap[i_mod-1][2:  ][:,None],
-                                path_corrs_gap[i_mod-1][1:-1][:,None] - GAP_EXTEND_DIAG_BONUS - \
-                                                                        GAP_EXTEND_COST])
-    pred_matrix[1][i][2:] = np.argmax(gap_pred_corrs, axis=1)
-    path_corrs_gap_no_col_skip = np.take_along_axis(gap_pred_corrs, pred_matrix[1][i][2:,None], axis=1).flat
-    pred_matrix[1][i][2:] += 4
-    path_corrs_gap[i_mod][2:] = np.maximum.accumulate(path_corrs_gap_no_col_skip + \
-                                                      GAP_EXTEND_COST * np.arange(video_spec.shape[0]-2)) - \
-                                GAP_EXTEND_COST * np.arange(video_spec.shape[0]-2)
-    pred_matrix[1][i][2:][path_corrs_gap[i_mod][2:] > path_corrs_gap_no_col_skip] = 7
-    path_corrs_gap[i_mod][2:] -= GAP_EXTEND_COST
-
-  # reconstruct optimal path by following predecessors backwards through the table
-  end_node_layer = np.argmax([path_corrs_match[i_mod,-1],
-                              path_corrs_gap[  i_mod,-1]])
-  cur_node = (end_node_layer, audio_desc_spec.shape[0]-1, video_spec.shape[0]-1)
-  get_predecessor = lambda node: pred_map[pred_matrix[node]](node)
-  path = []
-  visited = set()
-  while min(cur_node[1:]) >= 0:
-    cur_node, last_node = get_predecessor(cur_node), cur_node
-    # failsafe to prevent an infinite loop that should never happen anyways
-    if cur_node in visited:
-      break
-    visited.add(cur_node)
-    if last_node[0] == 0:
-      path.append(last_node[1:])
-  path = path[::-1]
-
-  # determine how much information this node gives about the alignment
-  # a larger double derivative means more precise timing information
-  # sudden noises give more timing information than droning sounds
-  def get_fisher_info(node):
-    i,j = node
-    if node[0] >= audio_desc_spec.shape[0]-1 or \
-       node[1] >= video_spec.shape[0]-1 or \
-       min(node) <= 0:
-      return 0
-    info = 2*np.dot(audio_desc_spec[i  ],video_spec[j  ]) - \
-             np.dot(audio_desc_spec[i-1],video_spec[j+1]) - \
-             np.dot(audio_desc_spec[i+1],video_spec[j-1])
-    info /= min(.2, TIMESTEP_SIZE_SECONDS)
-    return info
-
-  # the quality of a node combines the correlation of its tokens
-  # with how precisely the match is localized in time
-  def get_match_quality(node):
-    # correlations are between -1 and 1, as all tokens have unit norm
-    token_correlation = np.dot(audio_desc_spec[node[0]],video_spec[node[1]])
-    fisher_info = min(max(0, get_fisher_info(node)), 10)
-    return max(0, token_correlation - MIN_CORR_FOR_TOKEN_MATCH) * (fisher_info / 5)
-
-  # filter out low match quality nodes from LCS path
-  quals = [get_match_quality(node) for node in path]
-  if len(quals) == 0 or max(quals) <= 0:
-    raise RuntimeError("Rough alignment failed, are the input files mismatched?")
-  path, quals = zip(*[(path, qual) for (path, qual) in zip(path, quals) if qual > 0])
-
-  # convert units of path nodes from timesteps to seconds
-  path = [(audio_desc_timings[i], video_timings[j]) for (i,j) in path]
-
-  return path, quals
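
Aside: a sketch (not from the package) of the discrete second difference the removed rough_align uses as its Fisher-information proxy. A sharply peaked correlation (a sudden noise) scores high, while a flat one (a droning sound) scores near zero:

    import numpy as np

    # correlations at timestep offsets -1, 0, +1 around a candidate match
    sharp = np.array([0.1, 0.9, 0.1])
    flat = np.array([0.8, 0.9, 0.8])
    curvature = lambda c: 2 * c[1] - c[0] - c[2]  # same form as fisher_infos above
    print(curvature(sharp))  # 1.6 -> match is precisely localized in time
    print(curvature(flat))   # 0.2 -> match carries little timing information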
+  files = natsort.os_sorted(files)
+  has_alt_extensions = [0 if os.path.splitext(file)[1][1:] in extensions else 1 for file in files]
+  return files, has_alt_extensions
 
-# …
- …
-  clips = list(zip(breaks[:-1], breaks[1:]))
-  return clips, median_slope, slopes
+# ffmpeg command error handler
+def run_ffmpeg_command(command, err_msg):
+  try:
+    return command.run(capture_stdout=True, capture_stderr=True, cmd=get_ffmpeg())
+  except ffmpeg.Error as e:
+    print(" ERROR: ffmpeg failed to " + err_msg)
+    print("FFmpeg error:")
+    print(e.stderr.decode('utf-8'))
+    raise
 
- …
-  # stretch the x axis to make all slopes "cost" nearly the same
-  # without this, small changes to the slope at slope = +/-1
-  # cost sqrt(2) times as much as small changes at slope = 0
-  # by stretching, we limit the range of slopes to within +/- 1/x_stretch_factor
-  # the small angle approximation means these slopes all cost roughly the same
-  x_stretch_factor = 10.
-  rotated_stretched_path = [(x_stretch_factor*x,y) for x,y in rotated_path]
-
-  # L1-Minimization to solve the alignment problem using a linear program
-  # the absolute value functions needed for "absolute error" can be represented
-  # in a linear program by splitting variables into positive and negative pieces
-  # and constraining each to be positive (done by default in scipy's linprog)
-  # x is fit_err_pos, fit_err_neg, slope_change_pos, slope_change_neg
-  # fit_err[i] = path[i][1] - y_fit[i]
-  # slope_change[i] = (y_fit[i+2] - y_fit[i+1])/(path[i+2][0] - path[i+1][0]) - \
-  #                   (y_fit[i+1] - y_fit[i  ])/(path[i+1][0] - path[i  ][0])
-  # this can be rewritten in terms of fit_err by re-arranging the 1st equation:
-  # y_fit[i] = path[i][1] - fit_err[i]
-  # this gives:
-  # slope_change[i] = path_half[i] - fit_err_half[i]
-  # where each half is just the original equation but y_fit is swapped out
-  # the slope_change variables can then be set using equality constraints
-  num_fit_points = len(rotated_stretched_path)
-  x,y = [np.array(arr) for arr in zip(*rotated_stretched_path)]
-  x_diffs = np.diff(x, prepend=[-10**10], append=[10**10])
-  y_diffs = np.diff(y, prepend=[  0    ], append=[  0   ])
-  slope_change_magnitudes = np.abs(np.diff(y_diffs/x_diffs)) * x_stretch_factor
-  slope_change_locations = (slope_change_magnitudes > MAX_RATE_RATIO_DIFF_ALIGN)
-  slope_change_locations[1:-1] *= (np.abs(y[2:] - y[:-2]) > 5)
-  slope_change_costs = np.full(num_fit_points, smoothness / float(TIMESTEP_SIZE_SECONDS))
-  slope_change_costs[slope_change_locations] /= PREF_CUT_AT_GAPS_FACTOR
-  c = np.hstack([quals,
-                 quals,
-                 slope_change_costs * x_stretch_factor,
-                 slope_change_costs * x_stretch_factor])
-  fit_err_coeffs = scipy.sparse.diags([ 1. / x_diffs[:-1],
-                                       -1. / x_diffs[:-1] - 1. / x_diffs[1:],
-                                        1. / x_diffs[1:]],
-                                      offsets=[0,1,2],
-                                      shape=(num_fit_points, num_fit_points + 2)).tocsc()[:,1:-1]
-  A_eq = scipy.sparse.hstack([ fit_err_coeffs,
-                              -fit_err_coeffs,
-                               scipy.sparse.eye(num_fit_points),
-                              -scipy.sparse.eye(num_fit_points)])
-  b_eq = y_diffs[1:  ] / x_diffs[1:  ] - \
-         y_diffs[ :-1] / x_diffs[ :-1]
-  fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq, method='highs-ds')
-  # if dual simplex solver encounters numerical problems, retry with interior point solver
-  if not fit.success and fit.status == 4:
-    fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq, method='highs-ipm')
-  if not fit.success:
-    print(fit)
-    raise RuntimeError("Smooth Alignment L1-Min Optimization Failed!")
-
-  # combine fit_err_pos and fit_err_neg
-  fit_err = fit.x[:num_fit_points] - fit.x[num_fit_points:2*num_fit_points]
-
-  # subtract fit errors from nodes to retrieve the smooth fit's coordinates
-  # also, unstretch x axis and rotate basis back, reversing the affine pre-processing
-  smooth_path = [(((x / x_stretch_factor) - y) / 2.,
-                  ((x / x_stretch_factor) + y) / 2.) for x,y in zip(x, y - fit_err)]
-
-  # clip off start/end of replacement audio if it doesn't match or isn't aligned
-  # without this, describer intro/outro skips can cause mismatches at the start/end
-  # the problem would be localized and just means audio might not match video at the start/end
-  # instead we just keep the original video's audio in those segments if mismatches are detected
-  # if instead the first few or last few nodes are well-aligned, that edge is marked as synced
-  # during audio replacement, synced edges will be extended backwards/forwards as far as possible
-  # this is useful when the describer begins talking immediately (or before any alignable audio)
-  # or when the describer continues speaking until the end (or no more alignable audio remains)
-  # otherwise, the mismatch would result in the describer's voice not replacing audio in that part
-  max_sync_err = MAX_START_END_SYNC_ERR_SECONDS
-  smoothing_std = MIN_START_END_SYNC_TIME_SECONDS / (2. * TIMESTEP_SIZE_SECONDS)
-  smoothed_fit_err = nd.gaussian_filter(np.abs(fit_err), sigma=smoothing_std)
-  smooth_err_path = zip(smoothed_fit_err, smooth_path)
-  old_length = num_fit_points
-  smooth_err_path = list(itertools.dropwhile(lambda x: x[0] > max_sync_err, smooth_err_path))[::-1]
-  is_synced_at_start = len(smooth_err_path) == old_length
-  old_length = len(smooth_err_path)
-  smooth_err_path = list(itertools.dropwhile(lambda x: x[0] > max_sync_err, smooth_err_path))[::-1]
-  is_synced_at_end = len(smooth_err_path) == old_length
-  _, smooth_path = zip(*smooth_err_path)
-  smooth_path = list(smooth_path)
-  if is_synced_at_start:
-    slope = (smooth_path[1][1] - smooth_path[0][1]) / (smooth_path[1][0] - smooth_path[0][0])
-    smooth_path.insert(0, (-10e10, -10e10 * slope))
-  if is_synced_at_end:
-    slope = (smooth_path[-1][1] - smooth_path[-2][1]) / (smooth_path[-1][0] - smooth_path[-2][0])
-    smooth_path.append((10e10, 10e10 * slope))
-
-  clips, median_slope, slopes = chunk_path(smooth_path, tol=1e-7)
-
-  # assemble clips with slopes within the rate tolerance into runs
-  runs, run = [], []
-  bad_clips = []
-  for clip in clips:
-    if np.abs(median_slope-slopes[clip[0]]) > MAX_RATE_RATIO_DIFF_ALIGN:
-      if len(run) > 0:
-        runs.append(run)
-        run = []
-      bad_clips.append(clip)
-      continue
-    run.append(clip)
-  if len(run) > 0:
-    runs.append(run)
-
-  return smooth_path, runs, bad_clips, clips
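
Aside: a minimal sketch (not from the package) of the variable-splitting trick the removed comments describe. An L1 objective becomes linear by writing each error as err_pos - err_neg with both parts nonnegative, which is how the removed code builds its c vector and A_eq matrix. Fitting a constant this way recovers the median, hence the robustness to outliers:

    import numpy as np
    import scipy.optimize

    y = np.array([1., 2., 10.])  # data with one outlier
    # variables: [level, err_pos(3), err_neg(3)], err = err_pos - err_neg
    c = np.concatenate(([0.], np.ones(6)))  # minimize sum of |err|
    A_eq = np.hstack([np.ones((3, 1)), np.eye(3), -np.eye(3)])  # level + err_pos - err_neg = y
    bounds = [(None, None)] + [(0, None)] * 6
    fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=y, bounds=bounds, method='highs')
    print(fit.x[0])  # 2.0: the L1-optimal constant is the median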
+def run_async_ffmpeg_command(command, media_arr, err_msg):
+  try:
+    ffmpeg_caller = command.run_async(pipe_stdin=True, quiet=True, cmd=get_ffmpeg())
+    out, err = ffmpeg_caller.communicate(media_arr.astype(np.int16).T.tobytes())
+    if len(err) > 0:
+      print(" ERROR: ffmpeg failed to " + err_msg)
+      print("FFmpeg error:")
+      print(err.decode('utf-8'))
+      raise ChildProcessError('FFmpeg error.')
+  except ffmpeg.Error as e:
+    print(" ERROR: ffmpeg failed to " + err_msg)
+    print("FFmpeg error:")
+    print(e.stderr.decode('utf-8'))
+    raise
 
-# …
- …
-  audio_runtime = (audio_desc_arr.shape[1] - 2.) / AUDIO_SAMPLE_RATE
-  slope = smooth_path[-1][1] / smooth_path[-1][0]
-  new_end_point = (audio_runtime, smooth_path[-2][1] + (audio_runtime - smooth_path[-2][0]) * slope)
-  if new_end_point[1] > video_runtime:
-    new_end_point = (smooth_path[-2][0] + (video_runtime - smooth_path[-2][1]) / slope, video_runtime)
-  smooth_path[-1] = new_end_point
+# read audio from file with ffmpeg and convert to numpy array
+def parse_audio_from_file(media_file, num_channels=2):
+  # retrieve only the first audio track, injecting silence/trimming to force timestamps to match up
+  # for example, when the video starts before the audio this fills that starting gap with silence
+  ffmpeg_command = ffmpeg.input(media_file).output('-', format='s16le', acodec='pcm_s16le',
+                                                   af='aresample=async=1:first_pts=0', map='0:a:0',
+                                                   ac=num_channels, ar=AUDIO_SAMPLE_RATE, loglevel='error')
+  media_stream, _ = run_ffmpeg_command(ffmpeg_command, f"parse audio from input file: {media_file}")
+  # media_arr = np.frombuffer(media_stream, np.int16).astype(np.float32).reshape((-1, num_channels)).T
+  media_arr = np.frombuffer(media_stream, np.int16).astype(np.float16).reshape((-1, num_channels)).T
+  return media_arr
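
Aside: assuming ffmpeg-python's usual keyword-to-flag mapping, the new extraction command corresponds roughly to this CLI call (a sketch, not captured output); the aresample filter pads or trims the first audio track so its timestamps start at zero:

    ffmpeg -i INPUT -map 0:a:0 -af aresample=async=1:first_pts=0 \
           -f s16le -acodec pcm_s16le -ac 2 -ar 44100 -loglevel error -

The commented-out float32 line kept alongside the new float16 one suggests the dtype change is a deliberate memory-halving trade-off for decoded audio.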
 
- …
+def plot_alignment(plot_filename_no_ext, path, audio_times, video_times, similarity_percent,
+                   median_slope, stretch_audio, no_pitch_correction):
+  downsample = 20
+  path = path[::downsample]
+  video_times_full, audio_times_full, cluster_indices, quals, cum_quals = path.T
   scatter_color = [.2,.4,.8]
   lcs_rgba = np.zeros((len(quals),4))
   lcs_rgba[:,:3] = np.array(scatter_color)[None,:]
-  lcs_rgba[:,3] = np.…
-
+  lcs_rgba[:,3] = np.clip(quals * 400. / len(quals), 0, 1)
+  audio_offsets = audio_times_full - video_times_full
+  plt.switch_backend('Agg')
+  plt.scatter(video_times_full / 60., audio_offsets, s=3, c=lcs_rgba, label='Matches')
   audio_offsets = audio_times - video_times
   def expand_limits(start, end, ratio=.01):
     average = (end + start) / 2.
@@ -475,63 +172,59 @@ def plot_alignment(plot_filename_no_ext, path, smooth_path, quals, runs,
     half_diff *= (1 + ratio)
     return (average - half_diff, average + half_diff)
   plt.xlim(expand_limits(*(0, np.max(video_times) / 60.)))
-  plt.ylim(expand_limits(*(np.min(audio_offsets) - …
- …
-  audio_times, video_times = np.array(smooth_path).T.reshape((2,-1))
-  audio_offsets = audio_times - video_times
-  if ad_timings is None:
+  plt.ylim(expand_limits(*(np.min(audio_offsets) - 10 * TIMESTEP_SIZE_SECONDS,
+                           np.max(audio_offsets) + 10 * TIMESTEP_SIZE_SECONDS), .05))
+  if stretch_audio:
     plt.plot(video_times / 60., audio_offsets, 'r-', lw=.5, label='Replaced Audio')
- …
+    audio_times_unreplaced = []
+    video_times_unreplaced = []
+    for i in range(len(video_times) - 1):
+      slope = (audio_times[i+1] - audio_times[i]) / (video_times[i+1] - video_times[i])
+      if abs(1 - slope) > MAX_RATE_RATIO_DIFF_ALIGN:
+        video_times_unreplaced.extend(video_times[i:i+2])
+        audio_times_unreplaced.extend(audio_times[i:i+2])
+        video_times_unreplaced.append(video_times[i+1])
+        audio_times_unreplaced.append(np.nan)
+    if len(video_times_unreplaced) > 0:
+      video_times_unreplaced = np.array(video_times_unreplaced)
+      audio_times_unreplaced = np.array(audio_times_unreplaced)
+      audio_offsets = audio_times_unreplaced - video_times_unreplaced
+      plt.plot(video_times_unreplaced / 60., audio_offsets, 'c-', lw=1, label='Original Audio')
   else:
- …
-    plt.plot(video_times / 60., audio_offsets, 'c-', lw=.5, label='Original Audio')
-    video_times = ad_timings
-    audio_offsets = interp(ad_timings)
-    if len(audio_offsets) > 0:
-      plt.plot(video_times / 60., audio_offsets, 'r-', lw=1, label='Replaced Audio')
-  plt.xlabel('Video Time (minutes)')
-  plt.ylabel('Audio Description Offset (seconds)')
+    plt.plot(video_times / 60., audio_offsets, 'r-', lw=1, label='Combined Media')
+  plt.xlabel('Original Video Time (minutes)')
+  plt.ylabel('Original Audio Description Offset (seconds behind video)')
   plt.title(f"Alignment - Media Similarity {similarity_percent:.2f}%")
   plt.legend().legend_handles[0].set_color(scatter_color)
   plt.tight_layout()
   plt.savefig(plot_filename_no_ext + '.png', dpi=400)
   plt.clf()
-
   with open(plot_filename_no_ext + '.txt', 'w') as file:
- …
+    parameters = {'stretch_audio':stretch_audio, 'no_pitch_correction':no_pitch_correction}
+    print(f"Parameters: {parameters}", file=file)
+    this_script_path = os.path.abspath(__file__)
+    print(f"Version Hash: {get_version_hash(this_script_path)}", file=file)
+    video_offset = video_times[0] - audio_times[0]
     print(f"Input file similarity: {similarity_percent:.2f}%", file=file)
     print("Main changes needed to video to align it to audio input:", file=file)
     print(f"Start Offset: {-video_offset:.2f} seconds", file=file)
     print(f"Median Rate Change: {(median_slope-1.)*100:.2f}%", file=file)
-    for …
-
-      audio_desc_end, video_end = smooth_path[clip_end]
-      slope = (video_end - video_start) / (audio_desc_end - audio_desc_start)
+    for i in range(len(video_times) - 1):
+      slope = (video_times[i+1] - video_times[i]) / (audio_times[i+1] - audio_times[i])
       def str_from_time(seconds):
         minutes, seconds = divmod(seconds, 60)
         hours, minutes = divmod(minutes, 60)
-        return f"{hours:2.0f}:{minutes:02.0f}:{seconds:…
-      print(f"Rate change of {(slope-1.)*100:…
-            f"{str_from_time(…
-            f"{str_from_time(…
+        return f"{hours:2.0f}:{minutes:02.0f}:{seconds:06.3f}"
+      print(f"Rate change of {(slope-1.)*100:8.1f}% from {str_from_time(video_times[i])} to " + \
+            f"{str_from_time(video_times[i+1])} aligning with audio from " + \
+            f"{str_from_time(audio_times[i])} to {str_from_time(audio_times[i+1])}", file=file)
 
 # use the smooth alignment to replace runs of video sound with corresponding described audio
-def replace_aligned_segments(video_arr, audio_desc_arr, …
+def replace_aligned_segments(video_arr, audio_desc_arr, audio_desc_times, video_times, no_pitch_correction):
   # perform quadratic interpolation of the audio description's waveform
   # this allows it to be stretched to match the corresponding video segment
   def audio_desc_arr_interp(samples):
-    chunk_size = 10**…
+    chunk_size = 10**5
     interpolated_chunks = []
     for chunk in (samples[i:i+chunk_size] for i in range(0, len(samples), chunk_size)):
       interp_bounds = (max(int(chunk[0]-2), 0),
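
Aside: a quick check (not from the package) of the reconstructed format spec in str_from_time: {seconds:06.3f} zero-pads the seconds to two integer digits plus milliseconds:

    def str_from_time(seconds):
      minutes, seconds = divmod(seconds, 60)
      hours, minutes = divmod(minutes, 60)
      return f"{hours:2.0f}:{minutes:02.0f}:{seconds:06.3f}"

    print(str_from_time(3723.456))  # ' 1:02:03.456'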
@@ -540,215 +233,196 @@ def replace_aligned_segments(video_arr, audio_desc_arr, smooth_path, runs, no_pi
                        audio_desc_arr[:,slice(*interp_bounds)],
                        copy=False, bounds_error=False, fill_value=0,
                        kind='quadratic', assume_sorted=True)
-      interpolated_chunks.append(interp(chunk).astype(np.…
+      interpolated_chunks.append(interp(chunk).astype(np.float16))
     return np.hstack(interpolated_chunks)
 
-# …
- …
+  # yields matrices of pearson correlations indexed by the first window's start and
+  # the second window's offset from the first window
+  # the output matrix is truncated to the valid square with positive offsets
+  # if negative=True, it is truncated to the valid square with negative offsets
+  # subsequent yields are the adjacent square following the previously yielded one
+  def get_pearson_corrs_generator(input, negative, jumps, window_size=512):
+    # processing the entire vector at once is faster, but uses too much memory
+    # instead, parse the input vector in pieces with a recursive call
+    max_cached_chunks = 50
+    cut = max_cached_chunks * window_size
+    if input.shape[1] > (max_cached_chunks + 2) * 1.1 * window_size:
+      is_first_iter = True
+      while True:
+        output_start = 0 if is_first_iter else 1
+        is_last_iter = (input.shape[1] <= (max_cached_chunks + 2) * 1.1 * window_size)
+        output_end = None if is_last_iter else max_cached_chunks
+        input_end = None if is_last_iter else (cut + window_size)
+        yield from itertools.islice(get_pearson_corrs_generator(input[:,:input_end], negative, jumps),
+                                    output_start, output_end)
+        if is_last_iter:
+          return
+        input = input[:,cut-window_size:]
+        is_first_iter = False
+    if input.shape[1] < 3 * window_size - 1:
+      raise RuntimeError("Invalid state in Pearson generator.")
+    pearson_corrs = np.zeros((len(jumps), input.shape[1] - window_size + 1)) - np.inf
+    # calculate dot products of pairs of windows (i.e. autocorrelation)
+    # avoids redundant calculations by substituting differences in the cumulative sum of products
+    self_corr = np.sum(input.astype(np.float32)**2, axis=0)
+    corr_cumsum = np.cumsum(self_corr, dtype=np.float64)
+    corr_cumsum[window_size:] -= corr_cumsum[:-window_size]
+    window_rms = corr_cumsum[window_size-1:]
+    epsilon = 1e-4 * max(1, np.max(window_rms))
+    window_rms = np.sqrt(window_rms + epsilon)
+    for jump_index, jump in enumerate(jumps):
+      autocorrelation = np.sum(input[:,jump:].astype(np.float32) * input[:,:input.shape[1]-jump], axis=0)
+      autocorr_cumsum = np.cumsum(autocorrelation, dtype=np.float64)
+      autocorr_cumsum[window_size:] -= autocorr_cumsum[:-window_size]
+      if negative:
+        pearson_corrs[jump_index, jump:] = autocorr_cumsum[window_size-1:] + epsilon
+        pearson_corrs[jump_index, jump:] /= window_rms[:len(window_rms)-jump]
       else:
- …
-# identify which segments of the replaced audio actually have the describer speaking
-# uses a Naive Bayes classifier smoothed with L1-Minimization to identify the describer
-def detect_describer(video_arr, video_spec, video_spec_raw, video_timings,
-                     smooth_path, detect_sensitivity, boost_sensitivity):
-  # retokenize the audio description, which has been stretched to match the video
-  audio_desc_spec_raw, audio_timings = tokenize_audio(video_arr)
-  audio_desc_spec = normalize_spec(audio_desc_spec_raw)
-
-  # avoid boosting or training on mismatched segments, like those close to skips
-  # assumes matching segments all have the same, constant play rate
-  # could be modified to handle a multi-modal distribution of rates
-  aligned_audio_times, aligned_video_times = zip(*smooth_path)
-  interp = scipy.interpolate.interp1d(aligned_video_times, aligned_audio_times,
-                                      fill_value = 'extrapolate',
-                                      bounds_error = False, assume_sorted = True)
-  slopes = (interp(video_timings + 1e-5) - \
-            interp(video_timings - 1e-5)) / 2e-5
-  median_slope = np.median(slopes)
-  aligned_mask = np.abs(slopes - median_slope) < MAX_RATE_RATIO_DIFF_ALIGN
-  well_aligned_mask = np.abs(slopes - median_slope) < MAX_RATE_RATIO_DIFF_BOOST
-
-  # first pass identification by assuming poorly matched tokens are describer speech
-  # also assumes the describer doesn't speak very quietly
-  corrs = np.sum(audio_desc_spec * video_spec, axis=-1)
-  smooth_volume = nd.gaussian_filter(audio_desc_spec[:,0], sigma=1)
-  audio_desc_loud = smooth_volume > np.percentile(smooth_volume, 30)
-  speech_mask = (corrs < .2) * audio_desc_loud
-
-  # normalize spectrogram coefficients along time axis to prep for conversion to PDFs
-  audio_desc_spec = normalize_spec(audio_desc_spec_raw, axes=(0,))
-  audio_desc_spec = np.clip(audio_desc_spec / 6., -1, 1)
-  video_spec = normalize_spec(video_spec_raw, axes=(0,))
-  video_spec = np.clip(video_spec / 6., -1, 1)
-
-  # convert sampled features (e.g. spectrogram) to probability densities of each feature
-  # when given a spectrogram, finds the distributions of the MFC coefficients
-  def make_log_pdfs(arr):
-    resolution = 100
-    bins_per_spot = 4
-    num_bins = int(resolution * bins_per_spot)
-    uniform_prior_strength_per_spot = 1
-    uniform_prior_strength_per_bin = uniform_prior_strength_per_spot / float(bins_per_spot)
-    bin_range = (-1 - 1e-10, 1 + 1e-10)
-    get_hist = lambda x: np.histogram(x, bins=num_bins, range=bin_range)[0]
-    pdfs = np.apply_along_axis(get_hist, 1, arr.T)
-    pdfs = pdfs + uniform_prior_strength_per_bin
-    smooth = lambda x: nd.gaussian_filter(x, sigma=bins_per_spot)
-    pdfs = np.apply_along_axis(smooth, 1, pdfs)
-    pdfs = pdfs / np.sum(pdfs[0,:])
-    log_pdfs = np.log(pdfs)
-    bin_edges = np.histogram([], bins=num_bins, range=bin_range)[1]
-    return log_pdfs, bin_edges
-
-  diff_spec = audio_desc_spec - video_spec
-  diff_spec = np.clip(diff_spec, -1, 1)
+        pearson_corrs[jump_index, :pearson_corrs.shape[1]-jump] = autocorr_cumsum[window_size-1:] + epsilon
+        pearson_corrs[jump_index, :pearson_corrs.shape[1]-jump] /= window_rms[jump:]
+    # divide by RMS of constituent windows to get Pearson correlations
+    pearson_corrs = pearson_corrs / window_rms[None,:]
+    pearson_corrs = pearson_corrs.T
+    for chunk_index in range(0, input.shape[1] // window_size):
+      yield pearson_corrs[chunk_index*window_size:(chunk_index+1)*window_size]
 
- …
+  def stretch(input, output, window_size=512, max_drift=512*3):
+    drift_window_size = max_drift * 2 + 1
+    num_input_samples = input.shape[1]
+    num_output_samples = output.shape[1]
+    total_offset_samples = num_output_samples - num_input_samples
+    jumps = [506, 451, 284, 410, 480, 379, 308, 430, 265, 494]
+    # use all jumps when given unreachable or difficult to reach offsets (i.e. Frobenius coin problem)
+    # otherwise, skip most jumps to trade off a little performance for a lot of speed
+    if abs(total_offset_samples) < 10000:
+      if abs(total_offset_samples) > 1000:
+        jumps.extend([MIN_STRETCH_OFFSET + offset for offset in (2**np.arange(8))-1])
+      else:
+        jumps = range(MIN_STRETCH_OFFSET, window_size)
+    num_windows = (num_input_samples // window_size)
+    window_to_offset = lambda window_index: (total_offset_samples * \
+        min((num_windows - 1), max(0, window_index))) // (num_windows - 1)
+    # note the absolute value in the drift
+    # the following calculations also use the absolute value of the jumps
+    # their signs flip together, so this saves on casework in the code
+    # after the optimal route is determined, the sign of the jumps will be reintroduced
+    window_to_offset_diff = lambda window_index: abs(window_to_offset(window_index) - \
+                                                     window_to_offset(window_index - 1))
+    backpointers = np.zeros((num_windows, drift_window_size), dtype=np.int16)
+    best_jump_locations = np.zeros((num_windows, len(jumps)), dtype=np.int16)
+    cum_loss = np.zeros((3, drift_window_size)) + np.inf
+    cum_loss[1:, max_drift] = 0
+    last_offset_diff = 0
+    # if the output needs to be longer than the input, we need to jump backwards in the input
+    pearson_corrs_generator = get_pearson_corrs_generator(input, (total_offset_samples > 0), jumps)
+    for window_index in range(num_windows):
+      corrs = next(pearson_corrs_generator)
+      # for each jump distance, determine the best input index in the window to make that jump
+      best_jump_locations[window_index] = np.argmax(corrs, axis=0)
+      best_jump_losses = 1 - corrs[best_jump_locations[window_index], np.arange(corrs.shape[1])]
+      offset_diff = window_to_offset_diff(window_index)
+      offset_diff2 = offset_diff + last_offset_diff
+      offset_jump_losses = np.zeros((len(jumps)+1, drift_window_size)) + np.inf
+      # consider not jumping at all, copying the loss from the corresponding offset one window back
+      offset_jump_slice = slice(None, offset_jump_losses.shape[1] - offset_diff)
+      offset_jump_losses[0,offset_jump_slice] = cum_loss[(window_index-1)%3,offset_diff:]
+      for jump_index, jump in enumerate(jumps):
+        truncation_amount = offset_diff2 - jump
+        offset_jump_slice = slice(jump, drift_window_size - max(0, truncation_amount))
+        cum_loss_slice = slice(offset_diff2, drift_window_size + min(0, truncation_amount))
+        # consider jumping the given distance from two windows back
+        # a window is skipped when jumping to prevent overlapping crossfades
+        offset_jump_losses[jump_index+1, offset_jump_slice] = cum_loss[(window_index-2)%3, cum_loss_slice] + \
+                                                              best_jump_losses[jump_index]
+      best_jumps = np.argmin(offset_jump_losses, axis=0)
+      backpointers[window_index] = best_jumps
+      cum_loss[window_index%3] = offset_jump_losses[best_jumps, np.arange(offset_jump_losses.shape[1])]
+      last_offset_diff = offset_diff
+    drift = max_drift
+    best_jumps = []
+    skip_window = False
+    for window_index in range(num_windows - 1, -1, -1):
+      drift += window_to_offset_diff(window_index + 1)
+      if skip_window:
+        skip_window = False
+        continue
+      best_jump_index = backpointers[window_index, drift] - 1
+      if best_jump_index == -1:
+        continue
+      best_jump = jumps[best_jump_index]
+      jump_input_index = window_index * window_size + \
+                         best_jump_locations[window_index, best_jump_index].item()
+      drift -= best_jump
+      skip_window = True
+      best_jumps.append((jump_input_index, best_jump))
+    best_jumps = best_jumps[::-1]
+    best_jumps = np.array(best_jumps)
+    # reintroduce the sign of the jump distances
+    # if the output is longer, use backwards jumps in the input to duplicate samples
+    # if the output is shorter, use forwards jumps in the input to remove samples
+    if total_offset_samples > 0:
+      best_jumps[:,1] *= -1
+    jump_input_indices = best_jumps[:,0]
+    jump_distances = best_jumps[:,1]
+    # calculate starts and ends of segments that will be copied from input to output
+    input_starts = np.concatenate(([0], jump_input_indices + jump_distances))
+    input_ends = np.concatenate((jump_input_indices, [input.shape[1]]))
+    chunk_lengths = input_ends - input_starts
+    output_ends = np.cumsum(chunk_lengths)
+    output_starts = np.concatenate(([0], output_ends[:-1]))
+    bump = scipy.signal.windows.hann(2 * window_size + 1)
+    bump_head = bump[:window_size]
+    bump_tail = bump[window_size:-1]
+    output[:,:window_size] = input[:,:window_size]
+    for in_start, in_end, out_start, out_end in zip(input_starts, input_ends, output_starts, output_ends):
+      output[:,out_start:out_start+window_size] *= bump_tail
+      output[:,out_start:out_start+window_size] += input[:,in_start:in_start+window_size] * bump_head
+      output[:,out_start+window_size:out_end+window_size] = input[:,in_start+window_size:in_end+window_size]
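
Aside: a sketch (not from the package) verifying the crossfade property stretch depends on: the first and second halves of a symmetric Hann window of length 2*window_size+1 are exactly complementary, so each overlap-add above preserves unit gain:

    import numpy as np
    import scipy.signal

    window_size = 512
    bump = scipy.signal.windows.hann(2 * window_size + 1)
    bump_head = bump[:window_size]
    bump_tail = bump[window_size:-1]
    print(np.allclose(bump_head + bump_tail, 1.0))  # True: fade-in + fade-out = 1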
 
-
+  x = audio_desc_times
+  y = video_times
+  x_samples = (x * AUDIO_SAMPLE_RATE).astype(int)
+  y_samples = (y * AUDIO_SAMPLE_RATE).astype(int)
+  diff_x_samples = np.diff(x_samples)
+  diff_y_samples = np.diff(y_samples)
+  slopes = diff_x_samples / diff_y_samples
+  total_offset_samples = diff_y_samples - diff_x_samples
+  y_midpoint_samples = (y_samples[:-1] + y_samples[1:]) // 2
+  progress_update_interval = (video_arr.shape[1] // 100) + 1
+  last_progress_update = -1
+  for i in range(len(x) - 1):
+    if diff_y_samples[i] < (MIN_DURATION_TO_REPLACE_SECONDS * AUDIO_SAMPLE_RATE) or \
+       np.abs(1 - slopes[i]) > MAX_RATE_RATIO_DIFF_ALIGN:
+      continue
+    video_arr_slice = video_arr[:,slice(*y_samples[i:i+2])]
+    progress = int(y_midpoint_samples[i] // progress_update_interval)
+    if progress > last_progress_update:
+      last_progress_update = progress
+      print(f"  stretching audio:{progress:3d}% \r", end='')
+    # only apply pitch correction if the difference would be noticeable
+    if no_pitch_correction or np.abs(1 - slopes[i]) <= JUST_NOTICEABLE_DIFF_IN_FREQ_RATIO or \
+       abs(total_offset_samples[i]) < MIN_STRETCH_OFFSET:
+      # construct a stretched audio description waveform using the quadratic interpolator
+      sample_points = np.linspace(*x_samples[i:i+2], num=diff_y_samples[i], endpoint=False)
+      video_arr_slice[:] = audio_desc_arr_interp(sample_points)
+    else:
+      stretch(audio_desc_arr[:,slice(*x_samples[i:i+2])], video_arr_slice)
 
 # Convert piece-wise linear fit to ffmpeg expression for editing video frame timestamps
-def encode_fit_as_ffmpeg_expr(…
+def encode_fit_as_ffmpeg_expr(audio_desc_times, video_times, video_offset):
   # PTS is the input frame's presentation timestamp, which is when frames are displayed
   # TB is the timebase, which is how many seconds each unit of PTS corresponds to
   # the output value of the expression will be the frame's new PTS
   setts_cmd = ['TS']
-  start_skip = max(0, video_offset - start_key_frame)
-  if start_skip > 0:
-    # lossless cutting can only happen at key frames, so we cut the video before the audio starts
-    # but that means the video is behind the audio and needs to catch up by playing quicker
-    # catchup_spread is the ratio of time to spend catching up to the amount of catching up needed
-    catchup_spread = 1./CATCHUP_RATE
-    setts_cmd.append(f'+clip(TS-{start_key_frame},0,{start_skip*(1+catchup_spread)}/TB)*{-1./(1+catchup_spread)}')
-  elif video_offset < 0:
-    # if the audio starts before the video, stretch the first frame of the video back to meet it
-    setts_cmd.append(f'+clip(TS-{start_key_frame},0,{-video_offset/10000.}/TB)*10000')
   # each segment of the linear fit can be encoded as a single clip function
   setts_cmd.append('+(0')
- …
-    slope = audio_desc_length / video_length
-    setts_cmd.append(f'+clip(TS-{start_key_frame}-{video_start:.4f}/TB,0,{max(0,video_length):.4f}/TB)*{slope-1:.9f}')
+  x = audio_desc_times
+  y = video_times
+  diff_x = np.diff(x)
+  diff_y = np.diff(y)
+  slopes = diff_x / diff_y
+  for i in range(len(audio_desc_times) - 1):
+    setts_cmd.append(f'+clip(TS-{y[i]-video_offset:.4f}/TB,0,{max(0,diff_y[i]):.4f}/TB)*{slopes[i]-1:.9f}')
   setts_cmd.append(')')
   setts_cmd = ''.join(setts_cmd)
   return setts_cmd
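
Aside: an illustrative expression (assumed inputs, not captured output). For a single aligned segment running from video time 10 s to 70 s with an audio/video rate ratio of 1.001 and video_offset 0, the function above would emit

    TS+(0+clip(TS-10.0000/TB,0,60.0000/TB)*0.001000000)

i.e. frames inside the segment have their timestamps stretched by 0.1%, and the clip() saturates to leave frames outside the segment untouched.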
@@ -759,75 +433,65 @@ def get_ffmpeg():
 def get_ffprobe():
   return static_ffmpeg.run._get_or_fetch_platform_executables_else_raise_no_lock()[1]
 
+def get_key_frame_data(video_file, time=None, entry='pts_time'):
+  interval = f'%+{max(60,time+40)}' if time != None else '%'
+  key_frames = ffmpeg.probe(video_file, cmd=get_ffprobe(), select_streams='V', show_frames=None,
+                            skip_frame='nokey', read_intervals=interval,
+                            show_entries='frame='+entry)['frames']
+  return np.array([float(frame[entry]) for frame in key_frames if entry in frame])
+
+# finds the average timestamp of (i.e. midpoint between) the key frames on either side of input time
 def get_closest_key_frame_time(video_file, time):
- …
+  key_frame_times = get_key_frame_data(video_file, time)
+  key_frame_times = key_frame_times if len(key_frame_times) > 0 else np.array([0])
+  prev_key_frame_times = key_frame_times[key_frame_times <= time]
+  prev_key_frame = np.max(prev_key_frame_times) if len(prev_key_frame_times) > 0 else time
+  next_key_frame_times = key_frame_times[key_frame_times > time]
+  next_key_frame = np.min(next_key_frame_times) if len(next_key_frame_times) > 0 else time
+  return (prev_key_frame + next_key_frame) / 2.
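
Aside: assuming ffmpeg-python's probe maps keyword arguments to flags as usual, the get_key_frame_data call above is roughly this ffprobe invocation (a sketch, not captured output), shown here with time=60 so the read interval becomes %+100:

    ffprobe -select_streams V -show_frames -skip_frame nokey \
            -read_intervals '%+100' -show_entries frame=pts_time VIDEO_FILE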
|
|
769
452
|
|
|
770
453
|
# outputs a new media file with the replaced audio (which includes audio descriptions)
|
|
771
454
|
def write_replaced_media_to_disk(output_filename, media_arr, video_file=None, audio_desc_file=None,
|
|
772
|
-
setts_cmd=None,
|
|
773
|
-
if
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
if
|
|
777
|
-
|
|
455
|
+
setts_cmd=None, video_offset=None, after_start_key_frame=None):
|
|
456
|
+
# if a media array is given, stretch_audio is enabled and media_arr should be added to the video
|
|
457
|
+
if media_arr is not None:
|
|
458
|
+
media_input = ffmpeg.input('pipe:', format='s16le', acodec='pcm_s16le', ac=2, ar=AUDIO_SAMPLE_RATE)
|
|
459
|
+
# if no video file is given, the input "video" was an audio file and the output should be too
|
|
460
|
+
if video_file is None:
|
|
461
|
+
write_command = ffmpeg.output(media_input, output_filename, loglevel='error').overwrite_output()
|
|
778
462
|
else:
|
|
779
|
-
original_video = ffmpeg.input(video_file)
|
|
463
|
+
original_video = ffmpeg.input(video_file, dn=None)
|
|
780
464
|
# "-max_interleave_delta 0" is sometimes necessary to fix an .mkv bug that freezes audio/video:
|
|
781
465
|
# ffmpeg bug warning: [matroska @ 0000000002c814c0] Starting new cluster due to timestamp
|
|
782
466
|
# more info about the bug and fix: https://reddit.com/r/ffmpeg/comments/efddfs/
|
|
783
467
|
write_command = ffmpeg.output(media_input, original_video, output_filename,
|
|
784
468
|
acodec='copy', vcodec='copy', scodec='copy',
|
|
785
|
-
max_interleave_delta='0', loglevel='
|
|
786
|
-
**{"c:a:0": "aac", "disposition:a:
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
469
|
+
max_interleave_delta='0', loglevel='error',
|
|
470
|
+
**{"c:a:0": "aac", "disposition:a:1": "original",
|
|
471
|
+
"metadata:s:a:1": "title=original",
|
|
472
|
+
"disposition:a:0": "default+visual_impaired+descriptions",
|
|
473
|
+
"metadata:s:a:0": "title=AD"}).overwrite_output()
|
|
474
|
+
run_async_ffmpeg_command(write_command, media_arr, f"write output file: {output_filename}")
|
|
791
475
|
else:
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
if os.path.splitext(
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
# mkv files break if 'nut' is used, while other files break when 'matroska' is used
|
|
812
|
-
format = 'matroska' if os.path.splitext(output_filename)[1] == '.mkv' else 'nut'
|
|
813
|
-
write_command = ffmpeg.output(original_video, 'pipe:', format=format, vsync='passthrough',
|
|
814
|
-
c='copy', loglevel='fatal')
|
|
815
|
-
ffmpeg_caller = write_command.run_async(pipe_stdout=True, cmd=get_ffmpeg())
|
|
816
|
-
pipe_input = ffmpeg.input('pipe:', format=format, thread_queue_size='512')
|
|
817
|
-
write_command2 = ffmpeg.output(media_input, pipe_input, output_filename, c='copy',
|
|
818
|
-
max_interleave_delta='0', loglevel='fatal', vsync='passthrough',
|
|
819
|
-
**{'bsf:v': f'setts=ts=\'{setts_cmd}\'',
|
|
820
|
-
'bsf:s': f'setts=ts=\'{setts_cmd}\''}).overwrite_output()
|
|
821
|
-
ffmpeg_caller2 = write_command2.run_async(pipe_stdin=True, cmd=get_ffmpeg())
|
|
822
|
-
while True:
|
|
823
|
-
in_bytes = ffmpeg_caller.stdout.read(100000)
|
|
824
|
-
if not in_bytes:
|
|
825
|
-
break
|
|
826
|
-
ffmpeg_caller2.stdin.write(in_bytes)
|
|
827
|
-
ffmpeg_caller2.stdin.close()
|
|
828
|
-
ffmpeg_caller.wait()
|
|
829
|
-
ffmpeg_caller2.wait()
|
|
830
|
-
|
|
476
|
+
start_offset = video_offset - after_start_key_frame
|
|
477
|
+
media_input = ffmpeg.input(audio_desc_file, itsoffset=f'{max(0, start_offset):.6f}')
|
|
478
|
+
original_video = ffmpeg.input(video_file, an=None, ss=f'{after_start_key_frame:.6f}',
|
|
479
|
+
itsoffset=f'{max(0, -start_offset):.6f}', dn=None)
|
|
480
|
+
# wav files don't have codecs compatible with most video containers, so we convert to aac
|
|
481
|
+
audio_codec = 'copy' if os.path.splitext(audio_desc_file)[1] != '.wav' else 'aac'
|
|
482
|
+
# flac audio may only have experimental support in some video containers (e.g. mp4)
|
|
483
|
+
standards = 'normal' if os.path.splitext(audio_desc_file)[1] != '.flac' else 'experimental'
|
|
484
|
+
# add frag_keyframe flag to prevent some players from ignoring audio/video start offsets
|
|
485
|
+
# set both pts and dts simultaneously in video manually, as ts= does not do the same thing
|
|
486
|
+
write_command = ffmpeg.output(media_input, original_video, output_filename,
|
|
487
|
+
acodec=audio_codec, vcodec='copy', scodec='copy',
|
|
488
|
+
max_interleave_delta='0', loglevel='error',
|
|
489
|
+
strict=standards, movflags='frag_keyframe',
|
|
490
|
+
**{'bsf:v': f'setts=pts=\'{setts_cmd}\':dts=\'{setts_cmd}\'',
|
|
491
|
+
'bsf:s': f'setts=ts=\'{setts_cmd}\'',
|
|
492
|
+
"disposition:a:0": "default+visual_impaired+descriptions",
|
|
493
|
+
"metadata:s:a:0": "title=AD"}).overwrite_output()
|
|
494
|
+
run_ffmpeg_command(write_command, f"write output file: {output_filename}")
|
|
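For readers unfamiliar with ffmpeg-python, output() chains like the two above compile down to a single ffmpeg command line. A hedged sketch follows, mirroring the flags of the first block (hypothetical file inputs stand in for the script's piped audio and real filenames), printing the generated arguments instead of running them:

import ffmpeg

# Hypothetical stand-ins for media_input and original_video above.
ad_audio = ffmpeg.input('ad_track.m4a')
video = ffmpeg.input('movie.mkv', dn=None)  # a None value emits the bare -dn flag
cmd = ffmpeg.output(ad_audio, video, 'ad_movie.mkv',
                    acodec='copy', vcodec='copy', scodec='copy',
                    max_interleave_delta='0', loglevel='error',
                    **{'c:a:0': 'aac', 'disposition:a:1': 'original',
                       'metadata:s:a:1': 'title=original',
                       'disposition:a:0': 'default+visual_impaired+descriptions',
                       'metadata:s:a:0': 'title=AD'}).overwrite_output()
print(' '.join(cmd.compile()))  # the full ffmpeg invocation, '-y' included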
831
495
|
|
|
832
496
|
# check whether static_ffmpeg has already installed ffmpeg and ffprobe
|
|
833
497
|
def is_ffmpeg_installed():
|
|
@@ -835,15 +499,482 @@ def is_ffmpeg_installed():
|
|
|
835
499
|
indicator_file = os.path.join(ffmpeg_dir, "installed.crumb")
|
|
836
500
|
return os.path.exists(indicator_file)
|
|
837
501
|
|
|
502
|
+
def get_energy(arr):
|
|
503
|
+
# decimate by 105 (44100 Hz -> 420 Hz), smooth with a size-15 Hann window, then decimate by 2 for 210 samples per second (~65 half-windows/second)
|
|
504
|
+
decimation = 105
|
|
505
|
+
decimation2 = 2
|
|
506
|
+
arr_clip = arr[:,:(arr.shape[1] - (arr.shape[1] % decimation))].reshape(arr.shape[0], -1, decimation)
|
|
507
|
+
energy = np.einsum('ijk,ijk->j', arr_clip, arr_clip, dtype=np.float32) / (decimation * arr.shape[0])
|
|
508
|
+
hann_window = scipy.signal.windows.hann(15)[1:-1].astype(np.float32)
|
|
509
|
+
hann_window /= np.sum(hann_window)
|
|
510
|
+
energy_smooth = np.convolve(energy, hann_window, mode='same')
|
|
511
|
+
energy_smooth = np.log10(1 + energy_smooth) / 2.
|
|
512
|
+
return energy_smooth[::decimation2]
|
|
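A minimal usage sketch for get_energy (synthetic input, assuming the 44.1 kHz sample rate used throughout): one second of stereo noise yields 210 smoothed log-energy samples.

import numpy as np

rng = np.random.default_rng(0)
one_second_stereo = rng.standard_normal((2, 44100)).astype(np.float32)
energy = get_energy(one_second_stereo)
print(energy.shape)  # (210,): 44100 / 105 / 2 feature samples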
513
|
+
|
|
514
|
+
def get_zero_crossings(arr):
|
|
515
|
+
xings = np.diff(np.signbit(arr), prepend=False, axis=-1)
|
|
516
|
+
xings_clip = xings[:,:(xings.shape[1] - (xings.shape[1] % 210))].reshape(xings.shape[0], -1, 210)
|
|
517
|
+
zero_crossings = np.sum(np.abs(xings_clip), axis=(0,2)).astype(np.float32)
|
|
518
|
+
if xings.shape[0] == 1:
|
|
519
|
+
zero_crossings *= 2
|
|
520
|
+
hann_window = scipy.signal.windows.hann(15)[1:-1].astype(np.float32)
|
|
521
|
+
hann_window = hann_window / np.sum(hann_window)
|
|
522
|
+
zero_crossings_smooth = np.convolve(zero_crossings, hann_window, mode='same')
|
|
523
|
+
return zero_crossings_smooth
|
|
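The same sketch idea for get_zero_crossings, using a pure tone: a 100 Hz sine crosses zero 200 times per second, about 0.95 crossings per 1/210 s block, doubled here because the input is mono.

import numpy as np

t = np.arange(44100) / 44100.
mono_sine = np.sin(2 * np.pi * 100. * t)[None, :].astype(np.float32)
crossings = get_zero_crossings(mono_sine)
print(crossings.shape, float(crossings.mean()))  # (210,), roughly 1.9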
524
|
+
|
|
525
|
+
def downsample_blur(arr, downsample, blur):
|
|
526
|
+
hann_window = scipy.signal.windows.hann(downsample*blur+2)[1:-1].astype(np.float32)
|
|
527
|
+
hann_window = hann_window / np.sum(hann_window)
|
|
528
|
+
arr = arr[:len(arr)-(len(arr)%downsample)]
|
|
529
|
+
return sum((np.convolve(arr[i::downsample], hann_window[i::downsample],
|
|
530
|
+
mode='same') for i in range(downsample)))
|
|
531
|
+
|
|
532
|
+
def get_freq_bands(arr):
|
|
533
|
+
arr = np.mean(arr, axis=0) if arr.shape[0] > 1 else arr[0]
|
|
534
|
+
arr = arr[:len(arr)-(len(arr)%210)]
|
|
535
|
+
downsamples = [5, 7, 6]
|
|
536
|
+
decimation = 1
|
|
537
|
+
freq_bands = []
|
|
538
|
+
for downsample in downsamples:
|
|
539
|
+
if downsample == downsamples[-1]:
|
|
540
|
+
band_bottom = np.array(0).reshape(1)
|
|
541
|
+
else:
|
|
542
|
+
band_bottom = downsample_blur(arr, downsample, 3)
|
|
543
|
+
decimation *= downsample
|
|
544
|
+
arr = arr.reshape(-1, downsample)
|
|
545
|
+
band_energy = sum(((arr[:,i] - band_bottom) ** 2 for i in range(downsample)))
|
|
546
|
+
freq_band = downsample_blur(band_energy, (210 // decimation), 15) / 210
|
|
547
|
+
freq_band = np.log10(1 + freq_band) / 2.
|
|
548
|
+
freq_bands.append(freq_band)
|
|
549
|
+
arr = band_bottom
|
|
550
|
+
return freq_bands
|
|
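And a usage sketch for get_freq_bands: the cascaded decimations (5, then 7, then 6) split each 210-sample block into three frequency bands, each reported at the same 210 samples per second as the other features.

import numpy as np

rng = np.random.default_rng(0)
one_second_stereo = rng.standard_normal((2, 44100)).astype(np.float32)
bands = get_freq_bands(one_second_stereo)
print([band.shape for band in bands])  # [(210,), (210,), (210,)]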
551
|
+
|
|
552
|
+
def align(video_features, audio_desc_features, video_energy, audio_desc_energy):
|
|
553
|
+
samples_per_node = 210 // TIMESTEPS_PER_SECOND
|
|
554
|
+
hann_window_unnormed = scipy.signal.windows.hann(2*samples_per_node+1)[1:-1]
|
|
555
|
+
hann_window = hann_window_unnormed / np.sum(hann_window_unnormed)
|
|
556
|
+
get_mean = lambda arr: np.convolve(hann_window, arr, mode='same')
|
|
557
|
+
get_uniform_norm = lambda arr: np.convolve(np.ones(hann_window.shape), arr ** 2, mode='valid') ** .5
|
|
558
|
+
def get_uniform_norms(features):
|
|
559
|
+
return [np.clip(get_uniform_norm(feature), .001, None) for feature in features]
|
|
560
|
+
|
|
561
|
+
print(" memorizing video... \r", end='')
|
|
562
|
+
video_features_mean_sub = [feature - get_mean(feature) for feature in video_features]
|
|
563
|
+
audio_desc_features_mean_sub = [feature - get_mean(feature) for feature in audio_desc_features]
|
|
564
|
+
video_uniform_norms = get_uniform_norms(video_features_mean_sub)
|
|
565
|
+
audio_desc_uniform_norms = get_uniform_norms(audio_desc_features_mean_sub)
|
|
566
|
+
|
|
567
|
+
num_bins = 7
|
|
568
|
+
bin_spacing = 6
|
|
569
|
+
bins_width = (num_bins - 1) * bin_spacing + 1
|
|
570
|
+
bins_start = samples_per_node - 1 - (bins_width // 2)
|
|
571
|
+
bins_end = bins_start + bins_width
|
|
572
|
+
video_dicts = [defaultdict(set) for feature in video_features_mean_sub]
|
|
573
|
+
edges = np.array(np.meshgrid(*([np.arange(2)]*num_bins), indexing='ij')).reshape(num_bins,-1).T
|
|
574
|
+
bin_offsets = []
|
|
575
|
+
for edge in edges:
|
|
576
|
+
bin_offset = np.array(np.meshgrid(*[np.arange(x+1) for x in edge], indexing='ij'))
|
|
577
|
+
bin_offsets.append(np.dot(bin_offset.reshape(num_bins,-1)[::-1].T, 7**np.arange(num_bins)))
|
|
578
|
+
|
|
579
|
+
for video_dict, feature, norm in zip(video_dicts, video_features_mean_sub, video_uniform_norms):
|
|
580
|
+
bins = np.hstack([feature[bins_start+i:-bins_end+i+1, None] for i in bin_spacing * np.arange(num_bins)])
|
|
581
|
+
bins /= norm[:,None]
|
|
582
|
+
bins = 8 * bins + 3.3
|
|
583
|
+
np.clip(bins, 0, 6, out=bins)
|
|
584
|
+
bin_offset_indices = np.dot(((bins % 1) > .6), 2**np.arange(num_bins))
|
|
585
|
+
bins = np.dot(np.floor(bins).astype(int), 7**np.arange(num_bins)).tolist()
|
|
586
|
+
not_quiet = (video_energy[:-len(hann_window)] > .5)
|
|
587
|
+
for i in np.arange(len(video_energy) - len(hann_window))[not_quiet].tolist()[::4]:
|
|
588
|
+
bin = bins[i]
|
|
589
|
+
for bin_offset in bin_offsets[bin_offset_indices[i]].tolist():
|
|
590
|
+
video_dict[bin + bin_offset].add(i)
|
|
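The loop above builds a quantize-and-hash index: each feature window is packed into one base-7 integer key, and windows that share a key become O(1)-retrievable match candidates. A toy version of the idea (hypothetical 1-D data, without the dithered bin offsets):

import numpy as np
from collections import defaultdict

features = np.array([0.12, 0.80, 0.41, 0.79, 0.13])
keys = np.clip(np.floor(features * 7), 0, 6).astype(int)  # 7 levels per bin
index = defaultdict(set)
for i, key in enumerate(keys.tolist()):
    index[key].add(i)
query_key = int(np.clip(np.floor(0.78 * 7), 0, 6))
print(index[query_key])  # {1, 3}: windows quantized into the same bin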
591
|
+
|
|
592
|
+
print(" matching audio... \r", end='')
|
|
593
|
+
audio_desc_bins = []
|
|
594
|
+
audio_desc_bin_offset_indices = []
|
|
595
|
+
for feature, norm in zip(audio_desc_features_mean_sub, audio_desc_uniform_norms):
|
|
596
|
+
bins = np.hstack([feature[bins_start+i:-bins_end+i+1, None] for i in bin_spacing * np.arange(num_bins)])
|
|
597
|
+
bins /= norm[:,None]
|
|
598
|
+
bins = 8 * bins + 3.5
|
|
599
|
+
bins = np.floor(bins).astype(int)
|
|
600
|
+
np.clip(bins, 0, 6, out=bins)
|
|
601
|
+
audio_desc_bins.append(np.dot(bins, 7**np.arange(num_bins)).tolist())
|
|
602
|
+
|
|
603
|
+
def pairwise_intersection(set1, set2, set3):
|
|
604
|
+
return (set1 & set2).union((set1 & set3), (set2 & set3))
|
|
605
|
+
def triwise_intersection(set1, set2, set3, set4, set5):
|
|
606
|
+
set123 = pairwise_intersection(set1, set2, set3)
|
|
607
|
+
return (set123 & set4) | (set123 & set5)
|
|
608
|
+
best_so_far = SortedList(key=lambda x:x[0])
|
|
609
|
+
best_so_far.add((-1,-1,0))
|
|
610
|
+
backpointers = {}
|
|
611
|
+
not_quiet = (audio_desc_energy[:-len(hann_window)] > .5)
|
|
612
|
+
for i in np.arange(len(audio_desc_energy) - len(hann_window))[not_quiet].tolist():
|
|
613
|
+
match_sets = [video_dict[bins[i]] for bins, video_dict in zip(audio_desc_bins, video_dicts)]
|
|
614
|
+
common = triwise_intersection(*match_sets)
|
|
615
|
+
match_points = []
|
|
616
|
+
for video_index in common:
|
|
617
|
+
prob = 1
|
|
618
|
+
for j in range(3):
|
|
619
|
+
corr = np.dot(audio_desc_features_mean_sub[j][i:i+2*samples_per_node-1],
|
|
620
|
+
video_features_mean_sub[j][video_index:video_index+2*samples_per_node-1])
|
|
621
|
+
corr /= audio_desc_uniform_norms[j][i] * video_uniform_norms[j][video_index]
|
|
622
|
+
prob *= max(1e-8, (1 - corr)) # Naive Bayes probability
|
|
623
|
+
prob = prob ** 2.9 # empirically determined, ranges from 2.5-3.4
|
|
624
|
+
if prob > 1e-8:
|
|
625
|
+
continue
|
|
626
|
+
qual = min(50, (prob / 1e-12) ** (-1. / 3)) # remove Naive Bayes assumption
|
|
627
|
+
match_points.append((video_index, qual))
|
|
628
|
+
audio_desc_index = i
|
|
629
|
+
for video_index, qual in sorted(match_points):
|
|
630
|
+
cur_index = best_so_far.bisect_right((video_index,))
|
|
631
|
+
prev_video_index, prev_audio_desc_index, prev_cum_qual = best_so_far[cur_index-1]
|
|
632
|
+
cum_qual = prev_cum_qual + qual
|
|
633
|
+
while (cur_index < len(best_so_far)) and (best_so_far[cur_index][2] <= cum_qual):
|
|
634
|
+
del best_so_far[cur_index]
|
|
635
|
+
best_so_far.add((video_index, audio_desc_index, cum_qual))
|
|
636
|
+
backpointers[(video_index, audio_desc_index)] = (prev_video_index, prev_audio_desc_index)
|
|
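The best_so_far structure above implements a maximum-weight increasing-chain dynamic program in O(n log n): the SortedList holds a frontier that increases in both video index and cumulative quality, so a single bisect finds the best chain each new match can extend. A hedged standalone miniature (toy match list, backpointers omitted):

from sortedcontainers import SortedList

matches = [(3, 1.0), (1, 2.0), (4, 1.5), (2, 0.5), (5, 1.0)]  # (video_index, quality), in audio order
frontier = SortedList(key=lambda node: node[0])
frontier.add((-1, 0.0))  # sentinel: empty chain of weight 0
for video_index, qual in matches:
    i = frontier.bisect_right((video_index,))
    cum_qual = frontier[i - 1][1] + qual
    # drop frontier entries that are now dominated (no earlier, no better)
    while i < len(frontier) and frontier[i][1] <= cum_qual:
        del frontier[i]
    frontier.add((video_index, cum_qual))
print(frontier[-1])  # (5, 4.5): endpoint of the best increasing chain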
637
|
+
del video_dicts
|
|
638
|
+
path = [best_so_far[-1][:2]]
|
|
639
|
+
while path[-1][:2] in backpointers:
|
|
640
|
+
# failsafe to prevent an infinite loop that should never happen anyway
|
|
641
|
+
if len(path) > 10**8:
|
|
642
|
+
raise RuntimeError("Infinite Loop Encountered!")
|
|
643
|
+
path.append(backpointers[path[-1][:2]])
|
|
644
|
+
path.pop()
|
|
645
|
+
path.reverse()
|
|
646
|
+
if len(path) < max(min(len(video_energy), len(audio_desc_energy)) / 500., 5 * 210):
|
|
647
|
+
raise RuntimeError("Alignment failed, are the input files mismatched?")
|
|
648
|
+
y, x = np.array(path).T
|
|
649
|
+
|
|
650
|
+
half_hann_window = hann_window[:samples_per_node-1] / np.sum(hann_window[:samples_per_node-1])
|
|
651
|
+
half_samples_per_node = samples_per_node // 2
|
|
652
|
+
fit_delay = samples_per_node + half_samples_per_node - 2
|
|
653
|
+
diff_by = lambda arr, offset=half_samples_per_node: arr[offset:] - arr[:-offset]
|
|
654
|
+
def get_continuity_err(x, y, deriv=False):
|
|
655
|
+
x_smooth_future = np.convolve(x, half_hann_window, mode='valid')
|
|
656
|
+
y_smooth_future = np.convolve(y, half_hann_window, mode='valid')
|
|
657
|
+
slopes_future = diff_by(y_smooth_future) / diff_by(x_smooth_future)
|
|
658
|
+
offsets_future = y_smooth_future[:-half_samples_per_node] - \
|
|
659
|
+
x_smooth_future[:-half_samples_per_node] * slopes_future
|
|
660
|
+
x_smooth_past = np.convolve(x, half_hann_window[::-1], mode='valid')
|
|
661
|
+
y_smooth_past = np.convolve(y, half_hann_window[::-1], mode='valid')
|
|
662
|
+
slopes_past = diff_by(y_smooth_past) / diff_by(x_smooth_past)
|
|
663
|
+
offsets_past = y_smooth_past[half_samples_per_node:] - \
|
|
664
|
+
x_smooth_past[half_samples_per_node:] * slopes_past
|
|
665
|
+
continuity_err = np.full(len(x) - (1 if deriv else 0), np.inf)
|
|
666
|
+
fit_delay_offset = fit_delay - (1 if deriv else 0)
|
|
667
|
+
continuity_err[:-fit_delay_offset] = np.abs(slopes_future * x[:-fit_delay] + \
|
|
668
|
+
offsets_future - y[:-fit_delay])
|
|
669
|
+
continuity_err[fit_delay_offset:] = np.minimum(continuity_err[fit_delay_offset:],
|
|
670
|
+
np.abs(slopes_past * x[fit_delay:] + \
|
|
671
|
+
offsets_past - y[fit_delay:]))
|
|
672
|
+
return continuity_err
|
|
673
|
+
|
|
674
|
+
print(" refining match: pass 1 of 2...\r", end='')
|
|
675
|
+
continuity_err = get_continuity_err(x, y)
|
|
676
|
+
errs = (continuity_err < 3)
|
|
677
|
+
x = x[errs]
|
|
678
|
+
y = y[errs]
|
|
679
|
+
|
|
680
|
+
audio_desc_features_scaled = []
|
|
681
|
+
video_features_scaled = []
|
|
682
|
+
for video_feature, audio_desc_feature in zip(video_features, audio_desc_features):
|
|
683
|
+
audio_desc_feature_std = np.std(audio_desc_feature)
|
|
684
|
+
scale_factor = np.linalg.lstsq(video_feature[y][:,None], audio_desc_feature[x], rcond=None)[0]
|
|
685
|
+
audio_desc_features_scaled.append(audio_desc_feature / audio_desc_feature_std)
|
|
686
|
+
video_features_scaled.append(video_feature * scale_factor / audio_desc_feature_std)
|
|
687
|
+
audio_desc_features_scaled = np.array(list(zip(*(audio_desc_features_scaled[:3]))))
|
|
688
|
+
video_features_scaled = np.array(list(zip(*(video_features_scaled[:3]))))
|
|
689
|
+
|
|
690
|
+
smooth_x = get_mean(x)
|
|
691
|
+
smooth_y = get_mean(y)
|
|
692
|
+
slopes = np.diff(smooth_y) / np.diff(smooth_x)
|
|
693
|
+
offsets = smooth_y[:-1] - smooth_x[:-1] * slopes
|
|
694
|
+
err_y = slopes * x[:-1] + offsets - y[:-1]
|
|
695
|
+
compressed_x, compressed_y = [], []
|
|
696
|
+
def extend_all(index, compress=False, num=70):
|
|
697
|
+
compressed_x.extend([np.mean(x[index:index+num])] if compress else x[index:index+num])
|
|
698
|
+
compressed_y.extend([np.mean(y[index:index+num])] if compress else y[index:index+num])
|
|
699
|
+
extend_all(0, num=10)
|
|
700
|
+
for i in range(10, len(x) - 80, 70):
|
|
701
|
+
extend_all(i, compress=np.all(np.abs(err_y[i:i+70]) < 3))
|
|
702
|
+
extend_all(i+70)
|
|
703
|
+
|
|
704
|
+
x = compressed_x
|
|
705
|
+
y = compressed_y
|
|
706
|
+
|
|
707
|
+
match_dict = defaultdict(list)
|
|
708
|
+
x_unique = [-1]
|
|
709
|
+
for audio_desc_index, video_index in zip(x, y):
|
|
710
|
+
match_dict[audio_desc_index].append(video_index)
|
|
711
|
+
if audio_desc_index != x_unique[-1]:
|
|
712
|
+
x_unique.append(audio_desc_index)
|
|
713
|
+
x = np.array(x_unique[1:])
|
|
714
|
+
y = np.array([np.mean(match_dict[audio_desc_index]) for audio_desc_index in x])
|
|
715
|
+
|
|
716
|
+
# L1-Minimization to solve the alignment problem using a linear program
|
|
717
|
+
# the absolute value functions needed for "absolute error" can be represented
|
|
718
|
+
# in a linear program by splitting variables into positive and negative pieces
|
|
719
|
+
# and constraining each to be positive (done by default in scipy's linprog)
|
|
720
|
+
num_fit_points = len(x)
|
|
721
|
+
x_diffs = np.diff(x)
|
|
722
|
+
y_diffs = np.diff(y)
|
|
723
|
+
jump_cost_base = 10.
|
|
724
|
+
jump_costs = np.full(num_fit_points - 1, jump_cost_base)
|
|
725
|
+
continuity_err = get_continuity_err(x, y, deriv=True)
|
|
726
|
+
jump_costs /= np.maximum(1, np.sqrt(continuity_err / 3.))
|
|
727
|
+
rate_change_jump_costs = np.full(num_fit_points - 1, .001)
|
|
728
|
+
rate_change_costs = np.full(num_fit_points - 2, jump_cost_base * 4000)
|
|
729
|
+
shot_noise_costs = np.full(num_fit_points, .01)
|
|
730
|
+
shot_noise_jump_costs = np.full(num_fit_points - 1, 3)
|
|
731
|
+
shot_noise_bound = 2.
|
|
732
|
+
c = np.hstack([np.ones(2 * num_fit_points),
|
|
733
|
+
jump_costs,
|
|
734
|
+
jump_costs,
|
|
735
|
+
shot_noise_costs,
|
|
736
|
+
shot_noise_costs,
|
|
737
|
+
shot_noise_jump_costs,
|
|
738
|
+
shot_noise_jump_costs,
|
|
739
|
+
rate_change_jump_costs,
|
|
740
|
+
rate_change_jump_costs,
|
|
741
|
+
rate_change_costs,
|
|
742
|
+
rate_change_costs,
|
|
743
|
+
[0,]])
|
|
744
|
+
fit_err_coeffs = scipy.sparse.diags([-1. / x_diffs,
|
|
745
|
+
1. / x_diffs],
|
|
746
|
+
offsets=[0,1],
|
|
747
|
+
shape=(num_fit_points - 1, num_fit_points)).tocsc()
|
|
748
|
+
jump_coeffs = scipy.sparse.diags([ 1. / x_diffs],
|
|
749
|
+
offsets=[0],
|
|
750
|
+
shape=(num_fit_points - 1, num_fit_points - 1)).tocsc()
|
|
751
|
+
A_eq1 = scipy.sparse.hstack([ fit_err_coeffs,
|
|
752
|
+
-fit_err_coeffs,
|
|
753
|
+
jump_coeffs,
|
|
754
|
+
-jump_coeffs,
|
|
755
|
+
scipy.sparse.csc_matrix((num_fit_points - 1, 2 * num_fit_points)),
|
|
756
|
+
jump_coeffs,
|
|
757
|
+
-jump_coeffs,
|
|
758
|
+
jump_coeffs,
|
|
759
|
+
-jump_coeffs,
|
|
760
|
+
scipy.sparse.csc_matrix((num_fit_points - 1, 2 * num_fit_points - 4)),
|
|
761
|
+
np.ones((num_fit_points - 1, 1))])
|
|
762
|
+
A_eq2 = scipy.sparse.hstack([ scipy.sparse.csc_matrix((num_fit_points - 1, 4 * num_fit_points - 2)),
|
|
763
|
+
scipy.sparse.diags([-1., 1.], offsets=[0, 1],
|
|
764
|
+
shape=(num_fit_points - 1, num_fit_points)).tocsc(),
|
|
765
|
+
scipy.sparse.diags([1., -1.], offsets=[0, 1],
|
|
766
|
+
shape=(num_fit_points - 1, num_fit_points)).tocsc(),
|
|
767
|
+
-scipy.sparse.eye(num_fit_points - 1),
|
|
768
|
+
scipy.sparse.eye(num_fit_points - 1),
|
|
769
|
+
scipy.sparse.csc_matrix((num_fit_points - 1, 4 * num_fit_points - 6)),
|
|
770
|
+
scipy.sparse.csc_matrix((num_fit_points - 1, 1))])
|
|
771
|
+
slope_change_coeffs = scipy.sparse.diags([-1. / x_diffs[:-1],
|
|
772
|
+
1. / x_diffs[1:]],
|
|
773
|
+
offsets=[0,1],
|
|
774
|
+
shape=(num_fit_points - 2, num_fit_points - 1)).tocsc()
|
|
775
|
+
A_eq3 = scipy.sparse.hstack([scipy.sparse.csc_matrix((num_fit_points - 2, 8 * num_fit_points - 4)),
|
|
776
|
+
slope_change_coeffs,
|
|
777
|
+
-slope_change_coeffs,
|
|
778
|
+
-scipy.sparse.eye(num_fit_points - 2),
|
|
779
|
+
scipy.sparse.eye(num_fit_points - 2),
|
|
780
|
+
scipy.sparse.csc_matrix((num_fit_points - 2, 1))])
|
|
781
|
+
A_eq = scipy.sparse.vstack([A_eq1, A_eq2, A_eq3])
|
|
782
|
+
b_eq = y_diffs / x_diffs
|
|
783
|
+
b_eq = np.hstack((b_eq, np.zeros(2 * num_fit_points - 3)))
|
|
784
|
+
bounds = [[0, None]] * (4 * num_fit_points - 2) + \
|
|
785
|
+
[[0, shot_noise_bound]] * (2 * num_fit_points) + \
|
|
786
|
+
[[0, None]] * (6 * num_fit_points - 8) + \
|
|
787
|
+
[[None, None]]
|
|
788
|
+
fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq, bounds=bounds, method='highs-ds')
|
|
789
|
+
# if dual simplex solver encounters numerical problems, retry with interior point solver
|
|
790
|
+
if not fit.success and fit.status == 4:
|
|
791
|
+
fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq, bounds=bounds, method='highs-ipm')
|
|
792
|
+
if not fit.success:
|
|
793
|
+
print(fit)
|
|
794
|
+
raise RuntimeError("Smooth Alignment L1-Min Optimization Failed!")
|
|
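The positive/negative variable split described in the comments above is easiest to see on a toy problem. This standalone sketch (not the script's actual cost vector or constraint matrices) fits a line under absolute-error loss by writing each residual as r_plus - r_minus with both parts nonnegative:

import numpy as np
import scipy.optimize

x = np.array([0., 1., 2., 3., 4.])
y = np.array([0.1, 0.9, 2.2, 2.9, 4.1])
n = len(x)
# variables: [slope, intercept, r_plus (n), r_minus (n)]
c = np.hstack([0., 0., np.ones(n), np.ones(n)])      # minimize sum(r_plus + r_minus)
A_eq = np.hstack([x[:, None], np.ones((n, 1)), np.eye(n), -np.eye(n)])
b_eq = y                                             # slope*x + intercept + r_plus - r_minus = y
bounds = [(None, None)] * 2 + [(0, None)] * (2 * n)  # residual parts constrained positive
fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq, bounds=bounds, method='highs')
print(fit.x[:2])  # L1-optimal slope and intercept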
795
|
+
|
|
796
|
+
# combine positive and negative components of variables
|
|
797
|
+
fit_err = fit.x[ : num_fit_points ] - \
|
|
798
|
+
fit.x[ num_fit_points :2*num_fit_points ]
|
|
799
|
+
slope_jumps = fit.x[8*num_fit_points-4: 9*num_fit_points-5] - \
|
|
800
|
+
fit.x[9*num_fit_points-5:10*num_fit_points-6]
|
|
801
|
+
median_slope = fit.x[-1]
|
|
802
|
+
slopes = median_slope + (slope_jumps / x_diffs)
|
|
803
|
+
|
|
804
|
+
# subtract fit errors from nodes to retrieve the smooth fit's coordinates
|
|
805
|
+
smooth_path = [(x, y) for x,y in zip(x, y - fit_err)]
|
|
806
|
+
|
|
807
|
+
print(" refining match: pass 2 of 2...\r", end='')
|
|
808
|
+
slopes_plus_ends = np.hstack((slopes[:1], slopes, slopes[-1:]))
|
|
809
|
+
extensions = []
|
|
810
|
+
extend_radius = 210 * 30 # +/- 30 seconds
|
|
811
|
+
video_interp = scipy.interpolate.make_interp_spline(np.arange(len(video_features_scaled)),
|
|
812
|
+
video_features_scaled, k=1)
|
|
813
|
+
colinear_dict = defaultdict(list)
|
|
814
|
+
for i, (x, y) in enumerate(smooth_path):
|
|
815
|
+
for slope in slopes_plus_ends[i:i+2]:
|
|
816
|
+
if (slope < .1) or (slope > 10):
|
|
817
|
+
continue
|
|
818
|
+
offset = y - slope * x
|
|
819
|
+
colinear_dict[(round(slope, 6), int(round(offset, 0)))].append((x, y))
|
|
820
|
+
line_clusters = []
|
|
821
|
+
added_keys = set()
|
|
822
|
+
for (slope, offset), indices in sorted(colinear_dict.items(), key=lambda x: -len(x[1])):
|
|
823
|
+
if (slope, offset) in added_keys:
|
|
824
|
+
continue
|
|
825
|
+
line_clusters.append(indices)
|
|
826
|
+
added_keys.add((slope, offset))
|
|
827
|
+
del colinear_dict[(slope, offset)]
|
|
828
|
+
for (slope2, offset2), indices2 in list(colinear_dict.items()):
|
|
829
|
+
if (abs(indices2[ 0][1] - (indices2[ 0][0] * slope + offset)) < 3) and \
|
|
830
|
+
(abs(indices2[-1][1] - (indices2[-1][0] * slope + offset)) < 3):
|
|
831
|
+
line_clusters[-1].extend(colinear_dict[(slope2, offset2)])
|
|
832
|
+
added_keys.add((slope2, offset2))
|
|
833
|
+
del colinear_dict[(slope2, offset2)]
|
|
834
|
+
line_clusters = [sorted(cluster) for cluster in line_clusters]
|
|
835
|
+
line_clusters = [x for x in line_clusters if (abs(x[0][0] - x[-1][0]) > 10) and len(x) > 5]
|
|
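colinear_dict above is a Hough-transform-style vote: every point hashes its candidate (slope, offset) pair into a bucket, and well-populated buckets become line clusters. In miniature (hypothetical points, one fixed candidate slope):

from collections import defaultdict

points = [(0, 1.0), (1, 2.0), (2, 3.0), (3, 4.01), (1, 5.0)]
slope = 1.0  # the real code tries each node's local slope estimates
buckets = defaultdict(list)
for x, y in points:
    offset = y - slope * x
    buckets[(round(slope, 6), int(round(offset, 0)))].append((x, y))
print(max(buckets.values(), key=len))  # the dominant colinear cluster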
836
|
+
|
|
837
|
+
for i, cluster in enumerate(line_clusters):
|
|
838
|
+
x, y = np.array(cluster).T
|
|
839
|
+
linear_fit = np.linalg.lstsq(np.hstack((np.ones((len(x), 1)), x[:, None])), y, rcond=None)[0]
|
|
840
|
+
line_clusters[i] = (x, linear_fit[0], linear_fit[1])
|
|
841
|
+
|
|
842
|
+
def get_x_limits(x, offset, slope, extend_horiz=extend_radius, buffer_vert=4):
|
|
843
|
+
limits = (max(int(x[0]) - extend_horiz, 0),
|
|
844
|
+
min(int(x[-1]) + extend_horiz, len(audio_desc_features_scaled) - 1))
|
|
845
|
+
limits = (max(limits[0], int(np.ceil((buffer_vert - offset) / slope))),
|
|
846
|
+
min(limits[1], int(np.floor((len(video_features_scaled) - buffer_vert - offset) / slope))))
|
|
847
|
+
return limits
|
|
848
|
+
def get_audio_video_matches(limits, slope, offset):
|
|
849
|
+
x = np.arange(*limits)
|
|
850
|
+
y = slope * x + offset
|
|
851
|
+
audio_match = audio_desc_features_scaled[slice(*limits)]
|
|
852
|
+
video_match = video_interp(y)
|
|
853
|
+
return x, y, audio_match, video_match
|
|
854
|
+
|
|
855
|
+
audio_desc_max_energy = np.max(audio_desc_features_scaled[:,0])
|
|
856
|
+
video_max_energy = np.max(video_features_scaled[:,0])
|
|
857
|
+
points = [[] for i in range(len(audio_desc_features_scaled))]
|
|
858
|
+
seen_points = set()
|
|
859
|
+
for cluster_index, (x, offset, slope) in enumerate(line_clusters):
|
|
860
|
+
limits = get_x_limits(x, offset, slope, extend_horiz=0)
|
|
861
|
+
if limits[1] < limits[0] + 5:
|
|
862
|
+
continue
|
|
863
|
+
if limits[1] > limits[0] + 100:
|
|
864
|
+
x, y, audio_match, video_match = get_audio_video_matches(limits, slope, offset)
|
|
865
|
+
video_match_err = audio_match[1:-1] - video_match[1:-1]
|
|
866
|
+
valid_matches = np.mean(video_match_err, axis=-1) < 0.1
|
|
867
|
+
if np.count_nonzero(valid_matches) > 50:
|
|
868
|
+
video_match_diff = (video_match[2:] - video_match[:-2]) / 2.
|
|
869
|
+
video_match_err = video_match_err[valid_matches]
|
|
870
|
+
video_match_diff = video_match_diff[valid_matches]
|
|
871
|
+
x_valid = x[1:-1][valid_matches][:,None]
|
|
872
|
+
A = video_match_diff.reshape(-1,1)
|
|
873
|
+
linear_fit, residual, _, _ = np.linalg.lstsq(A, video_match_err.flat, rcond=None)
|
|
874
|
+
explained_err_ratio = 1 - (residual / np.sum(video_match_err ** 2))
|
|
875
|
+
stds_above_noise_mean = np.sqrt(explained_err_ratio * np.prod(video_match_err.shape)) - 1.
|
|
876
|
+
if stds_above_noise_mean > 8 and abs(linear_fit[0]) < 2:
|
|
877
|
+
offset += linear_fit[0]
|
|
878
|
+
limits = get_x_limits(x, offset, slope)
|
|
879
|
+
x, y, audio_match, video_match = get_audio_video_matches(limits, slope, offset)
|
|
880
|
+
quals = np.sum(-.5 - np.log10(1e-4 + np.abs(audio_match - video_match)), axis=1)
|
|
881
|
+
quals *= np.clip(video_match[:,0] + 2.5 - video_max_energy, 0, 1)
|
|
882
|
+
quals += np.clip(audio_match[:,0] + 2.5 - audio_desc_max_energy, 0, 1) * .1
|
|
883
|
+
energy_diffs = audio_match[:,0] - video_match[:,0]
|
|
884
|
+
for i, j, qual in zip(x.tolist(), y.tolist(), quals.tolist()):
|
|
885
|
+
point = (i, int(j))
|
|
886
|
+
if point not in seen_points:
|
|
887
|
+
seen_points.add(point)
|
|
888
|
+
points[i].append((j, cluster_index, qual))
|
|
889
|
+
del seen_points
|
|
890
|
+
points = [sorted(point) for point in points]
|
|
891
|
+
|
|
892
|
+
best_so_far = SortedList(key=lambda x:x[0])
|
|
893
|
+
best_so_far.add((0, 0, -1, 0, 0)) # video_index, audio_desc_index, cluster_index, qual, cum_qual
|
|
894
|
+
clusters_best_so_far = [(0, 0, 0, -1000) for cluster in line_clusters]
|
|
895
|
+
backpointers = {}
|
|
896
|
+
prev_cache = np.full((len(video_features_scaled), 5), -np.inf)
|
|
897
|
+
prev_cache[0] = (0, 0, -1, 0, 0) # video_index, audio_desc_index, cluster_index, qual, cum_qual
|
|
898
|
+
reversed_min_points = [min(x)[0] if len(x) > 0 else np.inf for x in points[::-1]]
|
|
899
|
+
forward_min = list(itertools.accumulate(reversed_min_points, min))[::-1]
|
|
900
|
+
for i in range(len(audio_desc_features_scaled)):
|
|
901
|
+
for j, cluster_index, qual in points[i]:
|
|
902
|
+
cur_index = best_so_far.bisect_right((j,))
|
|
903
|
+
prev_j, prev_i, prev_cluster_index, prev_qual, best_prev_cum_qual = best_so_far[cur_index-1]
|
|
904
|
+
cluster_last = clusters_best_so_far[cluster_index]
|
|
905
|
+
if cluster_last[3] >= best_prev_cum_qual:
|
|
906
|
+
prev_j, prev_i, prev_qual, best_prev_cum_qual = cluster_last
|
|
907
|
+
prev_cluster_index = cluster_index
|
|
908
|
+
for prev_j_temp in range(max(0, int(j) - 2), int(j) + 1):
|
|
909
|
+
prev_node = prev_cache[prev_j_temp].tolist()
|
|
910
|
+
if cluster_index != prev_node[2]:
|
|
911
|
+
prev_node[4] -= 100 + 100 * ((j - prev_node[0]) - (i - prev_node[1])) ** 2
|
|
912
|
+
if prev_node[1] >= (i - 2) and \
|
|
913
|
+
prev_node[0] <= j and \
|
|
914
|
+
prev_node[4] >= best_prev_cum_qual:
|
|
915
|
+
prev_j, prev_i, prev_cluster_index, prev_qual, best_prev_cum_qual = prev_node
|
|
916
|
+
cum_qual = best_prev_cum_qual + qual
|
|
917
|
+
prev_cache[int(j)] = (j, i, cluster_index, qual, cum_qual)
|
|
918
|
+
cum_qual_jump = cum_qual - 1000
|
|
919
|
+
if best_so_far[cur_index-1][4] < cum_qual_jump:
|
|
920
|
+
while (cur_index < len(best_so_far)) and (best_so_far[cur_index][4] <= cum_qual_jump):
|
|
921
|
+
del best_so_far[cur_index]
|
|
922
|
+
best_so_far.add((j, i, cluster_index, qual, cum_qual_jump))
|
|
923
|
+
if forward_min[i] == j and cur_index > 1:
|
|
924
|
+
del best_so_far[:cur_index-1]
|
|
925
|
+
cum_qual_cluster_jump = cum_qual - 50
|
|
926
|
+
if cluster_last[3] < cum_qual_cluster_jump:
|
|
927
|
+
clusters_best_so_far[cluster_index] = (j, i, qual, cum_qual_cluster_jump)
|
|
928
|
+
backpointers[(j, i)] = (prev_j, prev_i, prev_cluster_index, prev_qual, best_prev_cum_qual)
|
|
929
|
+
path = [best_so_far[-1]]
|
|
930
|
+
while path[-1][:2] in backpointers:
|
|
931
|
+
path.append(backpointers[path[-1][:2]])
|
|
932
|
+
path.pop()
|
|
933
|
+
path.reverse()
|
|
934
|
+
path = np.array(path)
|
|
935
|
+
y, x, cluster_indices, quals, cum_quals = path.T
|
|
936
|
+
|
|
937
|
+
nondescription = ((quals == 0) | (quals > .3))
|
|
938
|
+
similarity_ratio_x = float(len(set(x[nondescription]))) / len(audio_desc_features_scaled)
|
|
939
|
+
similarity_ratio_y = float(len(set(y[nondescription]))) / len(video_features_scaled)
|
|
940
|
+
similarity_percent = 100 * max(similarity_ratio_x, similarity_ratio_y)
|
|
941
|
+
|
|
942
|
+
nodes = []
|
|
943
|
+
if cluster_indices[0] == cluster_indices[1]:
|
|
944
|
+
nodes.append((x[0], y[0]))
|
|
945
|
+
for i in range(len(x) - 1):
|
|
946
|
+
if cluster_indices[i] != cluster_indices[i+1]:
|
|
947
|
+
nodes.append((x[i] - .1, y[i] - .1))
|
|
948
|
+
nodes.append((x[i+1] + .1, y[i+1] + .1))
|
|
949
|
+
if cluster_indices[-2] == cluster_indices[-1]:
|
|
950
|
+
nodes.append((x[-1], y[-1]))
|
|
951
|
+
x, y = np.array(nodes).T / 210.
|
|
952
|
+
|
|
953
|
+
if (x[1] - x[0]) > 2:
|
|
954
|
+
slope_start = (y[1] - y[0]) / (x[1] - x[0])
|
|
955
|
+
x[0] = 0
|
|
956
|
+
y[0] = y[1] - (x[1] * slope_start)
|
|
957
|
+
if y[0] < 0:
|
|
958
|
+
x[0] = x[1] - (y[1] / slope_start)
|
|
959
|
+
y[0] = 0
|
|
960
|
+
if (x[-1] - x[-2]) > 2:
|
|
961
|
+
slope_end = (y[-1] - y[-2]) / (x[-1] - x[-2])
|
|
962
|
+
x[-1] = ((len(audio_desc_energy) - 1) / 210.)
|
|
963
|
+
y[-1] = y[-2] + ((x[-1] - x[-2]) * slope_end)
|
|
964
|
+
if y[-1] > ((len(video_energy) - 1) / 210.):
|
|
965
|
+
y[-1] = ((len(video_energy) - 1) / 210.)
|
|
966
|
+
x[-1] = x[-2] + ((y[-1] - y[-2]) / slope_end)
|
|
967
|
+
|
|
968
|
+
path[:,:2] /= 210.
|
|
969
|
+
return x, y, similarity_percent, path, median_slope
|
|
970
|
+
|
|
838
971
|
# combines videos with matching audio files (e.g. audio descriptions)
|
|
839
972
|
# this is the main function of this script, it calls the other functions in order
|
|
840
|
-
def combine(video, audio,
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
alignment_dir=default_alignment_dir, extension="copy", display_func=None):
|
|
844
|
-
video_files, video_file_types = get_sorted_filenames(video, VIDEO_EXTENSIONS, AUDIO_EXTENSIONS)
|
|
973
|
+
def combine(video, audio, stretch_audio=False, yes=False, prepend="ad_", no_pitch_correction=False,
|
|
974
|
+
output_dir=default_output_dir, alignment_dir=default_alignment_dir):
|
|
975
|
+
video_files, has_audio_extensions = get_sorted_filenames(video, VIDEO_EXTENSIONS, AUDIO_EXTENSIONS)
|
|
845
976
|
|
|
846
|
-
if yes == False and sum(
|
|
977
|
+
if yes == False and sum(has_audio_extensions) > 0:
|
|
847
978
|
print("")
|
|
848
979
|
print("One or more audio files found in video input. Was this intentional?")
|
|
849
980
|
print("If not, press ctrl+c to kill this script.")
|
|
@@ -856,16 +987,16 @@ def combine(video, audio, smoothness=50, stretch_audio=False, keep_non_ad=False,
|
|
|
856
987
|
f"The audio path has {len(audio_desc_files)} files"]
|
|
857
988
|
raise RuntimeError("\n".join(error_msg))
|
|
858
989
|
|
|
859
|
-
|
|
860
|
-
ensure_folders_exist([output_dir]
|
|
990
|
+
print("")
|
|
991
|
+
ensure_folders_exist([output_dir])
|
|
861
992
|
if PLOT_ALIGNMENT_TO_FILE:
|
|
862
|
-
ensure_folders_exist([alignment_dir]
|
|
993
|
+
ensure_folders_exist([alignment_dir])
|
|
863
994
|
|
|
864
|
-
|
|
995
|
+
print("")
|
|
865
996
|
for (video_file, audio_desc_file) in zip(video_files, audio_desc_files):
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
997
|
+
print(os.path.split(video_file)[1])
|
|
998
|
+
print(os.path.split(audio_desc_file)[1])
|
|
999
|
+
print("")
|
|
869
1000
|
if yes == False:
|
|
870
1001
|
print("Are the above input file pairings correct?")
|
|
871
1002
|
print("If not, press ctrl+c to kill this script.")
|
|
@@ -874,414 +1005,728 @@ def combine(video, audio, smoothness=50, stretch_audio=False, keep_non_ad=False,
|
|
|
874
1005
|
|
|
875
1006
|
# if ffmpeg isn't installed, install it
|
|
876
1007
|
if not is_ffmpeg_installed():
|
|
877
|
-
|
|
1008
|
+
print("Downloading and installing ffmpeg (media editor, 50 MB download)...")
|
|
878
1009
|
get_ffmpeg()
|
|
879
1010
|
if not is_ffmpeg_installed():
|
|
880
1011
|
raise RuntimeError("Failed to install ffmpeg.")
|
|
881
|
-
|
|
1012
|
+
print("Successfully installed ffmpeg.")
|
|
882
1013
|
|
|
883
|
-
|
|
1014
|
+
print("Processing files:")
|
|
884
1015
|
|
|
885
|
-
for (video_file, audio_desc_file,
|
|
886
|
-
|
|
887
|
-
#
|
|
888
|
-
|
|
889
|
-
ext = os.path.splitext(video_file)[1]
|
|
890
|
-
else:
|
|
891
|
-
# add a dot to the extension if it's missing
|
|
892
|
-
ext = ('' if extension[0] == '.' else '.') + extension
|
|
893
|
-
output_filename = prepend + os.path.splitext(os.path.split(video_file)[1])[0] + ext
|
|
1016
|
+
for (video_file, audio_desc_file, has_audio_extension) in zip(video_files, audio_desc_files,
|
|
1017
|
+
has_audio_extensions):
|
|
1018
|
+
# Output filename (and extension) are the same as the input's, apart from the prepend and directory
|
|
1019
|
+
output_filename = prepend + os.path.split(video_file)[1]
|
|
894
1020
|
output_filename = os.path.join(output_dir, output_filename)
|
|
895
|
-
|
|
1021
|
+
print(f" {output_filename}")
|
|
896
1022
|
|
|
897
|
-
if
|
|
898
|
-
|
|
1023
|
+
if (not stretch_audio) and has_audio_extension:
|
|
1024
|
+
raise RuntimeError("Argument --stretch_audio is required when both inputs are audio files.")
|
|
1025
|
+
|
|
1026
|
+
if os.path.exists(output_filename) and os.path.getsize(output_filename) > 1e5:
|
|
1027
|
+
print(" output file already exists, skipping...")
|
|
899
1028
|
continue
|
|
900
1029
|
|
|
901
1030
|
# print warning if output file's full path is longer than Windows MAX_PATH (260)
|
|
902
1031
|
full_output_filename = os.path.abspath(output_filename)
|
|
903
1032
|
if IS_RUNNING_WINDOWS and len(full_output_filename) >= 260:
|
|
904
|
-
|
|
1033
|
+
print(" WARNING: very long output path, ffmpeg may fail...")
|
|
905
1034
|
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
video_spec = normalize_spec(video_spec_raw)
|
|
910
|
-
audio_desc_spec_raw, audio_desc_timings = tokenize_audio_dither(audio_desc_arr, video_timings)
|
|
911
|
-
audio_desc_spec = normalize_spec(audio_desc_spec_raw)
|
|
1035
|
+
num_channels = 2 if stretch_audio else 1
|
|
1036
|
+
print(" reading video file...\r", end='')
|
|
1037
|
+
video_arr = parse_audio_from_file(video_file, num_channels)
|
|
912
1038
|
|
|
913
|
-
|
|
914
|
-
|
|
1039
|
+
print(" computing video features... \r", end='')
|
|
1040
|
+
video_energy = get_energy(video_arr)
|
|
1041
|
+
video_zero_crossings = get_zero_crossings(video_arr)
|
|
1042
|
+
video_freq_bands = get_freq_bands(video_arr)
|
|
1043
|
+
video_features = [video_energy, video_zero_crossings] + video_freq_bands
|
|
915
1044
|
|
|
916
|
-
|
|
1045
|
+
if not stretch_audio:
|
|
1046
|
+
del video_arr
|
|
917
1047
|
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
1048
|
+
print(" reading audio file... \r", end='')
|
|
1049
|
+
audio_desc_arr = parse_audio_from_file(audio_desc_file, num_channels)
|
|
1050
|
+
|
|
1051
|
+
print(" computing audio features...\r", end='')
|
|
1052
|
+
audio_desc_energy = get_energy(audio_desc_arr)
|
|
1053
|
+
audio_desc_zero_crossings = get_zero_crossings(audio_desc_arr)
|
|
1054
|
+
audio_desc_freq_bands = get_freq_bands(audio_desc_arr)
|
|
1055
|
+
audio_desc_features = [audio_desc_energy, audio_desc_zero_crossings] + audio_desc_freq_bands
|
|
1056
|
+
|
|
1057
|
+
if not stretch_audio:
|
|
1058
|
+
del audio_desc_arr
|
|
924
1059
|
|
|
925
|
-
|
|
1060
|
+
outputs = align(video_features, audio_desc_features, video_energy, audio_desc_energy)
|
|
1061
|
+
audio_desc_times, video_times, similarity_percent, path, median_slope = outputs
|
|
926
1062
|
|
|
927
|
-
|
|
1063
|
+
del video_energy, video_zero_crossings, video_freq_bands, video_features
|
|
1064
|
+
del audio_desc_energy, audio_desc_zero_crossings, audio_desc_freq_bands, audio_desc_features
|
|
1065
|
+
|
|
1066
|
+
if similarity_percent < 20:
|
|
1067
|
+
print(f" WARNING: similarity {similarity_percent:.1f}%, likely mismatched files")
|
|
1068
|
+
if similarity_percent > 90:
|
|
1069
|
+
print(f" WARNING: similarity {similarity_percent:.1f}%, likely undescribed media")
|
|
928
1070
|
|
|
929
|
-
ad_timings = None
|
|
930
1071
|
if stretch_audio:
|
|
931
|
-
|
|
932
|
-
|
|
1072
|
+
# lower memory usage version of np.std for large arrays
|
|
1073
|
+
def low_ram_std(arr):
|
|
1074
|
+
avg = np.mean(arr, dtype=np.float64)
|
|
1075
|
+
return np.sqrt(np.einsum('ij,ij->i', arr, arr, dtype=np.float64)/np.prod(arr.shape) - (avg**2))
|
|
933
1076
|
|
|
934
|
-
|
|
935
|
-
|
|
1077
|
+
# rescale RMS intensity of audio to match video
|
|
1078
|
+
audio_desc_arr *= (low_ram_std(video_arr) / low_ram_std(audio_desc_arr))[:, None]
|
|
936
1079
|
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
smooth_path, ad_detect_sensitivity, boost_sensitivity)
|
|
940
|
-
speech_sample_mask, boost_sample_mask, ad_timings = outputs
|
|
941
|
-
if keep_non_ad:
|
|
942
|
-
video_arr *= speech_sample_mask
|
|
943
|
-
video_arr += video_arr_original * (1 - speech_sample_mask)
|
|
944
|
-
del video_arr_original
|
|
945
|
-
del speech_sample_mask
|
|
946
|
-
else:
|
|
947
|
-
ad_timings = None
|
|
948
|
-
if boost != 0:
|
|
949
|
-
video_arr = video_arr * (1. + (10**(boost / 10.) - 1.) * boost_sample_mask)
|
|
950
|
-
del boost_sample_mask
|
|
1080
|
+
replace_aligned_segments(video_arr, audio_desc_arr, audio_desc_times, video_times, no_pitch_correction)
|
|
1081
|
+
del audio_desc_arr
|
|
951
1082
|
|
|
952
|
-
# prevent peaking by rescaling to within +/-
|
|
1083
|
+
# prevent peaking by rescaling to within +/- 32,766
|
|
953
1084
|
video_arr *= (2**15 - 2.) / np.max(np.abs(video_arr))
|
|
954
1085
|
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
write_replaced_media_to_disk(output_filename, video_arr)
|
|
1086
|
+
print(" processing output file... \r", end='')
|
|
1087
|
+
write_replaced_media_to_disk(output_filename, video_arr, None if has_audio_extension else video_file)
|
|
1088
|
+
del video_arr
|
|
959
1089
|
else:
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
start_key_frame = get_closest_key_frame_time(video_file, video_offset)
|
|
966
|
-
setts_cmd = encode_fit_as_ffmpeg_expr(smooth_path, clips, video_offset, start_key_frame)
|
|
1090
|
+
video_offset = video_times[0] - audio_desc_times[0]
|
|
1091
|
+
# to make ffmpeg cut at the last keyframe before the audio starts, use a timestamp after it
|
|
1092
|
+
after_start_key_frame = get_closest_key_frame_time(video_file, video_offset)
|
|
1093
|
+
print(" processing output file... \r", end='')
|
|
1094
|
+
setts_cmd = encode_fit_as_ffmpeg_expr(audio_desc_times, video_times, video_offset)
|
|
967
1095
|
write_replaced_media_to_disk(output_filename, None, video_file, audio_desc_file,
|
|
968
|
-
setts_cmd,
|
|
1096
|
+
setts_cmd, video_offset, after_start_key_frame)
|
|
969
1097
|
|
|
970
|
-
del video_arr
|
|
971
1098
|
if PLOT_ALIGNMENT_TO_FILE:
|
|
972
1099
|
plot_filename_no_ext = os.path.join(alignment_dir, os.path.splitext(os.path.split(video_file)[1])[0])
|
|
973
|
-
plot_alignment(plot_filename_no_ext, path,
|
|
974
|
-
|
|
975
|
-
|
|
1100
|
+
plot_alignment(plot_filename_no_ext, path, audio_desc_times, video_times, similarity_percent,
|
|
1101
|
+
median_slope, stretch_audio, no_pitch_correction)
|
|
1102
|
+
print("All files processed. ")
|
|
976
1103
|
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
1104
|
+
if wx is not None:
|
|
1105
|
+
def write_config_file(config_path, settings):
|
|
1106
|
+
config = configparser.ConfigParser()
|
|
1107
|
+
config.add_section('alignment')
|
|
1108
|
+
config['alignment'] = {}
|
|
1109
|
+
for key, value in settings.items():
|
|
1110
|
+
config['alignment'][key] = str(value)
|
|
1111
|
+
with open(config_path, 'w') as f:
|
|
1112
|
+
config.write(f)
|
|
985
1113
|
|
|
986
|
-
def read_config_file(config_path: Path):
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1114
|
+
def read_config_file(config_path: Path):
|
|
1115
|
+
config = configparser.ConfigParser()
|
|
1116
|
+
config.read(config_path)
|
|
1117
|
+
settings = {'stretch_audio': config.getboolean('alignment', 'stretch_audio', fallback=False),
|
|
1118
|
+
'prepend': config.get('alignment', 'prepend', fallback='ad_'),
|
|
1119
|
+
'no_pitch_correction': config.getboolean('alignment', 'no_pitch_correction', fallback=False),
|
|
1120
|
+
'output_dir': config.get('alignment', 'output_dir', fallback=default_output_dir),
|
|
1121
|
+
'alignment_dir': config.get('alignment', 'alignment_dir', fallback=default_alignment_dir)}
|
|
1122
|
+
if not config.has_section('alignment'):
|
|
1123
|
+
write_config_file(config_path, settings)
|
|
1124
|
+
return settings
|
|
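A round-trip sketch for the two config helpers above (hypothetical temporary path; assumes wx imported successfully, since both helpers are defined inside that branch):

import tempfile
from pathlib import Path

config_path = Path(tempfile.gettempdir()) / 'describealign_demo.ini'
write_config_file(config_path, {'stretch_audio': True, 'prepend': 'ad_'})
print(read_config_file(config_path)['stretch_audio'])  # True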
1125
|
+
|
|
1126
|
+
def set_tooltip(element, tip):
|
|
1127
|
+
element.SetToolTip(tip)
|
|
1128
|
+
# prevent tooltip from disappearing for 30 seconds
|
|
1129
|
+
tooltip_object = element.GetToolTip()
|
|
1130
|
+
if tooltip_object is not None:
|
|
1131
|
+
tooltip_object.SetAutoPop(30000)
|
|
1132
|
+
|
|
1133
|
+
class DialogSettings(wx.Dialog):
|
|
1134
|
+
def __init__(self, parent, config_path, is_dark):
|
|
1135
|
+
wx.Dialog.__init__(self, parent, title="Settings - describealign", size=wx.Size(450,330),
|
|
1136
|
+
style=wx.DEFAULT_DIALOG_STYLE|wx.TAB_TRAVERSAL)
|
|
1137
|
+
# setting the GUI dialog's font causes all contained elements to inherit that font by default
|
|
1138
|
+
self.SetFont(wx.Font(*gui_font))
|
|
1139
|
+
self.SetBackgroundColour(gui_background_color_dark if is_dark else gui_background_color_light)
|
|
1140
|
+
|
|
1141
|
+
self.text_header = wx.StaticText(self, label="Check tooltips (i.e. mouse-over text) for descriptions:")
|
|
1142
|
+
|
|
1143
|
+
self.static_box_sizer_output = wx.StaticBoxSizer(wx.VERTICAL, self, "output_dir")
|
|
1144
|
+
self.dir_picker_output = wx.DirPickerCtrl(self, message="Select a folder", name="output_dir")
|
|
1145
|
+
set_tooltip(self.dir_picker_output, "Directory combined output media is saved to. " + \
|
|
1146
|
+
"Default is \"videos_with_ad\"")
|
|
1147
|
+
|
|
1148
|
+
self.static_box_sizer_alignment = wx.StaticBoxSizer(wx.VERTICAL, self, "alignment_dir")
|
|
1149
|
+
self.dir_picker_alignment = wx.DirPickerCtrl(self, message="Select a folder", name="alignment_dir")
|
|
1150
|
+
set_tooltip(self.dir_picker_alignment, "Directory alignment data and plots are saved to. " + \
|
|
1151
|
+
"Default is \"alignment_plots\"")
|
|
1152
|
+
|
|
1153
|
+
self.text_prepend = wx.StaticText(self, label="prepend:")
|
|
1154
|
+
self.text_ctrl_prepend = wx.TextCtrl(self, name="prepend")
|
|
1155
|
+
set_tooltip(self.text_ctrl_prepend, "Output file name prepend text. Default is \"ad_\"")
|
|
1156
|
+
|
|
1157
|
+
panel_stretch_audio_no_pitch_correction = wx.Panel(self)
|
|
1158
|
+
|
|
1159
|
+
self.checkbox_stretch_audio = wx.CheckBox(panel_stretch_audio_no_pitch_correction,
|
|
1160
|
+
label="stretch_audio", name="stretch_audio")
|
|
1161
|
+
set_tooltip(self.checkbox_stretch_audio, "Stretches the input audio to fit the input video. " + \
|
|
1162
|
+
"Default is to stretch the video to fit the audio. " + \
|
|
1163
|
+
"Keeps original video audio as secondary tracks. Slower " + \
|
|
1164
|
+
"and uses more RAM when enabled, long videos may cause " + \
|
|
1165
|
+
"paging or Out of Memory errors on low-RAM systems.")
|
|
1166
|
+
self.checkbox_stretch_audio.Bind(wx.EVT_CHECKBOX, self.update_stretch_audio_subsettings)
|
|
1167
|
+
|
|
1168
|
+
self.checkbox_no_pitch_correction = wx.CheckBox(panel_stretch_audio_no_pitch_correction,
|
|
1169
|
+
label="no_pitch_correction", name="no_pitch_correction")
|
|
1170
|
+
set_tooltip(self.checkbox_no_pitch_correction, "Skips pitch correction step when stretching audio. " + \
|
|
1171
|
+
"Requires --stretch_audio to be set, otherwise " + \
|
|
1172
|
+
"does nothing.")
|
|
1173
|
+
|
|
1174
|
+
self.button_save = wx.Button(self, label="Save")
|
|
1175
|
+
self.button_save.Bind(wx.EVT_BUTTON, self.save_settings)
|
|
1176
|
+
self.button_cancel = wx.Button(self, label="Cancel")
|
|
1177
|
+
self.button_cancel.Bind(wx.EVT_BUTTON, lambda event: self.EndModal(0))
|
|
1178
|
+
|
|
1179
|
+
sizer_dialog = wx.BoxSizer(wx.VERTICAL)
|
|
1180
|
+
sizer_output_dir = wx.BoxSizer(wx.HORIZONTAL)
|
|
1181
|
+
sizer_alignment_dir = wx.BoxSizer(wx.HORIZONTAL)
|
|
1182
|
+
sizer_prepend = wx.BoxSizer(wx.HORIZONTAL)
|
|
1183
|
+
sizer_stretch_audio_no_pitch_correction_outer = wx.BoxSizer(wx.HORIZONTAL)
|
|
1184
|
+
sizer_stretch_audio_no_pitch_correction_inner = wx.BoxSizer(wx.VERTICAL)
|
|
1185
|
+
sizer_save_cancel = wx.BoxSizer(wx.HORIZONTAL)
|
|
1186
|
+
|
|
1187
|
+
# Configure layout with nested Box Sizers:
|
|
1188
|
+
#
|
|
1189
|
+
# Frame
|
|
1190
|
+
# sizer_dialog
|
|
1191
|
+
# text_header
|
|
1192
|
+
# sizer_output_dir
|
|
1193
|
+
# static_box_sizer_output
|
|
1194
|
+
# dir_picker_output
|
|
1195
|
+
# sizer_alignment_dir
|
|
1196
|
+
# static_box_sizer_alignment
|
|
1197
|
+
# dir_picker_alignment
|
|
1198
|
+
# sizer_prepend
|
|
1199
|
+
# text_prepend
|
|
1200
|
+
# text_ctrl_prepend
|
|
1201
|
+
# sizer_stretch_audio_no_pitch_correction_outer
|
|
1202
|
+
# panel_stretch_audio_no_pitch_correction
|
|
1203
|
+
# sizer_stretch_audio_no_pitch_correction_inner
|
|
1204
|
+
# checkbox_stretch_audio
|
|
1205
|
+
# checkbox_no_pitch_correction
|
|
1206
|
+
# sizer_save_cancel
|
|
1207
|
+
# button_save
|
|
1208
|
+
# button_cancel
|
|
1209
|
+
#
|
|
1210
|
+
self.SetSizer(sizer_dialog)
|
|
1211
|
+
sizer_dialog.Add(self.text_header, 0, wx.ALL, 5)
|
|
1212
|
+
sizer_dialog.Add(sizer_output_dir, 1, wx.LEFT|wx.RIGHT|wx.EXPAND, 2)
|
|
1213
|
+
sizer_dialog.Add(sizer_alignment_dir, 1, wx.LEFT|wx.RIGHT|wx.EXPAND, 2)
|
|
1214
|
+
sizer_dialog.Add(sizer_prepend, 1, wx.LEFT|wx.EXPAND, 5)
|
|
1215
|
+
sizer_dialog.Add(sizer_stretch_audio_no_pitch_correction_outer, 1, wx.LEFT|wx.EXPAND, 5)
|
|
1216
|
+
sizer_stretch_audio_no_pitch_correction_outer.Add(panel_stretch_audio_no_pitch_correction,
|
|
1217
|
+
1, wx.LEFT|wx.EXPAND, 5)
|
|
1218
|
+
sizer_stretch_audio_no_pitch_correction_outer.Add((0, 0), 2, wx.EXPAND, 5) # spacer
|
|
1219
|
+
sizer_dialog.Add(sizer_save_cancel, 2, wx.BOTTOM|wx.EXPAND, 5)
|
|
1220
|
+
sizer_prepend.Add(self.text_prepend, 0, wx.ALL|wx.ALIGN_CENTER_VERTICAL, 5)
|
|
1221
|
+
sizer_prepend.Add(self.text_ctrl_prepend, 0, wx.ALIGN_CENTER_VERTICAL, 5)
|
|
1222
|
+
sizer_output_dir.Add(self.static_box_sizer_output, 1, wx.LEFT|wx.RIGHT|wx.ALIGN_CENTER_VERTICAL, 5)
|
|
1223
|
+
self.static_box_sizer_output.Add(self.dir_picker_output, 1, wx.EXPAND)
|
|
1224
|
+
sizer_alignment_dir.Add(self.static_box_sizer_alignment, 1, wx.LEFT|wx.RIGHT|wx.ALIGN_CENTER_VERTICAL, 5)
|
|
1225
|
+
self.static_box_sizer_alignment.Add(self.dir_picker_alignment, 1, wx.EXPAND)
|
|
1226
|
+
panel_stretch_audio_no_pitch_correction.SetSizer(sizer_stretch_audio_no_pitch_correction_inner)
|
|
1227
|
+
sizer_stretch_audio_no_pitch_correction_inner.Add(self.checkbox_stretch_audio, 0, wx.ALL, 5)
|
|
1228
|
+
sizer_stretch_audio_no_pitch_correction_inner.Add(self.checkbox_no_pitch_correction, 0, wx.ALL, 5)
|
|
1229
|
+
sizer_save_cancel.Add((0, 0), 3, wx.EXPAND, 5) # spacer
|
|
1230
|
+
sizer_save_cancel.Add(self.button_save, 0, wx.ALL|wx.ALIGN_CENTER_VERTICAL, 5)
|
|
1231
|
+
sizer_save_cancel.Add((0, 0), 2, wx.EXPAND, 5) # spacer
|
|
1232
|
+
sizer_save_cancel.Add(self.button_cancel, 0, wx.ALL|wx.ALIGN_CENTER_VERTICAL, 5)
|
|
1233
|
+
sizer_save_cancel.Add((0, 0), 3, wx.EXPAND, 5) # spacer
|
|
1234
|
+
|
|
1235
|
+
# centers GUI on the screen
|
|
1236
|
+
self.Centre(wx.BOTH)
|
|
1237
|
+
|
|
1238
|
+
# cache dictionaries mapping setting names to widget setter and getter functions
|
|
1239
|
+
self.setting_getters = {}
|
|
1240
|
+
self.setting_setters = {}
|
|
1241
|
+
for child in itertools.chain(self.GetChildren(),
|
|
1242
|
+
panel_stretch_audio_no_pitch_correction.GetChildren()):
|
|
1243
|
+
child_class_name = child.GetClassName()
|
|
1244
|
+
child_name = child.GetName()
|
|
1245
|
+
if child_class_name == "wxDirPickerCtrl":
|
|
1246
|
+
self.setting_getters[child_name] = child.GetPath
|
|
1247
|
+
self.setting_setters[child_name] = child.SetPath
|
|
1248
|
+
if child_class_name in ["wxCheckBox"]:
|
|
1249
|
+
self.setting_getters[child_name] = child.GetValue
|
|
1250
|
+
self.setting_setters[child_name] = child.SetValue
|
|
1251
|
+
if child_class_name in ["wxTextCtrl"]:
|
|
1252
|
+
self.setting_getters[child_name] = child.GetValue
|
|
1253
|
+
self.setting_setters[child_name] = lambda value, child=child: child.SetValue(str(value))
|
|
1254
|
+
self.setting_names = self.setting_getters.keys()
|
|
1255
|
+
|
|
1256
|
+
# initialize setting widgets to saved config values
|
|
1257
|
+
self.config_path = config_path
|
|
1258
|
+
config_file_settings = read_config_file(self.config_path)
|
|
1259
|
+
for setting_name in self.setting_names:
|
|
1260
|
+
self.setting_setters[setting_name](config_file_settings[setting_name])
|
|
1003
1261
|
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
tooltip='Directory combined output media is saved to. Default is "videos_with_ad"'),
|
|
1017
|
-
sg.FolderBrowse(button_text="Browse Folder", key='output_browse')]])],
|
|
1018
|
-
[sg.Column([[sg.Text('alignment_dir:', size=(13, 1.2), pad=(1,5)),
|
|
1019
|
-
sg.Input(default_text=str(settings['alignment_dir']), size=(22, 1.2), pad=(10,5), key='alignment_dir',
|
|
1020
|
-
tooltip='Directory alignment data and plots are saved to. Default is "alignment_plots"'),
|
|
1021
|
-
sg.FolderBrowse(button_text="Browse Folder", key='alignment_browse')]], pad=(2,7))],
|
|
1022
|
-
[sg.Column([[sg.Text('smoothness:', size=(12, 1), pad=(1,5)),
|
|
1023
|
-
sg.Input(default_text=str(settings['smoothness']), size=(8, 1.2), pad=(10,5), key='smoothness',
|
|
1024
|
-
tooltip='Lower values make the alignment more accurate when there are skips ' + \
|
|
1025
|
-
'(e.g. describer pauses), but also make it more likely to misalign. ' + \
|
|
1026
|
-
'Default is 50.')]])],
|
|
1027
|
-
[sg.Checkbox('stretch_audio', default=settings['stretch_audio'], key='stretch_audio', change_submits=True,
|
|
1028
|
-
tooltip='Stretches the input audio to fit the input video. ' + \
|
|
1029
|
-
'Default is to stretch the video to fit the audio.')],
|
|
1030
|
-
[sg.Checkbox('keep_non_ad', default=settings['keep_non_ad'], key='keep_non_ad',
|
|
1031
|
-
disabled=not settings['stretch_audio'],
|
|
1032
|
-
tooltip='Tries to only replace segments with audio description. Useful if ' + \
|
|
1033
|
-
'video\'s audio quality is better. Default is to replace all aligned audio. ' + \
|
|
1034
|
-
'Requires --stretch_audio to be set, otherwise does nothing.')],
|
|
1035
|
-
[sg.Column([[sg.Text('boost:', size=(6, 1), pad=(1,5)),
|
|
1036
|
-
sg.Input(default_text=str(settings['boost']), size=(8, 1.2), pad=(10,5),
|
|
1037
|
-
key='boost', disabled=not settings['stretch_audio'],
|
|
1038
|
-
tooltip='Boost (or quieten) description volume. Units are decibels (dB), so ' + \
|
|
1039
|
-
'-3 makes the describer about 2x quieter, while 3 makes them 2x louder. ' + \
|
|
1040
|
-
'Requires --stretch_audio to be set, otherwise does nothing.')]])],
|
|
1041
|
-
[sg.Column([[sg.Text('ad_detect_sensitivity:', size=(21, 1.2), pad=(2,5)),
|
|
1042
|
-
sg.Input(default_text=str(settings['ad_detect_sensitivity']), size=(8, 1.2), pad=(10,5),
|
|
1043
|
-
key='ad_detect_sensitivity', disabled=not settings['stretch_audio'],
|
|
1044
|
-
tooltip='Audio description detection sensitivity ratio. Higher values make ' + \
|
|
1045
|
-
'--keep_non_ad more likely to replace aligned audio. Default is 0.6')]])],
|
|
1046
|
-
[sg.Column([[sg.Text('boost_sensitivity:', size=(17, 1.2), pad=(1,5)),
|
|
1047
|
-
sg.Input(default_text=str(settings['boost_sensitivity']), size=(8, 1.2), pad=(10,5),
|
|
1048
|
-
key='boost_sensitivity', disabled=not settings['stretch_audio'],
|
|
1049
|
-
tooltip='Higher values make --boost less likely to miss a description, but ' + \
|
|
1050
|
-
'also make it more likely to boost non-description audio. Default is 0.4')]])],
|
|
1051
|
-
[sg.Checkbox('no_pitch_correction', default=settings['no_pitch_correction'], key='no_pitch_correction',
|
|
1052
|
-
disabled=not settings['stretch_audio'],
|
|
1053
|
-
tooltip='Skips pitch correction step when stretching audio. ' + \
|
|
1054
|
-
'Requires --stretch_audio to be set, otherwise does nothing.')],
|
|
1055
|
-
[sg.Column([[sg.Submit('Save', pad=(40,3)),
|
|
1056
|
-
sg.Button('Cancel')]], pad=((135,3),10))]]
|
|
1057
|
-
settings_window = sg.Window('Settings - describealign', layout, font=('Arial', 16), finalize=True)
|
|
1058
|
-
settings_window['extension'].set_focus()
|
|
1059
|
-
while True:
|
|
1060
|
-
event, values = settings_window.read()
|
|
1061
|
-
if event in (sg.WIN_CLOSED, 'Cancel') or settings_window.TKrootDestroyed:
|
|
1062
|
-
break
|
|
1063
|
-
if event == 'stretch_audio':
|
|
1064
|
-
# work around bug in PySimpleGUIWx's InputText Update function where enabling/disabling are flipped
|
|
1065
|
-
if IS_RUNNING_WINDOWS:
|
|
1066
|
-
settings_window['boost'].Update(disabled = values['stretch_audio'])
|
|
1067
|
-
settings_window['ad_detect_sensitivity'].Update(disabled = values['stretch_audio'])
|
|
1068
|
-
settings_window['boost_sensitivity'].Update(disabled = values['stretch_audio'])
|
|
1262
|
+
# initialize stretch_audio subsettings to be disabled/enabled
|
|
1263
|
+
self.update_stretch_audio_subsettings()
|
|
1264
|
+
|
|
1265
|
+
set_background_color(self, is_dark)
|
|
1266
|
+
if sum(self.checkbox_stretch_audio.GetForegroundColour()[:3]) < 350:
|
|
1267
|
+
panel_stretch_audio_no_pitch_correction.SetBackgroundColour(gui_background_color_light)
|
|
1268
|
+
|
|
1269
|
+
def update_stretch_audio_subsettings(self, event=None):
|
|
1270
|
+
subsettings = [self.checkbox_no_pitch_correction]
|
|
1271
|
+
if self.checkbox_stretch_audio.IsChecked():
|
|
1272
|
+
for subsetting in subsettings:
|
|
1273
|
+
subsetting.Enable()
|
|
1069
1274
|
else:
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1275
|
+
for subsetting in subsettings:
|
|
1276
|
+
subsetting.Disable()
|
|
1277
|
+
|
|
1278
|
+
def save_settings(self, event):
|
|
1279
|
+
settings = {}
|
|
1280
|
+
for setting_name in self.setting_names:
|
|
1281
|
+
settings[setting_name] = self.setting_getters[setting_name]()
|
|
1282
|
+
write_config_file(self.config_path, settings)
|
|
1283
|
+
self.EndModal(0)
|
|
1284
|
+
|
|
1285
|
+
class QueueWriter(io.TextIOWrapper):
|
|
1286
|
+
def __init__(self, queue) -> None:
|
|
1287
|
+
super().__init__(buffer=io.BytesIO())
|
|
1288
|
+
self._queue = queue
|
|
1289
|
+
|
|
1290
|
+
def write(self, s: str) -> int:
|
|
1291
|
+
self._queue.put(s)
|
|
1292
|
+
return len(s)
|
|
1082
1293
|
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1294
|
+
def combine_print_exceptions(print_queue, *args, **kwargs):
|
|
1295
|
+
writer = QueueWriter(print_queue)
|
|
1296
|
+
with redirect_stdout(writer), redirect_stderr(writer):
|
|
1297
|
+
try:
|
|
1298
|
+
combine(*args, **kwargs)
|
|
1299
|
+
except Exception:
|
|
1300
|
+
print(" ERROR: exception raised")
|
|
1301
|
+
traceback.print_exc()
|
|
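How these pieces fit together, sketched with hypothetical filenames: the worker process prints into a multiprocessing.Queue via QueueWriter, and the parent drains it afterwards (the GUI's update timer below does the same thing incrementally).

import multiprocessing

if __name__ == '__main__':
    print_queue = multiprocessing.Queue()
    worker = multiprocessing.Process(target=combine_print_exceptions,
                                     args=(print_queue, 'movie.mkv', 'ad_track.mp3'),
                                     kwargs={'yes': True}, daemon=True)
    worker.start()
    worker.join()
    while not print_queue.empty():
        print(print_queue.get(), end='')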
1302
|
+
|
|
1303
|
+
class FrameCombine(wx.Frame):
|
|
1304
|
+
def __init__(self, parent, config_path, video_files, audio_files, is_dark):
|
|
1305
|
+
wx.Frame.__init__(self, parent, title="Combining - describealign", size=wx.Size(800,600))
|
|
1306
|
+
# setting the GUI frame's font causes all contained elements to inherit that font by default
|
|
1307
|
+
self.SetFont(wx.Font(*gui_font))
|
|
1308
|
+
self.SetBackgroundColour(gui_background_color_dark if is_dark else gui_background_color_light)
|
|
1309
|
+
# wrap all widgets within a panel to enable tab traversal (i.e. pressing tab to swap GUI focus)
|
|
1310
|
+
self.panel0 = wx.Panel(self, style=wx.TAB_TRAVERSAL)
|
|
1311
|
+
|
|
1312
|
+
self.text_ctrl_output = wx.TextCtrl(self.panel0, style=wx.TE_MULTILINE|wx.TE_READONLY|wx.TE_RICH)
|
|
1313
|
+
|
|
1314
|
+
self.button_close = wx.Button(self.panel0, label="Close")
|
|
1315
|
+
self.button_close.Bind(wx.EVT_BUTTON, self.attempt_close)
|
|
1316
|
+
# also capture other close events such as alt+f4 or clicking the X in the top corner of the frame
|
|
1317
|
+
self.Bind(wx.EVT_CLOSE, self.attempt_close)
|
|
1318
|
+
|
|
1319
|
+
self.update_timer = wx.Timer(self)
|
|
1320
|
+
self.Bind(wx.EVT_TIMER, self.update_gui, self.update_timer)
|
|
1321
|
+
|
|
1322
|
+
sizer_panel_outer = wx.BoxSizer(wx.VERTICAL)
|
|
1323
|
+
sizer_panel_inner = wx.BoxSizer(wx.VERTICAL)
|
|
1324
|
+
sizer_close_button = wx.BoxSizer(wx.HORIZONTAL)
|
|
1325
|
+
|
|
1326
|
+
# Configure layout with nested Box Sizers:
|
|
1327
|
+
#
|
|
1328
|
+
# Frame
|
|
1329
|
+
# sizer_panel_outer
|
|
1330
|
+
# panel0
|
|
1331
|
+
# sizer_panel_inner
|
|
1332
|
+
# text_ctrl_output
|
|
1333
|
+
# sizer_close_button
|
|
1334
|
+
# button_close
|
|
1335
|
+
#
|
|
1336
|
+
self.SetSizer(sizer_panel_outer)
|
|
1337
|
+
sizer_panel_outer.Add(self.panel0, 1, wx.EXPAND|wx.ALL, 5)
|
|
1338
|
+
self.panel0.SetSizer(sizer_panel_inner)
|
|
1339
|
+
sizer_panel_inner.Add(self.text_ctrl_output, 1, wx.ALL|wx.EXPAND, 5)
|
|
1340
|
+
sizer_panel_inner.Add(sizer_close_button, 0, wx.EXPAND, 5)
|
|
1341
|
+
sizer_close_button.Add((0, 0), 1, wx.EXPAND, 5) # spacer
|
|
1342
|
+
sizer_close_button.Add(self.button_close, 0, wx.ALL, 5)
|
|
1343
|
+
sizer_close_button.Add((0, 0), 1, wx.EXPAND, 5) # spacer
|
|
1344
|
+
|
|
1345
|
+
# centers GUI on the screen
|
|
1346
|
+
self.Centre(wx.BOTH)
|
|
1347
|
+
|
|
1348
|
+
set_background_color(self, is_dark)
|
|
1349
|
+
|
|
1350
|
+
self.config_path = config_path
|
|
1351
|
+
self.overwrite_last_line = False
|
|
1352
|
+
self.display_line('Combining media files:')
|
|
1353
|
+
self.text_ctrl_output.SetInsertionPoint(0)
|
|
1354
|
+
|
|
1355
|
+
# launch combiner using settings from config file, redirecting its output to a queue
|
|
1356
|
+
self.print_queue = multiprocessing.Queue()
|
|
1357
|
+
settings = read_config_file(self.config_path)
|
|
1358
|
+
settings.update({'yes':True})
|
|
1359
|
+
self.combine_process = multiprocessing.Process(target=combine_print_exceptions,
|
|
1360
|
+
args=(self.print_queue, video_files, audio_files),
|
|
1361
|
+
kwargs=settings, daemon=True)
|
|
1362
|
+
self.combine_process.start()
|
|
1363
|
+
self.update_gui()
|
|
1364
|
+
|
|
1365
|
+
def attempt_close(self, event):
|
|
1366
|
+
if self.combine_process.is_alive():
|
|
1367
|
+
dialog = wx.MessageDialog(self, "Warning: combiner is still running; stop it and close anyway?",
|
|
1368
|
+
"Warning", wx.YES_NO|wx.ICON_WARNING)
|
|
1369
|
+
response = dialog.ShowModal()
|
|
1370
|
+
if (response == wx.ID_YES):
|
|
1371
|
+
self.combine_process.terminate()
|
|
1372
|
+
self.Destroy()
|
|
1373
|
+
elif (response == wx.ID_NO):
|
|
1374
|
+
# If the EVT_CLOSE came from the OS, let the OS know it didn't succeed
|
|
1375
|
+
if event.GetEventType() == wx.EVT_CLOSE.evtType[0]:
|
|
1376
|
+
event.Veto(True)
|
|
1377
|
+
else:
|
|
1378
|
+
self.Destroy()
|
|
1379
|
+
|
|
1380
|
+
def set_last_line_color(self, color, line_start):
|
|
1381
|
+
num_lines = self.text_ctrl_output.GetNumberOfLines()
|
|
1382
|
+
end = self.text_ctrl_output.GetLastPosition()
|
|
1383
|
+
self.text_ctrl_output.SetStyle(line_start, end, wx.TextAttr("black", color))
|
|
 
-
-
-
+  def display_line(self, line):
+    if self.overwrite_last_line:
+      # skip the empty line following lines ending in "\r"
+      if line == "":
+        return
+      num_lines = self.text_ctrl_output.GetNumberOfLines()
+      start = self.text_ctrl_output.XYToPosition(0,num_lines-2)
+      end = self.text_ctrl_output.GetLastPosition()
+      self.text_ctrl_output.Remove(start, end)
+      self.overwrite_last_line = False
+    if line[-1:] == "\r":
+      self.overwrite_last_line = True
+      line = line[:-1].rstrip(' ') + "\r"
+    line_start = self.text_ctrl_output.GetLastPosition()
+    self.text_ctrl_output.AppendText(line)
+    # highlight warnings by changing their background color to light orange
+    if line[:10] == "  WARNING:":
+      self.set_last_line_color(wx.Colour(255, 188, 64), line_start)
+    # highlight errors by changing their background color to red
+    if line[:8] == "  ERROR:":
+      self.set_last_line_color(wx.Colour(255, 128, 128), line_start)
+
+  def update_gui(self, event=None):
+    lines = []
+    while not self.print_queue.empty():
+      lines.append(self.print_queue.get())
+    if len(lines) > 0:
+      cursor_position = self.text_ctrl_output.GetInsertionPoint()
+      self.text_ctrl_output.Freeze()
+      for line in lines:
+        self.display_line(line)
+      self.text_ctrl_output.SetInsertionPoint(cursor_position)
+      self.text_ctrl_output.Thaw()
+    self.update_timer.StartOnce(gui_update_interval_ms)
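Note: display_line treats a trailing "\r" as "overwrite the previous line", the console progress-bar convention used for the combiner's status output. A minimal sketch of the producer side (illustrative only; in the real code the worker's printed lines are redirected into print_queue elsewhere in this module):

    import multiprocessing

    def report_progress(print_queue):
      # each update ends in "\r" so the GUI overwrites the previous line
      for percent in range(0, 101, 25):
        print_queue.put(f"aligning: {percent}%\r")
      print_queue.put("aligning: done")  # final line, no "\r", is kept

    if __name__ == '__main__':
      q = multiprocessing.Queue()
      report_progress(q)
      while not q.empty():
        print(q.get())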
 
-def
-
-
+def migrate_config(old_path: Optional[Path], new_path: Path) -> None:
+  """
+  Migrate configuration from old location.
+
+  Only runs if the old_path exists but new_path does not
+  """
+  if new_path.exists() or not old_path or not old_path.exists():
+    return
+
+  old_data = old_path.read_text(encoding='utf-8')
+  new_path.write_text(old_data, encoding='utf-8')
+  print(f"Configuration migrated to {new_path}")
   try:
-
-except
-traceback.
+    old_path.unlink()
+  except OSError as exc:
+    print("Failed to remove old config:", *traceback.format_exception_only(exc))
+  else:
+    print("Successfully removed old config file.")
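Note: migrate_config is copy-then-unlink, and it becomes a no-op once the new path exists. A quick sketch of the behavior with hypothetical paths (the real call site is FrameMain.get_config further down):

    from pathlib import Path

    old = Path('/tmp/describealign_old.ini')  # hypothetical old location
    new = Path('/tmp/describealign_new.ini')  # hypothetical new location
    old.write_text('[defaults]\nyes = True\n', encoding='utf-8')

    migrate_config(old, new)  # copies old -> new, then removes old
    migrate_config(old, new)  # no-op: new_path already exists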
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-if
-
-
-
-
-
-
-
-
-
-
-
-
-break
-selection = sg.PopupYesNo('Combiner is still running, stop it and close anyway?')
-if selection != 'Yes':
+class ListCtrlDropTarget(wx.FileDropTarget):
+  def __init__(self, list_ctrl, parent_frame):
+    super().__init__()
+    self.list_ctrl = list_ctrl
+    self.parent_frame = parent_frame
+
+  def expand_folders(self, files):
+    expanded_files = []
+    for file in files:
+      if os.path.isdir(file):
+        for dir, subdirs, dir_files in os.walk(file):
+          for dir_file in dir_files:
+            expanded_files.append(os.path.join(dir, dir_file))
+      else:
+        expanded_files.append(file)
+    return expanded_files
+
+  def OnDropFiles(self, x, y, files):
+    files = self.expand_folders(files)
+    valid_file_types = self.parent_frame.list_ctrl_file_types_drop[self.list_ctrl]
+    files = [file for file in files if os.path.splitext(file)[-1][1:] in valid_file_types]
+    self.parent_frame.populate_list_ctrl(self.list_ctrl, natsort.os_sorted(files))
+    return True
+
+def get_children(window):
+  children = list(window.GetChildren())
+  subchildren = [subchild for child in children for subchild in get_children(child)]
+  return children + subchildren
+
+def set_background_color(window, is_dark):
+  children = get_children(window)
+  for window in children + [window]:
+    # modifying a CheckBox converts it into a Button, which would mess with screen readers
+    if isinstance(window, wx.CheckBox):
       continue
-
-
-
-
+    if is_dark:
+      if isinstance(window, (wx.ListCtrl, wx.TextCtrl)):
+        window.SetBackgroundColour("Black")
+      elif isinstance(window, wx.Button):
+        window.SetBackgroundColour(tuple(x // 2 for x in gui_background_color_dark))
+      else:
+        window.SetBackgroundColour(gui_background_color_dark)
+    window.SetForegroundColour("White" if is_dark else "Black")
 
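Note: OnDropFiles filters dropped files by extension with os.path.splitext, and the `in valid_file_types` test is a plain substring check against the '|'-joined wildcard string built in FrameMain below. A worked example of that expression (illustrative values):

    import os

    path = '/media/movie.mp4'             # hypothetical dropped file
    ext = os.path.splitext(path)[-1][1:]  # '.mp4' -> 'mp4'
    print(ext in 'All Video File Types (*.mp4;*.mkv)|*.mp4;*.mkv')  # True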
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+class FrameMain(wx.Frame):
+  def __init__(self, parent):
+    wx.Frame.__init__(self, parent, title=f"describealign v{__version__}", size=wx.Size(800, 500))
+    # setting the GUI frame's font causes all contained elements to inherit that font by default
+    self.SetFont(wx.Font(*gui_font))
+    appearance = wx.SystemSettings.GetAppearance()
+    self.is_dark = appearance.IsDark() or appearance.IsUsingDarkBackground()
+    self.SetBackgroundColour(gui_background_color_dark if self.is_dark else gui_background_color_light)
+
+    # wrap all widgets within a panel to enable tab traversal (i.e. pressing tab to swap GUI focus)
+    self.panel0 = wx.Panel(self, style=wx.TAB_TRAVERSAL)
+
+    self.text_header = wx.StaticText(self.panel0, label="Select media files to combine:")
+    self.text_header.SetFont(self.text_header.GetFont().Scale(1.7))
+
+    # Video Input selection and display row of GUI
+    self.static_box_sizer_video = wx.StaticBoxSizer(wx.HORIZONTAL, self.panel0, "Video Input")
+    self.list_ctrl_video = self.init_list_ctrl(self.static_box_sizer_video.GetStaticBox(),
+                                               "Drag and Drop Videos Here or Press Browse Video")
+    set_tooltip(self.list_ctrl_video, "Video filenames are listed here in the sorted order they will " + \
+                                      "be used as input. Drag and Drop or press Browse to overwrite.")
+    self.button_browse_video = wx.Button(self.static_box_sizer_video.GetStaticBox(), label="Browse Video")
+    set_tooltip(self.button_browse_video, "Select one or more video files as input.")
+    self.button_browse_video.Bind(wx.EVT_BUTTON, lambda event: self.browse_files(self.list_ctrl_video))
+
+    # Audio Input selection and display row of GUI
+    self.static_box_sizer_audio = wx.StaticBoxSizer(wx.HORIZONTAL, self.panel0, "Audio Input")
+    self.list_ctrl_audio = self.init_list_ctrl(self.static_box_sizer_audio.GetStaticBox(),
+                                               "Drag and Drop Audio Here or Press Browse Audio")
+    set_tooltip(self.list_ctrl_audio, "Audio filenames are listed here in the sorted order they will " + \
+                                      "be used as input. Drag and Drop or press Browse to overwrite.")
+    self.button_browse_audio = wx.Button(self.static_box_sizer_audio.GetStaticBox(), label="Browse Audio")
+    set_tooltip(self.button_browse_audio, "Select one or more audio files as input.")
+    self.button_browse_audio.Bind(wx.EVT_BUTTON, lambda event: self.browse_files(self.list_ctrl_audio))
+
+    self.button_combine = wx.Button(self.panel0, label="Combine")
+    set_tooltip(self.button_combine, "Combine selected video and audio files.")
+    self.button_combine.Bind(wx.EVT_BUTTON, self.open_combine)
+    self.button_settings = wx.Button(self.panel0, label="Settings")
+    set_tooltip(self.button_settings, "Edit settings for the GUI and algorithm.")
+    self.button_settings.Bind(wx.EVT_BUTTON, self.open_settings)
+
+    sizer_panel_outer = wx.BoxSizer(wx.VERTICAL)
+    sizer_panel_inner = wx.BoxSizer(wx.VERTICAL)
+    sizer_header = wx.BoxSizer(wx.HORIZONTAL)
+    sizer_video = wx.BoxSizer(wx.HORIZONTAL)
+    sizer_audio = wx.BoxSizer(wx.HORIZONTAL)
+    sizer_combine_settings = wx.BoxSizer(wx.HORIZONTAL)
+
+    # Configure layout with nested Box Sizers:
+    #
+    # Frame
+    #   sizer_panel_outer
+    #     panel0
+    #       sizer_panel_inner
+    #         sizer_header
+    #           text_header
+    #         sizer_video
+    #           list_ctrl_video
+    #           button_browse_video
+    #         sizer_audio
+    #           list_ctrl_audio
+    #           button_browse_audio
+    #         sizer_combine_settings
+    #           button_combine
+    #           button_settings
+    #
+    self.SetSizer(sizer_panel_outer)
+    sizer_panel_outer.Add(self.panel0, 1, wx.EXPAND|wx.ALL, 5)
+    self.panel0.SetSizer(sizer_panel_inner)
+    sizer_panel_inner.Add(sizer_header, 3, wx.EXPAND, 5)
+    sizer_panel_inner.Add(sizer_video, 9, wx.EXPAND, 5)
+    sizer_panel_inner.Add(sizer_audio, 9, wx.TOP|wx.EXPAND, 3)
+    sizer_panel_inner.Add(sizer_combine_settings, 3, wx.EXPAND, 5)
+    sizer_header.Add(self.text_header, 0, wx.ALL|wx.ALIGN_CENTER_VERTICAL, 5)
+    sizer_video.Add(self.static_box_sizer_video, 1, wx.LEFT|wx.RIGHT|wx.EXPAND, 3)
+    self.static_box_sizer_video.Add(self.list_ctrl_video, 1, wx.BOTTOM|wx.EXPAND, 2)
+    self.static_box_sizer_video.Add(self.button_browse_video, 0,
+                                    wx.LEFT|wx.BOTTOM|wx.RIGHT|wx.ALIGN_CENTER_VERTICAL, 10)
+    sizer_audio.Add(self.static_box_sizer_audio, 1, wx.LEFT|wx.RIGHT|wx.EXPAND, 3)
+    self.static_box_sizer_audio.Add(self.list_ctrl_audio, 1, wx.BOTTOM|wx.EXPAND, 2)
+    self.static_box_sizer_audio.Add(self.button_browse_audio, 0,
+                                    wx.LEFT|wx.BOTTOM|wx.RIGHT|wx.ALIGN_CENTER_VERTICAL, 10)
+    sizer_combine_settings.Add((0, 0), 7, wx.EXPAND, 5) # spacer
+    sizer_combine_settings.Add(self.button_combine, 0, wx.ALL|wx.ALIGN_CENTER_VERTICAL, 5)
+    sizer_combine_settings.Add((0, 0), 2, wx.EXPAND, 5) # spacer
+    sizer_combine_settings.Add(self.button_settings, 0, wx.ALL|wx.ALIGN_CENTER_VERTICAL, 5)
+    sizer_combine_settings.Add((0, 0), 7, wx.EXPAND, 5) # spacer
+
+    # centers GUI on the screen
+    self.Centre(wx.BOTH)
+
+    all_video_file_types = [('All Video File Types', '*.' + ';*.'.join(VIDEO_EXTENSIONS)),]
+    all_audio_file_types = [('All Audio File Types', '*.' + ';*.'.join(AUDIO_EXTENSIONS)),]
+    all_video_and_audio_file_types = [('All Video and Audio File Types',
+                                       '*.' + ';*.'.join(VIDEO_EXTENSIONS | AUDIO_EXTENSIONS)),]
+    self.video_file_types = [(ext, f"*.{ext}") for ext in VIDEO_EXTENSIONS]
+    self.audio_file_types = [(ext, f"*.{ext}") for ext in AUDIO_EXTENSIONS]
+    self.video_and_audio_file_types = self.video_file_types + self.audio_file_types
+    self.video_file_types = all_video_file_types + self.video_file_types
+    self.audio_file_types = all_audio_file_types + self.audio_file_types
+    self.video_and_audio_file_types = all_video_file_types + all_video_and_audio_file_types + \
+                                      self.video_and_audio_file_types
+    self.video_file_types = '|'.join([f'{type[0]} ({type[1]})|{type[1]}' for type in self.video_file_types])
+    self.audio_file_types = '|'.join([f'{type[0]} ({type[1]})|{type[1]}' for type in self.audio_file_types])
+    self.video_and_audio_file_types = '|'.join([f'{type[0]} ({type[1]})|{type[1]}' for type \
+                                                in self.video_and_audio_file_types])
+
+    # track the allowed file types and selected files' full paths for each List Ctrl
+    self.list_ctrl_file_types_browse = {self.list_ctrl_video: self.video_and_audio_file_types,
+                                        self.list_ctrl_audio: self.audio_file_types}
+    self.list_ctrl_file_types_drop = {self.list_ctrl_video: self.video_file_types,
+                                      self.list_ctrl_audio: self.audio_file_types}
+    self.list_ctrl_files_selected = {self.list_ctrl_video: [],
+                                     self.list_ctrl_audio: []}
+
+    self.config_path = self.get_config()
+
+    set_background_color(self, self.is_dark)
+
+  def init_list_ctrl(self, parent_panel, default_text):
+    list_ctrl = wx.ListCtrl(parent_panel, style=wx.LC_NO_HEADER|wx.LC_REPORT|wx.BORDER_SUNKEN|wx.HSCROLL)
+    list_ctrl.EnableSystemTheme(False) # get rid of vertical grid lines on Windows
+    list_ctrl.SetMinSize(wx.Size(-1,80))
+    list_ctrl.SetDropTarget(ListCtrlDropTarget(list_ctrl, self))
+    list_ctrl.InsertColumn(0, "")
+    list_ctrl.InsertItem(0, default_text)
+    list_ctrl.SetColumnWidth(0, wx.LIST_AUTOSIZE)
+    list_ctrl.Bind(wx.EVT_CHAR, self.delete_from_list_ctrl)
+    return list_ctrl
+
+  def populate_list_ctrl(self, list_ctrl, files):
+    self.list_ctrl_files_selected[list_ctrl] = files
+    if len(files) == 0:
+      files = ["No files with valid file types found"]
+    list_ctrl.DeleteAllItems()
+    list_ctrl.DeleteAllColumns()
+    list_ctrl.InsertColumn(0, "")
+    for i, file in enumerate(files):
+      list_ctrl.InsertItem(i, os.path.basename(file))
+    list_ctrl.SetColumnWidth(0, wx.LIST_AUTOSIZE)
+
+  def browse_files(self, list_ctrl):
+    dialog = wx.FileDialog(self, wildcard=self.list_ctrl_file_types_browse[list_ctrl], style=wx.FD_MULTIPLE)
+    if dialog.ShowModal() == wx.ID_OK:
+      files = dialog.GetPaths()
+      self.populate_list_ctrl(list_ctrl, files)
+
+  def delete_from_list_ctrl(self, event):
+    if event.GetKeyCode() == wx.WXK_DELETE:
+      list_ctrl = event.GetEventObject()
+      item_index = list_ctrl.GetFirstSelected()
+      if item_index == -1:
+        item_index = list_ctrl.GetFocusedItem()
+      items_to_delete = []
+      while item_index != -1:
+        items_to_delete.append(item_index)
+        item_index = list_ctrl.GetNextSelected(item_index)
+      for item_index in items_to_delete[::-1]:
+        if len(self.list_ctrl_files_selected[list_ctrl]) != 0:
+          list_ctrl.DeleteItem(item_index)
+          del self.list_ctrl_files_selected[list_ctrl][item_index]
+    else:
+      event.Skip()
+
+  def open_combine(self, event):
+    video_files = self.list_ctrl_files_selected[self.list_ctrl_video]
+    audio_files = self.list_ctrl_files_selected[self.list_ctrl_audio]
+    if len(video_files) == 0:
+      error_dialog = wx.MessageDialog(self, "Error: no video input selected.", "Error", wx.OK|wx.ICON_ERROR)
+      error_dialog.ShowModal()
+    elif len(audio_files) == 0:
+      error_dialog = wx.MessageDialog(self, "Error: no audio input selected.", "Error", wx.OK|wx.ICON_ERROR)
+      error_dialog.ShowModal()
+    elif len(video_files) != len(audio_files):
+      error_dialog = wx.MessageDialog(self, f"Error: different numbers of video ({len(video_files)}) " + \
+                                            f"and audio ({len(audio_files)}) inputs.",
+                                      "Error", wx.OK|wx.ICON_ERROR)
+      error_dialog.ShowModal()
+    else:
+      frame_combine = FrameCombine(None, self.config_path, video_files, audio_files, self.is_dark)
+      self.list_ctrl_video.SetFocus()
+      frame_combine.Show()
+
+  def open_settings(self, event):
+    dialog_settings = DialogSettings(None, self.config_path, self.is_dark)
+    dialog_settings.ShowModal()
+    dialog_settings.Destroy()
+
+  def get_config(self):
+    config_path = platformdirs.user_config_path(appname='describealign', appauthor=False,
+                                                ensure_exists=True) / 'config.ini'
+    old_paths = [
+      # Place in chronological order (oldest -> newest)
+      Path(__file__).resolve().parent / 'config.ini',
+      platformdirs.user_config_path(appname='describealign', ensure_exists=True) / 'config.ini',
+    ]
+    # Get newest existent path
+    old_config = next((file for file in reversed(old_paths) if file.exists()), None,)
+    try:
+      migrate_config(old_config, config_path)
+    except OSError as exc:
+      print(f"Error migrating old config:", *traceback.format_exception_only(exc))
+      print(f"Old config left in place at {old_config}")
+    return config_path
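Note: the file-type strings above follow wx.FileDialog's wildcard format, 'description (pattern)|pattern' entries joined with '|'. A worked example with a reduced extension set (illustrative):

    video_exts = ['mp4', 'mkv']
    all_types = [('All Video File Types', '*.' + ';*.'.join(video_exts))]
    per_type = [(ext, f"*.{ext}") for ext in video_exts]
    wildcard = '|'.join(f'{t[0]} ({t[1]})|{t[1]}' for t in all_types + per_type)
    print(wildcard)
    # All Video File Types (*.mp4;*.mkv)|*.mp4;*.mkv|mp4 (*.mp4)|*.mp4|mkv (*.mkv)|*.mkv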
 
-def
-  config_path = platformdirs.user_config_path(appname='describealign', appauthor=False, ensure_exists=True) / 'config.ini'
-  old_paths = [
-    # Place in chronological order (oldest -> newest)
-    Path(__file__).resolve().parent / 'config.ini',
-    platformdirs.user_config_path(appname='describealign', ensure_exists=True) / 'config.ini',
-  ]
-
-  # Get newest existent path
-  old_config = next(
-    (
-      file
-      for file in reversed(old_paths)
-      if file.exists()
-    ),
-    None,
-  )
-
+def get_version_hash(filename):
   try:
-
-
-
-
-
-
-
-  filetype_sep = ';' if IS_RUNNING_WINDOWS else ' '
-  all_audio_file_types = [('All Audio File Types', '*.' + f'{filetype_sep}*.'.join(AUDIO_EXTENSIONS)),]
-  all_video_file_types = [('All Video File Types', '*.' + f'{filetype_sep}*.'.join(VIDEO_EXTENSIONS)),]
-  all_video_and_audio_file_types = [('All Video and Audio File Types',
-                                     '*.' + f'{filetype_sep}*.'.join(VIDEO_EXTENSIONS | AUDIO_EXTENSIONS)),]
-  audio_file_types = [(ext, f"*.{ext}") for ext in AUDIO_EXTENSIONS]
-  video_and_audio_file_types = [(ext, f"*.{ext}") for ext in VIDEO_EXTENSIONS] + audio_file_types
-  audio_file_types = all_audio_file_types + audio_file_types
-  video_and_audio_file_types = all_video_file_types + all_video_and_audio_file_types + video_and_audio_file_types
-  # work around bug in PySimpleGUIWx's convert_tkinter_filetypes_to_wx function
-  if IS_RUNNING_WINDOWS:
-    file_fix = lambda file_types: file_types[:1] + [(f'|{type[0]}', type[1]) for type in file_types[1:]]
-    audio_file_types = file_fix(audio_file_types)
-    video_and_audio_file_types = file_fix(video_and_audio_file_types)
-
-  layout = [[sg.Text('Select media files to combine:', size=(40, 2), font=('Arial', 20), pad=(3,15))],
-            [sg.Column([[sg.Text('Video Input:', size=(11, 2), pad=(1,5)),
-                         sg.Input(size=(35, 1.2), pad=(10,5), key='-VIDEO_FILES-',
-                                  tooltip='List video filenames here, in order, separated by semicolons'),
-                         sg.FilesBrowse(button_text="Browse Video",
-                                        file_types=video_and_audio_file_types,
-                                        tooltip='Select one or more video files')]], pad=(2,7))],
-            [sg.Column([[sg.Text('Audio Input:', size=(11, 2), pad=(1,5)),
-                         sg.Input(size=(35, 1.2), pad=(10,5), key='-AUDIO_FILES-',
-                                  tooltip='List audio filenames here, in order, separated by semicolons'),
-                         sg.FilesBrowse(button_text="Browse Audio",
-                                        file_types=audio_file_types,
-                                        tooltip='Select one or more audio files')]], pad=(2,7))],
-            [sg.Column([[sg.Submit('Combine', pad=(40,3), tooltip='Combine selected video and audio files'),
-                         sg.Button('Settings', tooltip='Edit settings for the GUI and algorithm.')]],
-                       pad=((135,3),10))]]
-  window = sg.Window('describealign', layout, font=('Arial', 16), resizable=False, finalize=True)
-  window['-VIDEO_FILES-'].set_focus()
-  while True:
-    event, values = window.read()
-    if event == 'Combine':
-      if len(values['-VIDEO_FILES-']) == 0 or \
-         len(values['-AUDIO_FILES-']) == 0:
-        window.disable()
-        sg.Popup('Error: empty input field.', font=('Arial', 20))
-        window.enable()
-        window['-VIDEO_FILES-'].set_focus()
-        continue
-      video_files = values['-VIDEO_FILES-'].split(';')
-      if len(video_files) == 1:
-        video_files = video_files[0]
-      audio_files = values['-AUDIO_FILES-'].split(';')
-      if len(audio_files) == 1:
-        audio_files = audio_files[0]
-      window.disable()
-      combine_gui(video_files, audio_files, config_path)
-      window.enable()
-      window['-VIDEO_FILES-'].set_focus()
-    if event == 'Settings':
-      window.disable()
-      settings_gui(config_path)
-      window.enable()
-      window['-VIDEO_FILES-'].set_focus()
-    if event == sg.WIN_CLOSED:
-      break
-  window.close()
+    with open(filename, 'rb') as f:
+      data = f.read()
+    sha_hash = hashlib.sha1(data).hexdigest()
+    return sha_hash[:8]
+  except:
+    return "None"
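Note: get_version_hash is just the first 8 hex characters of a SHA-1 digest of the file's bytes; an equivalent standalone form (illustrative):

    import hashlib

    with open(__file__, 'rb') as f:
      print(hashlib.sha1(f.read()).hexdigest()[:8])  # same 8-character content hash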
 
 # Entry point for command line interaction, for example:
 # > describealign video.mp4 audio_desc.mp3
 def command_line_interface():
   if len(sys.argv) < 2:
-
-
-
-
+    if wx is not None:
+      # No args, run gui
+      print('No input arguments detected, starting GUI...')
+      # the following line is necessary on MacOS X to fix the filectrlpicker
+      # https://docs.wxpython.org/wx.FileDialog.html#wx-filedialog
+      # https://github.com/wxWidgets/Phoenix/issues/2368
+      if platform.system() == 'Darwin':
+        wx.SystemOptions.SetOption('osx.openfiledialog.always-show-types', 1)
+      app = wx.App()
+      main_gui = FrameMain(None)
+      main_gui.Show()
+      app.MainLoop()
+      sys.exit(0)
+    else:
+      print("Can't launch GUI and arguments missing.\nGUI dependencies missing.")
 
-  parser = argparse.ArgumentParser(
-
-
-
-  parser.add_argument("audio", help='An audio file or directory containing audio files.',
-
-                      help='Lower values make the alignment more accurate when there are skips ' + \
-                           '(e.g. describer pauses), but also make it more likely to misalign. ' + \
-                           'Default is 50.')
+  parser = argparse.ArgumentParser(description="Replaces a video's sound with an audio description.",
+                                   usage="describealign video_file.mp4 audio_file.mp3")
+  parser.add_argument("video", help='A video file or directory containing video files.',
+                      nargs='?', default=None)
+  parser.add_argument("audio", help='An audio file or directory containing audio files.',
+                      nargs='?', default=None)
   parser.add_argument('--stretch_audio', action='store_true',
                       help='Stretches the input audio to fit the input video. ' + \
-                           'Default is to stretch the video to fit the audio.'
-
-
-'
-                           'Requires --stretch_audio to be set, otherwise does nothing.')
-  parser.add_argument('--boost', type=float, default=0,
-                      help='Boost (or quieten) description volume. Units are decibels (dB), so ' + \
-                           '-3 makes the describer about 2x quieter, while 3 makes them 2x louder. ' + \
-                           'Requires --stretch_audio to be set, otherwise does nothing.')
-  parser.add_argument('--ad_detect_sensitivity', type=float, default=.6,
-                      help='Audio description detection sensitivity ratio. Higher values make ' + \
-                           '--keep_non_ad more likely to replace aligned audio. Default is 0.6')
-  parser.add_argument('--boost_sensitivity', type=float, default=.4,
-                      help='Higher values make --boost less likely to miss a description, but ' + \
-                           'also make it more likely to boost non-description audio. Default is 0.4')
+                           'Default is to stretch the video to fit the audio. ' + \
+                           'Keeps original video audio as secondary tracks. Slower ' + \
+                           'and uses more RAM when enabled, long videos may cause ' + \
+                           'paging or Out of Memory errors on low-RAM systems.')
   parser.add_argument('--yes', action='store_true',
                       help='Auto-skips user prompts asking to verify information.')
   parser.add_argument("--prepend", default="ad_", help='Output file name prepend text. Default is "ad_"')
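Note: making both positionals nargs='?' with default=None is what lets flag-only invocations such as `describealign --version` parse without video/audio arguments. A minimal reproduction of the pattern (illustrative):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('video', nargs='?', default=None)
    parser.add_argument('audio', nargs='?', default=None)
    parser.add_argument('--version', action='store_true')
    print(parser.parse_args(['--version']))
    # Namespace(video=None, audio=None, version=True)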
@@ -1292,23 +1737,41 @@ def command_line_interface():
                       help='Directory combined output media is saved to. Default is "videos_with_ad"')
   parser.add_argument("--alignment_dir", default=default_alignment_dir,
                       help='Directory alignment data and plots are saved to. Default is "alignment_plots"')
-  parser.add_argument("--extension", default="copy",
-                      help='File type of output video (e.g. mkv). When set to "copy", copies the ' + \
-                           'file type of the corresponding input video. Default is "copy".')
   parser.add_argument("--install-ffmpeg", action="store_true",
-                      help="Install the required ffmpeg binaries and then exit. This is meant to be" + \
+                      help="Install the required ffmpeg binaries and then exit. This is meant to be " + \
                            "run from a privileged installer process (e.g. OS X Installer)")
+  parser.add_argument('--version', action='store_true',
+                      help='Checks and prints the installed version of describealign.')
   args = parser.parse_args()
 
-  if args.
+  if args.version:
+    print(f"version: {__version__}")
+    if "__compiled__" in globals() or getattr(sys, 'frozen', False):
+      print("running from compiled binary")
+    else:
+      import importlib
+      cur_dir = os.getcwd()
+      if sys.path[0] == cur_dir:
+        # ignore describealign.py in current directory
+        del sys.path[0]
+        installed_spec = importlib.util.find_spec('describealign')
+        sys.path = [cur_dir] + sys.path
+      else:
+        installed_spec = importlib.util.find_spec('describealign')
+      this_script_path = os.path.abspath(__file__)
+      if installed_spec is None or (this_script_path != os.path.abspath(installed_spec.origin)):
+        print("running from downloaded .py file")
+      else:
+        print("running from installed package")
+      print(f"path: {this_script_path}")
+      print(f"content hash: {get_version_hash(this_script_path)}")
+  elif args.install_ffmpeg:
     # Make sure the file is world executable
     os.chmod(get_ffmpeg(), 0o755)
     os.chmod(get_ffprobe(), 0o755)
-  elif args.video
-    combine(args.video, args.audio, args.
-            args.
-            args.prepend, args.no_pitch_correction, args.output_dir, args.alignment_dir,
-            args.extension)
+  elif args.video and args.audio:
+    combine(args.video, args.audio, args.stretch_audio, args.yes, args.prepend,
+            args.no_pitch_correction, args.output_dir, args.alignment_dir)
   else:
     parser.print_usage()
 
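Note: the --version branch distinguishes an installed package from a loose script by comparing __file__ against the module spec's origin. The core check in isolation (illustrative):

    import importlib.util
    import os

    spec = importlib.util.find_spec('describealign')
    if spec is None:
      print('describealign is not installed as a package')
    else:
      print('installed copy at:', os.path.abspath(spec.origin))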