muze 0.1.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -1
- data/README.md +5 -0
- data/Rakefile +3 -0
- data/ext/muze/muze_ext.c +129 -12
- data/lib/muze/beat/beat_track.rb +93 -11
- data/lib/muze/core/audio.rb +129 -0
- data/lib/muze/core/cache.rb +38 -0
- data/lib/muze/core/dct.rb +24 -21
- data/lib/muze/core/frames.rb +31 -0
- data/lib/muze/core/matrix.rb +23 -0
- data/lib/muze/core/resample.rb +111 -19
- data/lib/muze/core/stft.rb +312 -52
- data/lib/muze/core/windows.rb +113 -17
- data/lib/muze/display/specshow.rb +307 -41
- data/lib/muze/effects/harmonic_percussive.rb +83 -18
- data/lib/muze/effects/streaming.rb +101 -0
- data/lib/muze/effects/time_stretch.rb +353 -36
- data/lib/muze/feature/aggregation.rb +49 -0
- data/lib/muze/feature/chroma.rb +43 -15
- data/lib/muze/feature/context.rb +81 -0
- data/lib/muze/feature/mfcc.rb +78 -38
- data/lib/muze/feature/spectral.rb +258 -39
- data/lib/muze/filters/chroma_filter.rb +21 -2
- data/lib/muze/filters/mel.rb +47 -1
- data/lib/muze/io/audio_loader/ffmpeg_backend.rb +179 -15
- data/lib/muze/io/audio_loader/wavify_backend.rb +118 -11
- data/lib/muze/io/audio_loader.rb +178 -48
- data/lib/muze/io/audio_writer.rb +48 -0
- data/lib/muze/native.rb +91 -8
- data/lib/muze/onset/onset_detect.rb +114 -23
- data/lib/muze/version.rb +1 -1
- data/lib/muze.rb +237 -60
- metadata +11 -21
- data/benchmarks/baseline.json +0 -24
- data/benchmarks/native_vs_ruby.rb +0 -23
- data/benchmarks/quality_metrics.rb +0 -265
- data/benchmarks/quality_thresholds.md +0 -28
- data/benchmarks/support/fixture_library.rb +0 -107
|
@@ -4,24 +4,35 @@ module Muze
|
|
|
4
4
|
module Effects
|
|
5
5
|
module_function
|
|
6
6
|
|
|
7
|
-
#
|
|
7
|
+
# Use smaller FFTs for short clips so phase vocoding remains practical.
|
|
8
8
|
MIN_PHASE_VOCODER_SAMPLES = 32_768
|
|
9
|
+
MIN_TIME_STRETCH_RATE = 1.0 / 32.0
|
|
10
|
+
MAX_TIME_STRETCH_RATE = 32.0
|
|
9
11
|
|
|
10
12
|
# @param y [Numo::SFloat, Array<Float>]
|
|
11
13
|
# @param rate [Float]
|
|
12
14
|
# @return [Numo::SFloat]
|
|
13
|
-
def time_stretch(y, rate: 1.0)
|
|
14
|
-
|
|
15
|
+
def time_stretch(y, rate: 1.0, n_fft: nil, hop_length: nil, method: :phase_vocoder, phase_lock: false, force_phase_vocoder: false)
|
|
16
|
+
validate_positive_number!(rate, "rate")
|
|
17
|
+
unless rate.between?(MIN_TIME_STRETCH_RATE, MAX_TIME_STRETCH_RATE)
|
|
18
|
+
raise Muze::ParameterError, "rate must be between #{MIN_TIME_STRETCH_RATE} and #{MAX_TIME_STRETCH_RATE}"
|
|
19
|
+
end
|
|
20
|
+
validate_optional_positive_integer!(n_fft, "n_fft")
|
|
21
|
+
validate_optional_positive_integer!(hop_length, "hop_length")
|
|
22
|
+
raise Muze::ParameterError, "method must be :phase_vocoder, :ola, :wsola, or :linear" unless %i[phase_vocoder ola wsola linear].include?(method)
|
|
15
23
|
|
|
16
|
-
signal =
|
|
24
|
+
signal = Muze::Core::Audio.validate_audio!(y, allow_empty: true)
|
|
25
|
+
return apply_channels(signal) { |channel| time_stretch(channel, rate:, n_fft:, hop_length:, method:, phase_lock:, force_phase_vocoder:) } if signal.ndim == 2
|
|
17
26
|
return signal if signal.empty? || rate == 1.0
|
|
18
|
-
return linear_time_stretch(signal.to_a, rate) if
|
|
27
|
+
return linear_time_stretch(signal.to_a, rate) if method == :linear
|
|
28
|
+
return ola_time_stretch(signal.to_a, rate) if method == :ola
|
|
29
|
+
return wsola_time_stretch(signal.to_a, rate) if method == :wsola
|
|
19
30
|
|
|
20
|
-
n_fft
|
|
21
|
-
hop_length
|
|
31
|
+
n_fft ||= phase_vocoder_fft_size(signal.size)
|
|
32
|
+
hop_length ||= [n_fft / 4, 1].max
|
|
22
33
|
|
|
23
34
|
stft_matrix = Muze::Core::STFT.stft(signal, n_fft:, hop_length:, center: true)
|
|
24
|
-
stretched_stft = phase_vocoder(stft_matrix, rate:, hop_length:, n_fft:)
|
|
35
|
+
stretched_stft = phase_vocoder(stft_matrix, rate:, hop_length:, n_fft:, phase_lock:)
|
|
25
36
|
target_length = [(signal.size / rate).round, 1].max
|
|
26
37
|
|
|
27
38
|
Muze::Core::STFT.istft(stretched_stft, hop_length:, center: true, length: target_length)
|
|
@@ -31,16 +42,25 @@ module Muze
|
|
|
31
42
|
# @param sr [Integer]
|
|
32
43
|
# @param n_steps [Float]
|
|
33
44
|
# @return [Numo::SFloat]
|
|
34
|
-
def pitch_shift(y, sr: 22_050, n_steps: 0)
|
|
35
|
-
|
|
36
|
-
|
|
45
|
+
def pitch_shift(y, sr: 22_050, n_steps: 0, bins_per_octave: 12, res_type: :auto, normalize: false, clip: nil)
  # Reject bad parameters before touching the audio.
  validate_positive_integer!(sr, "sr")
  validate_positive_number!(bins_per_octave, "bins_per_octave")
  steps_are_finite = n_steps.respond_to?(:finite?) && n_steps.finite?
  raise Muze::ParameterError, "n_steps must be finite" unless steps_are_finite
  raise Muze::ParameterError, "normalize must be true or false" unless [true, false].include?(normalize)
  validate_positive_number!(clip, "clip") if clip

  signal = Muze::Core::Audio.validate_audio!(y, allow_empty: true)
  if signal.ndim == 2
    # Two-dimensional input is shifted one channel at a time with the same options.
    return apply_channels(signal) { |channel| pitch_shift(channel, sr:, n_steps:, bins_per_octave:, res_type:, normalize:, clip:) }
  end
  return signal if n_steps.zero?

  # Stretch by the inverse pitch ratio, then resample back to the input length.
  rate = 2.0**(-n_steps.to_f / bins_per_octave)
  stretched = time_stretch(signal, rate:)

  # :auto selects :sinc once the clip reaches MIN_PHASE_VOCODER_SAMPLES, :linear below it.
  resolved_res_type =
    if res_type == :auto
      signal.size >= MIN_PHASE_VOCODER_SAMPLES ? :sinc : :linear
    else
      res_type
    end

  restored = resample_for_pitch_shift(stretched, target_size: signal.size, sr:, rate:, res_type: resolved_res_type)
  shifted = Numo::SFloat.cast(restored[0...signal.size])
  shifted = normalize_peak(shifted) if normalize
  clip ? shifted.clip(-clip, clip) : shifted
end
|
|
45
65
|
|
|
46
66
|
# @param y [Numo::SFloat, Array<Float>]
|
|
@@ -48,23 +68,70 @@ module Muze
|
|
|
48
68
|
# @param frame_length [Integer]
|
|
49
69
|
# @param hop_length [Integer]
|
|
50
70
|
# @return [Array(Numo::SFloat, Array<Integer>)] trimmed signal and [start, end]
|
|
51
|
-
def trim(y, top_db: 60, frame_length: 2048, hop_length: 512)
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
71
|
+
def trim(y, top_db: 60, frame_length: 2048, hop_length: 512, ref: :max, aggregate: :mean, units: :samples, sr: nil)
  raise Muze::ParameterError, "top_db must be non-negative" if top_db.negative?
  raise Muze::ParameterError, "frame_length and hop_length must be positive" unless frame_length.positive? && hop_length.positive?
  raise Muze::ParameterError, "aggregate must be :mean or :max" unless %i[mean max].include?(aggregate)
  validate_trim_units!(units:, sr:, hop_length:)

  signal = Muze::Core::Audio.validate_audio!(y, allow_empty: true)
  # Result used by BOTH "nothing is loud enough" exits below, so two-dimensional
  # input always gets back an empty array that keeps its channel dimension.
  silent_audio = signal.ndim == 2 ? Numo::SFloat.zeros(0, signal.shape[1]) : Numo::SFloat[]

  # Coarse pass: one level per frame, computed from the per-sample amplitude.
  amplitude = sample_amplitude(signal, aggregate:)
  frames = Muze::Core::Frames.slice(amplitude, frame_length:, hop_length:, pad_end: true)
  energies = frames.map do |frame|
    values = frame.map { |value| value * value }
    aggregate == :max ? Math.sqrt(values.max || 0.0) : Math.sqrt(values.sum(0.0) / frame.length)
  end
  reference = trim_reference(energies, ref:)
  # The reference is floored at 1e-12 so an all-zero reference still gives a positive threshold.
  threshold = [reference, 1.0e-12].max * (10.0**(-top_db / 20.0))
  active_frames = energies.each_index.select { |index| energies[index] >= threshold }
  return [silent_audio, [0, 0]] if active_frames.empty?

  # Fine pass: within the active frames, locate the first and last sample at or
  # above the threshold without materialising every qualifying index.
  search_start = active_frames.first * hop_length
  search_end = [(active_frames.last * hop_length) + frame_length, amplitude.length].min
  search_range = (search_start...search_end)
  first_active = search_range.find { |index| amplitude[index] >= threshold }
  return [silent_audio, [0, 0]] if first_active.nil?

  last_active = search_range.reverse_each.find { |index| amplitude[index] >= threshold }
  start_sample = first_active
  end_sample = last_active + 1
  trimmed = signal.ndim == 2 ? signal[start_sample...end_sample, true] : signal[start_sample...end_sample]
  [trimmed, convert_trim_interval(start_sample, end_sample, units:, sr:, hop_length:)]
end
|
|
101
|
+
|
|
102
|
+
# First-order pre-emphasis: out[0] = x[0], out[n] = x[n] - coef * x[n - 1].
# Two-dimensional input is filtered one channel at a time.
#
# @param y audio accepted by Muze::Core::Audio.validate_audio!
# @param coef [Numeric] finite filter coefficient
# @return [Numo::SFloat]
def preemphasis(y, coef: 0.97)
  validate_finite_number!(coef, "coef")
  audio = Muze::Core::Audio.validate_audio!(y, allow_empty: true)
  return apply_channels(audio) { |channel| preemphasis(channel, coef:) } if audio.ndim == 2

  samples = audio.to_a
  return Numo::SFloat.cast(samples) if samples.empty?

  # The first sample passes through; every later one subtracts its scaled predecessor.
  filtered = [samples.first]
  samples.each_cons(2) { |previous, current| filtered << (current - (coef * previous)) }
  Numo::SFloat.cast(filtered)
end
|
|
115
|
+
|
|
116
|
+
# First-order de-emphasis: out[0] = x[0], out[n] = x[n] + coef * out[n - 1].
# Two-dimensional input is filtered one channel at a time.
#
# @param y audio accepted by Muze::Core::Audio.validate_audio!
# @param coef [Numeric] finite filter coefficient
# @return [Numo::SFloat]
def deemphasis(y, coef: 0.97)
  validate_finite_number!(coef, "coef")
  audio = Muze::Core::Audio.validate_audio!(y, allow_empty: true)
  return apply_channels(audio) { |channel| deemphasis(channel, coef:) } if audio.ndim == 2

  samples = audio.to_a
  return Numo::SFloat.cast(samples) if samples.empty?

  # Recursive filter: each output feeds back into the next one.
  restored = [samples.first]
  samples.drop(1).each { |sample| restored << (sample + (coef * restored.last)) }
  Numo::SFloat.cast(restored)
end
|
|
63
129
|
|
|
64
130
|
# @param signal_length [Integer]
|
|
65
131
|
# @return [Integer]
|
|
66
132
|
def phase_vocoder_fft_size(signal_length)
|
|
67
|
-
|
|
133
|
+
max_allowed = signal_length < MIN_PHASE_VOCODER_SAMPLES ? 512 : 2048
|
|
134
|
+
max_fft = [signal_length, max_allowed].min
|
|
68
135
|
fft_size = 1
|
|
69
136
|
fft_size *= 2 while (fft_size * 2) <= max_fft
|
|
70
137
|
[fft_size, 32].max
|
|
@@ -76,7 +143,7 @@ module Muze
|
|
|
76
143
|
# @param hop_length [Integer]
|
|
77
144
|
# @param n_fft [Integer]
|
|
78
145
|
# @return [Numo::DComplex]
|
|
79
|
-
def phase_vocoder(stft_matrix, rate:, hop_length:, n_fft:)
|
|
146
|
+
def phase_vocoder(stft_matrix, rate:, hop_length:, n_fft:, phase_lock: false)
|
|
80
147
|
frequency_bins, frame_count = stft_matrix.shape
|
|
81
148
|
return stft_matrix if frame_count <= 1
|
|
82
149
|
|
|
@@ -115,10 +182,32 @@ module Muze
|
|
|
115
182
|
end
|
|
116
183
|
end
|
|
117
184
|
|
|
118
|
-
stretched
|
|
185
|
+
phase_lock ? phase_lock_spectrum(stretched) : stretched
|
|
119
186
|
end
|
|
120
187
|
private_class_method :phase_vocoder
|
|
121
188
|
|
|
189
|
+
# For every column, keeps each bin's magnitude but replaces its phase with the
# phase of the nearest local magnitude peak in that column. Columns without
# any peak are copied unchanged.
#
# @param stft_matrix complex matrix indexed as [bin, frame]
# @return a modified copy of stft_matrix
def phase_lock_spectrum(stft_matrix)
  bin_count, frame_count = stft_matrix.shape
  locked = stft_matrix.dup

  (0...frame_count).each do |frame|
    peaks = local_peak_bins(stft_matrix[true, frame])
    next if peaks.empty?

    (0...bin_count).each do |bin|
      # min_by keeps the first (lowest) peak when two are equally close.
      nearest_peak = peaks.min_by { |peak| (peak - bin).abs }
      magnitude = stft_matrix[bin, frame].abs
      borrowed_phase = phase_of(stft_matrix[nearest_peak, frame])
      locked[bin, frame] = Complex.polar(magnitude, borrowed_phase)
    end
  end

  locked
end
|
|
203
|
+
private_class_method :phase_lock_spectrum
|
|
204
|
+
|
|
205
|
+
# Indices of interior bins whose magnitude is at least as large as both
# neighbours. The first and last bin are never reported.
#
# @param spectrum object whose `abs.to_a` yields the per-bin magnitudes
# @return [Array<Integer>]
def local_peak_bins(spectrum)
  magnitudes = spectrum.abs.to_a
  magnitudes.each_cons(3).each_with_index.filter_map do |(left, center, right), offset|
    # offset counts windows from 0; the window's centre sits one bin further on.
    offset + 1 if center >= left && center >= right
  end
end
|
|
209
|
+
private_class_method :local_peak_bins
|
|
210
|
+
|
|
122
211
|
# @param complex_number [Complex]
|
|
123
212
|
# @return [Float]
|
|
124
213
|
def phase_of(complex_number)
|
|
@@ -152,20 +241,248 @@ module Muze
|
|
|
152
241
|
end
|
|
153
242
|
private_class_method :linear_time_stretch
|
|
154
243
|
|
|
244
|
+
# Plain overlap-add time stretch: Hann-windowed frames are read every
# `analysis_hop` samples and written every `analysis_hop / rate` samples, then
# the sum is divided by the accumulated squared window.
#
# @param signal [Array<Float>] mono samples (the caller has already handled the empty case)
# @param rate [Numeric] stretch rate
# @return [Numo::SFloat] roughly `signal.length / rate` samples, peak-limited to the input peak
def ola_time_stretch(signal, rate)
  frame_length = [[next_power_of_two([signal.length / 8, 256].max), 2048].min, 32].max
  analysis_hop = [frame_length / 2, 1].max
  synthesis_hop = [(analysis_hop / rate).round, 1].max
  frame_count = signal.length <= frame_length ? 1 : (((signal.length - frame_length).to_f / analysis_hop).ceil + 1)
  target_length = [(signal.length / rate).round, 1].max
  output = Array.new(target_length + frame_length, 0.0)
  window_sums = Array.new(output.length, 0.0)
  window = Muze::Core::Windows.hann(frame_length).to_a

  frame_count.times do |frame_index|
    target_start = frame_index * synthesis_hop
    # Later frames start even further right, so nothing more can land in the buffer.
    break if target_start >= output.length

    # Same windowed accumulation the WSOLA path uses.
    overlap_add_frame(signal, output, window_sums, window,
                      source_start: frame_index * analysis_hop,
                      target_start:,
                      frame_length:)
  end

  # Undo the window weighting; positions with almost no window energy are silenced.
  output.map!.with_index do |value, index|
    denominator = window_sums[index]
    denominator > 1.0e-3 ? value / denominator : 0.0
  end
  Numo::SFloat.cast(limit_peak(output[0, target_length], signal.map(&:abs).max.to_f))
end
|
|
275
|
+
private_class_method :ola_time_stretch
|
|
276
|
+
|
|
277
|
+
# WSOLA time stretch: like overlap-add, but for every frame after the first the
# read position is moved within a search radius to the candidate that
# correlates best with what is already in the output buffer.
#
# @param signal [Array<Float>] mono samples (the caller has already handled the empty case)
# @param rate [Numeric] stretch rate
# @return [Numo::SFloat] roughly `signal.length / rate` samples, peak-limited to the input peak
def wsola_time_stretch(signal, rate)
  frame_length = [[next_power_of_two([signal.length / 8, 256].max), 2048].min, 32].max
  analysis_hop = [frame_length / 2, 1].max
  synthesis_hop = [(analysis_hop / rate).round, 1].max
  target_length = [(signal.length / rate).round, 1].max
  output = Array.new(target_length + frame_length, 0.0)
  window_sums = Array.new(output.length, 0.0)
  window = Muze::Core::Windows.hann(frame_length).to_a
  overlap = [[frame_length - synthesis_hop, 0].max, frame_length / 2].min
  search_radius = [[analysis_hop / 2, 8].max, frame_length].min

  (0...target_length).step(synthesis_hop) do |target_start|
    expected_source = (target_start * rate).round
    # The first frame (target_start == 0) has nothing to align against, and with
    # no overlap there is nothing to correlate, so both use the nominal position.
    source_start =
      if target_start.zero? || overlap.zero?
        [[expected_source, 0].max, signal.length - 1].min
      else
        best_wsola_source_start(signal, output, target_start, expected_source, frame_length, overlap, search_radius)
      end

    overlap_add_frame(signal, output, window_sums, window, source_start:, target_start:, frame_length:)
  end

  # Undo the window weighting; positions with almost no window energy are silenced.
  output.map!.with_index do |value, index|
    denominator = window_sums[index]
    denominator > 1.0e-3 ? value / denominator : 0.0
  end
  Numo::SFloat.cast(limit_peak(output[0, target_length], signal.map(&:abs).max.to_f))
end
|
|
311
|
+
private_class_method :wsola_time_stretch
|
|
312
|
+
|
|
313
|
+
# Scans read positions within `search_radius` of `expected_source` and returns
# the one whose overlap correlates best with the existing output, capped so
# that a full frame still fits inside the signal when possible.
#
# @return [Integer] chosen read position
def best_wsola_source_start(signal, output, target_start, expected_source, frame_length, overlap, search_radius)
  last_index = [signal.length - 1, 0].max
  first_candidate = [expected_source - search_radius, 0].max
  last_candidate = [expected_source + search_radius, last_index].min

  winner = first_candidate
  top_score = -Float::INFINITY
  first_candidate.upto(last_candidate) do |position|
    correlation = overlap_correlation(signal, output, position, target_start, overlap)
    # Strict comparison: on ties the earliest candidate stays selected.
    if correlation > top_score
      top_score = correlation
      winner = position
    end
  end

  latest_full_frame = [signal.length - frame_length, 0].max
  winner > latest_full_frame ? latest_full_frame : winner
end
|
|
329
|
+
private_class_method :best_wsola_source_start
|
|
330
|
+
|
|
331
|
+
# Normalised cross-correlation between `signal` read from `source_start` and
# `output` read from `target_start`, over at most `overlap` samples. The
# comparison stops as soon as either index leaves its array.
#
# @return [Float] the correlation, or 0.0 when the combined energy is at most 1e-12
def overlap_correlation(signal, output, source_start, target_start, overlap)
  dot = 0.0
  signal_power = 0.0
  output_power = 0.0

  overlap.times do |step|
    from = source_start + step
    into = target_start + step
    break unless from < signal.length && into < output.length

    heard = signal[from]
    written = output[into]
    dot += heard * written
    signal_power += heard * heard
    output_power += written * written
  end

  norm = Math.sqrt(signal_power * output_power)
  if norm <= 1.0e-12
    0.0
  else
    dot / norm
  end
end
|
|
351
|
+
private_class_method :overlap_correlation
|
|
352
|
+
|
|
353
|
+
# Adds one windowed frame of `signal` into `output` (mutated in place) and
# accumulates the squared window into `window_sums`. Reads past the end of
# `signal` count as silence; writing stops at the end of `output`.
def overlap_add_frame(signal, output, window_sums, window, source_start:, target_start:, frame_length:)
  frame_length.times do |step|
    slot = target_start + step
    break unless slot < output.length

    pick = source_start + step
    sample = pick < signal.length ? signal[pick] : 0.0
    gain = window[step]
    output[slot] += sample * gain
    window_sums[slot] += gain * gain
  end
end
|
|
365
|
+
private_class_method :overlap_add_frame
|
|
366
|
+
|
|
367
|
+
# Scales `values` down so their largest magnitude equals `target_peak`. The
# input is returned untouched when the target is not positive or the values
# are already within it.
#
# @param values [Array<Numeric>]
# @param target_peak [Numeric]
# @return [Array<Numeric>]
def limit_peak(values, target_peak)
  loudest = values.map(&:abs).max.to_f
  if target_peak <= 0.0 || loudest <= target_peak
    values
  else
    factor = target_peak / loudest
    values.map { |sample| sample * factor }
  end
end
|
|
374
|
+
private_class_method :limit_peak
|
|
375
|
+
|
|
376
|
+
# Smallest power of two that is greater than or equal to `value`
# (1 for anything at or below 1).
#
# @param value [Numeric]
# @return [Integer]
def next_power_of_two(value)
  candidate = 1
  candidate <<= 1 until candidate >= value
  candidate
end
|
|
381
|
+
private_class_method :next_power_of_two
|
|
382
|
+
|
|
155
383
|
# Prefer sinc-quality resampling, then fall back to linear on failure.
|
|
156
384
|
# @param stretched [Numo::SFloat]
|
|
157
385
|
# @param target_size [Integer]
|
|
158
|
-
# @param
|
|
386
|
+
# @param sr [Integer]
|
|
387
|
+
# @param rate [Float]
|
|
388
|
+
# @param res_type [Symbol]
|
|
159
389
|
# @return [Numo::SFloat]
|
|
160
|
-
def resample_for_pitch_shift(stretched, target_size:,
|
|
161
|
-
|
|
162
|
-
|
|
390
|
+
def resample_for_pitch_shift(stretched, target_size:, sr:, rate:, res_type:)
  # Primary path: resample from sr to sr * rate (at least 1) with the requested quality.
  shifted_sr = [(sr * rate).round, 1].max
  Muze::Core::Resample.resample(
    stretched,
    orig_sr: sr,
    target_sr: shifted_sr,
    res_type:,
    target_length: target_size
  )
rescue Muze::ParameterError
  # Fallback: linear resampling that uses the two lengths as the rate pair.
  Muze::Core::Resample.resample(
    stretched,
    orig_sr: stretched.size,
    target_sr: target_size,
    res_type: :linear,
    target_length: target_size
  )
end
|
|
396
|
+
private_class_method :resample_for_pitch_shift
|
|
397
|
+
|
|
398
|
+
# Resolves the reference level that trim thresholds are measured against.
#
# @param energies [Array<Float>] per-frame levels
# @param ref [:max, Numeric, #call] :max uses the largest level (0.0 when there
#   are none), a number is used as-is, and any callable (Proc, lambda, Method,
#   or other object responding to `call`) is invoked with `energies`
# @return the resolved reference level
# @raise [Muze::ParameterError] when ref is none of the above
def trim_reference(energies, ref:)
  case ref
  when :max then energies.max || 0.0
  when Numeric then ref.to_f
  else
    # Duck-typed so Method objects and custom callables work like Procs.
    raise Muze::ParameterError, "ref must be :max, numeric, or a Proc" unless ref.respond_to?(:call)

    ref.call(energies)
  end
end
|
|
407
|
+
private_class_method :trim_reference
|
|
164
408
|
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
Muze::
|
|
409
|
+
# Checks the `units` option of trim and the extra argument each unit depends on:
# :time needs a positive Integer `sr`, :frames a positive Integer `hop_length`.
#
# @raise [Muze::ParameterError] on an unknown unit or a missing/invalid companion value
def validate_trim_units!(units:, sr:, hop_length:)
  case units
  when :samples
    nil
  when :time
    raise Muze::ParameterError, "sr must be positive for time units" unless sr.is_a?(Integer) && sr.positive?
  when :frames
    raise Muze::ParameterError, "hop_length must be positive for frame units" unless hop_length.is_a?(Integer) && hop_length.positive?
  else
    raise Muze::ParameterError, "units must be :samples, :frames, or :time"
  end
end
|
|
169
|
-
private_class_method :
|
|
414
|
+
private_class_method :validate_trim_units!
|
|
415
|
+
|
|
416
|
+
# Expresses a [start, end] sample interval in the requested unit.
# :samples returns it as-is, :frames floors each bound divided by `hop_length`,
# and :time divides each bound by `sr`. Any other unit yields nil.
def convert_trim_interval(start_sample, end_sample, units:, sr:, hop_length:)
  bounds = [start_sample, end_sample]
  return bounds if units == :samples
  return bounds.map { |position| (position / hop_length.to_f).floor } if units == :frames
  return bounds.map { |position| position.to_f / sr } if units == :time

  nil
end
|
|
426
|
+
private_class_method :convert_trim_interval
|
|
427
|
+
|
|
428
|
+
# Yields every column of `matrix` to the block and packs the results into a new
# [length, channels] Numo::SFloat. The length is the longest block result;
# shorter results are left zero-padded at the end.
def apply_channels(matrix)
  frame_total, channel_total = matrix.shape
  per_channel = Array.new(channel_total) { |index| yield(matrix[true, index]) }
  longest = per_channel.map(&:size).max || frame_total

  combined = Numo::SFloat.zeros(longest, channel_total)
  per_channel.each_with_index do |samples, index|
    combined[0...samples.size, index] = samples
  end
  combined
end
|
|
439
|
+
private_class_method :apply_channels
|
|
440
|
+
|
|
441
|
+
# Per-sample amplitude as a plain Array. Input that is not two-dimensional is
# just its absolute value; two-dimensional input is reduced across the second
# axis using the maximum (aggregate == :max) or the mean (anything else).
def sample_amplitude(signal, aggregate:)
  return signal.abs.to_a if signal.ndim != 2

  frame_total, channel_total = signal.shape
  (0...frame_total).map do |row|
    magnitudes = (0...channel_total).map { |column| signal[row, column].abs }
    if aggregate == :max
      magnitudes.max
    else
      magnitudes.sum(0.0) / channel_total
    end
  end
end
|
|
450
|
+
private_class_method :sample_amplitude
|
|
451
|
+
|
|
452
|
+
# Divides `signal` by its largest magnitude so the peak becomes 1.
# A signal whose peak is not positive is returned unchanged.
def normalize_peak(signal)
  loudest = signal.abs.max
  loudest <= 0.0 ? signal : signal / loudest
end
|
|
458
|
+
private_class_method :normalize_peak
|
|
459
|
+
|
|
460
|
+
# Guard used by the public entry points: `value` must be an Integer above zero.
#
# @param label [String] parameter name used in the error message
# @raise [Muze::ParameterError] otherwise
def validate_positive_integer!(value, label)
  acceptable = value.is_a?(Integer) && value.positive?
  raise Muze::ParameterError, "#{label} must be a positive integer" unless acceptable
end
|
|
465
|
+
private_class_method :validate_positive_integer!
|
|
466
|
+
|
|
467
|
+
# Same check as validate_positive_integer!, except that nil means "not supplied"
# and is accepted.
def validate_optional_positive_integer!(value, label)
  validate_positive_integer!(value, label) unless value.nil?
end
|
|
472
|
+
private_class_method :validate_optional_positive_integer!
|
|
473
|
+
|
|
474
|
+
# Guard used by the public entry points: `value` must be a finite number above zero.
#
# Both predicates are probed with respond_to? before being called, so values
# that have `finite?` but no `positive?` (for example Complex) are reported as
# a parameter error rather than escaping as NoMethodError.
#
# @param label [String] parameter name used in the error message
# @raise [Muze::ParameterError] otherwise
def validate_positive_number!(value, label)
  comparable = value.respond_to?(:finite?) && value.respond_to?(:positive?)
  return if comparable && value.finite? && value.positive?

  raise Muze::ParameterError, "#{label} must be positive"
end
|
|
479
|
+
private_class_method :validate_positive_number!
|
|
480
|
+
|
|
481
|
+
# Guard used by the public entry points: `value` must respond to `finite?` and be finite.
#
# @param label [String] parameter name used in the error message
# @raise [Muze::ParameterError] otherwise
def validate_finite_number!(value, label)
  finite = value.respond_to?(:finite?) && value.finite?
  raise Muze::ParameterError, "#{label} must be finite" unless finite
end
|
|
486
|
+
private_class_method :validate_finite_number!
|
|
170
487
|
end
|
|
171
488
|
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Muze
  module Feature
    module_function

    # Beat-synchronous aggregation over the frame axis: frames between
    # consecutive beat positions are collapsed into one column.
    #
    # @param data one- or two-dimensional data castable to Numo::SFloat (a
    #   one-dimensional input is treated as a single row)
    # @param beats frame indices of the beats; values outside 0..frames are ignored
    # @param aggregate [Symbol] :mean, :median, or :max
    # @return [Numo::SFloat] shape: [rows, segments]
    def beat_sync(data, beats:, aggregate: :mean)
      unless %i[mean median max].include?(aggregate)
        raise Muze::ParameterError, "aggregate must be :mean, :median, or :max"
      end

      matrix = Numo::SFloat.cast(data)
      matrix = matrix.expand_dims(0) if matrix.ndim == 1
      raise Muze::ParameterError, "data must be one- or two-dimensional" unless [1, 2].include?(matrix.ndim)

      row_count, frame_count = matrix.shape
      edges = beat_boundaries(beats, frame_count)
      synced = Numo::SFloat.zeros(row_count, edges.length - 1)

      edges.each_cons(2).each_with_index do |(from, to), segment|
        row_count.times do |row|
          synced[row, segment] = aggregate_values(matrix[row, from...to].to_a, aggregate:)
        end
      end

      synced
    end

    # Sorted, de-duplicated segment edges: 0, every in-range beat, and `frames`.
    def beat_boundaries(beats, frames)
      in_range = Array(beats).map(&:to_i).select { |beat| beat.between?(0, frames) }
      [0, *in_range, frames].uniq.sort
    end
    private_class_method :beat_boundaries

    # Reduces one segment to a single number; an empty segment counts as 0.0.
    def aggregate_values(values, aggregate:)
      return 0.0 if values.empty?

      if aggregate == :mean
        values.sum(0.0) / values.length
      elsif aggregate == :median
        Muze::Native.median1d(values)
      elsif aggregate == :max
        values.max
      end
    end
    private_class_method :aggregate_values
  end
end
|
data/lib/muze/feature/chroma.rb
CHANGED
|
@@ -11,42 +11,59 @@ module Muze
|
|
|
11
11
|
# @param n_fft [Integer]
|
|
12
12
|
# @param hop_length [Integer]
|
|
13
13
|
# @param norm [Integer, nil]
|
|
14
|
+
# @param tuning [Float]
|
|
14
15
|
# @return [Numo::SFloat] shape: [n_chroma, frames]
|
|
15
|
-
def chroma_stft(y: nil, sr: 22_050, s: nil, n_chroma: 12, n_fft: 2048, hop_length: 512, norm: 2)
|
|
16
|
+
def chroma_stft(y: nil, sr: 22_050, s: nil, n_chroma: 12, n_fft: 2048, hop_length: 512, norm: 2, tuning: 0.0, ctroct: nil, octwidth: nil)
  # A caller-supplied spectrogram is validated; otherwise the STFT magnitude of y is used.
  spectrum =
    if s
      Numo::SFloat.cast(s).tap { |provided| validate_spectrum!(provided) }
    else
      Muze.magphase(Muze.stft(y, n_fft:, hop_length:)).first
    end

  # A one-dimensional spectrum is treated as a single frame.
  spectrum = spectrum.expand_dims(1) if spectrum.ndim == 1

  filter_bank = Muze::Filters.chroma(sr:, n_fft:, n_chroma:, tuning:, ctroct:, octwidth:)
  projected = Muze::Core::Matrix.multiply(filter_bank, spectrum)
  normalize(projected, norm:)
end
|
|
29
32
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
33
|
+
# @return [Numo::SFloat] shape: [6, frames]
|
|
34
|
+
def tonnetz(y: nil, chroma: nil, sr: 22_050, n_fft: 2048, hop_length: 512)
  chroma_matrix = chroma ? Numo::SFloat.cast(chroma) : chroma_stft(y:, sr:, n_fft:, hop_length:)
  chroma_matrix = chroma_matrix.expand_dims(1) if chroma_matrix.ndim == 1
  validate_spectrum!(chroma_matrix)
  raise Muze::ParameterError, "tonnetz requires 12-bin chroma" unless chroma_matrix.shape[0] == 12

  frames = chroma_matrix.shape[1]
  output = Numo::SFloat.zeros(6, frames)
  intervals = [7, 7, 3, 3, 4, 4]
  phases = [0.0, Math::PI / 2.0, 0.0, Math::PI / 2.0, 0.0, Math::PI / 2.0]

  # The projection weights depend only on (dimension, chroma bin), so the 6x12
  # table is built once instead of being recomputed for every frame.
  basis = intervals.zip(phases).map do |interval, phase|
    Array.new(12) { |chroma_index| Math.cos(((Math::PI * interval * chroma_index) / 6.0) + phase) }
  end

  frames.times do |frame|
    vector = chroma_matrix[true, frame]
    total = vector.sum
    # Frames with no chroma energy keep their all-zero column.
    next if total <= 0.0

    normalized = (vector / total).to_a
    basis.each_with_index do |weights, dimension|
      sum = 0.0
      weights.each_with_index { |weight, chroma_index| sum += normalized[chroma_index] * weight }
      output[dimension, frame] = sum
    end
  end

  output
end
|
|
46
|
-
private_class_method :matrix_multiply
|
|
47
63
|
|
|
48
64
|
def normalize(chroma, norm:)
|
|
49
65
|
return chroma if norm.nil?
|
|
66
|
+
raise Muze::ParameterError, "norm must be nil, 1, or 2" unless [1, 2].include?(norm)
|
|
50
67
|
|
|
51
68
|
frames = chroma.shape[1]
|
|
52
69
|
frames.times do |frame_index|
|
|
@@ -64,5 +81,16 @@ module Muze
|
|
|
64
81
|
chroma
|
|
65
82
|
end
|
|
66
83
|
private_class_method :normalize
|
|
84
|
+
|
|
85
|
+
# Ensures every entry of `spectrum` is a finite, non-negative number.
#
# @param spectrum anything whose `to_a.flatten` lists the entries
# @raise [Muze::ParameterError] on a non-finite/non-numeric or negative entry
def validate_spectrum!(spectrum)
  entries = spectrum.to_a.flatten

  if entries.any? { |entry| !(entry.respond_to?(:finite?) && entry.finite?) }
    raise Muze::ParameterError, "s must contain only finite numeric values"
  end

  raise Muze::ParameterError, "spectrogram input must be non-negative" if entries.any?(&:negative?)
end
|
|
94
|
+
private_class_method :validate_spectrum!
|
|
67
95
|
end
|
|
68
96
|
end
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Muze
  module Feature
    # Lightweight cache for feature extractors that share the same STFT.
    #
    # The STFT, its magnitude and its power are computed at most once per
    # context, and every extracted feature is memoized by name.
    class Context
      # Features returned by #extract when no explicit list is given.
      DEFAULT_FEATURES = %i[
        melspectrogram
        chroma_stft
        spectral_centroid
        spectral_bandwidth
        spectral_rolloff
        spectral_flatness
        rms
        zero_crossing_rate
      ].freeze

      attr_reader :y, :sr, :n_fft, :hop_length, :center, :pad_mode

      # Stores the signal and analysis parameters; nothing is computed until asked for.
      def initialize(y:, sr: 22_050, n_fft: 2048, hop_length: 512, center: true, pad_mode: :reflect)
        @y = y
        @sr = sr
        @n_fft = n_fft
        @hop_length = hop_length
        @center = center
        @pad_mode = pad_mode
        @cache = {}
      end

      # @return the memoized result of Muze.stft for this context
      def stft
        @cache[:stft] ||= Muze.stft(y, n_fft:, hop_length:, center:, pad_mode:)
      end

      # @return the memoized magnitude (first element of Muze.magphase)
      def magnitude
        @cache[:magnitude] ||= Muze.magphase(stft).first
      end

      # @return [Numo::SFloat] the memoized squared magnitude
      def power
        @cache[:power] ||= (magnitude**2).cast_to(Numo::SFloat)
      end

      # @param features [Array<Symbol>] names understood by #fetch
      # @return [Hash] feature name => computed value
      def extract(features: DEFAULT_FEATURES)
        features.each_with_object({}) do |feature, results|
          results[feature] = fetch(feature)
        end
      end

      # Computes (or returns the memoized) value for one feature name.
      #
      # :stft, :magnitude and :power are listed explicitly because they share
      # the cache with the features; without these branches the outcome of
      # fetching them would depend on whether they had already been computed.
      #
      # @param feature [Symbol]
      # @raise [Muze::ParameterError] for an unknown feature name
      def fetch(feature)
        @cache[feature] ||= case feature
                            when :stft then stft
                            when :magnitude then magnitude
                            when :power then power
                            when :melspectrogram then Muze::Feature.melspectrogram(sr:, s: power, n_fft:, hop_length:)
                            when :chroma_stft then Muze::Feature.chroma_stft(sr:, s: magnitude, n_fft:, hop_length:)
                            when :spectral_centroid then Muze::Feature.spectral_centroid(s: magnitude, sr:, n_fft:, hop_length:)
                            when :spectral_bandwidth then Muze::Feature.spectral_bandwidth(s: magnitude, sr:, n_fft:, hop_length:)
                            when :spectral_rolloff then Muze::Feature.spectral_rolloff(s: magnitude, sr:, n_fft:, hop_length:)
                            when :spectral_flatness then Muze::Feature.spectral_flatness(s: magnitude, n_fft:, hop_length:)
                            when :spectral_flux then Muze::Feature.spectral_flux(s: magnitude, sr:, n_fft:, hop_length:)
                            when :spectral_entropy then Muze::Feature.spectral_entropy(s: magnitude, sr:, n_fft:, hop_length:)
                            when :spectral_crest then Muze::Feature.spectral_crest(s: magnitude, sr:, n_fft:, hop_length:)
                            when :spectral_slope then Muze::Feature.spectral_slope(s: magnitude, sr:, n_fft:, hop_length:)
                            when :spectral_decrease then Muze::Feature.spectral_decrease(s: magnitude, sr:, n_fft:, hop_length:)
                            when :poly_features then Muze::Feature.poly_features(s: magnitude, sr:, n_fft:, hop_length:)
                            when :tonnetz then Muze::Feature.tonnetz(chroma: fetch(:chroma_stft), sr:, n_fft:, hop_length:)
                            when :rms then Muze::Feature.rms(s: magnitude)
                            when :zero_crossing_rate then Muze::Feature.zero_crossing_rate(y, frame_length: n_fft, hop_length:)
                            else
                              raise Muze::ParameterError, "Unsupported feature: #{feature}"
                            end
      end
    end

    module_function

    # Builds a reusable Context for `y`.
    #
    # @return [Muze::Feature::Context]
    def context(y:, sr: 22_050, n_fft: 2048, hop_length: 512, center: true, pad_mode: :reflect)
      Context.new(y:, sr:, n_fft:, hop_length:, center:, pad_mode:)
    end

    # One-shot helper: builds a Context and extracts `features` from it.
    #
    # @return [Hash] feature name => computed value
    def extract(y:, sr: 22_050, features: Context::DEFAULT_FEATURES, n_fft: 2048, hop_length: 512, center: true, pad_mode: :reflect)
      context(y:, sr:, n_fft:, hop_length:, center:, pad_mode:).extract(features:)
    end
  end
end
|