muze 0.1.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +18 -1
  3. data/README.md +5 -0
  4. data/Rakefile +3 -0
  5. data/ext/muze/muze_ext.c +129 -12
  6. data/lib/muze/beat/beat_track.rb +93 -11
  7. data/lib/muze/core/audio.rb +129 -0
  8. data/lib/muze/core/cache.rb +38 -0
  9. data/lib/muze/core/dct.rb +24 -21
  10. data/lib/muze/core/frames.rb +31 -0
  11. data/lib/muze/core/matrix.rb +23 -0
  12. data/lib/muze/core/resample.rb +111 -19
  13. data/lib/muze/core/stft.rb +312 -52
  14. data/lib/muze/core/windows.rb +113 -17
  15. data/lib/muze/display/specshow.rb +307 -41
  16. data/lib/muze/effects/harmonic_percussive.rb +83 -18
  17. data/lib/muze/effects/streaming.rb +101 -0
  18. data/lib/muze/effects/time_stretch.rb +353 -36
  19. data/lib/muze/feature/aggregation.rb +49 -0
  20. data/lib/muze/feature/chroma.rb +43 -15
  21. data/lib/muze/feature/context.rb +81 -0
  22. data/lib/muze/feature/mfcc.rb +78 -38
  23. data/lib/muze/feature/spectral.rb +258 -39
  24. data/lib/muze/filters/chroma_filter.rb +21 -2
  25. data/lib/muze/filters/mel.rb +47 -1
  26. data/lib/muze/io/audio_loader/ffmpeg_backend.rb +179 -15
  27. data/lib/muze/io/audio_loader/wavify_backend.rb +118 -11
  28. data/lib/muze/io/audio_loader.rb +178 -48
  29. data/lib/muze/io/audio_writer.rb +48 -0
  30. data/lib/muze/native.rb +91 -8
  31. data/lib/muze/onset/onset_detect.rb +114 -23
  32. data/lib/muze/version.rb +1 -1
  33. data/lib/muze.rb +237 -60
  34. metadata +11 -21
  35. data/benchmarks/baseline.json +0 -24
  36. data/benchmarks/native_vs_ruby.rb +0 -23
  37. data/benchmarks/quality_metrics.rb +0 -265
  38. data/benchmarks/quality_thresholds.md +0 -28
  39. data/benchmarks/support/fixture_library.rb +0 -107
@@ -4,24 +4,35 @@ module Muze
4
4
  module Effects
5
5
  module_function
6
6
 
7
- # Keep fast path for short clips where phase vocoder overhead dominates.
7
+ # Use smaller FFTs for short clips so phase vocoding remains practical.
8
8
  MIN_PHASE_VOCODER_SAMPLES = 32_768
9
+ MIN_TIME_STRETCH_RATE = 1.0 / 32.0
10
+ MAX_TIME_STRETCH_RATE = 32.0
9
11
 
10
12
  # @param y [Numo::SFloat, Array<Float>]
11
13
  # @param rate [Float]
12
14
  # @return [Numo::SFloat]
13
- def time_stretch(y, rate: 1.0)
14
- raise Muze::ParameterError, "rate must be positive" unless rate.positive?
15
+ def time_stretch(y, rate: 1.0, n_fft: nil, hop_length: nil, method: :phase_vocoder, phase_lock: false, force_phase_vocoder: false)
16
+ validate_positive_number!(rate, "rate")
17
+ unless rate.between?(MIN_TIME_STRETCH_RATE, MAX_TIME_STRETCH_RATE)
18
+ raise Muze::ParameterError, "rate must be between #{MIN_TIME_STRETCH_RATE} and #{MAX_TIME_STRETCH_RATE}"
19
+ end
20
+ validate_optional_positive_integer!(n_fft, "n_fft")
21
+ validate_optional_positive_integer!(hop_length, "hop_length")
22
+ raise Muze::ParameterError, "method must be :phase_vocoder, :ola, :wsola, or :linear" unless %i[phase_vocoder ola wsola linear].include?(method)
15
23
 
16
- signal = y.is_a?(Numo::NArray) ? Numo::SFloat.cast(y) : Numo::SFloat.cast(Array(y))
24
+ signal = Muze::Core::Audio.validate_audio!(y, allow_empty: true)
25
+ return apply_channels(signal) { |channel| time_stretch(channel, rate:, n_fft:, hop_length:, method:, phase_lock:, force_phase_vocoder:) } if signal.ndim == 2
17
26
  return signal if signal.empty? || rate == 1.0
18
- return linear_time_stretch(signal.to_a, rate) if signal.size < MIN_PHASE_VOCODER_SAMPLES
27
+ return linear_time_stretch(signal.to_a, rate) if method == :linear
28
+ return ola_time_stretch(signal.to_a, rate) if method == :ola
29
+ return wsola_time_stretch(signal.to_a, rate) if method == :wsola
19
30
 
20
- n_fft = phase_vocoder_fft_size(signal.size)
21
- hop_length = [n_fft / 4, 1].max
31
+ n_fft ||= phase_vocoder_fft_size(signal.size)
32
+ hop_length ||= [n_fft / 4, 1].max
22
33
 
23
34
  stft_matrix = Muze::Core::STFT.stft(signal, n_fft:, hop_length:, center: true)
24
- stretched_stft = phase_vocoder(stft_matrix, rate:, hop_length:, n_fft:)
35
+ stretched_stft = phase_vocoder(stft_matrix, rate:, hop_length:, n_fft:, phase_lock:)
25
36
  target_length = [(signal.size / rate).round, 1].max
26
37
 
27
38
  Muze::Core::STFT.istft(stretched_stft, hop_length:, center: true, length: target_length)
@@ -31,16 +42,25 @@ module Muze
31
42
  # @param sr [Integer]
32
43
  # @param n_steps [Float]
33
44
  # @return [Numo::SFloat]
34
- def pitch_shift(y, sr: 22_050, n_steps: 0)
35
- _ = sr
36
- signal = y.is_a?(Numo::NArray) ? y : Numo::SFloat.cast(y)
45
+ def pitch_shift(y, sr: 22_050, n_steps: 0, bins_per_octave: 12, res_type: :auto, normalize: false, clip: nil)
46
+ validate_positive_integer!(sr, "sr")
47
+ validate_positive_number!(bins_per_octave, "bins_per_octave")
48
+ raise Muze::ParameterError, "n_steps must be finite" unless n_steps.respond_to?(:finite?) && n_steps.finite?
49
+ raise Muze::ParameterError, "normalize must be true or false" unless [true, false].include?(normalize)
50
+ validate_positive_number!(clip, "clip") if clip
51
+
52
+ signal = Muze::Core::Audio.validate_audio!(y, allow_empty: true)
53
+ return apply_channels(signal) { |channel| pitch_shift(channel, sr:, n_steps:, bins_per_octave:, res_type:, normalize:, clip:) } if signal.ndim == 2
37
54
  return signal if n_steps.zero?
38
55
 
39
- rate = 2.0**(-n_steps.to_f / 12.0)
56
+ rate = 2.0**(-n_steps.to_f / bins_per_octave)
40
57
  stretched = time_stretch(signal, rate:)
41
- preferred_res_type = signal.size >= MIN_PHASE_VOCODER_SAMPLES ? :sinc : :linear
42
- restored = resample_for_pitch_shift(stretched, target_size: signal.size, preferred_res_type:)
43
- Numo::SFloat.cast(restored[0...signal.size])
58
+ effective_res_type = res_type == :auto ? (signal.size >= MIN_PHASE_VOCODER_SAMPLES ? :sinc : :linear) : res_type
59
+ restored = resample_for_pitch_shift(stretched, target_size: signal.size, sr:, rate:, res_type: effective_res_type)
60
+ output = Numo::SFloat.cast(restored[0...signal.size])
61
+ output = normalize_peak(output) if normalize
62
+ output = output.clip(-clip, clip) if clip
63
+ output
44
64
  end
45
65
 
46
66
  # @param y [Numo::SFloat, Array<Float>]
@@ -48,23 +68,70 @@ module Muze
48
68
  # @param frame_length [Integer]
49
69
  # @param hop_length [Integer]
50
70
  # @return [Array(Numo::SFloat, Array<Integer>)] trimmed signal and [start, end]
51
- def trim(y, top_db: 60, frame_length: 2048, hop_length: 512)
52
- _ = [frame_length, hop_length]
53
- signal = y.is_a?(Numo::NArray) ? y : Numo::SFloat.cast(y)
54
- abs_signal = signal.abs
55
- threshold = [abs_signal.max, 1.0e-12].max * (10.0**(-top_db / 20.0))
56
- indices = abs_signal.to_a.each_index.select { |index| abs_signal[index] >= threshold }
57
- return [Numo::SFloat[], [0, 0]] if indices.empty?
71
+ def trim(y, top_db: 60, frame_length: 2048, hop_length: 512, ref: :max, aggregate: :mean, units: :samples, sr: nil)
72
+ raise Muze::ParameterError, "top_db must be non-negative" if top_db.negative?
73
+ raise Muze::ParameterError, "frame_length and hop_length must be positive" unless frame_length.positive? && hop_length.positive?
74
+ raise Muze::ParameterError, "aggregate must be :mean or :max" unless %i[mean max].include?(aggregate)
75
+ validate_trim_units!(units:, sr:, hop_length:)
76
+
77
+ signal = Muze::Core::Audio.validate_audio!(y, allow_empty: true)
78
+ amplitude = sample_amplitude(signal, aggregate:)
79
+ frames = Muze::Core::Frames.slice(amplitude, frame_length:, hop_length:, pad_end: true)
80
+ energies = frames.map do |frame|
81
+ values = frame.map { |value| value * value }
82
+ aggregate == :max ? Math.sqrt(values.max || 0.0) : Math.sqrt(values.sum(0.0) / frame.length)
83
+ end
84
+ reference = trim_reference(energies, ref:)
85
+ threshold = [reference, 1.0e-12].max * (10.0**(-top_db / 20.0))
86
+ active_frames = energies.each_index.select { |index| energies[index] >= threshold }
87
+ return [Numo::SFloat[], [0, 0]] if active_frames.empty?
88
+
89
+ search_start = active_frames.first * hop_length
90
+ sample_count = amplitude.length
91
+ search_end = [(active_frames.last * hop_length) + frame_length, sample_count].min
92
+ active_samples = (search_start...search_end).select { |index| amplitude[index] >= threshold }
93
+ empty = signal.ndim == 2 ? Numo::SFloat.zeros(0, signal.shape[1]) : Numo::SFloat[]
94
+ return [empty, [0, 0]] if active_samples.empty?
95
+
96
+ start_sample = active_samples.first
97
+ end_sample = active_samples.last + 1
98
+ trimmed = signal.ndim == 2 ? signal[start_sample...end_sample, true] : signal[start_sample...end_sample]
99
+ [trimmed, convert_trim_interval(start_sample, end_sample, units:, sr:, hop_length:)]
100
+ end
101
+
102
+ def preemphasis(y, coef: 0.97)
103
+ validate_finite_number!(coef, "coef")
104
+ matrix = Muze::Core::Audio.validate_audio!(y, allow_empty: true)
105
+ return apply_channels(matrix) { |channel| preemphasis(channel, coef:) } if matrix.ndim == 2
106
+
107
+ signal = matrix.to_a
108
+ return Numo::SFloat.cast(signal) if signal.empty?
58
109
 
59
- start_sample = indices.first
60
- end_sample = indices.last + 1
61
- [signal[start_sample...end_sample], [start_sample, end_sample]]
110
+ output = Array.new(signal.length, 0.0)
111
+ output[0] = signal[0]
112
+ (1...signal.length).each { |index| output[index] = signal[index] - (coef * signal[index - 1]) }
113
+ Numo::SFloat.cast(output)
114
+ end
115
+
116
+ def deemphasis(y, coef: 0.97)
117
+ validate_finite_number!(coef, "coef")
118
+ matrix = Muze::Core::Audio.validate_audio!(y, allow_empty: true)
119
+ return apply_channels(matrix) { |channel| deemphasis(channel, coef:) } if matrix.ndim == 2
120
+
121
+ signal = matrix.to_a
122
+ return Numo::SFloat.cast(signal) if signal.empty?
123
+
124
+ output = Array.new(signal.length, 0.0)
125
+ output[0] = signal[0]
126
+ (1...signal.length).each { |index| output[index] = signal[index] + (coef * output[index - 1]) }
127
+ Numo::SFloat.cast(output)
62
128
  end
63
129
 
64
130
  # @param signal_length [Integer]
65
131
  # @return [Integer]
66
132
  def phase_vocoder_fft_size(signal_length)
67
- max_fft = [signal_length, 2048].min
133
+ max_allowed = signal_length < MIN_PHASE_VOCODER_SAMPLES ? 512 : 2048
134
+ max_fft = [signal_length, max_allowed].min
68
135
  fft_size = 1
69
136
  fft_size *= 2 while (fft_size * 2) <= max_fft
70
137
  [fft_size, 32].max
@@ -76,7 +143,7 @@ module Muze
76
143
  # @param hop_length [Integer]
77
144
  # @param n_fft [Integer]
78
145
  # @return [Numo::DComplex]
79
- def phase_vocoder(stft_matrix, rate:, hop_length:, n_fft:)
146
+ def phase_vocoder(stft_matrix, rate:, hop_length:, n_fft:, phase_lock: false)
80
147
  frequency_bins, frame_count = stft_matrix.shape
81
148
  return stft_matrix if frame_count <= 1
82
149
 
@@ -115,10 +182,32 @@ module Muze
115
182
  end
116
183
  end
117
184
 
118
- stretched
185
+ phase_lock ? phase_lock_spectrum(stretched) : stretched
119
186
  end
120
187
  private_class_method :phase_vocoder
121
188
 
189
+ def phase_lock_spectrum(stft_matrix)
190
+ rows, cols = stft_matrix.shape
191
+ output = stft_matrix.dup
192
+ cols.times do |col|
193
+ peak_bins = local_peak_bins(stft_matrix[true, col])
194
+ next if peak_bins.empty?
195
+
196
+ rows.times do |row|
197
+ peak = peak_bins.min_by { |candidate| (candidate - row).abs }
198
+ output[row, col] = Complex.polar(stft_matrix[row, col].abs, phase_of(stft_matrix[peak, col]))
199
+ end
200
+ end
201
+ output
202
+ end
203
+ private_class_method :phase_lock_spectrum
204
+
205
+ def local_peak_bins(spectrum)
206
+ values = spectrum.abs.to_a
207
+ (1...(values.length - 1)).select { |index| values[index] >= values[index - 1] && values[index] >= values[index + 1] }
208
+ end
209
+ private_class_method :local_peak_bins
210
+
122
211
  # @param complex_number [Complex]
123
212
  # @return [Float]
124
213
  def phase_of(complex_number)
@@ -152,20 +241,248 @@ module Muze
152
241
  end
153
242
  private_class_method :linear_time_stretch
154
243
 
244
+ def ola_time_stretch(signal, rate)
245
+ frame_length = [[next_power_of_two([signal.length / 8, 256].max), 2048].min, 32].max
246
+ analysis_hop = [frame_length / 2, 1].max
247
+ synthesis_hop = [(analysis_hop / rate).round, 1].max
248
+ frame_count = signal.length <= frame_length ? 1 : (((signal.length - frame_length).to_f / analysis_hop).ceil + 1)
249
+ target_length = [(signal.length / rate).round, 1].max
250
+ output = Array.new(target_length + frame_length, 0.0)
251
+ window_sums = Array.new(output.length, 0.0)
252
+ window = Muze::Core::Windows.hann(frame_length).to_a
253
+
254
+ frame_count.times do |frame_index|
255
+ source_start = frame_index * analysis_hop
256
+ target_start = frame_index * synthesis_hop
257
+ frame_length.times do |offset|
258
+ source_index = source_start + offset
259
+ target_index = target_start + offset
260
+ break if target_index >= output.length
261
+
262
+ value = source_index < signal.length ? signal[source_index] : 0.0
263
+ weight = window[offset]
264
+ output[target_index] += value * weight
265
+ window_sums[target_index] += weight * weight
266
+ end
267
+ end
268
+
269
+ output.map!.with_index do |value, index|
270
+ denominator = window_sums[index]
271
+ denominator > 1.0e-3 ? value / denominator : 0.0
272
+ end
273
+ Numo::SFloat.cast(limit_peak(output[0, target_length], signal.map(&:abs).max.to_f))
274
+ end
275
+ private_class_method :ola_time_stretch
276
+
277
+ def wsola_time_stretch(signal, rate)
278
+ frame_length = [[next_power_of_two([signal.length / 8, 256].max), 2048].min, 32].max
279
+ analysis_hop = [frame_length / 2, 1].max
280
+ synthesis_hop = [(analysis_hop / rate).round, 1].max
281
+ target_length = [(signal.length / rate).round, 1].max
282
+ output = Array.new(target_length + frame_length, 0.0)
283
+ window_sums = Array.new(output.length, 0.0)
284
+ window = Muze::Core::Windows.hann(frame_length).to_a
285
+ overlap = [[frame_length - synthesis_hop, 0].max, frame_length / 2].min
286
+ search_radius = [[analysis_hop / 2, 8].max, frame_length].min
287
+
288
+ target_start = 0
289
+ source_start = 0
290
+ first_frame = true
291
+
292
+ while target_start < target_length
293
+ expected_source = (target_start * rate).round
294
+ source_start = if first_frame || overlap.zero?
295
+ [[expected_source, 0].max, signal.length - 1].min
296
+ else
297
+ best_wsola_source_start(signal, output, target_start, expected_source, frame_length, overlap, search_radius)
298
+ end
299
+
300
+ overlap_add_frame(signal, output, window_sums, window, source_start:, target_start:, frame_length:)
301
+ first_frame = false
302
+ target_start += synthesis_hop
303
+ end
304
+
305
+ output.map!.with_index do |value, index|
306
+ denominator = window_sums[index]
307
+ denominator > 1.0e-3 ? value / denominator : 0.0
308
+ end
309
+ Numo::SFloat.cast(limit_peak(output[0, target_length], signal.map(&:abs).max.to_f))
310
+ end
311
+ private_class_method :wsola_time_stretch
312
+
313
+ def best_wsola_source_start(signal, output, target_start, expected_source, frame_length, overlap, search_radius)
314
+ lower = [expected_source - search_radius, 0].max
315
+ upper = [expected_source + search_radius, [signal.length - 1, 0].max].min
316
+ best_start = lower
317
+ best_score = -Float::INFINITY
318
+
319
+ (lower..upper).each do |candidate|
320
+ score = overlap_correlation(signal, output, candidate, target_start, overlap)
321
+ next unless score > best_score
322
+
323
+ best_score = score
324
+ best_start = candidate
325
+ end
326
+
327
+ [best_start, [signal.length - frame_length, 0].max].min
328
+ end
329
+ private_class_method :best_wsola_source_start
330
+
331
+ def overlap_correlation(signal, output, source_start, target_start, overlap)
332
+ numerator = 0.0
333
+ source_energy = 0.0
334
+ output_energy = 0.0
335
+
336
+ overlap.times do |offset|
337
+ source_index = source_start + offset
338
+ target_index = target_start + offset
339
+ break if source_index >= signal.length || target_index >= output.length
340
+
341
+ source_value = signal[source_index]
342
+ output_value = output[target_index]
343
+ numerator += source_value * output_value
344
+ source_energy += source_value * source_value
345
+ output_energy += output_value * output_value
346
+ end
347
+
348
+ denominator = Math.sqrt(source_energy * output_energy)
349
+ denominator <= 1.0e-12 ? 0.0 : numerator / denominator
350
+ end
351
+ private_class_method :overlap_correlation
352
+
353
+ def overlap_add_frame(signal, output, window_sums, window, source_start:, target_start:, frame_length:)
354
+ frame_length.times do |offset|
355
+ source_index = source_start + offset
356
+ target_index = target_start + offset
357
+ break if target_index >= output.length
358
+
359
+ value = source_index < signal.length ? signal[source_index] : 0.0
360
+ weight = window[offset]
361
+ output[target_index] += value * weight
362
+ window_sums[target_index] += weight * weight
363
+ end
364
+ end
365
+ private_class_method :overlap_add_frame
366
+
367
+ def limit_peak(values, target_peak)
368
+ peak = values.map(&:abs).max.to_f
369
+ return values if target_peak <= 0.0 || peak <= target_peak
370
+
371
+ scale = target_peak / peak
372
+ values.map { |value| value * scale }
373
+ end
374
+ private_class_method :limit_peak
375
+
376
+ def next_power_of_two(value)
377
+ power = 1
378
+ power *= 2 while power < value
379
+ power
380
+ end
381
+ private_class_method :next_power_of_two
382
+
155
383
  # Prefer sinc-quality resampling, then fall back to linear on failure.
156
384
  # @param stretched [Numo::SFloat]
157
385
  # @param target_size [Integer]
158
- # @param preferred_res_type [Symbol]
386
+ # @param sr [Integer]
387
+ # @param rate [Float]
388
+ # @param res_type [Symbol]
159
389
  # @return [Numo::SFloat]
160
- def resample_for_pitch_shift(stretched, target_size:, preferred_res_type:)
161
- if preferred_res_type == :sinc
162
- return Muze::Core::Resample.resample(stretched, orig_sr: stretched.size, target_sr: target_size, res_type: :sinc)
390
+ def resample_for_pitch_shift(stretched, target_size:, sr:, rate:, res_type:)
391
+ target_sr = [(sr * rate).round, 1].max
392
+ Muze::Core::Resample.resample(stretched, orig_sr: sr, target_sr:, res_type:, target_length: target_size)
393
+ rescue Muze::ParameterError
394
+ Muze::Core::Resample.resample(stretched, orig_sr: stretched.size, target_sr: target_size, res_type: :linear, target_length: target_size)
395
+ end
396
+ private_class_method :resample_for_pitch_shift
397
+
398
+ def trim_reference(energies, ref:)
399
+ case ref
400
+ when :max then energies.max || 0.0
401
+ when Numeric then ref.to_f
402
+ when Proc then ref.call(energies)
403
+ else
404
+ raise Muze::ParameterError, "ref must be :max, numeric, or a Proc"
163
405
  end
406
+ end
407
+ private_class_method :trim_reference
164
408
 
165
- Muze::Core::Resample.resample(stretched, orig_sr: stretched.size, target_sr: target_size, res_type: :linear)
166
- rescue Muze::Error, StandardError
167
- Muze::Core::Resample.resample(stretched, orig_sr: stretched.size, target_sr: target_size, res_type: :linear)
409
+ def validate_trim_units!(units:, sr:, hop_length:)
410
+ raise Muze::ParameterError, "units must be :samples, :frames, or :time" unless %i[samples frames time].include?(units)
411
+ raise Muze::ParameterError, "sr must be positive for time units" if units == :time && !(sr.is_a?(Integer) && sr.positive?)
412
+ raise Muze::ParameterError, "hop_length must be positive for frame units" if units == :frames && !(hop_length.is_a?(Integer) && hop_length.positive?)
168
413
  end
169
- private_class_method :resample_for_pitch_shift
414
+ private_class_method :validate_trim_units!
415
+
416
+ def convert_trim_interval(start_sample, end_sample, units:, sr:, hop_length:)
417
+ case units
418
+ when :samples
419
+ [start_sample, end_sample]
420
+ when :frames
421
+ [start_sample, end_sample].map { |sample| (sample / hop_length.to_f).floor }
422
+ when :time
423
+ [start_sample, end_sample].map { |sample| sample.to_f / sr }
424
+ end
425
+ end
426
+ private_class_method :convert_trim_interval
427
+
428
+ def apply_channels(matrix)
429
+ frames, channels = matrix.shape
430
+ processed = channels.times.map { |channel| yield(matrix[true, channel]) }
431
+ output_length = processed.map(&:size).max || frames
432
+ output = Numo::SFloat.zeros(output_length, channels)
433
+ channels.times do |channel|
434
+ values = processed[channel]
435
+ output[0...values.size, channel] = values
436
+ end
437
+ output
438
+ end
439
+ private_class_method :apply_channels
440
+
441
+ def sample_amplitude(signal, aggregate:)
442
+ return signal.abs.to_a unless signal.ndim == 2
443
+
444
+ frames, channels = signal.shape
445
+ Array.new(frames) do |frame|
446
+ values = Array.new(channels) { |channel| signal[frame, channel].abs }
447
+ aggregate == :max ? values.max : values.sum(0.0) / channels
448
+ end
449
+ end
450
+ private_class_method :sample_amplitude
451
+
452
+ def normalize_peak(signal)
453
+ peak = signal.abs.max
454
+ return signal if peak <= 0.0
455
+
456
+ signal / peak
457
+ end
458
+ private_class_method :normalize_peak
459
+
460
+ def validate_positive_integer!(value, label)
461
+ return if value.is_a?(Integer) && value.positive?
462
+
463
+ raise Muze::ParameterError, "#{label} must be a positive integer"
464
+ end
465
+ private_class_method :validate_positive_integer!
466
+
467
+ def validate_optional_positive_integer!(value, label)
468
+ return if value.nil?
469
+
470
+ validate_positive_integer!(value, label)
471
+ end
472
+ private_class_method :validate_optional_positive_integer!
473
+
474
+ def validate_positive_number!(value, label)
475
+ return if value.respond_to?(:finite?) && value.finite? && value.positive?
476
+
477
+ raise Muze::ParameterError, "#{label} must be positive"
478
+ end
479
+ private_class_method :validate_positive_number!
480
+
481
+ def validate_finite_number!(value, label)
482
+ return if value.respond_to?(:finite?) && value.finite?
483
+
484
+ raise Muze::ParameterError, "#{label} must be finite"
485
+ end
486
+ private_class_method :validate_finite_number!
170
487
  end
171
488
  end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Muze
4
+ module Feature
5
+ module_function
6
+
7
+ # Beat-synchronous aggregation over the frame axis.
8
+ def beat_sync(data, beats:, aggregate: :mean)
9
+ raise Muze::ParameterError, "aggregate must be :mean, :median, or :max" unless %i[mean median max].include?(aggregate)
10
+
11
+ matrix = Numo::SFloat.cast(data)
12
+ matrix = matrix.expand_dims(0) if matrix.ndim == 1
13
+ raise Muze::ParameterError, "data must be one- or two-dimensional" unless [1, 2].include?(matrix.ndim)
14
+
15
+ rows, frames = matrix.shape
16
+ boundaries = beat_boundaries(beats, frames)
17
+ output = Numo::SFloat.zeros(rows, boundaries.length - 1)
18
+
19
+ boundaries.each_cons(2).with_index do |(left, right), segment_index|
20
+ rows.times do |row|
21
+ values = matrix[row, left...right].to_a
22
+ output[row, segment_index] = aggregate_values(values, aggregate:)
23
+ end
24
+ end
25
+
26
+ output
27
+ end
28
+
29
+ def beat_boundaries(beats, frames)
30
+ points = Array(beats).map(&:to_i).select { |beat| beat.between?(0, frames) }
31
+ ([0] + points + [frames]).uniq.sort
32
+ end
33
+ private_class_method :beat_boundaries
34
+
35
+ def aggregate_values(values, aggregate:)
36
+ return 0.0 if values.empty?
37
+
38
+ case aggregate
39
+ when :mean
40
+ values.sum(0.0) / values.length
41
+ when :median
42
+ Muze::Native.median1d(values)
43
+ when :max
44
+ values.max
45
+ end
46
+ end
47
+ private_class_method :aggregate_values
48
+ end
49
+ end
@@ -11,42 +11,59 @@ module Muze
11
11
  # @param n_fft [Integer]
12
12
  # @param hop_length [Integer]
13
13
  # @param norm [Integer, nil]
14
+ # @param tuning [Float]
14
15
  # @return [Numo::SFloat] shape: [n_chroma, frames]
15
- def chroma_stft(y: nil, sr: 22_050, s: nil, n_chroma: 12, n_fft: 2048, hop_length: 512, norm: 2)
16
+ def chroma_stft(y: nil, sr: 22_050, s: nil, n_chroma: 12, n_fft: 2048, hop_length: 512, norm: 2, tuning: 0.0, ctroct: nil, octwidth: nil)
16
17
  spectrum = if s
17
- Numo::SFloat.cast(s)
18
+ provided = Numo::SFloat.cast(s)
19
+ validate_spectrum!(provided)
20
+ provided
18
21
  else
19
22
  stft_matrix = Muze.stft(y, n_fft:, hop_length:)
20
23
  magnitude, = Muze.magphase(stft_matrix)
21
24
  magnitude
22
- end
25
+ end
23
26
 
24
27
  spectrum = spectrum.expand_dims(1) if spectrum.ndim == 1
25
- filter_bank = Muze::Filters.chroma(sr:, n_fft:, n_chroma:)
26
- chroma = matrix_multiply(filter_bank, spectrum)
28
+ filter_bank = Muze::Filters.chroma(sr:, n_fft:, n_chroma:, tuning:, ctroct:, octwidth:)
29
+ chroma = Muze::Core::Matrix.multiply(filter_bank, spectrum)
27
30
  normalize(chroma, norm:)
28
31
  end
29
32
 
30
- def matrix_multiply(left, right)
31
- left_rows, left_cols = left.shape
32
- right_rows, right_cols = right.shape
33
- raise Muze::ParameterError, "Matrix dimensions do not align" unless left_cols == right_rows
33
+ # @return [Numo::SFloat] shape: [6, frames]
34
+ def tonnetz(y: nil, chroma: nil, sr: 22_050, n_fft: 2048, hop_length: 512)
35
+ chroma_matrix = chroma ? Numo::SFloat.cast(chroma) : chroma_stft(y:, sr:, n_fft:, hop_length:)
36
+ chroma_matrix = chroma_matrix.expand_dims(1) if chroma_matrix.ndim == 1
37
+ validate_spectrum!(chroma_matrix)
38
+ raise Muze::ParameterError, "tonnetz requires 12-bin chroma" unless chroma_matrix.shape[0] == 12
39
+
40
+ frames = chroma_matrix.shape[1]
41
+ output = Numo::SFloat.zeros(6, frames)
42
+ intervals = [7, 7, 3, 3, 4, 4]
43
+ phases = [0.0, Math::PI / 2.0, 0.0, Math::PI / 2.0, 0.0, Math::PI / 2.0]
34
44
 
35
- output = Numo::SFloat.zeros(left_rows, right_cols)
36
- left_rows.times do |row|
37
- right_cols.times do |col|
45
+ frames.times do |frame|
46
+ vector = chroma_matrix[true, frame]
47
+ total = vector.sum
48
+ next if total <= 0.0
49
+
50
+ normalized = vector / total
51
+ 6.times do |dimension|
38
52
  sum = 0.0
39
- left_cols.times { |idx| sum += left[row, idx] * right[idx, col] }
40
- output[row, col] = sum
53
+ 12.times do |chroma_index|
54
+ angle = ((Math::PI * intervals[dimension] * chroma_index) / 6.0) + phases[dimension]
55
+ sum += normalized[chroma_index] * Math.cos(angle)
56
+ end
57
+ output[dimension, frame] = sum
41
58
  end
42
59
  end
43
60
 
44
61
  output
45
62
  end
46
- private_class_method :matrix_multiply
47
63
 
48
64
  def normalize(chroma, norm:)
49
65
  return chroma if norm.nil?
66
+ raise Muze::ParameterError, "norm must be nil, 1, or 2" unless [1, 2].include?(norm)
50
67
 
51
68
  frames = chroma.shape[1]
52
69
  frames.times do |frame_index|
@@ -64,5 +81,16 @@ module Muze
64
81
  chroma
65
82
  end
66
83
  private_class_method :normalize
84
+
85
+ def validate_spectrum!(spectrum)
86
+ values = spectrum.to_a.flatten
87
+ unless values.all? { |value| value.respond_to?(:finite?) && value.finite? }
88
+ raise Muze::ParameterError, "s must contain only finite numeric values"
89
+ end
90
+ return unless values.any?(&:negative?)
91
+
92
+ raise Muze::ParameterError, "spectrogram input must be non-negative"
93
+ end
94
+ private_class_method :validate_spectrum!
67
95
  end
68
96
  end
@@ -0,0 +1,81 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Muze
4
+ module Feature
5
+ # Lightweight cache for feature extractors that share the same STFT.
6
+ class Context
7
+ DEFAULT_FEATURES = %i[
8
+ melspectrogram
9
+ chroma_stft
10
+ spectral_centroid
11
+ spectral_bandwidth
12
+ spectral_rolloff
13
+ spectral_flatness
14
+ rms
15
+ zero_crossing_rate
16
+ ].freeze
17
+
18
+ attr_reader :y, :sr, :n_fft, :hop_length, :center, :pad_mode
19
+
20
+ def initialize(y:, sr: 22_050, n_fft: 2048, hop_length: 512, center: true, pad_mode: :reflect)
21
+ @y = y
22
+ @sr = sr
23
+ @n_fft = n_fft
24
+ @hop_length = hop_length
25
+ @center = center
26
+ @pad_mode = pad_mode
27
+ @cache = {}
28
+ end
29
+
30
+ def stft
31
+ @cache[:stft] ||= Muze.stft(y, n_fft:, hop_length:, center:, pad_mode:)
32
+ end
33
+
34
+ def magnitude
35
+ @cache[:magnitude] ||= Muze.magphase(stft).first
36
+ end
37
+
38
+ def power
39
+ @cache[:power] ||= (magnitude**2).cast_to(Numo::SFloat)
40
+ end
41
+
42
+ def extract(features: DEFAULT_FEATURES)
43
+ features.each_with_object({}) do |feature, results|
44
+ results[feature] = fetch(feature)
45
+ end
46
+ end
47
+
48
+ def fetch(feature)
49
+ @cache[feature] ||= case feature
50
+ when :melspectrogram then Muze::Feature.melspectrogram(sr:, s: power, n_fft:, hop_length:)
51
+ when :chroma_stft then Muze::Feature.chroma_stft(sr:, s: magnitude, n_fft:, hop_length:)
52
+ when :spectral_centroid then Muze::Feature.spectral_centroid(s: magnitude, sr:, n_fft:, hop_length:)
53
+ when :spectral_bandwidth then Muze::Feature.spectral_bandwidth(s: magnitude, sr:, n_fft:, hop_length:)
54
+ when :spectral_rolloff then Muze::Feature.spectral_rolloff(s: magnitude, sr:, n_fft:, hop_length:)
55
+ when :spectral_flatness then Muze::Feature.spectral_flatness(s: magnitude, n_fft:, hop_length:)
56
+ when :spectral_flux then Muze::Feature.spectral_flux(s: magnitude, sr:, n_fft:, hop_length:)
57
+ when :spectral_entropy then Muze::Feature.spectral_entropy(s: magnitude, sr:, n_fft:, hop_length:)
58
+ when :spectral_crest then Muze::Feature.spectral_crest(s: magnitude, sr:, n_fft:, hop_length:)
59
+ when :spectral_slope then Muze::Feature.spectral_slope(s: magnitude, sr:, n_fft:, hop_length:)
60
+ when :spectral_decrease then Muze::Feature.spectral_decrease(s: magnitude, sr:, n_fft:, hop_length:)
61
+ when :poly_features then Muze::Feature.poly_features(s: magnitude, sr:, n_fft:, hop_length:)
62
+ when :tonnetz then Muze::Feature.tonnetz(chroma: fetch(:chroma_stft), sr:, n_fft:, hop_length:)
63
+ when :rms then Muze::Feature.rms(s: magnitude)
64
+ when :zero_crossing_rate then Muze::Feature.zero_crossing_rate(y, frame_length: n_fft, hop_length:)
65
+ else
66
+ raise Muze::ParameterError, "Unsupported feature: #{feature}"
67
+ end
68
+ end
69
+ end
70
+
71
+ module_function
72
+
73
+ def context(y:, sr: 22_050, n_fft: 2048, hop_length: 512, center: true, pad_mode: :reflect)
74
+ Context.new(y:, sr:, n_fft:, hop_length:, center:, pad_mode:)
75
+ end
76
+
77
+ def extract(y:, sr: 22_050, features: Context::DEFAULT_FEATURES, n_fft: 2048, hop_length: 512, center: true, pad_mode: :reflect)
78
+ context(y:, sr:, n_fft:, hop_length:, center:, pad_mode:).extract(features:)
79
+ end
80
+ end
81
+ end