ruby_dsp 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/Gemfile +3 -0
- data/README.md +80 -0
- data/Rakefile +31 -0
- data/ext/ruby_dsp/extconf.rb +6 -0
- data/ext/ruby_dsp/ruby_dsp.cpp +627 -0
- data/ext/ruby_dsp/vendor/miniaudio.h +95844 -0
- data/lib/ruby_dsp/version.rb +6 -0
- data/lib/ruby_dsp.rb +10 -0
- data/ruby_dsp.gemspec +29 -0
- data/stubs/ruby_dsp/audio_track.rb +131 -0
- metadata +139 -0
|
@@ -0,0 +1,627 @@
|
|
|
1
|
+
#include <rice/rice.hpp>
|
|
2
|
+
#include <rice/stl.hpp>
|
|
3
|
+
|
|
4
|
+
#include <string>
|
|
5
|
+
#include <stdexcept>
|
|
6
|
+
#include <cmath>
|
|
7
|
+
#include <sstream>
|
|
8
|
+
#include <iomanip>
|
|
9
|
+
#include <algorithm>
|
|
10
|
+
|
|
11
|
+
#define MINIAUDIO_IMPLEMENTATION
|
|
12
|
+
#include "vendor/miniaudio.h"
|
|
13
|
+
|
|
14
|
+
using namespace Rice;
|
|
15
|
+
|
|
16
|
+
/// Returns the lower-cased extension of `filename`, without the leading dot.
///
/// Only the final path component is inspected, so a dot inside a directory
/// name (e.g. "out.v2/track") is not mistaken for an extension. Both '/' and
/// '\\' are treated as path separators.
///
/// @param filename  File name or path to inspect.
/// @return The text after the last dot of the final path component, lower-cased;
///         an empty string when that component contains no dot.
std::string get_extension(const std::string &filename)
{
    const size_t dot_pos = filename.find_last_of('.');
    if (dot_pos == std::string::npos)
        return ""; // no dot anywhere in the path

    // A separator after the last dot means the dot belongs to a directory name.
    const size_t sep_pos = filename.find_last_of("/\\");
    if (sep_pos != std::string::npos && sep_pos > dot_pos)
        return "";

    std::string ext = filename.substr(dot_pos + 1);

    // tolower() is only defined for values representable as unsigned char (or EOF);
    // going through unsigned char keeps bytes >= 0x80 (e.g. UTF-8) well-defined.
    std::transform(ext.begin(), ext.end(), ext.begin(),
                   [](unsigned char ch) { return static_cast<char>(::tolower(ch)); });
    return ext;
}
|
|
26
|
+
|
|
27
|
+
struct AudioTrack
|
|
28
|
+
{
|
|
29
|
+
std::string filename;
|
|
30
|
+
int sample_rate = -1;
|
|
31
|
+
int channels = -1;
|
|
32
|
+
bool is_mono = false;
|
|
33
|
+
std::vector<float> samples;
|
|
34
|
+
unsigned long long sample_count = 0;
|
|
35
|
+
|
|
36
|
+
AudioTrack(std::string f, unsigned int target_channels = 0, unsigned int target_sample_rate = 0) : filename(f)
|
|
37
|
+
{
|
|
38
|
+
ma_decoder decoder;
|
|
39
|
+
ma_result result;
|
|
40
|
+
|
|
41
|
+
ma_decoder_config config = ma_decoder_config_init(ma_format_f32, (ma_uint32)target_channels, (ma_uint32)target_sample_rate);
|
|
42
|
+
|
|
43
|
+
result = ma_decoder_init_file(filename.c_str(), &config, &decoder);
|
|
44
|
+
|
|
45
|
+
if (result != MA_SUCCESS)
|
|
46
|
+
{
|
|
47
|
+
throw std::runtime_error("RubyDSP: Could not process audio file: " + filename);
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
sample_rate = decoder.outputSampleRate;
|
|
51
|
+
channels = decoder.outputChannels;
|
|
52
|
+
is_mono = (channels == 1);
|
|
53
|
+
|
|
54
|
+
ma_uint64 totalFrames;
|
|
55
|
+
if (ma_decoder_get_length_in_pcm_frames(&decoder, &totalFrames) != MA_SUCCESS)
|
|
56
|
+
{
|
|
57
|
+
ma_decoder_uninit(&decoder);
|
|
58
|
+
throw std::runtime_error("RubyDSP: Could not determine track length.");
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
sample_count = totalFrames * channels;
|
|
62
|
+
samples.resize(sample_count);
|
|
63
|
+
|
|
64
|
+
ma_uint64 framesRead;
|
|
65
|
+
if (ma_decoder_read_pcm_frames(&decoder, samples.data(), totalFrames, &framesRead) != MA_SUCCESS)
|
|
66
|
+
{
|
|
67
|
+
ma_decoder_uninit(&decoder);
|
|
68
|
+
throw std::runtime_error("RubyDSP: Failed to read PCM data.");
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
ma_decoder_uninit(&decoder);
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
bool save_track(const std::string &outFile, Rice::Symbol format_sym = Rice::Symbol("auto"))
|
|
75
|
+
{
|
|
76
|
+
std::string final_path = outFile;
|
|
77
|
+
std::string format = format_sym.str();
|
|
78
|
+
std::string ext = get_extension(final_path);
|
|
79
|
+
|
|
80
|
+
if (format == "auto")
|
|
81
|
+
{
|
|
82
|
+
if (ext.empty())
|
|
83
|
+
{
|
|
84
|
+
format = "wav";
|
|
85
|
+
final_path += ".wav";
|
|
86
|
+
}
|
|
87
|
+
else
|
|
88
|
+
{
|
|
89
|
+
format = ext;
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
else
|
|
93
|
+
{
|
|
94
|
+
if (ext.empty())
|
|
95
|
+
{
|
|
96
|
+
final_path += "." + format;
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
if (format == "wav")
|
|
101
|
+
{
|
|
102
|
+
if (samples.empty() || channels <= 0 || sample_rate <= 0)
|
|
103
|
+
{
|
|
104
|
+
throw std::runtime_error("RubyDSP: Cannot save an empty or invalid audio track.");
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
ma_encoder_config config = ma_encoder_config_init(
|
|
108
|
+
ma_encoding_format_wav,
|
|
109
|
+
ma_format_f32,
|
|
110
|
+
(ma_uint32)channels,
|
|
111
|
+
(ma_uint32)sample_rate);
|
|
112
|
+
|
|
113
|
+
ma_encoder encoder;
|
|
114
|
+
|
|
115
|
+
ma_result result = ma_encoder_init_file(final_path.c_str(), &config, &encoder);
|
|
116
|
+
|
|
117
|
+
if (result != MA_SUCCESS)
|
|
118
|
+
{
|
|
119
|
+
throw std::runtime_error("RubyDSP: Failed to initialize WAV encoder for: " + final_path);
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
ma_uint64 framesToWrite = samples.size() / channels;
|
|
123
|
+
ma_uint64 framesWritten = 0;
|
|
124
|
+
|
|
125
|
+
result = ma_encoder_write_pcm_frames(&encoder, samples.data(), framesToWrite, &framesWritten);
|
|
126
|
+
|
|
127
|
+
ma_encoder_uninit(&encoder);
|
|
128
|
+
|
|
129
|
+
if (result != MA_SUCCESS)
|
|
130
|
+
{
|
|
131
|
+
throw std::runtime_error("RubyDSP: Failed to write PCM data to: " + final_path);
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
if (framesWritten != framesToWrite)
|
|
135
|
+
{
|
|
136
|
+
throw std::runtime_error("RubyDSP: Incomplete file write to: " + final_path);
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
return true;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
// TODO add unsupported formats
|
|
143
|
+
if (format == "flac" || format == "mp3" || format == "vorbis" || format == "ogg")
|
|
144
|
+
{
|
|
145
|
+
throw std::runtime_error("RubyDSP: " + format + " encoding is not yet supported. Only WAV is available.");
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
throw std::runtime_error("RubyDSP: Unknown format '" + format + "'");
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
float duration()
|
|
152
|
+
{
|
|
153
|
+
return (float)sample_count / (sample_rate * channels);
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
float peak_amplitude()
|
|
157
|
+
{
|
|
158
|
+
float max_val = 0.0f;
|
|
159
|
+
for (const auto &sample : samples)
|
|
160
|
+
{
|
|
161
|
+
max_val = std::max(max_val, std::fabs(sample));
|
|
162
|
+
}
|
|
163
|
+
return max_val;
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
bool to_mono_bang()
|
|
167
|
+
{
|
|
168
|
+
if (is_mono)
|
|
169
|
+
{
|
|
170
|
+
return false; // no-op
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
if (channels < 1)
|
|
174
|
+
{
|
|
175
|
+
throw std::runtime_error("RubyDSP: Wrong number of channels (" + std::to_string(channels) + ")");
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
unsigned long long new_size = sample_count / channels;
|
|
179
|
+
std::vector<float> mono_samples;
|
|
180
|
+
mono_samples.reserve(new_size);
|
|
181
|
+
|
|
182
|
+
// mean calculation pass
|
|
183
|
+
for (unsigned long long i = 0; i < new_size; ++i)
|
|
184
|
+
{
|
|
185
|
+
float sum = 0.0f;
|
|
186
|
+
// frame pass
|
|
187
|
+
for (int c = 0; c < channels; ++c)
|
|
188
|
+
{
|
|
189
|
+
sum += samples[i * channels + c];
|
|
190
|
+
}
|
|
191
|
+
mono_samples.push_back(sum / channels);
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
// replace samples with mono
|
|
195
|
+
samples = std::move(mono_samples);
|
|
196
|
+
channels = 1;
|
|
197
|
+
is_mono = true;
|
|
198
|
+
sample_count = samples.size();
|
|
199
|
+
return true;
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
bool resample_bang(unsigned int target_rate = 0)
|
|
203
|
+
{
|
|
204
|
+
if (target_rate == 0 || target_rate == sample_rate)
|
|
205
|
+
{
|
|
206
|
+
return false; // no-op
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
// TODO: add better, linear will have to do for now
|
|
210
|
+
ma_resampler_config config = ma_resampler_config_init(
|
|
211
|
+
ma_format_f32,
|
|
212
|
+
(ma_uint32)channels,
|
|
213
|
+
(ma_uint32)sample_rate,
|
|
214
|
+
(ma_uint32)target_rate,
|
|
215
|
+
ma_resample_algorithm_linear);
|
|
216
|
+
|
|
217
|
+
ma_resampler resampler;
|
|
218
|
+
if (ma_resampler_init(&config, NULL, &resampler) != MA_SUCCESS)
|
|
219
|
+
{
|
|
220
|
+
throw std::runtime_error("RubyDSP: Failed to initialize resampler.");
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
// Calculate input/output frame counts
|
|
224
|
+
ma_uint64 input_frames = sample_count / channels;
|
|
225
|
+
ma_uint64 expected_output_frames = 0;
|
|
226
|
+
|
|
227
|
+
if (ma_resampler_get_expected_output_frame_count(&resampler, input_frames, &expected_output_frames) != MA_SUCCESS)
|
|
228
|
+
{
|
|
229
|
+
ma_resampler_uninit(&resampler, NULL);
|
|
230
|
+
throw std::runtime_error("RubyDSP: Failed to get expected output frame count.");
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
std::vector<float> resampled_data(expected_output_frames * channels);
|
|
234
|
+
|
|
235
|
+
// Process the audio
|
|
236
|
+
ma_uint64 frames_in = input_frames;
|
|
237
|
+
ma_uint64 frames_out = expected_output_frames;
|
|
238
|
+
|
|
239
|
+
if (ma_resampler_process_pcm_frames(&resampler, samples.data(), &frames_in, resampled_data.data(), &frames_out) != MA_SUCCESS)
|
|
240
|
+
{
|
|
241
|
+
ma_resampler_uninit(&resampler, NULL);
|
|
242
|
+
throw std::runtime_error("RubyDSP: Resampling failed during processing.");
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
ma_resampler_uninit(&resampler, NULL);
|
|
246
|
+
|
|
247
|
+
// Shrink buffer if the resampler output slightly fewer frames than expected
|
|
248
|
+
resampled_data.resize(frames_out * channels);
|
|
249
|
+
|
|
250
|
+
// Update internals
|
|
251
|
+
samples = std::move(resampled_data);
|
|
252
|
+
sample_rate = target_rate;
|
|
253
|
+
sample_count = samples.size();
|
|
254
|
+
|
|
255
|
+
return true;
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
std::vector<float> rms()
|
|
259
|
+
{
|
|
260
|
+
if (samples.empty())
|
|
261
|
+
{
|
|
262
|
+
return {}; // should not happen
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
std::vector<float> result(channels, 0.0f);
|
|
266
|
+
unsigned long long per_channel_samples = sample_count / channels;
|
|
267
|
+
|
|
268
|
+
if (per_channel_samples == 0)
|
|
269
|
+
{
|
|
270
|
+
return result;
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
// Process each channel
|
|
274
|
+
for (int c = 0; c < channels; ++c)
|
|
275
|
+
{
|
|
276
|
+
double sum_sq = 0.0;
|
|
277
|
+
|
|
278
|
+
for (unsigned long long i = 0; i < per_channel_samples; ++i)
|
|
279
|
+
{
|
|
280
|
+
// Access the correct sample in the interleaved array
|
|
281
|
+
float s = samples[i * channels + c];
|
|
282
|
+
sum_sq += s * s;
|
|
283
|
+
}
|
|
284
|
+
|
|
285
|
+
result[c] = (float)std::sqrt(sum_sq / per_channel_samples);
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
return result;
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
std::vector<std::vector<float>> framed_rms(unsigned int frame_length = 2048, unsigned int hop_length = 512)
|
|
292
|
+
{
|
|
293
|
+
if (frame_length == 0 || hop_length == 0 || samples.empty())
|
|
294
|
+
{
|
|
295
|
+
return {};
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
unsigned long long per_channel_samples = sample_count / channels;
|
|
299
|
+
|
|
300
|
+
// Either SUPER SHORT track or SUPER LONG frame_length
|
|
301
|
+
// --> will be less than single full frame per channel
|
|
302
|
+
// --> fallback to rms wrapped to be 2D
|
|
303
|
+
if (per_channel_samples < frame_length)
|
|
304
|
+
{
|
|
305
|
+
std::vector<float> overall_rms = rms();
|
|
306
|
+
|
|
307
|
+
// wrap
|
|
308
|
+
std::vector<std::vector<float>> fallback_result(channels, std::vector<float>(1, 0.0f));
|
|
309
|
+
|
|
310
|
+
for (int c = 0; c < channels; ++c)
|
|
311
|
+
{
|
|
312
|
+
fallback_result[c][0] = overall_rms[c];
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
return fallback_result;
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
// more than single full frame per channel (usual)
|
|
319
|
+
unsigned long long expected_frames = ((per_channel_samples - frame_length) / hop_length) + 1;
|
|
320
|
+
std::vector<std::vector<float>> result(channels, std::vector<float>(expected_frames, 0.0f));
|
|
321
|
+
|
|
322
|
+
for (int c = 0; c < channels; ++c)
|
|
323
|
+
{
|
|
324
|
+
for (unsigned long long i = 0; i < expected_frames; ++i)
|
|
325
|
+
{
|
|
326
|
+
unsigned long long start_sample = (i * hop_length) * channels + c;
|
|
327
|
+
double sum_sq = 0.0;
|
|
328
|
+
|
|
329
|
+
for (unsigned int j = 0; j < frame_length; ++j)
|
|
330
|
+
{
|
|
331
|
+
float s = samples[start_sample + (j * channels)];
|
|
332
|
+
// ^2 to flip all to positive
|
|
333
|
+
sum_sq += s * s;
|
|
334
|
+
}
|
|
335
|
+
|
|
336
|
+
result[c][i] = (float)std::sqrt(sum_sq / frame_length);
|
|
337
|
+
}
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
return result;
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
std::vector<float> zcr()
|
|
344
|
+
{
|
|
345
|
+
if (samples.empty())
|
|
346
|
+
return {};
|
|
347
|
+
|
|
348
|
+
std::vector<float> result(channels, 0.0f);
|
|
349
|
+
unsigned long long per_channel_samples = sample_count / channels;
|
|
350
|
+
|
|
351
|
+
if (per_channel_samples < 2)
|
|
352
|
+
return result;
|
|
353
|
+
|
|
354
|
+
for (int c = 0; c < channels; ++c)
|
|
355
|
+
{
|
|
356
|
+
unsigned int crossings = 0;
|
|
357
|
+
for (unsigned long long j = 1; j < per_channel_samples; ++j)
|
|
358
|
+
{
|
|
359
|
+
float curr = samples[j * channels + c];
|
|
360
|
+
float prev = samples[(j - 1) * channels + c];
|
|
361
|
+
|
|
362
|
+
if ((curr >= 0.0f) != (prev >= 0.0f))
|
|
363
|
+
{
|
|
364
|
+
crossings++;
|
|
365
|
+
}
|
|
366
|
+
}
|
|
367
|
+
result[c] = (float)crossings / per_channel_samples;
|
|
368
|
+
}
|
|
369
|
+
return result;
|
|
370
|
+
}
|
|
371
|
+
|
|
372
|
+
std::vector<std::vector<float>> framed_zcr(unsigned int frame_length = 2048, unsigned int hop_length = 512)
|
|
373
|
+
{
|
|
374
|
+
|
|
375
|
+
if (frame_length == 0 || hop_length == 0 || samples.empty())
|
|
376
|
+
{
|
|
377
|
+
return {};
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
unsigned long long per_channel_samples = sample_count / channels;
|
|
381
|
+
|
|
382
|
+
if (per_channel_samples < frame_length)
|
|
383
|
+
{
|
|
384
|
+
std::vector<float> overall_zcr = zcr();
|
|
385
|
+
|
|
386
|
+
// wrap
|
|
387
|
+
std::vector<std::vector<float>> fallback_result(channels, std::vector<float>(1, 0.0f));
|
|
388
|
+
|
|
389
|
+
for (int c = 0; c < channels; ++c)
|
|
390
|
+
{
|
|
391
|
+
fallback_result[c][0] = overall_zcr[c];
|
|
392
|
+
}
|
|
393
|
+
|
|
394
|
+
return fallback_result;
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
// Calculate number of frames
|
|
398
|
+
unsigned long long expected_frames = ((per_channel_samples - frame_length) / hop_length) + 1;
|
|
399
|
+
|
|
400
|
+
std::vector<std::vector<float>> result(channels, std::vector<float>(expected_frames, 0.0f));
|
|
401
|
+
|
|
402
|
+
for (int c = 0; c < channels; ++c)
|
|
403
|
+
{
|
|
404
|
+
for (unsigned long long i = 0; i < expected_frames; ++i)
|
|
405
|
+
{
|
|
406
|
+
unsigned long long start_sample = (i * hop_length) * channels + c;
|
|
407
|
+
unsigned int crossings = 0;
|
|
408
|
+
|
|
409
|
+
for (unsigned int j = 1; j < frame_length; ++j)
|
|
410
|
+
{
|
|
411
|
+
unsigned long long curr = start_sample + (j * channels);
|
|
412
|
+
unsigned long long prev = start_sample + ((j - 1) * channels);
|
|
413
|
+
|
|
414
|
+
if ((samples[curr] >= 0.0f) != (samples[prev] >= 0.0f))
|
|
415
|
+
{
|
|
416
|
+
crossings++;
|
|
417
|
+
}
|
|
418
|
+
}
|
|
419
|
+
// Normalize
|
|
420
|
+
result[c][i] = (float)crossings / frame_length;
|
|
421
|
+
}
|
|
422
|
+
}
|
|
423
|
+
return result;
|
|
424
|
+
}
|
|
425
|
+
|
|
426
|
+
std::vector<unsigned long long> silence_bounds(float threshold_db = -60.0f, unsigned int frame_length = 2048, unsigned int hop_length = 512)
|
|
427
|
+
{
|
|
428
|
+
if (samples.empty())
|
|
429
|
+
return {0, 0};
|
|
430
|
+
|
|
431
|
+
// Get framed RMS
|
|
432
|
+
std::vector<std::vector<float>> rms_frames = framed_rms(frame_length, hop_length);
|
|
433
|
+
if (rms_frames.empty() || rms_frames[0].empty())
|
|
434
|
+
{
|
|
435
|
+
return {0, sample_count / channels};
|
|
436
|
+
}
|
|
437
|
+
|
|
438
|
+
unsigned long long num_frames = rms_frames[0].size();
|
|
439
|
+
|
|
440
|
+
// Find the global peak RMS across all frames and all channels
|
|
441
|
+
float max_rms = 0.0f;
|
|
442
|
+
for (int c = 0; c < channels; ++c)
|
|
443
|
+
{
|
|
444
|
+
for (unsigned long long i = 0; i < num_frames; ++i)
|
|
445
|
+
{
|
|
446
|
+
if (rms_frames[c][i] > max_rms)
|
|
447
|
+
{
|
|
448
|
+
max_rms = rms_frames[c][i];
|
|
449
|
+
}
|
|
450
|
+
}
|
|
451
|
+
}
|
|
452
|
+
|
|
453
|
+
// Prevent errors on pure silence
|
|
454
|
+
if (max_rms < 1e-10f)
|
|
455
|
+
return {0, 0};
|
|
456
|
+
|
|
457
|
+
// Scan from the left to find the start frame
|
|
458
|
+
unsigned long long start_frame = 0;
|
|
459
|
+
bool found_start = false;
|
|
460
|
+
|
|
461
|
+
for (unsigned long long i = 0; i < num_frames; ++i)
|
|
462
|
+
{
|
|
463
|
+
float frame_max_rms = 0.0f;
|
|
464
|
+
for (int c = 0; c < channels; ++c)
|
|
465
|
+
{
|
|
466
|
+
if (rms_frames[c][i] > frame_max_rms)
|
|
467
|
+
{
|
|
468
|
+
frame_max_rms = rms_frames[c][i];
|
|
469
|
+
}
|
|
470
|
+
}
|
|
471
|
+
|
|
472
|
+
// Convert to decibels relative to the peak RMS
|
|
473
|
+
float db = 20.0f * std::log10((frame_max_rms / max_rms) + 1e-10f);
|
|
474
|
+
|
|
475
|
+
if (db > threshold_db)
|
|
476
|
+
{
|
|
477
|
+
start_frame = i;
|
|
478
|
+
found_start = true;
|
|
479
|
+
break;
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
|
|
483
|
+
// Scan from the right to find the end frame
|
|
484
|
+
unsigned long long end_frame = num_frames > 0 ? num_frames - 1 : 0;
|
|
485
|
+
if (found_start)
|
|
486
|
+
{
|
|
487
|
+
for (long long i = num_frames - 1; i >= 0; --i)
|
|
488
|
+
{
|
|
489
|
+
float frame_max_rms = 0.0f;
|
|
490
|
+
for (int c = 0; c < channels; ++c)
|
|
491
|
+
{
|
|
492
|
+
if (rms_frames[c][i] > frame_max_rms)
|
|
493
|
+
{
|
|
494
|
+
frame_max_rms = rms_frames[c][i];
|
|
495
|
+
}
|
|
496
|
+
}
|
|
497
|
+
|
|
498
|
+
float db = 20.0f * std::log10((frame_max_rms / max_rms) + 1e-10f);
|
|
499
|
+
|
|
500
|
+
if (db > threshold_db)
|
|
501
|
+
{
|
|
502
|
+
end_frame = i;
|
|
503
|
+
break;
|
|
504
|
+
}
|
|
505
|
+
}
|
|
506
|
+
}
|
|
507
|
+
else
|
|
508
|
+
{
|
|
509
|
+
return {0, 0}; // Track was entirely below threshold
|
|
510
|
+
}
|
|
511
|
+
|
|
512
|
+
// Convert frame indices back to sample indices
|
|
513
|
+
unsigned long long start_sample = start_frame * hop_length;
|
|
514
|
+
unsigned long long end_sample = end_frame * hop_length + frame_length;
|
|
515
|
+
|
|
516
|
+
unsigned long long per_channel_samples = sample_count / channels;
|
|
517
|
+
|
|
518
|
+
if (start_frame == 0)
|
|
519
|
+
{
|
|
520
|
+
start_sample = 0;
|
|
521
|
+
}
|
|
522
|
+
|
|
523
|
+
if (end_frame == num_frames - 1)
|
|
524
|
+
{
|
|
525
|
+
end_sample = per_channel_samples;
|
|
526
|
+
}
|
|
527
|
+
else if (end_sample > per_channel_samples)
|
|
528
|
+
{
|
|
529
|
+
end_sample = per_channel_samples;
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
return {start_sample, end_sample};
|
|
533
|
+
}
|
|
534
|
+
|
|
535
|
+
bool trim_silence_bang(float threshold_db = -60.0f, unsigned int frame_length = 2048, unsigned int hop_length = 512)
|
|
536
|
+
{
|
|
537
|
+
if (samples.empty())
|
|
538
|
+
return false;
|
|
539
|
+
|
|
540
|
+
std::vector<unsigned long long> bounds = silence_bounds(threshold_db, frame_length, hop_length);
|
|
541
|
+
unsigned long long start_sample = bounds[0];
|
|
542
|
+
unsigned long long end_sample = bounds[1];
|
|
543
|
+
|
|
544
|
+
unsigned long long per_channel_samples = sample_count / channels;
|
|
545
|
+
|
|
546
|
+
// No-op checks
|
|
547
|
+
if (start_sample == 0 && end_sample >= per_channel_samples)
|
|
548
|
+
return false;
|
|
549
|
+
|
|
550
|
+
// If the file is entirely silent, clear everything
|
|
551
|
+
if (start_sample == 0 && end_sample == 0)
|
|
552
|
+
{
|
|
553
|
+
samples.clear();
|
|
554
|
+
sample_count = 0;
|
|
555
|
+
return true;
|
|
556
|
+
}
|
|
557
|
+
|
|
558
|
+
// Slice the interleaved sample array
|
|
559
|
+
unsigned long long start_idx = start_sample * channels;
|
|
560
|
+
unsigned long long end_idx = end_sample * channels;
|
|
561
|
+
|
|
562
|
+
std::vector<float> trimmed_samples(samples.begin() + start_idx, samples.begin() + end_idx);
|
|
563
|
+
samples = std::move(trimmed_samples);
|
|
564
|
+
sample_count = samples.size();
|
|
565
|
+
|
|
566
|
+
return true;
|
|
567
|
+
}
|
|
568
|
+
|
|
569
|
+
std::string to_s()
|
|
570
|
+
{
|
|
571
|
+
std::ostringstream stream;
|
|
572
|
+
stream << "['" << filename << "', "
|
|
573
|
+
<< std::fixed << std::setprecision(3) << duration() << "s duration, "
|
|
574
|
+
<< channels << " channel(s), "
|
|
575
|
+
<< sample_rate << "Hz sample rate]";
|
|
576
|
+
return stream.str();
|
|
577
|
+
}
|
|
578
|
+
};
|
|
579
|
+
|
|
580
|
+
extern "C"
|
|
581
|
+
#if defined(_WIN32)
|
|
582
|
+
__declspec(dllexport)
|
|
583
|
+
#else
|
|
584
|
+
__attribute__((visibility("default")))
|
|
585
|
+
#endif
|
|
586
|
+
void Init_ruby_dsp()
|
|
587
|
+
{
|
|
588
|
+
Module rb_mRubyDSP = define_module("RubyDSP");
|
|
589
|
+
Data_Type<AudioTrack> rb_cAudioTrack = define_class_under<AudioTrack>(rb_mRubyDSP, "AudioTrack")
|
|
590
|
+
.define_constructor(Constructor<AudioTrack, std::string, unsigned int, unsigned int>(),
|
|
591
|
+
Arg("file_name") = (std::string) "default.wav",
|
|
592
|
+
Arg("target_channels") = (unsigned int)0,
|
|
593
|
+
Arg("target_sample_rate") = (unsigned int)0)
|
|
594
|
+
// attributes
|
|
595
|
+
.define_attr("file_name", &AudioTrack::filename, Rice::AttrAccess::Read)
|
|
596
|
+
.define_attr("channels", &AudioTrack::channels, Rice::AttrAccess::Read)
|
|
597
|
+
.define_attr("samples", &AudioTrack::samples, Rice::AttrAccess::Read)
|
|
598
|
+
.define_attr("sample_count", &AudioTrack::sample_count, Rice::AttrAccess::Read)
|
|
599
|
+
.define_attr("sample_rate", &AudioTrack::sample_rate, Rice::AttrAccess::Read)
|
|
600
|
+
.define_attr("is_mono?", &AudioTrack::is_mono, Rice::AttrAccess::Read)
|
|
601
|
+
// methods
|
|
602
|
+
.define_method("duration", &AudioTrack::duration)
|
|
603
|
+
.define_method("peak_amp", &AudioTrack::peak_amplitude)
|
|
604
|
+
.define_method("to_mono!", &AudioTrack::to_mono_bang)
|
|
605
|
+
.define_method("resample!", &AudioTrack::resample_bang,
|
|
606
|
+
Arg("target_rate") = (unsigned int)0)
|
|
607
|
+
.define_method("rms", &AudioTrack::rms)
|
|
608
|
+
.define_method("framed_rms", &AudioTrack::framed_rms,
|
|
609
|
+
Arg("frame_length") = (unsigned int)2048,
|
|
610
|
+
Arg("hop_length") = (unsigned int)512)
|
|
611
|
+
.define_method("zcr", &AudioTrack::zcr)
|
|
612
|
+
.define_method("framed_zcr", &AudioTrack::framed_zcr,
|
|
613
|
+
Arg("frame_length") = (unsigned int)2048,
|
|
614
|
+
Arg("hop_length") = (unsigned int)512)
|
|
615
|
+
.define_method("silence_bounds", &AudioTrack::silence_bounds,
|
|
616
|
+
Arg("threshold_db") = -60.0f,
|
|
617
|
+
Arg("frame_length") = (unsigned int)2048,
|
|
618
|
+
Arg("hop_length") = (unsigned int)512)
|
|
619
|
+
.define_method("trim_silence!", &AudioTrack::trim_silence_bang,
|
|
620
|
+
Arg("threshold_db") = -60.0f,
|
|
621
|
+
Arg("frame_length") = (unsigned int)2048,
|
|
622
|
+
Arg("hop_length") = (unsigned int)512)
|
|
623
|
+
.define_method("save_track", &AudioTrack::save_track,
|
|
624
|
+
Arg("out_file"), // (no default -- duh)
|
|
625
|
+
Arg("format") = Symbol("auto"))
|
|
626
|
+
.define_method("to_s", &AudioTrack::to_s);
|
|
627
|
+
}
|