ruby_dsp 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,627 @@
1
+ #include <rice/rice.hpp>
2
+ #include <rice/stl.hpp>
3
+
4
+ #include <string>
5
+ #include <stdexcept>
6
+ #include <cmath>
7
+ #include <sstream>
8
+ #include <iomanip>
9
+ #include <algorithm>
10
+
11
+ #define MINIAUDIO_IMPLEMENTATION
12
+ #include "vendor/miniaudio.h"
13
+
14
+ using namespace Rice;
15
+
16
// Returns the lower-cased extension of `filename` without the leading dot,
// or an empty string when the final path component has no extension.
//
// Only the last path component is inspected: a dot that belongs to a
// directory name (e.g. "my.dir/output") is not treated as an extension
// separator. Both '/' and '\\' are accepted as path separators.
std::string get_extension(const std::string &filename)
{
    const size_t dot_pos = filename.find_last_of('.');
    if (dot_pos == std::string::npos)
        return ""; // No dot anywhere in the path

    // A separator after the last dot means the dot is part of a directory name.
    const size_t sep_pos = filename.find_last_of("/\\");
    if (sep_pos != std::string::npos && sep_pos > dot_pos)
        return "";

    std::string ext = filename.substr(dot_pos + 1);

    // tolower() requires a value representable as unsigned char; passing a
    // negative plain char is undefined behaviour, so convert first.
    std::transform(ext.begin(), ext.end(), ext.begin(),
                   [](unsigned char ch)
                   { return static_cast<char>(::tolower(ch)); });
    return ext;
}
26
+
27
+ struct AudioTrack
28
+ {
29
+ std::string filename;
30
+ int sample_rate = -1;
31
+ int channels = -1;
32
+ bool is_mono = false;
33
+ std::vector<float> samples;
34
+ unsigned long long sample_count = 0;
35
+
36
+ AudioTrack(std::string f, unsigned int target_channels = 0, unsigned int target_sample_rate = 0) : filename(f)
37
+ {
38
+ ma_decoder decoder;
39
+ ma_result result;
40
+
41
+ ma_decoder_config config = ma_decoder_config_init(ma_format_f32, (ma_uint32)target_channels, (ma_uint32)target_sample_rate);
42
+
43
+ result = ma_decoder_init_file(filename.c_str(), &config, &decoder);
44
+
45
+ if (result != MA_SUCCESS)
46
+ {
47
+ throw std::runtime_error("RubyDSP: Could not process audio file: " + filename);
48
+ }
49
+
50
+ sample_rate = decoder.outputSampleRate;
51
+ channels = decoder.outputChannels;
52
+ is_mono = (channels == 1);
53
+
54
+ ma_uint64 totalFrames;
55
+ if (ma_decoder_get_length_in_pcm_frames(&decoder, &totalFrames) != MA_SUCCESS)
56
+ {
57
+ ma_decoder_uninit(&decoder);
58
+ throw std::runtime_error("RubyDSP: Could not determine track length.");
59
+ }
60
+
61
+ sample_count = totalFrames * channels;
62
+ samples.resize(sample_count);
63
+
64
+ ma_uint64 framesRead;
65
+ if (ma_decoder_read_pcm_frames(&decoder, samples.data(), totalFrames, &framesRead) != MA_SUCCESS)
66
+ {
67
+ ma_decoder_uninit(&decoder);
68
+ throw std::runtime_error("RubyDSP: Failed to read PCM data.");
69
+ }
70
+
71
+ ma_decoder_uninit(&decoder);
72
+ }
73
+
74
+ bool save_track(const std::string &outFile, Rice::Symbol format_sym = Rice::Symbol("auto"))
75
+ {
76
+ std::string final_path = outFile;
77
+ std::string format = format_sym.str();
78
+ std::string ext = get_extension(final_path);
79
+
80
+ if (format == "auto")
81
+ {
82
+ if (ext.empty())
83
+ {
84
+ format = "wav";
85
+ final_path += ".wav";
86
+ }
87
+ else
88
+ {
89
+ format = ext;
90
+ }
91
+ }
92
+ else
93
+ {
94
+ if (ext.empty())
95
+ {
96
+ final_path += "." + format;
97
+ }
98
+ }
99
+
100
+ if (format == "wav")
101
+ {
102
+ if (samples.empty() || channels <= 0 || sample_rate <= 0)
103
+ {
104
+ throw std::runtime_error("RubyDSP: Cannot save an empty or invalid audio track.");
105
+ }
106
+
107
+ ma_encoder_config config = ma_encoder_config_init(
108
+ ma_encoding_format_wav,
109
+ ma_format_f32,
110
+ (ma_uint32)channels,
111
+ (ma_uint32)sample_rate);
112
+
113
+ ma_encoder encoder;
114
+
115
+ ma_result result = ma_encoder_init_file(final_path.c_str(), &config, &encoder);
116
+
117
+ if (result != MA_SUCCESS)
118
+ {
119
+ throw std::runtime_error("RubyDSP: Failed to initialize WAV encoder for: " + final_path);
120
+ }
121
+
122
+ ma_uint64 framesToWrite = samples.size() / channels;
123
+ ma_uint64 framesWritten = 0;
124
+
125
+ result = ma_encoder_write_pcm_frames(&encoder, samples.data(), framesToWrite, &framesWritten);
126
+
127
+ ma_encoder_uninit(&encoder);
128
+
129
+ if (result != MA_SUCCESS)
130
+ {
131
+ throw std::runtime_error("RubyDSP: Failed to write PCM data to: " + final_path);
132
+ }
133
+
134
+ if (framesWritten != framesToWrite)
135
+ {
136
+ throw std::runtime_error("RubyDSP: Incomplete file write to: " + final_path);
137
+ }
138
+
139
+ return true;
140
+ }
141
+
142
+ // TODO add unsupported formats
143
+ if (format == "flac" || format == "mp3" || format == "vorbis" || format == "ogg")
144
+ {
145
+ throw std::runtime_error("RubyDSP: " + format + " encoding is not yet supported. Only WAV is available.");
146
+ }
147
+
148
+ throw std::runtime_error("RubyDSP: Unknown format '" + format + "'");
149
+ }
150
+
151
+ float duration()
152
+ {
153
+ return (float)sample_count / (sample_rate * channels);
154
+ }
155
+
156
+ float peak_amplitude()
157
+ {
158
+ float max_val = 0.0f;
159
+ for (const auto &sample : samples)
160
+ {
161
+ max_val = std::max(max_val, std::fabs(sample));
162
+ }
163
+ return max_val;
164
+ }
165
+
166
+ bool to_mono_bang()
167
+ {
168
+ if (is_mono)
169
+ {
170
+ return false; // no-op
171
+ }
172
+
173
+ if (channels < 1)
174
+ {
175
+ throw std::runtime_error("RubyDSP: Wrong number of channels (" + std::to_string(channels) + ")");
176
+ }
177
+
178
+ unsigned long long new_size = sample_count / channels;
179
+ std::vector<float> mono_samples;
180
+ mono_samples.reserve(new_size);
181
+
182
+ // mean calculation pass
183
+ for (unsigned long long i = 0; i < new_size; ++i)
184
+ {
185
+ float sum = 0.0f;
186
+ // frame pass
187
+ for (int c = 0; c < channels; ++c)
188
+ {
189
+ sum += samples[i * channels + c];
190
+ }
191
+ mono_samples.push_back(sum / channels);
192
+ }
193
+
194
+ // replace samples with mono
195
+ samples = std::move(mono_samples);
196
+ channels = 1;
197
+ is_mono = true;
198
+ sample_count = samples.size();
199
+ return true;
200
+ }
201
+
202
+ bool resample_bang(unsigned int target_rate = 0)
203
+ {
204
+ if (target_rate == 0 || target_rate == sample_rate)
205
+ {
206
+ return false; // no-op
207
+ }
208
+
209
+ // TODO: add better, linear will have to do for now
210
+ ma_resampler_config config = ma_resampler_config_init(
211
+ ma_format_f32,
212
+ (ma_uint32)channels,
213
+ (ma_uint32)sample_rate,
214
+ (ma_uint32)target_rate,
215
+ ma_resample_algorithm_linear);
216
+
217
+ ma_resampler resampler;
218
+ if (ma_resampler_init(&config, NULL, &resampler) != MA_SUCCESS)
219
+ {
220
+ throw std::runtime_error("RubyDSP: Failed to initialize resampler.");
221
+ }
222
+
223
+ // Calculate input/output frame counts
224
+ ma_uint64 input_frames = sample_count / channels;
225
+ ma_uint64 expected_output_frames = 0;
226
+
227
+ if (ma_resampler_get_expected_output_frame_count(&resampler, input_frames, &expected_output_frames) != MA_SUCCESS)
228
+ {
229
+ ma_resampler_uninit(&resampler, NULL);
230
+ throw std::runtime_error("RubyDSP: Failed to get expected output frame count.");
231
+ }
232
+
233
+ std::vector<float> resampled_data(expected_output_frames * channels);
234
+
235
+ // Process the audio
236
+ ma_uint64 frames_in = input_frames;
237
+ ma_uint64 frames_out = expected_output_frames;
238
+
239
+ if (ma_resampler_process_pcm_frames(&resampler, samples.data(), &frames_in, resampled_data.data(), &frames_out) != MA_SUCCESS)
240
+ {
241
+ ma_resampler_uninit(&resampler, NULL);
242
+ throw std::runtime_error("RubyDSP: Resampling failed during processing.");
243
+ }
244
+
245
+ ma_resampler_uninit(&resampler, NULL);
246
+
247
+ // Shrink buffer if the resampler output slightly fewer frames than expected
248
+ resampled_data.resize(frames_out * channels);
249
+
250
+ // Update internals
251
+ samples = std::move(resampled_data);
252
+ sample_rate = target_rate;
253
+ sample_count = samples.size();
254
+
255
+ return true;
256
+ }
257
+
258
+ std::vector<float> rms()
259
+ {
260
+ if (samples.empty())
261
+ {
262
+ return {}; // should not happen
263
+ }
264
+
265
+ std::vector<float> result(channels, 0.0f);
266
+ unsigned long long per_channel_samples = sample_count / channels;
267
+
268
+ if (per_channel_samples == 0)
269
+ {
270
+ return result;
271
+ }
272
+
273
+ // Process each channel
274
+ for (int c = 0; c < channels; ++c)
275
+ {
276
+ double sum_sq = 0.0;
277
+
278
+ for (unsigned long long i = 0; i < per_channel_samples; ++i)
279
+ {
280
+ // Access the correct sample in the interleaved array
281
+ float s = samples[i * channels + c];
282
+ sum_sq += s * s;
283
+ }
284
+
285
+ result[c] = (float)std::sqrt(sum_sq / per_channel_samples);
286
+ }
287
+
288
+ return result;
289
+ }
290
+
291
+ std::vector<std::vector<float>> framed_rms(unsigned int frame_length = 2048, unsigned int hop_length = 512)
292
+ {
293
+ if (frame_length == 0 || hop_length == 0 || samples.empty())
294
+ {
295
+ return {};
296
+ }
297
+
298
+ unsigned long long per_channel_samples = sample_count / channels;
299
+
300
+ // Either SUPER SHORT track or SUPER LONG frame_length
301
+ // --> will be less than single full frame per channel
302
+ // --> fallback to rms wrapped to be 2D
303
+ if (per_channel_samples < frame_length)
304
+ {
305
+ std::vector<float> overall_rms = rms();
306
+
307
+ // wrap
308
+ std::vector<std::vector<float>> fallback_result(channels, std::vector<float>(1, 0.0f));
309
+
310
+ for (int c = 0; c < channels; ++c)
311
+ {
312
+ fallback_result[c][0] = overall_rms[c];
313
+ }
314
+
315
+ return fallback_result;
316
+ }
317
+
318
+ // more than single full frame per channel (usual)
319
+ unsigned long long expected_frames = ((per_channel_samples - frame_length) / hop_length) + 1;
320
+ std::vector<std::vector<float>> result(channels, std::vector<float>(expected_frames, 0.0f));
321
+
322
+ for (int c = 0; c < channels; ++c)
323
+ {
324
+ for (unsigned long long i = 0; i < expected_frames; ++i)
325
+ {
326
+ unsigned long long start_sample = (i * hop_length) * channels + c;
327
+ double sum_sq = 0.0;
328
+
329
+ for (unsigned int j = 0; j < frame_length; ++j)
330
+ {
331
+ float s = samples[start_sample + (j * channels)];
332
+ // ^2 to flip all to positive
333
+ sum_sq += s * s;
334
+ }
335
+
336
+ result[c][i] = (float)std::sqrt(sum_sq / frame_length);
337
+ }
338
+ }
339
+
340
+ return result;
341
+ }
342
+
343
+ std::vector<float> zcr()
344
+ {
345
+ if (samples.empty())
346
+ return {};
347
+
348
+ std::vector<float> result(channels, 0.0f);
349
+ unsigned long long per_channel_samples = sample_count / channels;
350
+
351
+ if (per_channel_samples < 2)
352
+ return result;
353
+
354
+ for (int c = 0; c < channels; ++c)
355
+ {
356
+ unsigned int crossings = 0;
357
+ for (unsigned long long j = 1; j < per_channel_samples; ++j)
358
+ {
359
+ float curr = samples[j * channels + c];
360
+ float prev = samples[(j - 1) * channels + c];
361
+
362
+ if ((curr >= 0.0f) != (prev >= 0.0f))
363
+ {
364
+ crossings++;
365
+ }
366
+ }
367
+ result[c] = (float)crossings / per_channel_samples;
368
+ }
369
+ return result;
370
+ }
371
+
372
+ std::vector<std::vector<float>> framed_zcr(unsigned int frame_length = 2048, unsigned int hop_length = 512)
373
+ {
374
+
375
+ if (frame_length == 0 || hop_length == 0 || samples.empty())
376
+ {
377
+ return {};
378
+ }
379
+
380
+ unsigned long long per_channel_samples = sample_count / channels;
381
+
382
+ if (per_channel_samples < frame_length)
383
+ {
384
+ std::vector<float> overall_zcr = zcr();
385
+
386
+ // wrap
387
+ std::vector<std::vector<float>> fallback_result(channels, std::vector<float>(1, 0.0f));
388
+
389
+ for (int c = 0; c < channels; ++c)
390
+ {
391
+ fallback_result[c][0] = overall_zcr[c];
392
+ }
393
+
394
+ return fallback_result;
395
+ }
396
+
397
+ // Calculate number of frames
398
+ unsigned long long expected_frames = ((per_channel_samples - frame_length) / hop_length) + 1;
399
+
400
+ std::vector<std::vector<float>> result(channels, std::vector<float>(expected_frames, 0.0f));
401
+
402
+ for (int c = 0; c < channels; ++c)
403
+ {
404
+ for (unsigned long long i = 0; i < expected_frames; ++i)
405
+ {
406
+ unsigned long long start_sample = (i * hop_length) * channels + c;
407
+ unsigned int crossings = 0;
408
+
409
+ for (unsigned int j = 1; j < frame_length; ++j)
410
+ {
411
+ unsigned long long curr = start_sample + (j * channels);
412
+ unsigned long long prev = start_sample + ((j - 1) * channels);
413
+
414
+ if ((samples[curr] >= 0.0f) != (samples[prev] >= 0.0f))
415
+ {
416
+ crossings++;
417
+ }
418
+ }
419
+ // Normalize
420
+ result[c][i] = (float)crossings / frame_length;
421
+ }
422
+ }
423
+ return result;
424
+ }
425
+
426
+ std::vector<unsigned long long> silence_bounds(float threshold_db = -60.0f, unsigned int frame_length = 2048, unsigned int hop_length = 512)
427
+ {
428
+ if (samples.empty())
429
+ return {0, 0};
430
+
431
+ // Get framed RMS
432
+ std::vector<std::vector<float>> rms_frames = framed_rms(frame_length, hop_length);
433
+ if (rms_frames.empty() || rms_frames[0].empty())
434
+ {
435
+ return {0, sample_count / channels};
436
+ }
437
+
438
+ unsigned long long num_frames = rms_frames[0].size();
439
+
440
+ // Find the global peak RMS across all frames and all channels
441
+ float max_rms = 0.0f;
442
+ for (int c = 0; c < channels; ++c)
443
+ {
444
+ for (unsigned long long i = 0; i < num_frames; ++i)
445
+ {
446
+ if (rms_frames[c][i] > max_rms)
447
+ {
448
+ max_rms = rms_frames[c][i];
449
+ }
450
+ }
451
+ }
452
+
453
+ // Prevent errors on pure silence
454
+ if (max_rms < 1e-10f)
455
+ return {0, 0};
456
+
457
+ // Scan from the left to find the start frame
458
+ unsigned long long start_frame = 0;
459
+ bool found_start = false;
460
+
461
+ for (unsigned long long i = 0; i < num_frames; ++i)
462
+ {
463
+ float frame_max_rms = 0.0f;
464
+ for (int c = 0; c < channels; ++c)
465
+ {
466
+ if (rms_frames[c][i] > frame_max_rms)
467
+ {
468
+ frame_max_rms = rms_frames[c][i];
469
+ }
470
+ }
471
+
472
+ // Convert to decibels relative to the peak RMS
473
+ float db = 20.0f * std::log10((frame_max_rms / max_rms) + 1e-10f);
474
+
475
+ if (db > threshold_db)
476
+ {
477
+ start_frame = i;
478
+ found_start = true;
479
+ break;
480
+ }
481
+ }
482
+
483
+ // Scan from the right to find the end frame
484
+ unsigned long long end_frame = num_frames > 0 ? num_frames - 1 : 0;
485
+ if (found_start)
486
+ {
487
+ for (long long i = num_frames - 1; i >= 0; --i)
488
+ {
489
+ float frame_max_rms = 0.0f;
490
+ for (int c = 0; c < channels; ++c)
491
+ {
492
+ if (rms_frames[c][i] > frame_max_rms)
493
+ {
494
+ frame_max_rms = rms_frames[c][i];
495
+ }
496
+ }
497
+
498
+ float db = 20.0f * std::log10((frame_max_rms / max_rms) + 1e-10f);
499
+
500
+ if (db > threshold_db)
501
+ {
502
+ end_frame = i;
503
+ break;
504
+ }
505
+ }
506
+ }
507
+ else
508
+ {
509
+ return {0, 0}; // Track was entirely below threshold
510
+ }
511
+
512
+ // Convert frame indices back to sample indices
513
+ unsigned long long start_sample = start_frame * hop_length;
514
+ unsigned long long end_sample = end_frame * hop_length + frame_length;
515
+
516
+ unsigned long long per_channel_samples = sample_count / channels;
517
+
518
+ if (start_frame == 0)
519
+ {
520
+ start_sample = 0;
521
+ }
522
+
523
+ if (end_frame == num_frames - 1)
524
+ {
525
+ end_sample = per_channel_samples;
526
+ }
527
+ else if (end_sample > per_channel_samples)
528
+ {
529
+ end_sample = per_channel_samples;
530
+ }
531
+
532
+ return {start_sample, end_sample};
533
+ }
534
+
535
+ bool trim_silence_bang(float threshold_db = -60.0f, unsigned int frame_length = 2048, unsigned int hop_length = 512)
536
+ {
537
+ if (samples.empty())
538
+ return false;
539
+
540
+ std::vector<unsigned long long> bounds = silence_bounds(threshold_db, frame_length, hop_length);
541
+ unsigned long long start_sample = bounds[0];
542
+ unsigned long long end_sample = bounds[1];
543
+
544
+ unsigned long long per_channel_samples = sample_count / channels;
545
+
546
+ // No-op checks
547
+ if (start_sample == 0 && end_sample >= per_channel_samples)
548
+ return false;
549
+
550
+ // If the file is entirely silent, clear everything
551
+ if (start_sample == 0 && end_sample == 0)
552
+ {
553
+ samples.clear();
554
+ sample_count = 0;
555
+ return true;
556
+ }
557
+
558
+ // Slice the interleaved sample array
559
+ unsigned long long start_idx = start_sample * channels;
560
+ unsigned long long end_idx = end_sample * channels;
561
+
562
+ std::vector<float> trimmed_samples(samples.begin() + start_idx, samples.begin() + end_idx);
563
+ samples = std::move(trimmed_samples);
564
+ sample_count = samples.size();
565
+
566
+ return true;
567
+ }
568
+
569
+ std::string to_s()
570
+ {
571
+ std::ostringstream stream;
572
+ stream << "['" << filename << "', "
573
+ << std::fixed << std::setprecision(3) << duration() << "s duration, "
574
+ << channels << " channel(s), "
575
+ << sample_rate << "Hz sample rate]";
576
+ return stream.str();
577
+ }
578
+ };
579
+
580
+ extern "C"
581
+ #if defined(_WIN32)
582
+ __declspec(dllexport)
583
+ #else
584
+ __attribute__((visibility("default")))
585
+ #endif
586
+ void Init_ruby_dsp()
587
+ {
588
+ Module rb_mRubyDSP = define_module("RubyDSP");
589
+ Data_Type<AudioTrack> rb_cAudioTrack = define_class_under<AudioTrack>(rb_mRubyDSP, "AudioTrack")
590
+ .define_constructor(Constructor<AudioTrack, std::string, unsigned int, unsigned int>(),
591
+ Arg("file_name") = (std::string) "default.wav",
592
+ Arg("target_channels") = (unsigned int)0,
593
+ Arg("target_sample_rate") = (unsigned int)0)
594
+ // attributes
595
+ .define_attr("file_name", &AudioTrack::filename, Rice::AttrAccess::Read)
596
+ .define_attr("channels", &AudioTrack::channels, Rice::AttrAccess::Read)
597
+ .define_attr("samples", &AudioTrack::samples, Rice::AttrAccess::Read)
598
+ .define_attr("sample_count", &AudioTrack::sample_count, Rice::AttrAccess::Read)
599
+ .define_attr("sample_rate", &AudioTrack::sample_rate, Rice::AttrAccess::Read)
600
+ .define_attr("is_mono?", &AudioTrack::is_mono, Rice::AttrAccess::Read)
601
+ // methods
602
+ .define_method("duration", &AudioTrack::duration)
603
+ .define_method("peak_amp", &AudioTrack::peak_amplitude)
604
+ .define_method("to_mono!", &AudioTrack::to_mono_bang)
605
+ .define_method("resample!", &AudioTrack::resample_bang,
606
+ Arg("target_rate") = (unsigned int)0)
607
+ .define_method("rms", &AudioTrack::rms)
608
+ .define_method("framed_rms", &AudioTrack::framed_rms,
609
+ Arg("frame_length") = (unsigned int)2048,
610
+ Arg("hop_length") = (unsigned int)512)
611
+ .define_method("zcr", &AudioTrack::zcr)
612
+ .define_method("framed_zcr", &AudioTrack::framed_zcr,
613
+ Arg("frame_length") = (unsigned int)2048,
614
+ Arg("hop_length") = (unsigned int)512)
615
+ .define_method("silence_bounds", &AudioTrack::silence_bounds,
616
+ Arg("threshold_db") = -60.0f,
617
+ Arg("frame_length") = (unsigned int)2048,
618
+ Arg("hop_length") = (unsigned int)512)
619
+ .define_method("trim_silence!", &AudioTrack::trim_silence_bang,
620
+ Arg("threshold_db") = -60.0f,
621
+ Arg("frame_length") = (unsigned int)2048,
622
+ Arg("hop_length") = (unsigned int)512)
623
+ .define_method("save_track", &AudioTrack::save_track,
624
+ Arg("out_file"), // (no default -- duh)
625
+ Arg("format") = Symbol("auto"))
626
+ .define_method("to_s", &AudioTrack::to_s);
627
+ }