react-native-audio-api 0.7.2-nightly-c06331b-20250824 → 0.7.2-nightly-799cc6b-20250825
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/common/cpp/audioapi/core/sources/AudioBufferQueueSourceNode.cpp +1 -1
- package/common/cpp/audioapi/core/sources/AudioBufferSourceNode.cpp +1 -1
- package/common/cpp/audioapi/libs/signalsmith-stretch/fft-pffft.h +222 -0
- package/common/cpp/audioapi/libs/signalsmith-stretch/fft.h +116 -100
- package/common/cpp/audioapi/libs/signalsmith-stretch/signalsmith-stretch.h +299 -41
- package/common/cpp/audioapi/libs/signalsmith-stretch/stft.h +89 -83
- package/package.json +1 -1
- package/common/cpp/audioapi/libs/signalsmith-stretch/fft-accelerate.h +0 -326
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
#ifndef SIGNALSMITH_STRETCH_H
|
|
2
2
|
#define SIGNALSMITH_STRETCH_H
|
|
3
3
|
|
|
4
|
-
#include <audioapi/libs/signalsmith-stretch/stft.h>
|
|
4
|
+
#include <audioapi/libs/signalsmith-stretch/stft.h> // https://github.com/Signalsmith-Audio/linear
|
|
5
|
+
|
|
5
6
|
#include <vector>
|
|
7
|
+
#include <array>
|
|
6
8
|
#include <algorithm>
|
|
7
9
|
#include <functional>
|
|
8
10
|
#include <random>
|
|
11
|
+
#include <limits>
|
|
9
12
|
#include <type_traits>
|
|
10
13
|
|
|
11
14
|
namespace signalsmith { namespace stretch {
|
|
@@ -30,17 +33,12 @@ namespace _impl {
|
|
|
30
33
|
|
|
31
34
|
template<typename Sample=float, class RandomEngine=void>
|
|
32
35
|
struct SignalsmithStretch {
|
|
33
|
-
static constexpr size_t version[3] = {1,
|
|
36
|
+
static constexpr size_t version[3] = {1, 3, 2};
|
|
34
37
|
|
|
35
38
|
SignalsmithStretch() : randomEngine(std::random_device{}()) {}
|
|
36
39
|
SignalsmithStretch(long seed) : randomEngine(seed) {}
|
|
37
40
|
|
|
38
|
-
|
|
39
|
-
return int(stft.blockSamples());
|
|
40
|
-
}
|
|
41
|
-
int intervalSamples() const {
|
|
42
|
-
return int(stft.defaultInterval());
|
|
43
|
-
}
|
|
41
|
+
// The difference between the internal position (centre of a block) and the input samples you're supplying
|
|
44
42
|
int inputLatency() const {
|
|
45
43
|
return int(stft.analysisLatency());
|
|
46
44
|
}
|
|
@@ -57,8 +55,8 @@ struct SignalsmithStretch {
|
|
|
57
55
|
channelBands.assign(channelBands.size(), Band());
|
|
58
56
|
silenceCounter = 0;
|
|
59
57
|
didSeek = false;
|
|
60
|
-
|
|
61
58
|
blockProcess = {};
|
|
59
|
+
freqEstimateWeighted = freqEstimateWeight = 0;
|
|
62
60
|
}
|
|
63
61
|
|
|
64
62
|
// Configures using a default preset
|
|
@@ -78,7 +76,6 @@ struct SignalsmithStretch {
|
|
|
78
76
|
stft.reset(0.1);
|
|
79
77
|
stashedInput = stft.input;
|
|
80
78
|
stashedOutput = stft.output;
|
|
81
|
-
tmpBuffer.resize(blockSamples + intervalSamples);
|
|
82
79
|
|
|
83
80
|
bands = int(stft.bands());
|
|
84
81
|
channelBands.assign(bands*channels, Band());
|
|
@@ -90,6 +87,20 @@ struct SignalsmithStretch {
|
|
|
90
87
|
channelPredictions.resize(channels*bands);
|
|
91
88
|
|
|
92
89
|
blockProcess = {};
|
|
90
|
+
formantMetric.resize(bands + 2);
|
|
91
|
+
|
|
92
|
+
tmpProcessBuffer.resize(blockSamples + intervalSamples);
|
|
93
|
+
tmpPreRollBuffer.resize(outputLatency()*channels);
|
|
94
|
+
}
|
|
95
|
+
// For querying the existing config
|
|
96
|
+
int blockSamples() const {
|
|
97
|
+
return int(stft.blockSamples());
|
|
98
|
+
}
|
|
99
|
+
int intervalSamples() const {
|
|
100
|
+
return int(stft.defaultInterval());
|
|
101
|
+
}
|
|
102
|
+
bool splitComputation() const {
|
|
103
|
+
return _splitComputation;
|
|
93
104
|
}
|
|
94
105
|
|
|
95
106
|
/// Frequency multiplier, and optional tonality limit (as multiple of sample-rate)
|
|
@@ -104,21 +115,34 @@ struct SignalsmithStretch {
|
|
|
104
115
|
}
|
|
105
116
|
void setTransposeSemitones(Sample semitones, Sample tonalityLimit=0) {
|
|
106
117
|
setTransposeFactor(std::pow(2, semitones/12), tonalityLimit);
|
|
107
|
-
customFreqMap = nullptr;
|
|
108
118
|
}
|
|
109
119
|
// Sets a custom frequency map - should be monotonically increasing
|
|
110
120
|
void setFreqMap(std::function<Sample(Sample)> inputToOutput) {
|
|
111
121
|
customFreqMap = inputToOutput;
|
|
112
122
|
}
|
|
113
123
|
|
|
114
|
-
|
|
124
|
+
void setFormantFactor(Sample multiplier, bool compensatePitch=false) {
|
|
125
|
+
formantMultiplier = multiplier;
|
|
126
|
+
invFormantMultiplier = 1/multiplier;
|
|
127
|
+
formantCompensation = compensatePitch;
|
|
128
|
+
}
|
|
129
|
+
void setFormantSemitones(Sample semitones, bool compensatePitch=false) {
|
|
130
|
+
setFormantFactor(std::pow(2, semitones/12), compensatePitch);
|
|
131
|
+
}
|
|
132
|
+
// Rough guesstimate of the fundamental frequency, used for formant analysis. 0 means attempting to detect the pitch
|
|
133
|
+
void setFormantBase(Sample baseFreq=0) {
|
|
134
|
+
formantBaseFreq = baseFreq;
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
// Provide previous input ("pre-roll") to smoothly change the input location without interrupting the output. This doesn't do any calculation, just copies input to a buffer.
|
|
138
|
+
// You should ideally feed it `seekLength()` frames of input, unless it's directly after a `.reset()` (in which case `.outputSeek()` might be a better choice)
|
|
115
139
|
template<class Inputs>
|
|
116
140
|
void seek(Inputs &&inputs, int inputSamples, double playbackRate) {
|
|
117
|
-
|
|
118
|
-
|
|
141
|
+
tmpProcessBuffer.resize(0);
|
|
142
|
+
tmpProcessBuffer.resize(stft.blockSamples() + stft.defaultInterval());
|
|
119
143
|
|
|
120
|
-
int startIndex = std::max<int>(0, inputSamples - int(
|
|
121
|
-
int padStart = int(
|
|
144
|
+
int startIndex = std::max<int>(0, inputSamples - int(tmpProcessBuffer.size())); // start position in input
|
|
145
|
+
int padStart = int(tmpProcessBuffer.size() + startIndex) - inputSamples; // start position in tmpProcessBuffer
|
|
122
146
|
|
|
123
147
|
Sample totalEnergy = 0;
|
|
124
148
|
for (int c = 0; c < channels; ++c) {
|
|
@@ -126,12 +150,12 @@ struct SignalsmithStretch {
|
|
|
126
150
|
for (int i = startIndex; i < inputSamples; ++i) {
|
|
127
151
|
Sample s = inputChannel[i];
|
|
128
152
|
totalEnergy += s*s;
|
|
129
|
-
|
|
153
|
+
tmpProcessBuffer[i - startIndex + padStart] = s;
|
|
130
154
|
}
|
|
131
155
|
|
|
132
|
-
stft.writeInput(c,
|
|
156
|
+
stft.writeInput(c, tmpProcessBuffer.size(), tmpProcessBuffer.data());
|
|
133
157
|
}
|
|
134
|
-
stft.moveInput(
|
|
158
|
+
stft.moveInput(tmpProcessBuffer.size());
|
|
135
159
|
if (totalEnergy >= noiseFloor) {
|
|
136
160
|
silenceCounter = 0;
|
|
137
161
|
silenceFirst = true;
|
|
@@ -139,6 +163,48 @@ struct SignalsmithStretch {
|
|
|
139
163
|
didSeek = true;
|
|
140
164
|
seekTimeFactor = (playbackRate*stft.defaultInterval() > 1) ? 1/playbackRate : stft.defaultInterval();
|
|
141
165
|
}
|
|
166
|
+
int seekLength() const {
|
|
167
|
+
return int(stft.blockSamples() + stft.defaultInterval());
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
// Moves the input position *and* pre-calculates some output, so that the next samples returned from `.process()` are aligned to the beginning of the sample.
|
|
171
|
+
// The time-stretch rate is inferred from `inputLength`, so use `.outputSeekLength()` to get a correct value for that.
|
|
172
|
+
template<class Inputs>
|
|
173
|
+
void outputSeek(Inputs &&inputs, int inputLength) {
|
|
174
|
+
// TODO: add fade-out parameter to avoid clicks, instead of doing a full reset
|
|
175
|
+
reset();
|
|
176
|
+
// Assume we've been handed enough surplus input to produce `outputLatency()` samples of pre-roll
|
|
177
|
+
int surplusInput = std::max<int>(inputLength - inputLatency(), 0);
|
|
178
|
+
Sample playbackRate = surplusInput/Sample(outputLatency());
|
|
179
|
+
|
|
180
|
+
// Move the input position to the start of the sound
|
|
181
|
+
int seekSamples = inputLength - surplusInput;
|
|
182
|
+
seek(inputs, seekSamples, playbackRate);
|
|
183
|
+
|
|
184
|
+
tmpPreRollBuffer.resize(outputLatency()*channels);
|
|
185
|
+
struct BufferOutput {
|
|
186
|
+
Sample *samples;
|
|
187
|
+
int length;
|
|
188
|
+
|
|
189
|
+
Sample * operator[](int c) {
|
|
190
|
+
return samples + c*length;
|
|
191
|
+
}
|
|
192
|
+
} preRollOutput{tmpPreRollBuffer.data(), outputLatency()};
|
|
193
|
+
|
|
194
|
+
// Use the surplus input to produce pre-roll output
|
|
195
|
+
OffsetIO<Inputs> offsetInput{inputs, seekSamples};
|
|
196
|
+
process(offsetInput, surplusInput, preRollOutput, preRollOutput.length);
|
|
197
|
+
|
|
198
|
+
// put the thing down, flip it and reverse it: negate the pre-roll, reverse each channel in time, then add it to the STFT output
|
|
199
|
+
for (auto &v : tmpPreRollBuffer) v = -v;
|
|
200
|
+
for (int c = 0; c < channels; ++c) {
|
|
201
|
+
std::reverse(preRollOutput[c], preRollOutput[c] + preRollOutput.length);
|
|
202
|
+
stft.addOutput(c, preRollOutput.length, preRollOutput[c]);
|
|
203
|
+
}
|
|
204
|
+
}
|
|
205
|
+
int outputSeekLength(Sample playbackRate) const {
|
|
206
|
+
return inputLatency() + playbackRate*outputLatency();
|
|
207
|
+
}
|
|
142
208
|
|
|
143
209
|
template<class Inputs, class Outputs>
|
|
144
210
|
void process(Inputs &&inputs, int inputSamples, Outputs &&outputs, int outputSamples) {
|
|
@@ -149,14 +215,14 @@ struct SignalsmithStretch {
|
|
|
149
215
|
auto copyInput = [&](int toIndex){
|
|
150
216
|
|
|
151
217
|
int length = std::min<int>(int(stft.blockSamples() + stft.defaultInterval()), toIndex - prevCopiedInput);
|
|
152
|
-
|
|
218
|
+
tmpProcessBuffer.resize(length);
|
|
153
219
|
int offset = toIndex - length;
|
|
154
220
|
for (int c = 0; c < channels; ++c) {
|
|
155
221
|
auto &&inputBuffer = inputs[c];
|
|
156
222
|
for (int i = 0; i < length; ++i) {
|
|
157
|
-
|
|
223
|
+
tmpProcessBuffer[i] = inputBuffer[i + offset];
|
|
158
224
|
}
|
|
159
|
-
stft.writeInput(c, length,
|
|
225
|
+
stft.writeInput(c, length, tmpProcessBuffer.data());
|
|
160
226
|
}
|
|
161
227
|
stft.moveInput(length);
|
|
162
228
|
prevCopiedInput = toIndex;
|
|
@@ -241,6 +307,8 @@ struct SignalsmithStretch {
|
|
|
241
307
|
blockProcess.steps += stft.analyseSteps() + 1;
|
|
242
308
|
}
|
|
243
309
|
|
|
310
|
+
blockProcess.processFormants = formantMultiplier != 1 || (formantCompensation && blockProcess.mappedFrequencies);
|
|
311
|
+
|
|
244
312
|
blockProcess.timeFactor = didSeek ? seekTimeFactor : stft.defaultInterval()/std::max<Sample>(1, inputInterval);
|
|
245
313
|
didSeek = false;
|
|
246
314
|
|
|
@@ -354,28 +422,38 @@ struct SignalsmithStretch {
|
|
|
354
422
|
#endif
|
|
355
423
|
}
|
|
356
424
|
|
|
357
|
-
// Read the remaining output, providing no further input. `outputSamples`
|
|
425
|
+
// Read the remaining output, providing no further input. If `outputSamples` is more than one interval, it will compute additional blocks assuming a zero-valued input
|
|
358
426
|
template<class Outputs>
|
|
359
|
-
void flush(Outputs &&outputs, int outputSamples) {
|
|
360
|
-
|
|
361
|
-
|
|
427
|
+
void flush(Outputs &&outputs, int outputSamples, Sample playbackRate=0) {
|
|
428
|
+
struct Zeros {
|
|
429
|
+
struct Channel {
|
|
430
|
+
Sample operator[](int) {
|
|
431
|
+
return 0;
|
|
432
|
+
}
|
|
433
|
+
};
|
|
434
|
+
Channel operator[](int) {
|
|
435
|
+
return {};
|
|
436
|
+
}
|
|
437
|
+
} zeros;
|
|
438
|
+
// If we're asked for more than an interval of extra output, then zero-pad the input
|
|
439
|
+
int outputBlock = std::max<int>(0, outputSamples - stft.defaultInterval());
|
|
440
|
+
if (outputBlock > 0) process(zeros, outputBlock*playbackRate, outputs, outputBlock);
|
|
441
|
+
|
|
442
|
+
int tailSamples = outputSamples - outputBlock; // at most one interval
|
|
443
|
+
tmpProcessBuffer.resize(tailSamples);
|
|
362
444
|
stft.finishOutput(1);
|
|
363
445
|
for (int c = 0; c < channels; ++c) {
|
|
364
|
-
|
|
365
|
-
stft.readOutput(c, plainOutput, tmpBuffer.data());
|
|
446
|
+
stft.readOutput(c, tailSamples, tmpProcessBuffer.data());
|
|
366
447
|
auto &&outputChannel = outputs[c];
|
|
367
|
-
for (int i = 0; i <
|
|
368
|
-
|
|
369
|
-
outputChannel[i] = tmpBuffer[i];
|
|
448
|
+
for (int i = 0; i < tailSamples; ++i) {
|
|
449
|
+
outputChannel[outputBlock + i] = tmpProcessBuffer[i];
|
|
370
450
|
}
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
outputChannel[outputSamples - 1 - i] -= tmpBuffer[i];
|
|
451
|
+
stft.readOutput(c, tailSamples, tailSamples, tmpProcessBuffer.data());
|
|
452
|
+
for (int i = 0; i < tailSamples; ++i) {
|
|
453
|
+
outputChannel[outputBlock + tailSamples - 1 - i] -= tmpProcessBuffer[i];
|
|
375
454
|
}
|
|
376
455
|
}
|
|
377
|
-
stft.reset(0.
|
|
378
|
-
|
|
456
|
+
stft.reset(0.1f);
|
|
379
457
|
// Reset the phase-vocoder stuff, so the next block gets a fresh start
|
|
380
458
|
for (int c = 0; c < channels; ++c) {
|
|
381
459
|
auto channelBands = bandsForChannel(c);
|
|
@@ -384,16 +462,45 @@ struct SignalsmithStretch {
|
|
|
384
462
|
}
|
|
385
463
|
}
|
|
386
464
|
}
|
|
465
|
+
|
|
466
|
+
// Process a complete audio buffer all in one go
|
|
467
|
+
template<class Inputs, class Outputs>
|
|
468
|
+
bool exact(Inputs &&inputs, int inputSamples, Outputs &&outputs, int outputSamples) {
|
|
469
|
+
Sample playbackRate = inputSamples/Sample(outputSamples);
|
|
470
|
+
auto seekLength = outputSeekLength(playbackRate);
|
|
471
|
+
if (inputSamples < seekLength) {
|
|
472
|
+
// too short for this - zero the output just to be polite
|
|
473
|
+
for (int c = 0; c < channels; ++c) {
|
|
474
|
+
auto &&channel = outputs[c];
|
|
475
|
+
for (int i = 0; i < outputSamples; ++i) {
|
|
476
|
+
channel[i] = 0;
|
|
477
|
+
}
|
|
478
|
+
}
|
|
479
|
+
return false;
|
|
480
|
+
}
|
|
481
|
+
|
|
482
|
+
outputSeek(inputs, seekLength);
|
|
483
|
+
|
|
484
|
+
int outputIndex = outputSamples - seekLength/playbackRate;
|
|
485
|
+
OffsetIO<Inputs> offsetInput{inputs, seekLength};
|
|
486
|
+
process(offsetInput, inputSamples - seekLength, outputs, outputIndex);
|
|
487
|
+
|
|
488
|
+
OffsetIO<Outputs> offsetOutput{outputs, outputIndex};
|
|
489
|
+
flush(offsetOutput, outputSamples - outputIndex, playbackRate);
|
|
490
|
+
return true;
|
|
491
|
+
}
|
|
492
|
+
|
|
387
493
|
private:
|
|
388
494
|
bool _splitComputation = false;
|
|
389
495
|
struct {
|
|
390
|
-
size_t samplesSinceLast =
|
|
496
|
+
size_t samplesSinceLast = std::numeric_limits<size_t>::max();
|
|
391
497
|
size_t steps = 0;
|
|
392
498
|
size_t step = 0;
|
|
393
499
|
|
|
394
500
|
bool newSpectrum = false;
|
|
395
501
|
bool reanalysePrev = false;
|
|
396
502
|
bool mappedFrequencies = false;
|
|
503
|
+
bool processFormants = false;
|
|
397
504
|
Sample timeFactor;
|
|
398
505
|
} blockProcess;
|
|
399
506
|
|
|
@@ -406,12 +513,15 @@ private:
|
|
|
406
513
|
Sample freqMultiplier = 1, freqTonalityLimit = 0.5;
|
|
407
514
|
std::function<Sample(Sample)> customFreqMap = nullptr;
|
|
408
515
|
|
|
516
|
+
bool formantCompensation = false; // compensate for pitch/freq change
|
|
517
|
+
Sample formantMultiplier = 1, invFormantMultiplier = 1;
|
|
518
|
+
|
|
409
519
|
using STFT = signalsmith::linear::DynamicSTFT<Sample, false, true>;
|
|
410
520
|
STFT stft;
|
|
411
521
|
typename STFT::Input stashedInput;
|
|
412
522
|
typename STFT::Output stashedOutput;
|
|
413
523
|
|
|
414
|
-
std::vector<Sample>
|
|
524
|
+
std::vector<Sample> tmpProcessBuffer, tmpPreRollBuffer;
|
|
415
525
|
|
|
416
526
|
int channels = 0, bands = 0;
|
|
417
527
|
int prevInputOffset = -1;
|
|
@@ -518,6 +628,7 @@ private:
|
|
|
518
628
|
processSpectrumSteps += channels; // preliminary phase-vocoder prediction
|
|
519
629
|
processSpectrumSteps += splitMainPrediction;
|
|
520
630
|
if (blockProcess.newSpectrum) processSpectrumSteps += 1; // .input -> .prevInput
|
|
631
|
+
if (blockProcess.processFormants) processSpectrumSteps += 3;
|
|
521
632
|
}
|
|
522
633
|
void processSpectrum(size_t step) {
|
|
523
634
|
Sample timeFactor = blockProcess.timeFactor;
|
|
@@ -568,12 +679,21 @@ private:
|
|
|
568
679
|
bins[b].inputEnergy = _impl::norm(bins[b].input);
|
|
569
680
|
}
|
|
570
681
|
}
|
|
682
|
+
|
|
571
683
|
for (int b = 0; b < bands; ++b) {
|
|
572
684
|
outputMap[b] = {Sample(b), 1};
|
|
573
685
|
}
|
|
574
686
|
}
|
|
575
687
|
return;
|
|
576
688
|
}
|
|
689
|
+
if (blockProcess.processFormants) {
|
|
690
|
+
if (step < 3) {
|
|
691
|
+
updateFormants(step);
|
|
692
|
+
return;
|
|
693
|
+
}
|
|
694
|
+
step -= 3;
|
|
695
|
+
}
|
|
696
|
+
// Preliminary output prediction from phase-vocoder
|
|
577
697
|
if (step < size_t(channels)) {
|
|
578
698
|
int c = int(step);
|
|
579
699
|
Band *bins = bandsForChannel(c);
|
|
@@ -730,8 +850,7 @@ private:
|
|
|
730
850
|
Sample mapFreq(Sample freq) const {
|
|
731
851
|
if (customFreqMap) return customFreqMap(freq);
|
|
732
852
|
if (freq > freqTonalityLimit) {
|
|
733
|
-
|
|
734
|
-
return freqTonalityLimit*freqMultiplier + diff;
|
|
853
|
+
return freq + (freqMultiplier - 1)*freqTonalityLimit;
|
|
735
854
|
}
|
|
736
855
|
return freq*freqMultiplier;
|
|
737
856
|
}
|
|
@@ -796,6 +915,145 @@ private:
|
|
|
796
915
|
outputMap[b] = {b + topOffset, 1};
|
|
797
916
|
}
|
|
798
917
|
}
|
|
918
|
+
|
|
919
|
+
// If we mapped formants the same way as mapFreq(), this would be the inverse
|
|
920
|
+
Sample invMapFormant(Sample freq) const {
|
|
921
|
+
if (freq*invFormantMultiplier > freqTonalityLimit) {
|
|
922
|
+
return freq + (1 - formantMultiplier)*freqTonalityLimit;
|
|
923
|
+
}
|
|
924
|
+
return freq*invFormantMultiplier;
|
|
925
|
+
}
|
|
926
|
+
|
|
927
|
+
Sample freqEstimateWeighted = 0;
|
|
928
|
+
Sample freqEstimateWeight = 0;
|
|
929
|
+
Sample estimateFrequency() {
|
|
930
|
+
// 3 highest peaks in the input
|
|
931
|
+
std::array<int, 3> peakIndices{0, 0, 0};
|
|
932
|
+
for (int b = 1; b < bands - 1; ++b) {
|
|
933
|
+
Sample e = formantMetric[b];
|
|
934
|
+
// local maxima only
|
|
935
|
+
if (e < formantMetric[b - 1] || e <= formantMetric[b + 1]) continue;
|
|
936
|
+
|
|
937
|
+
if (e > formantMetric[peakIndices[0]]) {
|
|
938
|
+
if (e > formantMetric[peakIndices[1]]) {
|
|
939
|
+
if (e > formantMetric[peakIndices[2]]) {
|
|
940
|
+
peakIndices = {peakIndices[1], peakIndices[2], b};
|
|
941
|
+
} else {
|
|
942
|
+
peakIndices = {peakIndices[1], b, peakIndices[2]};
|
|
943
|
+
}
|
|
944
|
+
} else {
|
|
945
|
+
peakIndices[0] = b;
|
|
946
|
+
}
|
|
947
|
+
}
|
|
948
|
+
}
|
|
949
|
+
|
|
950
|
+
// VERY rough pitch estimation
|
|
951
|
+
int peakEstimate = peakIndices[2];
|
|
952
|
+
if (formantMetric[peakIndices[1]] > formantMetric[peakIndices[2]]*0.1) {
|
|
953
|
+
int diff = std::abs(peakEstimate - peakIndices[1]);
|
|
954
|
+
if (diff > peakEstimate/8 && diff < peakEstimate*7/8) peakEstimate = peakEstimate%diff;
|
|
955
|
+
if (formantMetric[peakIndices[0]] > formantMetric[peakIndices[2]]*0.01) {
|
|
956
|
+
int diff = std::abs(peakEstimate - peakIndices[0]);
|
|
957
|
+
if (diff > peakEstimate/8 && diff < peakEstimate*7/8) peakEstimate = peakEstimate%diff;
|
|
958
|
+
}
|
|
959
|
+
}
|
|
960
|
+
Sample weight = formantMetric[peakIndices[2]];
|
|
961
|
+
// Smooth it out a bit
|
|
962
|
+
freqEstimateWeighted += (peakEstimate*weight - freqEstimateWeighted)*0.25;
|
|
963
|
+
freqEstimateWeight += (weight - freqEstimateWeight)*0.25;
|
|
964
|
+
|
|
965
|
+
return freqEstimateWeighted/(freqEstimateWeight + Sample(1e-30));
|
|
966
|
+
}
|
|
967
|
+
|
|
968
|
+
Sample freqEstimate;
|
|
969
|
+
|
|
970
|
+
std::vector<Sample> formantMetric;
|
|
971
|
+
Sample formantBaseFreq = 0;
|
|
972
|
+
void updateFormants(size_t step) {
|
|
973
|
+
if (step-- == 0) {
|
|
974
|
+
for (auto &e : formantMetric) e = 0;
|
|
975
|
+
for (int c = 0; c < channels; ++c) {
|
|
976
|
+
Band *bins = bandsForChannel(c);
|
|
977
|
+
for (int b = 0; b < bands; ++b) {
|
|
978
|
+
formantMetric[b] += bins[b].inputEnergy;
|
|
979
|
+
}
|
|
980
|
+
}
|
|
981
|
+
|
|
982
|
+
freqEstimate = freqToBand(formantBaseFreq);
|
|
983
|
+
if (formantBaseFreq <= 0) freqEstimate = estimateFrequency();
|
|
984
|
+
} else if (step-- == 0) {
|
|
985
|
+
Sample decay = 1 - 1/(freqEstimate*0.5 + 1);
|
|
986
|
+
Sample e = 0;
|
|
987
|
+
for (size_t repeat = 0; repeat < 2; ++repeat) {
|
|
988
|
+
for (int b = bands - 1; b >= 0; --b) {
|
|
989
|
+
e = std::max(formantMetric[b], e*decay);
|
|
990
|
+
formantMetric[b] = e;
|
|
991
|
+
}
|
|
992
|
+
for (int b = 0; b < bands; ++b) {
|
|
993
|
+
e = std::max(formantMetric[b], e*decay);
|
|
994
|
+
formantMetric[b] = e;
|
|
995
|
+
}
|
|
996
|
+
}
|
|
997
|
+
decay = 1/decay;
|
|
998
|
+
for (size_t repeat = 0; repeat < 2; ++repeat) {
|
|
999
|
+
for (int b = bands - 1; b >= 0; --b) {
|
|
1000
|
+
e = std::min(formantMetric[b], e*decay);
|
|
1001
|
+
formantMetric[b] = e;
|
|
1002
|
+
}
|
|
1003
|
+
for (int b = 0; b < bands; ++b) {
|
|
1004
|
+
e = std::min(formantMetric[b], e*decay);
|
|
1005
|
+
formantMetric[b] = e;
|
|
1006
|
+
}
|
|
1007
|
+
}
|
|
1008
|
+
} else {
|
|
1009
|
+
auto getFormant = [&](Sample band) -> Sample {
|
|
1010
|
+
if (band < 0) return 0;
|
|
1011
|
+
band = std::min<Sample>(band, bands);
|
|
1012
|
+
int floorBand = std::floor(band);
|
|
1013
|
+
Sample fracBand = band - floorBand;
|
|
1014
|
+
Sample low = formantMetric[floorBand], high = formantMetric[floorBand + 1];
|
|
1015
|
+
return low + (high - low)*fracBand;
|
|
1016
|
+
};
|
|
1017
|
+
|
|
1018
|
+
for (int b = 0; b < bands; ++b) {
|
|
1019
|
+
Sample inputF = bandToFreq(b);
|
|
1020
|
+
Sample outputF = formantCompensation ? mapFreq(inputF) : inputF;
|
|
1021
|
+
outputF = invMapFormant(outputF);
|
|
1022
|
+
|
|
1023
|
+
Sample inputE = formantMetric[b];
|
|
1024
|
+
Sample targetE = getFormant(freqToBand(outputF));
|
|
1025
|
+
|
|
1026
|
+
Sample formantRatio = targetE/(inputE + Sample(1e-30));
|
|
1027
|
+
Sample energyRatio = formantRatio;
|
|
1028
|
+
|
|
1029
|
+
for (int c = 0; c < channels; ++c) {
|
|
1030
|
+
Band *bins = bandsForChannel(c);
|
|
1031
|
+
// This is what's used to decide the output energy, so this affects the output
|
|
1032
|
+
bins[b].inputEnergy *= energyRatio;
|
|
1033
|
+
}
|
|
1034
|
+
}
|
|
1035
|
+
}
|
|
1036
|
+
}
|
|
1037
|
+
|
|
1038
|
+
// Proxy class to avoid copying/allocating anything
|
|
1039
|
+
template<class Io>
|
|
1040
|
+
struct OffsetIO {
|
|
1041
|
+
Io &io;
|
|
1042
|
+
int offset;
|
|
1043
|
+
|
|
1044
|
+
struct Channel {
|
|
1045
|
+
Io &io;
|
|
1046
|
+
int channel;
|
|
1047
|
+
int offset;
|
|
1048
|
+
|
|
1049
|
+
auto operator[](int i) -> decltype(io[0][0]) {
|
|
1050
|
+
return io[channel][i + offset];
|
|
1051
|
+
}
|
|
1052
|
+
};
|
|
1053
|
+
Channel operator[](int c) {
|
|
1054
|
+
return {io, c, offset};
|
|
1055
|
+
}
|
|
1056
|
+
};
|
|
799
1057
|
};
|
|
800
1058
|
|
|
801
1059
|
}} // namespace
|