dspx 1.0.1 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,211 @@
1
+ /**
2
+ * Mel Spectrogram Pipeline Stage
3
+ *
4
+ * Converts power spectrum to Mel-scale representation using filterbank matrix multiplication.
5
+ * This is a STATELESS operation that applies the Mel filterbank to incoming power spectra.
6
+ *
7
+ * Features:
8
+ * - High-performance matrix multiplication using Eigen
9
+ * - Pre-computed Mel filterbank (passed from TypeScript)
10
+ * - Processes power spectrum bins → Mel frequency bins
11
+ * - Multi-channel support (each channel processed independently)
12
+ *
13
+ * Mathematical Operation:
14
+ * mel_energies = filterbank × power_spectrum
15
+ * where:
16
+ * - power_spectrum is (numBins × 1) vector
17
+ * - filterbank is (numMelBands × numBins) matrix
18
+ * - mel_energies is (numMelBands × 1) vector
19
+ *
20
+ * Typical Pipeline:
21
+ * STFT → Power → MelSpectrogram → Log → MFCC
22
+ *
23
+ * Parameters:
24
+ * - filterbankMatrix: Pre-computed Mel filterbank (TypeScript provides this)
25
+ * - numBins: Number of input frequency bins (from STFT/FFT)
26
+ * - numMelBands: Number of Mel frequency bands (output size)
27
+ */
28
+
29
+ #pragma once
30
+
31
+ #include "../IDspStage.h"
32
+ #include <Eigen/Dense>
33
+ #include <vector>
34
+ #include <memory>
35
+ #include <stdexcept>
36
+ #include <string>
37
+ #include <cmath>
38
+
39
+ namespace dsp::adapters
40
+ {
41
+ class MelSpectrogramStage : public IDspStage
42
+ {
43
+ public:
44
+ /**
45
+ * @brief Constructs a Mel Spectrogram stage
46
+ * @param filterbank_matrix Pre-computed Mel filterbank (numMelBands × numBins), row-major
47
+ * @param num_bins Number of input frequency bins
48
+ * @param num_mel_bands Number of output Mel frequency bands
49
+ */
50
+ explicit MelSpectrogramStage(
51
+ const std::vector<float> &filterbank_matrix,
52
+ size_t num_bins,
53
+ size_t num_mel_bands)
54
+ : m_numBins(num_bins),
55
+ m_numMelBands(num_mel_bands),
56
+ m_filterbank(num_mel_bands, num_bins)
57
+ {
58
+ // Validate parameters
59
+ if (m_numBins == 0)
60
+ {
61
+ throw std::invalid_argument("MelSpectrogram: num_bins must be greater than 0");
62
+ }
63
+
64
+ if (m_numMelBands == 0)
65
+ {
66
+ throw std::invalid_argument("MelSpectrogram: num_mel_bands must be greater than 0");
67
+ }
68
+
69
+ if (filterbank_matrix.size() != num_mel_bands * num_bins)
70
+ {
71
+ throw std::invalid_argument(
72
+ "MelSpectrogram: filterbank matrix size (" +
73
+ std::to_string(filterbank_matrix.size()) +
74
+ ") must equal numMelBands × numBins (" +
75
+ std::to_string(num_mel_bands * num_bins) + ")");
76
+ }
77
+
78
+ // Copy filterbank matrix (input is row-major from TypeScript)
79
+ // Eigen uses column-major by default, so we need to specify row-major
80
+ m_filterbank = Eigen::Map<const Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>(
81
+ filterbank_matrix.data(), num_mel_bands, num_bins);
82
+ }
83
+
84
+ const char *getType() const override
85
+ {
86
+ return "melSpectrogram";
87
+ }
88
+
89
+ bool isResizing() const override
90
+ {
91
+ return true; // This stage changes output size
92
+ }
93
+
94
+ void process(float *buffer, size_t numSamples, int numChannels, const float *timestamps = nullptr) override
95
+ {
96
+ // This stage changes output size - processResizing() should be called instead
97
+ throw std::runtime_error("MelSpectrogram stage requires processResizing() to be called");
98
+ }
99
+
100
+ size_t calculateOutputSize(size_t inputSize) const override
101
+ {
102
+ // Input has numBins per frame, output has numMelBands per frame
103
+ // Example: 10 samples, 2 channels, numBins=5 → samplesPerChannel=5, numFrames=1
104
+ // Output: 1 frame × numMelBands × numChannels
105
+ // Since we don't know numChannels here, we need to handle it in processResizing
106
+ // For now, return based on the ratio: (numMelBands / numBins) * inputSize
107
+ return (inputSize / m_numBins) * m_numMelBands;
108
+ }
109
+
110
+ void processResizing(const float *inputBuffer, size_t inputSize,
111
+ float *outputBuffer, size_t &outputSize,
112
+ int numChannels, const float *timestamps = nullptr) override
113
+ {
114
+ // Calculate how many complete spectrum frames we have
115
+ // Each frame should be numBins samples per channel
116
+ size_t samplesPerChannel = inputSize / numChannels;
117
+ size_t numFrames = samplesPerChannel / m_numBins;
118
+
119
+ if (numFrames == 0)
120
+ {
121
+ // Not enough data for even one frame - output nothing
122
+ outputSize = 0;
123
+ return;
124
+ }
125
+
126
+ // Calculate output size
127
+ outputSize = numFrames * m_numMelBands * numChannels;
128
+
129
+ // Temporary buffers for Eigen operations
130
+ Eigen::VectorXf input(m_numBins);
131
+ Eigen::VectorXf output(m_numMelBands);
132
+
133
+ // Process each channel independently
134
+ for (int ch = 0; ch < numChannels; ++ch)
135
+ {
136
+ // Process each frame for this channel
137
+ for (size_t frame = 0; frame < numFrames; ++frame)
138
+ {
139
+ // Extract input spectrum (de-interleaved)
140
+ for (size_t i = 0; i < m_numBins; ++i)
141
+ {
142
+ size_t index = (frame * m_numBins + i) * numChannels + ch;
143
+ input(i) = inputBuffer[index];
144
+ }
145
+
146
+ // Apply Mel filterbank: mel_energies = filterbank × power_spectrum
147
+ output = m_filterbank * input;
148
+
149
+ // Write output (re-interleaved)
150
+ for (size_t i = 0; i < m_numMelBands; ++i)
151
+ {
152
+ size_t outIndex = (frame * m_numMelBands + i) * numChannels + ch;
153
+ outputBuffer[outIndex] = output(i);
154
+ }
155
+ }
156
+ }
157
+ }
158
+
159
+ Napi::Object serializeState(Napi::Env env) const override
160
+ {
161
+ Napi::Object state = Napi::Object::New(env);
162
+ state.Set("numBins", Napi::Number::New(env, m_numBins));
163
+ state.Set("numMelBands", Napi::Number::New(env, m_numMelBands));
164
+
165
+ // Serialize filterbank matrix (row-major)
166
+ Napi::Array filterbankArray = Napi::Array::New(env, m_numMelBands * m_numBins);
167
+ for (size_t i = 0; i < m_numMelBands; ++i)
168
+ {
169
+ for (size_t j = 0; j < m_numBins; ++j)
170
+ {
171
+ filterbankArray.Set(i * m_numBins + j, Napi::Number::New(env, m_filterbank(i, j)));
172
+ }
173
+ }
174
+ state.Set("filterbank", filterbankArray);
175
+
176
+ return state;
177
+ }
178
+
179
+ void deserializeState(const Napi::Object &state) override
180
+ {
181
+ size_t numBins = state.Get("numBins").As<Napi::Number>().Uint32Value();
182
+ size_t numMelBands = state.Get("numMelBands").As<Napi::Number>().Uint32Value();
183
+
184
+ if (numBins != m_numBins || numMelBands != m_numMelBands)
185
+ {
186
+ throw std::runtime_error("MelSpectrogram: Dimension mismatch during deserialization");
187
+ }
188
+
189
+ // Restore filterbank matrix
190
+ Napi::Array filterbankArray = state.Get("filterbank").As<Napi::Array>();
191
+ for (size_t i = 0; i < m_numMelBands; ++i)
192
+ {
193
+ for (size_t j = 0; j < m_numBins; ++j)
194
+ {
195
+ m_filterbank(i, j) = filterbankArray.Get(i * m_numBins + j).As<Napi::Number>().FloatValue();
196
+ }
197
+ }
198
+ }
199
+
200
+ void reset() override
201
+ {
202
+ // Stateless - no reset needed
203
+ }
204
+
205
+ private:
206
+ size_t m_numBins; // Number of input frequency bins
207
+ size_t m_numMelBands; // Number of output Mel bands
208
+ Eigen::MatrixXf m_filterbank; // Mel filterbank matrix (numMelBands × numBins)
209
+ };
210
+
211
+ } // namespace dsp::adapters
@@ -0,0 +1,220 @@
1
+ /**
2
+ * MFCC (Mel-Frequency Cepstral Coefficients) Pipeline Stage
3
+ *
4
+ * Applies Discrete Cosine Transform (DCT) to log Mel-scale energies to produce MFCCs.
5
+ * This is a STATELESS operation that leverages the DCT engine.
6
+ *
7
+ * Features:
8
+ * - High-performance DCT using pre-computed cosine tables
9
+ * - Optional log-energy normalization
10
+ * - Coefficient selection (keep first N coefficients)
11
+ * - Multi-channel support
12
+ *
13
+ * Mathematical Operation:
14
+ * 1. Input: log(mel_energies) from Mel spectrogram
15
+ * 2. Apply DCT-II: mfcc[k] = DCT(log_mel_energies)
16
+ * 3. Keep first numCoefficients (typically 13-20)
17
+ *
18
+ * Typical Pipeline:
19
+ * STFT → Power → MelSpectrogram → Log → MFCC
20
+ *
21
+ * Parameters:
22
+ * - numMelBands: Number of input Mel bands (from MelSpectrogram)
23
+ * - numCoefficients: Number of MFCC coefficients to output (default: 13)
24
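+ * - useLogEnergy: Apply natural log (with a 1e-10 epsilon) to the input inside this stage before the DCT (default: true). Set to false when the input has already been log-scaled by an upstream Log stage, otherwise the log is applied twice.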
25
+ * - lifterCoefficient: Optional cepstral liftering (default: 0 = disabled)
26
+ */
27
+
28
+ #pragma once
29
+
30
+ #include "../IDspStage.h"
31
+ #include "../core/DctEngine.h"
32
+ #include <vector>
33
+ #include <memory>
34
+ #include <stdexcept>
35
+ #include <string>
36
+ #include <cmath>
37
+ #include <algorithm>
38
+
39
+ namespace dsp::adapters
40
+ {
41
+ class MfccStage : public IDspStage
42
+ {
43
+ public:
44
+ /**
45
+ * @brief Constructs an MFCC stage
46
+ * @param num_mel_bands Number of input Mel frequency bands
47
+ * @param num_coefficients Number of MFCC coefficients to output (default: 13)
48
+ * @param use_log_energy Apply log to input energies before DCT (default: true)
49
+ * @param lifter_coefficient Cepstral liftering parameter (0 = disabled)
50
+ */
51
+ explicit MfccStage(
52
+ size_t num_mel_bands,
53
+ size_t num_coefficients = 13,
54
+ bool use_log_energy = true,
55
+ float lifter_coefficient = 0.0f)
56
+ : m_numMelBands(num_mel_bands),
57
+ m_numCoefficients(num_coefficients),
58
+ m_useLogEnergy(use_log_energy),
59
+ m_lifterCoefficient(lifter_coefficient)
60
+ {
61
+ // Validate parameters
62
+ if (m_numMelBands == 0)
63
+ {
64
+ throw std::invalid_argument("MFCC: num_mel_bands must be greater than 0");
65
+ }
66
+
67
+ if (m_numCoefficients == 0 || m_numCoefficients > m_numMelBands)
68
+ {
69
+ throw std::invalid_argument(
70
+ "MFCC: num_coefficients must be in range [1, num_mel_bands]");
71
+ }
72
+
73
+ // Create DCT engine (size = numMelBands)
74
+ m_dctEngine = std::make_unique<dsp::core::DctEngine<float>>(m_numMelBands);
75
+
76
+ // Allocate working buffers
77
+ m_logEnergies.resize(m_numMelBands);
78
+ m_dctOutput.resize(m_numMelBands);
79
+
80
+ // Pre-compute lifter weights if liftering is enabled
81
+ if (m_lifterCoefficient > 0)
82
+ {
83
+ m_lifterWeights.resize(m_numCoefficients);
84
+ for (size_t i = 0; i < m_numCoefficients; ++i)
85
+ {
86
+ m_lifterWeights[i] = 1.0f + (m_lifterCoefficient / 2.0f) *
87
+ std::sin(M_PI * static_cast<float>(i) / m_lifterCoefficient);
88
+ }
89
+ }
90
+ }
91
+
92
+ const char *getType() const override
93
+ {
94
+ return "mfcc";
95
+ }
96
+
97
+ bool isResizing() const override
98
+ {
99
+ return true; // This stage changes output size
100
+ }
101
+
102
+ void process(float *buffer, size_t numSamples, int numChannels, const float *timestamps = nullptr) override
103
+ {
104
+ // This stage changes output size - processResizing() should be called instead
105
+ throw std::runtime_error("MFCC stage requires processResizing() to be called");
106
+ }
107
+
108
+ size_t calculateOutputSize(size_t inputSize) const override
109
+ {
110
+ // Input has numMelBands per frame, output has numCoefficients per frame
111
+ // Calculate output size based on the ratio: (numCoefficients / numMelBands) * inputSize
112
+ return (inputSize / m_numMelBands) * m_numCoefficients;
113
+ }
114
+
115
+ void processResizing(const float *inputBuffer, size_t inputSize,
116
+ float *outputBuffer, size_t &outputSize,
117
+ int numChannels, const float *timestamps = nullptr) override
118
+ {
119
+ // Calculate how many complete Mel spectrum frames we have
120
+ size_t samplesPerChannel = inputSize / numChannels;
121
+ size_t numFrames = samplesPerChannel / m_numMelBands;
122
+
123
+ if (numFrames == 0)
124
+ {
125
+ // Not enough data for even one frame - output nothing
126
+ outputSize = 0;
127
+ return;
128
+ }
129
+
130
+ // Calculate output size
131
+ outputSize = numFrames * m_numCoefficients * numChannels;
132
+
133
+ // Process each channel independently
134
+ for (int ch = 0; ch < numChannels; ++ch)
135
+ {
136
+ // Process each frame for this channel
137
+ for (size_t frame = 0; frame < numFrames; ++frame)
138
+ {
139
+ // Extract Mel energies for this frame (de-interleaved)
140
+ for (size_t i = 0; i < m_numMelBands; ++i)
141
+ {
142
+ size_t index = (frame * m_numMelBands + i) * numChannels + ch;
143
+ float energy = inputBuffer[index];
144
+
145
+ // Apply log if requested (add small epsilon to avoid log(0))
146
+ if (m_useLogEnergy)
147
+ {
148
+ const float epsilon = 1e-10f;
149
+ m_logEnergies[i] = std::log(energy + epsilon);
150
+ }
151
+ else
152
+ {
153
+ m_logEnergies[i] = energy;
154
+ }
155
+ }
156
+
157
+ // Apply DCT to get MFCCs
158
+ m_dctEngine->dct(m_logEnergies.data(), m_dctOutput.data());
159
+
160
+ // Extract first numCoefficients and apply liftering if enabled
161
+ for (size_t i = 0; i < m_numCoefficients; ++i)
162
+ {
163
+ float coeff = m_dctOutput[i];
164
+
165
+ // Apply cepstral liftering
166
+ if (m_lifterCoefficient > 0)
167
+ {
168
+ coeff *= m_lifterWeights[i];
169
+ }
170
+
171
+ // Write output (re-interleaved)
172
+ size_t outIndex = (frame * m_numCoefficients + i) * numChannels + ch;
173
+ outputBuffer[outIndex] = coeff;
174
+ }
175
+ }
176
+ }
177
+ }
178
+
179
+ Napi::Object serializeState(Napi::Env env) const override
180
+ {
181
+ Napi::Object state = Napi::Object::New(env);
182
+ state.Set("numMelBands", Napi::Number::New(env, m_numMelBands));
183
+ state.Set("numCoefficients", Napi::Number::New(env, m_numCoefficients));
184
+ state.Set("useLogEnergy", Napi::Boolean::New(env, m_useLogEnergy));
185
+ state.Set("lifterCoefficient", Napi::Number::New(env, m_lifterCoefficient));
186
+ return state;
187
+ }
188
+
189
+ void deserializeState(const Napi::Object &state) override
190
+ {
191
+ size_t numMelBands = state.Get("numMelBands").As<Napi::Number>().Uint32Value();
192
+ size_t numCoefficients = state.Get("numCoefficients").As<Napi::Number>().Uint32Value();
193
+
194
+ if (numMelBands != m_numMelBands || numCoefficients != m_numCoefficients)
195
+ {
196
+ throw std::runtime_error("MFCC: Dimension mismatch during deserialization");
197
+ }
198
+ }
199
+
200
+ void reset() override
201
+ {
202
+ // Stateless - no reset needed
203
+ }
204
+
205
+ private:
206
+ size_t m_numMelBands; // Number of input Mel bands
207
+ size_t m_numCoefficients; // Number of MFCC coefficients to output
208
+ bool m_useLogEnergy; // Apply log to input energies
209
+ float m_lifterCoefficient; // Cepstral liftering parameter (0 = disabled)
210
+
211
+ // DCT engine
212
+ std::unique_ptr<dsp::core::DctEngine<float>> m_dctEngine;
213
+
214
+ // Working buffers
215
+ std::vector<float> m_logEnergies; // Log Mel energies (input to DCT)
216
+ std::vector<float> m_dctOutput; // Full DCT output (before truncation)
217
+ std::vector<float> m_lifterWeights; // Pre-computed lifter weights
218
+ };
219
+
220
+ } // namespace dsp::adapters