@utterance/core 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +436 -49
- package/dist/index.d.cts +1 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.js +425 -49
- package/models/utterance-v1.onnx +0 -0
- package/package.json +2 -1
package/dist/index.cjs
CHANGED
@@ -1,7 +1,9 @@
 "use strict";
+var __create = Object.create;
 var __defProp = Object.defineProperty;
 var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
 var __getOwnPropNames = Object.getOwnPropertyNames;
+var __getProtoOf = Object.getPrototypeOf;
 var __hasOwnProp = Object.prototype.hasOwnProperty;
 var __export = (target, all) => {
   for (var name in all)
@@ -15,6 +17,14 @@ var __copyProps = (to, from, except, desc) => {
   }
   return to;
 };
+var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
+  // If the importer is in node compatibility mode or this is not an ESM
+  // file that has been converted to a CommonJS file using a Babel-
+  // compatible transform (i.e. "__esModule" has not been set), then set
+  // "default" to the CommonJS "module.exports" for node compatibility.
+  isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
+  mod
+));
 var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
 
 // src/index.ts
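The new `__create`/`__getProtoOf`/`__toESM` helpers are the interop shims esbuild emits once a bundle contains a dynamic `import()`, here the `import("onnxruntime-web")` added to `ONNXModel.load()` further down. Roughly, `__toESM` produces the view sketched below (illustration only, not code from the package):

```ts
// Illustration of what the __toESM helper above produces for a CJS module:
// a plain CJS module has no __esModule marker, so its exports object is
// re-exposed under `default`, making `mod.default` behave like an ESM
// default import regardless of which module format was resolved.
const cjsExports = { InferenceSession: {}, Tensor: {} }; // stand-in for module.exports
const esmView = Object.assign(Object.create(null), cjsExports, { default: cjsExports });
console.log(esmView.default === cjsExports); // true
```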
@@ -70,35 +80,82 @@ var AudioCapture = class {
 // src/features/extractor.ts
 var FeatureExtractor = class {
   sampleRate;
+  nFft;
+  nMels;
+  nMfcc;
+  // Pre-computed DSP tables
+  hammingWindow;
+  melFilterbank;
+  dctMatrix;
+  // State for pause duration tracking
+  silenceAccumulator = 0;
+  silenceThreshold = 0.01;
+  frameDurationSec;
+  // State for speech rate (rolling energy buffer)
+  energyBuffer;
+  energyBufferIdx = 0;
+  energyBufferFull = false;
   constructor(sampleRate = 16e3) {
     this.sampleRate = sampleRate;
+    this.nFft = Math.floor(sampleRate * 0.025);
+    this.nMels = 40;
+    this.nMfcc = 13;
+    this.frameDurationSec = 0.01;
+    this.hammingWindow = new Float32Array(this.nFft);
+    for (let i = 0; i < this.nFft; i++) {
+      this.hammingWindow[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (this.nFft - 1));
+    }
+    this.melFilterbank = this.createMelFilterbank();
+    this.dctMatrix = this.createDCTMatrix();
+    const framesPerSecond = Math.floor(1 / this.frameDurationSec);
+    this.energyBuffer = new Float32Array(framesPerSecond);
   }
   /**
    * Extract all features from a single audio frame.
    */
   extract(frame) {
-
-
-
-
-
-
-    // tracked by the detector over time
-  };
+    const energy = this.computeEnergy(frame);
+    const mfcc = this.computeMFCC(frame);
+    const pitch = this.estimatePitch(frame);
+    const speechRate = this.estimateSpeechRate(energy);
+    const pauseDuration = this.updatePauseDuration(energy);
+    return { mfcc, energy, pitch, speechRate, pauseDuration };
   }
   /**
    * Compute Mel-Frequency Cepstral Coefficients.
    *
-   *
-   * 1. Pre-emphasis filter
-   * 2. Windowing (Hamming)
-   * 3. FFT
-   * 4. Mel filterbank
-   * 5. Log energy
-   * 6. DCT
+   * Pipeline: Pre-emphasis → Hamming window → FFT → Mel filterbank → log → DCT
    */
-  computeMFCC(
-
+  computeMFCC(frame) {
+    const preEmph = new Float32Array(this.nFft);
+    const len = Math.min(frame.length, this.nFft);
+    preEmph[0] = frame[0];
+    for (let i = 1; i < len; i++) {
+      preEmph[i] = frame[i] - 0.97 * frame[i - 1];
+    }
+    for (let i = 0; i < this.nFft; i++) {
+      preEmph[i] *= this.hammingWindow[i];
+    }
+    const spectrum = this.fftMagnitude(preEmph);
+    const melEnergies = new Float32Array(this.nMels);
+    for (let m = 0; m < this.nMels; m++) {
+      let sum = 0;
+      const filter = this.melFilterbank[m];
+      for (let k = 0; k < filter.length; k++) {
+        sum += spectrum[k] * filter[k];
+      }
+      melEnergies[m] = Math.log(Math.max(sum, 1e-10));
+    }
+    const mfcc = new Float32Array(this.nMfcc);
+    for (let i = 0; i < this.nMfcc; i++) {
+      let sum = 0;
+      const dctRow = this.dctMatrix[i];
+      for (let j = 0; j < this.nMels; j++) {
+        sum += dctRow[j] * melEnergies[j];
+      }
+      mfcc[i] = sum;
+    }
+    return mfcc;
   }
   /**
    * Compute RMS energy of the frame.
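The constructor pins the analysis geometry: 25 ms FFT windows (`nFft = 0.025 × sampleRate`, i.e. 400 samples at 16 kHz) hopped every 10 ms (`frameDurationSec = 0.01`). A minimal sketch of driving the extractor frame-by-frame; note that `FeatureExtractor` being exported from the package root is an assumption, this diff only shows the bundle internals:

```ts
// Sketch only: assumes FeatureExtractor is exported from @utterance/core,
// which the diff does not confirm. Frame geometry follows the constructor:
// 400-sample (25 ms) windows at 16 kHz, hopped every 160 samples (10 ms).
import { FeatureExtractor } from "@utterance/core"; // hypothetical export

const sampleRate = 16000;
const extractor = new FeatureExtractor(sampleRate);
const hop = Math.floor(sampleRate * 0.01);  // 160 samples
const win = Math.floor(sampleRate * 0.025); // 400 samples

function processChunk(pcm: Float32Array) {
  for (let start = 0; start + win <= pcm.length; start += hop) {
    const { mfcc, energy, pitch, speechRate, pauseDuration } =
      extractor.extract(pcm.subarray(start, start + win));
    // 13 MFCCs + 4 scalars = the 17-dim vector the model consumes below
    console.log(mfcc.length, energy.toFixed(3), pitch.toFixed(1), speechRate, pauseDuration);
  }
}
```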
@@ -111,22 +168,217 @@ var FeatureExtractor = class {
     return Math.sqrt(sum / frame.length);
   }
   /**
-   * Estimate fundamental frequency (pitch) using autocorrelation.
+   * Estimate fundamental frequency (pitch) using simplified autocorrelation.
+   *
+   * Looks for the dominant periodicity in the signal within the
+   * speech frequency range (50-500 Hz). Returns 0 for unvoiced frames.
+   */
+  estimatePitch(frame) {
+    const minPeriod = Math.floor(this.sampleRate / 500);
+    const maxPeriod = Math.floor(this.sampleRate / 50);
+    const len = Math.min(frame.length, this.nFft);
+    if (len < maxPeriod * 2) return 0;
+    let bestCorr = 0;
+    let bestLag = 0;
+    let energy = 0;
+    for (let i = 0; i < len; i++) {
+      energy += frame[i] * frame[i];
+    }
+    if (energy < 1e-10) return 0;
+    for (let lag = minPeriod; lag <= maxPeriod && lag < len; lag++) {
+      let corr = 0;
+      let energyLag = 0;
+      const limit = len - lag;
+      for (let i = 0; i < limit; i++) {
+        corr += frame[i] * frame[i + lag];
+        energyLag += frame[i + lag] * frame[i + lag];
+      }
+      const norm = Math.sqrt(energy * energyLag);
+      if (norm > 0) {
+        corr /= norm;
+      }
+      if (corr > bestCorr) {
+        bestCorr = corr;
+        bestLag = lag;
+      }
+    }
+    if (bestCorr < 0.3 || bestLag === 0) return 0;
+    return this.sampleRate / bestLag;
+  }
+  /**
+   * Estimate speech rate from rolling energy envelope.
    *
-   *
+   * Counts energy peaks in a 1-second sliding window.
+   * Returns a normalized value (~0-1 range, where 0.3-0.7 is typical speech).
    */
-
-
-
+  estimateSpeechRate(energy) {
+    this.energyBuffer[this.energyBufferIdx] = energy;
+    this.energyBufferIdx = (this.energyBufferIdx + 1) % this.energyBuffer.length;
+    if (this.energyBufferIdx === 0) this.energyBufferFull = true;
+    const len = this.energyBufferFull ? this.energyBuffer.length : this.energyBufferIdx;
+    if (len < 5) return 0;
+    let peaks = 0;
+    const threshold = this.silenceThreshold * 0.5;
+    for (let i = 2; i < len - 2; i++) {
+      const idx = (this.energyBufferIdx - len + i + this.energyBuffer.length) % this.energyBuffer.length;
+      const prev = this.energyBuffer[(idx - 1 + this.energyBuffer.length) % this.energyBuffer.length];
+      const curr = this.energyBuffer[idx];
+      const next = this.energyBuffer[(idx + 1) % this.energyBuffer.length];
+      if (curr > prev && curr > next && curr > threshold) {
+        peaks++;
+      }
+    }
+    const windowDuration = len * this.frameDurationSec;
+    const rate = windowDuration > 0 ? peaks / windowDuration : 0;
+    return rate / 10;
   }
   /**
-   *
+   * Track accumulated pause duration.
    *
-   *
+   * Returns pause duration in seconds, capped at 5s and normalized to [0, 1].
    */
-
-
+  updatePauseDuration(energy) {
+    if (energy < this.silenceThreshold) {
+      this.silenceAccumulator += this.frameDurationSec;
+    } else {
+      this.silenceAccumulator = 0;
+    }
+    return Math.min(this.silenceAccumulator, 5) / 5;
+  }
+  /**
+   * Compute FFT magnitude spectrum (power spectrum).
+   *
+   * Uses a radix-2 DIT FFT implementation. For frames smaller than
+   * nFft, zero-pads to the next power of 2.
+   */
+  fftMagnitude(signal) {
+    let n = 1;
+    while (n < signal.length) n *= 2;
+    const real = new Float32Array(n);
+    const imag = new Float32Array(n);
+    real.set(signal);
+    let j = 0;
+    for (let i = 0; i < n; i++) {
+      if (i < j) {
+        [real[i], real[j]] = [real[j], real[i]];
+        [imag[i], imag[j]] = [imag[j], imag[i]];
+      }
+      let m = n >> 1;
+      while (m >= 1 && j >= m) {
+        j -= m;
+        m >>= 1;
+      }
+      j += m;
+    }
+    for (let size = 2; size <= n; size *= 2) {
+      const halfSize = size / 2;
+      const angle = -2 * Math.PI / size;
+      const wReal = Math.cos(angle);
+      const wImag = Math.sin(angle);
+      for (let i = 0; i < n; i += size) {
+        let curReal = 1;
+        let curImag = 0;
+        for (let k = 0; k < halfSize; k++) {
+          const evenIdx = i + k;
+          const oddIdx = i + k + halfSize;
+          const tReal = curReal * real[oddIdx] - curImag * imag[oddIdx];
+          const tImag = curReal * imag[oddIdx] + curImag * real[oddIdx];
+          real[oddIdx] = real[evenIdx] - tReal;
+          imag[oddIdx] = imag[evenIdx] - tImag;
+          real[evenIdx] += tReal;
+          imag[evenIdx] += tImag;
+          const newCurReal = curReal * wReal - curImag * wImag;
+          curImag = curReal * wImag + curImag * wReal;
+          curReal = newCurReal;
+        }
+      }
+    }
+    const numBins = n / 2 + 1;
+    const power = new Float32Array(numBins);
+    for (let i = 0; i < numBins; i++) {
+      power[i] = (real[i] * real[i] + imag[i] * imag[i]) / n;
+    }
+    return power;
+  }
+  /**
+   * Create Mel filterbank matrix.
+   *
+   * Produces nMels triangular filters spanning the frequency range
+   * from 0 to sampleRate/2 on the Mel scale.
+   */
+  createMelFilterbank() {
+    let n = 1;
+    while (n < this.nFft) n *= 2;
+    const fftBins = n / 2 + 1;
+    const fMin = 0;
+    const fMax = this.sampleRate / 2;
+    const melMin = this.hzToMel(fMin);
+    const melMax = this.hzToMel(fMax);
+    const melPoints = new Float32Array(this.nMels + 2);
+    for (let i = 0; i < this.nMels + 2; i++) {
+      melPoints[i] = melMin + i * (melMax - melMin) / (this.nMels + 1);
+    }
+    const binIndices = new Float32Array(this.nMels + 2);
+    for (let i = 0; i < this.nMels + 2; i++) {
+      const hz = this.melToHz(melPoints[i]);
+      binIndices[i] = Math.floor((n + 1) * hz / this.sampleRate);
+    }
+    const filters = [];
+    for (let m = 0; m < this.nMels; m++) {
+      const filter = new Float32Array(fftBins);
+      const left = binIndices[m];
+      const center = binIndices[m + 1];
+      const right = binIndices[m + 2];
+      for (let k = 0; k < fftBins; k++) {
+        if (k >= left && k <= center && center > left) {
+          filter[k] = (k - left) / (center - left);
+        } else if (k > center && k <= right && right > center) {
+          filter[k] = (right - k) / (right - center);
+        }
+      }
+      filters.push(filter);
+    }
+    return filters;
+  }
+  /**
+   * Create DCT-II matrix for MFCC computation.
+   */
+  createDCTMatrix() {
+    const matrix = [];
+    const scale = Math.sqrt(2 / this.nMels);
+    for (let i = 0; i < this.nMfcc; i++) {
+      const row = new Float32Array(this.nMels);
+      for (let j = 0; j < this.nMels; j++) {
+        row[j] = scale * Math.cos(Math.PI * i * (j + 0.5) / this.nMels);
+      }
+      matrix.push(row);
+    }
+    return matrix;
+  }
+  hzToMel(hz) {
+    return 2595 * Math.log10(1 + hz / 700);
   }
+  melToHz(mel) {
+    return 700 * (Math.pow(10, mel / 2595) - 1);
+  }
+  /**
+   * Reset internal state (energy buffer, pause accumulator).
+   */
+  reset() {
+    this.silenceAccumulator = 0;
+    this.energyBuffer.fill(0);
+    this.energyBufferIdx = 0;
+    this.energyBufferFull = false;
+  }
+};
+
+// src/types.ts
+var MODEL_CDN_URL = "https://pub-46a5feb0029246bcbc93fab6162cff94.r2.dev/v0.0.2/utterance-v1.onnx";
+var DEFAULT_OPTIONS = {
+  sensitivity: 0.5,
+  pauseTolerance: 1500,
+  modelPath: "cdn",
+  sampleRate: 16e3
 };
 
 // src/model/energy-vad.ts
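The constants in this hunk work out to concrete numbers at the default 16 kHz rate. A quick sketch of that arithmetic, mirroring the code above (illustration only):

```ts
// Numbers implied by the code above, at the default 16 kHz sample rate.
const sampleRate = 16000;

// Pitch search: 50-500 Hz maps to autocorrelation lags of 32-320 samples.
const minPeriod = Math.floor(sampleRate / 500); // 32
const maxPeriod = Math.floor(sampleRate / 50);  // 320

// Mel axis: 0 Hz -> 0 mel, 8 kHz (Nyquist) -> ~2840 mel, split into
// nMels + 2 = 42 evenly spaced points for the triangular filters.
const hzToMel = (hz: number) => 2595 * Math.log10(1 + hz / 700);
console.log(hzToMel(8000).toFixed(0)); // ~2840

// Pause feature: silence accumulates in 10 ms steps, capped at 5 s,
// so e.g. 1.2 s of continuous silence yields 1.2 / 5 = 0.24.
console.log(Math.min(1.2, 5) / 5); // 0.24
```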
@@ -177,45 +429,188 @@ var EnergyVAD = class {
 };
 
 // src/model/onnx.ts
+var import_meta = {};
+var LABELS = [
+  "speaking",
+  "thinking_pause",
+  "turn_complete",
+  "interrupt_intent"
+];
+var FEATURE_DIM = 17;
+var CONTEXT_FRAMES = 100;
+var INFERENCE_INTERVAL = 10;
 var ONNXModel = class {
   session = null;
+  ort = null;
   fallback;
-
+  useWebGpu;
+  /** Circular buffer of feature vectors for the context window. */
+  frameBuffer;
+  bufferIdx = 0;
+  framesBuffered = 0;
+  framesSinceInference = 0;
+  /** Cache the last inference result for frames between batches. */
+  lastResult = null;
+  constructor(sensitivity = 0.5, useWebGpu = false) {
     this.fallback = new EnergyVAD(sensitivity);
+    this.useWebGpu = useWebGpu;
+    this.frameBuffer = new Float32Array(CONTEXT_FRAMES * FEATURE_DIM);
   }
   /**
-   * Load the ONNX model from
+   * Load the ONNX model from CDN, bundled path, or custom URL.
+   *
+   * Dynamically imports onnxruntime-web to avoid bundling it
+   * when the model isn't used (tree-shaking friendly).
    *
-   *
-   * 1. Import onnxruntime-web InferenceSession
-   * 2. Load model bytes
-   * 3. Create session with appropriate execution providers
+   * @param path - "cdn" (default, loads from Cloudflare R2), "bundled" (from npm package), or a custom URL.
    */
-  async load(
-
+  async load(path) {
+    try {
+      const ort = await import("onnxruntime-web");
+      this.ort = ort;
+      let modelSource = path;
+      if (path === "cdn") {
+        try {
+          const response = await fetch(MODEL_CDN_URL);
+          if (response.ok) {
+            modelSource = await response.arrayBuffer();
+          } else {
+            throw new Error(`Failed to fetch CDN model: ${response.status}`);
+          }
+        } catch {
+          console.warn("[utterance] CDN model unavailable, falling back to EnergyVAD");
+          this.session = null;
+          return;
+        }
+      } else if (path === "bundled") {
+        try {
+          const getUrl = new Function("p", "b", "return new URL(p, b).href");
+          const href = getUrl("../../models/utterance-v1.onnx", import_meta.url);
+          const response = await fetch(href);
+          if (response.ok) {
+            modelSource = await response.arrayBuffer();
+          } else {
+            throw new Error(`Failed to fetch bundled model: ${response.status}`);
+          }
+        } catch {
+          console.warn("[utterance] Bundled model not found, falling back to EnergyVAD");
+          this.session = null;
+          return;
+        }
+      }
+      const providers = this.useWebGpu ? ["webgpu", "wasm"] : ["wasm"];
+      this.session = await ort.InferenceSession.create(modelSource, {
+        executionProviders: providers
+      });
+    } catch (err) {
+      console.warn("[utterance] Failed to load ONNX model, falling back to EnergyVAD:", err);
+      this.session = null;
+    }
   }
   /**
-   * Run inference on
+   * Run inference on extracted features.
    *
-   *
-   *
-   *
-   * 3. Parse output into ClassificationResult
+   * Buffers frames into a sliding window and runs the ONNX model
+   * every 100ms (10 frames). Between inference runs, returns the
+   * cached result. Falls back to EnergyVAD when no model is loaded.
    */
   async predict(features) {
-    if (!this.session) {
+    if (!this.session || !this.ort) {
       return this.fallback.classify(features);
     }
-
+    this.addFrame(features);
+    this.framesSinceInference++;
+    if (this.framesSinceInference >= INFERENCE_INTERVAL && this.framesBuffered >= CONTEXT_FRAMES) {
+      this.framesSinceInference = 0;
+      try {
+        this.lastResult = await this.runInference();
+      } catch (err) {
+        console.warn("[utterance] ONNX inference failed, using EnergyVAD:", err);
+        return this.fallback.classify(features);
+      }
+    }
+    return this.lastResult ?? this.fallback.classify(features);
   }
   /**
    * Release model resources.
    */
   dispose() {
+    if (this.session) {
+      this.session.release().catch(() => {
+      });
+    }
     this.session = null;
+    this.ort = null;
     this.fallback.reset();
+    this.resetBuffer();
+  }
+  /**
+   * Add a feature frame to the circular buffer.
+   */
+  addFrame(features) {
+    const offset = this.bufferIdx * FEATURE_DIM;
+    this.frameBuffer.set(features.mfcc, offset);
+    this.frameBuffer[offset + 13] = features.energy;
+    this.frameBuffer[offset + 14] = features.pitch;
+    this.frameBuffer[offset + 15] = features.speechRate;
+    this.frameBuffer[offset + 16] = features.pauseDuration;
+    this.bufferIdx = (this.bufferIdx + 1) % CONTEXT_FRAMES;
+    if (this.framesBuffered < CONTEXT_FRAMES) {
+      this.framesBuffered++;
+    }
+  }
+  /**
+   * Build the input tensor from the circular buffer and run ONNX inference.
+   */
+  async runInference() {
+    const ort = this.ort;
+    const session = this.session;
+    const input = new Float32Array(CONTEXT_FRAMES * FEATURE_DIM);
+    for (let i = 0; i < CONTEXT_FRAMES; i++) {
+      const srcIdx = (this.bufferIdx - CONTEXT_FRAMES + i + CONTEXT_FRAMES) % CONTEXT_FRAMES * FEATURE_DIM;
+      const dstIdx = i * FEATURE_DIM;
+      input.set(this.frameBuffer.subarray(srcIdx, srcIdx + FEATURE_DIM), dstIdx);
+    }
+    const tensor = new ort.Tensor("float32", input, [1, CONTEXT_FRAMES, FEATURE_DIM]);
+    const results = await session.run({ input: tensor });
+    const output = results.output;
+    const logits = output.data;
+    const probs = softmax(logits);
+    let bestIdx = 0;
+    let bestProb = probs[0];
+    for (let i = 1; i < probs.length; i++) {
+      if (probs[i] > bestProb) {
+        bestProb = probs[i];
+        bestIdx = i;
+      }
+    }
+    return {
+      label: LABELS[bestIdx],
+      confidence: bestProb,
+      timestamp: Date.now()
+    };
+  }
+  resetBuffer() {
+    this.frameBuffer.fill(0);
+    this.bufferIdx = 0;
+    this.framesBuffered = 0;
+    this.framesSinceInference = 0;
+    this.lastResult = null;
   }
 };
+function softmax(logits) {
+  const max = logits.reduce((a, b) => Math.max(a, b), -Infinity);
+  const exps = new Float32Array(logits.length);
+  let sum = 0;
+  for (let i = 0; i < logits.length; i++) {
+    exps[i] = Math.exp(logits[i] - max);
+    sum += exps[i];
+  }
+  for (let i = 0; i < exps.length; i++) {
+    exps[i] /= sum;
+  }
+  return exps;
+}
 
 // src/detector/turn-detector.ts
 var TurnDetector = class {
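The model consumes a [1, 100, 17] tensor: 100 frames of 13 MFCCs plus energy, pitch, speechRate, and pauseDuration, covering one second of context at the 10 ms hop, re-scored every 10 frames (100 ms). A standalone sketch of the chronological unroll that the `runInference` loop performs (not the package's API):

```ts
// Standalone sketch of the ring-buffer unroll used in runInference above.
// The oldest frame sits at bufferIdx (the slot about to be overwritten),
// so reading starts there and wraps around the buffer.
const CONTEXT_FRAMES = 100;
const FEATURE_DIM = 17;

function unroll(frameBuffer: Float32Array, bufferIdx: number): Float32Array {
  const input = new Float32Array(CONTEXT_FRAMES * FEATURE_DIM);
  for (let i = 0; i < CONTEXT_FRAMES; i++) {
    // (bufferIdx - 100 + i + 100) % 100 simplifies to (bufferIdx + i) % 100
    const src = ((bufferIdx + i) % CONTEXT_FRAMES) * FEATURE_DIM;
    input.set(frameBuffer.subarray(src, src + FEATURE_DIM), i * FEATURE_DIM);
  }
  return input; // frames now ordered oldest -> newest
}
```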
@@ -306,14 +701,6 @@ var TurnDetector = class {
   }
 };
 
-// src/types.ts
-var DEFAULT_OPTIONS = {
-  sensitivity: 0.5,
-  pauseTolerance: 1500,
-  modelPath: "bundled",
-  sampleRate: 16e3
-};
-
 // src/utterance.ts
 var Utterance = class {
   options;
package/dist/index.d.cts
CHANGED
@@ -9,7 +9,7 @@ interface UtteranceOptions {
   sensitivity?: number;
   /** Max thinking pause duration (ms) before triggering turnEnd. Default: 1500 */
   pauseTolerance?: number;
-  /**
+  /** Model source: "cdn" (default), "bundled", or a custom URL. */
   modelPath?: string;
   /** Audio sample rate in Hz. Default: 16000 */
   sampleRate?: number;
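With the newly documented `modelPath` values, an options object looks like the sketch below. The `Utterance` constructor signature itself is not shown anywhere in this diff, so treat the call as assumed:

```ts
// Sketch under assumptions: the diff documents UtteranceOptions but not the
// Utterance constructor, so `new Utterance(options)` is inferred, not confirmed.
import { Utterance } from "@utterance/core";

const utterance = new Utterance({
  sensitivity: 0.5,      // default
  pauseTolerance: 1500,  // ms before a thinking pause triggers turnEnd
  modelPath: "cdn",      // new default; "bundled" or a custom URL also accepted
  sampleRate: 16000,
});
```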
package/dist/index.d.ts
CHANGED
@@ -9,7 +9,7 @@ interface UtteranceOptions {
   sensitivity?: number;
   /** Max thinking pause duration (ms) before triggering turnEnd. Default: 1500 */
   pauseTolerance?: number;
-  /**
+  /** Model source: "cdn" (default), "bundled", or a custom URL. */
   modelPath?: string;
   /** Audio sample rate in Hz. Default: 16000 */
   sampleRate?: number;
package/dist/index.js
CHANGED
@@ -44,35 +44,82 @@ var AudioCapture = class {
 // src/features/extractor.ts
 var FeatureExtractor = class {
   sampleRate;
+  nFft;
+  nMels;
+  nMfcc;
+  // Pre-computed DSP tables
+  hammingWindow;
+  melFilterbank;
+  dctMatrix;
+  // State for pause duration tracking
+  silenceAccumulator = 0;
+  silenceThreshold = 0.01;
+  frameDurationSec;
+  // State for speech rate (rolling energy buffer)
+  energyBuffer;
+  energyBufferIdx = 0;
+  energyBufferFull = false;
   constructor(sampleRate = 16e3) {
     this.sampleRate = sampleRate;
+    this.nFft = Math.floor(sampleRate * 0.025);
+    this.nMels = 40;
+    this.nMfcc = 13;
+    this.frameDurationSec = 0.01;
+    this.hammingWindow = new Float32Array(this.nFft);
+    for (let i = 0; i < this.nFft; i++) {
+      this.hammingWindow[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (this.nFft - 1));
+    }
+    this.melFilterbank = this.createMelFilterbank();
+    this.dctMatrix = this.createDCTMatrix();
+    const framesPerSecond = Math.floor(1 / this.frameDurationSec);
+    this.energyBuffer = new Float32Array(framesPerSecond);
   }
   /**
    * Extract all features from a single audio frame.
    */
   extract(frame) {
-
-
-
-
-
-
-    // tracked by the detector over time
-  };
+    const energy = this.computeEnergy(frame);
+    const mfcc = this.computeMFCC(frame);
+    const pitch = this.estimatePitch(frame);
+    const speechRate = this.estimateSpeechRate(energy);
+    const pauseDuration = this.updatePauseDuration(energy);
+    return { mfcc, energy, pitch, speechRate, pauseDuration };
   }
   /**
    * Compute Mel-Frequency Cepstral Coefficients.
    *
-   *
-   * 1. Pre-emphasis filter
-   * 2. Windowing (Hamming)
-   * 3. FFT
-   * 4. Mel filterbank
-   * 5. Log energy
-   * 6. DCT
+   * Pipeline: Pre-emphasis → Hamming window → FFT → Mel filterbank → log → DCT
    */
-  computeMFCC(
-
+  computeMFCC(frame) {
+    const preEmph = new Float32Array(this.nFft);
+    const len = Math.min(frame.length, this.nFft);
+    preEmph[0] = frame[0];
+    for (let i = 1; i < len; i++) {
+      preEmph[i] = frame[i] - 0.97 * frame[i - 1];
+    }
+    for (let i = 0; i < this.nFft; i++) {
+      preEmph[i] *= this.hammingWindow[i];
+    }
+    const spectrum = this.fftMagnitude(preEmph);
+    const melEnergies = new Float32Array(this.nMels);
+    for (let m = 0; m < this.nMels; m++) {
+      let sum = 0;
+      const filter = this.melFilterbank[m];
+      for (let k = 0; k < filter.length; k++) {
+        sum += spectrum[k] * filter[k];
+      }
+      melEnergies[m] = Math.log(Math.max(sum, 1e-10));
+    }
+    const mfcc = new Float32Array(this.nMfcc);
+    for (let i = 0; i < this.nMfcc; i++) {
+      let sum = 0;
+      const dctRow = this.dctMatrix[i];
+      for (let j = 0; j < this.nMels; j++) {
+        sum += dctRow[j] * melEnergies[j];
+      }
+      mfcc[i] = sum;
+    }
+    return mfcc;
   }
   /**
    * Compute RMS energy of the frame.
@@ -85,22 +132,217 @@ var FeatureExtractor = class {
     return Math.sqrt(sum / frame.length);
   }
   /**
-   * Estimate fundamental frequency (pitch) using autocorrelation.
+   * Estimate fundamental frequency (pitch) using simplified autocorrelation.
+   *
+   * Looks for the dominant periodicity in the signal within the
+   * speech frequency range (50-500 Hz). Returns 0 for unvoiced frames.
+   */
+  estimatePitch(frame) {
+    const minPeriod = Math.floor(this.sampleRate / 500);
+    const maxPeriod = Math.floor(this.sampleRate / 50);
+    const len = Math.min(frame.length, this.nFft);
+    if (len < maxPeriod * 2) return 0;
+    let bestCorr = 0;
+    let bestLag = 0;
+    let energy = 0;
+    for (let i = 0; i < len; i++) {
+      energy += frame[i] * frame[i];
+    }
+    if (energy < 1e-10) return 0;
+    for (let lag = minPeriod; lag <= maxPeriod && lag < len; lag++) {
+      let corr = 0;
+      let energyLag = 0;
+      const limit = len - lag;
+      for (let i = 0; i < limit; i++) {
+        corr += frame[i] * frame[i + lag];
+        energyLag += frame[i + lag] * frame[i + lag];
+      }
+      const norm = Math.sqrt(energy * energyLag);
+      if (norm > 0) {
+        corr /= norm;
+      }
+      if (corr > bestCorr) {
+        bestCorr = corr;
+        bestLag = lag;
+      }
+    }
+    if (bestCorr < 0.3 || bestLag === 0) return 0;
+    return this.sampleRate / bestLag;
+  }
+  /**
+   * Estimate speech rate from rolling energy envelope.
    *
-   *
+   * Counts energy peaks in a 1-second sliding window.
+   * Returns a normalized value (~0-1 range, where 0.3-0.7 is typical speech).
    */
-
-
-
+  estimateSpeechRate(energy) {
+    this.energyBuffer[this.energyBufferIdx] = energy;
+    this.energyBufferIdx = (this.energyBufferIdx + 1) % this.energyBuffer.length;
+    if (this.energyBufferIdx === 0) this.energyBufferFull = true;
+    const len = this.energyBufferFull ? this.energyBuffer.length : this.energyBufferIdx;
+    if (len < 5) return 0;
+    let peaks = 0;
+    const threshold = this.silenceThreshold * 0.5;
+    for (let i = 2; i < len - 2; i++) {
+      const idx = (this.energyBufferIdx - len + i + this.energyBuffer.length) % this.energyBuffer.length;
+      const prev = this.energyBuffer[(idx - 1 + this.energyBuffer.length) % this.energyBuffer.length];
+      const curr = this.energyBuffer[idx];
+      const next = this.energyBuffer[(idx + 1) % this.energyBuffer.length];
+      if (curr > prev && curr > next && curr > threshold) {
+        peaks++;
+      }
+    }
+    const windowDuration = len * this.frameDurationSec;
+    const rate = windowDuration > 0 ? peaks / windowDuration : 0;
+    return rate / 10;
   }
   /**
-   *
+   * Track accumulated pause duration.
    *
-   *
+   * Returns pause duration in seconds, capped at 5s and normalized to [0, 1].
    */
-
-
+  updatePauseDuration(energy) {
+    if (energy < this.silenceThreshold) {
+      this.silenceAccumulator += this.frameDurationSec;
+    } else {
+      this.silenceAccumulator = 0;
+    }
+    return Math.min(this.silenceAccumulator, 5) / 5;
+  }
+  /**
+   * Compute FFT magnitude spectrum (power spectrum).
+   *
+   * Uses a radix-2 DIT FFT implementation. For frames smaller than
+   * nFft, zero-pads to the next power of 2.
+   */
+  fftMagnitude(signal) {
+    let n = 1;
+    while (n < signal.length) n *= 2;
+    const real = new Float32Array(n);
+    const imag = new Float32Array(n);
+    real.set(signal);
+    let j = 0;
+    for (let i = 0; i < n; i++) {
+      if (i < j) {
+        [real[i], real[j]] = [real[j], real[i]];
+        [imag[i], imag[j]] = [imag[j], imag[i]];
+      }
+      let m = n >> 1;
+      while (m >= 1 && j >= m) {
+        j -= m;
+        m >>= 1;
+      }
+      j += m;
+    }
+    for (let size = 2; size <= n; size *= 2) {
+      const halfSize = size / 2;
+      const angle = -2 * Math.PI / size;
+      const wReal = Math.cos(angle);
+      const wImag = Math.sin(angle);
+      for (let i = 0; i < n; i += size) {
+        let curReal = 1;
+        let curImag = 0;
+        for (let k = 0; k < halfSize; k++) {
+          const evenIdx = i + k;
+          const oddIdx = i + k + halfSize;
+          const tReal = curReal * real[oddIdx] - curImag * imag[oddIdx];
+          const tImag = curReal * imag[oddIdx] + curImag * real[oddIdx];
+          real[oddIdx] = real[evenIdx] - tReal;
+          imag[oddIdx] = imag[evenIdx] - tImag;
+          real[evenIdx] += tReal;
+          imag[evenIdx] += tImag;
+          const newCurReal = curReal * wReal - curImag * wImag;
+          curImag = curReal * wImag + curImag * wReal;
+          curReal = newCurReal;
+        }
+      }
+    }
+    const numBins = n / 2 + 1;
+    const power = new Float32Array(numBins);
+    for (let i = 0; i < numBins; i++) {
+      power[i] = (real[i] * real[i] + imag[i] * imag[i]) / n;
+    }
+    return power;
+  }
+  /**
+   * Create Mel filterbank matrix.
+   *
+   * Produces nMels triangular filters spanning the frequency range
+   * from 0 to sampleRate/2 on the Mel scale.
+   */
+  createMelFilterbank() {
+    let n = 1;
+    while (n < this.nFft) n *= 2;
+    const fftBins = n / 2 + 1;
+    const fMin = 0;
+    const fMax = this.sampleRate / 2;
+    const melMin = this.hzToMel(fMin);
+    const melMax = this.hzToMel(fMax);
+    const melPoints = new Float32Array(this.nMels + 2);
+    for (let i = 0; i < this.nMels + 2; i++) {
+      melPoints[i] = melMin + i * (melMax - melMin) / (this.nMels + 1);
+    }
+    const binIndices = new Float32Array(this.nMels + 2);
+    for (let i = 0; i < this.nMels + 2; i++) {
+      const hz = this.melToHz(melPoints[i]);
+      binIndices[i] = Math.floor((n + 1) * hz / this.sampleRate);
+    }
+    const filters = [];
+    for (let m = 0; m < this.nMels; m++) {
+      const filter = new Float32Array(fftBins);
+      const left = binIndices[m];
+      const center = binIndices[m + 1];
+      const right = binIndices[m + 2];
+      for (let k = 0; k < fftBins; k++) {
+        if (k >= left && k <= center && center > left) {
+          filter[k] = (k - left) / (center - left);
+        } else if (k > center && k <= right && right > center) {
+          filter[k] = (right - k) / (right - center);
+        }
+      }
+      filters.push(filter);
+    }
+    return filters;
+  }
+  /**
+   * Create DCT-II matrix for MFCC computation.
+   */
+  createDCTMatrix() {
+    const matrix = [];
+    const scale = Math.sqrt(2 / this.nMels);
+    for (let i = 0; i < this.nMfcc; i++) {
+      const row = new Float32Array(this.nMels);
+      for (let j = 0; j < this.nMels; j++) {
+        row[j] = scale * Math.cos(Math.PI * i * (j + 0.5) / this.nMels);
+      }
+      matrix.push(row);
+    }
+    return matrix;
+  }
+  hzToMel(hz) {
+    return 2595 * Math.log10(1 + hz / 700);
   }
+  melToHz(mel) {
+    return 700 * (Math.pow(10, mel / 2595) - 1);
+  }
+  /**
+   * Reset internal state (energy buffer, pause accumulator).
+   */
+  reset() {
+    this.silenceAccumulator = 0;
+    this.energyBuffer.fill(0);
+    this.energyBufferIdx = 0;
+    this.energyBufferFull = false;
+  }
+};
+
+// src/types.ts
+var MODEL_CDN_URL = "https://pub-46a5feb0029246bcbc93fab6162cff94.r2.dev/v0.0.2/utterance-v1.onnx";
+var DEFAULT_OPTIONS = {
+  sensitivity: 0.5,
+  pauseTolerance: 1500,
+  modelPath: "cdn",
+  sampleRate: 16e3
 };
 
 // src/model/energy-vad.ts
@@ -151,45 +393,187 @@ var EnergyVAD = class {
 };
 
 // src/model/onnx.ts
+var LABELS = [
+  "speaking",
+  "thinking_pause",
+  "turn_complete",
+  "interrupt_intent"
+];
+var FEATURE_DIM = 17;
+var CONTEXT_FRAMES = 100;
+var INFERENCE_INTERVAL = 10;
 var ONNXModel = class {
   session = null;
+  ort = null;
   fallback;
-
+  useWebGpu;
+  /** Circular buffer of feature vectors for the context window. */
+  frameBuffer;
+  bufferIdx = 0;
+  framesBuffered = 0;
+  framesSinceInference = 0;
+  /** Cache the last inference result for frames between batches. */
+  lastResult = null;
+  constructor(sensitivity = 0.5, useWebGpu = false) {
     this.fallback = new EnergyVAD(sensitivity);
+    this.useWebGpu = useWebGpu;
+    this.frameBuffer = new Float32Array(CONTEXT_FRAMES * FEATURE_DIM);
   }
   /**
-   * Load the ONNX model from
+   * Load the ONNX model from CDN, bundled path, or custom URL.
+   *
+   * Dynamically imports onnxruntime-web to avoid bundling it
+   * when the model isn't used (tree-shaking friendly).
    *
-   *
-   * 1. Import onnxruntime-web InferenceSession
-   * 2. Load model bytes
-   * 3. Create session with appropriate execution providers
+   * @param path - "cdn" (default, loads from Cloudflare R2), "bundled" (from npm package), or a custom URL.
    */
-  async load(
-
+  async load(path) {
+    try {
+      const ort = await import("onnxruntime-web");
+      this.ort = ort;
+      let modelSource = path;
+      if (path === "cdn") {
+        try {
+          const response = await fetch(MODEL_CDN_URL);
+          if (response.ok) {
+            modelSource = await response.arrayBuffer();
+          } else {
+            throw new Error(`Failed to fetch CDN model: ${response.status}`);
+          }
+        } catch {
+          console.warn("[utterance] CDN model unavailable, falling back to EnergyVAD");
+          this.session = null;
+          return;
+        }
+      } else if (path === "bundled") {
+        try {
+          const getUrl = new Function("p", "b", "return new URL(p, b).href");
+          const href = getUrl("../../models/utterance-v1.onnx", import.meta.url);
+          const response = await fetch(href);
+          if (response.ok) {
+            modelSource = await response.arrayBuffer();
+          } else {
+            throw new Error(`Failed to fetch bundled model: ${response.status}`);
+          }
+        } catch {
+          console.warn("[utterance] Bundled model not found, falling back to EnergyVAD");
+          this.session = null;
+          return;
+        }
+      }
+      const providers = this.useWebGpu ? ["webgpu", "wasm"] : ["wasm"];
+      this.session = await ort.InferenceSession.create(modelSource, {
+        executionProviders: providers
+      });
+    } catch (err) {
+      console.warn("[utterance] Failed to load ONNX model, falling back to EnergyVAD:", err);
+      this.session = null;
+    }
   }
   /**
-   * Run inference on
+   * Run inference on extracted features.
    *
-   *
-   *
-   *
-   * 3. Parse output into ClassificationResult
+   * Buffers frames into a sliding window and runs the ONNX model
+   * every 100ms (10 frames). Between inference runs, returns the
+   * cached result. Falls back to EnergyVAD when no model is loaded.
    */
   async predict(features) {
-    if (!this.session) {
+    if (!this.session || !this.ort) {
       return this.fallback.classify(features);
     }
-
+    this.addFrame(features);
+    this.framesSinceInference++;
+    if (this.framesSinceInference >= INFERENCE_INTERVAL && this.framesBuffered >= CONTEXT_FRAMES) {
+      this.framesSinceInference = 0;
+      try {
+        this.lastResult = await this.runInference();
+      } catch (err) {
+        console.warn("[utterance] ONNX inference failed, using EnergyVAD:", err);
+        return this.fallback.classify(features);
+      }
+    }
+    return this.lastResult ?? this.fallback.classify(features);
   }
   /**
    * Release model resources.
    */
   dispose() {
+    if (this.session) {
+      this.session.release().catch(() => {
+      });
+    }
     this.session = null;
+    this.ort = null;
     this.fallback.reset();
+    this.resetBuffer();
+  }
+  /**
+   * Add a feature frame to the circular buffer.
+   */
+  addFrame(features) {
+    const offset = this.bufferIdx * FEATURE_DIM;
+    this.frameBuffer.set(features.mfcc, offset);
+    this.frameBuffer[offset + 13] = features.energy;
+    this.frameBuffer[offset + 14] = features.pitch;
+    this.frameBuffer[offset + 15] = features.speechRate;
+    this.frameBuffer[offset + 16] = features.pauseDuration;
+    this.bufferIdx = (this.bufferIdx + 1) % CONTEXT_FRAMES;
+    if (this.framesBuffered < CONTEXT_FRAMES) {
+      this.framesBuffered++;
+    }
+  }
+  /**
+   * Build the input tensor from the circular buffer and run ONNX inference.
+   */
+  async runInference() {
+    const ort = this.ort;
+    const session = this.session;
+    const input = new Float32Array(CONTEXT_FRAMES * FEATURE_DIM);
+    for (let i = 0; i < CONTEXT_FRAMES; i++) {
+      const srcIdx = (this.bufferIdx - CONTEXT_FRAMES + i + CONTEXT_FRAMES) % CONTEXT_FRAMES * FEATURE_DIM;
+      const dstIdx = i * FEATURE_DIM;
+      input.set(this.frameBuffer.subarray(srcIdx, srcIdx + FEATURE_DIM), dstIdx);
+    }
+    const tensor = new ort.Tensor("float32", input, [1, CONTEXT_FRAMES, FEATURE_DIM]);
+    const results = await session.run({ input: tensor });
+    const output = results.output;
+    const logits = output.data;
+    const probs = softmax(logits);
+    let bestIdx = 0;
+    let bestProb = probs[0];
+    for (let i = 1; i < probs.length; i++) {
+      if (probs[i] > bestProb) {
+        bestProb = probs[i];
+        bestIdx = i;
+      }
+    }
+    return {
+      label: LABELS[bestIdx],
+      confidence: bestProb,
+      timestamp: Date.now()
+    };
+  }
+  resetBuffer() {
+    this.frameBuffer.fill(0);
+    this.bufferIdx = 0;
+    this.framesBuffered = 0;
+    this.framesSinceInference = 0;
+    this.lastResult = null;
   }
 };
+function softmax(logits) {
+  const max = logits.reduce((a, b) => Math.max(a, b), -Infinity);
+  const exps = new Float32Array(logits.length);
+  let sum = 0;
+  for (let i = 0; i < logits.length; i++) {
+    exps[i] = Math.exp(logits[i] - max);
+    sum += exps[i];
+  }
+  for (let i = 0; i < exps.length; i++) {
+    exps[i] /= sum;
+  }
+  return exps;
+}
 
 // src/detector/turn-detector.ts
 var TurnDetector = class {
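This ESM bundle carries the same logic as `index.cjs` above, with one divergence worth noting: here the "bundled" branch reads `import.meta.url` directly, while the CJS build substitutes the empty `var import_meta = {}` shim. A condensed look at the two variants, copied from the diffs above with the consequence spelled out:

```ts
// ESM build (dist/index.js): the "bundled" branch resolves the model
// relative to the module itself.
const esmHref = new URL("../../models/utterance-v1.onnx", import.meta.url).href;

// CJS build (dist/index.cjs): import_meta is the empty shim `{}`, so
// import_meta.url is undefined, new URL(...) throws inside the try block,
// and the catch falls through to the EnergyVAD warning. The "cdn" default
// and custom URLs are unaffected.
```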
@@ -280,14 +664,6 @@ var TurnDetector = class {
   }
 };
 
-// src/types.ts
-var DEFAULT_OPTIONS = {
-  sensitivity: 0.5,
-  pauseTolerance: 1500,
-  modelPath: "bundled",
-  sampleRate: 16e3
-};
-
 // src/utterance.ts
 var Utterance = class {
   options;
package/models/utterance-v1.onnx
CHANGED
Binary file
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@utterance/core",
-  "version": "0.0.1",
+  "version": "0.0.3",
   "description": "Client-side semantic endpointing. Know when they're done talking.",
   "type": "module",
   "main": "dist/index.cjs",
@@ -76,6 +76,7 @@
   "dependencies": {
     "@next/third-parties": "^16.1.6",
     "@react-three/fiber": "^9.5.0",
+    "@utterance/core": "^0.0.2",
     "class-variance-authority": "^0.7.1",
     "clsx": "^2.1.1",
     "fumadocs-core": "^16.6.3",