@utterance/core 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
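
The most visible changes for consumers: the feature extractor and ONNX classifier, which were TODO stubs in 0.0.1, are now implemented, and the default model source moved from "bundled" to "cdn". A minimal usage sketch in TypeScript (the Utterance constructor call is assumed from the UtteranceOptions typings further down in this diff, not confirmed by it):

    import { Utterance } from "@utterance/core";

    // Values below mirror DEFAULT_OPTIONS from src/types.ts.
    const utterance = new Utterance({
      sensitivity: 0.5,      // detection sensitivity
      pauseTolerance: 1500,  // ms of thinking pause before turnEnd
      modelPath: "cdn",      // new default; also "bundled" or a custom URL
      sampleRate: 16000,     // audio sample rate in Hz
    });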
package/dist/index.cjs CHANGED
@@ -1,7 +1,9 @@
  "use strict";
+ var __create = Object.create;
  var __defProp = Object.defineProperty;
  var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
  var __getOwnPropNames = Object.getOwnPropertyNames;
+ var __getProtoOf = Object.getPrototypeOf;
  var __hasOwnProp = Object.prototype.hasOwnProperty;
  var __export = (target, all) => {
  for (var name in all)
@@ -15,6 +17,14 @@ var __copyProps = (to, from, except, desc) => {
  }
  return to;
  };
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
+ // If the importer is in node compatibility mode or this is not an ESM
+ // file that has been converted to a CommonJS file using a Babel-
+ // compatible transform (i.e. "__esModule" has not been set), then set
+ // "default" to the CommonJS "module.exports" for node compatibility.
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
+ mod
+ ));
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);

  // src/index.ts
@@ -70,35 +80,82 @@ var AudioCapture = class {
  // src/features/extractor.ts
  var FeatureExtractor = class {
  sampleRate;
+ nFft;
+ nMels;
+ nMfcc;
+ // Pre-computed DSP tables
+ hammingWindow;
+ melFilterbank;
+ dctMatrix;
+ // State for pause duration tracking
+ silenceAccumulator = 0;
+ silenceThreshold = 0.01;
+ frameDurationSec;
+ // State for speech rate (rolling energy buffer)
+ energyBuffer;
+ energyBufferIdx = 0;
+ energyBufferFull = false;
  constructor(sampleRate = 16e3) {
  this.sampleRate = sampleRate;
+ this.nFft = Math.floor(sampleRate * 0.025);
+ this.nMels = 40;
+ this.nMfcc = 13;
+ this.frameDurationSec = 0.01;
+ this.hammingWindow = new Float32Array(this.nFft);
+ for (let i = 0; i < this.nFft; i++) {
+ this.hammingWindow[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (this.nFft - 1));
+ }
+ this.melFilterbank = this.createMelFilterbank();
+ this.dctMatrix = this.createDCTMatrix();
+ const framesPerSecond = Math.floor(1 / this.frameDurationSec);
+ this.energyBuffer = new Float32Array(framesPerSecond);
  }
  /**
  * Extract all features from a single audio frame.
  */
  extract(frame) {
- return {
- mfcc: this.computeMFCC(frame),
- energy: this.computeEnergy(frame),
- pitch: this.estimatePitch(frame),
- speechRate: this.estimateSpeechRate(frame),
- pauseDuration: 0
- // tracked by the detector over time
- };
+ const energy = this.computeEnergy(frame);
+ const mfcc = this.computeMFCC(frame);
+ const pitch = this.estimatePitch(frame);
+ const speechRate = this.estimateSpeechRate(energy);
+ const pauseDuration = this.updatePauseDuration(energy);
+ return { mfcc, energy, pitch, speechRate, pauseDuration };
  }
  /**
  * Compute Mel-Frequency Cepstral Coefficients.
  *
- * TODO: Implement full MFCC pipeline:
- * 1. Pre-emphasis filter
- * 2. Windowing (Hamming)
- * 3. FFT
- * 4. Mel filterbank
- * 5. Log energy
- * 6. DCT
+ * Pipeline: Pre-emphasis → Hamming window → FFT → Mel filterbank → log → DCT
  */
- computeMFCC(_frame) {
- return new Float32Array(13);
+ computeMFCC(frame) {
+ const preEmph = new Float32Array(this.nFft);
+ const len = Math.min(frame.length, this.nFft);
+ preEmph[0] = frame[0];
+ for (let i = 1; i < len; i++) {
+ preEmph[i] = frame[i] - 0.97 * frame[i - 1];
+ }
+ for (let i = 0; i < this.nFft; i++) {
+ preEmph[i] *= this.hammingWindow[i];
+ }
+ const spectrum = this.fftMagnitude(preEmph);
+ const melEnergies = new Float32Array(this.nMels);
+ for (let m = 0; m < this.nMels; m++) {
+ let sum = 0;
+ const filter = this.melFilterbank[m];
+ for (let k = 0; k < filter.length; k++) {
+ sum += spectrum[k] * filter[k];
+ }
+ melEnergies[m] = Math.log(Math.max(sum, 1e-10));
+ }
+ const mfcc = new Float32Array(this.nMfcc);
+ for (let i = 0; i < this.nMfcc; i++) {
+ let sum = 0;
+ const dctRow = this.dctMatrix[i];
+ for (let j = 0; j < this.nMels; j++) {
+ sum += dctRow[j] * melEnergies[j];
+ }
+ mfcc[i] = sum;
+ }
+ return mfcc;
  }
  /**
  * Compute RMS energy of the frame.
@@ -111,22 +168,217 @@ var FeatureExtractor = class {
  return Math.sqrt(sum / frame.length);
  }
  /**
- * Estimate fundamental frequency (pitch) using autocorrelation.
+ * Estimate fundamental frequency (pitch) using simplified autocorrelation.
+ *
+ * Looks for the dominant periodicity in the signal within the
+ * speech frequency range (50-500 Hz). Returns 0 for unvoiced frames.
+ */
+ estimatePitch(frame) {
+ const minPeriod = Math.floor(this.sampleRate / 500);
+ const maxPeriod = Math.floor(this.sampleRate / 50);
+ const len = Math.min(frame.length, this.nFft);
+ if (len < maxPeriod * 2) return 0;
+ let bestCorr = 0;
+ let bestLag = 0;
+ let energy = 0;
+ for (let i = 0; i < len; i++) {
+ energy += frame[i] * frame[i];
+ }
+ if (energy < 1e-10) return 0;
+ for (let lag = minPeriod; lag <= maxPeriod && lag < len; lag++) {
+ let corr = 0;
+ let energyLag = 0;
+ const limit = len - lag;
+ for (let i = 0; i < limit; i++) {
+ corr += frame[i] * frame[i + lag];
+ energyLag += frame[i + lag] * frame[i + lag];
+ }
+ const norm = Math.sqrt(energy * energyLag);
+ if (norm > 0) {
+ corr /= norm;
+ }
+ if (corr > bestCorr) {
+ bestCorr = corr;
+ bestLag = lag;
+ }
+ }
+ if (bestCorr < 0.3 || bestLag === 0) return 0;
+ return this.sampleRate / bestLag;
+ }
+ /**
+ * Estimate speech rate from rolling energy envelope.
  *
- * TODO: Implement YIN or autocorrelation-based pitch detection.
+ * Counts energy peaks in a 1-second sliding window.
+ * Returns a normalized value (~0-1 range, where 0.3-0.7 is typical speech).
  */
- estimatePitch(_frame) {
- void this.sampleRate;
- return 0;
+ estimateSpeechRate(energy) {
+ this.energyBuffer[this.energyBufferIdx] = energy;
+ this.energyBufferIdx = (this.energyBufferIdx + 1) % this.energyBuffer.length;
+ if (this.energyBufferIdx === 0) this.energyBufferFull = true;
+ const len = this.energyBufferFull ? this.energyBuffer.length : this.energyBufferIdx;
+ if (len < 5) return 0;
+ let peaks = 0;
+ const threshold = this.silenceThreshold * 0.5;
+ for (let i = 2; i < len - 2; i++) {
+ const idx = (this.energyBufferIdx - len + i + this.energyBuffer.length) % this.energyBuffer.length;
+ const prev = this.energyBuffer[(idx - 1 + this.energyBuffer.length) % this.energyBuffer.length];
+ const curr = this.energyBuffer[idx];
+ const next = this.energyBuffer[(idx + 1) % this.energyBuffer.length];
+ if (curr > prev && curr > next && curr > threshold) {
+ peaks++;
+ }
+ }
+ const windowDuration = len * this.frameDurationSec;
+ const rate = windowDuration > 0 ? peaks / windowDuration : 0;
+ return rate / 10;
  }
  /**
- * Estimate speech rate (syllables per second).
+ * Track accumulated pause duration.
  *
- * TODO: Implement energy-envelope peak counting.
+ * Returns pause duration in seconds, capped at 5s and normalized to [0, 1].
  */
- estimateSpeechRate(_frame) {
- return 0;
+ updatePauseDuration(energy) {
+ if (energy < this.silenceThreshold) {
+ this.silenceAccumulator += this.frameDurationSec;
+ } else {
+ this.silenceAccumulator = 0;
+ }
+ return Math.min(this.silenceAccumulator, 5) / 5;
+ }
+ /**
+ * Compute FFT magnitude spectrum (power spectrum).
+ *
+ * Uses a radix-2 DIT FFT implementation. For frames smaller than
+ * nFft, zero-pads to the next power of 2.
+ */
+ fftMagnitude(signal) {
+ let n = 1;
+ while (n < signal.length) n *= 2;
+ const real = new Float32Array(n);
+ const imag = new Float32Array(n);
+ real.set(signal);
+ let j = 0;
+ for (let i = 0; i < n; i++) {
+ if (i < j) {
+ [real[i], real[j]] = [real[j], real[i]];
+ [imag[i], imag[j]] = [imag[j], imag[i]];
+ }
+ let m = n >> 1;
+ while (m >= 1 && j >= m) {
+ j -= m;
+ m >>= 1;
+ }
+ j += m;
+ }
+ for (let size = 2; size <= n; size *= 2) {
+ const halfSize = size / 2;
+ const angle = -2 * Math.PI / size;
+ const wReal = Math.cos(angle);
+ const wImag = Math.sin(angle);
+ for (let i = 0; i < n; i += size) {
+ let curReal = 1;
+ let curImag = 0;
+ for (let k = 0; k < halfSize; k++) {
+ const evenIdx = i + k;
+ const oddIdx = i + k + halfSize;
+ const tReal = curReal * real[oddIdx] - curImag * imag[oddIdx];
+ const tImag = curReal * imag[oddIdx] + curImag * real[oddIdx];
+ real[oddIdx] = real[evenIdx] - tReal;
+ imag[oddIdx] = imag[evenIdx] - tImag;
+ real[evenIdx] += tReal;
+ imag[evenIdx] += tImag;
+ const newCurReal = curReal * wReal - curImag * wImag;
+ curImag = curReal * wImag + curImag * wReal;
+ curReal = newCurReal;
+ }
+ }
+ }
+ const numBins = n / 2 + 1;
+ const power = new Float32Array(numBins);
+ for (let i = 0; i < numBins; i++) {
+ power[i] = (real[i] * real[i] + imag[i] * imag[i]) / n;
+ }
+ return power;
+ }
+ /**
+ * Create Mel filterbank matrix.
+ *
+ * Produces nMels triangular filters spanning the frequency range
+ * from 0 to sampleRate/2 on the Mel scale.
+ */
+ createMelFilterbank() {
+ let n = 1;
+ while (n < this.nFft) n *= 2;
+ const fftBins = n / 2 + 1;
+ const fMin = 0;
+ const fMax = this.sampleRate / 2;
+ const melMin = this.hzToMel(fMin);
+ const melMax = this.hzToMel(fMax);
+ const melPoints = new Float32Array(this.nMels + 2);
+ for (let i = 0; i < this.nMels + 2; i++) {
+ melPoints[i] = melMin + i * (melMax - melMin) / (this.nMels + 1);
+ }
+ const binIndices = new Float32Array(this.nMels + 2);
+ for (let i = 0; i < this.nMels + 2; i++) {
+ const hz = this.melToHz(melPoints[i]);
+ binIndices[i] = Math.floor((n + 1) * hz / this.sampleRate);
+ }
+ const filters = [];
+ for (let m = 0; m < this.nMels; m++) {
+ const filter = new Float32Array(fftBins);
+ const left = binIndices[m];
+ const center = binIndices[m + 1];
+ const right = binIndices[m + 2];
+ for (let k = 0; k < fftBins; k++) {
+ if (k >= left && k <= center && center > left) {
+ filter[k] = (k - left) / (center - left);
+ } else if (k > center && k <= right && right > center) {
+ filter[k] = (right - k) / (right - center);
+ }
+ }
+ filters.push(filter);
+ }
+ return filters;
+ }
+ /**
+ * Create DCT-II matrix for MFCC computation.
+ */
+ createDCTMatrix() {
+ const matrix = [];
+ const scale = Math.sqrt(2 / this.nMels);
+ for (let i = 0; i < this.nMfcc; i++) {
+ const row = new Float32Array(this.nMels);
+ for (let j = 0; j < this.nMels; j++) {
+ row[j] = scale * Math.cos(Math.PI * i * (j + 0.5) / this.nMels);
+ }
+ matrix.push(row);
+ }
+ return matrix;
+ }
+ hzToMel(hz) {
+ return 2595 * Math.log10(1 + hz / 700);
  }
+ melToHz(mel) {
+ return 700 * (Math.pow(10, mel / 2595) - 1);
+ }
+ /**
+ * Reset internal state (energy buffer, pause accumulator).
+ */
+ reset() {
+ this.silenceAccumulator = 0;
+ this.energyBuffer.fill(0);
+ this.energyBufferIdx = 0;
+ this.energyBufferFull = false;
+ }
+ };
+
+ // src/types.ts
+ var MODEL_CDN_URL = "https://pub-46a5feb0029246bcbc93fab6162cff94.r2.dev/v0.0.2/utterance-v1.onnx";
+ var DEFAULT_OPTIONS = {
+ sensitivity: 0.5,
+ pauseTolerance: 1500,
+ modelPath: "cdn",
+ sampleRate: 16e3
  };

  // src/model/energy-vad.ts
@@ -177,45 +429,188 @@ var EnergyVAD = class {
  };

  // src/model/onnx.ts
+ var import_meta = {};
+ var LABELS = [
+ "speaking",
+ "thinking_pause",
+ "turn_complete",
+ "interrupt_intent"
+ ];
+ var FEATURE_DIM = 17;
+ var CONTEXT_FRAMES = 100;
+ var INFERENCE_INTERVAL = 10;
  var ONNXModel = class {
  session = null;
+ ort = null;
  fallback;
- constructor(sensitivity = 0.5) {
+ useWebGpu;
+ /** Circular buffer of feature vectors for the context window. */
+ frameBuffer;
+ bufferIdx = 0;
+ framesBuffered = 0;
+ framesSinceInference = 0;
+ /** Cache the last inference result for frames between batches. */
+ lastResult = null;
+ constructor(sensitivity = 0.5, useWebGpu = false) {
  this.fallback = new EnergyVAD(sensitivity);
+ this.useWebGpu = useWebGpu;
+ this.frameBuffer = new Float32Array(CONTEXT_FRAMES * FEATURE_DIM);
  }
  /**
- * Load the ONNX model from a given path or URL.
+ * Load the ONNX model from CDN, bundled path, or custom URL.
+ *
+ * Dynamically imports onnxruntime-web to avoid bundling it
+ * when the model isn't used (tree-shaking friendly).
  *
- * TODO:
- * 1. Import onnxruntime-web InferenceSession
- * 2. Load model bytes
- * 3. Create session with appropriate execution providers
+ * @param path - "cdn" (default, loads from Cloudflare R2), "bundled" (from npm package), or a custom URL.
  */
- async load(_path) {
- this.session = null;
+ async load(path) {
+ try {
+ const ort = await import("onnxruntime-web");
+ this.ort = ort;
+ let modelSource = path;
+ if (path === "cdn") {
+ try {
+ const response = await fetch(MODEL_CDN_URL);
+ if (response.ok) {
+ modelSource = await response.arrayBuffer();
+ } else {
+ throw new Error(`Failed to fetch CDN model: ${response.status}`);
+ }
+ } catch {
+ console.warn("[utterance] CDN model unavailable, falling back to EnergyVAD");
+ this.session = null;
+ return;
+ }
+ } else if (path === "bundled") {
+ try {
+ const getUrl = new Function("p", "b", "return new URL(p, b).href");
+ const href = getUrl("../../models/utterance-v1.onnx", import_meta.url);
+ const response = await fetch(href);
+ if (response.ok) {
+ modelSource = await response.arrayBuffer();
+ } else {
+ throw new Error(`Failed to fetch bundled model: ${response.status}`);
+ }
+ } catch {
+ console.warn("[utterance] Bundled model not found, falling back to EnergyVAD");
+ this.session = null;
+ return;
+ }
+ }
+ const providers = this.useWebGpu ? ["webgpu", "wasm"] : ["wasm"];
+ this.session = await ort.InferenceSession.create(modelSource, {
+ executionProviders: providers
+ });
+ } catch (err) {
+ console.warn("[utterance] Failed to load ONNX model, falling back to EnergyVAD:", err);
+ this.session = null;
+ }
  }
  /**
- * Run inference on a set of extracted features.
+ * Run inference on extracted features.
  *
- * TODO:
- * 1. Build input tensor from AudioFeatures
- * 2. Run session.run()
- * 3. Parse output into ClassificationResult
+ * Buffers frames into a sliding window and runs the ONNX model
+ * every 100ms (10 frames). Between inference runs, returns the
+ * cached result. Falls back to EnergyVAD when no model is loaded.
  */
  async predict(features) {
- if (!this.session) {
+ if (!this.session || !this.ort) {
  return this.fallback.classify(features);
  }
- return this.fallback.classify(features);
+ this.addFrame(features);
+ this.framesSinceInference++;
+ if (this.framesSinceInference >= INFERENCE_INTERVAL && this.framesBuffered >= CONTEXT_FRAMES) {
+ this.framesSinceInference = 0;
+ try {
+ this.lastResult = await this.runInference();
+ } catch (err) {
+ console.warn("[utterance] ONNX inference failed, using EnergyVAD:", err);
+ return this.fallback.classify(features);
+ }
+ }
+ return this.lastResult ?? this.fallback.classify(features);
  }
  /**
  * Release model resources.
  */
  dispose() {
+ if (this.session) {
+ this.session.release().catch(() => {
+ });
+ }
  this.session = null;
+ this.ort = null;
  this.fallback.reset();
+ this.resetBuffer();
+ }
+ /**
+ * Add a feature frame to the circular buffer.
+ */
+ addFrame(features) {
+ const offset = this.bufferIdx * FEATURE_DIM;
+ this.frameBuffer.set(features.mfcc, offset);
+ this.frameBuffer[offset + 13] = features.energy;
+ this.frameBuffer[offset + 14] = features.pitch;
+ this.frameBuffer[offset + 15] = features.speechRate;
+ this.frameBuffer[offset + 16] = features.pauseDuration;
+ this.bufferIdx = (this.bufferIdx + 1) % CONTEXT_FRAMES;
+ if (this.framesBuffered < CONTEXT_FRAMES) {
+ this.framesBuffered++;
+ }
+ }
+ /**
+ * Build the input tensor from the circular buffer and run ONNX inference.
+ */
+ async runInference() {
+ const ort = this.ort;
+ const session = this.session;
+ const input = new Float32Array(CONTEXT_FRAMES * FEATURE_DIM);
+ for (let i = 0; i < CONTEXT_FRAMES; i++) {
+ const srcIdx = (this.bufferIdx - CONTEXT_FRAMES + i + CONTEXT_FRAMES) % CONTEXT_FRAMES * FEATURE_DIM;
+ const dstIdx = i * FEATURE_DIM;
+ input.set(this.frameBuffer.subarray(srcIdx, srcIdx + FEATURE_DIM), dstIdx);
+ }
+ const tensor = new ort.Tensor("float32", input, [1, CONTEXT_FRAMES, FEATURE_DIM]);
+ const results = await session.run({ input: tensor });
+ const output = results.output;
+ const logits = output.data;
+ const probs = softmax(logits);
+ let bestIdx = 0;
+ let bestProb = probs[0];
+ for (let i = 1; i < probs.length; i++) {
+ if (probs[i] > bestProb) {
+ bestProb = probs[i];
+ bestIdx = i;
+ }
+ }
+ return {
+ label: LABELS[bestIdx],
+ confidence: bestProb,
+ timestamp: Date.now()
+ };
+ }
+ resetBuffer() {
+ this.frameBuffer.fill(0);
+ this.bufferIdx = 0;
+ this.framesBuffered = 0;
+ this.framesSinceInference = 0;
+ this.lastResult = null;
  }
  };
+ function softmax(logits) {
+ const max = logits.reduce((a, b) => Math.max(a, b), -Infinity);
+ const exps = new Float32Array(logits.length);
+ let sum = 0;
+ for (let i = 0; i < logits.length; i++) {
+ exps[i] = Math.exp(logits[i] - max);
+ sum += exps[i];
+ }
+ for (let i = 0; i < exps.length; i++) {
+ exps[i] /= sum;
+ }
+ return exps;
+ }

  // src/detector/turn-detector.ts
  var TurnDetector = class {
@@ -306,14 +701,6 @@ var TurnDetector = class {
  }
  };

- // src/types.ts
- var DEFAULT_OPTIONS = {
- sensitivity: 0.5,
- pauseTolerance: 1500,
- modelPath: "bundled",
- sampleRate: 16e3
- };
-
  // src/utterance.ts
  var Utterance = class {
  options;
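
For orientation on the new model interface: addFrame() packs each 10 ms frame into a 17-dimensional vector (FEATURE_DIM), runInference() feeds the last 100 frames (CONTEXT_FRAMES, a 1-second context window) to the model as a [1, 100, 17] tensor, and inference re-runs every 10 frames, i.e. roughly every 100 ms. A sketch of the per-frame layout follows; the AudioFeatures interface name is an assumption, since src/types.ts appears only partially in this diff:

    // Hypothetical typing of the feature object consumed by addFrame().
    interface AudioFeatures {
      mfcc: Float32Array;    // 13 coefficients, buffer indices 0-12
      energy: number;        // RMS energy, index 13
      pitch: number;         // Hz, 0 when unvoiced, index 14
      speechRate: number;    // normalized ~0-1, index 15
      pauseDuration: number; // normalized to [0, 1] with a 5 s cap, index 16
    }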
package/dist/index.d.cts CHANGED
@@ -9,7 +9,7 @@ interface UtteranceOptions {
  sensitivity?: number;
  /** Max thinking pause duration (ms) before triggering turnEnd. Default: 1500 */
  pauseTolerance?: number;
- /** Path to a custom ONNX model. Default: bundled model */
+ /** Model source: "cdn" (default), "bundled", or a custom URL. */
  modelPath?: string;
  /** Audio sample rate in Hz. Default: 16000 */
  sampleRate?: number;
package/dist/index.d.ts CHANGED
@@ -9,7 +9,7 @@ interface UtteranceOptions {
  sensitivity?: number;
  /** Max thinking pause duration (ms) before triggering turnEnd. Default: 1500 */
  pauseTolerance?: number;
- /** Path to a custom ONNX model. Default: bundled model */
+ /** Model source: "cdn" (default), "bundled", or a custom URL. */
  modelPath?: string;
  /** Audio sample rate in Hz. Default: 16000 */
  sampleRate?: number;
package/dist/index.js CHANGED
@@ -44,35 +44,82 @@ var AudioCapture = class {
  // src/features/extractor.ts
  var FeatureExtractor = class {
  sampleRate;
+ nFft;
+ nMels;
+ nMfcc;
+ // Pre-computed DSP tables
+ hammingWindow;
+ melFilterbank;
+ dctMatrix;
+ // State for pause duration tracking
+ silenceAccumulator = 0;
+ silenceThreshold = 0.01;
+ frameDurationSec;
+ // State for speech rate (rolling energy buffer)
+ energyBuffer;
+ energyBufferIdx = 0;
+ energyBufferFull = false;
  constructor(sampleRate = 16e3) {
  this.sampleRate = sampleRate;
+ this.nFft = Math.floor(sampleRate * 0.025);
+ this.nMels = 40;
+ this.nMfcc = 13;
+ this.frameDurationSec = 0.01;
+ this.hammingWindow = new Float32Array(this.nFft);
+ for (let i = 0; i < this.nFft; i++) {
+ this.hammingWindow[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (this.nFft - 1));
+ }
+ this.melFilterbank = this.createMelFilterbank();
+ this.dctMatrix = this.createDCTMatrix();
+ const framesPerSecond = Math.floor(1 / this.frameDurationSec);
+ this.energyBuffer = new Float32Array(framesPerSecond);
  }
  /**
  * Extract all features from a single audio frame.
  */
  extract(frame) {
- return {
- mfcc: this.computeMFCC(frame),
- energy: this.computeEnergy(frame),
- pitch: this.estimatePitch(frame),
- speechRate: this.estimateSpeechRate(frame),
- pauseDuration: 0
- // tracked by the detector over time
- };
+ const energy = this.computeEnergy(frame);
+ const mfcc = this.computeMFCC(frame);
+ const pitch = this.estimatePitch(frame);
+ const speechRate = this.estimateSpeechRate(energy);
+ const pauseDuration = this.updatePauseDuration(energy);
+ return { mfcc, energy, pitch, speechRate, pauseDuration };
  }
  /**
  * Compute Mel-Frequency Cepstral Coefficients.
  *
- * TODO: Implement full MFCC pipeline:
- * 1. Pre-emphasis filter
- * 2. Windowing (Hamming)
- * 3. FFT
- * 4. Mel filterbank
- * 5. Log energy
- * 6. DCT
+ * Pipeline: Pre-emphasis → Hamming window → FFT → Mel filterbank → log → DCT
  */
- computeMFCC(_frame) {
- return new Float32Array(13);
+ computeMFCC(frame) {
+ const preEmph = new Float32Array(this.nFft);
+ const len = Math.min(frame.length, this.nFft);
+ preEmph[0] = frame[0];
+ for (let i = 1; i < len; i++) {
+ preEmph[i] = frame[i] - 0.97 * frame[i - 1];
+ }
+ for (let i = 0; i < this.nFft; i++) {
+ preEmph[i] *= this.hammingWindow[i];
+ }
+ const spectrum = this.fftMagnitude(preEmph);
+ const melEnergies = new Float32Array(this.nMels);
+ for (let m = 0; m < this.nMels; m++) {
+ let sum = 0;
+ const filter = this.melFilterbank[m];
+ for (let k = 0; k < filter.length; k++) {
+ sum += spectrum[k] * filter[k];
+ }
+ melEnergies[m] = Math.log(Math.max(sum, 1e-10));
+ }
+ const mfcc = new Float32Array(this.nMfcc);
+ for (let i = 0; i < this.nMfcc; i++) {
+ let sum = 0;
+ const dctRow = this.dctMatrix[i];
+ for (let j = 0; j < this.nMels; j++) {
+ sum += dctRow[j] * melEnergies[j];
+ }
+ mfcc[i] = sum;
+ }
+ return mfcc;
  }
  /**
  * Compute RMS energy of the frame.
@@ -85,22 +132,217 @@ var FeatureExtractor = class {
  return Math.sqrt(sum / frame.length);
  }
  /**
- * Estimate fundamental frequency (pitch) using autocorrelation.
+ * Estimate fundamental frequency (pitch) using simplified autocorrelation.
+ *
+ * Looks for the dominant periodicity in the signal within the
+ * speech frequency range (50-500 Hz). Returns 0 for unvoiced frames.
+ */
+ estimatePitch(frame) {
+ const minPeriod = Math.floor(this.sampleRate / 500);
+ const maxPeriod = Math.floor(this.sampleRate / 50);
+ const len = Math.min(frame.length, this.nFft);
+ if (len < maxPeriod * 2) return 0;
+ let bestCorr = 0;
+ let bestLag = 0;
+ let energy = 0;
+ for (let i = 0; i < len; i++) {
+ energy += frame[i] * frame[i];
+ }
+ if (energy < 1e-10) return 0;
+ for (let lag = minPeriod; lag <= maxPeriod && lag < len; lag++) {
+ let corr = 0;
+ let energyLag = 0;
+ const limit = len - lag;
+ for (let i = 0; i < limit; i++) {
+ corr += frame[i] * frame[i + lag];
+ energyLag += frame[i + lag] * frame[i + lag];
+ }
+ const norm = Math.sqrt(energy * energyLag);
+ if (norm > 0) {
+ corr /= norm;
+ }
+ if (corr > bestCorr) {
+ bestCorr = corr;
+ bestLag = lag;
+ }
+ }
+ if (bestCorr < 0.3 || bestLag === 0) return 0;
+ return this.sampleRate / bestLag;
+ }
+ /**
+ * Estimate speech rate from rolling energy envelope.
  *
- * TODO: Implement YIN or autocorrelation-based pitch detection.
+ * Counts energy peaks in a 1-second sliding window.
+ * Returns a normalized value (~0-1 range, where 0.3-0.7 is typical speech).
  */
- estimatePitch(_frame) {
- void this.sampleRate;
- return 0;
+ estimateSpeechRate(energy) {
+ this.energyBuffer[this.energyBufferIdx] = energy;
+ this.energyBufferIdx = (this.energyBufferIdx + 1) % this.energyBuffer.length;
+ if (this.energyBufferIdx === 0) this.energyBufferFull = true;
+ const len = this.energyBufferFull ? this.energyBuffer.length : this.energyBufferIdx;
+ if (len < 5) return 0;
+ let peaks = 0;
+ const threshold = this.silenceThreshold * 0.5;
+ for (let i = 2; i < len - 2; i++) {
+ const idx = (this.energyBufferIdx - len + i + this.energyBuffer.length) % this.energyBuffer.length;
+ const prev = this.energyBuffer[(idx - 1 + this.energyBuffer.length) % this.energyBuffer.length];
+ const curr = this.energyBuffer[idx];
+ const next = this.energyBuffer[(idx + 1) % this.energyBuffer.length];
+ if (curr > prev && curr > next && curr > threshold) {
+ peaks++;
+ }
+ }
+ const windowDuration = len * this.frameDurationSec;
+ const rate = windowDuration > 0 ? peaks / windowDuration : 0;
+ return rate / 10;
  }
  /**
- * Estimate speech rate (syllables per second).
+ * Track accumulated pause duration.
  *
- * TODO: Implement energy-envelope peak counting.
+ * Returns pause duration in seconds, capped at 5s and normalized to [0, 1].
  */
- estimateSpeechRate(_frame) {
- return 0;
+ updatePauseDuration(energy) {
+ if (energy < this.silenceThreshold) {
+ this.silenceAccumulator += this.frameDurationSec;
+ } else {
+ this.silenceAccumulator = 0;
+ }
+ return Math.min(this.silenceAccumulator, 5) / 5;
+ }
+ /**
+ * Compute FFT magnitude spectrum (power spectrum).
+ *
+ * Uses a radix-2 DIT FFT implementation. For frames smaller than
+ * nFft, zero-pads to the next power of 2.
+ */
+ fftMagnitude(signal) {
+ let n = 1;
+ while (n < signal.length) n *= 2;
+ const real = new Float32Array(n);
+ const imag = new Float32Array(n);
+ real.set(signal);
+ let j = 0;
+ for (let i = 0; i < n; i++) {
+ if (i < j) {
+ [real[i], real[j]] = [real[j], real[i]];
+ [imag[i], imag[j]] = [imag[j], imag[i]];
+ }
+ let m = n >> 1;
+ while (m >= 1 && j >= m) {
+ j -= m;
+ m >>= 1;
+ }
+ j += m;
+ }
+ for (let size = 2; size <= n; size *= 2) {
+ const halfSize = size / 2;
+ const angle = -2 * Math.PI / size;
+ const wReal = Math.cos(angle);
+ const wImag = Math.sin(angle);
+ for (let i = 0; i < n; i += size) {
+ let curReal = 1;
+ let curImag = 0;
+ for (let k = 0; k < halfSize; k++) {
+ const evenIdx = i + k;
+ const oddIdx = i + k + halfSize;
+ const tReal = curReal * real[oddIdx] - curImag * imag[oddIdx];
+ const tImag = curReal * imag[oddIdx] + curImag * real[oddIdx];
+ real[oddIdx] = real[evenIdx] - tReal;
+ imag[oddIdx] = imag[evenIdx] - tImag;
+ real[evenIdx] += tReal;
+ imag[evenIdx] += tImag;
+ const newCurReal = curReal * wReal - curImag * wImag;
+ curImag = curReal * wImag + curImag * wReal;
+ curReal = newCurReal;
+ }
+ }
+ }
+ const numBins = n / 2 + 1;
+ const power = new Float32Array(numBins);
+ for (let i = 0; i < numBins; i++) {
+ power[i] = (real[i] * real[i] + imag[i] * imag[i]) / n;
+ }
+ return power;
+ }
+ /**
+ * Create Mel filterbank matrix.
+ *
+ * Produces nMels triangular filters spanning the frequency range
+ * from 0 to sampleRate/2 on the Mel scale.
+ */
+ createMelFilterbank() {
+ let n = 1;
+ while (n < this.nFft) n *= 2;
+ const fftBins = n / 2 + 1;
+ const fMin = 0;
+ const fMax = this.sampleRate / 2;
+ const melMin = this.hzToMel(fMin);
+ const melMax = this.hzToMel(fMax);
+ const melPoints = new Float32Array(this.nMels + 2);
+ for (let i = 0; i < this.nMels + 2; i++) {
+ melPoints[i] = melMin + i * (melMax - melMin) / (this.nMels + 1);
+ }
+ const binIndices = new Float32Array(this.nMels + 2);
+ for (let i = 0; i < this.nMels + 2; i++) {
+ const hz = this.melToHz(melPoints[i]);
+ binIndices[i] = Math.floor((n + 1) * hz / this.sampleRate);
+ }
+ const filters = [];
+ for (let m = 0; m < this.nMels; m++) {
+ const filter = new Float32Array(fftBins);
+ const left = binIndices[m];
+ const center = binIndices[m + 1];
+ const right = binIndices[m + 2];
+ for (let k = 0; k < fftBins; k++) {
+ if (k >= left && k <= center && center > left) {
+ filter[k] = (k - left) / (center - left);
+ } else if (k > center && k <= right && right > center) {
+ filter[k] = (right - k) / (right - center);
+ }
+ }
+ filters.push(filter);
+ }
+ return filters;
+ }
+ /**
+ * Create DCT-II matrix for MFCC computation.
+ */
+ createDCTMatrix() {
+ const matrix = [];
+ const scale = Math.sqrt(2 / this.nMels);
+ for (let i = 0; i < this.nMfcc; i++) {
+ const row = new Float32Array(this.nMels);
+ for (let j = 0; j < this.nMels; j++) {
+ row[j] = scale * Math.cos(Math.PI * i * (j + 0.5) / this.nMels);
+ }
+ matrix.push(row);
+ }
+ return matrix;
+ }
+ hzToMel(hz) {
+ return 2595 * Math.log10(1 + hz / 700);
  }
+ melToHz(mel) {
+ return 700 * (Math.pow(10, mel / 2595) - 1);
+ }
+ /**
+ * Reset internal state (energy buffer, pause accumulator).
+ */
+ reset() {
+ this.silenceAccumulator = 0;
+ this.energyBuffer.fill(0);
+ this.energyBufferIdx = 0;
+ this.energyBufferFull = false;
+ }
+ };
+
+ // src/types.ts
+ var MODEL_CDN_URL = "https://pub-46a5feb0029246bcbc93fab6162cff94.r2.dev/v0.0.2/utterance-v1.onnx";
+ var DEFAULT_OPTIONS = {
+ sensitivity: 0.5,
+ pauseTolerance: 1500,
+ modelPath: "cdn",
+ sampleRate: 16e3
  };

  // src/model/energy-vad.ts
@@ -151,45 +393,187 @@ var EnergyVAD = class {
  };

  // src/model/onnx.ts
+ var LABELS = [
+ "speaking",
+ "thinking_pause",
+ "turn_complete",
+ "interrupt_intent"
+ ];
+ var FEATURE_DIM = 17;
+ var CONTEXT_FRAMES = 100;
+ var INFERENCE_INTERVAL = 10;
  var ONNXModel = class {
  session = null;
+ ort = null;
  fallback;
- constructor(sensitivity = 0.5) {
+ useWebGpu;
+ /** Circular buffer of feature vectors for the context window. */
+ frameBuffer;
+ bufferIdx = 0;
+ framesBuffered = 0;
+ framesSinceInference = 0;
+ /** Cache the last inference result for frames between batches. */
+ lastResult = null;
+ constructor(sensitivity = 0.5, useWebGpu = false) {
  this.fallback = new EnergyVAD(sensitivity);
+ this.useWebGpu = useWebGpu;
+ this.frameBuffer = new Float32Array(CONTEXT_FRAMES * FEATURE_DIM);
  }
  /**
- * Load the ONNX model from a given path or URL.
+ * Load the ONNX model from CDN, bundled path, or custom URL.
+ *
+ * Dynamically imports onnxruntime-web to avoid bundling it
+ * when the model isn't used (tree-shaking friendly).
  *
- * TODO:
- * 1. Import onnxruntime-web InferenceSession
- * 2. Load model bytes
- * 3. Create session with appropriate execution providers
+ * @param path - "cdn" (default, loads from Cloudflare R2), "bundled" (from npm package), or a custom URL.
  */
- async load(_path) {
- this.session = null;
+ async load(path) {
+ try {
+ const ort = await import("onnxruntime-web");
+ this.ort = ort;
+ let modelSource = path;
+ if (path === "cdn") {
+ try {
+ const response = await fetch(MODEL_CDN_URL);
+ if (response.ok) {
+ modelSource = await response.arrayBuffer();
+ } else {
+ throw new Error(`Failed to fetch CDN model: ${response.status}`);
+ }
+ } catch {
+ console.warn("[utterance] CDN model unavailable, falling back to EnergyVAD");
+ this.session = null;
+ return;
+ }
+ } else if (path === "bundled") {
+ try {
+ const getUrl = new Function("p", "b", "return new URL(p, b).href");
+ const href = getUrl("../../models/utterance-v1.onnx", import.meta.url);
+ const response = await fetch(href);
+ if (response.ok) {
+ modelSource = await response.arrayBuffer();
+ } else {
+ throw new Error(`Failed to fetch bundled model: ${response.status}`);
+ }
+ } catch {
+ console.warn("[utterance] Bundled model not found, falling back to EnergyVAD");
+ this.session = null;
+ return;
+ }
+ }
+ const providers = this.useWebGpu ? ["webgpu", "wasm"] : ["wasm"];
+ this.session = await ort.InferenceSession.create(modelSource, {
+ executionProviders: providers
+ });
+ } catch (err) {
+ console.warn("[utterance] Failed to load ONNX model, falling back to EnergyVAD:", err);
+ this.session = null;
+ }
  }
  /**
- * Run inference on a set of extracted features.
+ * Run inference on extracted features.
  *
- * TODO:
- * 1. Build input tensor from AudioFeatures
- * 2. Run session.run()
- * 3. Parse output into ClassificationResult
+ * Buffers frames into a sliding window and runs the ONNX model
+ * every 100ms (10 frames). Between inference runs, returns the
+ * cached result. Falls back to EnergyVAD when no model is loaded.
  */
  async predict(features) {
- if (!this.session) {
+ if (!this.session || !this.ort) {
  return this.fallback.classify(features);
  }
- return this.fallback.classify(features);
+ this.addFrame(features);
+ this.framesSinceInference++;
+ if (this.framesSinceInference >= INFERENCE_INTERVAL && this.framesBuffered >= CONTEXT_FRAMES) {
+ this.framesSinceInference = 0;
+ try {
+ this.lastResult = await this.runInference();
+ } catch (err) {
+ console.warn("[utterance] ONNX inference failed, using EnergyVAD:", err);
+ return this.fallback.classify(features);
+ }
+ }
+ return this.lastResult ?? this.fallback.classify(features);
  }
  /**
  * Release model resources.
  */
  dispose() {
+ if (this.session) {
+ this.session.release().catch(() => {
+ });
+ }
  this.session = null;
+ this.ort = null;
  this.fallback.reset();
+ this.resetBuffer();
+ }
+ /**
+ * Add a feature frame to the circular buffer.
+ */
+ addFrame(features) {
+ const offset = this.bufferIdx * FEATURE_DIM;
+ this.frameBuffer.set(features.mfcc, offset);
+ this.frameBuffer[offset + 13] = features.energy;
+ this.frameBuffer[offset + 14] = features.pitch;
+ this.frameBuffer[offset + 15] = features.speechRate;
+ this.frameBuffer[offset + 16] = features.pauseDuration;
+ this.bufferIdx = (this.bufferIdx + 1) % CONTEXT_FRAMES;
+ if (this.framesBuffered < CONTEXT_FRAMES) {
+ this.framesBuffered++;
+ }
+ }
+ /**
+ * Build the input tensor from the circular buffer and run ONNX inference.
+ */
+ async runInference() {
+ const ort = this.ort;
+ const session = this.session;
+ const input = new Float32Array(CONTEXT_FRAMES * FEATURE_DIM);
+ for (let i = 0; i < CONTEXT_FRAMES; i++) {
+ const srcIdx = (this.bufferIdx - CONTEXT_FRAMES + i + CONTEXT_FRAMES) % CONTEXT_FRAMES * FEATURE_DIM;
+ const dstIdx = i * FEATURE_DIM;
+ input.set(this.frameBuffer.subarray(srcIdx, srcIdx + FEATURE_DIM), dstIdx);
+ }
+ const tensor = new ort.Tensor("float32", input, [1, CONTEXT_FRAMES, FEATURE_DIM]);
+ const results = await session.run({ input: tensor });
+ const output = results.output;
+ const logits = output.data;
+ const probs = softmax(logits);
+ let bestIdx = 0;
+ let bestProb = probs[0];
+ for (let i = 1; i < probs.length; i++) {
+ if (probs[i] > bestProb) {
+ bestProb = probs[i];
+ bestIdx = i;
+ }
+ }
+ return {
+ label: LABELS[bestIdx],
+ confidence: bestProb,
+ timestamp: Date.now()
+ };
+ }
+ resetBuffer() {
+ this.frameBuffer.fill(0);
+ this.bufferIdx = 0;
+ this.framesBuffered = 0;
+ this.framesSinceInference = 0;
+ this.lastResult = null;
  }
  };
+ function softmax(logits) {
+ const max = logits.reduce((a, b) => Math.max(a, b), -Infinity);
+ const exps = new Float32Array(logits.length);
+ let sum = 0;
+ for (let i = 0; i < logits.length; i++) {
+ exps[i] = Math.exp(logits[i] - max);
+ sum += exps[i];
+ }
+ for (let i = 0; i < exps.length; i++) {
+ exps[i] /= sum;
+ }
+ return exps;
+ }

  // src/detector/turn-detector.ts
  var TurnDetector = class {
@@ -280,14 +664,6 @@ var TurnDetector = class {
  }
  };

- // src/types.ts
- var DEFAULT_OPTIONS = {
- sensitivity: 0.5,
- pauseTolerance: 1500,
- modelPath: "bundled",
- sampleRate: 16e3
- };
-
  // src/utterance.ts
  var Utterance = class {
  options;
Binary file
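(The binary file above is presumably models/utterance-v1.onnx, the model fetched by the "bundled" load path; binary contents are not rendered in this diff.)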
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@utterance/core",
- "version": "0.0.1",
+ "version": "0.0.3",
  "description": "Client-side semantic endpointing. Know when they're done talking.",
  "type": "module",
  "main": "dist/index.cjs",
@@ -76,6 +76,7 @@
  "dependencies": {
  "@next/third-parties": "^16.1.6",
  "@react-three/fiber": "^9.5.0",
+ "@utterance/core": "^0.0.2",
  "class-variance-authority": "^0.7.1",
  "clsx": "^2.1.1",
  "fumadocs-core": "^16.6.3",