@utterance/core 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +436 -49
- package/dist/index.d.cts +1 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.js +425 -49
- package/models/utterance-v1.onnx +0 -0
- package/package.json +2 -1
package/dist/index.cjs
CHANGED
@@ -1,7 +1,9 @@
 "use strict";
+var __create = Object.create;
 var __defProp = Object.defineProperty;
 var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
 var __getOwnPropNames = Object.getOwnPropertyNames;
+var __getProtoOf = Object.getPrototypeOf;
 var __hasOwnProp = Object.prototype.hasOwnProperty;
 var __export = (target, all) => {
   for (var name in all)
@@ -15,6 +17,14 @@ var __copyProps = (to, from, except, desc) => {
   }
   return to;
 };
+var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
+  // If the importer is in node compatibility mode or this is not an ESM
+  // file that has been converted to a CommonJS file using a Babel-
+  // compatible transform (i.e. "__esModule" has not been set), then set
+  // "default" to the CommonJS "module.exports" for node compatibility.
+  isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
+  mod
+));
 var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
 
 // src/index.ts
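The new `__create`/`__getProtoOf`/`__toESM` helpers are the interop shims esbuild emits once a bundle contains a dynamic `import()`, here the `import("onnxruntime-web")` added to `ONNXModel.load()` further down. Roughly, `__toESM` produces the view sketched below (illustration only, not code from the package):

```ts
// Illustration of what the __toESM helper above produces for a CJS module:
// a plain CJS module has no __esModule marker, so its exports object is
// re-exposed under `default`, making `mod.default` behave like an ESM
// default import regardless of which module format was resolved.
const cjsExports = { InferenceSession: {}, Tensor: {} }; // stand-in for module.exports
const esmView = Object.assign(Object.create(null), cjsExports, { default: cjsExports });
console.log(esmView.default === cjsExports); // true
```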
@@ -70,35 +80,82 @@ var AudioCapture = class {
 // src/features/extractor.ts
 var FeatureExtractor = class {
   sampleRate;
+  nFft;
+  nMels;
+  nMfcc;
+  // Pre-computed DSP tables
+  hammingWindow;
+  melFilterbank;
+  dctMatrix;
+  // State for pause duration tracking
+  silenceAccumulator = 0;
+  silenceThreshold = 0.01;
+  frameDurationSec;
+  // State for speech rate (rolling energy buffer)
+  energyBuffer;
+  energyBufferIdx = 0;
+  energyBufferFull = false;
   constructor(sampleRate = 16e3) {
     this.sampleRate = sampleRate;
+    this.nFft = Math.floor(sampleRate * 0.025);
+    this.nMels = 40;
+    this.nMfcc = 13;
+    this.frameDurationSec = 0.01;
+    this.hammingWindow = new Float32Array(this.nFft);
+    for (let i = 0; i < this.nFft; i++) {
+      this.hammingWindow[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (this.nFft - 1));
+    }
+    this.melFilterbank = this.createMelFilterbank();
+    this.dctMatrix = this.createDCTMatrix();
+    const framesPerSecond = Math.floor(1 / this.frameDurationSec);
+    this.energyBuffer = new Float32Array(framesPerSecond);
   }
   /**
    * Extract all features from a single audio frame.
    */
   extract(frame) {
-
-
-
-
-
-
-    // tracked by the detector over time
-  };
+    const energy = this.computeEnergy(frame);
+    const mfcc = this.computeMFCC(frame);
+    const pitch = this.estimatePitch(frame);
+    const speechRate = this.estimateSpeechRate(energy);
+    const pauseDuration = this.updatePauseDuration(energy);
+    return { mfcc, energy, pitch, speechRate, pauseDuration };
   }
   /**
    * Compute Mel-Frequency Cepstral Coefficients.
    *
-   *
-   * 1. Pre-emphasis filter
-   * 2. Windowing (Hamming)
-   * 3. FFT
-   * 4. Mel filterbank
-   * 5. Log energy
-   * 6. DCT
+   * Pipeline: Pre-emphasis → Hamming window → FFT → Mel filterbank → log → DCT
    */
-  computeMFCC(
-
+  computeMFCC(frame) {
+    const preEmph = new Float32Array(this.nFft);
+    const len = Math.min(frame.length, this.nFft);
+    preEmph[0] = frame[0];
+    for (let i = 1; i < len; i++) {
+      preEmph[i] = frame[i] - 0.97 * frame[i - 1];
+    }
+    for (let i = 0; i < this.nFft; i++) {
+      preEmph[i] *= this.hammingWindow[i];
+    }
+    const spectrum = this.fftMagnitude(preEmph);
+    const melEnergies = new Float32Array(this.nMels);
+    for (let m = 0; m < this.nMels; m++) {
+      let sum = 0;
+      const filter = this.melFilterbank[m];
+      for (let k = 0; k < filter.length; k++) {
+        sum += spectrum[k] * filter[k];
+      }
+      melEnergies[m] = Math.log(Math.max(sum, 1e-10));
+    }
+    const mfcc = new Float32Array(this.nMfcc);
+    for (let i = 0; i < this.nMfcc; i++) {
+      let sum = 0;
+      const dctRow = this.dctMatrix[i];
+      for (let j = 0; j < this.nMels; j++) {
+        sum += dctRow[j] * melEnergies[j];
+      }
+      mfcc[i] = sum;
+    }
+    return mfcc;
   }
   /**
    * Compute RMS energy of the frame.
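The constructor pins the analysis geometry: 25 ms FFT windows (`nFft = 0.025 × sampleRate`, i.e. 400 samples at 16 kHz) hopped every 10 ms (`frameDurationSec = 0.01`). A minimal sketch of driving the extractor frame-by-frame; note that `FeatureExtractor` being exported from the package root is an assumption, this diff only shows the bundle internals:

```ts
// Sketch only: assumes FeatureExtractor is exported from @utterance/core,
// which the diff does not confirm. Frame geometry follows the constructor:
// 400-sample (25 ms) windows at 16 kHz, hopped every 160 samples (10 ms).
import { FeatureExtractor } from "@utterance/core"; // hypothetical export

const sampleRate = 16000;
const extractor = new FeatureExtractor(sampleRate);
const hop = Math.floor(sampleRate * 0.01);  // 160 samples
const win = Math.floor(sampleRate * 0.025); // 400 samples

function processChunk(pcm: Float32Array) {
  for (let start = 0; start + win <= pcm.length; start += hop) {
    const { mfcc, energy, pitch, speechRate, pauseDuration } =
      extractor.extract(pcm.subarray(start, start + win));
    // 13 MFCCs + 4 scalars = the 17-dim vector the model consumes below
    console.log(mfcc.length, energy.toFixed(3), pitch.toFixed(1), speechRate, pauseDuration);
  }
}
```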
@@ -111,22 +168,217 @@ var FeatureExtractor = class {
     return Math.sqrt(sum / frame.length);
   }
   /**
-   * Estimate fundamental frequency (pitch) using autocorrelation.
+   * Estimate fundamental frequency (pitch) using simplified autocorrelation.
+   *
+   * Looks for the dominant periodicity in the signal within the
+   * speech frequency range (50-500 Hz). Returns 0 for unvoiced frames.
+   */
+  estimatePitch(frame) {
+    const minPeriod = Math.floor(this.sampleRate / 500);
+    const maxPeriod = Math.floor(this.sampleRate / 50);
+    const len = Math.min(frame.length, this.nFft);
+    if (len < maxPeriod * 2) return 0;
+    let bestCorr = 0;
+    let bestLag = 0;
+    let energy = 0;
+    for (let i = 0; i < len; i++) {
+      energy += frame[i] * frame[i];
+    }
+    if (energy < 1e-10) return 0;
+    for (let lag = minPeriod; lag <= maxPeriod && lag < len; lag++) {
+      let corr = 0;
+      let energyLag = 0;
+      const limit = len - lag;
+      for (let i = 0; i < limit; i++) {
+        corr += frame[i] * frame[i + lag];
+        energyLag += frame[i + lag] * frame[i + lag];
+      }
+      const norm = Math.sqrt(energy * energyLag);
+      if (norm > 0) {
+        corr /= norm;
+      }
+      if (corr > bestCorr) {
+        bestCorr = corr;
+        bestLag = lag;
+      }
+    }
+    if (bestCorr < 0.3 || bestLag === 0) return 0;
+    return this.sampleRate / bestLag;
+  }
+  /**
+   * Estimate speech rate from rolling energy envelope.
    *
-   *
+   * Counts energy peaks in a 1-second sliding window.
+   * Returns a normalized value (~0-1 range, where 0.3-0.7 is typical speech).
    */
-
-
-
+  estimateSpeechRate(energy) {
+    this.energyBuffer[this.energyBufferIdx] = energy;
+    this.energyBufferIdx = (this.energyBufferIdx + 1) % this.energyBuffer.length;
+    if (this.energyBufferIdx === 0) this.energyBufferFull = true;
+    const len = this.energyBufferFull ? this.energyBuffer.length : this.energyBufferIdx;
+    if (len < 5) return 0;
+    let peaks = 0;
+    const threshold = this.silenceThreshold * 0.5;
+    for (let i = 2; i < len - 2; i++) {
+      const idx = (this.energyBufferIdx - len + i + this.energyBuffer.length) % this.energyBuffer.length;
+      const prev = this.energyBuffer[(idx - 1 + this.energyBuffer.length) % this.energyBuffer.length];
+      const curr = this.energyBuffer[idx];
+      const next = this.energyBuffer[(idx + 1) % this.energyBuffer.length];
+      if (curr > prev && curr > next && curr > threshold) {
+        peaks++;
+      }
+    }
+    const windowDuration = len * this.frameDurationSec;
+    const rate = windowDuration > 0 ? peaks / windowDuration : 0;
+    return rate / 10;
   }
   /**
-   *
+   * Track accumulated pause duration.
    *
-   *
+   * Returns pause duration in seconds, capped at 5s and normalized to [0, 1].
    */
-
-
+  updatePauseDuration(energy) {
+    if (energy < this.silenceThreshold) {
+      this.silenceAccumulator += this.frameDurationSec;
+    } else {
+      this.silenceAccumulator = 0;
+    }
+    return Math.min(this.silenceAccumulator, 5) / 5;
+  }
+  /**
+   * Compute FFT magnitude spectrum (power spectrum).
+   *
+   * Uses a radix-2 DIT FFT implementation. For frames smaller than
+   * nFft, zero-pads to the next power of 2.
+   */
+  fftMagnitude(signal) {
+    let n = 1;
+    while (n < signal.length) n *= 2;
+    const real = new Float32Array(n);
+    const imag = new Float32Array(n);
+    real.set(signal);
+    let j = 0;
+    for (let i = 0; i < n; i++) {
+      if (i < j) {
+        [real[i], real[j]] = [real[j], real[i]];
+        [imag[i], imag[j]] = [imag[j], imag[i]];
+      }
+      let m = n >> 1;
+      while (m >= 1 && j >= m) {
+        j -= m;
+        m >>= 1;
+      }
+      j += m;
+    }
+    for (let size = 2; size <= n; size *= 2) {
+      const halfSize = size / 2;
+      const angle = -2 * Math.PI / size;
+      const wReal = Math.cos(angle);
+      const wImag = Math.sin(angle);
+      for (let i = 0; i < n; i += size) {
+        let curReal = 1;
+        let curImag = 0;
+        for (let k = 0; k < halfSize; k++) {
+          const evenIdx = i + k;
+          const oddIdx = i + k + halfSize;
+          const tReal = curReal * real[oddIdx] - curImag * imag[oddIdx];
+          const tImag = curReal * imag[oddIdx] + curImag * real[oddIdx];
+          real[oddIdx] = real[evenIdx] - tReal;
+          imag[oddIdx] = imag[evenIdx] - tImag;
+          real[evenIdx] += tReal;
+          imag[evenIdx] += tImag;
+          const newCurReal = curReal * wReal - curImag * wImag;
+          curImag = curReal * wImag + curImag * wReal;
+          curReal = newCurReal;
+        }
+      }
+    }
+    const numBins = n / 2 + 1;
+    const power = new Float32Array(numBins);
+    for (let i = 0; i < numBins; i++) {
+      power[i] = (real[i] * real[i] + imag[i] * imag[i]) / n;
+    }
+    return power;
+  }
+  /**
+   * Create Mel filterbank matrix.
+   *
+   * Produces nMels triangular filters spanning the frequency range
+   * from 0 to sampleRate/2 on the Mel scale.
+   */
+  createMelFilterbank() {
+    let n = 1;
+    while (n < this.nFft) n *= 2;
+    const fftBins = n / 2 + 1;
+    const fMin = 0;
+    const fMax = this.sampleRate / 2;
+    const melMin = this.hzToMel(fMin);
+    const melMax = this.hzToMel(fMax);
+    const melPoints = new Float32Array(this.nMels + 2);
+    for (let i = 0; i < this.nMels + 2; i++) {
+      melPoints[i] = melMin + i * (melMax - melMin) / (this.nMels + 1);
+    }
+    const binIndices = new Float32Array(this.nMels + 2);
+    for (let i = 0; i < this.nMels + 2; i++) {
+      const hz = this.melToHz(melPoints[i]);
+      binIndices[i] = Math.floor((n + 1) * hz / this.sampleRate);
+    }
+    const filters = [];
+    for (let m = 0; m < this.nMels; m++) {
+      const filter = new Float32Array(fftBins);
+      const left = binIndices[m];
+      const center = binIndices[m + 1];
+      const right = binIndices[m + 2];
+      for (let k = 0; k < fftBins; k++) {
+        if (k >= left && k <= center && center > left) {
+          filter[k] = (k - left) / (center - left);
+        } else if (k > center && k <= right && right > center) {
+          filter[k] = (right - k) / (right - center);
+        }
+      }
+      filters.push(filter);
+    }
+    return filters;
+  }
+  /**
+   * Create DCT-II matrix for MFCC computation.
+   */
+  createDCTMatrix() {
+    const matrix = [];
+    const scale = Math.sqrt(2 / this.nMels);
+    for (let i = 0; i < this.nMfcc; i++) {
+      const row = new Float32Array(this.nMels);
+      for (let j = 0; j < this.nMels; j++) {
+        row[j] = scale * Math.cos(Math.PI * i * (j + 0.5) / this.nMels);
+      }
+      matrix.push(row);
+    }
+    return matrix;
+  }
+  hzToMel(hz) {
+    return 2595 * Math.log10(1 + hz / 700);
   }
+  melToHz(mel) {
+    return 700 * (Math.pow(10, mel / 2595) - 1);
+  }
+  /**
+   * Reset internal state (energy buffer, pause accumulator).
+   */
+  reset() {
+    this.silenceAccumulator = 0;
+    this.energyBuffer.fill(0);
+    this.energyBufferIdx = 0;
+    this.energyBufferFull = false;
+  }
+};
+
+// src/types.ts
+var MODEL_CDN_URL = "https://pub-46a5feb0029246bcbc93fab6162cff94.r2.dev/v0.0.2/utterance-v1.onnx";
+var DEFAULT_OPTIONS = {
+  sensitivity: 0.5,
+  pauseTolerance: 1500,
+  modelPath: "cdn",
+  sampleRate: 16e3
 };
 
 // src/model/energy-vad.ts
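The constants in this hunk work out to concrete numbers at the default 16 kHz rate. A quick sketch of that arithmetic, mirroring the code above (illustration only):

```ts
// Numbers implied by the code above, at the default 16 kHz sample rate.
const sampleRate = 16000;

// Pitch search: 50-500 Hz maps to autocorrelation lags of 32-320 samples.
const minPeriod = Math.floor(sampleRate / 500); // 32
const maxPeriod = Math.floor(sampleRate / 50);  // 320

// Mel axis: 0 Hz -> 0 mel, 8 kHz (Nyquist) -> ~2840 mel, split into
// nMels + 2 = 42 evenly spaced points for the triangular filters.
const hzToMel = (hz: number) => 2595 * Math.log10(1 + hz / 700);
console.log(hzToMel(8000).toFixed(0)); // ~2840

// Pause feature: silence accumulates in 10 ms steps, capped at 5 s,
// so e.g. 1.2 s of continuous silence yields 1.2 / 5 = 0.24.
console.log(Math.min(1.2, 5) / 5); // 0.24
```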
@@ -177,45 +429,188 @@ var EnergyVAD = class {
 };
 
 // src/model/onnx.ts
+var import_meta = {};
+var LABELS = [
+  "speaking",
+  "thinking_pause",
+  "turn_complete",
+  "interrupt_intent"
+];
+var FEATURE_DIM = 17;
+var CONTEXT_FRAMES = 100;
+var INFERENCE_INTERVAL = 10;
 var ONNXModel = class {
   session = null;
+  ort = null;
   fallback;
-
+  useWebGpu;
+  /** Circular buffer of feature vectors for the context window. */
+  frameBuffer;
+  bufferIdx = 0;
+  framesBuffered = 0;
+  framesSinceInference = 0;
+  /** Cache the last inference result for frames between batches. */
+  lastResult = null;
+  constructor(sensitivity = 0.5, useWebGpu = false) {
     this.fallback = new EnergyVAD(sensitivity);
+    this.useWebGpu = useWebGpu;
+    this.frameBuffer = new Float32Array(CONTEXT_FRAMES * FEATURE_DIM);
   }
   /**
-   * Load the ONNX model from
+   * Load the ONNX model from CDN, bundled path, or custom URL.
+   *
+   * Dynamically imports onnxruntime-web to avoid bundling it
+   * when the model isn't used (tree-shaking friendly).
    *
-   *
-   * 1. Import onnxruntime-web InferenceSession
-   * 2. Load model bytes
-   * 3. Create session with appropriate execution providers
+   * @param path - "cdn" (default, loads from Cloudflare R2), "bundled" (from npm package), or a custom URL.
    */
-  async load(
-
+  async load(path) {
+    try {
+      const ort = await import("onnxruntime-web");
+      this.ort = ort;
+      let modelSource = path;
+      if (path === "cdn") {
+        try {
+          const response = await fetch(MODEL_CDN_URL);
+          if (response.ok) {
+            modelSource = await response.arrayBuffer();
+          } else {
+            throw new Error(`Failed to fetch CDN model: ${response.status}`);
+          }
+        } catch {
+          console.warn("[utterance] CDN model unavailable, falling back to EnergyVAD");
+          this.session = null;
+          return;
+        }
+      } else if (path === "bundled") {
+        try {
+          const getUrl = new Function("p", "b", "return new URL(p, b).href");
+          const href = getUrl("../../models/utterance-v1.onnx", import_meta.url);
+          const response = await fetch(href);
+          if (response.ok) {
+            modelSource = await response.arrayBuffer();
+          } else {
+            throw new Error(`Failed to fetch bundled model: ${response.status}`);
+          }
+        } catch {
+          console.warn("[utterance] Bundled model not found, falling back to EnergyVAD");
+          this.session = null;
+          return;
+        }
+      }
+      const providers = this.useWebGpu ? ["webgpu", "wasm"] : ["wasm"];
+      this.session = await ort.InferenceSession.create(modelSource, {
+        executionProviders: providers
+      });
+    } catch (err) {
+      console.warn("[utterance] Failed to load ONNX model, falling back to EnergyVAD:", err);
+      this.session = null;
+    }
   }
   /**
-   * Run inference on
+   * Run inference on extracted features.
    *
-   *
-   *
-   *
-   * 3. Parse output into ClassificationResult
+   * Buffers frames into a sliding window and runs the ONNX model
+   * every 100ms (10 frames). Between inference runs, returns the
+   * cached result. Falls back to EnergyVAD when no model is loaded.
    */
   async predict(features) {
-    if (!this.session) {
+    if (!this.session || !this.ort) {
       return this.fallback.classify(features);
     }
-
+    this.addFrame(features);
+    this.framesSinceInference++;
+    if (this.framesSinceInference >= INFERENCE_INTERVAL && this.framesBuffered >= CONTEXT_FRAMES) {
+      this.framesSinceInference = 0;
+      try {
+        this.lastResult = await this.runInference();
+      } catch (err) {
+        console.warn("[utterance] ONNX inference failed, using EnergyVAD:", err);
+        return this.fallback.classify(features);
+      }
+    }
+    return this.lastResult ?? this.fallback.classify(features);
   }
   /**
    * Release model resources.
    */
   dispose() {
+    if (this.session) {
+      this.session.release().catch(() => {
+      });
+    }
     this.session = null;
+    this.ort = null;
     this.fallback.reset();
+    this.resetBuffer();
+  }
+  /**
+   * Add a feature frame to the circular buffer.
+   */
+  addFrame(features) {
+    const offset = this.bufferIdx * FEATURE_DIM;
+    this.frameBuffer.set(features.mfcc, offset);
+    this.frameBuffer[offset + 13] = features.energy;
+    this.frameBuffer[offset + 14] = features.pitch;
+    this.frameBuffer[offset + 15] = features.speechRate;
+    this.frameBuffer[offset + 16] = features.pauseDuration;
+    this.bufferIdx = (this.bufferIdx + 1) % CONTEXT_FRAMES;
+    if (this.framesBuffered < CONTEXT_FRAMES) {
+      this.framesBuffered++;
+    }
+  }
+  /**
+   * Build the input tensor from the circular buffer and run ONNX inference.
+   */
+  async runInference() {
+    const ort = this.ort;
+    const session = this.session;
+    const input = new Float32Array(CONTEXT_FRAMES * FEATURE_DIM);
+    for (let i = 0; i < CONTEXT_FRAMES; i++) {
+      const srcIdx = (this.bufferIdx - CONTEXT_FRAMES + i + CONTEXT_FRAMES) % CONTEXT_FRAMES * FEATURE_DIM;
+      const dstIdx = i * FEATURE_DIM;
+      input.set(this.frameBuffer.subarray(srcIdx, srcIdx + FEATURE_DIM), dstIdx);
+    }
+    const tensor = new ort.Tensor("float32", input, [1, CONTEXT_FRAMES, FEATURE_DIM]);
+    const results = await session.run({ input: tensor });
+    const output = results.output;
+    const logits = output.data;
+    const probs = softmax(logits);
+    let bestIdx = 0;
+    let bestProb = probs[0];
+    for (let i = 1; i < probs.length; i++) {
+      if (probs[i] > bestProb) {
+        bestProb = probs[i];
+        bestIdx = i;
+      }
+    }
+    return {
+      label: LABELS[bestIdx],
+      confidence: bestProb,
+      timestamp: Date.now()
+    };
+  }
+  resetBuffer() {
+    this.frameBuffer.fill(0);
+    this.bufferIdx = 0;
+    this.framesBuffered = 0;
+    this.framesSinceInference = 0;
+    this.lastResult = null;
   }
 };
+function softmax(logits) {
+  const max = logits.reduce((a, b) => Math.max(a, b), -Infinity);
+  const exps = new Float32Array(logits.length);
+  let sum = 0;
+  for (let i = 0; i < logits.length; i++) {
+    exps[i] = Math.exp(logits[i] - max);
+    sum += exps[i];
+  }
+  for (let i = 0; i < exps.length; i++) {
+    exps[i] /= sum;
+  }
+  return exps;
+}
 
 // src/detector/turn-detector.ts
 var TurnDetector = class {
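The model consumes a [1, 100, 17] tensor: 100 frames of 13 MFCCs plus energy, pitch, speechRate, and pauseDuration, covering one second of context at the 10 ms hop, re-scored every 10 frames (100 ms). A standalone sketch of the chronological unroll that the `runInference` loop performs (not the package's API):

```ts
// Standalone sketch of the ring-buffer unroll used in runInference above.
// The oldest frame sits at bufferIdx (the slot about to be overwritten),
// so reading starts there and wraps around the buffer.
const CONTEXT_FRAMES = 100;
const FEATURE_DIM = 17;

function unroll(frameBuffer: Float32Array, bufferIdx: number): Float32Array {
  const input = new Float32Array(CONTEXT_FRAMES * FEATURE_DIM);
  for (let i = 0; i < CONTEXT_FRAMES; i++) {
    // (bufferIdx - 100 + i + 100) % 100 simplifies to (bufferIdx + i) % 100
    const src = ((bufferIdx + i) % CONTEXT_FRAMES) * FEATURE_DIM;
    input.set(frameBuffer.subarray(src, src + FEATURE_DIM), i * FEATURE_DIM);
  }
  return input; // frames now ordered oldest -> newest
}
```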
@@ -306,14 +701,6 @@ var TurnDetector = class {
   }
 };
 
-// src/types.ts
-var DEFAULT_OPTIONS = {
-  sensitivity: 0.5,
-  pauseTolerance: 1500,
-  modelPath: "bundled",
-  sampleRate: 16e3
-};
-
 // src/utterance.ts
 var Utterance = class {
   options;
package/dist/index.d.cts
CHANGED
@@ -9,7 +9,7 @@ interface UtteranceOptions {
   sensitivity?: number;
   /** Max thinking pause duration (ms) before triggering turnEnd. Default: 1500 */
   pauseTolerance?: number;
-  /**
+  /** Model source: "cdn" (default), "bundled", or a custom URL. */
   modelPath?: string;
   /** Audio sample rate in Hz. Default: 16000 */
   sampleRate?: number;
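With the newly documented `modelPath` values, an options object looks like the sketch below. The `Utterance` constructor signature itself is not shown anywhere in this diff, so treat the call as assumed:

```ts
// Sketch under assumptions: the diff documents UtteranceOptions but not the
// Utterance constructor, so `new Utterance(options)` is inferred, not confirmed.
import { Utterance } from "@utterance/core";

const utterance = new Utterance({
  sensitivity: 0.5,      // default
  pauseTolerance: 1500,  // ms before a thinking pause triggers turnEnd
  modelPath: "cdn",      // new default; "bundled" or a custom URL also accepted
  sampleRate: 16000,
});
```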
package/dist/index.d.ts
CHANGED
@@ -9,7 +9,7 @@ interface UtteranceOptions {
   sensitivity?: number;
   /** Max thinking pause duration (ms) before triggering turnEnd. Default: 1500 */
   pauseTolerance?: number;
-  /**
+  /** Model source: "cdn" (default), "bundled", or a custom URL. */
   modelPath?: string;
   /** Audio sample rate in Hz. Default: 16000 */
   sampleRate?: number;
package/dist/index.js
CHANGED
@@ -44,35 +44,82 @@ var AudioCapture = class {
 // src/features/extractor.ts
 var FeatureExtractor = class {
   sampleRate;
+  nFft;
+  nMels;
+  nMfcc;
+  // Pre-computed DSP tables
+  hammingWindow;
+  melFilterbank;
+  dctMatrix;
+  // State for pause duration tracking
+  silenceAccumulator = 0;
+  silenceThreshold = 0.01;
+  frameDurationSec;
+  // State for speech rate (rolling energy buffer)
+  energyBuffer;
+  energyBufferIdx = 0;
+  energyBufferFull = false;
   constructor(sampleRate = 16e3) {
     this.sampleRate = sampleRate;
+    this.nFft = Math.floor(sampleRate * 0.025);
+    this.nMels = 40;
+    this.nMfcc = 13;
+    this.frameDurationSec = 0.01;
+    this.hammingWindow = new Float32Array(this.nFft);
+    for (let i = 0; i < this.nFft; i++) {
+      this.hammingWindow[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (this.nFft - 1));
+    }
+    this.melFilterbank = this.createMelFilterbank();
+    this.dctMatrix = this.createDCTMatrix();
+    const framesPerSecond = Math.floor(1 / this.frameDurationSec);
+    this.energyBuffer = new Float32Array(framesPerSecond);
   }
   /**
    * Extract all features from a single audio frame.
    */
   extract(frame) {
-
-
-
-
-
-
-    // tracked by the detector over time
-  };
+    const energy = this.computeEnergy(frame);
+    const mfcc = this.computeMFCC(frame);
+    const pitch = this.estimatePitch(frame);
+    const speechRate = this.estimateSpeechRate(energy);
+    const pauseDuration = this.updatePauseDuration(energy);
+    return { mfcc, energy, pitch, speechRate, pauseDuration };
   }
   /**
    * Compute Mel-Frequency Cepstral Coefficients.
    *
-   *
-   * 1. Pre-emphasis filter
-   * 2. Windowing (Hamming)
-   * 3. FFT
-   * 4. Mel filterbank
-   * 5. Log energy
-   * 6. DCT
+   * Pipeline: Pre-emphasis → Hamming window → FFT → Mel filterbank → log → DCT
    */
-  computeMFCC(
-
+  computeMFCC(frame) {
+    const preEmph = new Float32Array(this.nFft);
+    const len = Math.min(frame.length, this.nFft);
+    preEmph[0] = frame[0];
+    for (let i = 1; i < len; i++) {
+      preEmph[i] = frame[i] - 0.97 * frame[i - 1];
+    }
+    for (let i = 0; i < this.nFft; i++) {
+      preEmph[i] *= this.hammingWindow[i];
+    }
+    const spectrum = this.fftMagnitude(preEmph);
+    const melEnergies = new Float32Array(this.nMels);
+    for (let m = 0; m < this.nMels; m++) {
+      let sum = 0;
+      const filter = this.melFilterbank[m];
+      for (let k = 0; k < filter.length; k++) {
+        sum += spectrum[k] * filter[k];
+      }
+      melEnergies[m] = Math.log(Math.max(sum, 1e-10));
+    }
+    const mfcc = new Float32Array(this.nMfcc);
+    for (let i = 0; i < this.nMfcc; i++) {
+      let sum = 0;
+      const dctRow = this.dctMatrix[i];
+      for (let j = 0; j < this.nMels; j++) {
+        sum += dctRow[j] * melEnergies[j];
+      }
+      mfcc[i] = sum;
+    }
+    return mfcc;
   }
   /**
    * Compute RMS energy of the frame.
@@ -85,22 +132,217 @@ var FeatureExtractor = class {
     return Math.sqrt(sum / frame.length);
   }
   /**
-   * Estimate fundamental frequency (pitch) using autocorrelation.
+   * Estimate fundamental frequency (pitch) using simplified autocorrelation.
+   *
+   * Looks for the dominant periodicity in the signal within the
+   * speech frequency range (50-500 Hz). Returns 0 for unvoiced frames.
+   */
+  estimatePitch(frame) {
+    const minPeriod = Math.floor(this.sampleRate / 500);
+    const maxPeriod = Math.floor(this.sampleRate / 50);
+    const len = Math.min(frame.length, this.nFft);
+    if (len < maxPeriod * 2) return 0;
+    let bestCorr = 0;
+    let bestLag = 0;
+    let energy = 0;
+    for (let i = 0; i < len; i++) {
+      energy += frame[i] * frame[i];
+    }
+    if (energy < 1e-10) return 0;
+    for (let lag = minPeriod; lag <= maxPeriod && lag < len; lag++) {
+      let corr = 0;
+      let energyLag = 0;
+      const limit = len - lag;
+      for (let i = 0; i < limit; i++) {
+        corr += frame[i] * frame[i + lag];
+        energyLag += frame[i + lag] * frame[i + lag];
+      }
+      const norm = Math.sqrt(energy * energyLag);
+      if (norm > 0) {
+        corr /= norm;
+      }
+      if (corr > bestCorr) {
+        bestCorr = corr;
+        bestLag = lag;
+      }
+    }
+    if (bestCorr < 0.3 || bestLag === 0) return 0;
+    return this.sampleRate / bestLag;
+  }
+  /**
+   * Estimate speech rate from rolling energy envelope.
    *
-   *
+   * Counts energy peaks in a 1-second sliding window.
+   * Returns a normalized value (~0-1 range, where 0.3-0.7 is typical speech).
    */
-
-
-
+  estimateSpeechRate(energy) {
+    this.energyBuffer[this.energyBufferIdx] = energy;
+    this.energyBufferIdx = (this.energyBufferIdx + 1) % this.energyBuffer.length;
+    if (this.energyBufferIdx === 0) this.energyBufferFull = true;
+    const len = this.energyBufferFull ? this.energyBuffer.length : this.energyBufferIdx;
+    if (len < 5) return 0;
+    let peaks = 0;
+    const threshold = this.silenceThreshold * 0.5;
+    for (let i = 2; i < len - 2; i++) {
+      const idx = (this.energyBufferIdx - len + i + this.energyBuffer.length) % this.energyBuffer.length;
+      const prev = this.energyBuffer[(idx - 1 + this.energyBuffer.length) % this.energyBuffer.length];
+      const curr = this.energyBuffer[idx];
+      const next = this.energyBuffer[(idx + 1) % this.energyBuffer.length];
+      if (curr > prev && curr > next && curr > threshold) {
+        peaks++;
+      }
+    }
+    const windowDuration = len * this.frameDurationSec;
+    const rate = windowDuration > 0 ? peaks / windowDuration : 0;
+    return rate / 10;
   }
   /**
-   *
+   * Track accumulated pause duration.
    *
-   *
+   * Returns pause duration in seconds, capped at 5s and normalized to [0, 1].
    */
-
-
+  updatePauseDuration(energy) {
+    if (energy < this.silenceThreshold) {
+      this.silenceAccumulator += this.frameDurationSec;
+    } else {
+      this.silenceAccumulator = 0;
+    }
+    return Math.min(this.silenceAccumulator, 5) / 5;
+  }
+  /**
+   * Compute FFT magnitude spectrum (power spectrum).
+   *
+   * Uses a radix-2 DIT FFT implementation. For frames smaller than
+   * nFft, zero-pads to the next power of 2.
+   */
+  fftMagnitude(signal) {
+    let n = 1;
+    while (n < signal.length) n *= 2;
+    const real = new Float32Array(n);
+    const imag = new Float32Array(n);
+    real.set(signal);
+    let j = 0;
+    for (let i = 0; i < n; i++) {
+      if (i < j) {
+        [real[i], real[j]] = [real[j], real[i]];
+        [imag[i], imag[j]] = [imag[j], imag[i]];
+      }
+      let m = n >> 1;
+      while (m >= 1 && j >= m) {
+        j -= m;
+        m >>= 1;
+      }
+      j += m;
+    }
+    for (let size = 2; size <= n; size *= 2) {
+      const halfSize = size / 2;
+      const angle = -2 * Math.PI / size;
+      const wReal = Math.cos(angle);
+      const wImag = Math.sin(angle);
+      for (let i = 0; i < n; i += size) {
+        let curReal = 1;
+        let curImag = 0;
+        for (let k = 0; k < halfSize; k++) {
+          const evenIdx = i + k;
+          const oddIdx = i + k + halfSize;
+          const tReal = curReal * real[oddIdx] - curImag * imag[oddIdx];
+          const tImag = curReal * imag[oddIdx] + curImag * real[oddIdx];
+          real[oddIdx] = real[evenIdx] - tReal;
+          imag[oddIdx] = imag[evenIdx] - tImag;
+          real[evenIdx] += tReal;
+          imag[evenIdx] += tImag;
+          const newCurReal = curReal * wReal - curImag * wImag;
+          curImag = curReal * wImag + curImag * wReal;
+          curReal = newCurReal;
+        }
+      }
+    }
+    const numBins = n / 2 + 1;
+    const power = new Float32Array(numBins);
+    for (let i = 0; i < numBins; i++) {
+      power[i] = (real[i] * real[i] + imag[i] * imag[i]) / n;
+    }
+    return power;
+  }
+  /**
+   * Create Mel filterbank matrix.
+   *
+   * Produces nMels triangular filters spanning the frequency range
+   * from 0 to sampleRate/2 on the Mel scale.
+   */
+  createMelFilterbank() {
+    let n = 1;
+    while (n < this.nFft) n *= 2;
+    const fftBins = n / 2 + 1;
+    const fMin = 0;
+    const fMax = this.sampleRate / 2;
+    const melMin = this.hzToMel(fMin);
+    const melMax = this.hzToMel(fMax);
+    const melPoints = new Float32Array(this.nMels + 2);
+    for (let i = 0; i < this.nMels + 2; i++) {
+      melPoints[i] = melMin + i * (melMax - melMin) / (this.nMels + 1);
+    }
+    const binIndices = new Float32Array(this.nMels + 2);
+    for (let i = 0; i < this.nMels + 2; i++) {
+      const hz = this.melToHz(melPoints[i]);
+      binIndices[i] = Math.floor((n + 1) * hz / this.sampleRate);
+    }
+    const filters = [];
+    for (let m = 0; m < this.nMels; m++) {
+      const filter = new Float32Array(fftBins);
+      const left = binIndices[m];
+      const center = binIndices[m + 1];
+      const right = binIndices[m + 2];
+      for (let k = 0; k < fftBins; k++) {
+        if (k >= left && k <= center && center > left) {
+          filter[k] = (k - left) / (center - left);
+        } else if (k > center && k <= right && right > center) {
+          filter[k] = (right - k) / (right - center);
+        }
+      }
+      filters.push(filter);
+    }
+    return filters;
+  }
+  /**
+   * Create DCT-II matrix for MFCC computation.
+   */
+  createDCTMatrix() {
+    const matrix = [];
+    const scale = Math.sqrt(2 / this.nMels);
+    for (let i = 0; i < this.nMfcc; i++) {
+      const row = new Float32Array(this.nMels);
+      for (let j = 0; j < this.nMels; j++) {
+        row[j] = scale * Math.cos(Math.PI * i * (j + 0.5) / this.nMels);
+      }
+      matrix.push(row);
+    }
+    return matrix;
+  }
+  hzToMel(hz) {
+    return 2595 * Math.log10(1 + hz / 700);
   }
+  melToHz(mel) {
+    return 700 * (Math.pow(10, mel / 2595) - 1);
+  }
+  /**
+   * Reset internal state (energy buffer, pause accumulator).
+   */
+  reset() {
+    this.silenceAccumulator = 0;
+    this.energyBuffer.fill(0);
+    this.energyBufferIdx = 0;
+    this.energyBufferFull = false;
+  }
+};
+
+// src/types.ts
+var MODEL_CDN_URL = "https://pub-46a5feb0029246bcbc93fab6162cff94.r2.dev/v0.0.2/utterance-v1.onnx";
+var DEFAULT_OPTIONS = {
+  sensitivity: 0.5,
+  pauseTolerance: 1500,
+  modelPath: "cdn",
+  sampleRate: 16e3
 };
 
 // src/model/energy-vad.ts
@@ -151,45 +393,187 @@ var EnergyVAD = class {
 };
 
 // src/model/onnx.ts
+var LABELS = [
+  "speaking",
+  "thinking_pause",
+  "turn_complete",
+  "interrupt_intent"
+];
+var FEATURE_DIM = 17;
+var CONTEXT_FRAMES = 100;
+var INFERENCE_INTERVAL = 10;
 var ONNXModel = class {
   session = null;
+  ort = null;
   fallback;
-
+  useWebGpu;
+  /** Circular buffer of feature vectors for the context window. */
+  frameBuffer;
+  bufferIdx = 0;
+  framesBuffered = 0;
+  framesSinceInference = 0;
+  /** Cache the last inference result for frames between batches. */
+  lastResult = null;
+  constructor(sensitivity = 0.5, useWebGpu = false) {
     this.fallback = new EnergyVAD(sensitivity);
+    this.useWebGpu = useWebGpu;
+    this.frameBuffer = new Float32Array(CONTEXT_FRAMES * FEATURE_DIM);
   }
   /**
-   * Load the ONNX model from
+   * Load the ONNX model from CDN, bundled path, or custom URL.
+   *
+   * Dynamically imports onnxruntime-web to avoid bundling it
+   * when the model isn't used (tree-shaking friendly).
    *
-   *
-   * 1. Import onnxruntime-web InferenceSession
-   * 2. Load model bytes
-   * 3. Create session with appropriate execution providers
+   * @param path - "cdn" (default, loads from Cloudflare R2), "bundled" (from npm package), or a custom URL.
    */
-  async load(
-
+  async load(path) {
+    try {
+      const ort = await import("onnxruntime-web");
+      this.ort = ort;
+      let modelSource = path;
+      if (path === "cdn") {
+        try {
+          const response = await fetch(MODEL_CDN_URL);
+          if (response.ok) {
+            modelSource = await response.arrayBuffer();
+          } else {
+            throw new Error(`Failed to fetch CDN model: ${response.status}`);
+          }
+        } catch {
+          console.warn("[utterance] CDN model unavailable, falling back to EnergyVAD");
+          this.session = null;
+          return;
+        }
+      } else if (path === "bundled") {
+        try {
+          const getUrl = new Function("p", "b", "return new URL(p, b).href");
+          const href = getUrl("../../models/utterance-v1.onnx", import.meta.url);
+          const response = await fetch(href);
+          if (response.ok) {
+            modelSource = await response.arrayBuffer();
+          } else {
+            throw new Error(`Failed to fetch bundled model: ${response.status}`);
+          }
+        } catch {
+          console.warn("[utterance] Bundled model not found, falling back to EnergyVAD");
+          this.session = null;
+          return;
+        }
+      }
+      const providers = this.useWebGpu ? ["webgpu", "wasm"] : ["wasm"];
+      this.session = await ort.InferenceSession.create(modelSource, {
+        executionProviders: providers
+      });
+    } catch (err) {
+      console.warn("[utterance] Failed to load ONNX model, falling back to EnergyVAD:", err);
+      this.session = null;
+    }
   }
   /**
-   * Run inference on
+   * Run inference on extracted features.
    *
-   *
-   *
-   *
-   * 3. Parse output into ClassificationResult
+   * Buffers frames into a sliding window and runs the ONNX model
+   * every 100ms (10 frames). Between inference runs, returns the
+   * cached result. Falls back to EnergyVAD when no model is loaded.
    */
   async predict(features) {
-    if (!this.session) {
+    if (!this.session || !this.ort) {
       return this.fallback.classify(features);
     }
-
+    this.addFrame(features);
+    this.framesSinceInference++;
+    if (this.framesSinceInference >= INFERENCE_INTERVAL && this.framesBuffered >= CONTEXT_FRAMES) {
+      this.framesSinceInference = 0;
+      try {
+        this.lastResult = await this.runInference();
+      } catch (err) {
+        console.warn("[utterance] ONNX inference failed, using EnergyVAD:", err);
+        return this.fallback.classify(features);
+      }
+    }
+    return this.lastResult ?? this.fallback.classify(features);
   }
   /**
    * Release model resources.
    */
   dispose() {
+    if (this.session) {
+      this.session.release().catch(() => {
+      });
+    }
     this.session = null;
+    this.ort = null;
     this.fallback.reset();
+    this.resetBuffer();
+  }
+  /**
+   * Add a feature frame to the circular buffer.
+   */
+  addFrame(features) {
+    const offset = this.bufferIdx * FEATURE_DIM;
+    this.frameBuffer.set(features.mfcc, offset);
+    this.frameBuffer[offset + 13] = features.energy;
+    this.frameBuffer[offset + 14] = features.pitch;
+    this.frameBuffer[offset + 15] = features.speechRate;
+    this.frameBuffer[offset + 16] = features.pauseDuration;
+    this.bufferIdx = (this.bufferIdx + 1) % CONTEXT_FRAMES;
+    if (this.framesBuffered < CONTEXT_FRAMES) {
+      this.framesBuffered++;
+    }
+  }
+  /**
+   * Build the input tensor from the circular buffer and run ONNX inference.
+   */
+  async runInference() {
+    const ort = this.ort;
+    const session = this.session;
+    const input = new Float32Array(CONTEXT_FRAMES * FEATURE_DIM);
+    for (let i = 0; i < CONTEXT_FRAMES; i++) {
+      const srcIdx = (this.bufferIdx - CONTEXT_FRAMES + i + CONTEXT_FRAMES) % CONTEXT_FRAMES * FEATURE_DIM;
+      const dstIdx = i * FEATURE_DIM;
+      input.set(this.frameBuffer.subarray(srcIdx, srcIdx + FEATURE_DIM), dstIdx);
+    }
+    const tensor = new ort.Tensor("float32", input, [1, CONTEXT_FRAMES, FEATURE_DIM]);
+    const results = await session.run({ input: tensor });
+    const output = results.output;
+    const logits = output.data;
+    const probs = softmax(logits);
+    let bestIdx = 0;
+    let bestProb = probs[0];
+    for (let i = 1; i < probs.length; i++) {
+      if (probs[i] > bestProb) {
+        bestProb = probs[i];
+        bestIdx = i;
+      }
+    }
+    return {
+      label: LABELS[bestIdx],
+      confidence: bestProb,
+      timestamp: Date.now()
+    };
+  }
+  resetBuffer() {
+    this.frameBuffer.fill(0);
+    this.bufferIdx = 0;
+    this.framesBuffered = 0;
+    this.framesSinceInference = 0;
+    this.lastResult = null;
   }
 };
+function softmax(logits) {
+  const max = logits.reduce((a, b) => Math.max(a, b), -Infinity);
+  const exps = new Float32Array(logits.length);
+  let sum = 0;
+  for (let i = 0; i < logits.length; i++) {
+    exps[i] = Math.exp(logits[i] - max);
+    sum += exps[i];
+  }
+  for (let i = 0; i < exps.length; i++) {
+    exps[i] /= sum;
+  }
+  return exps;
+}
 
 // src/detector/turn-detector.ts
 var TurnDetector = class {
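This ESM bundle carries the same logic as `index.cjs` above, with one divergence worth noting: here the "bundled" branch reads `import.meta.url` directly, while the CJS build substitutes the empty `var import_meta = {}` shim. A condensed look at the two variants, copied from the diffs above with the consequence spelled out:

```ts
// ESM build (dist/index.js): the "bundled" branch resolves the model
// relative to the module itself.
const esmHref = new URL("../../models/utterance-v1.onnx", import.meta.url).href;

// CJS build (dist/index.cjs): import_meta is the empty shim `{}`, so
// import_meta.url is undefined, new URL(...) throws inside the try block,
// and the catch falls through to the EnergyVAD warning. The "cdn" default
// and custom URLs are unaffected.
```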
@@ -280,14 +664,6 @@ var TurnDetector = class {
   }
 };
 
-// src/types.ts
-var DEFAULT_OPTIONS = {
-  sensitivity: 0.5,
-  pauseTolerance: 1500,
-  modelPath: "bundled",
-  sampleRate: 16e3
-};
-
 // src/utterance.ts
 var Utterance = class {
   options;
package/models/utterance-v1.onnx
CHANGED
Binary file
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@utterance/core",
-  "version": "0.0.1",
+  "version": "0.0.3",
   "description": "Client-side semantic endpointing. Know when they're done talking.",
   "type": "module",
   "main": "dist/index.cjs",
@@ -76,6 +76,7 @@
   "dependencies": {
     "@next/third-parties": "^16.1.6",
     "@react-three/fiber": "^9.5.0",
+    "@utterance/core": "^0.0.2",
     "class-variance-authority": "^0.7.1",
     "clsx": "^2.1.1",
     "fumadocs-core": "^16.6.3",