npm - @fonoster/autopilot - Versions diffs - 0.8.47 → 0.8.50 - Mend

@fonoster/autopilot 0.8.47 → 0.8.50

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in the public registry.

Files changed (13)

package/dist/envs.js +1 -1
package/dist/vad/SileroVad.js +2 -2
package/dist/vad/chunkToFloat32Array.js +11 -16
package/dist/vad/createVad.d.ts +3 -0
package/dist/vad/{makeVad.js → createVad.js} +46 -29
package/dist/vad/types.d.ts +7 -1
package/dist/vadv5/SileroVad.js +1 -1
package/dist/vadv5/SileroVadModel.js +20 -7
package/dist/vadv5/createVad.d.ts +3 -7
package/dist/vadv5/createVad.js +51 -29
package/dist/vadv5/types.d.ts +25 -13
package/package.json +5 -5
package/dist/vad/makeVad.d.ts +0 -7

package/dist/envs.js CHANGED Viewed

@@ -39,7 +39,7 @@ exports.KNOWLEDGE_BASE_ENABLED = e.KNOWLEDGE_BASE_ENABLED === "true";
 exports.NODE_ENV = e.NODE_ENV || "production";
 exports.UNSTRUCTURED_API_KEY = e.UNSTRUCTURED_API_KEY;
 exports.UNSTRUCTURED_API_URL = e.UNSTRUCTURED_API_URL ?? "https://api.unstructuredapp.io/general/v0/general";
-exports.SILERO_VAD_VERSION = e.SILERO_VAD_VERSION ?? "v4";
+exports.SILERO_VAD_VERSION = e.SILERO_VAD_VERSION ?? "v5";
 exports.CONVERSATION_PROVIDER = e.CONVERSATION_PROVIDER
     ? e.CONVERSATION_PROVIDER
     : types_1.ConversationProvider.FILE;

package/dist/vad/SileroVad.js CHANGED Viewed

@@ -21,7 +21,7 @@ exports.SileroVad = void 0;
  * limitations under the License.
  */
 const logger_1 = require("@fonoster/logger");
-const makeVad_1 = require("./makeVad");
+const createVad_1 = require("./createVad");
 const logger = (0, logger_1.getLogger)({ service: "autopilot", filePath: __filename });
 class SileroVad {
     constructor(params) {
@@ -29,7 +29,7 @@ class SileroVad {
         this.params = params;
     }
     async init() {
-        this.vad = await (0, makeVad_1.makeVad)(this.params);
+        this.vad = await (0, createVad_1.createVad)(this.params);
     }
     processChunk(data, callback) {
         if (!this.vad) {

package/dist/vad/chunkToFloat32Array.js CHANGED Viewed

@@ -1,4 +1,6 @@
 "use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.chunkToFloat32Array = chunkToFloat32Array;
 /*
  * Copyright (C) 2025 by Fonoster Inc (https://fonoster.com)
  * http://github.com/fonoster/fonoster
@@ -17,27 +19,20 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.chunkToFloat32Array = chunkToFloat32Array;
 // This version of the chunkToFloat32Array accounts for the case where
 // the byteOffset is misaligned.
 //
 // Q. Would it be the same if we just created a new Uint8Array from the chunk?
 function chunkToFloat32Array(chunk) {
-    // Check if byteOffset is not aligned
-    const alignedByteOffset = chunk.byteOffset % Int16Array.BYTES_PER_ELEMENT === 0;
-    let int16Array;
-    if (alignedByteOffset) {
-        int16Array = new Int16Array(chunk.buffer, chunk.byteOffset, chunk.byteLength / Int16Array.BYTES_PER_ELEMENT);
-    }
-    else {
-        // Create a new aligned Uint8Array and then an Int16Array from it
-        const alignedChunk = new Uint8Array(chunk);
-        int16Array = new Int16Array(alignedChunk.buffer, alignedChunk.byteOffset, alignedChunk.byteLength / Int16Array.BYTES_PER_ELEMENT);
-    }
-    const floatArray = new Float32Array(int16Array.length);
-    for (let i = 0; i < int16Array.length; i++) {
-        floatArray[i] = int16Array[i] / 32768.0;
+    // Create a DataView to handle endianness explicitly
+    const dataView = new DataView(chunk.buffer, chunk.byteOffset, chunk.byteLength);
+    const floatArray = new Float32Array(chunk.byteLength / 2);
+    // Convert each 16-bit sample to float32, explicitly handling little-endian
+    for (let i = 0; i < floatArray.length; i++) {
+        // Read 16-bit value with explicit little-endian
+        const int16Value = dataView.getInt16(i * 2, true); // true = little-endian
+        // Normalize to [-1, 1]
+        floatArray[i] = int16Value / 32768.0;
     }
     return floatArray;
 }

package/dist/vad/createVad.d.ts ADDED Viewed

@@ -0,0 +1,3 @@
+import { VadParams } from "./types";
+declare function createVad(params: VadParams): Promise<(chunk: Uint8Array, callback: (event: "SPEECH_START" | "SPEECH_END") => void) => Promise<void>>;
+export { createVad };

package/dist/vad/{makeVad.js → createVad.js} RENAMED Viewed

@@ -33,7 +33,7 @@ var __importStar = (this && this.__importStar) || (function () {
     };
 })();
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.makeVad = makeVad;
+exports.createVad = createVad;
 /* eslint-disable no-loops/no-loops */
 /*
  * Copyright (C) 2025 by Fonoster Inc (https://fonoster.com)
@@ -59,46 +59,63 @@ const ort = __importStar(require("onnxruntime-node"));
 const chunkToFloat32Array_1 = require("./chunkToFloat32Array");
 const SileroVadModel_1 = require("./SileroVadModel");
 const logger = (0, logger_1.getLogger)({ service: "autopilot", filePath: __filename });
-const BUFFER_SIZE = 8000;
-async function makeVad(params) {
+const FULL_FRAME_SIZE = 1600; // Equivalent to 100ms @ 16kHz
+const FRAME_SIZE = 480; // Use last 30ms from the full frame for VAD processing
+async function createVad(params) {
     const { pathToModel, activationThreshold, deactivationThreshold, debounceFrames } = params;
     const effectivePath = pathToModel || (0, path_1.join)(__dirname, "..", "..", "silero_vad.onnx");
     const silero = await SileroVadModel_1.SileroVadModel.new(ort, effectivePath);
     let audioBuffer = [];
     let isSpeechActive = false;
-    let consecutiveSpeechFrames = 0;
-    let consecutiveNonSpeechFrames = 0;
+    let framesSinceStateChange = 0;
+    // Helper to reset internal state after a state change.
+    const resetState = () => {
+        isSpeechActive = false;
+        framesSinceStateChange = 0;
+        audioBuffer = [];
+        silero.resetState();
+        logger.silly("State reset -- audioBuffer cleared");
+    };
     return async function process(chunk, callback) {
         const float32Array = (0, chunkToFloat32Array_1.chunkToFloat32Array)(chunk);
         audioBuffer.push(...float32Array);
-        const processBuffer = async (buffer) => {
-            if (buffer.length < BUFFER_SIZE)
-                return buffer;
-            const audioFrame = buffer.slice(0, BUFFER_SIZE);
-            const remainingBuffer = buffer.slice(BUFFER_SIZE);
-            const result = await silero.process(new Float32Array(audioFrame));
-            logger.silly("last vad result", { ...result });
-            if (result.isSpeech > activationThreshold) {
-                consecutiveNonSpeechFrames = 0; // Reset non-speech counter
-                consecutiveSpeechFrames++;
-                if (consecutiveSpeechFrames >= debounceFrames && !isSpeechActive) {
-                    isSpeechActive = true;
-                    callback("SPEECH_START");
-                }
-            }
-            else {
-                consecutiveSpeechFrames = 0; // Reset speech counter
-                consecutiveNonSpeechFrames++;
-                if (consecutiveNonSpeechFrames >= debounceFrames &&
-                    isSpeechActive &&
-                    result.isSpeech < deactivationThreshold) {
+        // Process full frames from the buffer
+        while (audioBuffer.length >= FULL_FRAME_SIZE) {
+            // Extract one full frame worth of samples
+            const fullFrame = audioBuffer.slice(0, FULL_FRAME_SIZE);
+            audioBuffer = audioBuffer.slice(FULL_FRAME_SIZE);
+            // Use the last FRAME_SIZE samples from the full frame for VAD processing
+            const frame = fullFrame.slice(fullFrame.length - FRAME_SIZE);
+            const result = await silero.process(new Float32Array(frame));
+            const rawScore = result.isSpeech;
+            logger.silly("Frame processing", {
+                rawScore,
+                isSpeechActive,
+                framesSinceStateChange,
+                pendingSamples: audioBuffer.length
+            });
+            framesSinceStateChange++;
+            if (isSpeechActive) {
+                // If currently in speech, check if the score has dropped below the deactivation threshold
+                if (rawScore < deactivationThreshold &&
+                    framesSinceStateChange >= debounceFrames) {
                     isSpeechActive = false;
                     callback("SPEECH_END");
                     silero.resetState(); // Reset VAD state after speech ends
+                    framesSinceStateChange = 0;
+                    logger.silly("Speech end detected", { rawScore });
                 }
             }
-            return processBuffer(remainingBuffer);
-        };
-        audioBuffer = await processBuffer(audioBuffer);
+            else {
+                // If not currently in speech, check if the score exceeds the activation threshold
+                if (rawScore > activationThreshold &&
+                    framesSinceStateChange >= debounceFrames) {
+                    isSpeechActive = true;
+                    framesSinceStateChange = 0;
+                    callback("SPEECH_START");
+                    logger.silly("Speech start detected", { rawScore });
+                }
+            }
+        }
     };
 }

package/dist/vad/types.d.ts CHANGED Viewed

@@ -2,6 +2,12 @@ type VadEvent = "SPEECH_START" | "SPEECH_END";
 type Vad = {
     processChunk: (chunk: Uint8Array, callback: (event: VadEvent) => void) => void;
 };
+type VadParams = {
+    pathToModel?: string;
+    activationThreshold: number;
+    deactivationThreshold: number;
+    debounceFrames: number;
+};
 type SpeechProbabilities = {
     notSpeech: number;
     isSpeech: number;
@@ -17,4 +23,4 @@ type ONNXRuntimeAPI = {
         new (type: "float32", data: Float32Array, dims: [1, number]): unknown;
     };
 };
-export { ONNXRuntimeAPI, SpeechProbabilities, Vad, VadEvent };
+export { ONNXRuntimeAPI, SpeechProbabilities, Vad, VadParams, VadEvent };

package/dist/vadv5/SileroVad.js CHANGED Viewed

@@ -33,7 +33,7 @@ class SileroVad {
     }
     processChunk(data, callback) {
         if (!this.vad) {
-            throw new Error("VAD not initialized)");
+            throw new Error("VAD not initialized");
         }
         this.vad(data, callback);
     }

package/dist/vadv5/SileroVadModel.js CHANGED Viewed

@@ -21,9 +21,10 @@ exports.SileroVadModel = void 0;
  * limitations under the License.
  */
 const fs_1 = require("fs");
+const SAMPLE_RATE = 16000;
 function getNewState(ortInstance) {
-    const zeroes = Array(2 * 128).fill(0);
-    return new ortInstance.Tensor("float32", zeroes, [2, 1, 128]);
+    return new ortInstance.Tensor("float32", new Float32Array(2 * 1 * 128), // Use Float32Array for consistency
+    [2, 1, 128]);
 }
 class SileroVadModel {
     constructor(ort, pathToModel) {
@@ -37,20 +38,32 @@ class SileroVadModel {
         const modelArrayBuffer = (0, fs_1.readFileSync)(this.pathToModel).buffer;
         const sessionOption = { interOpNumThreads: 1, intraOpNumThreads: 1 };
         this._session = await this.ort.InferenceSession.create(modelArrayBuffer, sessionOption);
-        this._sr = new this.ort.Tensor("int64", [16000n]);
+        // Validate model inputs/outputs
+        const requiredInputs = ["input", "state", "sr"];
+        for (const name of requiredInputs) {
+            if (!this._session.inputNames.includes(name)) {
+                throw new Error(`Model is missing expected input "${name}"`);
+            }
+        }
+        if (!this._session.outputNames.includes("output") ||
+            !this._session.outputNames.includes("stateN")) {
+            throw new Error("Model is missing expected outputs");
+        }
+        // Use BigInt for sample rate tensor
+        this._sr = new this.ort.Tensor("int64", [BigInt(SAMPLE_RATE)], []);
         this._state = getNewState(this.ort);
     }
     async process(audioFrame) {
-        const t = new this.ort.Tensor("float32", audioFrame, [
+        const inputTensor = new this.ort.Tensor("float32", audioFrame, [
             1,
             audioFrame.length
         ]);
-        const inputs = {
-            input: t,
+        const feeds = {
+            input: inputTensor,
             state: this._state,
             sr: this._sr
         };
-        const out = await this._session.run(inputs);
+        const out = await this._session.run(feeds);
         this._state = out.stateN;
         const [isSpeech] = out.output.data;
         const notSpeech = 1 - isSpeech;

package/dist/vadv5/createVad.d.ts CHANGED Viewed

@@ -1,7 +1,3 @@
-declare function createVad(params: {
-    pathToModel: string;
-    activationThreshold: number;
-    deactivationThreshold: number;
-    debounceFrames: number;
-}): Promise<(chunk: Uint8Array, callback: (event: "SPEECH_START" | "SPEECH_END") => void) => Promise<void>>;
-export { createVad };
+import { VadParams } from "../vad/types";
+declare function createVad(params: VadParams): Promise<(chunk: Uint8Array, callback: (event: "SPEECH_START" | "SPEECH_END") => void) => Promise<void>>;
+export { createVad, VadParams };

package/dist/vadv5/createVad.js CHANGED Viewed

@@ -56,44 +56,66 @@ exports.createVad = createVad;
 const ort = __importStar(require("onnxruntime-node"));
 const chunkToFloat32Array_1 = require("../vad/chunkToFloat32Array");
 const SileroVadModel_1 = require("./SileroVadModel");
-const BUFFER_SIZE = 512;
+const logger_1 = require("@fonoster/logger");
+const path_1 = require("path");
+const logger = (0, logger_1.getLogger)({ service: "autopilot", filePath: __filename });
+const FULL_FRAME_SIZE = 1024; // 64ms @ 16kHz
+const BUFFER_SIZE = 512; // 32ms @ 16kHz
 async function createVad(params) {
     const { pathToModel, activationThreshold, deactivationThreshold, debounceFrames } = params;
-    const silero = await SileroVadModel_1.SileroVadModel.new(ort, pathToModel);
-    let audioBuffer = [];
+    const effectivePath = pathToModel || (0, path_1.join)(__dirname, "..", "..", "silero_vad_v5.onnx");
+    const silero = await SileroVadModel_1.SileroVadModel.new(ort, effectivePath);
+    let sampleBuffer = [];
     let isSpeechActive = false;
-    let consecutiveSpeechFrames = 0;
-    let consecutiveNonSpeechFrames = 0;
+    let framesSinceStateChange = 0;
+    // Reset internal state after a state change.
+    const resetState = () => {
+        isSpeechActive = false;
+        framesSinceStateChange = 0;
+        // Clear any pending audio samples to avoid using outdated values.
+        sampleBuffer = [];
+        silero.resetState();
+        logger.silly("State reset -- sampleBuffer cleared");
+    };
     return async function process(chunk, callback) {
+        // Convert the incoming chunk to normalized Float32 samples (using chunkToFloat32Array)
         const float32Array = (0, chunkToFloat32Array_1.chunkToFloat32Array)(chunk);
-        audioBuffer.push(...float32Array);
-        const processBuffer = async (buffer) => {
-            if (buffer.length < BUFFER_SIZE)
-                return buffer;
-            const audioFrame = buffer.slice(0, BUFFER_SIZE);
-            const remainingBuffer = buffer.slice(BUFFER_SIZE);
-            const result = await silero.process(new Float32Array(audioFrame));
-            if (result.isSpeech > activationThreshold) {
-                consecutiveNonSpeechFrames = 0; // Reset non-speech counter
-                consecutiveSpeechFrames++;
-                if (consecutiveSpeechFrames >= debounceFrames && !isSpeechActive) {
-                    isSpeechActive = true;
-                    callback("SPEECH_START");
+        sampleBuffer.push(...float32Array);
+        // Wait until we've collected a full frame worth of samples.
+        while (sampleBuffer.length >= FULL_FRAME_SIZE) {
+            const fullFrame = sampleBuffer.slice(0, FULL_FRAME_SIZE);
+            sampleBuffer = sampleBuffer.slice(FULL_FRAME_SIZE);
+            // Use the last BUFFER_SIZE samples from the full frame.
+            const frame = fullFrame.slice(fullFrame.length - BUFFER_SIZE);
+            const result = await silero.process(new Float32Array(frame));
+            const rawScore = result.isSpeech;
+            logger.silly("Frame processing", {
+                rawScore,
+                isSpeechActive,
+                framesSinceStateChange,
+                pendingSamples: sampleBuffer.length
+            });
+            framesSinceStateChange++;
+            if (isSpeechActive) {
+                // If already in speech, check if the score has dropped below deactivationThreshold
+                if (rawScore < deactivationThreshold &&
+                    framesSinceStateChange >= debounceFrames) {
+                    callback("SPEECH_END");
+                    resetState();
+                    logger.silly("Speech end detected", { rawScore });
+                    continue;
                 }
             }
             else {
-                consecutiveSpeechFrames = 0; // Reset speech counter
-                consecutiveNonSpeechFrames++;
-                if (consecutiveNonSpeechFrames >= debounceFrames &&
-                    isSpeechActive &&
-                    result.isSpeech < deactivationThreshold) {
-                    isSpeechActive = false;
-                    callback("SPEECH_END");
-                    silero.resetState(); // Reset VAD state after speech ends
+                // If currently not speaking, check if the score is above activationThreshold
+                if (rawScore > activationThreshold &&
+                    framesSinceStateChange >= debounceFrames) {
+                    isSpeechActive = true;
+                    framesSinceStateChange = 0;
+                    callback("SPEECH_START");
+                    logger.silly("Speech start detected", { rawScore });
                 }
             }
-            return processBuffer(remainingBuffer);
-        };
-        audioBuffer = await processBuffer(audioBuffer);
+        }
     };
 }

package/dist/vadv5/types.d.ts CHANGED Viewed

@@ -2,22 +2,34 @@ type VadEvent = "SPEECH_START" | "SPEECH_END";
 type Vad = {
     processChunk: (chunk: Uint8Array, callback: (event: VadEvent) => void) => void;
 };
-type SpeechProbabilities = {
+export interface SpeechProbabilities {
     notSpeech: number;
     isSpeech: number;
-};
-type ONNXRuntimeAPI = {
+}
+export interface ONNXRuntimeAPI {
     InferenceSession: {
-        create(modelArrayBuffer: ArrayBuffer, sessionOption: {
+        create: (modelPath: ArrayBuffer | string, options?: {
             interOpNumThreads: number;
             intraOpNumThreads: number;
-        }): Promise<unknown>;
-    };
-    Tensor: {
-        new (type: "int64", data: BigInt[]): unknown;
-        new (type: "int64", data: BigInt[], dims: [1]): unknown;
-        new (type: "float32", data: Float32Array | number[], dims: [2, 1, 128]): unknown;
-        new (type: "float32", data: Float32Array, dims: [1, number]): unknown;
+        }) => Promise<ONNXSession>;
     };
-};
-export { ONNXRuntimeAPI, SpeechProbabilities, Vad, VadEvent };
+    Tensor: new (type: string, data: Float32Array | bigint[], dims: number[]) => ONNXTensor;
+}
+export interface ONNXSession {
+    run: (feeds: {
+        [key: string]: ONNXTensor;
+    }) => Promise<{
+        output: {
+            data: Float32Array;
+        };
+        stateN: ONNXTensor;
+    }>;
+    inputNames: string[];
+    outputNames: string[];
+}
+export interface ONNXTensor {
+    data: Float32Array | bigint[];
+    dims: number[];
+    type: string;
+}
+export { Vad, VadEvent };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@fonoster/autopilot",
-  "version": "0.8.47",
+  "version": "0.8.50",
   "description": "Voice AI for the Fonoster platform",
   "author": "Pedro Sanders <psanders@fonoster.com>",
   "homepage": "https://github.com/fonoster/fonoster#readme",
@@ -33,11 +33,11 @@
   },
   "dependencies": {
     "@aws-sdk/client-s3": "^3.712.0",
-    "@fonoster/common": "^0.8.47",
+    "@fonoster/common": "^0.8.50",
     "@fonoster/logger": "^0.8.47",
-    "@fonoster/sdk": "^0.8.47",
+    "@fonoster/sdk": "^0.8.50",
     "@fonoster/types": "^0.8.47",
-    "@fonoster/voice": "^0.8.47",
+    "@fonoster/voice": "^0.8.50",
     "@langchain/community": "^0.3.19",
     "@langchain/core": "^0.3.23",
     "@langchain/groq": "^0.1.2",
@@ -55,5 +55,5 @@
   "devDependencies": {
     "typescript": "^5.5.4"
   },
-  "gitHead": "33970ac7794e8e55333936ad3ad84f7260d5c138"
+  "gitHead": "d0e373668e8e77295e3847c99a346c4aa1c8d3d7"
 }

package/dist/vad/makeVad.d.ts DELETED Viewed

@@ -1,7 +0,0 @@
-declare function makeVad(params: {
-    pathToModel?: string;
-    activationThreshold: number;
-    deactivationThreshold: number;
-    debounceFrames: number;
-}): Promise<(chunk: Uint8Array, callback: (event: "SPEECH_START" | "SPEECH_END") => void) => Promise<void>>;
-export { makeVad };