@fonoster/autopilot 0.8.47 → 0.8.50

This diff shows the changes between publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/dist/envs.js CHANGED
@@ -39,7 +39,7 @@ exports.KNOWLEDGE_BASE_ENABLED = e.KNOWLEDGE_BASE_ENABLED === "true";
39
39
  exports.NODE_ENV = e.NODE_ENV || "production";
40
40
  exports.UNSTRUCTURED_API_KEY = e.UNSTRUCTURED_API_KEY;
41
41
  exports.UNSTRUCTURED_API_URL = e.UNSTRUCTURED_API_URL ?? "https://api.unstructuredapp.io/general/v0/general";
42
- exports.SILERO_VAD_VERSION = e.SILERO_VAD_VERSION ?? "v4";
42
+ exports.SILERO_VAD_VERSION = e.SILERO_VAD_VERSION ?? "v5";
43
43
  exports.CONVERSATION_PROVIDER = e.CONVERSATION_PROVIDER
44
44
  ? e.CONVERSATION_PROVIDER
45
45
  : types_1.ConversationProvider.FILE;
@@ -21,7 +21,7 @@ exports.SileroVad = void 0;
21
21
  * limitations under the License.
22
22
  */
23
23
  const logger_1 = require("@fonoster/logger");
24
- const makeVad_1 = require("./makeVad");
24
+ const createVad_1 = require("./createVad");
25
25
  const logger = (0, logger_1.getLogger)({ service: "autopilot", filePath: __filename });
26
26
  class SileroVad {
27
27
  constructor(params) {
@@ -29,7 +29,7 @@ class SileroVad {
29
29
  this.params = params;
30
30
  }
31
31
  async init() {
32
- this.vad = await (0, makeVad_1.makeVad)(this.params);
32
+ this.vad = await (0, createVad_1.createVad)(this.params);
33
33
  }
34
34
  processChunk(data, callback) {
35
35
  if (!this.vad) {
@@ -1,4 +1,6 @@
1
1
  "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.chunkToFloat32Array = chunkToFloat32Array;
2
4
  /*
3
5
  * Copyright (C) 2025 by Fonoster Inc (https://fonoster.com)
4
6
  * http://github.com/fonoster/fonoster
@@ -17,27 +19,20 @@
17
19
  * See the License for the specific language governing permissions and
18
20
  * limitations under the License.
19
21
  */
20
- Object.defineProperty(exports, "__esModule", { value: true });
21
- exports.chunkToFloat32Array = chunkToFloat32Array;
22
22
  // This version of the chunkToFloat32Array accounts for the case where
23
23
  // the byteOffset is misaligned.
24
24
  //
25
25
  // Q. Would it be the same if we just created a new Uint8Array from the chunk?
26
26
  function chunkToFloat32Array(chunk) {
27
- // Check if byteOffset is not aligned
28
- const alignedByteOffset = chunk.byteOffset % Int16Array.BYTES_PER_ELEMENT === 0;
29
- let int16Array;
30
- if (alignedByteOffset) {
31
- int16Array = new Int16Array(chunk.buffer, chunk.byteOffset, chunk.byteLength / Int16Array.BYTES_PER_ELEMENT);
32
- }
33
- else {
34
- // Create a new aligned Uint8Array and then an Int16Array from it
35
- const alignedChunk = new Uint8Array(chunk);
36
- int16Array = new Int16Array(alignedChunk.buffer, alignedChunk.byteOffset, alignedChunk.byteLength / Int16Array.BYTES_PER_ELEMENT);
37
- }
38
- const floatArray = new Float32Array(int16Array.length);
39
- for (let i = 0; i < int16Array.length; i++) {
40
- floatArray[i] = int16Array[i] / 32768.0;
27
+ // Create a DataView to handle endianness explicitly
28
+ const dataView = new DataView(chunk.buffer, chunk.byteOffset, chunk.byteLength);
29
+ const floatArray = new Float32Array(chunk.byteLength / 2);
30
+ // Convert each 16-bit sample to float32, explicitly handling little-endian
31
+ for (let i = 0; i < floatArray.length; i++) {
32
+ // Read 16-bit value with explicit little-endian
33
+ const int16Value = dataView.getInt16(i * 2, true); // true = little-endian
34
+ // Normalize to [-1, 1]
35
+ floatArray[i] = int16Value / 32768.0;
41
36
  }
42
37
  return floatArray;
43
38
  }
@@ -0,0 +1,3 @@
1
+ import { VadParams } from "./types";
2
+ declare function createVad(params: VadParams): Promise<(chunk: Uint8Array, callback: (event: "SPEECH_START" | "SPEECH_END") => void) => Promise<void>>;
3
+ export { createVad };
@@ -33,7 +33,7 @@ var __importStar = (this && this.__importStar) || (function () {
33
33
  };
34
34
  })();
35
35
  Object.defineProperty(exports, "__esModule", { value: true });
36
- exports.makeVad = makeVad;
36
+ exports.createVad = createVad;
37
37
  /* eslint-disable no-loops/no-loops */
38
38
  /*
39
39
  * Copyright (C) 2025 by Fonoster Inc (https://fonoster.com)
@@ -59,46 +59,63 @@ const ort = __importStar(require("onnxruntime-node"));
59
59
  const chunkToFloat32Array_1 = require("./chunkToFloat32Array");
60
60
  const SileroVadModel_1 = require("./SileroVadModel");
61
61
  const logger = (0, logger_1.getLogger)({ service: "autopilot", filePath: __filename });
62
- const BUFFER_SIZE = 8000;
63
- async function makeVad(params) {
62
+ const FULL_FRAME_SIZE = 1600; // Equivalent to 100ms @ 16kHz
63
+ const FRAME_SIZE = 480; // Use last 30ms from the full frame for VAD processing
64
+ async function createVad(params) {
64
65
  const { pathToModel, activationThreshold, deactivationThreshold, debounceFrames } = params;
65
66
  const effectivePath = pathToModel || (0, path_1.join)(__dirname, "..", "..", "silero_vad.onnx");
66
67
  const silero = await SileroVadModel_1.SileroVadModel.new(ort, effectivePath);
67
68
  let audioBuffer = [];
68
69
  let isSpeechActive = false;
69
- let consecutiveSpeechFrames = 0;
70
- let consecutiveNonSpeechFrames = 0;
70
+ let framesSinceStateChange = 0;
71
+ // Helper to reset internal state after a state change.
72
+ const resetState = () => {
73
+ isSpeechActive = false;
74
+ framesSinceStateChange = 0;
75
+ audioBuffer = [];
76
+ silero.resetState();
77
+ logger.silly("State reset -- audioBuffer cleared");
78
+ };
71
79
  return async function process(chunk, callback) {
72
80
  const float32Array = (0, chunkToFloat32Array_1.chunkToFloat32Array)(chunk);
73
81
  audioBuffer.push(...float32Array);
74
- const processBuffer = async (buffer) => {
75
- if (buffer.length < BUFFER_SIZE)
76
- return buffer;
77
- const audioFrame = buffer.slice(0, BUFFER_SIZE);
78
- const remainingBuffer = buffer.slice(BUFFER_SIZE);
79
- const result = await silero.process(new Float32Array(audioFrame));
80
- logger.silly("last vad result", { ...result });
81
- if (result.isSpeech > activationThreshold) {
82
- consecutiveNonSpeechFrames = 0; // Reset non-speech counter
83
- consecutiveSpeechFrames++;
84
- if (consecutiveSpeechFrames >= debounceFrames && !isSpeechActive) {
85
- isSpeechActive = true;
86
- callback("SPEECH_START");
87
- }
88
- }
89
- else {
90
- consecutiveSpeechFrames = 0; // Reset speech counter
91
- consecutiveNonSpeechFrames++;
92
- if (consecutiveNonSpeechFrames >= debounceFrames &&
93
- isSpeechActive &&
94
- result.isSpeech < deactivationThreshold) {
82
+ // Process full frames from the buffer
83
+ while (audioBuffer.length >= FULL_FRAME_SIZE) {
84
+ // Extract one full frame worth of samples
85
+ const fullFrame = audioBuffer.slice(0, FULL_FRAME_SIZE);
86
+ audioBuffer = audioBuffer.slice(FULL_FRAME_SIZE);
87
+ // Use the last FRAME_SIZE samples from the full frame for VAD processing
88
+ const frame = fullFrame.slice(fullFrame.length - FRAME_SIZE);
89
+ const result = await silero.process(new Float32Array(frame));
90
+ const rawScore = result.isSpeech;
91
+ logger.silly("Frame processing", {
92
+ rawScore,
93
+ isSpeechActive,
94
+ framesSinceStateChange,
95
+ pendingSamples: audioBuffer.length
96
+ });
97
+ framesSinceStateChange++;
98
+ if (isSpeechActive) {
99
+ // If currently in speech, check if the score has dropped below the deactivation threshold
100
+ if (rawScore < deactivationThreshold &&
101
+ framesSinceStateChange >= debounceFrames) {
95
102
  isSpeechActive = false;
96
103
  callback("SPEECH_END");
97
104
  silero.resetState(); // Reset VAD state after speech ends
105
+ framesSinceStateChange = 0;
106
+ logger.silly("Speech end detected", { rawScore });
98
107
  }
99
108
  }
100
- return processBuffer(remainingBuffer);
101
- };
102
- audioBuffer = await processBuffer(audioBuffer);
109
+ else {
110
+ // If not currently in speech, check if the score exceeds the activation threshold
111
+ if (rawScore > activationThreshold &&
112
+ framesSinceStateChange >= debounceFrames) {
113
+ isSpeechActive = true;
114
+ framesSinceStateChange = 0;
115
+ callback("SPEECH_START");
116
+ logger.silly("Speech start detected", { rawScore });
117
+ }
118
+ }
119
+ }
103
120
  };
104
121
  }
@@ -2,6 +2,12 @@ type VadEvent = "SPEECH_START" | "SPEECH_END";
2
2
  type Vad = {
3
3
  processChunk: (chunk: Uint8Array, callback: (event: VadEvent) => void) => void;
4
4
  };
5
+ type VadParams = {
6
+ pathToModel?: string;
7
+ activationThreshold: number;
8
+ deactivationThreshold: number;
9
+ debounceFrames: number;
10
+ };
5
11
  type SpeechProbabilities = {
6
12
  notSpeech: number;
7
13
  isSpeech: number;
@@ -17,4 +23,4 @@ type ONNXRuntimeAPI = {
17
23
  new (type: "float32", data: Float32Array, dims: [1, number]): unknown;
18
24
  };
19
25
  };
20
- export { ONNXRuntimeAPI, SpeechProbabilities, Vad, VadEvent };
26
+ export { ONNXRuntimeAPI, SpeechProbabilities, Vad, VadParams, VadEvent };
@@ -33,7 +33,7 @@ class SileroVad {
33
33
  }
34
34
  processChunk(data, callback) {
35
35
  if (!this.vad) {
36
- throw new Error("VAD not initialized)");
36
+ throw new Error("VAD not initialized");
37
37
  }
38
38
  this.vad(data, callback);
39
39
  }
@@ -21,9 +21,10 @@ exports.SileroVadModel = void 0;
21
21
  * limitations under the License.
22
22
  */
23
23
  const fs_1 = require("fs");
24
+ const SAMPLE_RATE = 16000;
24
25
  function getNewState(ortInstance) {
25
- const zeroes = Array(2 * 128).fill(0);
26
- return new ortInstance.Tensor("float32", zeroes, [2, 1, 128]);
26
+ return new ortInstance.Tensor("float32", new Float32Array(2 * 1 * 128), // Use Float32Array for consistency
27
+ [2, 1, 128]);
27
28
  }
28
29
  class SileroVadModel {
29
30
  constructor(ort, pathToModel) {
@@ -37,20 +38,32 @@ class SileroVadModel {
37
38
  const modelArrayBuffer = (0, fs_1.readFileSync)(this.pathToModel).buffer;
38
39
  const sessionOption = { interOpNumThreads: 1, intraOpNumThreads: 1 };
39
40
  this._session = await this.ort.InferenceSession.create(modelArrayBuffer, sessionOption);
40
- this._sr = new this.ort.Tensor("int64", [16000n]);
41
+ // Validate model inputs/outputs
42
+ const requiredInputs = ["input", "state", "sr"];
43
+ for (const name of requiredInputs) {
44
+ if (!this._session.inputNames.includes(name)) {
45
+ throw new Error(`Model is missing expected input "${name}"`);
46
+ }
47
+ }
48
+ if (!this._session.outputNames.includes("output") ||
49
+ !this._session.outputNames.includes("stateN")) {
50
+ throw new Error("Model is missing expected outputs");
51
+ }
52
+ // Use BigInt for sample rate tensor
53
+ this._sr = new this.ort.Tensor("int64", [BigInt(SAMPLE_RATE)], []);
41
54
  this._state = getNewState(this.ort);
42
55
  }
43
56
  async process(audioFrame) {
44
- const t = new this.ort.Tensor("float32", audioFrame, [
57
+ const inputTensor = new this.ort.Tensor("float32", audioFrame, [
45
58
  1,
46
59
  audioFrame.length
47
60
  ]);
48
- const inputs = {
49
- input: t,
61
+ const feeds = {
62
+ input: inputTensor,
50
63
  state: this._state,
51
64
  sr: this._sr
52
65
  };
53
- const out = await this._session.run(inputs);
66
+ const out = await this._session.run(feeds);
54
67
  this._state = out.stateN;
55
68
  const [isSpeech] = out.output.data;
56
69
  const notSpeech = 1 - isSpeech;
@@ -1,7 +1,3 @@
1
- declare function createVad(params: {
2
- pathToModel: string;
3
- activationThreshold: number;
4
- deactivationThreshold: number;
5
- debounceFrames: number;
6
- }): Promise<(chunk: Uint8Array, callback: (event: "SPEECH_START" | "SPEECH_END") => void) => Promise<void>>;
7
- export { createVad };
1
+ import { VadParams } from "../vad/types";
2
+ declare function createVad(params: VadParams): Promise<(chunk: Uint8Array, callback: (event: "SPEECH_START" | "SPEECH_END") => void) => Promise<void>>;
3
+ export { createVad, VadParams };
@@ -56,44 +56,66 @@ exports.createVad = createVad;
56
56
  const ort = __importStar(require("onnxruntime-node"));
57
57
  const chunkToFloat32Array_1 = require("../vad/chunkToFloat32Array");
58
58
  const SileroVadModel_1 = require("./SileroVadModel");
59
- const BUFFER_SIZE = 512;
59
+ const logger_1 = require("@fonoster/logger");
60
+ const path_1 = require("path");
61
+ const logger = (0, logger_1.getLogger)({ service: "autopilot", filePath: __filename });
62
+ const FULL_FRAME_SIZE = 1024; // 64ms @ 16kHz
63
+ const BUFFER_SIZE = 512; // 32ms @ 16kHz
60
64
  async function createVad(params) {
61
65
  const { pathToModel, activationThreshold, deactivationThreshold, debounceFrames } = params;
62
- const silero = await SileroVadModel_1.SileroVadModel.new(ort, pathToModel);
63
- let audioBuffer = [];
66
+ const effectivePath = pathToModel || (0, path_1.join)(__dirname, "..", "..", "silero_vad_v5.onnx");
67
+ const silero = await SileroVadModel_1.SileroVadModel.new(ort, effectivePath);
68
+ let sampleBuffer = [];
64
69
  let isSpeechActive = false;
65
- let consecutiveSpeechFrames = 0;
66
- let consecutiveNonSpeechFrames = 0;
70
+ let framesSinceStateChange = 0;
71
+ // Reset internal state after a state change.
72
+ const resetState = () => {
73
+ isSpeechActive = false;
74
+ framesSinceStateChange = 0;
75
+ // Clear any pending audio samples to avoid using outdated values.
76
+ sampleBuffer = [];
77
+ silero.resetState();
78
+ logger.silly("State reset -- sampleBuffer cleared");
79
+ };
67
80
  return async function process(chunk, callback) {
81
+ // Convert the incoming chunk to normalized Float32 samples (using chunkToFloat32Array)
68
82
  const float32Array = (0, chunkToFloat32Array_1.chunkToFloat32Array)(chunk);
69
- audioBuffer.push(...float32Array);
70
- const processBuffer = async (buffer) => {
71
- if (buffer.length < BUFFER_SIZE)
72
- return buffer;
73
- const audioFrame = buffer.slice(0, BUFFER_SIZE);
74
- const remainingBuffer = buffer.slice(BUFFER_SIZE);
75
- const result = await silero.process(new Float32Array(audioFrame));
76
- if (result.isSpeech > activationThreshold) {
77
- consecutiveNonSpeechFrames = 0; // Reset non-speech counter
78
- consecutiveSpeechFrames++;
79
- if (consecutiveSpeechFrames >= debounceFrames && !isSpeechActive) {
80
- isSpeechActive = true;
81
- callback("SPEECH_START");
83
+ sampleBuffer.push(...float32Array);
84
+ // Wait until we've collected a full frame worth of samples.
85
+ while (sampleBuffer.length >= FULL_FRAME_SIZE) {
86
+ const fullFrame = sampleBuffer.slice(0, FULL_FRAME_SIZE);
87
+ sampleBuffer = sampleBuffer.slice(FULL_FRAME_SIZE);
88
+ // Use the last BUFFER_SIZE samples from the full frame.
89
+ const frame = fullFrame.slice(fullFrame.length - BUFFER_SIZE);
90
+ const result = await silero.process(new Float32Array(frame));
91
+ const rawScore = result.isSpeech;
92
+ logger.silly("Frame processing", {
93
+ rawScore,
94
+ isSpeechActive,
95
+ framesSinceStateChange,
96
+ pendingSamples: sampleBuffer.length
97
+ });
98
+ framesSinceStateChange++;
99
+ if (isSpeechActive) {
100
+ // If already in speech, check if the score has dropped below deactivationThreshold
101
+ if (rawScore < deactivationThreshold &&
102
+ framesSinceStateChange >= debounceFrames) {
103
+ callback("SPEECH_END");
104
+ resetState();
105
+ logger.silly("Speech end detected", { rawScore });
106
+ continue;
82
107
  }
83
108
  }
84
109
  else {
85
- consecutiveSpeechFrames = 0; // Reset speech counter
86
- consecutiveNonSpeechFrames++;
87
- if (consecutiveNonSpeechFrames >= debounceFrames &&
88
- isSpeechActive &&
89
- result.isSpeech < deactivationThreshold) {
90
- isSpeechActive = false;
91
- callback("SPEECH_END");
92
- silero.resetState(); // Reset VAD state after speech ends
110
+ // If currently not speaking, check if the score is above activationThreshold
111
+ if (rawScore > activationThreshold &&
112
+ framesSinceStateChange >= debounceFrames) {
113
+ isSpeechActive = true;
114
+ framesSinceStateChange = 0;
115
+ callback("SPEECH_START");
116
+ logger.silly("Speech start detected", { rawScore });
93
117
  }
94
118
  }
95
- return processBuffer(remainingBuffer);
96
- };
97
- audioBuffer = await processBuffer(audioBuffer);
119
+ }
98
120
  };
99
121
  }
@@ -2,22 +2,34 @@ type VadEvent = "SPEECH_START" | "SPEECH_END";
2
2
  type Vad = {
3
3
  processChunk: (chunk: Uint8Array, callback: (event: VadEvent) => void) => void;
4
4
  };
5
- type SpeechProbabilities = {
5
+ export interface SpeechProbabilities {
6
6
  notSpeech: number;
7
7
  isSpeech: number;
8
- };
9
- type ONNXRuntimeAPI = {
8
+ }
9
+ export interface ONNXRuntimeAPI {
10
10
  InferenceSession: {
11
- create(modelArrayBuffer: ArrayBuffer, sessionOption: {
11
+ create: (modelPath: ArrayBuffer | string, options?: {
12
12
  interOpNumThreads: number;
13
13
  intraOpNumThreads: number;
14
- }): Promise<unknown>;
15
- };
16
- Tensor: {
17
- new (type: "int64", data: BigInt[]): unknown;
18
- new (type: "int64", data: BigInt[], dims: [1]): unknown;
19
- new (type: "float32", data: Float32Array | number[], dims: [2, 1, 128]): unknown;
20
- new (type: "float32", data: Float32Array, dims: [1, number]): unknown;
14
+ }) => Promise<ONNXSession>;
21
15
  };
22
- };
23
- export { ONNXRuntimeAPI, SpeechProbabilities, Vad, VadEvent };
16
+ Tensor: new (type: string, data: Float32Array | bigint[], dims: number[]) => ONNXTensor;
17
+ }
18
+ export interface ONNXSession {
19
+ run: (feeds: {
20
+ [key: string]: ONNXTensor;
21
+ }) => Promise<{
22
+ output: {
23
+ data: Float32Array;
24
+ };
25
+ stateN: ONNXTensor;
26
+ }>;
27
+ inputNames: string[];
28
+ outputNames: string[];
29
+ }
30
+ export interface ONNXTensor {
31
+ data: Float32Array | bigint[];
32
+ dims: number[];
33
+ type: string;
34
+ }
35
+ export { Vad, VadEvent };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@fonoster/autopilot",
3
- "version": "0.8.47",
3
+ "version": "0.8.50",
4
4
  "description": "Voice AI for the Fonoster platform",
5
5
  "author": "Pedro Sanders <psanders@fonoster.com>",
6
6
  "homepage": "https://github.com/fonoster/fonoster#readme",
@@ -33,11 +33,11 @@
33
33
  },
34
34
  "dependencies": {
35
35
  "@aws-sdk/client-s3": "^3.712.0",
36
- "@fonoster/common": "^0.8.47",
36
+ "@fonoster/common": "^0.8.50",
37
37
  "@fonoster/logger": "^0.8.47",
38
- "@fonoster/sdk": "^0.8.47",
38
+ "@fonoster/sdk": "^0.8.50",
39
39
  "@fonoster/types": "^0.8.47",
40
- "@fonoster/voice": "^0.8.47",
40
+ "@fonoster/voice": "^0.8.50",
41
41
  "@langchain/community": "^0.3.19",
42
42
  "@langchain/core": "^0.3.23",
43
43
  "@langchain/groq": "^0.1.2",
@@ -55,5 +55,5 @@
55
55
  "devDependencies": {
56
56
  "typescript": "^5.5.4"
57
57
  },
58
- "gitHead": "33970ac7794e8e55333936ad3ad84f7260d5c138"
58
+ "gitHead": "d0e373668e8e77295e3847c99a346c4aa1c8d3d7"
59
59
  }
@@ -1,7 +0,0 @@
1
- declare function makeVad(params: {
2
- pathToModel?: string;
3
- activationThreshold: number;
4
- deactivationThreshold: number;
5
- debounceFrames: number;
6
- }): Promise<(chunk: Uint8Array, callback: (event: "SPEECH_START" | "SPEECH_END") => void) => Promise<void>>;
7
- export { makeVad };