@fonoster/autopilot 0.7.18 → 0.7.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -88,7 +88,7 @@ The Autopilot supports multiple language model providers. The following is a lis
88
88
 
89
89
  | Provider | Description | Supported models
90
90
  |------------|------------------------------------------------------------|------------------------------------------------------------------------------|
91
- | OpenAI | OpenAI provides various GPT models for conversational AI | `gpt-4o`, `gpt-4o-mini` |
91
+ | OpenAI | OpenAI provides various GPT models for conversational AI | `gpt-4o`, `gpt-4o-mini`, `gpt-3.5-turbo`, `gpt-4-turbo` |
92
92
  | Groq | Groq offers high-performance AI models optimized for speed | `gemm-7b-it`, `llama3-groq-70b-8192-tool-use-preview`, `llama3-1-8b-instant` |
93
93
  | Ollama | Self-hosted Ollama models | `lama3.1` |
94
94
 
@@ -6,6 +6,7 @@ declare const conversationSettingsSchema: z.ZodObject<{
6
6
  goodbyeMessage: z.ZodString;
7
7
  systemErrorMessage: z.ZodString;
8
8
  initialDtmf: z.ZodNullable<z.ZodOptional<z.ZodString>>;
9
+ maxSpeechWaitTimeout: z.ZodNumber;
9
10
  transferOptions: z.ZodNullable<z.ZodOptional<z.ZodObject<{
10
11
  phoneNumber: z.ZodString;
11
12
  message: z.ZodString;
@@ -32,11 +33,34 @@ declare const conversationSettingsSchema: z.ZodObject<{
32
33
  timeout: number;
33
34
  maxTimeoutCount: number;
34
35
  }>>>;
36
+ vad: z.ZodObject<{
37
+ pathToModel: z.ZodOptional<z.ZodString>;
38
+ activationThreshold: z.ZodNumber;
39
+ deactivationThreshold: z.ZodNumber;
40
+ debounceFrames: z.ZodNumber;
41
+ }, "strip", z.ZodTypeAny, {
42
+ activationThreshold: number;
43
+ deactivationThreshold: number;
44
+ debounceFrames: number;
45
+ pathToModel?: string | undefined;
46
+ }, {
47
+ activationThreshold: number;
48
+ deactivationThreshold: number;
49
+ debounceFrames: number;
50
+ pathToModel?: string | undefined;
51
+ }>;
35
52
  }, "strip", z.ZodTypeAny, {
36
53
  firstMessage: string;
37
54
  systemTemplate: string;
38
55
  goodbyeMessage: string;
39
56
  systemErrorMessage: string;
57
+ maxSpeechWaitTimeout: number;
58
+ vad: {
59
+ activationThreshold: number;
60
+ deactivationThreshold: number;
61
+ debounceFrames: number;
62
+ pathToModel?: string | undefined;
63
+ };
40
64
  initialDtmf?: string | null | undefined;
41
65
  transferOptions?: {
42
66
  message: string;
@@ -53,6 +77,13 @@ declare const conversationSettingsSchema: z.ZodObject<{
53
77
  systemTemplate: string;
54
78
  goodbyeMessage: string;
55
79
  systemErrorMessage: string;
80
+ maxSpeechWaitTimeout: number;
81
+ vad: {
82
+ activationThreshold: number;
83
+ deactivationThreshold: number;
84
+ debounceFrames: number;
85
+ pathToModel?: string | undefined;
86
+ };
56
87
  initialDtmf?: string | null | undefined;
57
88
  transferOptions?: {
58
89
  message: string;
@@ -267,6 +298,7 @@ declare const assistantSchema: z.ZodObject<{
267
298
  goodbyeMessage: z.ZodString;
268
299
  systemErrorMessage: z.ZodString;
269
300
  initialDtmf: z.ZodNullable<z.ZodOptional<z.ZodString>>;
301
+ maxSpeechWaitTimeout: z.ZodNumber;
270
302
  transferOptions: z.ZodNullable<z.ZodOptional<z.ZodObject<{
271
303
  phoneNumber: z.ZodString;
272
304
  message: z.ZodString;
@@ -293,11 +325,34 @@ declare const assistantSchema: z.ZodObject<{
293
325
  timeout: number;
294
326
  maxTimeoutCount: number;
295
327
  }>>>;
328
+ vad: z.ZodObject<{
329
+ pathToModel: z.ZodOptional<z.ZodString>;
330
+ activationThreshold: z.ZodNumber;
331
+ deactivationThreshold: z.ZodNumber;
332
+ debounceFrames: z.ZodNumber;
333
+ }, "strip", z.ZodTypeAny, {
334
+ activationThreshold: number;
335
+ deactivationThreshold: number;
336
+ debounceFrames: number;
337
+ pathToModel?: string | undefined;
338
+ }, {
339
+ activationThreshold: number;
340
+ deactivationThreshold: number;
341
+ debounceFrames: number;
342
+ pathToModel?: string | undefined;
343
+ }>;
296
344
  }, "strip", z.ZodTypeAny, {
297
345
  firstMessage: string;
298
346
  systemTemplate: string;
299
347
  goodbyeMessage: string;
300
348
  systemErrorMessage: string;
349
+ maxSpeechWaitTimeout: number;
350
+ vad: {
351
+ activationThreshold: number;
352
+ deactivationThreshold: number;
353
+ debounceFrames: number;
354
+ pathToModel?: string | undefined;
355
+ };
301
356
  initialDtmf?: string | null | undefined;
302
357
  transferOptions?: {
303
358
  message: string;
@@ -314,6 +369,13 @@ declare const assistantSchema: z.ZodObject<{
314
369
  systemTemplate: string;
315
370
  goodbyeMessage: string;
316
371
  systemErrorMessage: string;
372
+ maxSpeechWaitTimeout: number;
373
+ vad: {
374
+ activationThreshold: number;
375
+ deactivationThreshold: number;
376
+ debounceFrames: number;
377
+ pathToModel?: string | undefined;
378
+ };
317
379
  initialDtmf?: string | null | undefined;
318
380
  transferOptions?: {
319
381
  message: string;
@@ -527,6 +589,13 @@ declare const assistantSchema: z.ZodObject<{
527
589
  systemTemplate: string;
528
590
  goodbyeMessage: string;
529
591
  systemErrorMessage: string;
592
+ maxSpeechWaitTimeout: number;
593
+ vad: {
594
+ activationThreshold: number;
595
+ deactivationThreshold: number;
596
+ debounceFrames: number;
597
+ pathToModel?: string | undefined;
598
+ };
530
599
  initialDtmf?: string | null | undefined;
531
600
  transferOptions?: {
532
601
  message: string;
@@ -578,6 +647,13 @@ declare const assistantSchema: z.ZodObject<{
578
647
  systemTemplate: string;
579
648
  goodbyeMessage: string;
580
649
  systemErrorMessage: string;
650
+ maxSpeechWaitTimeout: number;
651
+ vad: {
652
+ activationThreshold: number;
653
+ deactivationThreshold: number;
654
+ debounceFrames: number;
655
+ pathToModel?: string | undefined;
656
+ };
581
657
  initialDtmf?: string | null | undefined;
582
658
  transferOptions?: {
583
659
  message: string;
@@ -28,6 +28,7 @@ const conversationSettingsSchema = zod_1.z.object({
28
28
  goodbyeMessage: zod_1.z.string(),
29
29
  systemErrorMessage: zod_1.z.string(),
30
30
  initialDtmf: zod_1.z.string().optional().nullable(),
31
+ maxSpeechWaitTimeout: zod_1.z.number(),
31
32
  transferOptions: zod_1.z
32
33
  .object({
33
34
  phoneNumber: zod_1.z.string(),
@@ -43,7 +44,13 @@ const conversationSettingsSchema = zod_1.z.object({
43
44
  maxTimeoutCount: zod_1.z.number()
44
45
  })
45
46
  .optional()
46
- .nullable()
47
+ .nullable(),
48
+ vad: zod_1.z.object({
49
+ pathToModel: zod_1.z.string().optional(),
50
+ activationThreshold: zod_1.z.number(),
51
+ deactivationThreshold: zod_1.z.number(),
52
+ debounceFrames: zod_1.z.number()
53
+ })
47
54
  });
48
55
  exports.conversationSettingsSchema = conversationSettingsSchema;
49
56
  const languageModelConfigSchema = zod_1.z.object({
@@ -60,7 +60,7 @@ async function handleVoiceRequest(req, res) {
60
60
  const assistantConfig = (0, loadAssistantConfig_1.loadAssistantConfig)();
61
61
  const knowledgeBase = await (0, loadKnowledgeBase_1.loadKnowledgeBase)();
62
62
  const voice = new _1.VoiceImpl(sessionRef, res);
63
- const vad = new _1.SileroVad();
63
+ const vad = new _1.SileroVad(assistantConfig.conversationSettings.vad);
64
64
  const languageModel = (0, createLanguageModel_1.createLanguageModel)({
65
65
  voice,
66
66
  assistantConfig,
@@ -79,7 +79,7 @@ declare const machine: import("xstate").StateMachine<AutopilotContext, {
79
79
  type: "isNotSpeaking";
80
80
  params: unknown;
81
81
  };
82
- }>, "IDLE_TIMEOUT", "hangup" | "greeting" | "idle" | "waitingForUserRequest" | "transitioningToIdle" | "updatingSpeech" | "processingUserRequest", string, {
82
+ }>, "IDLE_TIMEOUT" | "MAX_SPEECH_WAIT_TIMEOUT", "hangup" | "greeting" | "idle" | "waitingForUserRequest" | "transitioningToIdle" | "updatingSpeech" | "processingUserRequest", string, {
83
83
  conversationSettings: ConversationSettings;
84
84
  languageModel: LanguageModel;
85
85
  voice: Voice;
@@ -157,6 +157,7 @@ declare const machine: import("xstate").StateMachine<AutopilotContext, {
157
157
  idleTimeout: number;
158
158
  maxIdleTimeoutCount: number;
159
159
  idleTimeoutCount: number;
160
+ maxSpeechWaitTimeout: number;
160
161
  speechResponseStartTime: number;
161
162
  speechResponseTime: number;
162
163
  isSpeaking: false;
@@ -178,6 +179,10 @@ declare const machine: import("xstate").StateMachine<AutopilotContext, {
178
179
  readonly target: "waitingForUserRequest";
179
180
  readonly description: "Event from VAD system.";
180
181
  };
182
+ readonly SPEECH_RESULT: {
183
+ readonly target: "waitingForUserRequest";
184
+ readonly description: "Event from Speech to Text provider.";
185
+ };
181
186
  };
182
187
  readonly after: {
183
188
  readonly IDLE_TIMEOUT: readonly [{
@@ -253,6 +258,11 @@ declare const machine: import("xstate").StateMachine<AutopilotContext, {
253
258
  };
254
259
  }];
255
260
  };
261
+ readonly after: {
262
+ readonly MAX_SPEECH_WAIT_TIMEOUT: {
263
+ readonly target: "processingUserRequest";
264
+ };
265
+ };
256
266
  };
257
267
  readonly processingUserRequest: {
258
268
  readonly on: {
@@ -124,6 +124,9 @@ const machine = (0, xstate_1.setup)({
124
124
  delays: {
125
125
  IDLE_TIMEOUT: ({ context }) => {
126
126
  return context.idleTimeout;
127
+ },
128
+ MAX_SPEECH_WAIT_TIMEOUT: ({ context }) => {
129
+ return context.maxSpeechWaitTimeout;
127
130
  }
128
131
  },
129
132
  actors: {
@@ -191,6 +194,7 @@ const machine = (0, xstate_1.setup)({
191
194
  idleTimeout: input.conversationSettings.idleOptions?.timeout || 10000,
192
195
  maxIdleTimeoutCount: input.conversationSettings.idleOptions?.maxTimeoutCount || 3,
193
196
  idleTimeoutCount: 0,
197
+ maxSpeechWaitTimeout: input.conversationSettings.maxSpeechWaitTimeout,
194
198
  speechResponseStartTime: 0,
195
199
  speechResponseTime: 0,
196
200
  isSpeaking: false
@@ -211,6 +215,10 @@ const machine = (0, xstate_1.setup)({
211
215
  SPEECH_START: {
212
216
  target: "waitingForUserRequest",
213
217
  description: "Event from VAD system."
218
+ },
219
+ SPEECH_RESULT: {
220
+ target: "waitingForUserRequest",
221
+ description: "Event from Speech to Text provider."
214
222
  }
215
223
  },
216
224
  after: {
@@ -303,6 +311,11 @@ const machine = (0, xstate_1.setup)({
303
311
  }
304
312
  }
305
313
  ]
314
+ },
315
+ after: {
316
+ MAX_SPEECH_WAIT_TIMEOUT: {
317
+ target: "processingUserRequest"
318
+ }
306
319
  }
307
320
  },
308
321
  processingUserRequest: {
@@ -13,6 +13,7 @@ type AutopilotContext = {
13
13
  idleTimeout: number;
14
14
  idleTimeoutCount: number;
15
15
  maxIdleTimeoutCount: number;
16
+ maxSpeechWaitTimeout: number;
16
17
  speechBuffer: string;
17
18
  speechResponseStartTime: number;
18
19
  speechResponseTime: number;
@@ -1,7 +1,9 @@
1
1
  import { BaseModelParams } from "../types";
2
2
  declare enum OpenAIModel {
3
3
  GPT_4O = "gpt-4o",
4
- GPT_4O_MINI = "gpt-4o-mini"
4
+ GPT_4O_MINI = "gpt-4o-mini",
5
+ GPT_3_5_TURBO = "gpt-3.5-turbo",
6
+ GPT_4_TURBO = "gpt-4-turbo"
5
7
  }
6
8
  type OpenAIParams = BaseModelParams & {
7
9
  model: OpenAIModel;
@@ -5,4 +5,6 @@ var OpenAIModel;
5
5
  (function (OpenAIModel) {
6
6
  OpenAIModel["GPT_4O"] = "gpt-4o";
7
7
  OpenAIModel["GPT_4O_MINI"] = "gpt-4o-mini";
8
+ OpenAIModel["GPT_3_5_TURBO"] = "gpt-3.5-turbo";
9
+ OpenAIModel["GPT_4_TURBO"] = "gpt-4-turbo";
8
10
  })(OpenAIModel || (exports.OpenAIModel = OpenAIModel = {}));
@@ -1,7 +1,17 @@
1
1
  import { Vad } from "./types";
2
2
  declare class SileroVad implements Vad {
3
3
  private vad;
4
- constructor();
4
+ private params;
5
+ constructor(params: {
6
+ pathToModel?: string;
7
+ activationThreshold: number;
8
+ deactivationThreshold: number;
9
+ debounceFrames: number;
10
+ });
11
+ pathToModel?: string;
12
+ activationThreshold: number;
13
+ deactivationThreshold: number;
14
+ debounceFrames: number;
5
15
  private init;
6
16
  processChunk(data: Uint8Array, callback: (event: "SPEECH_START" | "SPEECH_END") => void): void;
7
17
  }
@@ -20,14 +20,18 @@ exports.SileroVad = void 0;
20
20
  * See the License for the specific language governing permissions and
21
21
  * limitations under the License.
22
22
  */
23
+ const logger_1 = require("@fonoster/logger");
23
24
  const makeVad_1 = require("./makeVad");
25
+ const logger = (0, logger_1.getLogger)({ service: "autopilot", filePath: __filename });
24
26
  class SileroVad {
25
- constructor() {
27
+ constructor(params) {
28
+ logger.verbose("starting instance of silero vad", { ...params });
29
+ this.params = params;
26
30
  this.init();
27
31
  }
28
32
  async init() {
29
33
  // FIXME: It feels strange to do this in the constructor
30
- this.vad = await (0, makeVad_1.makeVad)();
34
+ this.vad = await (0, makeVad_1.makeVad)(this.params);
31
35
  }
32
36
  processChunk(data, callback) {
33
37
  if (!this.vad) {
@@ -1,2 +1,7 @@
1
- declare function makeVad(pathToModel?: string): Promise<(chunk: Uint8Array, callback: (event: "SPEECH_START" | "SPEECH_END") => void) => Promise<void>>;
1
+ declare function makeVad(params: {
2
+ pathToModel?: string;
3
+ activationThreshold: number;
4
+ deactivationThreshold: number;
5
+ debounceFrames: number;
6
+ }): Promise<(chunk: Uint8Array, callback: (event: "SPEECH_START" | "SPEECH_END") => void) => Promise<void>>;
2
7
  export { makeVad };
@@ -50,11 +50,14 @@ const chunkToFloat32Array_1 = require("./chunkToFloat32Array");
50
50
  const SileroVadModel_1 = require("./SileroVadModel");
51
51
  const logger = (0, logger_1.getLogger)({ service: "autopilot", filePath: __filename });
52
52
  const BUFFER_SIZE = 16000;
53
- async function makeVad(pathToModel) {
53
+ async function makeVad(params) {
54
+ const { pathToModel, activationThreshold, deactivationThreshold, debounceFrames } = params;
54
55
  const effectivePath = pathToModel || (0, path_1.join)(__dirname, "..", "..", "silero_vad.onnx");
55
56
  const silero = await SileroVadModel_1.SileroVadModel.new(ort, effectivePath);
56
57
  let audioBuffer = [];
57
58
  let isSpeechActive = false;
59
+ let consecutiveSpeechFrames = 0;
60
+ let consecutiveNonSpeechFrames = 0;
58
61
  return async function process(chunk, callback) {
59
62
  const float32Array = (0, chunkToFloat32Array_1.chunkToFloat32Array)(chunk);
60
63
  audioBuffer.push(...float32Array);
@@ -65,20 +68,24 @@ async function makeVad(pathToModel) {
65
68
  const remainingBuffer = buffer.slice(BUFFER_SIZE);
66
69
  const result = await silero.process(new Float32Array(audioFrame));
67
70
  logger.silly("last vad result", { ...result });
68
- if (result.isSpeech > 0.5) {
69
- if (!isSpeechActive) {
71
+ if (result.isSpeech > activationThreshold) {
72
+ consecutiveNonSpeechFrames = 0; // Reset non-speech counter
73
+ consecutiveSpeechFrames++;
74
+ if (consecutiveSpeechFrames >= debounceFrames && !isSpeechActive) {
70
75
  isSpeechActive = true;
71
76
  callback("SPEECH_START");
72
- return processBuffer(remainingBuffer);
73
77
  }
74
78
  }
75
- else if (isSpeechActive) {
76
- isSpeechActive = false;
77
- callback("SPEECH_END");
78
- // WARNING: I'm unsure if this has any effect on the model
79
- // but it seems to work fine to ensure the model works optimally
80
- silero.resetState();
81
- return processBuffer(remainingBuffer);
79
+ else {
80
+ consecutiveSpeechFrames = 0; // Reset speech counter
81
+ consecutiveNonSpeechFrames++;
82
+ if (consecutiveNonSpeechFrames >= debounceFrames &&
83
+ isSpeechActive &&
84
+ result.isSpeech < deactivationThreshold) {
85
+ isSpeechActive = false;
86
+ callback("SPEECH_END");
87
+ silero.resetState(); // Reset VAD state after speech ends
88
+ }
82
89
  }
83
90
  return processBuffer(remainingBuffer);
84
91
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@fonoster/autopilot",
3
- "version": "0.7.18",
3
+ "version": "0.7.20",
4
4
  "description": "Voice AI for the Fonoster platform",
5
5
  "author": "Pedro Sanders <psanders@fonoster.com>",
6
6
  "homepage": "https://github.com/fonoster/fonoster#readme",
@@ -56,5 +56,5 @@
56
56
  "devDependencies": {
57
57
  "typescript": "^5.5.4"
58
58
  },
59
- "gitHead": "4150dcb8086de182d0650df0c6d990ee76658058"
59
+ "gitHead": "bde17656cad787fd22127beaa5d8617e38be2eb6"
60
60
  }