@fonoster/autopilot 0.7.3 → 0.7.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/Autopilot.js CHANGED
@@ -41,9 +41,12 @@ class Autopilot {
  }
  createActor() {
  const { voice } = this.config;
- const { firstMessage } = this.config.assistantConfig;
  return (0, xstate_1.createActor)(machine_1.machine, {
- input: { firstMessage, voice, assistant: this.assistant }
+ input: {
+ ...this.config.assistantConfig,
+ assistant: this.assistant,
+ voice
+ }
  });
  }
  subscribeToActorState() {
@@ -65,6 +68,7 @@ class Autopilot {
  const data = payload.data;
  await vad(data, (event) => {
  if (event === "SPEECH_START" || event === "SPEECH_END") {
+ logger.verbose("received speech event", { event });
  this.actor.send({ type: event });
  }
  });
@@ -79,7 +83,11 @@
  source: common_1.StreamGatherSource.SPEECH
  });
  stream.onPayload((payload) => {
- this.actor.send({ type: "HUMAN_PROMPT", speech: payload.speech });
+ const { speech } = payload;
+ logger.verbose("received speech result", { speech });
+ if (speech) {
+ this.actor.send({ type: "SPEECH_RESULT", speech });
+ }
  });
  }
  }
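
Taken together, the Autopilot.js hunks change what the class feeds the state machine: the whole assistantConfig is now spread into the actor input, VAD events are logged, and transcripts arrive as SPEECH_RESULT events (dropping empty payloads) instead of HUMAN_PROMPT. A minimal sketch of the resulting event flow, with hypothetical plumbing around the calls shown above:

```ts
// Hypothetical plumbing (types and helper names are illustrative, not from
// the package) showing the events the Autopilot now sends to its actor.
type AutopilotEvent =
  | { type: "SPEECH_START" }
  | { type: "SPEECH_END" }
  | { type: "SPEECH_RESULT"; speech: string };

interface ActorLike {
  send: (event: AutopilotEvent) => void;
}

// VAD events pass through unchanged, as in the vad() callback above
function forwardVadEvent(actor: ActorLike, event: string) {
  if (event === "SPEECH_START" || event === "SPEECH_END") {
    actor.send({ type: event });
  }
}

// Transcripts are filtered: empty or missing speech never reaches the machine
function forwardTranscript(actor: ActorLike, speech?: string) {
  if (speech) {
    actor.send({ type: "SPEECH_RESULT", speech });
  }
}
```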
@@ -7,6 +7,13 @@ declare const AssistantSchema: z.ZodObject<{
  model: z.ZodNativeEnum<typeof Model>;
  temperature: z.ZodNumber;
  maxTokens: z.ZodNumber;
+ language: z.ZodString;
+ timezone: z.ZodString;
+ goodbyeMessage: z.ZodString;
+ systemErrorMessage: z.ZodString;
+ idleMessage: z.ZodString;
+ idleTimeout: z.ZodNumber;
+ maxIdleTimeoutCount: z.ZodNumber;
  }, "strip", z.ZodTypeAny, {
  systemTemplate: string;
  model: Model;
@@ -14,6 +21,13 @@ declare const AssistantSchema: z.ZodObject<{
  maxTokens: number;
  name: string;
  firstMessage: string;
+ language: string;
+ timezone: string;
+ goodbyeMessage: string;
+ systemErrorMessage: string;
+ idleMessage: string;
+ idleTimeout: number;
+ maxIdleTimeoutCount: number;
  }, {
  systemTemplate: string;
  model: Model;
@@ -21,5 +35,12 @@ declare const AssistantSchema: z.ZodObject<{
  maxTokens: number;
  name: string;
  firstMessage: string;
+ language: string;
+ timezone: string;
+ goodbyeMessage: string;
+ systemErrorMessage: string;
+ idleMessage: string;
+ idleTimeout: number;
+ maxIdleTimeoutCount: number;
  }>;
  export { AssistantSchema };
@@ -27,6 +27,13 @@ const AssistantSchema = zod_1.z.object({
  systemTemplate: zod_1.z.string(),
  model: zod_1.z.nativeEnum(types_1.Model),
  temperature: zod_1.z.number(),
- maxTokens: zod_1.z.number()
+ maxTokens: zod_1.z.number(),
+ language: zod_1.z.string(),
+ timezone: zod_1.z.string(),
+ goodbyeMessage: zod_1.z.string(),
+ systemErrorMessage: zod_1.z.string(),
+ idleMessage: zod_1.z.string(),
+ idleTimeout: zod_1.z.number(),
+ maxIdleTimeoutCount: zod_1.z.number()
  });
  exports.AssistantSchema = AssistantSchema;
@@ -5,11 +5,18 @@ declare enum Model {
  }
  type AssistantFromJson = {
  name: string;
+ language: string;
+ timezone: string;
  firstMessage: string;
  systemTemplate: string;
  model: Model;
  temperature: number;
  maxTokens: number;
+ goodbyeMessage: string;
+ systemErrorMessage: string;
+ idleMessage: string;
+ idleTimeout: number;
+ maxIdleTimeoutCount: number;
  };
  type AssistantConfig = AssistantFromJson & {
  apiKey: string;
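
The schema hunks widen AssistantSchema and AssistantFromJson with the fields the new machine consumes. A hypothetical config that the widened schema would accept (values are illustrative; the `Model.GPT4O` member and import paths are assumptions, since the diff only shows that `model` must be a `Model` value):

```ts
import { AssistantSchema } from "./assistants"; // assumed path
import { Model } from "./types";                // assumed path

const assistantConfig = AssistantSchema.parse({
  name: "front-desk",
  firstMessage: "Hi! How can I help you today?",
  systemTemplate: "You are a helpful receptionist.",
  model: Model.GPT4O, // assumed enum member
  temperature: 0.7,
  maxTokens: 256,
  // fields added in 0.7.4:
  language: "en-US",
  timezone: "America/New_York",
  goodbyeMessage: "Thanks for calling. Goodbye!",
  systemErrorMessage: "Sorry, something went wrong on our end.",
  idleMessage: "Are you still there?",
  idleTimeout: 10000,     // ms; drives the machine's IDLE_TIMEOUT delay
  maxIdleTimeoutCount: 3  // idle warnings allowed before goodbye + hangup
});
```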
@@ -1,48 +1,100 @@
+ import { VoiceResponse } from "@fonoster/voice";
+ import { Assistant } from "../assistants/assistants";
  declare const machine: import("xstate").StateMachine<{
- firstMessage: string;
- voice: import("@fonoster/voice").VoiceResponse;
- assistant: import("../assistants/assistants").Assistant;
+ assistant: Assistant;
+ voice: VoiceResponse;
  playbackRef: string;
+ firstMessage: string;
+ goodbyeMessage: string;
+ systemErrorMessage: string;
+ idleMessage: string;
+ idleTimeout: number;
+ idleTimeoutCount: number;
+ maxIdleTimeoutCount: number;
  speechBuffer: string;
  speechResponseStartTime: number;
  speechResponseTime: number;
+ isSpeaking: boolean;
  }, {
  type: "SPEECH_START";
  } | {
  type: "SPEECH_END";
  } | {
- type: "SESSION_END";
- } | {
- type: "HUMAN_PROMPT";
+ type: "SPEECH_RESULT";
  speech: string;
+ } | {
+ type: "USER_REQUEST_PROCESSED";
  }, {}, never, import("xstate").Values<{
- sendGreeting: {
- type: "sendGreeting";
+ greetUser: {
+ type: "greetUser";
+ params: unknown;
+ };
+ goodbye: {
+ type: "goodbye";
+ params: unknown;
+ };
+ announceSystemError: {
+ type: "announceSystemError";
  params: unknown;
  };
- interruptMachineSpeaking: {
- type: "interruptMachineSpeaking";
+ interruptPlayback: {
+ type: "interruptPlayback";
  params: unknown;
  };
+ processUserRequest: {
+ type: "processUserRequest";
+ params: unknown;
+ };
+ announceIdleTimeout: {
+ type: "announceIdleTimeout";
+ params: unknown;
+ };
+ increaseIdleTimeoutCount: {
+ type: "increaseIdleTimeoutCount";
+ params: import("xstate").NonReducibleUnknown;
+ };
+ cleanSpeech: {
+ type: "cleanSpeech";
+ params: import("xstate").NonReducibleUnknown;
+ };
  appendSpeech: {
  type: "appendSpeech";
+ params: import("xstate").NonReducibleUnknown;
+ };
+ resetIdleTimeoutCount: {
+ type: "resetIdleTimeoutCount";
+ params: import("xstate").NonReducibleUnknown;
+ };
+ setSpeaking: {
+ type: "setSpeaking";
+ params: import("xstate").NonReducibleUnknown;
+ };
+ setSpeakingDone: {
+ type: "setSpeakingDone";
+ params: import("xstate").NonReducibleUnknown;
+ };
+ }>, import("xstate").Values<{
+ idleTimeoutCountExceedsMax: {
+ type: "idleTimeoutCountExceedsMax";
  params: unknown;
  };
- processHumanRequest: {
- type: "processHumanRequest";
+ hasSpeechResult: {
+ type: "hasSpeechResult";
  params: unknown;
  };
- hangup: {
- type: "hangup";
+ isNotSpeaking: {
+ type: "isNotSpeaking";
  params: unknown;
  };
- }>, {
- type: "hasSpeechBuffer";
- params: unknown;
- }, never, "hangup" | "welcome" | "machineListening" | "humanSpeaking", string, {
+ }>, "IDLE_TIMEOUT", "greeting" | "idle" | "waitingForUserRequest" | "hangup" | "hackingTimeout" | "updatingSpeech" | "processingUserRequest", string, {
+ assistant: Assistant;
+ voice: VoiceResponse;
  firstMessage: string;
- voice: import("@fonoster/voice").VoiceResponse;
- assistant: import("../assistants/assistants").Assistant;
+ goodbyeMessage: string;
+ systemErrorMessage: string;
+ idleMessage: string;
+ idleTimeout: number;
+ maxIdleTimeoutCount: number;
  }, import("xstate").NonReducibleUnknown, import("xstate").EventObject, import("xstate").MetaObject, {
  readonly context: ({ input }: {
  spawn: {
@@ -55,108 +107,172 @@ declare const machine: import("xstate").StateMachine<{
  } | undefined): import("xstate").ActorRefFromLogic<TLogic>;
  };
  input: {
+ assistant: Assistant;
+ voice: VoiceResponse;
  firstMessage: string;
- voice: import("@fonoster/voice").VoiceResponse;
- assistant: import("../assistants/assistants").Assistant;
+ goodbyeMessage: string;
+ systemErrorMessage: string;
+ idleMessage: string;
+ idleTimeout: number;
+ maxIdleTimeoutCount: number;
  };
  self: import("xstate").ActorRef<import("xstate").MachineSnapshot<{
- firstMessage: string;
- voice: import("@fonoster/voice").VoiceResponse;
- assistant: import("../assistants/assistants").Assistant;
+ assistant: Assistant;
+ voice: VoiceResponse;
  playbackRef: string;
+ firstMessage: string;
+ goodbyeMessage: string;
+ systemErrorMessage: string;
+ idleMessage: string;
+ idleTimeout: number;
+ idleTimeoutCount: number;
+ maxIdleTimeoutCount: number;
  speechBuffer: string;
  speechResponseStartTime: number;
  speechResponseTime: number;
+ isSpeaking: boolean;
  }, {
  type: "SPEECH_START";
  } | {
  type: "SPEECH_END";
  } | {
- type: "SESSION_END";
- } | {
- type: "HUMAN_PROMPT";
+ type: "SPEECH_RESULT";
  speech: string;
+ } | {
+ type: "USER_REQUEST_PROCESSED";
  }, Record<string, import("xstate").AnyActorRef | undefined>, import("xstate").StateValue, string, unknown, any, any>, {
  type: "SPEECH_START";
  } | {
  type: "SPEECH_END";
  } | {
- type: "SESSION_END";
- } | {
- type: "HUMAN_PROMPT";
+ type: "SPEECH_RESULT";
  speech: string;
+ } | {
+ type: "USER_REQUEST_PROCESSED";
  }, import("xstate").AnyEventObject>;
  }) => {
- firstMessage: string;
- voice: import("@fonoster/voice").VoiceResponse;
+ voice: VoiceResponse;
  assistant: import("@langchain/core/runnables").Runnable<any, string, import("@langchain/core/runnables").RunnableConfig>;
  playbackRef: string;
  speechBuffer: string;
+ firstMessage: string;
+ goodbyeMessage: string;
+ systemErrorMessage: string;
+ idleMessage: string;
+ idleTimeout: number;
+ maxIdleTimeoutCount: number;
+ idleTimeoutCount: number;
  speechResponseStartTime: number;
  speechResponseTime: number;
+ isSpeaking: false;
  };
  readonly id: "fnAI";
- readonly initial: "welcome";
+ readonly initial: "greeting";
  readonly states: {
- readonly welcome: {
- readonly entry: {
- readonly type: "sendGreeting";
- };
+ readonly greeting: {
  readonly always: {
- readonly target: "machineListening";
+ readonly target: "idle";
+ };
+ readonly entry: {
+ readonly type: "greetUser";
  };
- readonly description: "The initial state where the AI greets the Human.";
  };
- readonly machineListening: {
+ readonly idle: {
  readonly on: {
  readonly SPEECH_START: {
- readonly target: "humanSpeaking";
- readonly description: "This must be triggered by a VAD or similar system.";
+ readonly target: "waitingForUserRequest";
+ readonly description: "Event from VAD system.";
  };
- readonly HUMAN_PROMPT: {
+ };
+ readonly after: {
+ readonly IDLE_TIMEOUT: readonly [{
+ readonly target: "hangup";
  readonly actions: {
- readonly type: "appendSpeech";
+ readonly type: "goodbye";
  };
- readonly description: "Appends the speech to the buffer.";
- };
+ readonly guard: {
+ readonly type: "idleTimeoutCountExceedsMax";
+ };
+ }, {
+ readonly target: "hackingTimeout";
+ readonly actions: readonly [{
+ readonly type: "increaseIdleTimeoutCount";
+ }, {
+ readonly type: "announceIdleTimeout";
+ }];
+ }];
  };
- readonly description: "The state where the AI is actively listening in conversation.";
  };
- readonly humanSpeaking: {
- readonly entry: {
- readonly type: "interruptMachineSpeaking";
+ readonly waitingForUserRequest: {
+ readonly always: {
+ readonly target: "updatingSpeech";
+ };
+ readonly entry: readonly [{
+ readonly type: "cleanSpeech";
+ }, {
+ readonly type: "interruptPlayback";
+ }, {
+ readonly type: "resetIdleTimeoutCount";
+ }, {
+ readonly type: "setSpeaking";
+ }];
+ };
+ readonly hangup: {
+ readonly type: "final";
+ };
+ readonly hackingTimeout: {
+ readonly always: {
+ readonly target: "idle";
  };
+ };
+ readonly updatingSpeech: {
  readonly on: {
- readonly HUMAN_PROMPT: {
+ readonly SPEECH_RESULT: readonly [{
+ readonly target: "processingUserRequest";
  readonly actions: {
  readonly type: "appendSpeech";
  };
- readonly description: "Appends the speech to the buffer.";
- };
- readonly SPEECH_END: {
- readonly target: "machineListening";
+ readonly guard: {
+ readonly type: "isNotSpeaking";
+ };
+ readonly description: "Speech result from the Speech to Text provider.";
+ }, {
+ readonly target: "updatingSpeech";
  readonly actions: {
- readonly type: "processHumanRequest";
+ readonly type: "appendSpeech";
+ };
+ }];
+ readonly SPEECH_END: readonly [{
+ readonly target: "processingUserRequest";
+ readonly actions: {
+ readonly type: "setSpeakingDone";
  };
  readonly guard: {
- readonly type: "hasSpeechBuffer";
+ readonly type: "hasSpeechResult";
  };
- readonly description: "This must be triggered by a VAD or similar system.";
- };
+ readonly description: "Event from VAD or similar system.";
+ }, {
+ readonly target: "updatingSpeech";
+ readonly actions: {
+ readonly type: "setSpeakingDone";
+ };
+ }];
  };
- readonly description: "The state where the AI detects Human speech while it is speaking.";
  };
- readonly hangup: {
- readonly type: "final";
- readonly entry: {
- readonly type: "hangup";
- };
+ readonly processingUserRequest: {
  readonly on: {
- readonly SESSION_END: {
- readonly target: "hangup";
+ readonly SPEECH_START: {
+ readonly target: "waitingForUserRequest";
+ readonly description: "Event from VAD or similar system.";
  };
+ readonly USER_REQUEST_PROCESSED: {
+ readonly target: "idle";
+ readonly description: "Go back home.";
+ };
+ };
+ readonly entry: {
+ readonly type: "processUserRequest";
  };
- readonly description: "The final state where the AI terminates the conversation due to inactivity.";
  };
  };
  }>;
@@ -23,119 +23,272 @@ const common_1 = require("@fonoster/common");
  const logger_1 = require("@fonoster/logger");
  const uuid_1 = require("uuid");
  const xstate_1 = require("xstate");
- const types_1 = require("./types");
  const logger = (0, logger_1.getLogger)({ service: "autopilot", filePath: __filename });
  const machine = (0, xstate_1.setup)({
- types: types_1.types,
+ types: {
+ context: {},
+ input: {},
+ events: {}
+ },
  actions: {
- sendGreeting: async function ({ context }) {
+ greetUser: async ({ context }) => {
+ logger.verbose("called greetUser action", {
+ firstMessage: context.firstMessage
+ });
  await context.voice.answer();
  await context.voice.say(context.firstMessage, {
  playbackRef: context.playbackRef
  });
  },
- interruptMachineSpeaking: async function ({ context }) {
- logger.verbose("interrupting the machine", {
+ goodbye: async ({ context }) => {
+ logger.verbose("called goodbye action", {
+ goodbyeMessage: context.goodbyeMessage
+ });
+ await context.voice.say(context.goodbyeMessage, {
  playbackRef: context.playbackRef
  });
- await context.voice.playbackControl(context.playbackRef, common_1.PlaybackControlAction.STOP);
+ await context.voice.hangup();
  },
- appendSpeech: function ({ context, event }) {
- const speech = event.speech;
- context.speechBuffer = (context.speechBuffer || "") + " " + speech;
- context.speechResponseStartTime = Date.now();
- logger.verbose("appended speech", { speechBuffer: context.speechBuffer });
+ announceSystemError: async ({ context }) => {
+ logger.verbose("called announceSystemError action", {
+ systemErrorMessage: context.systemErrorMessage
+ });
+ await context.voice.say(context.systemErrorMessage, {
+ playbackRef: context.playbackRef
+ });
  },
- processHumanRequest: async function ({ context }) {
- const speech = context.speechBuffer.trim();
- logger.verbose("processing human request", { speech });
- const response = await context.assistant.invoke({
- text: speech
+ interruptPlayback: async ({ context }) => {
+ logger.verbose("called interruptPlayback action", {
+ playbackRef: context.playbackRef
+ });
+ await context.voice.playbackControl(context.playbackRef, common_1.PlaybackControlAction.STOP);
+ },
+ processUserRequest: async ({ context }) => {
+ logger.verbose("called processUserRequest action", {
+ speechBuffer: context.speechBuffer
  });
+ const speech = context.speechBuffer.trim();
+ const response = await context.assistant.invoke({ text: speech });
  const speechResponseTime = Date.now() - context.speechResponseStartTime;
  context.speechResponseTime = speechResponseTime;
- logger.verbose("assistant response", {
- response,
- responseTime: speechResponseTime
- });
- await context.voice.say(response, { playbackRef: context.playbackRef });
- // Clear the speech buffer and reset response timing
  context.speechBuffer = "";
  context.speechResponseStartTime = 0;
+ await context.voice.say(response, {
+ playbackRef: context.playbackRef
+ });
+ (0, xstate_1.raise)({ type: "USER_REQUEST_PROCESSED" });
  },
- hangup: async function ({ context }) {
- await context.voice.hangup();
- }
+ announceIdleTimeout: async ({ context }) => {
+ logger.verbose("called announceIdleTimeout action", {
+ idleMessage: context.idleMessage
+ });
+ await context.voice.say(context.idleMessage, {
+ playbackRef: context.playbackRef
+ });
+ },
+ increaseIdleTimeoutCount: (0, xstate_1.assign)(({ context }) => {
+ logger.verbose("called increaseIdleTimeoutCount action", {
+ idleTimeoutCount: context.idleTimeoutCount + 1
+ });
+ context.idleTimeoutCount++;
+ return context;
+ }),
+ cleanSpeech: (0, xstate_1.assign)({ speechBuffer: "" }),
+ appendSpeech: (0, xstate_1.assign)(({ context, event }) => {
+ logger.verbose("called appendSpeech action", {
+ speech: event.speech
+ });
+ const speech = event.speech;
+ context.speechBuffer = (context.speechBuffer || "") + " " + speech;
+ context.speechResponseStartTime = Date.now();
+ return context;
+ }),
+ resetIdleTimeoutCount: (0, xstate_1.assign)(({ context }) => {
+ logger.verbose("called resetIdleTimeoutCount action", {
+ idleTimeoutCount: 0
+ });
+ context.idleTimeoutCount = 0;
+ return context;
+ }),
+ setSpeaking: (0, xstate_1.assign)(({ context }) => {
+ logger.verbose("called setSpeaking action", {
+ isSpeaking: true
+ });
+ context.isSpeaking = true;
+ return context;
+ }),
+ setSpeakingDone: (0, xstate_1.assign)(({ context }) => {
+ logger.verbose("called setSpeakingDone action", {
+ isSpeaking: false
+ });
+ context.isSpeaking = false;
+ return context;
+ })
  },
  guards: {
- hasSpeechBuffer: function ({ context }) {
- return context.speechBuffer?.trim().length > 0;
+ idleTimeoutCountExceedsMax: function ({ context }) {
+ logger.verbose("called idleTimeoutCountExceedsMax guard", {
+ idleTimeoutCount: context.idleTimeoutCount,
+ maxIdleTimeoutCount: context.maxIdleTimeoutCount
+ });
+ return context.idleTimeoutCount >= context.maxIdleTimeoutCount;
+ },
+ hasSpeechResult: function ({ context }) {
+ return context.speechBuffer !== "";
+ },
+ isNotSpeaking: function ({ context }) {
+ logger.verbose("called isNotSpeaking guard", {
+ isSpeaking: context.isSpeaking
+ });
+ return !context.isSpeaking;
+ }
+ },
+ delays: {
+ IDLE_TIMEOUT: ({ context }) => {
+ return context.idleTimeout;
  }
  }
  }).createMachine({
  context: ({ input }) => ({
- firstMessage: input.firstMessage,
  voice: input.voice,
  assistant: input.assistant,
  playbackRef: (0, uuid_1.v4)(),
  speechBuffer: "",
+ firstMessage: input.firstMessage,
+ goodbyeMessage: input.goodbyeMessage,
+ systemErrorMessage: input.systemErrorMessage,
+ idleMessage: input.idleMessage,
+ idleTimeout: input.idleTimeout,
+ maxIdleTimeoutCount: input.maxIdleTimeoutCount,
+ idleTimeoutCount: 0,
  speechResponseStartTime: 0,
- speechResponseTime: 0
+ speechResponseTime: 0,
+ isSpeaking: false
  }),
  id: "fnAI",
- initial: "welcome",
+ initial: "greeting",
  states: {
- welcome: {
- entry: {
- type: "sendGreeting"
- },
+ greeting: {
  always: {
- target: "machineListening"
+ target: "idle"
  },
- description: "The initial state where the AI greets the Human."
+ entry: {
+ type: "greetUser"
+ }
  },
- machineListening: {
+ idle: {
  on: {
  SPEECH_START: {
- target: "humanSpeaking",
- description: "This must be triggered by a VAD or similar system."
- },
- HUMAN_PROMPT: {
- actions: { type: "appendSpeech" },
- description: "Appends the speech to the buffer."
+ target: "waitingForUserRequest",
+ description: "Event from VAD system."
  }
  },
- description: "The state where the AI is actively listening in conversation."
+ after: {
+ IDLE_TIMEOUT: [
+ {
+ target: "hangup",
+ actions: {
+ type: "goodbye"
+ },
+ guard: {
+ type: "idleTimeoutCountExceedsMax"
+ }
+ },
+ {
+ target: "hackingTimeout",
+ actions: [
+ {
+ type: "increaseIdleTimeoutCount"
+ },
+ {
+ type: "announceIdleTimeout"
+ }
+ ]
+ }
+ ]
+ }
  },
- humanSpeaking: {
- entry: {
- type: "interruptMachineSpeaking"
+ waitingForUserRequest: {
+ always: {
+ target: "updatingSpeech"
  },
- on: {
- HUMAN_PROMPT: {
- actions: { type: "appendSpeech" },
- description: "Appends the speech to the buffer."
+ entry: [
+ {
+ type: "cleanSpeech"
+ },
+ {
+ type: "interruptPlayback"
+ },
+ {
+ type: "resetIdleTimeoutCount"
  },
- SPEECH_END: {
- target: "machineListening",
- actions: { type: "processHumanRequest" },
- guard: { type: "hasSpeechBuffer" },
- description: "This must be triggered by a VAD or similar system."
+ {
+ type: "setSpeaking"
  }
- },
- description: "The state where the AI detects Human speech while it is speaking."
+ ]
  },
  hangup: {
- type: "final",
- entry: {
- type: "hangup"
- },
+ type: "final"
+ },
+ hackingTimeout: {
+ always: {
+ target: "idle"
+ }
+ },
+ updatingSpeech: {
+ on: {
+ SPEECH_RESULT: [
+ {
+ target: "processingUserRequest",
+ actions: {
+ type: "appendSpeech"
+ },
+ guard: {
+ type: "isNotSpeaking"
+ },
+ description: "Speech result from the Speech to Text provider."
+ },
+ {
+ target: "updatingSpeech",
+ actions: {
+ type: "appendSpeech"
+ }
+ }
+ ],
+ SPEECH_END: [
+ {
+ target: "processingUserRequest",
+ actions: {
+ type: "setSpeakingDone"
+ },
+ guard: {
+ type: "hasSpeechResult"
+ },
+ description: "Event from VAD or similar system."
+ },
+ {
+ target: "updatingSpeech",
+ actions: {
+ type: "setSpeakingDone"
+ }
+ }
+ ]
+ }
+ },
+ processingUserRequest: {
  on: {
- SESSION_END: {
- target: "hangup"
+ SPEECH_START: {
+ target: "waitingForUserRequest",
+ description: "Event from VAD or similar system."
+ },
+ USER_REQUEST_PROCESSED: {
+ target: "idle",
+ description: "Go back home."
  }
  },
- description: "The final state where the AI terminates the conversation due to inactivity."
+ entry: {
+ type: "processUserRequest"
+ }
  }
  }
  });
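
The rewritten machine replaces the SESSION_END/hangup path with an idle-timeout loop: a context-driven `after` delay, a guard on the warning count, and a pass-through state (`hackingTimeout` above) whose only job is to re-enter `idle` so the timer re-arms. A minimal standalone sketch of that pattern, not the package's machine:

```ts
import { assign, setup } from "xstate";

// Sketch of the idle-timeout pattern: dynamic delay from context, a guard on
// the warning count, and a pass-through state that restarts the idle timer.
const idleDemo = setup({
  types: {} as {
    context: { idleTimeout: number; idleTimeoutCount: number; maxIdleTimeoutCount: number };
  },
  actions: {
    increaseIdleTimeoutCount: assign({
      idleTimeoutCount: ({ context }) => context.idleTimeoutCount + 1
    })
  },
  guards: {
    idleTimeoutCountExceedsMax: ({ context }) =>
      context.idleTimeoutCount >= context.maxIdleTimeoutCount
  },
  delays: {
    IDLE_TIMEOUT: ({ context }) => context.idleTimeout // ms, from assistant config
  }
}).createMachine({
  context: { idleTimeout: 10000, idleTimeoutCount: 0, maxIdleTimeoutCount: 3 },
  initial: "idle",
  states: {
    idle: {
      after: {
        IDLE_TIMEOUT: [
          { target: "hangup", guard: "idleTimeoutCountExceedsMax" },
          { target: "resetTimer", actions: "increaseIdleTimeoutCount" }
        ]
      }
    },
    // leaving and re-entering `idle` restarts its `after` timer
    resetTimer: { always: { target: "idle" } },
    hangup: { type: "final" }
  }
});
```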
@@ -0,0 +1,163 @@
+ declare const machine: import("xstate").StateMachine<{
+ firstMessage: string;
+ voice: import("@fonoster/voice").VoiceResponse;
+ assistant: import("../assistants/assistants").Assistant;
+ playbackRef: string;
+ speechBuffer: string;
+ speechResponseStartTime: number;
+ speechResponseTime: number;
+ }, {
+ type: "SPEECH_START";
+ } | {
+ type: "SPEECH_END";
+ } | {
+ type: "SESSION_END";
+ } | {
+ type: "HUMAN_PROMPT";
+ speech: string;
+ }, {}, never, import("xstate").Values<{
+ appendSpeech: {
+ type: "appendSpeech";
+ params: unknown;
+ };
+ hangup: {
+ type: "hangup";
+ params: unknown;
+ };
+ sendGreeting: {
+ type: "sendGreeting";
+ params: unknown;
+ };
+ interruptMachineSpeaking: {
+ type: "interruptMachineSpeaking";
+ params: unknown;
+ };
+ processHumanRequest: {
+ type: "processHumanRequest";
+ params: unknown;
+ };
+ }>, {
+ type: "hasSpeechBuffer";
+ params: unknown;
+ }, never, "hangup" | "welcome" | "machineListening" | "humanSpeaking", string, {
+ firstMessage: string;
+ voice: import("@fonoster/voice").VoiceResponse;
+ assistant: import("../assistants/assistants").Assistant;
+ }, import("xstate").NonReducibleUnknown, import("xstate").EventObject, import("xstate").MetaObject, {
+ readonly context: ({ input }: {
+ spawn: {
+ <TSrc extends never>(logic: TSrc, ...[options]: never): import("xstate").ActorRefFromLogic<never>;
+ <TLogic extends import("xstate").AnyActorLogic>(src: TLogic, options?: {
+ id?: never;
+ systemId?: string;
+ input?: import("xstate").InputFrom<TLogic> | undefined;
+ syncSnapshot?: boolean;
+ } | undefined): import("xstate").ActorRefFromLogic<TLogic>;
+ };
+ input: {
+ firstMessage: string;
+ voice: import("@fonoster/voice").VoiceResponse;
+ assistant: import("../assistants/assistants").Assistant;
+ };
+ self: import("xstate").ActorRef<import("xstate").MachineSnapshot<{
+ firstMessage: string;
+ voice: import("@fonoster/voice").VoiceResponse;
+ assistant: import("../assistants/assistants").Assistant;
+ playbackRef: string;
+ speechBuffer: string;
+ speechResponseStartTime: number;
+ speechResponseTime: number;
+ }, {
+ type: "SPEECH_START";
+ } | {
+ type: "SPEECH_END";
+ } | {
+ type: "SESSION_END";
+ } | {
+ type: "HUMAN_PROMPT";
+ speech: string;
+ }, Record<string, import("xstate").AnyActorRef | undefined>, import("xstate").StateValue, string, unknown, any, any>, {
+ type: "SPEECH_START";
+ } | {
+ type: "SPEECH_END";
+ } | {
+ type: "SESSION_END";
+ } | {
+ type: "HUMAN_PROMPT";
+ speech: string;
+ }, import("xstate").AnyEventObject>;
+ }) => {
+ firstMessage: string;
+ voice: import("@fonoster/voice").VoiceResponse;
+ assistant: import("@langchain/core/runnables").Runnable<any, string, import("@langchain/core/runnables").RunnableConfig>;
+ playbackRef: string;
+ speechBuffer: string;
+ speechResponseStartTime: number;
+ speechResponseTime: number;
+ };
+ readonly id: "fnAI_v0";
+ readonly initial: "welcome";
+ readonly states: {
+ readonly welcome: {
+ readonly entry: {
+ readonly type: "sendGreeting";
+ };
+ readonly always: {
+ readonly target: "machineListening";
+ };
+ readonly description: "The initial state where the AI greets the Human.";
+ };
+ readonly machineListening: {
+ readonly on: {
+ readonly SPEECH_START: {
+ readonly target: "humanSpeaking";
+ readonly description: "This must be triggered by a VAD or similar system.";
+ };
+ readonly HUMAN_PROMPT: {
+ readonly actions: {
+ readonly type: "appendSpeech";
+ };
+ readonly description: "Appends the speech to the buffer.";
+ };
+ };
+ readonly description: "The state where the AI is actively listening in conversation.";
+ };
+ readonly humanSpeaking: {
+ readonly entry: {
+ readonly type: "interruptMachineSpeaking";
+ };
+ readonly on: {
+ readonly HUMAN_PROMPT: {
+ readonly actions: {
+ readonly type: "appendSpeech";
+ };
+ readonly description: "Appends the speech to the buffer.";
+ };
+ readonly SPEECH_END: {
+ readonly target: "machineListening";
+ readonly actions: {
+ readonly type: "processHumanRequest";
+ };
+ readonly guard: {
+ readonly type: "hasSpeechBuffer";
+ };
+ readonly description: "This must be triggered by a VAD or similar system.";
+ };
+ };
+ readonly description: "The state where the AI detects Human speech while it is speaking.";
+ };
+ readonly hangup: {
+ readonly type: "final";
+ readonly entry: {
+ readonly type: "hangup";
+ };
+ readonly on: {
+ readonly SESSION_END: {
+ readonly target: "hangup";
+ };
+ };
+ readonly description: "The final state where the AI terminates the conversation due to inactivity.";
+ };
+ };
+ }>;
+ export { machine };
@@ -0,0 +1,142 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.machine = void 0;
+ /*
+ * Copyright (C) 2024 by Fonoster Inc (https://fonoster.com)
+ * http://github.com/fonoster/fonoster
+ *
+ * This file is part of Fonoster
+ *
+ * Licensed under the MIT License (the "License");
+ * you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://opensource.org/licenses/MIT
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ const common_1 = require("@fonoster/common");
+ const logger_1 = require("@fonoster/logger");
+ const uuid_1 = require("uuid");
+ const xstate_1 = require("xstate");
+ const types_1 = require("./types");
+ const logger = (0, logger_1.getLogger)({ service: "autopilot", filePath: __filename });
+ const machine = (0, xstate_1.setup)({
+ types: types_1.types,
+ actions: {
+ sendGreeting: async function ({ context }) {
+ await context.voice.answer();
+ await context.voice.say(context.firstMessage, {
+ playbackRef: context.playbackRef
+ });
+ },
+ interruptMachineSpeaking: async function ({ context }) {
+ logger.verbose("interrupting the machine", {
+ playbackRef: context.playbackRef
+ });
+ await context.voice.playbackControl(context.playbackRef, common_1.PlaybackControlAction.STOP);
+ },
+ appendSpeech: function ({ context, event }) {
+ const speech = event.speech;
+ context.speechBuffer = (context.speechBuffer || "") + " " + speech;
+ context.speechResponseStartTime = Date.now();
+ logger.verbose("appended speech", { speechBuffer: context.speechBuffer });
+ },
+ processHumanRequest: async function ({ context }) {
+ const speech = context.speechBuffer.trim();
+ logger.verbose("processing human request", { speech });
+ const response = await context.assistant.invoke({
+ text: speech
+ });
+ const speechResponseTime = Date.now() - context.speechResponseStartTime;
+ context.speechResponseTime = speechResponseTime;
+ logger.verbose("assistant response", {
+ response,
+ responseTime: speechResponseTime
+ });
+ await context.voice.say(response, { playbackRef: context.playbackRef });
+ // Clear the speech buffer and reset response timing
+ context.speechBuffer = "";
+ context.speechResponseStartTime = 0;
+ },
+ hangup: async function ({ context }) {
+ await context.voice.hangup();
+ }
+ },
+ guards: {
+ hasSpeechBuffer: function ({ context }) {
+ return context.speechBuffer?.trim().length > 0;
+ }
+ }
+ }).createMachine({
+ context: ({ input }) => ({
+ firstMessage: input.firstMessage,
+ voice: input.voice,
+ assistant: input.assistant,
+ playbackRef: (0, uuid_1.v4)(),
+ speechBuffer: "",
+ speechResponseStartTime: 0,
+ speechResponseTime: 0
+ }),
+ id: "fnAI_v0",
+ initial: "welcome",
+ states: {
+ welcome: {
+ entry: {
+ type: "sendGreeting"
+ },
+ always: {
+ target: "machineListening"
+ },
+ description: "The initial state where the AI greets the Human."
+ },
+ machineListening: {
+ on: {
+ SPEECH_START: {
+ target: "humanSpeaking",
+ description: "This must be triggered by a VAD or similar system."
+ },
+ HUMAN_PROMPT: {
+ actions: { type: "appendSpeech" },
+ description: "Appends the speech to the buffer."
+ }
+ },
+ description: "The state where the AI is actively listening in conversation."
+ },
+ humanSpeaking: {
+ entry: {
+ type: "interruptMachineSpeaking"
+ },
+ on: {
+ HUMAN_PROMPT: {
+ actions: { type: "appendSpeech" },
+ description: "Appends the speech to the buffer."
+ },
+ SPEECH_END: {
+ target: "machineListening",
+ actions: { type: "processHumanRequest" },
+ guard: { type: "hasSpeechBuffer" },
+ description: "This must be triggered by a VAD or similar system."
+ }
+ },
+ description: "The state where the AI detects Human speech while it is speaking."
+ },
+ hangup: {
+ type: "final",
+ entry: {
+ type: "hangup"
+ },
+ on: {
+ SESSION_END: {
+ target: "hangup"
+ }
+ },
+ description: "The final state where the AI terminates the conversation due to inactivity."
+ }
+ }
+ });
+ exports.machine = machine;
@@ -1,2 +1,2 @@
- declare function chunkToFloat32Array(chunk: Float32Array): Float32Array;
+ declare function chunkToFloat32Array(chunk: Uint8Array): Float32Array;
  export { chunkToFloat32Array };
@@ -1,6 +1,4 @@
  "use strict";
- Object.defineProperty(exports, "__esModule", { value: true });
- exports.chunkToFloat32Array = chunkToFloat32Array;
  /*
  * Copyright (C) 2024 by Fonoster Inc (https://fonoster.com)
  * http://github.com/fonoster/fonoster
@@ -19,7 +17,23 @@ exports.chunkToFloat32Array = chunkToFloat32Array;
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.chunkToFloat32Array = chunkToFloat32Array;
+ // This version of the chunkToFloat32Array accounts for the case where
+ // the byteOffset is misaligned.
+ //
+ // Q. Would it be the same if we just created a new Uint8Array from the chunk?
  function chunkToFloat32Array(chunk) {
- const int16Array = new Int16Array(chunk.buffer, chunk.byteOffset, chunk.byteLength / Int16Array.BYTES_PER_ELEMENT);
+ // Check if byteOffset is not aligned
+ const alignedByteOffset = chunk.byteOffset % Int16Array.BYTES_PER_ELEMENT === 0;
+ let int16Array;
+ if (alignedByteOffset) {
+ int16Array = new Int16Array(chunk.buffer, chunk.byteOffset, chunk.byteLength / Int16Array.BYTES_PER_ELEMENT);
+ }
+ else {
+ // Create a new aligned Uint8Array and then an Int16Array from it
+ const alignedChunk = new Uint8Array(chunk);
+ int16Array = new Int16Array(alignedChunk.buffer, alignedChunk.byteOffset, alignedChunk.byteLength / Int16Array.BYTES_PER_ELEMENT);
+ }
  return new Float32Array(Array.from(int16Array, (sample) => sample / 32768.0));
  }
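
The new chunkToFloat32Array guards against a real TypedArray constraint: an Int16Array view over an existing buffer requires a byteOffset that is a multiple of 2, and audio chunks can arrive at odd offsets. An illustrative snippet (not from the package) showing the failure and why the copy fixes it:

```ts
const backing = new ArrayBuffer(8);
const misaligned = new Uint8Array(backing, 1, 4); // byteOffset = 1

// Viewing the same buffer at an odd offset throws:
// new Int16Array(misaligned.buffer, misaligned.byteOffset, 2); // RangeError

// Copying through `new Uint8Array(chunk)` allocates a fresh buffer whose
// byteOffset is 0 (always aligned), at the cost of one extra copy. This also
// answers the question in the code comment above: a copy always works; the
// aligned fast path just avoids paying for it on every chunk.
const aligned = new Uint8Array(misaligned);
const samples = new Int16Array(aligned.buffer, 0, aligned.byteLength / 2); // OK
```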
@@ -1,2 +1,2 @@
- declare function makeVad(pathToModel?: string): Promise<(chunk: Float32Array, callback: (event: "SPEECH_START" | "SPEECH_END", data: Record<string, unknown>) => void) => Promise<void>>;
+ declare function makeVad(pathToModel?: string): Promise<(chunk: Uint8Array, callback: (event: "SPEECH_START" | "SPEECH_END", data: Record<string, unknown>) => void) => Promise<void>>;
  export { makeVad };
@@ -69,12 +69,10 @@ async function makeVad(pathToModel) {
  return processBuffer(remainingBuffer);
  }
  }
- else {
- if (isSpeechActive) {
- isSpeechActive = false;
- callback("SPEECH_END", {});
- return processBuffer(remainingBuffer);
- }
+ else if (isSpeechActive) {
+ isSpeechActive = false;
+ callback("SPEECH_END", {});
+ return processBuffer(remainingBuffer);
  }
  return processBuffer(remainingBuffer);
  };
@@ -36,7 +36,7 @@ async function main() {
  })
  .stream();
  mic.on("data", async (data) => {
- const chunk = new Float32Array(data.buffer);
+ const chunk = new Uint8Array(data.buffer);
  await vad(chunk, (event, _data) => {
  logger.info("vad event:", { event, data: _data });
  });
@@ -1,4 +1,4 @@
- type Vad = (chunk: Float32Array, cb: (event: string) => void) => Promise<void>;
+ type Vad = (chunk: Uint8Array, cb: (event: string) => void) => Promise<void>;
  type SpeechProbabilities = {
  notSpeech: number;
  isSpeech: number;
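
With these signature changes the VAD consumes raw PCM bytes and converts internally via chunkToFloat32Array. A hedged usage sketch; the import path and audio source are assumptions for illustration:

```ts
import { makeVad } from "@fonoster/autopilot"; // assumed export path

async function run(audioChunks: AsyncIterable<Buffer>) {
  const vad = await makeVad(); // optional pathToModel argument
  for await (const data of audioChunks) {
    // Node Buffers are Uint8Array views; re-wrap to pass exactly this chunk's
    // bytes. The 0.7.4 signature takes raw bytes, not a Float32Array.
    const chunk = new Uint8Array(data.buffer, data.byteOffset, data.byteLength);
    await vad(chunk, (event) => {
      if (event === "SPEECH_START") console.log("caller started speaking");
      if (event === "SPEECH_END") console.log("caller stopped speaking");
    });
  }
}
```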
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@fonoster/autopilot",
- "version": "0.7.3",
+ "version": "0.7.4",
  "description": "Voice AI for the Fonoster platform",
  "author": "Pedro Sanders <psanders@fonoster.com>",
  "homepage": "https://github.com/fonoster/fonoster#readme",
@@ -34,8 +34,8 @@
  "url": "https://github.com/fonoster/fonoster/issues"
  },
  "dependencies": {
- "@fonoster/logger": "^0.7.2",
- "@fonoster/voice": "^0.7.3",
+ "@fonoster/logger": "^0.7.4",
+ "@fonoster/voice": "^0.7.4",
  "@langchain/openai": "^0.2.7",
  "dotenv": "^16.4.5",
  "onnxruntime-node": "^1.19.0",
@@ -46,5 +46,5 @@
  "node-record-lpcm16": "^1.0.1",
  "typescript": "^5.5.4"
  },
- "gitHead": "97aa6649691819fe8a704b96bc62b1b142162393"
+ "gitHead": "e550aa46c1a9087a70157496365b64afd5aea11d"
  }