@fonoster/autopilot 0.7.3 → 0.7.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/Autopilot.js CHANGED
@@ -41,9 +41,12 @@ class Autopilot {
  }
  createActor() {
  const { voice } = this.config;
- const { firstMessage } = this.config.assistantConfig;
  return (0, xstate_1.createActor)(machine_1.machine, {
- input: { firstMessage, voice, assistant: this.assistant }
+ input: {
+ ...this.config.assistantConfig,
+ assistant: this.assistant,
+ voice
+ }
  });
  }
  subscribeToActorState() {
@@ -65,6 +68,7 @@ class Autopilot {
  const data = payload.data;
  await vad(data, (event) => {
  if (event === "SPEECH_START" || event === "SPEECH_END") {
+ logger.verbose("received speech event", { event });
  this.actor.send({ type: event });
  }
  });
@@ -79,7 +83,11 @@
  source: common_1.StreamGatherSource.SPEECH
  });
  stream.onPayload((payload) => {
- this.actor.send({ type: "HUMAN_PROMPT", speech: payload.speech });
+ const { speech } = payload;
+ logger.verbose("received speech result", { speech });
+ if (speech) {
+ this.actor.send({ type: "SPEECH_RESULT", speech });
+ }
  });
  }
  }
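
Taken together, the Autopilot.js hunks change what the class feeds the state machine: the whole assistantConfig is now spread into the actor input, VAD events are logged, and transcripts arrive as SPEECH_RESULT events (dropping empty payloads) instead of HUMAN_PROMPT. A minimal sketch of the resulting event flow, with hypothetical plumbing around the calls shown above:

```ts
// Hypothetical plumbing (types and helper names are illustrative, not from
// the package) showing the events the Autopilot now sends to its actor.
type AutopilotEvent =
  | { type: "SPEECH_START" }
  | { type: "SPEECH_END" }
  | { type: "SPEECH_RESULT"; speech: string };

interface ActorLike {
  send: (event: AutopilotEvent) => void;
}

// VAD events pass through unchanged, as in the vad() callback above
function forwardVadEvent(actor: ActorLike, event: string) {
  if (event === "SPEECH_START" || event === "SPEECH_END") {
    actor.send({ type: event });
  }
}

// Transcripts are filtered: empty or missing speech never reaches the machine
function forwardTranscript(actor: ActorLike, speech?: string) {
  if (speech) {
    actor.send({ type: "SPEECH_RESULT", speech });
  }
}
```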
@@ -7,6 +7,13 @@ declare const AssistantSchema: z.ZodObject<{
  model: z.ZodNativeEnum<typeof Model>;
  temperature: z.ZodNumber;
  maxTokens: z.ZodNumber;
+ language: z.ZodString;
+ timezone: z.ZodString;
+ goodbyeMessage: z.ZodString;
+ systemErrorMessage: z.ZodString;
+ idleMessage: z.ZodString;
+ idleTimeout: z.ZodNumber;
+ maxIdleTimeoutCount: z.ZodNumber;
  }, "strip", z.ZodTypeAny, {
  systemTemplate: string;
  model: Model;
@@ -14,6 +21,13 @@ declare const AssistantSchema: z.ZodObject<{
  maxTokens: number;
  name: string;
  firstMessage: string;
+ language: string;
+ timezone: string;
+ goodbyeMessage: string;
+ systemErrorMessage: string;
+ idleMessage: string;
+ idleTimeout: number;
+ maxIdleTimeoutCount: number;
  }, {
  systemTemplate: string;
  model: Model;
@@ -21,5 +35,12 @@ declare const AssistantSchema: z.ZodObject<{
  maxTokens: number;
  name: string;
  firstMessage: string;
+ language: string;
+ timezone: string;
+ goodbyeMessage: string;
+ systemErrorMessage: string;
+ idleMessage: string;
+ idleTimeout: number;
+ maxIdleTimeoutCount: number;
  }>;
  export { AssistantSchema };
@@ -27,6 +27,13 @@ const AssistantSchema = zod_1.z.object({
  systemTemplate: zod_1.z.string(),
  model: zod_1.z.nativeEnum(types_1.Model),
  temperature: zod_1.z.number(),
- maxTokens: zod_1.z.number()
+ maxTokens: zod_1.z.number(),
+ language: zod_1.z.string(),
+ timezone: zod_1.z.string(),
+ goodbyeMessage: zod_1.z.string(),
+ systemErrorMessage: zod_1.z.string(),
+ idleMessage: zod_1.z.string(),
+ idleTimeout: zod_1.z.number(),
+ maxIdleTimeoutCount: zod_1.z.number()
  });
  exports.AssistantSchema = AssistantSchema;
@@ -5,11 +5,18 @@ declare enum Model {
  }
  type AssistantFromJson = {
  name: string;
+ language: string;
+ timezone: string;
  firstMessage: string;
  systemTemplate: string;
  model: Model;
  temperature: number;
  maxTokens: number;
+ goodbyeMessage: string;
+ systemErrorMessage: string;
+ idleMessage: string;
+ idleTimeout: number;
+ maxIdleTimeoutCount: number;
  };
  type AssistantConfig = AssistantFromJson & {
  apiKey: string;
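
The schema hunks widen AssistantSchema and AssistantFromJson with the fields the new machine consumes. A hypothetical config that the widened schema would accept (values are illustrative; the `Model.GPT4O` member and import paths are assumptions, since the diff only shows that `model` must be a `Model` value):

```ts
import { AssistantSchema } from "./assistants"; // assumed path
import { Model } from "./types";                // assumed path

const assistantConfig = AssistantSchema.parse({
  name: "front-desk",
  firstMessage: "Hi! How can I help you today?",
  systemTemplate: "You are a helpful receptionist.",
  model: Model.GPT4O, // assumed enum member
  temperature: 0.7,
  maxTokens: 256,
  // fields added in 0.7.4:
  language: "en-US",
  timezone: "America/New_York",
  goodbyeMessage: "Thanks for calling. Goodbye!",
  systemErrorMessage: "Sorry, something went wrong on our end.",
  idleMessage: "Are you still there?",
  idleTimeout: 10000,     // ms; drives the machine's IDLE_TIMEOUT delay
  maxIdleTimeoutCount: 3  // idle warnings allowed before goodbye + hangup
});
```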
@@ -1,48 +1,100 @@
+ import { VoiceResponse } from "@fonoster/voice";
+ import { Assistant } from "../assistants/assistants";
  declare const machine: import("xstate").StateMachine<{
- firstMessage: string;
- voice: import("@fonoster/voice").VoiceResponse;
- assistant: import("../assistants/assistants").Assistant;
+ assistant: Assistant;
+ voice: VoiceResponse;
  playbackRef: string;
+ firstMessage: string;
+ goodbyeMessage: string;
+ systemErrorMessage: string;
+ idleMessage: string;
+ idleTimeout: number;
+ idleTimeoutCount: number;
+ maxIdleTimeoutCount: number;
  speechBuffer: string;
  speechResponseStartTime: number;
  speechResponseTime: number;
+ isSpeaking: boolean;
  }, {
  type: "SPEECH_START";
  } | {
  type: "SPEECH_END";
  } | {
- type: "SESSION_END";
- } | {
- type: "HUMAN_PROMPT";
+ type: "SPEECH_RESULT";
  speech: string;
+ } | {
+ type: "USER_REQUEST_PROCESSED";
  }, {}, never, import("xstate").Values<{
- sendGreeting: {
- type: "sendGreeting";
+ greetUser: {
+ type: "greetUser";
+ params: unknown;
+ };
+ goodbye: {
+ type: "goodbye";
+ params: unknown;
+ };
+ announceSystemError: {
+ type: "announceSystemError";
  params: unknown;
  };
- interruptMachineSpeaking: {
- type: "interruptMachineSpeaking";
+ interruptPlayback: {
+ type: "interruptPlayback";
  params: unknown;
  };
+ processUserRequest: {
+ type: "processUserRequest";
+ params: unknown;
+ };
+ announceIdleTimeout: {
+ type: "announceIdleTimeout";
+ params: unknown;
+ };
+ increaseIdleTimeoutCount: {
+ type: "increaseIdleTimeoutCount";
+ params: import("xstate").NonReducibleUnknown;
+ };
+ cleanSpeech: {
+ type: "cleanSpeech";
+ params: import("xstate").NonReducibleUnknown;
+ };
  appendSpeech: {
  type: "appendSpeech";
+ params: import("xstate").NonReducibleUnknown;
+ };
+ resetIdleTimeoutCount: {
+ type: "resetIdleTimeoutCount";
+ params: import("xstate").NonReducibleUnknown;
+ };
+ setSpeaking: {
+ type: "setSpeaking";
+ params: import("xstate").NonReducibleUnknown;
+ };
+ setSpeakingDone: {
+ type: "setSpeakingDone";
+ params: import("xstate").NonReducibleUnknown;
+ };
+ }>, import("xstate").Values<{
+ idleTimeoutCountExceedsMax: {
+ type: "idleTimeoutCountExceedsMax";
  params: unknown;
  };
- processHumanRequest: {
- type: "processHumanRequest";
+ hasSpeechResult: {
+ type: "hasSpeechResult";
  params: unknown;
  };
- hangup: {
- type: "hangup";
+ isNotSpeaking: {
+ type: "isNotSpeaking";
  params: unknown;
  };
- }>, {
- type: "hasSpeechBuffer";
- params: unknown;
- }, never, "hangup" | "welcome" | "machineListening" | "humanSpeaking", string, {
+ }>, "IDLE_TIMEOUT", "greeting" | "idle" | "waitingForUserRequest" | "hangup" | "hackingTimeout" | "updatingSpeech" | "processingUserRequest", string, {
+ assistant: Assistant;
+ voice: VoiceResponse;
  firstMessage: string;
- voice: import("@fonoster/voice").VoiceResponse;
- assistant: import("../assistants/assistants").Assistant;
+ goodbyeMessage: string;
+ systemErrorMessage: string;
+ idleMessage: string;
+ idleTimeout: number;
+ maxIdleTimeoutCount: number;
  }, import("xstate").NonReducibleUnknown, import("xstate").EventObject, import("xstate").MetaObject, {
  readonly context: ({ input }: {
  spawn: {
@@ -55,108 +107,172 @@ declare const machine: import("xstate").StateMachine<{
  } | undefined): import("xstate").ActorRefFromLogic<TLogic>;
  };
  input: {
+ assistant: Assistant;
+ voice: VoiceResponse;
  firstMessage: string;
- voice: import("@fonoster/voice").VoiceResponse;
- assistant: import("../assistants/assistants").Assistant;
+ goodbyeMessage: string;
+ systemErrorMessage: string;
+ idleMessage: string;
+ idleTimeout: number;
+ maxIdleTimeoutCount: number;
  };
  self: import("xstate").ActorRef<import("xstate").MachineSnapshot<{
- firstMessage: string;
- voice: import("@fonoster/voice").VoiceResponse;
- assistant: import("../assistants/assistants").Assistant;
+ assistant: Assistant;
+ voice: VoiceResponse;
  playbackRef: string;
+ firstMessage: string;
+ goodbyeMessage: string;
+ systemErrorMessage: string;
+ idleMessage: string;
+ idleTimeout: number;
+ idleTimeoutCount: number;
+ maxIdleTimeoutCount: number;
  speechBuffer: string;
  speechResponseStartTime: number;
  speechResponseTime: number;
+ isSpeaking: boolean;
  }, {
  type: "SPEECH_START";
  } | {
  type: "SPEECH_END";
  } | {
- type: "SESSION_END";
- } | {
- type: "HUMAN_PROMPT";
+ type: "SPEECH_RESULT";
  speech: string;
+ } | {
+ type: "USER_REQUEST_PROCESSED";
  }, Record<string, import("xstate").AnyActorRef | undefined>, import("xstate").StateValue, string, unknown, any, any>, {
  type: "SPEECH_START";
  } | {
  type: "SPEECH_END";
  } | {
- type: "SESSION_END";
- } | {
- type: "HUMAN_PROMPT";
+ type: "SPEECH_RESULT";
  speech: string;
+ } | {
+ type: "USER_REQUEST_PROCESSED";
  }, import("xstate").AnyEventObject>;
  }) => {
- firstMessage: string;
- voice: import("@fonoster/voice").VoiceResponse;
+ voice: VoiceResponse;
  assistant: import("@langchain/core/runnables").Runnable<any, string, import("@langchain/core/runnables").RunnableConfig>;
  playbackRef: string;
  speechBuffer: string;
+ firstMessage: string;
+ goodbyeMessage: string;
+ systemErrorMessage: string;
+ idleMessage: string;
+ idleTimeout: number;
+ maxIdleTimeoutCount: number;
+ idleTimeoutCount: number;
  speechResponseStartTime: number;
  speechResponseTime: number;
+ isSpeaking: false;
  };
  readonly id: "fnAI";
- readonly initial: "welcome";
+ readonly initial: "greeting";
  readonly states: {
- readonly welcome: {
- readonly entry: {
- readonly type: "sendGreeting";
- };
+ readonly greeting: {
  readonly always: {
- readonly target: "machineListening";
+ readonly target: "idle";
+ };
+ readonly entry: {
+ readonly type: "greetUser";
  };
- readonly description: "The initial state where the AI greets the Human.";
  };
- readonly machineListening: {
+ readonly idle: {
  readonly on: {
  readonly SPEECH_START: {
- readonly target: "humanSpeaking";
- readonly description: "This must be triggered by a VAD or similar system.";
+ readonly target: "waitingForUserRequest";
+ readonly description: "Event from VAD system.";
  };
- readonly HUMAN_PROMPT: {
+ };
+ readonly after: {
+ readonly IDLE_TIMEOUT: readonly [{
+ readonly target: "hangup";
  readonly actions: {
- readonly type: "appendSpeech";
+ readonly type: "goodbye";
  };
- readonly description: "Appends the speech to the buffer.";
- };
+ readonly guard: {
+ readonly type: "idleTimeoutCountExceedsMax";
+ };
+ }, {
+ readonly target: "hackingTimeout";
+ readonly actions: readonly [{
+ readonly type: "increaseIdleTimeoutCount";
+ }, {
+ readonly type: "announceIdleTimeout";
+ }];
+ }];
  };
- readonly description: "The state where the AI is actively listening in conversation.";
  };
- readonly humanSpeaking: {
- readonly entry: {
- readonly type: "interruptMachineSpeaking";
+ readonly waitingForUserRequest: {
+ readonly always: {
+ readonly target: "updatingSpeech";
+ };
+ readonly entry: readonly [{
+ readonly type: "cleanSpeech";
+ }, {
+ readonly type: "interruptPlayback";
+ }, {
+ readonly type: "resetIdleTimeoutCount";
+ }, {
+ readonly type: "setSpeaking";
+ }];
+ };
+ readonly hangup: {
+ readonly type: "final";
+ };
+ readonly hackingTimeout: {
+ readonly always: {
+ readonly target: "idle";
  };
+ };
+ readonly updatingSpeech: {
  readonly on: {
- readonly HUMAN_PROMPT: {
+ readonly SPEECH_RESULT: readonly [{
+ readonly target: "processingUserRequest";
  readonly actions: {
  readonly type: "appendSpeech";
  };
- readonly description: "Appends the speech to the buffer.";
- };
- readonly SPEECH_END: {
- readonly target: "machineListening";
+ readonly guard: {
+ readonly type: "isNotSpeaking";
+ };
+ readonly description: "Speech result from the Speech to Text provider.";
+ }, {
+ readonly target: "updatingSpeech";
  readonly actions: {
- readonly type: "processHumanRequest";
+ readonly type: "appendSpeech";
+ };
+ }];
+ readonly SPEECH_END: readonly [{
+ readonly target: "processingUserRequest";
+ readonly actions: {
+ readonly type: "setSpeakingDone";
  };
  readonly guard: {
- readonly type: "hasSpeechBuffer";
+ readonly type: "hasSpeechResult";
  };
- readonly description: "This must be triggered by a VAD or similar system.";
- };
+ readonly description: "Event from VAD or similar system.";
+ }, {
+ readonly target: "updatingSpeech";
+ readonly actions: {
+ readonly type: "setSpeakingDone";
+ };
+ }];
  };
- readonly description: "The state where the AI detects Human speech while it is speaking.";
  };
- readonly hangup: {
- readonly type: "final";
- readonly entry: {
- readonly type: "hangup";
- };
+ readonly processingUserRequest: {
  readonly on: {
- readonly SESSION_END: {
- readonly target: "hangup";
+ readonly SPEECH_START: {
+ readonly target: "waitingForUserRequest";
+ readonly description: "Event from VAD or similar system.";
  };
+ readonly USER_REQUEST_PROCESSED: {
+ readonly target: "idle";
+ readonly description: "Go back home.";
+ };
+ };
+ readonly entry: {
+ readonly type: "processUserRequest";
  };
- readonly description: "The final state where the AI terminates the conversation due to inactivity.";
  };
  };
  }>;
@@ -23,119 +23,272 @@ const common_1 = require("@fonoster/common");
  const logger_1 = require("@fonoster/logger");
  const uuid_1 = require("uuid");
  const xstate_1 = require("xstate");
- const types_1 = require("./types");
  const logger = (0, logger_1.getLogger)({ service: "autopilot", filePath: __filename });
  const machine = (0, xstate_1.setup)({
- types: types_1.types,
+ types: {
+ context: {},
+ input: {},
+ events: {}
+ },
  actions: {
- sendGreeting: async function ({ context }) {
+ greetUser: async ({ context }) => {
+ logger.verbose("called greetUser action", {
+ firstMessage: context.firstMessage
+ });
  await context.voice.answer();
  await context.voice.say(context.firstMessage, {
  playbackRef: context.playbackRef
  });
  },
- interruptMachineSpeaking: async function ({ context }) {
- logger.verbose("interrupting the machine", {
+ goodbye: async ({ context }) => {
+ logger.verbose("called goodbye action", {
+ goodbyeMessage: context.goodbyeMessage
+ });
+ await context.voice.say(context.goodbyeMessage, {
  playbackRef: context.playbackRef
  });
- await context.voice.playbackControl(context.playbackRef, common_1.PlaybackControlAction.STOP);
+ await context.voice.hangup();
  },
- appendSpeech: function ({ context, event }) {
- const speech = event.speech;
- context.speechBuffer = (context.speechBuffer || "") + " " + speech;
- context.speechResponseStartTime = Date.now();
- logger.verbose("appended speech", { speechBuffer: context.speechBuffer });
+ announceSystemError: async ({ context }) => {
+ logger.verbose("called announceSystemError action", {
+ systemErrorMessage: context.systemErrorMessage
+ });
+ await context.voice.say(context.systemErrorMessage, {
+ playbackRef: context.playbackRef
+ });
  },
- processHumanRequest: async function ({ context }) {
- const speech = context.speechBuffer.trim();
- logger.verbose("processing human request", { speech });
- const response = await context.assistant.invoke({
- text: speech
+ interruptPlayback: async ({ context }) => {
+ logger.verbose("called interruptPlayback action", {
+ playbackRef: context.playbackRef
+ });
+ await context.voice.playbackControl(context.playbackRef, common_1.PlaybackControlAction.STOP);
+ },
+ processUserRequest: async ({ context }) => {
+ logger.verbose("called processUserRequest action", {
+ speechBuffer: context.speechBuffer
  });
+ const speech = context.speechBuffer.trim();
+ const response = await context.assistant.invoke({ text: speech });
  const speechResponseTime = Date.now() - context.speechResponseStartTime;
  context.speechResponseTime = speechResponseTime;
- logger.verbose("assistant response", {
- response,
- responseTime: speechResponseTime
- });
- await context.voice.say(response, { playbackRef: context.playbackRef });
- // Clear the speech buffer and reset response timing
  context.speechBuffer = "";
  context.speechResponseStartTime = 0;
+ await context.voice.say(response, {
+ playbackRef: context.playbackRef
+ });
+ (0, xstate_1.raise)({ type: "USER_REQUEST_PROCESSED" });
  },
- hangup: async function ({ context }) {
- await context.voice.hangup();
- }
+ announceIdleTimeout: async ({ context }) => {
+ logger.verbose("called announceIdleTimeout action", {
+ idleMessage: context.idleMessage
+ });
+ await context.voice.say(context.idleMessage, {
+ playbackRef: context.playbackRef
+ });
+ },
+ increaseIdleTimeoutCount: (0, xstate_1.assign)(({ context }) => {
+ logger.verbose("called increaseIdleTimeoutCount action", {
+ idleTimeoutCount: context.idleTimeoutCount + 1
+ });
+ context.idleTimeoutCount++;
+ return context;
+ }),
+ cleanSpeech: (0, xstate_1.assign)({ speechBuffer: "" }),
+ appendSpeech: (0, xstate_1.assign)(({ context, event }) => {
+ logger.verbose("called appendSpeech action", {
+ speech: event.speech
+ });
+ const speech = event.speech;
+ context.speechBuffer = (context.speechBuffer || "") + " " + speech;
+ context.speechResponseStartTime = Date.now();
+ return context;
+ }),
+ resetIdleTimeoutCount: (0, xstate_1.assign)(({ context }) => {
+ logger.verbose("called resetIdleTimeoutCount action", {
+ idleTimeoutCount: 0
+ });
+ context.idleTimeoutCount = 0;
+ return context;
+ }),
+ setSpeaking: (0, xstate_1.assign)(({ context }) => {
+ logger.verbose("called setSpeaking action", {
+ isSpeaking: true
+ });
+ context.isSpeaking = true;
+ return context;
+ }),
+ setSpeakingDone: (0, xstate_1.assign)(({ context }) => {
+ logger.verbose("called setSpeakingDone action", {
+ isSpeaking: false
+ });
+ context.isSpeaking = false;
+ return context;
+ })
  },
  guards: {
- hasSpeechBuffer: function ({ context }) {
- return context.speechBuffer?.trim().length > 0;
+ idleTimeoutCountExceedsMax: function ({ context }) {
+ logger.verbose("called idleTimeoutCountExceedsMax guard", {
+ idleTimeoutCount: context.idleTimeoutCount,
+ maxIdleTimeoutCount: context.maxIdleTimeoutCount
+ });
+ return context.idleTimeoutCount >= context.maxIdleTimeoutCount;
+ },
+ hasSpeechResult: function ({ context }) {
+ return context.speechBuffer !== "";
+ },
+ isNotSpeaking: function ({ context }) {
+ logger.verbose("called isNotSpeaking guard", {
+ isSpeaking: context.isSpeaking
+ });
+ return !context.isSpeaking;
+ }
+ },
+ delays: {
+ IDLE_TIMEOUT: ({ context }) => {
+ return context.idleTimeout;
  }
  }
  }).createMachine({
  context: ({ input }) => ({
- firstMessage: input.firstMessage,
  voice: input.voice,
  assistant: input.assistant,
  playbackRef: (0, uuid_1.v4)(),
  speechBuffer: "",
+ firstMessage: input.firstMessage,
+ goodbyeMessage: input.goodbyeMessage,
+ systemErrorMessage: input.systemErrorMessage,
+ idleMessage: input.idleMessage,
+ idleTimeout: input.idleTimeout,
+ maxIdleTimeoutCount: input.maxIdleTimeoutCount,
+ idleTimeoutCount: 0,
  speechResponseStartTime: 0,
- speechResponseTime: 0
+ speechResponseTime: 0,
+ isSpeaking: false
  }),
  id: "fnAI",
- initial: "welcome",
+ initial: "greeting",
  states: {
- welcome: {
- entry: {
- type: "sendGreeting"
- },
+ greeting: {
  always: {
- target: "machineListening"
+ target: "idle"
  },
- description: "The initial state where the AI greets the Human."
+ entry: {
+ type: "greetUser"
+ }
  },
- machineListening: {
+ idle: {
  on: {
  SPEECH_START: {
- target: "humanSpeaking",
- description: "This must be triggered by a VAD or similar system."
- },
- HUMAN_PROMPT: {
- actions: { type: "appendSpeech" },
- description: "Appends the speech to the buffer."
+ target: "waitingForUserRequest",
+ description: "Event from VAD system."
  }
  },
- description: "The state where the AI is actively listening in conversation."
+ after: {
+ IDLE_TIMEOUT: [
+ {
+ target: "hangup",
+ actions: {
+ type: "goodbye"
+ },
+ guard: {
+ type: "idleTimeoutCountExceedsMax"
+ }
+ },
+ {
+ target: "hackingTimeout",
+ actions: [
+ {
+ type: "increaseIdleTimeoutCount"
+ },
+ {
+ type: "announceIdleTimeout"
+ }
+ ]
+ }
+ ]
+ }
  },
- humanSpeaking: {
- entry: {
- type: "interruptMachineSpeaking"
+ waitingForUserRequest: {
+ always: {
+ target: "updatingSpeech"
  },
- on: {
- HUMAN_PROMPT: {
- actions: { type: "appendSpeech" },
- description: "Appends the speech to the buffer."
+ entry: [
+ {
+ type: "cleanSpeech"
+ },
+ {
+ type: "interruptPlayback"
+ },
+ {
+ type: "resetIdleTimeoutCount"
  },
- SPEECH_END: {
- target: "machineListening",
- actions: { type: "processHumanRequest" },
- guard: { type: "hasSpeechBuffer" },
- description: "This must be triggered by a VAD or similar system."
+ {
+ type: "setSpeaking"
  }
- },
- description: "The state where the AI detects Human speech while it is speaking."
+ ]
  },
  hangup: {
- type: "final",
- entry: {
- type: "hangup"
- },
+ type: "final"
+ },
+ hackingTimeout: {
+ always: {
+ target: "idle"
+ }
+ },
+ updatingSpeech: {
+ on: {
+ SPEECH_RESULT: [
+ {
+ target: "processingUserRequest",
+ actions: {
+ type: "appendSpeech"
+ },
+ guard: {
+ type: "isNotSpeaking"
+ },
+ description: "Speech result from the Speech to Text provider."
+ },
+ {
+ target: "updatingSpeech",
+ actions: {
+ type: "appendSpeech"
+ }
+ }
+ ],
+ SPEECH_END: [
+ {
+ target: "processingUserRequest",
+ actions: {
+ type: "setSpeakingDone"
+ },
+ guard: {
+ type: "hasSpeechResult"
+ },
+ description: "Event from VAD or similar system."
+ },
+ {
+ target: "updatingSpeech",
+ actions: {
+ type: "setSpeakingDone"
+ }
+ }
+ ]
+ }
+ },
+ processingUserRequest: {
  on: {
- SESSION_END: {
- target: "hangup"
+ SPEECH_START: {
+ target: "waitingForUserRequest",
+ description: "Event from VAD or similar system."
+ },
+ USER_REQUEST_PROCESSED: {
+ target: "idle",
+ description: "Go back home."
  }
  },
- description: "The final state where the AI terminates the conversation due to inactivity."
+ entry: {
+ type: "processUserRequest"
+ }
  }
  }
  });
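
The rewritten machine replaces the SESSION_END/hangup path with an idle-timeout loop: a context-driven `after` delay, a guard on the warning count, and a pass-through state (`hackingTimeout` above) whose only job is to re-enter `idle` so the timer re-arms. A minimal standalone sketch of that pattern, not the package's machine:

```ts
import { assign, setup } from "xstate";

// Sketch of the idle-timeout pattern: dynamic delay from context, a guard on
// the warning count, and a pass-through state that restarts the idle timer.
const idleDemo = setup({
  types: {} as {
    context: { idleTimeout: number; idleTimeoutCount: number; maxIdleTimeoutCount: number };
  },
  actions: {
    increaseIdleTimeoutCount: assign({
      idleTimeoutCount: ({ context }) => context.idleTimeoutCount + 1
    })
  },
  guards: {
    idleTimeoutCountExceedsMax: ({ context }) =>
      context.idleTimeoutCount >= context.maxIdleTimeoutCount
  },
  delays: {
    IDLE_TIMEOUT: ({ context }) => context.idleTimeout // ms, from assistant config
  }
}).createMachine({
  context: { idleTimeout: 10000, idleTimeoutCount: 0, maxIdleTimeoutCount: 3 },
  initial: "idle",
  states: {
    idle: {
      after: {
        IDLE_TIMEOUT: [
          { target: "hangup", guard: "idleTimeoutCountExceedsMax" },
          { target: "resetTimer", actions: "increaseIdleTimeoutCount" }
        ]
      }
    },
    // leaving and re-entering `idle` restarts its `after` timer
    resetTimer: { always: { target: "idle" } },
    hangup: { type: "final" }
  }
});
```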
@@ -0,0 +1,163 @@
+ declare const machine: import("xstate").StateMachine<{
+ firstMessage: string;
+ voice: import("@fonoster/voice").VoiceResponse;
+ assistant: import("../assistants/assistants").Assistant;
+ playbackRef: string;
+ speechBuffer: string;
+ speechResponseStartTime: number;
+ speechResponseTime: number;
+ }, {
+ type: "SPEECH_START";
+ } | {
+ type: "SPEECH_END";
+ } | {
+ type: "SESSION_END";
+ } | {
+ type: "HUMAN_PROMPT";
+ speech: string;
+ }, {}, never, import("xstate").Values<{
+ appendSpeech: {
+ type: "appendSpeech";
+ params: unknown;
+ };
+ hangup: {
+ type: "hangup";
+ params: unknown;
+ };
+ sendGreeting: {
+ type: "sendGreeting";
+ params: unknown;
+ };
+ interruptMachineSpeaking: {
+ type: "interruptMachineSpeaking";
+ params: unknown;
+ };
+ processHumanRequest: {
+ type: "processHumanRequest";
+ params: unknown;
+ };
+ }>, {
+ type: "hasSpeechBuffer";
+ params: unknown;
+ }, never, "hangup" | "welcome" | "machineListening" | "humanSpeaking", string, {
+ firstMessage: string;
+ voice: import("@fonoster/voice").VoiceResponse;
+ assistant: import("../assistants/assistants").Assistant;
+ }, import("xstate").NonReducibleUnknown, import("xstate").EventObject, import("xstate").MetaObject, {
+ readonly context: ({ input }: {
+ spawn: {
+ <TSrc extends never>(logic: TSrc, ...[options]: never): import("xstate").ActorRefFromLogic<never>;
+ <TLogic extends import("xstate").AnyActorLogic>(src: TLogic, options?: {
+ id?: never;
+ systemId?: string;
+ input?: import("xstate").InputFrom<TLogic> | undefined;
+ syncSnapshot?: boolean;
+ } | undefined): import("xstate").ActorRefFromLogic<TLogic>;
+ };
+ input: {
+ firstMessage: string;
+ voice: import("@fonoster/voice").VoiceResponse;
+ assistant: import("../assistants/assistants").Assistant;
+ };
+ self: import("xstate").ActorRef<import("xstate").MachineSnapshot<{
+ firstMessage: string;
+ voice: import("@fonoster/voice").VoiceResponse;
+ assistant: import("../assistants/assistants").Assistant;
+ playbackRef: string;
+ speechBuffer: string;
+ speechResponseStartTime: number;
+ speechResponseTime: number;
+ }, {
+ type: "SPEECH_START";
+ } | {
+ type: "SPEECH_END";
+ } | {
+ type: "SESSION_END";
+ } | {
+ type: "HUMAN_PROMPT";
+ speech: string;
+ }, Record<string, import("xstate").AnyActorRef | undefined>, import("xstate").StateValue, string, unknown, any, any>, {
+ type: "SPEECH_START";
+ } | {
+ type: "SPEECH_END";
+ } | {
+ type: "SESSION_END";
+ } | {
+ type: "HUMAN_PROMPT";
+ speech: string;
+ }, import("xstate").AnyEventObject>;
+ }) => {
+ firstMessage: string;
+ voice: import("@fonoster/voice").VoiceResponse;
+ assistant: import("@langchain/core/runnables").Runnable<any, string, import("@langchain/core/runnables").RunnableConfig>;
+ playbackRef: string;
+ speechBuffer: string;
+ speechResponseStartTime: number;
+ speechResponseTime: number;
+ };
+ readonly id: "fnAI_v0";
+ readonly initial: "welcome";
+ readonly states: {
+ readonly welcome: {
+ readonly entry: {
+ readonly type: "sendGreeting";
+ };
+ readonly always: {
+ readonly target: "machineListening";
+ };
+ readonly description: "The initial state where the AI greets the Human.";
+ };
+ readonly machineListening: {
+ readonly on: {
+ readonly SPEECH_START: {
+ readonly target: "humanSpeaking";
+ readonly description: "This must be triggered by a VAD or similar system.";
+ };
+ readonly HUMAN_PROMPT: {
+ readonly actions: {
+ readonly type: "appendSpeech";
+ };
+ readonly description: "Appends the speech to the buffer.";
+ };
+ };
+ readonly description: "The state where the AI is actively listening in conversation.";
+ };
+ readonly humanSpeaking: {
+ readonly entry: {
+ readonly type: "interruptMachineSpeaking";
+ };
+ readonly on: {
+ readonly HUMAN_PROMPT: {
+ readonly actions: {
+ readonly type: "appendSpeech";
+ };
+ readonly description: "Appends the speech to the buffer.";
+ };
+ readonly SPEECH_END: {
+ readonly target: "machineListening";
+ readonly actions: {
+ readonly type: "processHumanRequest";
+ };
+ readonly guard: {
+ readonly type: "hasSpeechBuffer";
+ };
+ readonly description: "This must be triggered by a VAD or similar system.";
+ };
+ };
+ readonly description: "The state where the AI detects Human speech while it is speaking.";
+ };
+ readonly hangup: {
+ readonly type: "final";
+ readonly entry: {
+ readonly type: "hangup";
+ };
+ readonly on: {
+ readonly SESSION_END: {
+ readonly target: "hangup";
+ };
+ };
+ readonly description: "The final state where the AI terminates the conversation due to inactivity.";
+ };
+ };
+ }>;
+ export { machine };
@@ -0,0 +1,142 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.machine = void 0;
+ /*
+ * Copyright (C) 2024 by Fonoster Inc (https://fonoster.com)
+ * http://github.com/fonoster/fonoster
+ *
+ * This file is part of Fonoster
+ *
+ * Licensed under the MIT License (the "License");
+ * you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://opensource.org/licenses/MIT
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ const common_1 = require("@fonoster/common");
+ const logger_1 = require("@fonoster/logger");
+ const uuid_1 = require("uuid");
+ const xstate_1 = require("xstate");
+ const types_1 = require("./types");
+ const logger = (0, logger_1.getLogger)({ service: "autopilot", filePath: __filename });
+ const machine = (0, xstate_1.setup)({
+ types: types_1.types,
+ actions: {
+ sendGreeting: async function ({ context }) {
+ await context.voice.answer();
+ await context.voice.say(context.firstMessage, {
+ playbackRef: context.playbackRef
+ });
+ },
+ interruptMachineSpeaking: async function ({ context }) {
+ logger.verbose("interrupting the machine", {
+ playbackRef: context.playbackRef
+ });
+ await context.voice.playbackControl(context.playbackRef, common_1.PlaybackControlAction.STOP);
+ },
+ appendSpeech: function ({ context, event }) {
+ const speech = event.speech;
+ context.speechBuffer = (context.speechBuffer || "") + " " + speech;
+ context.speechResponseStartTime = Date.now();
+ logger.verbose("appended speech", { speechBuffer: context.speechBuffer });
+ },
+ processHumanRequest: async function ({ context }) {
+ const speech = context.speechBuffer.trim();
+ logger.verbose("processing human request", { speech });
+ const response = await context.assistant.invoke({
+ text: speech
+ });
+ const speechResponseTime = Date.now() - context.speechResponseStartTime;
+ context.speechResponseTime = speechResponseTime;
+ logger.verbose("assistant response", {
+ response,
+ responseTime: speechResponseTime
+ });
+ await context.voice.say(response, { playbackRef: context.playbackRef });
+ // Clear the speech buffer and reset response timing
+ context.speechBuffer = "";
+ context.speechResponseStartTime = 0;
+ },
+ hangup: async function ({ context }) {
+ await context.voice.hangup();
+ }
+ },
+ guards: {
+ hasSpeechBuffer: function ({ context }) {
+ return context.speechBuffer?.trim().length > 0;
+ }
+ }
+ }).createMachine({
+ context: ({ input }) => ({
+ firstMessage: input.firstMessage,
+ voice: input.voice,
+ assistant: input.assistant,
+ playbackRef: (0, uuid_1.v4)(),
+ speechBuffer: "",
+ speechResponseStartTime: 0,
+ speechResponseTime: 0
+ }),
+ id: "fnAI_v0",
+ initial: "welcome",
+ states: {
+ welcome: {
+ entry: {
+ type: "sendGreeting"
+ },
+ always: {
+ target: "machineListening"
+ },
+ description: "The initial state where the AI greets the Human."
+ },
+ machineListening: {
+ on: {
+ SPEECH_START: {
+ target: "humanSpeaking",
+ description: "This must be triggered by a VAD or similar system."
+ },
+ HUMAN_PROMPT: {
+ actions: { type: "appendSpeech" },
+ description: "Appends the speech to the buffer."
+ }
+ },
+ description: "The state where the AI is actively listening in conversation."
+ },
+ humanSpeaking: {
+ entry: {
+ type: "interruptMachineSpeaking"
+ },
+ on: {
+ HUMAN_PROMPT: {
+ actions: { type: "appendSpeech" },
+ description: "Appends the speech to the buffer."
+ },
+ SPEECH_END: {
+ target: "machineListening",
+ actions: { type: "processHumanRequest" },
+ guard: { type: "hasSpeechBuffer" },
+ description: "This must be triggered by a VAD or similar system."
+ }
+ },
+ description: "The state where the AI detects Human speech while it is speaking."
+ },
+ hangup: {
+ type: "final",
+ entry: {
+ type: "hangup"
+ },
+ on: {
+ SESSION_END: {
+ target: "hangup"
+ }
+ },
+ description: "The final state where the AI terminates the conversation due to inactivity."
+ }
+ }
+ });
+ exports.machine = machine;
@@ -1,2 +1,2 @@
- declare function chunkToFloat32Array(chunk: Float32Array): Float32Array;
+ declare function chunkToFloat32Array(chunk: Uint8Array): Float32Array;
  export { chunkToFloat32Array };
@@ -1,6 +1,4 @@
  "use strict";
- Object.defineProperty(exports, "__esModule", { value: true });
- exports.chunkToFloat32Array = chunkToFloat32Array;
  /*
  * Copyright (C) 2024 by Fonoster Inc (https://fonoster.com)
  * http://github.com/fonoster/fonoster
@@ -19,7 +17,23 @@ exports.chunkToFloat32Array = chunkToFloat32Array;
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.chunkToFloat32Array = chunkToFloat32Array;
+ // This version of the chunkToFloat32Array accounts for the case where
+ // the byteOffset is misaligned.
+ //
+ // Q. Would it be the same if we just created a new Uint8Array from the chunk?
  function chunkToFloat32Array(chunk) {
- const int16Array = new Int16Array(chunk.buffer, chunk.byteOffset, chunk.byteLength / Int16Array.BYTES_PER_ELEMENT);
+ // Check if byteOffset is not aligned
+ const alignedByteOffset = chunk.byteOffset % Int16Array.BYTES_PER_ELEMENT === 0;
+ let int16Array;
+ if (alignedByteOffset) {
+ int16Array = new Int16Array(chunk.buffer, chunk.byteOffset, chunk.byteLength / Int16Array.BYTES_PER_ELEMENT);
+ }
+ else {
+ // Create a new aligned Uint8Array and then an Int16Array from it
+ const alignedChunk = new Uint8Array(chunk);
+ int16Array = new Int16Array(alignedChunk.buffer, alignedChunk.byteOffset, alignedChunk.byteLength / Int16Array.BYTES_PER_ELEMENT);
+ }
  return new Float32Array(Array.from(int16Array, (sample) => sample / 32768.0));
  }
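
The new chunkToFloat32Array guards against a real TypedArray constraint: an Int16Array view over an existing buffer requires a byteOffset that is a multiple of 2, and audio chunks can arrive at odd offsets. An illustrative snippet (not from the package) showing the failure and why the copy fixes it:

```ts
const backing = new ArrayBuffer(8);
const misaligned = new Uint8Array(backing, 1, 4); // byteOffset = 1

// Viewing the same buffer at an odd offset throws:
// new Int16Array(misaligned.buffer, misaligned.byteOffset, 2); // RangeError

// Copying through `new Uint8Array(chunk)` allocates a fresh buffer whose
// byteOffset is 0 (always aligned), at the cost of one extra copy. This also
// answers the question in the code comment above: a copy always works; the
// aligned fast path just avoids paying for it on every chunk.
const aligned = new Uint8Array(misaligned);
const samples = new Int16Array(aligned.buffer, 0, aligned.byteLength / 2); // OK
```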
@@ -1,2 +1,2 @@
- declare function makeVad(pathToModel?: string): Promise<(chunk: Float32Array, callback: (event: "SPEECH_START" | "SPEECH_END", data: Record<string, unknown>) => void) => Promise<void>>;
+ declare function makeVad(pathToModel?: string): Promise<(chunk: Uint8Array, callback: (event: "SPEECH_START" | "SPEECH_END", data: Record<string, unknown>) => void) => Promise<void>>;
  export { makeVad };
@@ -69,12 +69,10 @@ async function makeVad(pathToModel) {
  return processBuffer(remainingBuffer);
  }
  }
- else {
- if (isSpeechActive) {
- isSpeechActive = false;
- callback("SPEECH_END", {});
- return processBuffer(remainingBuffer);
- }
+ else if (isSpeechActive) {
+ isSpeechActive = false;
+ callback("SPEECH_END", {});
+ return processBuffer(remainingBuffer);
  }
  return processBuffer(remainingBuffer);
  };
@@ -36,7 +36,7 @@ async function main() {
  })
  .stream();
  mic.on("data", async (data) => {
- const chunk = new Float32Array(data.buffer);
+ const chunk = new Uint8Array(data.buffer);
  await vad(chunk, (event, _data) => {
  logger.info("vad event:", { event, data: _data });
  });
@@ -1,4 +1,4 @@
- type Vad = (chunk: Float32Array, cb: (event: string) => void) => Promise<void>;
+ type Vad = (chunk: Uint8Array, cb: (event: string) => void) => Promise<void>;
  type SpeechProbabilities = {
  notSpeech: number;
  isSpeech: number;
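
With these signature changes the VAD consumes raw PCM bytes and converts internally via chunkToFloat32Array. A hedged usage sketch; the import path and audio source are assumptions for illustration:

```ts
import { makeVad } from "@fonoster/autopilot"; // assumed export path

async function run(audioChunks: AsyncIterable<Buffer>) {
  const vad = await makeVad(); // optional pathToModel argument
  for await (const data of audioChunks) {
    // Node Buffers are Uint8Array views; re-wrap to pass exactly this chunk's
    // bytes. The 0.7.4 signature takes raw bytes, not a Float32Array.
    const chunk = new Uint8Array(data.buffer, data.byteOffset, data.byteLength);
    await vad(chunk, (event) => {
      if (event === "SPEECH_START") console.log("caller started speaking");
      if (event === "SPEECH_END") console.log("caller stopped speaking");
    });
  }
}
```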
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@fonoster/autopilot",
- "version": "0.7.3",
+ "version": "0.7.4",
  "description": "Voice AI for the Fonoster platform",
  "author": "Pedro Sanders <psanders@fonoster.com>",
  "homepage": "https://github.com/fonoster/fonoster#readme",
@@ -34,8 +34,8 @@
  "url": "https://github.com/fonoster/fonoster/issues"
  },
  "dependencies": {
- "@fonoster/logger": "^0.7.2",
- "@fonoster/voice": "^0.7.3",
+ "@fonoster/logger": "^0.7.4",
+ "@fonoster/voice": "^0.7.4",
  "@langchain/openai": "^0.2.7",
  "dotenv": "^16.4.5",
  "onnxruntime-node": "^1.19.0",
@@ -46,5 +46,5 @@
  "node-record-lpcm16": "^1.0.1",
  "typescript": "^5.5.4"
  },
- "gitHead": "97aa6649691819fe8a704b96bc62b1b142162393"
+ "gitHead": "e550aa46c1a9087a70157496365b64afd5aea11d"
  }