@fonoster/autopilot 0.7.3 → 0.7.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/Autopilot.js +11 -3
- package/dist/assistants/AssistantSchema.d.ts +21 -0
- package/dist/assistants/AssistantSchema.js +8 -1
- package/dist/assistants/types.d.ts +7 -0
- package/dist/machine/machine.d.ts +185 -69
- package/dist/machine/machine.js +218 -65
- package/dist/machine/machinev0.d.ts +163 -0
- package/dist/machine/machinev0.js +142 -0
- package/dist/vad/chunkToFloat32Array.d.ts +1 -1
- package/dist/vad/chunkToFloat32Array.js +17 -3
- package/dist/vad/makeVad.d.ts +1 -1
- package/dist/vad/makeVad.js +4 -6
- package/dist/vad/micVadTest.js +1 -1
- package/dist/vad/types.d.ts +1 -1
- package/package.json +4 -4
package/dist/Autopilot.js
CHANGED
@@ -41,9 +41,12 @@ class Autopilot {
     }
     createActor() {
        const { voice } = this.config;
-       const { firstMessage } = this.config.assistantConfig;
        return (0, xstate_1.createActor)(machine_1.machine, {
-           input: { …
+           input: {
+               ...this.config.assistantConfig,
+               assistant: this.assistant,
+               voice
+           }
        });
     }
     subscribeToActorState() {
@@ -65,6 +68,7 @@ class Autopilot {
        const data = payload.data;
        await vad(data, (event) => {
            if (event === "SPEECH_START" || event === "SPEECH_END") {
+               logger.verbose("received speech event", { event });
                this.actor.send({ type: event });
            }
        });
@@ -79,7 +83,11 @@ class Autopilot {
            source: common_1.StreamGatherSource.SPEECH
        });
        stream.onPayload((payload) => {
-           …
+           const { speech } = payload;
+           logger.verbose("received speech result", { speech });
+           if (speech) {
+               this.actor.send({ type: "SPEECH_RESULT", speech });
+           }
        });
     }
 }
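Taken together, the Autopilot changes funnel both VAD markers and transcription payloads into the state machine as typed events. A minimal sketch of that flow, where `actor` stands in for the XState actor created in createActor() above (the event union mirrors the machine typings later in this diff):

    // Sketch only: `actor` is assumed to be shaped like the package's
    // internal XState actor, not imported from it.
    type AutopilotEvent =
      | { type: "SPEECH_START" }
      | { type: "SPEECH_END" }
      | { type: "SPEECH_RESULT"; speech: string };

    declare const actor: { send: (event: AutopilotEvent) => void };

    // VAD start/end markers are forwarded as-is.
    function onVadEvent(event: "SPEECH_START" | "SPEECH_END") {
      actor.send({ type: event });
    }

    // Transcripts are forwarded only when non-empty, as in the diff above.
    function onSpeechPayload(payload: { speech?: string }) {
      const { speech } = payload;
      if (speech) {
        actor.send({ type: "SPEECH_RESULT", speech });
      }
    }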
package/dist/assistants/AssistantSchema.d.ts
CHANGED
@@ -7,6 +7,13 @@ declare const AssistantSchema: z.ZodObject<{
     model: z.ZodNativeEnum<typeof Model>;
     temperature: z.ZodNumber;
     maxTokens: z.ZodNumber;
+    language: z.ZodString;
+    timezone: z.ZodString;
+    goodbyeMessage: z.ZodString;
+    systemErrorMessage: z.ZodString;
+    idleMessage: z.ZodString;
+    idleTimeout: z.ZodNumber;
+    maxIdleTimeoutCount: z.ZodNumber;
 }, "strip", z.ZodTypeAny, {
     systemTemplate: string;
     model: Model;
@@ -14,6 +21,13 @@ declare const AssistantSchema: z.ZodObject<{
     maxTokens: number;
     name: string;
     firstMessage: string;
+    language: string;
+    timezone: string;
+    goodbyeMessage: string;
+    systemErrorMessage: string;
+    idleMessage: string;
+    idleTimeout: number;
+    maxIdleTimeoutCount: number;
 }, {
     systemTemplate: string;
     model: Model;
@@ -21,5 +35,12 @@ declare const AssistantSchema: z.ZodObject<{
     maxTokens: number;
     name: string;
     firstMessage: string;
+    language: string;
+    timezone: string;
+    goodbyeMessage: string;
+    systemErrorMessage: string;
+    idleMessage: string;
+    idleTimeout: number;
+    maxIdleTimeoutCount: number;
 }>;
 export { AssistantSchema };
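Because the declaration above is emitted from a zod object, downstream code can derive the config type instead of maintaining it by hand. A small sketch, assuming only that zod is installed (the schema here is a hand-written mirror of the added fields, not the package's export):

    import { z } from "zod";

    // Hand-written mirror of the fields added above, for illustration only.
    const MirrorSchema = z.object({
      language: z.string(),
      timezone: z.string(),
      goodbyeMessage: z.string(),
      systemErrorMessage: z.string(),
      idleMessage: z.string(),
      idleTimeout: z.number(),
      maxIdleTimeoutCount: z.number()
    });

    // z.infer produces the same object type that appears twice in the
    // generated .d.ts (once for parsed output, once for raw input).
    type MirrorConfig = z.infer<typeof MirrorSchema>;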
package/dist/assistants/AssistantSchema.js
CHANGED
@@ -27,6 +27,13 @@ const AssistantSchema = zod_1.z.object({
     systemTemplate: zod_1.z.string(),
     model: zod_1.z.nativeEnum(types_1.Model),
     temperature: zod_1.z.number(),
-    maxTokens: zod_1.z.number()
+    maxTokens: zod_1.z.number(),
+    language: zod_1.z.string(),
+    timezone: zod_1.z.string(),
+    goodbyeMessage: zod_1.z.string(),
+    systemErrorMessage: zod_1.z.string(),
+    idleMessage: zod_1.z.string(),
+    idleTimeout: zod_1.z.number(),
+    maxIdleTimeoutCount: zod_1.z.number()
 });
 exports.AssistantSchema = AssistantSchema;
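A hedged sketch of validating a config against the extended schema; the deep dist import path and the sample values are illustrative, and the model value must be a member of the package's Model enum:

    import { AssistantSchema } from "@fonoster/autopilot/dist/assistants/AssistantSchema";

    declare const someModel: unknown; // stand-in for a Model enum member

    const result = AssistantSchema.safeParse({
      name: "Front Desk",
      firstMessage: "Hi! How can I help?",
      systemTemplate: "You are a helpful receptionist.",
      model: someModel,
      temperature: 0.4,
      maxTokens: 250,
      language: "en-US",
      timezone: "America/New_York",
      goodbyeMessage: "Goodbye!",
      systemErrorMessage: "Something went wrong. Please try again later.",
      idleMessage: "Are you still there?",
      idleTimeout: 10000,          // used as the machine's IDLE_TIMEOUT delay (ms)
      maxIdleTimeoutCount: 3
    });

    if (!result.success) {
      console.error(result.error.issues);
    }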
package/dist/assistants/types.d.ts
CHANGED
@@ -5,11 +5,18 @@ declare enum Model {
 }
 type AssistantFromJson = {
     name: string;
+    language: string;
+    timezone: string;
     firstMessage: string;
     systemTemplate: string;
     model: Model;
     temperature: number;
     maxTokens: number;
+    goodbyeMessage: string;
+    systemErrorMessage: string;
+    idleMessage: string;
+    idleTimeout: number;
+    maxIdleTimeoutCount: number;
 };
 type AssistantConfig = AssistantFromJson & {
     apiKey: string;
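The intersection type is the seam between file-based settings and runtime secrets: everything in AssistantFromJson can live in a checked-in JSON file, while apiKey arrives from the environment. A sketch, assuming the types are importable and that the environment variable name is an assumption:

    declare const fromJson: AssistantFromJson; // parsed JSON file contents

    const config: AssistantConfig = {
      ...fromJson,
      apiKey: process.env.OPENAI_API_KEY ?? "" // assumed variable name
    };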
package/dist/machine/machine.d.ts
CHANGED
@@ -1,48 +1,100 @@
+import { VoiceResponse } from "@fonoster/voice";
+import { Assistant } from "../assistants/assistants";
 declare const machine: import("xstate").StateMachine<{
-    firstMessage: string;
-    voice: import("@fonoster/voice").VoiceResponse;
-    assistant: import("../assistants/assistants").Assistant;
+    assistant: Assistant;
+    voice: VoiceResponse;
     playbackRef: string;
+    firstMessage: string;
+    goodbyeMessage: string;
+    systemErrorMessage: string;
+    idleMessage: string;
+    idleTimeout: number;
+    idleTimeoutCount: number;
+    maxIdleTimeoutCount: number;
     speechBuffer: string;
     speechResponseStartTime: number;
     speechResponseTime: number;
+    isSpeaking: boolean;
 }, {
     type: "SPEECH_START";
 } | {
     type: "SPEECH_END";
 } | {
-    type: "SESSION_END";
-} | {
-    type: "HUMAN_PROMPT";
+    type: "SPEECH_RESULT";
     speech: string;
+} | {
+    type: "USER_REQUEST_PROCESSED";
 }, {}, never, import("xstate").Values<{
-    sendGreeting: {
-        type: "sendGreeting";
+    greetUser: {
+        type: "greetUser";
+        params: unknown;
+    };
+    goodbye: {
+        type: "goodbye";
+        params: unknown;
+    };
+    announceSystemError: {
+        type: "announceSystemError";
         params: unknown;
     };
-    interruptMachineSpeaking: {
-        type: "interruptMachineSpeaking";
+    interruptPlayback: {
+        type: "interruptPlayback";
        params: unknown;
     };
+    processUserRequest: {
+        type: "processUserRequest";
+        params: unknown;
+    };
+    announceIdleTimeout: {
+        type: "announceIdleTimeout";
+        params: unknown;
+    };
+    increaseIdleTimeoutCount: {
+        type: "increaseIdleTimeoutCount";
+        params: import("xstate").NonReducibleUnknown;
+    };
+    cleanSpeech: {
+        type: "cleanSpeech";
+        params: import("xstate").NonReducibleUnknown;
+    };
     appendSpeech: {
         type: "appendSpeech";
+        params: import("xstate").NonReducibleUnknown;
+    };
+    resetIdleTimeoutCount: {
+        type: "resetIdleTimeoutCount";
+        params: import("xstate").NonReducibleUnknown;
+    };
+    setSpeaking: {
+        type: "setSpeaking";
+        params: import("xstate").NonReducibleUnknown;
+    };
+    setSpeakingDone: {
+        type: "setSpeakingDone";
+        params: import("xstate").NonReducibleUnknown;
+    };
+}>, import("xstate").Values<{
+    idleTimeoutCountExceedsMax: {
+        type: "idleTimeoutCountExceedsMax";
         params: unknown;
     };
-    processHumanRequest: {
-        type: "processHumanRequest";
+    hasSpeechResult: {
+        type: "hasSpeechResult";
         params: unknown;
     };
-    hangup: {
-        type: "hangup";
+    isNotSpeaking: {
+        type: "isNotSpeaking";
        params: unknown;
     };
-}>, {
-    type: "hasSpeechBuffer";
-    params: unknown;
-}, never, "hangup" | "welcome" | "machineListening" | "humanSpeaking", string, {
+}>, "IDLE_TIMEOUT", "greeting" | "idle" | "waitingForUserRequest" | "hangup" | "hackingTimeout" | "updatingSpeech" | "processingUserRequest", string, {
+    assistant: Assistant;
+    voice: VoiceResponse;
     firstMessage: string;
-    voice: import("@fonoster/voice").VoiceResponse;
-    assistant: import("../assistants/assistants").Assistant;
+    goodbyeMessage: string;
+    systemErrorMessage: string;
+    idleMessage: string;
+    idleTimeout: number;
+    maxIdleTimeoutCount: number;
 }, import("xstate").NonReducibleUnknown, import("xstate").EventObject, import("xstate").MetaObject, {
     readonly context: ({ input }: {
         spawn: {
@@ -55,108 +107,172 @@ declare const machine: import("xstate").StateMachine<{
         } | undefined): import("xstate").ActorRefFromLogic<TLogic>;
         };
         input: {
+            assistant: Assistant;
+            voice: VoiceResponse;
             firstMessage: string;
-            voice: import("@fonoster/voice").VoiceResponse;
-            assistant: import("../assistants/assistants").Assistant;
+            goodbyeMessage: string;
+            systemErrorMessage: string;
+            idleMessage: string;
+            idleTimeout: number;
+            maxIdleTimeoutCount: number;
         };
         self: import("xstate").ActorRef<import("xstate").MachineSnapshot<{
-            firstMessage: string;
-            voice: import("@fonoster/voice").VoiceResponse;
-            assistant: import("../assistants/assistants").Assistant;
+            assistant: Assistant;
+            voice: VoiceResponse;
             playbackRef: string;
+            firstMessage: string;
+            goodbyeMessage: string;
+            systemErrorMessage: string;
+            idleMessage: string;
+            idleTimeout: number;
+            idleTimeoutCount: number;
+            maxIdleTimeoutCount: number;
             speechBuffer: string;
             speechResponseStartTime: number;
             speechResponseTime: number;
+            isSpeaking: boolean;
         }, {
             type: "SPEECH_START";
         } | {
             type: "SPEECH_END";
         } | {
-            type: "SESSION_END";
-        } | {
-            type: "HUMAN_PROMPT";
+            type: "SPEECH_RESULT";
             speech: string;
+        } | {
+            type: "USER_REQUEST_PROCESSED";
         }, Record<string, import("xstate").AnyActorRef | undefined>, import("xstate").StateValue, string, unknown, any, any>, {
             type: "SPEECH_START";
         } | {
             type: "SPEECH_END";
         } | {
-            type: "SESSION_END";
-        } | {
-            type: "HUMAN_PROMPT";
+            type: "SPEECH_RESULT";
             speech: string;
+        } | {
+            type: "USER_REQUEST_PROCESSED";
         }, import("xstate").AnyEventObject>;
     }) => {
-        firstMessage: string;
-        voice: import("@fonoster/voice").VoiceResponse;
+        voice: VoiceResponse;
         assistant: import("@langchain/core/runnables").Runnable<any, string, import("@langchain/core/runnables").RunnableConfig>;
         playbackRef: string;
         speechBuffer: string;
+        firstMessage: string;
+        goodbyeMessage: string;
+        systemErrorMessage: string;
+        idleMessage: string;
+        idleTimeout: number;
+        maxIdleTimeoutCount: number;
+        idleTimeoutCount: number;
        speechResponseStartTime: number;
        speechResponseTime: number;
+        isSpeaking: false;
     };
     readonly id: "fnAI";
-    readonly initial: "welcome";
+    readonly initial: "greeting";
     readonly states: {
-        readonly welcome: {
-            readonly entry: {
-                readonly type: "sendGreeting";
-            };
+        readonly greeting: {
            readonly always: {
-                readonly target: "machineListening";
+                readonly target: "idle";
+            };
+            readonly entry: {
+                readonly type: "greetUser";
            };
-            readonly description: "The initial state where the AI greets the Human.";
        };
-        readonly machineListening: {
+        readonly idle: {
            readonly on: {
                readonly SPEECH_START: {
-                    readonly target: "humanSpeaking";
-                    readonly description: "This must be triggered by a VAD or similar system.";
+                    readonly target: "waitingForUserRequest";
+                    readonly description: "Event from VAD system.";
                };
-                readonly HUMAN_PROMPT: {
+            };
+            readonly after: {
+                readonly IDLE_TIMEOUT: readonly [{
+                    readonly target: "hangup";
                    readonly actions: {
-                        readonly type: "appendSpeech";
+                        readonly type: "goodbye";
                    };
-                    readonly description: "Appends the speech to the buffer.";
-                };
+                    readonly guard: {
+                        readonly type: "idleTimeoutCountExceedsMax";
+                    };
+                }, {
+                    readonly target: "hackingTimeout";
+                    readonly actions: readonly [{
+                        readonly type: "increaseIdleTimeoutCount";
+                    }, {
+                        readonly type: "announceIdleTimeout";
+                    }];
+                }];
            };
-            readonly description: "The state where the AI is actively listening in conversation.";
        };
-        readonly humanSpeaking: {
-            readonly entry: {
-                readonly type: "interruptMachineSpeaking";
+        readonly waitingForUserRequest: {
+            readonly always: {
+                readonly target: "updatingSpeech";
+            };
+            readonly entry: readonly [{
+                readonly type: "cleanSpeech";
+            }, {
+                readonly type: "interruptPlayback";
+            }, {
+                readonly type: "resetIdleTimeoutCount";
+            }, {
+                readonly type: "setSpeaking";
+            }];
+        };
+        readonly hangup: {
+            readonly type: "final";
+        };
+        readonly hackingTimeout: {
+            readonly always: {
+                readonly target: "idle";
            };
+        };
+        readonly updatingSpeech: {
            readonly on: {
-                readonly HUMAN_PROMPT: {
+                readonly SPEECH_RESULT: readonly [{
+                    readonly target: "processingUserRequest";
                    readonly actions: {
                        readonly type: "appendSpeech";
                    };
-                    readonly description: "Appends the speech to the buffer.";
-                };
-                readonly SPEECH_END: {
-                    readonly target: "machineListening";
+                    readonly guard: {
+                        readonly type: "isNotSpeaking";
+                    };
+                    readonly description: "Speech result from the Speech to Text provider.";
+                }, {
+                    readonly target: "updatingSpeech";
                    readonly actions: {
-                        readonly type: "processHumanRequest";
+                        readonly type: "appendSpeech";
+                    };
+                }];
+                readonly SPEECH_END: readonly [{
+                    readonly target: "processingUserRequest";
+                    readonly actions: {
+                        readonly type: "setSpeakingDone";
                    };
                    readonly guard: {
-                        readonly type: "hasSpeechBuffer";
+                        readonly type: "hasSpeechResult";
                    };
-                    readonly description: "This must be triggered by a VAD or similar system.";
-                };
+                    readonly description: "Event from VAD or similar system.";
+                }, {
+                    readonly target: "updatingSpeech";
+                    readonly actions: {
+                        readonly type: "setSpeakingDone";
+                    };
+                }];
            };
-            readonly description: "The state where the AI detects Human speech while it is speaking.";
        };
-        readonly hangup: {
-            readonly type: "final";
-            readonly entry: {
-                readonly type: "hangup";
-            };
+        readonly processingUserRequest: {
            readonly on: {
-                readonly SESSION_END: {
-                    readonly target: "hangup";
+                readonly SPEECH_START: {
+                    readonly target: "waitingForUserRequest";
+                    readonly description: "Event from VAD or similar system.";
                };
+                readonly USER_REQUEST_PROCESSED: {
+                    readonly target: "idle";
+                    readonly description: "Go back home.";
+                };
+            };
+            readonly entry: {
+                readonly type: "processUserRequest";
            };
-            readonly description: "The final state where the AI terminates the conversation due to inactivity.";
        };
    };
 }>;
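Per the typings above, the actor input now carries the whole conversational configuration rather than just firstMessage. A hedged sketch of constructing the actor against this shape; the deep dist import path and all values are illustrative:

    import { createActor } from "xstate";
    import { machine } from "@fonoster/autopilot/dist/machine/machine"; // illustrative path

    declare const assistant: any; // Assistant from ../assistants/assistants
    declare const voice: any;     // VoiceResponse from @fonoster/voice

    const actor = createActor(machine, {
      input: {
        assistant,
        voice,
        firstMessage: "Hi! How can I help?",
        goodbyeMessage: "Goodbye!",
        systemErrorMessage: "Something went wrong.",
        idleMessage: "Are you still there?",
        idleTimeout: 10000,
        maxIdleTimeoutCount: 3
      }
    });

    actor.start();
    actor.send({ type: "SPEECH_START" });                              // from the VAD
    actor.send({ type: "SPEECH_RESULT", speech: "I need to book a room" });
    actor.send({ type: "SPEECH_END" });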
package/dist/machine/machine.js
CHANGED
@@ -23,119 +23,272 @@ const common_1 = require("@fonoster/common");
 const logger_1 = require("@fonoster/logger");
 const uuid_1 = require("uuid");
 const xstate_1 = require("xstate");
-const types_1 = require("./types");
 const logger = (0, logger_1.getLogger)({ service: "autopilot", filePath: __filename });
 const machine = (0, xstate_1.setup)({
-    types: types_1.types,
+    types: {
+        context: {},
+        input: {},
+        events: {}
+    },
     actions: {
-        sendGreeting: async function ({ context }) {
+        greetUser: async ({ context }) => {
+            logger.verbose("called greetUser action", {
+                firstMessage: context.firstMessage
+            });
            await context.voice.answer();
            await context.voice.say(context.firstMessage, {
                playbackRef: context.playbackRef
            });
        },
-        interruptMachineSpeaking: async function ({ context }) {
-            logger.verbose("interrupting the machine", {
+        goodbye: async ({ context }) => {
+            logger.verbose("called goodbye action", {
+                goodbyeMessage: context.goodbyeMessage
+            });
+            await context.voice.say(context.goodbyeMessage, {
                playbackRef: context.playbackRef
            });
-            await context.voice.playbackControl(context.playbackRef, common_1.PlaybackControlAction.STOP);
+            await context.voice.hangup();
        },
-        appendSpeech: function ({ context, event }) {
-            const speech = event.speech;
-            context.speechBuffer = (context.speechBuffer || "") + " " + speech;
-            context.speechResponseStartTime = Date.now();
-            logger.verbose("appended speech", { speechBuffer: context.speechBuffer });
+        announceSystemError: async ({ context }) => {
+            logger.verbose("called announceSystemError action", {
+                systemErrorMessage: context.systemErrorMessage
+            });
+            await context.voice.say(context.systemErrorMessage, {
+                playbackRef: context.playbackRef
+            });
        },
-        processHumanRequest: async function ({ context }) {
-            const speech = context.speechBuffer.trim();
-            logger.verbose("processing human request", { speech });
-            const response = await context.assistant.invoke({
-                text: speech
+        interruptPlayback: async ({ context }) => {
+            logger.verbose("called interruptPlayback action", {
+                playbackRef: context.playbackRef
+            });
+            await context.voice.playbackControl(context.playbackRef, common_1.PlaybackControlAction.STOP);
+        },
+        processUserRequest: async ({ context }) => {
+            logger.verbose("called processUserRequest action", {
+                speechBuffer: context.speechBuffer
            });
+            const speech = context.speechBuffer.trim();
+            const response = await context.assistant.invoke({ text: speech });
            const speechResponseTime = Date.now() - context.speechResponseStartTime;
            context.speechResponseTime = speechResponseTime;
-            logger.verbose("assistant response", {
-                response,
-                responseTime: speechResponseTime
-            });
-            await context.voice.say(response, { playbackRef: context.playbackRef });
-            // Clear the speech buffer and reset response timing
            context.speechBuffer = "";
            context.speechResponseStartTime = 0;
+            await context.voice.say(response, {
+                playbackRef: context.playbackRef
+            });
+            (0, xstate_1.raise)({ type: "USER_REQUEST_PROCESSED" });
        },
-        hangup: async function ({ context }) {
-            await context.voice.hangup();
-        }
+        announceIdleTimeout: async ({ context }) => {
+            logger.verbose("called announceIdleTimeout action", {
+                idleMessage: context.idleMessage
+            });
+            await context.voice.say(context.idleMessage, {
+                playbackRef: context.playbackRef
+            });
+        },
+        increaseIdleTimeoutCount: (0, xstate_1.assign)(({ context }) => {
+            logger.verbose("called increaseIdleTimeoutCount action", {
+                idleTimeoutCount: context.idleTimeoutCount + 1
+            });
+            context.idleTimeoutCount++;
+            return context;
+        }),
+        cleanSpeech: (0, xstate_1.assign)({ speechBuffer: "" }),
+        appendSpeech: (0, xstate_1.assign)(({ context, event }) => {
+            logger.verbose("called appendSpeech action", {
+                speech: event.speech
+            });
+            const speech = event.speech;
+            context.speechBuffer = (context.speechBuffer || "") + " " + speech;
+            context.speechResponseStartTime = Date.now();
+            return context;
+        }),
+        resetIdleTimeoutCount: (0, xstate_1.assign)(({ context }) => {
+            logger.verbose("called resetIdleTimeoutCount action", {
+                idleTimeoutCount: 0
+            });
+            context.idleTimeoutCount = 0;
+            return context;
+        }),
+        setSpeaking: (0, xstate_1.assign)(({ context }) => {
+            logger.verbose("called setSpeaking action", {
+                isSpeaking: true
+            });
+            context.isSpeaking = true;
+            return context;
+        }),
+        setSpeakingDone: (0, xstate_1.assign)(({ context }) => {
+            logger.verbose("called setSpeakingDone action", {
+                isSpeaking: false
+            });
+            context.isSpeaking = false;
+            return context;
+        })
    },
    guards: {
-        hasSpeechBuffer: function ({ context }) {
-            return context.speechBuffer?.trim().length > 0;
+        idleTimeoutCountExceedsMax: function ({ context }) {
+            logger.verbose("called idleTimeoutCountExceedsMax guard", {
+                idleTimeoutCount: context.idleTimeoutCount,
+                maxIdleTimeoutCount: context.maxIdleTimeoutCount
+            });
+            return context.idleTimeoutCount >= context.maxIdleTimeoutCount;
+        },
+        hasSpeechResult: function ({ context }) {
+            return context.speechBuffer !== "";
+        },
+        isNotSpeaking: function ({ context }) {
+            logger.verbose("called isNotSpeaking guard", {
+                isSpeaking: context.isSpeaking
+            });
+            return !context.isSpeaking;
+        }
+    },
+    delays: {
+        IDLE_TIMEOUT: ({ context }) => {
+            return context.idleTimeout;
        }
    }
 }).createMachine({
    context: ({ input }) => ({
-        firstMessage: input.firstMessage,
        voice: input.voice,
        assistant: input.assistant,
        playbackRef: (0, uuid_1.v4)(),
        speechBuffer: "",
+        firstMessage: input.firstMessage,
+        goodbyeMessage: input.goodbyeMessage,
+        systemErrorMessage: input.systemErrorMessage,
+        idleMessage: input.idleMessage,
+        idleTimeout: input.idleTimeout,
+        maxIdleTimeoutCount: input.maxIdleTimeoutCount,
+        idleTimeoutCount: 0,
        speechResponseStartTime: 0,
-        speechResponseTime: 0
+        speechResponseTime: 0,
+        isSpeaking: false
    }),
    id: "fnAI",
-    initial: "welcome",
+    initial: "greeting",
    states: {
-        welcome: {
-            entry: {
-                type: "sendGreeting"
-            },
+        greeting: {
            always: {
-                target: "machineListening"
+                target: "idle"
            },
-            description: "The initial state where the AI greets the Human."
+            entry: {
+                type: "greetUser"
+            }
        },
-        machineListening: {
+        idle: {
            on: {
                SPEECH_START: {
-                    target: "humanSpeaking",
-                    description: "This must be triggered by a VAD or similar system."
-                },
-                HUMAN_PROMPT: {
-                    actions: { type: "appendSpeech" },
-                    description: "Appends the speech to the buffer."
+                    target: "waitingForUserRequest",
+                    description: "Event from VAD system."
                }
            },
-            description: "The state where the AI is actively listening in conversation."
+            after: {
+                IDLE_TIMEOUT: [
+                    {
+                        target: "hangup",
+                        actions: {
+                            type: "goodbye"
+                        },
+                        guard: {
+                            type: "idleTimeoutCountExceedsMax"
+                        }
+                    },
+                    {
+                        target: "hackingTimeout",
+                        actions: [
+                            {
+                                type: "increaseIdleTimeoutCount"
+                            },
+                            {
+                                type: "announceIdleTimeout"
+                            }
+                        ]
+                    }
+                ]
+            }
        },
-        humanSpeaking: {
-            entry: {
-                type: "interruptMachineSpeaking"
+        waitingForUserRequest: {
+            always: {
+                target: "updatingSpeech"
            },
-            on: {
-                HUMAN_PROMPT: {
-                    actions: { type: "appendSpeech" },
-                    description: "Appends the speech to the buffer."
+            entry: [
+                {
+                    type: "cleanSpeech"
+                },
+                {
+                    type: "interruptPlayback"
+                },
+                {
+                    type: "resetIdleTimeoutCount"
                },
-                SPEECH_END: {
-                    target: "machineListening",
-                    actions: { type: "processHumanRequest" },
-                    guard: { type: "hasSpeechBuffer" },
-                    description: "This must be triggered by a VAD or similar system."
+                {
+                    type: "setSpeaking"
                }
-            },
-            description: "The state where the AI detects Human speech while it is speaking."
+            ]
        },
        hangup: {
-            type: "final",
-            entry: {
-                type: "hangup"
-            },
+            type: "final"
+        },
+        hackingTimeout: {
+            always: {
+                target: "idle"
+            }
+        },
+        updatingSpeech: {
+            on: {
+                SPEECH_RESULT: [
+                    {
+                        target: "processingUserRequest",
+                        actions: {
+                            type: "appendSpeech"
+                        },
+                        guard: {
+                            type: "isNotSpeaking"
+                        },
+                        description: "Speech result from the Speech to Text provider."
+                    },
+                    {
+                        target: "updatingSpeech",
+                        actions: {
+                            type: "appendSpeech"
+                        }
+                    }
+                ],
+                SPEECH_END: [
+                    {
+                        target: "processingUserRequest",
+                        actions: {
+                            type: "setSpeakingDone"
+                        },
+                        guard: {
+                            type: "hasSpeechResult"
+                        },
+                        description: "Event from VAD or similar system."
+                    },
+                    {
+                        target: "updatingSpeech",
+                        actions: {
+                            type: "setSpeakingDone"
+                        }
+                    }
+                ]
+            }
+        },
+        processingUserRequest: {
            on: {
-                SESSION_END: {
-                    target: "hangup"
+                SPEECH_START: {
+                    target: "waitingForUserRequest",
+                    description: "Event from VAD or similar system."
+                },
+                USER_REQUEST_PROCESSED: {
+                    target: "idle",
+                    description: "Go back home."
                }
            },
-            description: "The final state where the AI terminates the conversation due to inactivity."
+            entry: {
+                type: "processUserRequest"
+            }
        }
    }
 });
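The new after.IDLE_TIMEOUT transitions encode an escalating idle policy: every idleTimeout milliseconds of silence in idle, the machine announces the idle message and bumps a counter (detouring through hackingTimeout, whose only job is to re-enter idle so the delay restarts), and once the counter reaches maxIdleTimeoutCount the guard routes to hangup with the goodbye message instead. The same policy in isolation, as a hedged sketch without xstate; names mirror the machine's context but the timer plumbing is simplified:

    function makeIdlePolicy(opts: {
      idleTimeout: number;            // delay in ms, like the machine's IDLE_TIMEOUT
      maxIdleTimeoutCount: number;
      announceIdleTimeout: () => void;
      goodbyeAndHangup: () => void;
    }) {
      let idleTimeoutCount = 0;       // context.idleTimeoutCount
      let timer: ReturnType<typeof setTimeout> | undefined;

      const arm = () => {
        timer = setTimeout(() => {
          if (idleTimeoutCount >= opts.maxIdleTimeoutCount) {
            opts.goodbyeAndHangup();  // guard: idleTimeoutCountExceedsMax -> hangup
          } else {
            idleTimeoutCount++;       // action: increaseIdleTimeoutCount
            opts.announceIdleTimeout();
            arm();                    // the hackingTimeout -> idle bounce restarts the delay
          }
        }, opts.idleTimeout);
      };

      const reset = () => {           // action: resetIdleTimeoutCount (on user speech)
        if (timer) clearTimeout(timer);
        idleTimeoutCount = 0;
      };

      return { arm, reset };
    }

The hackingTimeout bounce is needed because xstate's after timers only restart when the state is re-entered; a self-looping internal transition would leave the original timer untouched.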
package/dist/machine/machinev0.d.ts
ADDED
@@ -0,0 +1,163 @@
+declare const machine: import("xstate").StateMachine<{
+    firstMessage: string;
+    voice: import("@fonoster/voice").VoiceResponse;
+    assistant: import("../assistants/assistants").Assistant;
+    playbackRef: string;
+    speechBuffer: string;
+    speechResponseStartTime: number;
+    speechResponseTime: number;
+}, {
+    type: "SPEECH_START";
+} | {
+    type: "SPEECH_END";
+} | {
+    type: "SESSION_END";
+} | {
+    type: "HUMAN_PROMPT";
+    speech: string;
+}, {}, never, import("xstate").Values<{
+    appendSpeech: {
+        type: "appendSpeech";
+        params: unknown;
+    };
+    hangup: {
+        type: "hangup";
+        params: unknown;
+    };
+    sendGreeting: {
+        type: "sendGreeting";
+        params: unknown;
+    };
+    interruptMachineSpeaking: {
+        type: "interruptMachineSpeaking";
+        params: unknown;
+    };
+    processHumanRequest: {
+        type: "processHumanRequest";
+        params: unknown;
+    };
+}>, {
+    type: "hasSpeechBuffer";
+    params: unknown;
+}, never, "hangup" | "welcome" | "machineListening" | "humanSpeaking", string, {
+    firstMessage: string;
+    voice: import("@fonoster/voice").VoiceResponse;
+    assistant: import("../assistants/assistants").Assistant;
+}, import("xstate").NonReducibleUnknown, import("xstate").EventObject, import("xstate").MetaObject, {
+    readonly context: ({ input }: {
+        spawn: {
+            <TSrc extends never>(logic: TSrc, ...[options]: never): import("xstate").ActorRefFromLogic<never>;
+            <TLogic extends import("xstate").AnyActorLogic>(src: TLogic, options?: {
+                id?: never;
+                systemId?: string;
+                input?: import("xstate").InputFrom<TLogic> | undefined;
+                syncSnapshot?: boolean;
+            } | undefined): import("xstate").ActorRefFromLogic<TLogic>;
+        };
+        input: {
+            firstMessage: string;
+            voice: import("@fonoster/voice").VoiceResponse;
+            assistant: import("../assistants/assistants").Assistant;
+        };
+        self: import("xstate").ActorRef<import("xstate").MachineSnapshot<{
+            firstMessage: string;
+            voice: import("@fonoster/voice").VoiceResponse;
+            assistant: import("../assistants/assistants").Assistant;
+            playbackRef: string;
+            speechBuffer: string;
+            speechResponseStartTime: number;
+            speechResponseTime: number;
+        }, {
+            type: "SPEECH_START";
+        } | {
+            type: "SPEECH_END";
+        } | {
+            type: "SESSION_END";
+        } | {
+            type: "HUMAN_PROMPT";
+            speech: string;
+        }, Record<string, import("xstate").AnyActorRef | undefined>, import("xstate").StateValue, string, unknown, any, any>, {
+            type: "SPEECH_START";
+        } | {
+            type: "SPEECH_END";
+        } | {
+            type: "SESSION_END";
+        } | {
+            type: "HUMAN_PROMPT";
+            speech: string;
+        }, import("xstate").AnyEventObject>;
+    }) => {
+        firstMessage: string;
+        voice: import("@fonoster/voice").VoiceResponse;
+        assistant: import("@langchain/core/runnables").Runnable<any, string, import("@langchain/core/runnables").RunnableConfig>;
+        playbackRef: string;
+        speechBuffer: string;
+        speechResponseStartTime: number;
+        speechResponseTime: number;
+    };
+    readonly id: "fnAI_v0";
+    readonly initial: "welcome";
+    readonly states: {
+        readonly welcome: {
+            readonly entry: {
+                readonly type: "sendGreeting";
+            };
+            readonly always: {
+                readonly target: "machineListening";
+            };
+            readonly description: "The initial state where the AI greets the Human.";
+        };
+        readonly machineListening: {
+            readonly on: {
+                readonly SPEECH_START: {
+                    readonly target: "humanSpeaking";
+                    readonly description: "This must be triggered by a VAD or similar system.";
+                };
+                readonly HUMAN_PROMPT: {
+                    readonly actions: {
+                        readonly type: "appendSpeech";
+                    };
+                    readonly description: "Appends the speech to the buffer.";
+                };
+            };
+            readonly description: "The state where the AI is actively listening in conversation.";
+        };
+        readonly humanSpeaking: {
+            readonly entry: {
+                readonly type: "interruptMachineSpeaking";
+            };
+            readonly on: {
+                readonly HUMAN_PROMPT: {
+                    readonly actions: {
+                        readonly type: "appendSpeech";
+                    };
+                    readonly description: "Appends the speech to the buffer.";
+                };
+                readonly SPEECH_END: {
+                    readonly target: "machineListening";
+                    readonly actions: {
+                        readonly type: "processHumanRequest";
+                    };
+                    readonly guard: {
+                        readonly type: "hasSpeechBuffer";
+                    };
+                    readonly description: "This must be triggered by a VAD or similar system.";
+                };
+            };
+            readonly description: "The state where the AI detects Human speech while it is speaking.";
+        };
+        readonly hangup: {
+            readonly type: "final";
+            readonly entry: {
+                readonly type: "hangup";
+            };
+            readonly on: {
+                readonly SESSION_END: {
+                    readonly target: "hangup";
+                };
+            };
+            readonly description: "The final state where the AI terminates the conversation due to inactivity.";
+        };
+    };
+}>;
+export { machine };
package/dist/machine/machinev0.js
ADDED
@@ -0,0 +1,142 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.machine = void 0;
+/*
+ * Copyright (C) 2024 by Fonoster Inc (https://fonoster.com)
+ * http://github.com/fonoster/fonoster
+ *
+ * This file is part of Fonoster
+ *
+ * Licensed under the MIT License (the "License");
+ * you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://opensource.org/licenses/MIT
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+const common_1 = require("@fonoster/common");
+const logger_1 = require("@fonoster/logger");
+const uuid_1 = require("uuid");
+const xstate_1 = require("xstate");
+const types_1 = require("./types");
+const logger = (0, logger_1.getLogger)({ service: "autopilot", filePath: __filename });
+const machine = (0, xstate_1.setup)({
+    types: types_1.types,
+    actions: {
+        sendGreeting: async function ({ context }) {
+            await context.voice.answer();
+            await context.voice.say(context.firstMessage, {
+                playbackRef: context.playbackRef
+            });
+        },
+        interruptMachineSpeaking: async function ({ context }) {
+            logger.verbose("interrupting the machine", {
+                playbackRef: context.playbackRef
+            });
+            await context.voice.playbackControl(context.playbackRef, common_1.PlaybackControlAction.STOP);
+        },
+        appendSpeech: function ({ context, event }) {
+            const speech = event.speech;
+            context.speechBuffer = (context.speechBuffer || "") + " " + speech;
+            context.speechResponseStartTime = Date.now();
+            logger.verbose("appended speech", { speechBuffer: context.speechBuffer });
+        },
+        processHumanRequest: async function ({ context }) {
+            const speech = context.speechBuffer.trim();
+            logger.verbose("processing human request", { speech });
+            const response = await context.assistant.invoke({
+                text: speech
+            });
+            const speechResponseTime = Date.now() - context.speechResponseStartTime;
+            context.speechResponseTime = speechResponseTime;
+            logger.verbose("assistant response", {
+                response,
+                responseTime: speechResponseTime
+            });
+            await context.voice.say(response, { playbackRef: context.playbackRef });
+            // Clear the speech buffer and reset response timing
+            context.speechBuffer = "";
+            context.speechResponseStartTime = 0;
+        },
+        hangup: async function ({ context }) {
+            await context.voice.hangup();
+        }
+    },
+    guards: {
+        hasSpeechBuffer: function ({ context }) {
+            return context.speechBuffer?.trim().length > 0;
+        }
+    }
+}).createMachine({
+    context: ({ input }) => ({
+        firstMessage: input.firstMessage,
+        voice: input.voice,
+        assistant: input.assistant,
+        playbackRef: (0, uuid_1.v4)(),
+        speechBuffer: "",
+        speechResponseStartTime: 0,
+        speechResponseTime: 0
+    }),
+    id: "fnAI_v0",
+    initial: "welcome",
+    states: {
+        welcome: {
+            entry: {
+                type: "sendGreeting"
+            },
+            always: {
+                target: "machineListening"
+            },
+            description: "The initial state where the AI greets the Human."
+        },
+        machineListening: {
+            on: {
+                SPEECH_START: {
+                    target: "humanSpeaking",
+                    description: "This must be triggered by a VAD or similar system."
+                },
+                HUMAN_PROMPT: {
+                    actions: { type: "appendSpeech" },
+                    description: "Appends the speech to the buffer."
+                }
+            },
+            description: "The state where the AI is actively listening in conversation."
+        },
+        humanSpeaking: {
+            entry: {
+                type: "interruptMachineSpeaking"
+            },
+            on: {
+                HUMAN_PROMPT: {
+                    actions: { type: "appendSpeech" },
+                    description: "Appends the speech to the buffer."
+                },
+                SPEECH_END: {
+                    target: "machineListening",
+                    actions: { type: "processHumanRequest" },
+                    guard: { type: "hasSpeechBuffer" },
+                    description: "This must be triggered by a VAD or similar system."
+                }
+            },
+            description: "The state where the AI detects Human speech while it is speaking."
+        },
+        hangup: {
+            type: "final",
+            entry: {
+                type: "hangup"
+            },
+            on: {
+                SESSION_END: {
+                    target: "hangup"
+                }
+            },
+            description: "The final state where the AI terminates the conversation due to inactivity."
+        }
+    }
+});
+exports.machine = machine;
package/dist/vad/chunkToFloat32Array.d.ts
CHANGED
@@ -1,2 +1,2 @@
-declare function chunkToFloat32Array(chunk: …
+declare function chunkToFloat32Array(chunk: Uint8Array): Float32Array;
 export { chunkToFloat32Array };
package/dist/vad/chunkToFloat32Array.js
CHANGED
@@ -1,6 +1,4 @@
 "use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.chunkToFloat32Array = chunkToFloat32Array;
 /*
  * Copyright (C) 2024 by Fonoster Inc (https://fonoster.com)
  * http://github.com/fonoster/fonoster
@@ -19,7 +17,23 @@ exports.chunkToFloat32Array = chunkToFloat32Array;
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.chunkToFloat32Array = chunkToFloat32Array;
+// This version of the chunkToFloat32Array accounts for the case where
+// the byteOffset is misaligned.
+//
+// Q. Would it be the same if we just created a new Uint8Array from the chunk?
 function chunkToFloat32Array(chunk) {
-    …
+    // Check if byteOffset is not aligned
+    const alignedByteOffset = chunk.byteOffset % Int16Array.BYTES_PER_ELEMENT === 0;
+    let int16Array;
+    if (alignedByteOffset) {
+        int16Array = new Int16Array(chunk.buffer, chunk.byteOffset, chunk.byteLength / Int16Array.BYTES_PER_ELEMENT);
+    }
+    else {
+        // Create a new aligned Uint8Array and then an Int16Array from it
+        const alignedChunk = new Uint8Array(chunk);
+        int16Array = new Int16Array(alignedChunk.buffer, alignedChunk.byteOffset, alignedChunk.byteLength / Int16Array.BYTES_PER_ELEMENT);
+    }
     return new Float32Array(Array.from(int16Array, (sample) => sample / 32768.0));
 }
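The new branch exists because an Int16Array view over a shared buffer requires a byteOffset that is a multiple of two, and audio chunks sliced out of a larger Buffer are frequently not aligned. As for the question in the code's own comment: always copying into a fresh Uint8Array would behave the same functionally, but the aligned fast path avoids the extra allocation and copy. A small demonstration:

    // An Int16Array view demands byteOffset % 2 === 0; a chunk sliced at an
    // odd offset of a shared buffer would throw without the copy fallback.
    const backing = new ArrayBuffer(10);
    const misaligned = new Uint8Array(backing, 1, 4); // byteOffset = 1

    // new Int16Array(misaligned.buffer, 1, 2) -> RangeError: offset must be a multiple of 2

    // Copying first re-anchors the data at offset 0 of a fresh buffer:
    const copy = new Uint8Array(misaligned);          // byteOffset = 0
    const samples = new Int16Array(copy.buffer, 0, copy.byteLength / 2);
    console.log(samples.length); // 2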
package/dist/vad/makeVad.d.ts
CHANGED
@@ -1,2 +1,2 @@
-declare function makeVad(pathToModel?: string): Promise<(chunk: …
+declare function makeVad(pathToModel?: string): Promise<(chunk: Uint8Array, callback: (event: "SPEECH_START" | "SPEECH_END", data: Record<string, unknown>) => void) => Promise<void>>;
 export { makeVad };
package/dist/vad/makeVad.js
CHANGED
@@ -69,12 +69,10 @@ async function makeVad(pathToModel) {
                return processBuffer(remainingBuffer);
            }
        }
-        else {
-            …
-            …
-            …
-            return processBuffer(remainingBuffer);
-        }
+        else if (isSpeechActive) {
+            isSpeechActive = false;
+            callback("SPEECH_END", {});
+            return processBuffer(remainingBuffer);
        }
        return processBuffer(remainingBuffer);
    };
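With the signature change above, the VAD now consumes raw Uint8Array chunks directly. A usage sketch; the deep dist import path and the audio source are illustrative:

    import { makeVad } from "@fonoster/autopilot/dist/vad/makeVad";

    async function run(chunks: AsyncIterable<Uint8Array>) {
      // makeVad optionally takes a path to the ONNX model (pathToModel?: string)
      const vad = await makeVad();
      for await (const chunk of chunks) {
        await vad(chunk, (event, data) => {
          // event is "SPEECH_START" or "SPEECH_END"
          console.log(event, data);
        });
      }
    }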
package/dist/vad/micVadTest.js
CHANGED
@@ -36,7 +36,7 @@ async function main() {
    })
        .stream();
    mic.on("data", async (data) => {
-        const chunk = new …
+        const chunk = new Uint8Array(data.buffer);
        await vad(chunk, (event, _data) => {
            logger.info("vad event:", { event, data: _data });
        });
package/dist/vad/types.d.ts
CHANGED
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@fonoster/autopilot",
-  "version": "0.7.3",
+  "version": "0.7.4",
   "description": "Voice AI for the Fonoster platform",
   "author": "Pedro Sanders <psanders@fonoster.com>",
   "homepage": "https://github.com/fonoster/fonoster#readme",
@@ -34,8 +34,8 @@
     "url": "https://github.com/fonoster/fonoster/issues"
   },
   "dependencies": {
-    "@fonoster/logger": "^0.7.3",
-    "@fonoster/voice": "^0.7.3",
+    "@fonoster/logger": "^0.7.4",
+    "@fonoster/voice": "^0.7.4",
     "@langchain/openai": "^0.2.7",
     "dotenv": "^16.4.5",
     "onnxruntime-node": "^1.19.0",
@@ -46,5 +46,5 @@
     "node-record-lpcm16": "^1.0.1",
     "typescript": "^5.5.4"
   },
-  "gitHead": "…"
+  "gitHead": "e550aa46c1a9087a70157496365b64afd5aea11d"
 }