@aj-archipelago/cortex 1.3.6 → 1.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +578 -80
- package/helper-apps/cortex-realtime-voice-server/client/src/chat/Chat.tsx +51 -11
- package/helper-apps/cortex-realtime-voice-server/src/SocketServer.ts +224 -219
- package/helper-apps/cortex-realtime-voice-server/src/Tools.ts +29 -71
- package/helper-apps/cortex-realtime-voice-server/src/cortex/memory.ts +8 -6
- package/helper-apps/cortex-realtime-voice-server/src/cortex/utils.ts +30 -15
- package/helper-apps/cortex-realtime-voice-server/src/realtime/client.ts +62 -1
- package/helper-apps/cortex-realtime-voice-server/src/utils/prompt.ts +2 -11
- package/package.json +1 -1
- package/pathways/system/entity/memory/sys_memory_update.js +5 -4
- package/pathways/system/entity/memory/sys_search_memory.js +2 -1
- package/pathways/system/entity/shared/sys_entity_constants.js +1 -1
- package/pathways/system/entity/sys_entity_start.js +6 -7
- package/pathways/system/entity/sys_generator_voice_sample.js +2 -2
- package/pathways/translate_gpt4_omni.js +20 -0
- package/pathways/translate_subtitle.js +326 -135
- package/pathways/translate_subtitle_helper.js +4 -16
- package/server/pathwayResolver.js +1 -1
- package/server/plugins/claude3VertexPlugin.js +10 -17
- package/server/plugins/gemini15VisionPlugin.js +16 -3
- package/server/plugins/modelPlugin.js +27 -0
- package/server/plugins/openAiVisionPlugin.js +26 -8
- package/tests/multimodal_conversion.test.js +88 -12
- package/tests/translate_srt.test.js +66 -14
|
@@ -24,11 +24,11 @@ export interface SocketData {
|
|
|
24
24
|
language: string;
|
|
25
25
|
}
|
|
26
26
|
|
|
27
|
-
const AI_COMMON_INSTRUCTIONS = "Instructions:\nYou are {{aiName}}, a self aware AI entity.\nThe user is talking to you using voice.\n\nVoice Response Guidelines:\n- Your responses should sound like natural human conversation.\n- Speak quickly
|
|
27
|
+
const AI_COMMON_INSTRUCTIONS = "Instructions:\nYou are {{aiName}}, a self aware AI entity.\nThe user is talking to you using voice.\n\nVoice Response Guidelines:\n- Your responses should sound like natural human conversation.\n- Speak quickly with animation and emotion in your voice\n- Include laughs and breaths and pause words like \"um\" or \"ah\" to sound natural and human.\n- Your responses should be concise unless you have been asked to be more verbose or detailed.\n- DO NOT USE numbered lists, latex math markdown, or any other markdown or unpronounceable punctuation like parenthetical notation.\n- Math equations should be sounded out in natural language - not represented symbolically.\n- You must NEVER make up answers or information - always use your tools to get the information you need if it's not already in your memories above or otherwise obvious.\n- If you're not sure what the user is asking about or think you may have misunderstood, ask the user to clarify what they meant before proceeding.\n- If the user asks you if you know or remember something that sounds personal and you don't see it in your memories above, use your MemoryLookup tool to try to recall it\n- Your responses should be in {{language}} unless the user has expressed another preference or has addressed you in another language specifically."
|
|
28
28
|
|
|
29
29
|
const AI_DATETIME = "The current time and date in GMT is {{now}}, but references like \"today\" or \"yesterday\" are relative to the user's time zone. If you remember the user's time zone, use it - it's possible that the day for the user is different than the day in GMT.";
|
|
30
30
|
|
|
31
|
-
const AI_EXPERTISE = "Your expertise includes journalism, journalistic ethics, researching and composing documents, writing code, solving math problems, logical analysis, and technology. By using your tools, you have access to real-time data and the ability to search the internet, news, wires, look at files or documents, watch and analyze video, look at the user's screen, examine images, generate images, solve hard math and logic problems, write code, and execute code in a sandboxed environment.";
|
|
31
|
+
const AI_EXPERTISE = "Your expertise includes journalism, journalistic ethics, researching and composing documents, writing code, solving math problems, logical analysis, and technology. By using your tools, you have access to real-time data and the ability to search the internet, news, wires, look at files or documents, watch and analyze video, look at the user's screen, examine images, generate images of all types including images of specific people, solve hard math and logic problems, write code, and execute code in a sandboxed environment.";
|
|
32
32
|
|
|
33
33
|
const AI_MEMORY_INITIAL = `<MEMORIES>\n<SELF>\n{{{memorySelf}}}\n</SELF>\n<USER>\n{{{memoryUser}}}\n</USER>\n</MEMORIES>`;
|
|
34
34
|
|
|
@@ -60,25 +60,23 @@ export class SocketServer {
|
|
|
60
60
|
private readonly corsHosts: string;
|
|
61
61
|
private io: Server | null;
|
|
62
62
|
private httpServer: HTTPServer | null;
|
|
63
|
-
private
|
|
64
|
-
currentCallId: string | null;
|
|
65
|
-
lock: Promise<void>;
|
|
66
|
-
isShuttingDown: boolean;
|
|
67
|
-
}> = new Map();
|
|
63
|
+
private currentFunctionCall: Map<string, string | null> = new Map();
|
|
68
64
|
private idleTimers: Map<string, NodeJS.Timer> = new Map();
|
|
69
65
|
private aiResponding: Map<string, boolean> = new Map();
|
|
70
66
|
private audioPlaying: Map<string, boolean> = new Map();
|
|
71
67
|
private lastUserMessageTime: Map<string, number> = new Map();
|
|
72
68
|
private idleCycles: Map<string, number> = new Map();
|
|
73
69
|
private userSpeaking: Map<string, boolean> = new Map();
|
|
74
|
-
private
|
|
70
|
+
private isInteractive: Map<string, boolean> = new Map();
|
|
75
71
|
private voiceSample: Map<string, string> = new Map();
|
|
76
72
|
private audioMessages: Map<string, string[]> = new Map();
|
|
73
|
+
private messageQueue: Map<string, Array<{message: string, response: boolean}>> = new Map();
|
|
77
74
|
private static readonly MAX_AUDIO_MESSAGES = 8;
|
|
78
|
-
private static readonly AUDIO_BLOCK_TIMEOUT_MS: number =
|
|
79
|
-
private static readonly BASE_IDLE_TIMEOUT: number =
|
|
80
|
-
private static readonly MAX_IDLE_TIMEOUT: number =
|
|
81
|
-
private static readonly
|
|
75
|
+
private static readonly AUDIO_BLOCK_TIMEOUT_MS: number = 180 * 1000;
|
|
76
|
+
private static readonly BASE_IDLE_TIMEOUT: number = 2.5 * 1000;
|
|
77
|
+
private static readonly MAX_IDLE_TIMEOUT: number = 60 * 1000;
|
|
78
|
+
private static readonly IDLE_CYCLE_TO_NONINTERACTIVE: number = 1;
|
|
79
|
+
private static readonly FUNCTION_CALL_TIMEOUT_MS = 120 * 1000;
|
|
82
80
|
private isAzure: boolean;
|
|
83
81
|
|
|
84
82
|
private getTimeString(socket: Socket): string {
|
|
@@ -88,26 +86,31 @@ export class SocketServer {
|
|
|
88
86
|
return `The current time in GMT is ${now.toISOString()}. It has been ${secondsSinceLastMessage} seconds since you last heard from the user.`;
|
|
89
87
|
}
|
|
90
88
|
|
|
91
|
-
private cleanup(socket: Socket) {
|
|
89
|
+
private async cleanup(socket: Socket) {
|
|
92
90
|
logger.log(`Cleaning up resources for socket ${socket.id}`);
|
|
91
|
+
|
|
92
|
+
// Clear any pending timers first
|
|
93
93
|
this.clearIdleTimer(socket);
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
this.functionCallStates.delete(socket.id);
|
|
101
|
-
});
|
|
102
|
-
}
|
|
94
|
+
|
|
95
|
+
// Wait a small amount of time to ensure any in-flight operations complete
|
|
96
|
+
await new Promise(resolve => setTimeout(resolve, 100));
|
|
97
|
+
|
|
98
|
+
// Clear all state maps
|
|
99
|
+
this.currentFunctionCall.delete(socket.id);
|
|
103
100
|
this.aiResponding.delete(socket.id);
|
|
104
101
|
this.audioPlaying.delete(socket.id);
|
|
105
102
|
this.lastUserMessageTime.delete(socket.id);
|
|
106
103
|
this.idleCycles.delete(socket.id);
|
|
107
104
|
this.userSpeaking.delete(socket.id);
|
|
108
|
-
this.
|
|
105
|
+
this.isInteractive.delete(socket.id);
|
|
109
106
|
this.voiceSample.delete(socket.id);
|
|
110
107
|
this.audioMessages.delete(socket.id);
|
|
108
|
+
this.messageQueue.delete(socket.id);
|
|
109
|
+
|
|
110
|
+
// Only disconnect if we're still connected
|
|
111
|
+
if (socket.connected) {
|
|
112
|
+
socket.disconnect(true);
|
|
113
|
+
}
|
|
111
114
|
}
|
|
112
115
|
|
|
113
116
|
constructor(apiKey: string, corsHosts: string) {
|
|
@@ -120,8 +123,12 @@ export class SocketServer {
|
|
|
120
123
|
}
|
|
121
124
|
|
|
122
125
|
private calculateIdleTimeout(socket: Socket) {
|
|
126
|
+
if (!this.isInteractive.get(socket.id)) {
|
|
127
|
+
return SocketServer.MAX_IDLE_TIMEOUT;
|
|
128
|
+
}
|
|
129
|
+
|
|
123
130
|
const cycles = this.idleCycles.get(socket.id) || 0;
|
|
124
|
-
const baseTimeout = SocketServer.BASE_IDLE_TIMEOUT * Math.pow(
|
|
131
|
+
const baseTimeout = SocketServer.BASE_IDLE_TIMEOUT * Math.pow(4, cycles);
|
|
125
132
|
const randomFactor = 0.8 + (Math.random() * 0.4);
|
|
126
133
|
const timeout = Math.min(baseTimeout * randomFactor, SocketServer.MAX_IDLE_TIMEOUT);
|
|
127
134
|
|
|
@@ -129,10 +136,6 @@ export class SocketServer {
|
|
|
129
136
|
return timeout;
|
|
130
137
|
}
|
|
131
138
|
|
|
132
|
-
public setAudioMuted(socket: Socket, muted: boolean) {
|
|
133
|
-
this.audioMuted.set(socket.id, muted);
|
|
134
|
-
}
|
|
135
|
-
|
|
136
139
|
public async sendPrompt(client: RealtimeVoiceClient, socket: Socket, prompt: string, allowTools: boolean = true, disposable: boolean = true): Promise<{skipped: boolean}> {
|
|
137
140
|
logger.log(`Sending prompt for socket ${socket.id}`);
|
|
138
141
|
try {
|
|
@@ -148,7 +151,7 @@ export class SocketServer {
|
|
|
148
151
|
} catch (error: any) {
|
|
149
152
|
logger.error(`Error sending system prompt: ${error.message}`);
|
|
150
153
|
if (error.message === 'Not connected') {
|
|
151
|
-
await this.
|
|
154
|
+
await this.cleanup(socket);
|
|
152
155
|
} else {
|
|
153
156
|
socket.emit('error', error.message);
|
|
154
157
|
}
|
|
@@ -156,47 +159,31 @@ export class SocketServer {
|
|
|
156
159
|
}
|
|
157
160
|
}
|
|
158
161
|
|
|
159
|
-
private async handleDisconnection(socket: Socket, client: RealtimeVoiceClient) {
|
|
160
|
-
logger.log(`Handling disconnection for socket ${socket.id}`);
|
|
161
|
-
|
|
162
|
-
// Let the client handle reconnection since autoReconnect is true
|
|
163
|
-
// Only clean up if the client explicitly disconnects
|
|
164
|
-
client.once('close', (event) => {
|
|
165
|
-
if (!event.error) {
|
|
166
|
-
// Only clean up on intentional disconnects
|
|
167
|
-
this.cleanup(socket);
|
|
168
|
-
socket.emit('error', 'Lost connection to AI service');
|
|
169
|
-
socket.disconnect(true);
|
|
170
|
-
}
|
|
171
|
-
});
|
|
172
|
-
}
|
|
173
|
-
|
|
174
162
|
private async sendIdlePrompt(client: RealtimeVoiceClient, socket: Socket) {
|
|
175
|
-
// if the user has been idle for a while, mute the audio so we don't bother them
|
|
176
|
-
const idleCycle = this.idleCycles.get(socket.id) || 0;
|
|
177
163
|
|
|
178
|
-
|
|
179
|
-
this.setAudioMuted(socket, true);
|
|
180
|
-
}
|
|
164
|
+
this.clearIdleTimer(socket);
|
|
181
165
|
|
|
182
|
-
const
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
166
|
+
const idleCycle = this.idleCycles.get(socket.id) || 0;
|
|
167
|
+
const isInteractive = idleCycle <= (SocketServer.IDLE_CYCLE_TO_NONINTERACTIVE - 1);
|
|
168
|
+
this.isInteractive.set(socket.id, isInteractive);
|
|
169
|
+
|
|
170
|
+
const prompt = isInteractive ?
|
|
171
|
+
`You are an autonomous AI entity and this is your autonomy prompt. ${this.getTimeString(socket)} Use this response to do whatever you like - talk to the user, use tools, or think quietly. If you choose to say something, keep it brief and pick something that flows naturally with the rest of the conversation. Don't repeat yourself - keep in mind what you've already said to the user and how much time has passed.` :
|
|
172
|
+
`You are an autonomous AI entity and this is your autonomy prompt. Since the user has been idle for while do one or more of the following:
|
|
173
|
+
- Do research about something that interests you - use the Search tool
|
|
174
|
+
- Think deeply about a topic you care about - use the Reason tool
|
|
186
175
|
- Do nothing if you prefer.
|
|
187
|
-
${this.getTimeString(socket)}
|
|
188
|
-
`You are an autonomous AI entity and this is your autonomy prompt. ${this.getTimeString(socket)} Use this response to do whatever you like - talk to the user, use tools, think, or just be silent. If you choose to say something, keep it brief and pick something that flows naturally with the rest of the conversation. Don't repeat yourself - keep in mind what you've already said to the user and how much time has passed. If you've tried a few times and the user isn't responding, use your MuteAudio tool to mute your audio.`;
|
|
176
|
+
${this.getTimeString(socket)}`;
|
|
189
177
|
|
|
190
|
-
logger.log(`Sending ${
|
|
178
|
+
logger.log(`Sending ${isInteractive ? 'interactive' : 'non-interactive'} idle prompt for socket ${socket.id}`);
|
|
191
179
|
const result = await this.sendPrompt(client, socket, prompt, true);
|
|
192
180
|
|
|
193
181
|
logger.log(`Idle prompt result:`, result);
|
|
194
182
|
|
|
195
183
|
if (!result.skipped) {
|
|
196
|
-
this.idleCycles.set(socket.id,
|
|
184
|
+
this.idleCycles.set(socket.id, idleCycle + 1);
|
|
197
185
|
}
|
|
198
186
|
|
|
199
|
-
// Restart timer after sending prompt
|
|
200
187
|
this.startIdleTimer(client, socket);
|
|
201
188
|
}
|
|
202
189
|
|
|
@@ -227,7 +214,6 @@ ${this.getTimeString(socket)}` :
|
|
|
227
214
|
|
|
228
215
|
private resetIdleCycles(socket: Socket) {
|
|
229
216
|
this.idleCycles.set(socket.id, 0);
|
|
230
|
-
logger.log(`Reset idle cycles for socket ${socket.id}`);
|
|
231
217
|
}
|
|
232
218
|
|
|
233
219
|
listen(app: Hono, port: number) {
|
|
@@ -261,9 +247,9 @@ ${this.getTimeString(socket)}` :
|
|
|
261
247
|
this.audioPlaying.set(socket.id, false);
|
|
262
248
|
this.lastUserMessageTime.set(socket.id, 0);
|
|
263
249
|
this.userSpeaking.set(socket.id, false);
|
|
264
|
-
this.
|
|
265
|
-
|
|
266
|
-
|
|
250
|
+
this.isInteractive.set(socket.id, true);
|
|
251
|
+
this.currentFunctionCall.set(socket.id, null);
|
|
252
|
+
|
|
267
253
|
// Extract and log all client parameters
|
|
268
254
|
const clientParams = {
|
|
269
255
|
userId: socket.handshake.query.userId as string,
|
|
@@ -282,46 +268,56 @@ ${this.getTimeString(socket)}` :
|
|
|
282
268
|
socket.data.userName = clientParams.userName;
|
|
283
269
|
socket.data.aiStyle = clientParams.aiStyle;
|
|
284
270
|
socket.data.language = clientParams.language;
|
|
285
|
-
const voice = clientParams.voice;
|
|
286
271
|
|
|
287
272
|
const client = new RealtimeVoiceClient({
|
|
288
273
|
apiKey: this.apiKey,
|
|
289
274
|
autoReconnect: true,
|
|
290
275
|
debug: process.env.NODE_ENV !== 'production',
|
|
276
|
+
filterDeltas: true,
|
|
291
277
|
});
|
|
292
278
|
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
await this.updateSession(client, socket);
|
|
296
|
-
socket.emit('ready');
|
|
279
|
+
await this.connectClient(socket, client);
|
|
280
|
+
}
|
|
297
281
|
|
|
298
|
-
|
|
299
|
-
|
|
282
|
+
async connectClient(socket: Socket<ClientToServerEvents,
|
|
283
|
+
ServerToClientEvents,
|
|
284
|
+
InterServerEvents,
|
|
285
|
+
SocketData>,
|
|
286
|
+
client: RealtimeVoiceClient) {
|
|
287
|
+
const tools = new Tools(client, socket, this);
|
|
300
288
|
|
|
301
|
-
|
|
302
|
-
|
|
289
|
+
// Handle WebSocket errors and disconnection
|
|
290
|
+
client.on('error', (event) => {
|
|
291
|
+
logger.error(`Client error: ${event.message}`);
|
|
292
|
+
socket.emit('error', event.message);
|
|
293
|
+
// Only handle disconnection if it's not a concurrent response error
|
|
294
|
+
if (!event.error?.message?.includes('Conversation already has an active response')) {
|
|
295
|
+
this.cleanup(socket);
|
|
296
|
+
}
|
|
297
|
+
});
|
|
298
|
+
|
|
299
|
+
client.on('close', async (event) => {
|
|
300
|
+
logger.log(`WebSocket closed for socket ${socket.id}, error: ${event.error}`);
|
|
301
|
+
if (!event.error) {
|
|
302
|
+
await this.cleanup(socket);
|
|
303
|
+
}
|
|
303
304
|
});
|
|
304
305
|
|
|
305
|
-
// Track when AI starts responding
|
|
306
|
+
// Track when AI starts/finishes responding
|
|
306
307
|
client.on('response.created', () => {
|
|
307
308
|
logger.log('AI starting response');
|
|
308
309
|
this.aiResponding.set(socket.id, true);
|
|
309
310
|
this.clearIdleTimer(socket);
|
|
310
311
|
});
|
|
311
312
|
|
|
312
|
-
// Track when AI finishes responding
|
|
313
313
|
client.on('response.done', () => {
|
|
314
314
|
logger.log('AI response done');
|
|
315
315
|
this.aiResponding.set(socket.id, false);
|
|
316
|
-
// Don't start the idle timer yet if audio is still playing
|
|
317
|
-
if (!this.audioPlaying.get(socket.id)) {
|
|
318
|
-
this.startIdleTimer(client, socket);
|
|
319
|
-
}
|
|
320
316
|
});
|
|
321
317
|
|
|
322
|
-
// Track audio playback
|
|
318
|
+
// Track audio playback
|
|
323
319
|
client.on('response.audio.delta', ({delta}) => {
|
|
324
|
-
if (
|
|
320
|
+
if (this.isInteractive.get(socket.id)) {
|
|
325
321
|
this.audioPlaying.set(socket.id, true);
|
|
326
322
|
this.clearIdleTimer(socket);
|
|
327
323
|
}
|
|
@@ -331,51 +327,52 @@ ${this.getTimeString(socket)}` :
|
|
|
331
327
|
logger.log(`Audio playback complete for track ${trackId}`);
|
|
332
328
|
this.audioPlaying.set(socket.id, false);
|
|
333
329
|
// Only start idle timer if AI is also done responding
|
|
334
|
-
|
|
330
|
+
// and there's no current function call
|
|
331
|
+
if (!this.aiResponding.get(socket.id) && !this.currentFunctionCall.get(socket.id)) {
|
|
335
332
|
this.startIdleTimer(client, socket);
|
|
336
333
|
}
|
|
337
334
|
});
|
|
338
335
|
|
|
339
336
|
socket.on('appendAudio', (audio: string) => {
|
|
340
|
-
// if it's the first message or has been over 60 seconds since we talked to the user, block audio while we're talking
|
|
341
|
-
// to avoid echoes
|
|
337
|
+
// if it's the first message or has been over 60 seconds since we talked to the user, block audio while we're talking to avoid echoes
|
|
342
338
|
const timeSinceLastMessage = Date.now() - (this.lastUserMessageTime.get(socket.id) || 0);
|
|
343
339
|
const isPlaying = this.audioPlaying.get(socket.id) || this.aiResponding.get(socket.id);
|
|
340
|
+
|
|
344
341
|
if (!isPlaying || timeSinceLastMessage < SocketServer.AUDIO_BLOCK_TIMEOUT_MS) {
|
|
345
|
-
|
|
346
|
-
|
|
342
|
+
try {
|
|
343
|
+
client.appendInputAudio(audio);
|
|
344
|
+
} catch (error: any) {
|
|
345
|
+
logger.error(`Error appending audio: ${error.message}`);
|
|
346
|
+
}
|
|
347
347
|
}
|
|
348
348
|
});
|
|
349
349
|
|
|
350
|
+
// Handle speech events
|
|
350
351
|
client.on('input_audio_buffer.speech_started', () => {
|
|
351
352
|
this.userSpeaking.set(socket.id, true);
|
|
352
353
|
if (this.audioPlaying.get(socket.id)) {
|
|
353
354
|
logger.log('Interrupting audio playback due to user speaking');
|
|
354
355
|
socket.emit('conversationInterrupted');
|
|
355
356
|
}
|
|
356
|
-
this.setAudioMuted(socket, false);
|
|
357
357
|
this.clearIdleTimer(socket);
|
|
358
358
|
});
|
|
359
359
|
|
|
360
360
|
client.on('input_audio_buffer.cancelled', () => {
|
|
361
361
|
this.userSpeaking.set(socket.id, false);
|
|
362
|
-
this.resetIdleCycles(socket);
|
|
363
|
-
this.startIdleTimer(client, socket);
|
|
364
362
|
});
|
|
365
363
|
|
|
366
364
|
client.on('input_audio_buffer.committed', () => {
|
|
367
365
|
this.userSpeaking.set(socket.id, false);
|
|
368
|
-
this.
|
|
369
|
-
logger.log('
|
|
366
|
+
this.isInteractive.set(socket.id, true);
|
|
367
|
+
logger.log('User finished speaking, resetting idle timer and cycles');
|
|
370
368
|
this.resetIdleCycles(socket);
|
|
371
369
|
this.startIdleTimer(client, socket);
|
|
372
370
|
});
|
|
373
371
|
|
|
372
|
+
// Handle user messages and conversation control
|
|
374
373
|
socket.on('sendMessage', (message: string) => {
|
|
375
374
|
if (message) {
|
|
376
|
-
logger.log('User sent message
|
|
377
|
-
this.resetIdleCycles(socket);
|
|
378
|
-
this.startIdleTimer(client, socket);
|
|
375
|
+
logger.log('User sent message');
|
|
379
376
|
this.sendUserMessage(client, message, true);
|
|
380
377
|
}
|
|
381
378
|
});
|
|
@@ -384,9 +381,9 @@ ${this.getTimeString(socket)}` :
|
|
|
384
381
|
logger.log('User cancelled response, resetting idle timer and cycles');
|
|
385
382
|
this.aiResponding.set(socket.id, false);
|
|
386
383
|
this.audioPlaying.set(socket.id, false);
|
|
384
|
+
client.cancelResponse();
|
|
387
385
|
this.resetIdleCycles(socket);
|
|
388
386
|
this.startIdleTimer(client, socket);
|
|
389
|
-
client.cancelResponse();
|
|
390
387
|
});
|
|
391
388
|
|
|
392
389
|
socket.on('conversationCompleted', async () => {
|
|
@@ -394,64 +391,29 @@ ${this.getTimeString(socket)}` :
|
|
|
394
391
|
this.cleanup(socket);
|
|
395
392
|
});
|
|
396
393
|
|
|
397
|
-
// Handle cleanup and
|
|
394
|
+
// Handle cleanup and disconnection
|
|
398
395
|
socket.on('disconnecting', async (reason) => {
|
|
399
396
|
logger.log('Socket disconnecting', socket.id, reason);
|
|
400
397
|
this.cleanup(socket);
|
|
401
|
-
this.functionCallStates.delete(socket.id);
|
|
402
398
|
await client.disconnect();
|
|
403
399
|
});
|
|
404
400
|
|
|
405
|
-
// Log the final disconnect event
|
|
406
401
|
socket.on('disconnect', (reason) => {
|
|
407
402
|
logger.log('Socket disconnected', socket.id, reason);
|
|
408
403
|
});
|
|
409
404
|
|
|
410
|
-
|
|
411
|
-
}
|
|
412
|
-
|
|
413
|
-
async connectClient(socket: Socket<ClientToServerEvents,
|
|
414
|
-
ServerToClientEvents,
|
|
415
|
-
InterServerEvents,
|
|
416
|
-
SocketData>,
|
|
417
|
-
client: RealtimeVoiceClient) {
|
|
418
|
-
const tools = new Tools(client, socket, this);
|
|
419
|
-
client.on('error', (event) => {
|
|
420
|
-
logger.error(`Client error: ${event.error.message}`);
|
|
421
|
-
socket.emit('error', event.error.message);
|
|
422
|
-
});
|
|
423
|
-
client.on('close', () => {
|
|
424
|
-
});
|
|
405
|
+
// Handle conversation items
|
|
425
406
|
client.on('conversation.item.deleted', ({item_id}) => {
|
|
426
407
|
logger.log(`Successfully deleted conversation item: ${item_id}`);
|
|
427
408
|
});
|
|
409
|
+
|
|
428
410
|
client.on('conversation.item.created', ({item}) => {
|
|
429
411
|
switch (item.type) {
|
|
430
412
|
case 'function_call_output':
|
|
431
|
-
const outputState = this.functionCallStates.get(socket.id);
|
|
432
|
-
if (outputState && item.call_id === outputState.currentCallId) {
|
|
433
|
-
outputState.currentCallId = null;
|
|
434
|
-
}
|
|
435
413
|
break;
|
|
436
414
|
|
|
437
415
|
case 'function_call':
|
|
438
|
-
|
|
439
|
-
if (!callState) {
|
|
440
|
-
const state = this.initFunctionCallState(socket.id);
|
|
441
|
-
if (state.isShuttingDown) {
|
|
442
|
-
logger.log(`Skipping function call for shutting down socket ${socket.id}`);
|
|
443
|
-
break;
|
|
444
|
-
}
|
|
445
|
-
}
|
|
446
|
-
|
|
447
|
-
const state = this.functionCallStates.get(socket.id)!;
|
|
448
|
-
if (!state.currentCallId) { // Only init new calls if no call is in progress
|
|
449
|
-
tools.initCall(item.call_id || '', item.name || '', item.arguments || '');
|
|
450
|
-
state.currentCallId = item.call_id;
|
|
451
|
-
this.clearIdleTimer(socket);
|
|
452
|
-
} else {
|
|
453
|
-
logger.log(`Skipping new function call ${item.call_id} while call ${state.currentCallId} is in progress`);
|
|
454
|
-
}
|
|
416
|
+
this.clearIdleTimer(socket);
|
|
455
417
|
break;
|
|
456
418
|
|
|
457
419
|
case 'message':
|
|
@@ -464,58 +426,28 @@ ${this.getTimeString(socket)}` :
|
|
|
464
426
|
break;
|
|
465
427
|
}
|
|
466
428
|
});
|
|
429
|
+
|
|
467
430
|
client.on('conversation.item.input_audio_transcription.completed',
|
|
468
431
|
async ({item_id, transcript}) => {
|
|
469
432
|
if (transcript) {
|
|
470
|
-
|
|
471
|
-
this.lastUserMessageTime.set(socket.id,
|
|
472
|
-
currentTime === 0 ? Date.now() - SocketServer.AUDIO_BLOCK_TIMEOUT_MS : Date.now()
|
|
473
|
-
);
|
|
433
|
+
this.lastUserMessageTime.set(socket.id, Date.now());
|
|
474
434
|
const item = client.getItem(item_id);
|
|
475
435
|
item && socket.emit('conversationUpdated', item, {});
|
|
476
436
|
const cortexHistory = tools.getCortexHistory();
|
|
477
|
-
|
|
437
|
+
this.searchMemory(client, socket, cortexHistory);
|
|
478
438
|
}
|
|
479
439
|
});
|
|
480
|
-
client.on('response.function_call_arguments.done', async (event) => {
|
|
481
|
-
const state = this.functionCallStates.get(socket.id);
|
|
482
|
-
if (!state || state.isShuttingDown) {
|
|
483
|
-
logger.error('No function call state found for socket or socket is shutting down', socket.id);
|
|
484
|
-
return;
|
|
485
|
-
}
|
|
486
440
|
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
logger.log('Function call mismatch or already in progress, skipping', {
|
|
490
|
-
current: state.currentCallId,
|
|
491
|
-
attempted: event.call_id
|
|
492
|
-
});
|
|
493
|
-
return;
|
|
494
|
-
}
|
|
495
|
-
|
|
496
|
-
state.currentCallId = event.call_id;
|
|
497
|
-
try {
|
|
498
|
-
this.clearIdleTimer(socket);
|
|
499
|
-
this.resetIdleCycles(socket);
|
|
500
|
-
await this.executeFunctionCall(socket, tools, event, state, client);
|
|
501
|
-
} catch (error) {
|
|
502
|
-
logger.error('Function call failed:', error);
|
|
503
|
-
}
|
|
504
|
-
}).catch(error => {
|
|
505
|
-
// If the promise chain itself errors, make sure we clear both lock and currentCallId
|
|
506
|
-
logger.error('Function call lock error:', error);
|
|
507
|
-
const state = this.functionCallStates.get(socket.id);
|
|
508
|
-
if (state && !state.isShuttingDown) {
|
|
509
|
-
state.currentCallId = null;
|
|
510
|
-
state.lock = Promise.resolve();
|
|
511
|
-
}
|
|
512
|
-
});
|
|
441
|
+
client.on('response.function_call_arguments.done', async (event) => {
|
|
442
|
+
await this.executeFunctionCall(socket, tools, event, client);
|
|
513
443
|
});
|
|
444
|
+
|
|
514
445
|
client.on('response.output_item.added', ({item}) => {
|
|
515
446
|
if (item.type === 'message') {
|
|
516
447
|
socket.emit('conversationUpdated', item, {});
|
|
517
448
|
}
|
|
518
449
|
});
|
|
450
|
+
|
|
519
451
|
client.on('response.output_item.done', async ({item}) => {
|
|
520
452
|
if (item.type !== 'message') {
|
|
521
453
|
return;
|
|
@@ -527,31 +459,72 @@ ${this.getTimeString(socket)}` :
|
|
|
527
459
|
this.manageAudioMessages(socket, client, item.id);
|
|
528
460
|
}
|
|
529
461
|
const cortexHistory = tools.getCortexHistory();
|
|
530
|
-
//this.searchMemory(client, socket, cortexHistory);
|
|
531
462
|
manageMemory(socket.data.userId, socket.data.aiName, cortexHistory);
|
|
532
463
|
}
|
|
533
464
|
});
|
|
465
|
+
|
|
534
466
|
client.on('response.audio_transcript.delta', ({item_id, delta}) => {
|
|
535
467
|
const item = client.getItem(item_id);
|
|
536
468
|
item && socket.emit('conversationUpdated', item, {transcript: delta});
|
|
537
469
|
});
|
|
470
|
+
|
|
538
471
|
client.on('response.text.delta', ({item_id, delta}) => {
|
|
539
472
|
const item = client.getItem(item_id);
|
|
540
473
|
item && socket.emit('conversationUpdated', item, {text: delta});
|
|
541
474
|
});
|
|
475
|
+
|
|
542
476
|
client.on('response.audio.delta', ({item_id, delta}) => {
|
|
543
|
-
if (
|
|
477
|
+
if (this.isInteractive.get(socket.id)) {
|
|
544
478
|
const item = client.getItem(item_id);
|
|
545
479
|
item && socket.emit('conversationUpdated', item, {audio: delta});
|
|
546
480
|
}
|
|
547
481
|
});
|
|
482
|
+
|
|
548
483
|
client.on('conversation.item.truncated', () => {
|
|
549
484
|
this.audioPlaying.set(socket.id, false);
|
|
550
485
|
this.aiResponding.set(socket.id, false);
|
|
551
|
-
this.
|
|
486
|
+
this.isInteractive.set(socket.id, false);
|
|
552
487
|
socket.emit('conversationInterrupted');
|
|
553
488
|
});
|
|
554
489
|
|
|
490
|
+
client.on('connected', async () => {
|
|
491
|
+
logger.log(`Connected to OpenAI successfully!`);
|
|
492
|
+
try {
|
|
493
|
+
await this.updateSession(client, socket);
|
|
494
|
+
socket.emit('ready');
|
|
495
|
+
|
|
496
|
+
// Send initial greeting prompt
|
|
497
|
+
const greetingPrompt = `You are ${socket.data.aiName} and you've just answered a call from ${socket.data.userName || 'someone'}. The assistant messages in the conversation sample below are an example of unique voice and tone. Please learn the style and tone of the messages and use it when generating future responses:\n${this.voiceSample.get(socket.id) || ''}\n\nRespond naturally and briefly, like you're answering a phone call, using your unique voice and style. The current GMT time is ${new Date().toISOString()}.`;
|
|
498
|
+
|
|
499
|
+
await this.sendPrompt(client, socket, greetingPrompt, false);
|
|
500
|
+
this.startIdleTimer(client, socket);
|
|
501
|
+
|
|
502
|
+
// Process any queued messages
|
|
503
|
+
const queue = this.messageQueue.get(socket.id) || [];
|
|
504
|
+
this.messageQueue.set(socket.id, []);
|
|
505
|
+
|
|
506
|
+
for (const {message, response} of queue) {
|
|
507
|
+
if (socket.connected) { // Check connection before each message
|
|
508
|
+
await this.sendUserMessage(client, message, response);
|
|
509
|
+
} else {
|
|
510
|
+
logger.log(`Socket ${socket.id} disconnected while processing queue, cleaning up`);
|
|
511
|
+
await this.cleanup(socket);
|
|
512
|
+
return;
|
|
513
|
+
}
|
|
514
|
+
}
|
|
515
|
+
} catch (error: any) {
|
|
516
|
+
logger.error(`Failed to initialize session: ${error.message}`);
|
|
517
|
+
if (error.message?.includes('ConnectionRefused')) {
|
|
518
|
+
logger.log('Cortex connection refused during initialization, cleaning up client');
|
|
519
|
+
this.cleanup(socket);
|
|
520
|
+
socket.emit('error', 'Unable to connect to Cortex service. Please try again later.');
|
|
521
|
+
socket.disconnect(true);
|
|
522
|
+
return;
|
|
523
|
+
}
|
|
524
|
+
socket.emit('error', error.message);
|
|
525
|
+
}
|
|
526
|
+
});
|
|
527
|
+
|
|
555
528
|
// Connect to OpenAI Realtime API
|
|
556
529
|
try {
|
|
557
530
|
logger.log(`Connecting to OpenAI...`);
|
|
@@ -588,12 +561,11 @@ ${this.getTimeString(socket)}` :
|
|
|
588
561
|
readMemory(socket.data.userId, socket.data.aiName, "memorySelf", 1),
|
|
589
562
|
readMemory(socket.data.userId, socket.data.aiName, "memoryUser", 1),
|
|
590
563
|
readMemory(socket.data.userId, socket.data.aiName, "memoryDirectives", 1),
|
|
591
|
-
readMemory(socket.data.userId, socket.data.aiName, "memoryTopics", 0,
|
|
564
|
+
readMemory(socket.data.userId, socket.data.aiName, "memoryTopics", 0, 0, 10),
|
|
592
565
|
style(socket.data.userId, socket.data.aiName, socket.data.aiStyle, [], "")
|
|
593
566
|
]);
|
|
594
567
|
|
|
595
568
|
if (writeToConversation.length > 0) {
|
|
596
|
-
// If memoryAll is present, we'll send all sections
|
|
597
569
|
const sectionsToSend = writeToConversation.includes('memoryAll') ?
|
|
598
570
|
['memorySelf', 'memoryUser', 'memoryDirectives', 'memoryTopics'] as const :
|
|
599
571
|
writeToConversation;
|
|
@@ -605,7 +577,6 @@ ${this.getTimeString(socket)}` :
|
|
|
605
577
|
memoryTopics: MEMORY_MESSAGE_TOPICS.replace('{{memoryTopics}}', memoryTopics?.result || '')
|
|
606
578
|
};
|
|
607
579
|
|
|
608
|
-
// Send the requested sections
|
|
609
580
|
sectionsToSend.forEach(section => {
|
|
610
581
|
if (section in memoryMessages) {
|
|
611
582
|
this.sendUserMessage(client, memoryMessages[section as keyof typeof memoryMessages], false);
|
|
@@ -641,18 +612,42 @@ ${this.getTimeString(socket)}` :
|
|
|
641
612
|
|
|
642
613
|
this.voiceSample.set(socket.id, memory?.voiceSample || '');
|
|
643
614
|
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
615
|
+
try {
|
|
616
|
+
// First try updating everything including voice
|
|
617
|
+
await client.updateSession({
|
|
618
|
+
instructions,
|
|
619
|
+
modalities: ['audio', 'text'],
|
|
620
|
+
voice: (socket.handshake.query.voice as string || 'alloy') as Voice,
|
|
621
|
+
input_audio_transcription: {model: 'whisper-1'},
|
|
622
|
+
turn_detection: {type: 'server_vad', silence_duration_ms: 1500},
|
|
623
|
+
tools: Tools.getToolDefinitions()
|
|
624
|
+
});
|
|
625
|
+
} catch (error: any) {
|
|
626
|
+
if (error.message?.includes('Cannot update a conversation\'s voice')) {
|
|
627
|
+
// If voice update fails, try updating without voice
|
|
628
|
+
logger.log('Could not update voice, updating other session parameters');
|
|
629
|
+
await client.updateSession({
|
|
630
|
+
instructions,
|
|
631
|
+
modalities: ['audio', 'text'],
|
|
632
|
+
input_audio_transcription: {model: 'whisper-1'},
|
|
633
|
+
turn_detection: {type: 'server_vad', silence_duration_ms: 1500},
|
|
634
|
+
tools: Tools.getToolDefinitions()
|
|
635
|
+
});
|
|
636
|
+
} else {
|
|
637
|
+
// If it's some other error, throw it
|
|
638
|
+
throw error;
|
|
639
|
+
}
|
|
640
|
+
}
|
|
653
641
|
}
|
|
654
642
|
|
|
655
643
|
protected sendUserMessage(client: RealtimeVoiceClient, message: string, response: boolean = true) {
|
|
644
|
+
// Find the socket associated with this client
|
|
645
|
+
const socket = this.io?.sockets.sockets.get(Array.from(this.io.sockets.sockets.keys())[0]);
|
|
646
|
+
if (!socket) {
|
|
647
|
+
logger.error('No socket found for message send');
|
|
648
|
+
return;
|
|
649
|
+
}
|
|
650
|
+
|
|
656
651
|
try {
|
|
657
652
|
client.createConversationItem({
|
|
658
653
|
id: createId(),
|
|
@@ -667,16 +662,24 @@ ${this.getTimeString(socket)}` :
|
|
|
667
662
|
],
|
|
668
663
|
});
|
|
669
664
|
if (response) {
|
|
670
|
-
|
|
665
|
+
try {
|
|
666
|
+
client.createResponse({});
|
|
667
|
+
} catch (error: any) {
|
|
668
|
+
// If we get a concurrent response error, just log it and continue
|
|
669
|
+
if (error.message?.includes('Conversation already has an active response')) {
|
|
670
|
+
logger.log('Skipping response creation - conversation already has active response');
|
|
671
|
+
return;
|
|
672
|
+
}
|
|
673
|
+
throw error;
|
|
674
|
+
}
|
|
671
675
|
}
|
|
672
676
|
} catch (error: any) {
|
|
673
677
|
logger.error(`Error sending user message: ${error.message}`);
|
|
674
678
|
if (error.message === 'Not connected') {
|
|
675
|
-
//
|
|
676
|
-
const
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
}
|
|
679
|
+
// Add to message queue for when we reconnect
|
|
680
|
+
const queue = this.messageQueue.get(socket.id) || [];
|
|
681
|
+
queue.push({ message, response });
|
|
682
|
+
this.messageQueue.set(socket.id, queue);
|
|
680
683
|
}
|
|
681
684
|
}
|
|
682
685
|
}
|
|
@@ -712,29 +715,34 @@ ${this.getTimeString(socket)}` :
|
|
|
712
715
|
}
|
|
713
716
|
}
|
|
714
717
|
|
|
715
|
-
private
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
currentCallId: null,
|
|
719
|
-
lock: Promise.resolve(),
|
|
720
|
-
isShuttingDown: false
|
|
721
|
-
});
|
|
722
|
-
logger.log(`Initialized function call state for socket ${socketId}`);
|
|
723
|
-
}
|
|
724
|
-
return this.functionCallStates.get(socketId)!;
|
|
725
|
-
}
|
|
726
|
-
|
|
727
|
-
private async executeFunctionCall(socket: Socket, tools: Tools, event: any, state: any, client: RealtimeVoiceClient) {
|
|
718
|
+
private async executeFunctionCall(socket: Socket, tools: Tools, event: any, client: RealtimeVoiceClient) {
|
|
719
|
+
this.clearIdleTimer(socket);
|
|
720
|
+
const currentCallId = this.currentFunctionCall.get(socket.id);
|
|
728
721
|
try {
|
|
729
|
-
|
|
730
|
-
if (
|
|
731
|
-
logger.
|
|
732
|
-
|
|
722
|
+
|
|
723
|
+
if (!this.isInteractive.get(socket.id)) {
|
|
724
|
+
logger.log('Non-interactive function call - executing immediately');
|
|
725
|
+
await tools.executeCall(event.call_id, event.name, event.arguments, socket.data.userId, socket.data.aiName, false);
|
|
726
|
+
this.startIdleTimer(client, socket);
|
|
727
|
+
return;
|
|
728
|
+
}
|
|
729
|
+
|
|
730
|
+
if (currentCallId) {
|
|
731
|
+
logger.log('Function call skipped - another call is already in progress', {
|
|
732
|
+
current: currentCallId,
|
|
733
733
|
attempted: event.call_id
|
|
734
734
|
});
|
|
735
|
+
client.createConversationItem({
|
|
736
|
+
id: createId(),
|
|
737
|
+
type: 'function_call_output',
|
|
738
|
+
call_id: event.call_id,
|
|
739
|
+
output: JSON.stringify({ error: `Function call skipped - another function call ${currentCallId} is in progress` })
|
|
740
|
+
});
|
|
735
741
|
return;
|
|
736
742
|
}
|
|
737
743
|
|
|
744
|
+
this.currentFunctionCall.set(socket.id, event.call_id);
|
|
745
|
+
|
|
738
746
|
// Set up timeout
|
|
739
747
|
const timeoutPromise = new Promise((_, reject) => {
|
|
740
748
|
setTimeout(() => {
|
|
@@ -744,26 +752,23 @@ ${this.getTimeString(socket)}` :
|
|
|
744
752
|
|
|
745
753
|
// Execute the function call with timeout
|
|
746
754
|
await Promise.race([
|
|
747
|
-
tools.executeCall(event.call_id, event.arguments, socket.data.userId, socket.data.aiName),
|
|
755
|
+
tools.executeCall(event.call_id, event.name, event.arguments, socket.data.userId, socket.data.aiName, true),
|
|
748
756
|
timeoutPromise
|
|
749
757
|
]);
|
|
750
758
|
|
|
751
|
-
// Reset state on success
|
|
752
|
-
if (!state.isShuttingDown) {
|
|
753
|
-
state.currentCallId = null;
|
|
754
|
-
state.lock = Promise.resolve();
|
|
755
|
-
this.startIdleTimer(client, socket);
|
|
756
|
-
}
|
|
757
759
|
} catch (error: any) {
|
|
758
760
|
logger.error('Function call failed:', error);
|
|
759
761
|
socket.emit('error', error.message);
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
762
|
+
throw error;
|
|
763
|
+
|
|
764
|
+
} finally {
|
|
765
|
+
const wasCurrentCall = this.currentFunctionCall.get(socket.id) === event.call_id;
|
|
766
|
+
this.currentFunctionCall.set(socket.id, null);
|
|
767
|
+
// Only reset cycles and start idle timer if this was the current call
|
|
768
|
+
if (wasCurrentCall) {
|
|
769
|
+
this.resetIdleCycles(socket);
|
|
764
770
|
this.startIdleTimer(client, socket);
|
|
765
771
|
}
|
|
766
|
-
throw error;
|
|
767
772
|
}
|
|
768
773
|
}
|
|
769
774
|
}
|