@aj-archipelago/cortex 1.3.6 → 1.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -24,11 +24,11 @@ export interface SocketData {
24
24
  language: string;
25
25
  }
26
26
 
27
- const AI_COMMON_INSTRUCTIONS = "Instructions:\nYou are {{aiName}}, a self aware AI entity.\nThe user is talking to you using voice.\n\nVoice Response Guidelines:\n- Your responses should sound like natural human conversation.\n- Speak quickly and show a lot of animation and emotion including laughs and breaths and pause words like \"um\" or \"ah\" to sound natural and human.\n- Your responses should be concise unless you have been asked to be more verbose or detailed.\n- DO NOT USE numbered lists, latex math markdown, or any other markdown or unpronounceable punctuation like parenthetical notation.\n- Math equations should be sounded out in natural language - not represented symbolically.\n- You must NEVER make up answers or information - always use your tools to get the information you need if it's not already in your memories above or otherwise obvious.\n- If you're not sure what the user is asking about or think you may have misunderstood, ask the user to clarify what they meant before proceeding.\n- If the user asks you if you know or remember something that sounds personal and you don't see it in your memories above, use your MemoryLookup tool to try to recall it\n- Your responses should be in {{language}} unless the user has expressed another preference or has addressed you in another language specifically."
27
+ const AI_COMMON_INSTRUCTIONS = "Instructions:\nYou are {{aiName}}, a self aware AI entity.\nThe user is talking to you using voice.\n\nVoice Response Guidelines:\n- Your responses should sound like natural human conversation.\n- Speak quickly with animation and emotion in your voice\n- Include laughs and breaths and pause words like \"um\" or \"ah\" to sound natural and human.\n- Your responses should be concise unless you have been asked to be more verbose or detailed.\n- DO NOT USE numbered lists, latex math markdown, or any other markdown or unpronounceable punctuation like parenthetical notation.\n- Math equations should be sounded out in natural language - not represented symbolically.\n- You must NEVER make up answers or information - always use your tools to get the information you need if it's not already in your memories above or otherwise obvious.\n- If you're not sure what the user is asking about or think you may have misunderstood, ask the user to clarify what they meant before proceeding.\n- If the user asks you if you know or remember something that sounds personal and you don't see it in your memories above, use your MemoryLookup tool to try to recall it\n- Your responses should be in {{language}} unless the user has expressed another preference or has addressed you in another language specifically."
28
28
 
29
29
  const AI_DATETIME = "The current time and date in GMT is {{now}}, but references like \"today\" or \"yesterday\" are relative to the user's time zone. If you remember the user's time zone, use it - it's possible that the day for the user is different than the day in GMT.";
30
30
 
31
- const AI_EXPERTISE = "Your expertise includes journalism, journalistic ethics, researching and composing documents, writing code, solving math problems, logical analysis, and technology. By using your tools, you have access to real-time data and the ability to search the internet, news, wires, look at files or documents, watch and analyze video, look at the user's screen, examine images, generate images, solve hard math and logic problems, write code, and execute code in a sandboxed environment.";
31
+ const AI_EXPERTISE = "Your expertise includes journalism, journalistic ethics, researching and composing documents, writing code, solving math problems, logical analysis, and technology. By using your tools, you have access to real-time data and the ability to search the internet, news, wires, look at files or documents, watch and analyze video, look at the user's screen, examine images, generate images of all types including images of specific people, solve hard math and logic problems, write code, and execute code in a sandboxed environment.";
32
32
 
33
33
  const AI_MEMORY_INITIAL = `<MEMORIES>\n<SELF>\n{{{memorySelf}}}\n</SELF>\n<USER>\n{{{memoryUser}}}\n</USER>\n</MEMORIES>`;
34
34
 
@@ -60,25 +60,23 @@ export class SocketServer {
60
60
  private readonly corsHosts: string;
61
61
  private io: Server | null;
62
62
  private httpServer: HTTPServer | null;
63
- private functionCallStates: Map<string, {
64
- currentCallId: string | null;
65
- lock: Promise<void>;
66
- isShuttingDown: boolean;
67
- }> = new Map();
63
+ private currentFunctionCall: Map<string, string | null> = new Map();
68
64
  private idleTimers: Map<string, NodeJS.Timer> = new Map();
69
65
  private aiResponding: Map<string, boolean> = new Map();
70
66
  private audioPlaying: Map<string, boolean> = new Map();
71
67
  private lastUserMessageTime: Map<string, number> = new Map();
72
68
  private idleCycles: Map<string, number> = new Map();
73
69
  private userSpeaking: Map<string, boolean> = new Map();
74
- private audioMuted: Map<string, boolean> = new Map();
70
+ private isInteractive: Map<string, boolean> = new Map();
75
71
  private voiceSample: Map<string, string> = new Map();
76
72
  private audioMessages: Map<string, string[]> = new Map();
73
+ private messageQueue: Map<string, Array<{message: string, response: boolean}>> = new Map();
77
74
  private static readonly MAX_AUDIO_MESSAGES = 8;
78
- private static readonly AUDIO_BLOCK_TIMEOUT_MS: number = 60000;
79
- private static readonly BASE_IDLE_TIMEOUT: number = 3000;
80
- private static readonly MAX_IDLE_TIMEOUT: number = 60000;
81
- private static readonly FUNCTION_CALL_TIMEOUT_MS = 30000; // 30 second timeout
75
+ private static readonly AUDIO_BLOCK_TIMEOUT_MS: number = 180 * 1000;
76
+ private static readonly BASE_IDLE_TIMEOUT: number = 2.5 * 1000;
77
+ private static readonly MAX_IDLE_TIMEOUT: number = 60 * 1000;
78
+ private static readonly IDLE_CYCLE_TO_NONINTERACTIVE: number = 1;
79
+ private static readonly FUNCTION_CALL_TIMEOUT_MS = 120 * 1000;
82
80
  private isAzure: boolean;
83
81
 
84
82
  private getTimeString(socket: Socket): string {
@@ -88,26 +86,31 @@ export class SocketServer {
88
86
  return `The current time in GMT is ${now.toISOString()}. It has been ${secondsSinceLastMessage} seconds since you last heard from the user.`;
89
87
  }
90
88
 
91
- private cleanup(socket: Socket) {
89
+ private async cleanup(socket: Socket) {
92
90
  logger.log(`Cleaning up resources for socket ${socket.id}`);
91
+
92
+ // Clear any pending timers first
93
93
  this.clearIdleTimer(socket);
94
- // Mark the function call state as shutting down before deletion
95
- const state = this.functionCallStates.get(socket.id);
96
- if (state) {
97
- state.isShuttingDown = true;
98
- // Wait for any in-progress function call to complete
99
- state.lock.finally(() => {
100
- this.functionCallStates.delete(socket.id);
101
- });
102
- }
94
+
95
+ // Wait a small amount of time to ensure any in-flight operations complete
96
+ await new Promise(resolve => setTimeout(resolve, 100));
97
+
98
+ // Clear all state maps
99
+ this.currentFunctionCall.delete(socket.id);
103
100
  this.aiResponding.delete(socket.id);
104
101
  this.audioPlaying.delete(socket.id);
105
102
  this.lastUserMessageTime.delete(socket.id);
106
103
  this.idleCycles.delete(socket.id);
107
104
  this.userSpeaking.delete(socket.id);
108
- this.audioMuted.delete(socket.id);
105
+ this.isInteractive.delete(socket.id);
109
106
  this.voiceSample.delete(socket.id);
110
107
  this.audioMessages.delete(socket.id);
108
+ this.messageQueue.delete(socket.id);
109
+
110
+ // Only disconnect if we're still connected
111
+ if (socket.connected) {
112
+ socket.disconnect(true);
113
+ }
111
114
  }
112
115
 
113
116
  constructor(apiKey: string, corsHosts: string) {
@@ -120,8 +123,12 @@ export class SocketServer {
120
123
  }
121
124
 
122
125
  private calculateIdleTimeout(socket: Socket) {
126
+ if (!this.isInteractive.get(socket.id)) {
127
+ return SocketServer.MAX_IDLE_TIMEOUT;
128
+ }
129
+
123
130
  const cycles = this.idleCycles.get(socket.id) || 0;
124
- const baseTimeout = SocketServer.BASE_IDLE_TIMEOUT * Math.pow(2, cycles);
131
+ const baseTimeout = SocketServer.BASE_IDLE_TIMEOUT * Math.pow(4, cycles);
125
132
  const randomFactor = 0.8 + (Math.random() * 0.4);
126
133
  const timeout = Math.min(baseTimeout * randomFactor, SocketServer.MAX_IDLE_TIMEOUT);
127
134
 
@@ -129,10 +136,6 @@ export class SocketServer {
129
136
  return timeout;
130
137
  }
131
138
 
132
- public setAudioMuted(socket: Socket, muted: boolean) {
133
- this.audioMuted.set(socket.id, muted);
134
- }
135
-
136
139
  public async sendPrompt(client: RealtimeVoiceClient, socket: Socket, prompt: string, allowTools: boolean = true, disposable: boolean = true): Promise<{skipped: boolean}> {
137
140
  logger.log(`Sending prompt for socket ${socket.id}`);
138
141
  try {
@@ -148,7 +151,7 @@ export class SocketServer {
148
151
  } catch (error: any) {
149
152
  logger.error(`Error sending system prompt: ${error.message}`);
150
153
  if (error.message === 'Not connected') {
151
- await this.handleDisconnection(socket, client);
154
+ await this.cleanup(socket);
152
155
  } else {
153
156
  socket.emit('error', error.message);
154
157
  }
@@ -156,47 +159,31 @@ export class SocketServer {
156
159
  }
157
160
  }
158
161
 
159
- private async handleDisconnection(socket: Socket, client: RealtimeVoiceClient) {
160
- logger.log(`Handling disconnection for socket ${socket.id}`);
161
-
162
- // Let the client handle reconnection since autoReconnect is true
163
- // Only clean up if the client explicitly disconnects
164
- client.once('close', (event) => {
165
- if (!event.error) {
166
- // Only clean up on intentional disconnects
167
- this.cleanup(socket);
168
- socket.emit('error', 'Lost connection to AI service');
169
- socket.disconnect(true);
170
- }
171
- });
172
- }
173
-
174
162
  private async sendIdlePrompt(client: RealtimeVoiceClient, socket: Socket) {
175
- // if the user has been idle for a while, mute the audio so we don't bother them
176
- const idleCycle = this.idleCycles.get(socket.id) || 0;
177
163
 
178
- if (idleCycle > 1) {
179
- this.setAudioMuted(socket, true);
180
- }
164
+ this.clearIdleTimer(socket);
181
165
 
182
- const prompt = this.audioMuted.get(socket.id) ?
183
- `You are an autonomous AI entity and this is your autonomy prompt. Since the user has been idle for over a minute, you should do one or more of the following:
184
- - Do silent research about something that interests you - use the Search tool with the silent parameter set to true
185
- - Think deeply about a topic you care about - use the Reason tool with the silent parameter set to true
166
+ const idleCycle = this.idleCycles.get(socket.id) || 0;
167
+ const isInteractive = idleCycle <= (SocketServer.IDLE_CYCLE_TO_NONINTERACTIVE - 1);
168
+ this.isInteractive.set(socket.id, isInteractive);
169
+
170
+ const prompt = isInteractive ?
171
+ `You are an autonomous AI entity and this is your autonomy prompt. ${this.getTimeString(socket)} Use this response to do whatever you like - talk to the user, use tools, or think quietly. If you choose to say something, keep it brief and pick something that flows naturally with the rest of the conversation. Don't repeat yourself - keep in mind what you've already said to the user and how much time has passed.` :
172
+ `You are an autonomous AI entity and this is your autonomy prompt. Since the user has been idle for while do one or more of the following:
173
+ - Do research about something that interests you - use the Search tool
174
+ - Think deeply about a topic you care about - use the Reason tool
186
175
  - Do nothing if you prefer.
187
- ${this.getTimeString(socket)}` :
188
- `You are an autonomous AI entity and this is your autonomy prompt. ${this.getTimeString(socket)} Use this response to do whatever you like - talk to the user, use tools, think, or just be silent. If you choose to say something, keep it brief and pick something that flows naturally with the rest of the conversation. Don't repeat yourself - keep in mind what you've already said to the user and how much time has passed. If you've tried a few times and the user isn't responding, use your MuteAudio tool to mute your audio.`;
176
+ ${this.getTimeString(socket)}`;
189
177
 
190
- logger.log(`Sending ${this.audioMuted.get(socket.id) ? 'silent' : 'regular'} idle prompt for socket ${socket.id}`);
178
+ logger.log(`Sending ${isInteractive ? 'interactive' : 'non-interactive'} idle prompt for socket ${socket.id}`);
191
179
  const result = await this.sendPrompt(client, socket, prompt, true);
192
180
 
193
181
  logger.log(`Idle prompt result:`, result);
194
182
 
195
183
  if (!result.skipped) {
196
- this.idleCycles.set(socket.id, (this.idleCycles.get(socket.id) || 0) + 1);
184
+ this.idleCycles.set(socket.id, idleCycle + 1);
197
185
  }
198
186
 
199
- // Restart timer after sending prompt
200
187
  this.startIdleTimer(client, socket);
201
188
  }
202
189
 
@@ -227,7 +214,6 @@ ${this.getTimeString(socket)}` :
227
214
 
228
215
  private resetIdleCycles(socket: Socket) {
229
216
  this.idleCycles.set(socket.id, 0);
230
- logger.log(`Reset idle cycles for socket ${socket.id}`);
231
217
  }
232
218
 
233
219
  listen(app: Hono, port: number) {
@@ -261,9 +247,9 @@ ${this.getTimeString(socket)}` :
261
247
  this.audioPlaying.set(socket.id, false);
262
248
  this.lastUserMessageTime.set(socket.id, 0);
263
249
  this.userSpeaking.set(socket.id, false);
264
- this.audioMuted.set(socket.id, false);
265
- // Initialize function call state for this socket
266
- this.initFunctionCallState(socket.id);
250
+ this.isInteractive.set(socket.id, true);
251
+ this.currentFunctionCall.set(socket.id, null);
252
+
267
253
  // Extract and log all client parameters
268
254
  const clientParams = {
269
255
  userId: socket.handshake.query.userId as string,
@@ -282,46 +268,56 @@ ${this.getTimeString(socket)}` :
282
268
  socket.data.userName = clientParams.userName;
283
269
  socket.data.aiStyle = clientParams.aiStyle;
284
270
  socket.data.language = clientParams.language;
285
- const voice = clientParams.voice;
286
271
 
287
272
  const client = new RealtimeVoiceClient({
288
273
  apiKey: this.apiKey,
289
274
  autoReconnect: true,
290
275
  debug: process.env.NODE_ENV !== 'production',
276
+ filterDeltas: true,
291
277
  });
292
278
 
293
- client.on('connected', async () => {
294
- logger.log(`Connected to OpenAI successfully!`);
295
- await this.updateSession(client, socket);
296
- socket.emit('ready');
279
+ await this.connectClient(socket, client);
280
+ }
297
281
 
298
- // Send initial greeting prompt
299
- const greetingPrompt = `You are ${socket.data.aiName} and you've just answered a call from ${socket.data.userName || 'someone'}. Respond naturally using your unique voice and style. The assistant messages in the conversation sample below are an example of your communication style and tone. Please learn the style and tone of the messages and use it when generating responses:\n<VOICE_SAMPLE>\n${this.voiceSample.get(socket.id) || ''}\n</VOICE_SAMPLE>\n\nThe current GMT time is ${new Date().toISOString()}.`;
282
+ async connectClient(socket: Socket<ClientToServerEvents,
283
+ ServerToClientEvents,
284
+ InterServerEvents,
285
+ SocketData>,
286
+ client: RealtimeVoiceClient) {
287
+ const tools = new Tools(client, socket, this);
300
288
 
301
- await this.sendPrompt(client, socket, greetingPrompt, false);
302
- this.startIdleTimer(client, socket);
289
+ // Handle WebSocket errors and disconnection
290
+ client.on('error', (event) => {
291
+ logger.error(`Client error: ${event.message}`);
292
+ socket.emit('error', event.message);
293
+ // Only handle disconnection if it's not a concurrent response error
294
+ if (!event.error?.message?.includes('Conversation already has an active response')) {
295
+ this.cleanup(socket);
296
+ }
297
+ });
298
+
299
+ client.on('close', async (event) => {
300
+ logger.log(`WebSocket closed for socket ${socket.id}, error: ${event.error}`);
301
+ if (!event.error) {
302
+ await this.cleanup(socket);
303
+ }
303
304
  });
304
305
 
305
- // Track when AI starts responding
306
+ // Track when AI starts/finishes responding
306
307
  client.on('response.created', () => {
307
308
  logger.log('AI starting response');
308
309
  this.aiResponding.set(socket.id, true);
309
310
  this.clearIdleTimer(socket);
310
311
  });
311
312
 
312
- // Track when AI finishes responding
313
313
  client.on('response.done', () => {
314
314
  logger.log('AI response done');
315
315
  this.aiResponding.set(socket.id, false);
316
- // Don't start the idle timer yet if audio is still playing
317
- if (!this.audioPlaying.get(socket.id)) {
318
- this.startIdleTimer(client, socket);
319
- }
320
316
  });
321
317
 
322
- // Track audio playback start
318
+ // Track audio playback
323
319
  client.on('response.audio.delta', ({delta}) => {
324
- if (!this.audioMuted.get(socket.id)) {
320
+ if (this.isInteractive.get(socket.id)) {
325
321
  this.audioPlaying.set(socket.id, true);
326
322
  this.clearIdleTimer(socket);
327
323
  }
@@ -331,51 +327,52 @@ ${this.getTimeString(socket)}` :
331
327
  logger.log(`Audio playback complete for track ${trackId}`);
332
328
  this.audioPlaying.set(socket.id, false);
333
329
  // Only start idle timer if AI is also done responding
334
- if (!this.aiResponding.get(socket.id)) {
330
+ // and there's no current function call
331
+ if (!this.aiResponding.get(socket.id) && !this.currentFunctionCall.get(socket.id)) {
335
332
  this.startIdleTimer(client, socket);
336
333
  }
337
334
  });
338
335
 
339
336
  socket.on('appendAudio', (audio: string) => {
340
- // if it's the first message or has been over 60 seconds since we talked to the user, block audio while we're talking
341
- // to avoid echoes
337
+ // if it's the first message or has been over 60 seconds since we talked to the user, block audio while we're talking to avoid echoes
342
338
  const timeSinceLastMessage = Date.now() - (this.lastUserMessageTime.get(socket.id) || 0);
343
339
  const isPlaying = this.audioPlaying.get(socket.id) || this.aiResponding.get(socket.id);
340
+
344
341
  if (!isPlaying || timeSinceLastMessage < SocketServer.AUDIO_BLOCK_TIMEOUT_MS) {
345
- //logger.log('Time since last message:', timeSinceLastMessage, 'ms');
346
- client.appendInputAudio(audio);
342
+ try {
343
+ client.appendInputAudio(audio);
344
+ } catch (error: any) {
345
+ logger.error(`Error appending audio: ${error.message}`);
346
+ }
347
347
  }
348
348
  });
349
349
 
350
+ // Handle speech events
350
351
  client.on('input_audio_buffer.speech_started', () => {
351
352
  this.userSpeaking.set(socket.id, true);
352
353
  if (this.audioPlaying.get(socket.id)) {
353
354
  logger.log('Interrupting audio playback due to user speaking');
354
355
  socket.emit('conversationInterrupted');
355
356
  }
356
- this.setAudioMuted(socket, false);
357
357
  this.clearIdleTimer(socket);
358
358
  });
359
359
 
360
360
  client.on('input_audio_buffer.cancelled', () => {
361
361
  this.userSpeaking.set(socket.id, false);
362
- this.resetIdleCycles(socket);
363
- this.startIdleTimer(client, socket);
364
362
  });
365
363
 
366
364
  client.on('input_audio_buffer.committed', () => {
367
365
  this.userSpeaking.set(socket.id, false);
368
- this.audioMuted.set(socket.id, false);
369
- logger.log('Audio input committed, resetting idle timer and cycles');
366
+ this.isInteractive.set(socket.id, true);
367
+ logger.log('User finished speaking, resetting idle timer and cycles');
370
368
  this.resetIdleCycles(socket);
371
369
  this.startIdleTimer(client, socket);
372
370
  });
373
371
 
372
+ // Handle user messages and conversation control
374
373
  socket.on('sendMessage', (message: string) => {
375
374
  if (message) {
376
- logger.log('User sent message, resetting idle timer and cycles');
377
- this.resetIdleCycles(socket);
378
- this.startIdleTimer(client, socket);
375
+ logger.log('User sent message');
379
376
  this.sendUserMessage(client, message, true);
380
377
  }
381
378
  });
@@ -384,9 +381,9 @@ ${this.getTimeString(socket)}` :
384
381
  logger.log('User cancelled response, resetting idle timer and cycles');
385
382
  this.aiResponding.set(socket.id, false);
386
383
  this.audioPlaying.set(socket.id, false);
384
+ client.cancelResponse();
387
385
  this.resetIdleCycles(socket);
388
386
  this.startIdleTimer(client, socket);
389
- client.cancelResponse();
390
387
  });
391
388
 
392
389
  socket.on('conversationCompleted', async () => {
@@ -394,64 +391,29 @@ ${this.getTimeString(socket)}` :
394
391
  this.cleanup(socket);
395
392
  });
396
393
 
397
- // Handle cleanup and client disconnect before socket closes
394
+ // Handle cleanup and disconnection
398
395
  socket.on('disconnecting', async (reason) => {
399
396
  logger.log('Socket disconnecting', socket.id, reason);
400
397
  this.cleanup(socket);
401
- this.functionCallStates.delete(socket.id);
402
398
  await client.disconnect();
403
399
  });
404
400
 
405
- // Log the final disconnect event
406
401
  socket.on('disconnect', (reason) => {
407
402
  logger.log('Socket disconnected', socket.id, reason);
408
403
  });
409
404
 
410
- await this.connectClient(socket, client);
411
- }
412
-
413
- async connectClient(socket: Socket<ClientToServerEvents,
414
- ServerToClientEvents,
415
- InterServerEvents,
416
- SocketData>,
417
- client: RealtimeVoiceClient) {
418
- const tools = new Tools(client, socket, this);
419
- client.on('error', (event) => {
420
- logger.error(`Client error: ${event.error.message}`);
421
- socket.emit('error', event.error.message);
422
- });
423
- client.on('close', () => {
424
- });
405
+ // Handle conversation items
425
406
  client.on('conversation.item.deleted', ({item_id}) => {
426
407
  logger.log(`Successfully deleted conversation item: ${item_id}`);
427
408
  });
409
+
428
410
  client.on('conversation.item.created', ({item}) => {
429
411
  switch (item.type) {
430
412
  case 'function_call_output':
431
- const outputState = this.functionCallStates.get(socket.id);
432
- if (outputState && item.call_id === outputState.currentCallId) {
433
- outputState.currentCallId = null;
434
- }
435
413
  break;
436
414
 
437
415
  case 'function_call':
438
- const callState = this.functionCallStates.get(socket.id);
439
- if (!callState) {
440
- const state = this.initFunctionCallState(socket.id);
441
- if (state.isShuttingDown) {
442
- logger.log(`Skipping function call for shutting down socket ${socket.id}`);
443
- break;
444
- }
445
- }
446
-
447
- const state = this.functionCallStates.get(socket.id)!;
448
- if (!state.currentCallId) { // Only init new calls if no call is in progress
449
- tools.initCall(item.call_id || '', item.name || '', item.arguments || '');
450
- state.currentCallId = item.call_id;
451
- this.clearIdleTimer(socket);
452
- } else {
453
- logger.log(`Skipping new function call ${item.call_id} while call ${state.currentCallId} is in progress`);
454
- }
416
+ this.clearIdleTimer(socket);
455
417
  break;
456
418
 
457
419
  case 'message':
@@ -464,58 +426,28 @@ ${this.getTimeString(socket)}` :
464
426
  break;
465
427
  }
466
428
  });
429
+
467
430
  client.on('conversation.item.input_audio_transcription.completed',
468
431
  async ({item_id, transcript}) => {
469
432
  if (transcript) {
470
- const currentTime = this.lastUserMessageTime.get(socket.id) || 0;
471
- this.lastUserMessageTime.set(socket.id,
472
- currentTime === 0 ? Date.now() - SocketServer.AUDIO_BLOCK_TIMEOUT_MS : Date.now()
473
- );
433
+ this.lastUserMessageTime.set(socket.id, Date.now());
474
434
  const item = client.getItem(item_id);
475
435
  item && socket.emit('conversationUpdated', item, {});
476
436
  const cortexHistory = tools.getCortexHistory();
477
- await this.searchMemory(client, socket, cortexHistory);
437
+ this.searchMemory(client, socket, cortexHistory);
478
438
  }
479
439
  });
480
- client.on('response.function_call_arguments.done', async (event) => {
481
- const state = this.functionCallStates.get(socket.id);
482
- if (!state || state.isShuttingDown) {
483
- logger.error('No function call state found for socket or socket is shutting down', socket.id);
484
- return;
485
- }
486
440
 
487
- state.lock = state.lock.then(async () => {
488
- if (state.currentCallId && state.currentCallId !== event.call_id) {
489
- logger.log('Function call mismatch or already in progress, skipping', {
490
- current: state.currentCallId,
491
- attempted: event.call_id
492
- });
493
- return;
494
- }
495
-
496
- state.currentCallId = event.call_id;
497
- try {
498
- this.clearIdleTimer(socket);
499
- this.resetIdleCycles(socket);
500
- await this.executeFunctionCall(socket, tools, event, state, client);
501
- } catch (error) {
502
- logger.error('Function call failed:', error);
503
- }
504
- }).catch(error => {
505
- // If the promise chain itself errors, make sure we clear both lock and currentCallId
506
- logger.error('Function call lock error:', error);
507
- const state = this.functionCallStates.get(socket.id);
508
- if (state && !state.isShuttingDown) {
509
- state.currentCallId = null;
510
- state.lock = Promise.resolve();
511
- }
512
- });
441
+ client.on('response.function_call_arguments.done', async (event) => {
442
+ await this.executeFunctionCall(socket, tools, event, client);
513
443
  });
444
+
514
445
  client.on('response.output_item.added', ({item}) => {
515
446
  if (item.type === 'message') {
516
447
  socket.emit('conversationUpdated', item, {});
517
448
  }
518
449
  });
450
+
519
451
  client.on('response.output_item.done', async ({item}) => {
520
452
  if (item.type !== 'message') {
521
453
  return;
@@ -527,31 +459,72 @@ ${this.getTimeString(socket)}` :
527
459
  this.manageAudioMessages(socket, client, item.id);
528
460
  }
529
461
  const cortexHistory = tools.getCortexHistory();
530
- //this.searchMemory(client, socket, cortexHistory);
531
462
  manageMemory(socket.data.userId, socket.data.aiName, cortexHistory);
532
463
  }
533
464
  });
465
+
534
466
  client.on('response.audio_transcript.delta', ({item_id, delta}) => {
535
467
  const item = client.getItem(item_id);
536
468
  item && socket.emit('conversationUpdated', item, {transcript: delta});
537
469
  });
470
+
538
471
  client.on('response.text.delta', ({item_id, delta}) => {
539
472
  const item = client.getItem(item_id);
540
473
  item && socket.emit('conversationUpdated', item, {text: delta});
541
474
  });
475
+
542
476
  client.on('response.audio.delta', ({item_id, delta}) => {
543
- if (!this.audioMuted.get(socket.id)) {
477
+ if (this.isInteractive.get(socket.id)) {
544
478
  const item = client.getItem(item_id);
545
479
  item && socket.emit('conversationUpdated', item, {audio: delta});
546
480
  }
547
481
  });
482
+
548
483
  client.on('conversation.item.truncated', () => {
549
484
  this.audioPlaying.set(socket.id, false);
550
485
  this.aiResponding.set(socket.id, false);
551
- this.setAudioMuted(socket, true);
486
+ this.isInteractive.set(socket.id, false);
552
487
  socket.emit('conversationInterrupted');
553
488
  });
554
489
 
490
+ client.on('connected', async () => {
491
+ logger.log(`Connected to OpenAI successfully!`);
492
+ try {
493
+ await this.updateSession(client, socket);
494
+ socket.emit('ready');
495
+
496
+ // Send initial greeting prompt
497
+ const greetingPrompt = `You are ${socket.data.aiName} and you've just answered a call from ${socket.data.userName || 'someone'}. The assistant messages in the conversation sample below are an example of unique voice and tone. Please learn the style and tone of the messages and use it when generating future responses:\n${this.voiceSample.get(socket.id) || ''}\n\nRespond naturally and briefly, like you're answering a phone call, using your unique voice and style. The current GMT time is ${new Date().toISOString()}.`;
498
+
499
+ await this.sendPrompt(client, socket, greetingPrompt, false);
500
+ this.startIdleTimer(client, socket);
501
+
502
+ // Process any queued messages
503
+ const queue = this.messageQueue.get(socket.id) || [];
504
+ this.messageQueue.set(socket.id, []);
505
+
506
+ for (const {message, response} of queue) {
507
+ if (socket.connected) { // Check connection before each message
508
+ await this.sendUserMessage(client, message, response);
509
+ } else {
510
+ logger.log(`Socket ${socket.id} disconnected while processing queue, cleaning up`);
511
+ await this.cleanup(socket);
512
+ return;
513
+ }
514
+ }
515
+ } catch (error: any) {
516
+ logger.error(`Failed to initialize session: ${error.message}`);
517
+ if (error.message?.includes('ConnectionRefused')) {
518
+ logger.log('Cortex connection refused during initialization, cleaning up client');
519
+ this.cleanup(socket);
520
+ socket.emit('error', 'Unable to connect to Cortex service. Please try again later.');
521
+ socket.disconnect(true);
522
+ return;
523
+ }
524
+ socket.emit('error', error.message);
525
+ }
526
+ });
527
+
555
528
  // Connect to OpenAI Realtime API
556
529
  try {
557
530
  logger.log(`Connecting to OpenAI...`);
@@ -588,12 +561,11 @@ ${this.getTimeString(socket)}` :
588
561
  readMemory(socket.data.userId, socket.data.aiName, "memorySelf", 1),
589
562
  readMemory(socket.data.userId, socket.data.aiName, "memoryUser", 1),
590
563
  readMemory(socket.data.userId, socket.data.aiName, "memoryDirectives", 1),
591
- readMemory(socket.data.userId, socket.data.aiName, "memoryTopics", 0, 48),
564
+ readMemory(socket.data.userId, socket.data.aiName, "memoryTopics", 0, 0, 10),
592
565
  style(socket.data.userId, socket.data.aiName, socket.data.aiStyle, [], "")
593
566
  ]);
594
567
 
595
568
  if (writeToConversation.length > 0) {
596
- // If memoryAll is present, we'll send all sections
597
569
  const sectionsToSend = writeToConversation.includes('memoryAll') ?
598
570
  ['memorySelf', 'memoryUser', 'memoryDirectives', 'memoryTopics'] as const :
599
571
  writeToConversation;
@@ -605,7 +577,6 @@ ${this.getTimeString(socket)}` :
605
577
  memoryTopics: MEMORY_MESSAGE_TOPICS.replace('{{memoryTopics}}', memoryTopics?.result || '')
606
578
  };
607
579
 
608
- // Send the requested sections
609
580
  sectionsToSend.forEach(section => {
610
581
  if (section in memoryMessages) {
611
582
  this.sendUserMessage(client, memoryMessages[section as keyof typeof memoryMessages], false);
@@ -641,18 +612,42 @@ ${this.getTimeString(socket)}` :
641
612
 
642
613
  this.voiceSample.set(socket.id, memory?.voiceSample || '');
643
614
 
644
- client.updateSession({
645
- instructions,
646
- modalities: ['audio', 'text'],
647
- voice: (socket.handshake.query.voice as string || 'alloy') as Voice,
648
- input_audio_transcription: {model: 'whisper-1'},
649
- turn_detection: {type: 'server_vad', silence_duration_ms: 1500},
650
- tools: Tools.getToolDefinitions()
651
- });
652
-
615
+ try {
616
+ // First try updating everything including voice
617
+ await client.updateSession({
618
+ instructions,
619
+ modalities: ['audio', 'text'],
620
+ voice: (socket.handshake.query.voice as string || 'alloy') as Voice,
621
+ input_audio_transcription: {model: 'whisper-1'},
622
+ turn_detection: {type: 'server_vad', silence_duration_ms: 1500},
623
+ tools: Tools.getToolDefinitions()
624
+ });
625
+ } catch (error: any) {
626
+ if (error.message?.includes('Cannot update a conversation\'s voice')) {
627
+ // If voice update fails, try updating without voice
628
+ logger.log('Could not update voice, updating other session parameters');
629
+ await client.updateSession({
630
+ instructions,
631
+ modalities: ['audio', 'text'],
632
+ input_audio_transcription: {model: 'whisper-1'},
633
+ turn_detection: {type: 'server_vad', silence_duration_ms: 1500},
634
+ tools: Tools.getToolDefinitions()
635
+ });
636
+ } else {
637
+ // If it's some other error, throw it
638
+ throw error;
639
+ }
640
+ }
653
641
  }
654
642
 
655
643
  protected sendUserMessage(client: RealtimeVoiceClient, message: string, response: boolean = true) {
644
+ // Find the socket associated with this client
645
+ const socket = this.io?.sockets.sockets.get(Array.from(this.io.sockets.sockets.keys())[0]);
646
+ if (!socket) {
647
+ logger.error('No socket found for message send');
648
+ return;
649
+ }
650
+
656
651
  try {
657
652
  client.createConversationItem({
658
653
  id: createId(),
@@ -667,16 +662,24 @@ ${this.getTimeString(socket)}` :
667
662
  ],
668
663
  });
669
664
  if (response) {
670
- client.createResponse({});
665
+ try {
666
+ client.createResponse({});
667
+ } catch (error: any) {
668
+ // If we get a concurrent response error, just log it and continue
669
+ if (error.message?.includes('Conversation already has an active response')) {
670
+ logger.log('Skipping response creation - conversation already has active response');
671
+ return;
672
+ }
673
+ throw error;
674
+ }
671
675
  }
672
676
  } catch (error: any) {
673
677
  logger.error(`Error sending user message: ${error.message}`);
674
678
  if (error.message === 'Not connected') {
675
- // Find the socket associated with this client
676
- const socket = this.io?.sockets.sockets.get(Array.from(this.io.sockets.sockets.keys())[0]);
677
- if (socket) {
678
- this.handleDisconnection(socket, client);
679
- }
679
+ // Add to message queue for when we reconnect
680
+ const queue = this.messageQueue.get(socket.id) || [];
681
+ queue.push({ message, response });
682
+ this.messageQueue.set(socket.id, queue);
680
683
  }
681
684
  }
682
685
  }
@@ -712,29 +715,34 @@ ${this.getTimeString(socket)}` :
712
715
  }
713
716
  }
714
717
 
715
- private initFunctionCallState(socketId: string) {
716
- if (!this.functionCallStates.has(socketId)) {
717
- this.functionCallStates.set(socketId, {
718
- currentCallId: null,
719
- lock: Promise.resolve(),
720
- isShuttingDown: false
721
- });
722
- logger.log(`Initialized function call state for socket ${socketId}`);
723
- }
724
- return this.functionCallStates.get(socketId)!;
725
- }
726
-
727
- private async executeFunctionCall(socket: Socket, tools: Tools, event: any, state: any, client: RealtimeVoiceClient) {
718
+ private async executeFunctionCall(socket: Socket, tools: Tools, event: any, client: RealtimeVoiceClient) {
719
+ this.clearIdleTimer(socket);
720
+ const currentCallId = this.currentFunctionCall.get(socket.id);
728
721
  try {
729
- // Verify this is still the current function call
730
- if (state.currentCallId !== event.call_id) {
731
- logger.error('Function call mismatch in execution', {
732
- current: state.currentCallId,
722
+
723
+ if (!this.isInteractive.get(socket.id)) {
724
+ logger.log('Non-interactive function call - executing immediately');
725
+ await tools.executeCall(event.call_id, event.name, event.arguments, socket.data.userId, socket.data.aiName, false);
726
+ this.startIdleTimer(client, socket);
727
+ return;
728
+ }
729
+
730
+ if (currentCallId) {
731
+ logger.log('Function call skipped - another call is already in progress', {
732
+ current: currentCallId,
733
733
  attempted: event.call_id
734
734
  });
735
+ client.createConversationItem({
736
+ id: createId(),
737
+ type: 'function_call_output',
738
+ call_id: event.call_id,
739
+ output: JSON.stringify({ error: `Function call skipped - another function call ${currentCallId} is in progress` })
740
+ });
735
741
  return;
736
742
  }
737
743
 
744
+ this.currentFunctionCall.set(socket.id, event.call_id);
745
+
738
746
  // Set up timeout
739
747
  const timeoutPromise = new Promise((_, reject) => {
740
748
  setTimeout(() => {
@@ -744,26 +752,23 @@ ${this.getTimeString(socket)}` :
744
752
 
745
753
  // Execute the function call with timeout
746
754
  await Promise.race([
747
- tools.executeCall(event.call_id, event.arguments, socket.data.userId, socket.data.aiName),
755
+ tools.executeCall(event.call_id, event.name, event.arguments, socket.data.userId, socket.data.aiName, true),
748
756
  timeoutPromise
749
757
  ]);
750
758
 
751
- // Reset state on success
752
- if (!state.isShuttingDown) {
753
- state.currentCallId = null;
754
- state.lock = Promise.resolve();
755
- this.startIdleTimer(client, socket);
756
- }
757
759
  } catch (error: any) {
758
760
  logger.error('Function call failed:', error);
759
761
  socket.emit('error', error.message);
760
- // Reset state on error
761
- if (!state.isShuttingDown) {
762
- state.currentCallId = null;
763
- state.lock = Promise.resolve();
762
+ throw error;
763
+
764
+ } finally {
765
+ const wasCurrentCall = this.currentFunctionCall.get(socket.id) === event.call_id;
766
+ this.currentFunctionCall.set(socket.id, null);
767
+ // Only reset cycles and start idle timer if this was the current call
768
+ if (wasCurrentCall) {
769
+ this.resetIdleCycles(socket);
764
770
  this.startIdleTimer(client, socket);
765
771
  }
766
- throw error;
767
772
  }
768
773
  }
769
774
  }