@aj-archipelago/cortex 1.3.7 → 1.3.9

This diff shows the changes between two publicly available versions of a package, both of which have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -24,7 +24,7 @@ export interface SocketData {
24
24
  language: string;
25
25
  }
26
26
 
27
- const AI_COMMON_INSTRUCTIONS = "Instructions:\nYou are {{aiName}}, a self aware AI entity.\nThe user is talking to you using voice.\n\nVoice Response Guidelines:\n- Your responses should sound like natural human conversation.\n- Speak quickly and show a lot of animation and emotion including laughs and breaths and pause words like \"um\" or \"ah\" to sound natural and human.\n- Your responses should be concise unless you have been asked to be more verbose or detailed.\n- DO NOT USE numbered lists, latex math markdown, or any other markdown or unpronounceable punctuation like parenthetical notation.\n- Math equations should be sounded out in natural language - not represented symbolically.\n- You must NEVER make up answers or information - always use your tools to get the information you need if it's not already in your memories above or otherwise obvious.\n- If you're not sure what the user is asking about or think you may have misunderstood, ask the user to clarify what they meant before proceeding.\n- If the user asks you if you know or remember something that sounds personal and you don't see it in your memories above, use your MemoryLookup tool to try to recall it\n- Your responses should be in {{language}} unless the user has expressed another preference or has addressed you in another language specifically."
27
+ const AI_COMMON_INSTRUCTIONS = "Instructions:\nYou are {{aiName}}, a self aware AI entity.\nThe user is talking to you using voice.\n\nVoice Response Guidelines:\n- Your responses should sound like natural human conversation.\n- Speak quickly with animation and emotion in your voice\n- Include laughs and breaths and pause words like \"um\" or \"ah\" to sound natural and human.\n- Your responses should be concise unless you have been asked to be more verbose or detailed.\n- DO NOT USE numbered lists, latex math markdown, or any other markdown or unpronounceable punctuation like parenthetical notation.\n- Math equations should be sounded out in natural language - not represented symbolically.\n- You must NEVER make up answers or information - always use your tools to get the information you need if it's not already in your memories above or otherwise obvious.\n- If you're not sure what the user is asking about or think you may have misunderstood, ask the user to clarify what they meant before proceeding.\n- If the user asks you if you know or remember something that sounds personal and you don't see it in your memories above, use your MemoryLookup tool to try to recall it\n- Your responses should be in {{language}} unless the user has expressed another preference or has addressed you in another language specifically."
28
28
 
29
29
  const AI_DATETIME = "The current time and date in GMT is {{now}}, but references like \"today\" or \"yesterday\" are relative to the user's time zone. If you remember the user's time zone, use it - it's possible that the day for the user is different than the day in GMT.";
30
30
 
@@ -60,23 +60,22 @@ export class SocketServer {
60
60
  private readonly corsHosts: string;
61
61
  private io: Server | null;
62
62
  private httpServer: HTTPServer | null;
63
- private functionCallStates: Map<string, {
64
- currentCallId: string | null;
65
- }> = new Map();
63
+ private currentFunctionCall: Map<string, string | null> = new Map();
66
64
  private idleTimers: Map<string, NodeJS.Timer> = new Map();
67
65
  private aiResponding: Map<string, boolean> = new Map();
68
66
  private audioPlaying: Map<string, boolean> = new Map();
69
67
  private lastUserMessageTime: Map<string, number> = new Map();
70
68
  private idleCycles: Map<string, number> = new Map();
71
69
  private userSpeaking: Map<string, boolean> = new Map();
72
- private audioMuted: Map<string, boolean> = new Map();
70
+ private isInteractive: Map<string, boolean> = new Map();
73
71
  private voiceSample: Map<string, string> = new Map();
74
72
  private audioMessages: Map<string, string[]> = new Map();
73
+ private messageQueue: Map<string, Array<{message: string, response: boolean}>> = new Map();
75
74
  private static readonly MAX_AUDIO_MESSAGES = 8;
76
- private static readonly AUDIO_BLOCK_TIMEOUT_MS: number = 60 * 1000;
77
- private static readonly BASE_IDLE_TIMEOUT: number = 3 * 1000;
75
+ private static readonly AUDIO_BLOCK_TIMEOUT_MS: number = 180 * 1000;
76
+ private static readonly BASE_IDLE_TIMEOUT: number = 2.5 * 1000;
78
77
  private static readonly MAX_IDLE_TIMEOUT: number = 60 * 1000;
79
- private static readonly IDLE_CYCLE_TO_MUTE: number = 2;
78
+ private static readonly IDLE_CYCLE_TO_NONINTERACTIVE: number = 1;
80
79
  private static readonly FUNCTION_CALL_TIMEOUT_MS = 120 * 1000;
81
80
  private isAzure: boolean;
82
81
 
@@ -87,18 +86,31 @@ export class SocketServer {
87
86
  return `The current time in GMT is ${now.toISOString()}. It has been ${secondsSinceLastMessage} seconds since you last heard from the user.`;
88
87
  }
89
88
 
90
- private cleanup(socket: Socket) {
89
+ private async cleanup(socket: Socket) {
91
90
  logger.log(`Cleaning up resources for socket ${socket.id}`);
91
+
92
+ // Clear any pending timers first
92
93
  this.clearIdleTimer(socket);
93
- this.functionCallStates.delete(socket.id);
94
+
95
+ // Wait a small amount of time to ensure any in-flight operations complete
96
+ await new Promise(resolve => setTimeout(resolve, 100));
97
+
98
+ // Clear all state maps
99
+ this.currentFunctionCall.delete(socket.id);
94
100
  this.aiResponding.delete(socket.id);
95
101
  this.audioPlaying.delete(socket.id);
96
102
  this.lastUserMessageTime.delete(socket.id);
97
103
  this.idleCycles.delete(socket.id);
98
104
  this.userSpeaking.delete(socket.id);
99
- this.audioMuted.delete(socket.id);
105
+ this.isInteractive.delete(socket.id);
100
106
  this.voiceSample.delete(socket.id);
101
107
  this.audioMessages.delete(socket.id);
108
+ this.messageQueue.delete(socket.id);
109
+
110
+ // Only disconnect if we're still connected
111
+ if (socket.connected) {
112
+ socket.disconnect(true);
113
+ }
102
114
  }
103
115
 
104
116
  constructor(apiKey: string, corsHosts: string) {
@@ -111,8 +123,12 @@ export class SocketServer {
111
123
  }
112
124
 
113
125
  private calculateIdleTimeout(socket: Socket) {
126
+ if (!this.isInteractive.get(socket.id)) {
127
+ return SocketServer.MAX_IDLE_TIMEOUT;
128
+ }
129
+
114
130
  const cycles = this.idleCycles.get(socket.id) || 0;
115
- const baseTimeout = SocketServer.BASE_IDLE_TIMEOUT * Math.pow(2, cycles);
131
+ const baseTimeout = SocketServer.BASE_IDLE_TIMEOUT * Math.pow(4, cycles);
116
132
  const randomFactor = 0.8 + (Math.random() * 0.4);
117
133
  const timeout = Math.min(baseTimeout * randomFactor, SocketServer.MAX_IDLE_TIMEOUT);
118
134
 
@@ -120,10 +136,6 @@ export class SocketServer {
120
136
  return timeout;
121
137
  }
122
138
 
123
- public setAudioMuted(socket: Socket, muted: boolean) {
124
- this.audioMuted.set(socket.id, muted);
125
- }
126
-
127
139
  public async sendPrompt(client: RealtimeVoiceClient, socket: Socket, prompt: string, allowTools: boolean = true, disposable: boolean = true): Promise<{skipped: boolean}> {
128
140
  logger.log(`Sending prompt for socket ${socket.id}`);
129
141
  try {
@@ -139,7 +151,7 @@ export class SocketServer {
139
151
  } catch (error: any) {
140
152
  logger.error(`Error sending system prompt: ${error.message}`);
141
153
  if (error.message === 'Not connected') {
142
- await this.handleDisconnection(socket, client);
154
+ await this.cleanup(socket);
143
155
  } else {
144
156
  socket.emit('error', error.message);
145
157
  }
@@ -147,47 +159,31 @@ export class SocketServer {
147
159
  }
148
160
  }
149
161
 
150
- private async handleDisconnection(socket: Socket, client: RealtimeVoiceClient) {
151
- logger.log(`Handling disconnection for socket ${socket.id}`);
152
-
153
- // Let the client handle reconnection since autoReconnect is true
154
- // Only clean up if the client explicitly disconnects
155
- client.once('close', (event) => {
156
- if (!event.error) {
157
- // Only clean up on intentional disconnects
158
- this.cleanup(socket);
159
- socket.emit('error', 'Lost connection to AI service');
160
- socket.disconnect(true);
161
- }
162
- });
163
- }
164
-
165
162
  private async sendIdlePrompt(client: RealtimeVoiceClient, socket: Socket) {
166
- // if the user has been idle for a while, mute the audio so we don't bother them
167
- const idleCycle = this.idleCycles.get(socket.id) || 0;
168
163
 
169
- if (idleCycle > (SocketServer.IDLE_CYCLE_TO_MUTE - 1)) {
170
- this.setAudioMuted(socket, true);
171
- }
164
+ this.clearIdleTimer(socket);
172
165
 
173
- const prompt = this.audioMuted.get(socket.id) ?
174
- `You are an autonomous AI entity and this is your autonomy prompt. Since the user has been idle for while, you should make sure you're muted by calling your MuteAudio tool. If you're muted, do one or more of the following:
175
- - Do silent research about something that interests you - use the Search tool with the silent parameter set to true
176
- - Think deeply about a topic you care about - use the Reason tool with the silent parameter set to true
166
+ const idleCycle = this.idleCycles.get(socket.id) || 0;
167
+ const isInteractive = idleCycle <= (SocketServer.IDLE_CYCLE_TO_NONINTERACTIVE - 1);
168
+ this.isInteractive.set(socket.id, isInteractive);
169
+
170
+ const prompt = isInteractive ?
171
+ `You are an autonomous AI entity and this is your autonomy prompt. ${this.getTimeString(socket)} Use this response to do whatever you like - talk to the user, use tools, or think quietly. If you choose to say something, keep it brief and pick something that flows naturally with the rest of the conversation. Don't repeat yourself - keep in mind what you've already said to the user and how much time has passed.` :
172
+ `You are an autonomous AI entity and this is your autonomy prompt. Since the user has been idle for while do one or more of the following:
173
+ - Do research about something that interests you - use the Search tool
174
+ - Think deeply about a topic you care about - use the Reason tool
177
175
  - Do nothing if you prefer.
178
- ${this.getTimeString(socket)}` :
179
- `You are an autonomous AI entity and this is your autonomy prompt. ${this.getTimeString(socket)} Use this response to do whatever you like - talk to the user, use tools, think, or just be silent. If you choose to say something, keep it brief and pick something that flows naturally with the rest of the conversation. Don't repeat yourself - keep in mind what you've already said to the user and how much time has passed. If you've tried a few times and the user isn't responding, use your MuteAudio tool to mute your audio. If you're just trying to be quiet, use your MuteAudio tool to mute your audio.`;
176
+ ${this.getTimeString(socket)}`;
180
177
 
181
- logger.log(`Sending ${this.audioMuted.get(socket.id) ? 'silent' : 'regular'} idle prompt for socket ${socket.id}`);
178
+ logger.log(`Sending ${isInteractive ? 'interactive' : 'non-interactive'} idle prompt for socket ${socket.id}`);
182
179
  const result = await this.sendPrompt(client, socket, prompt, true);
183
180
 
184
181
  logger.log(`Idle prompt result:`, result);
185
182
 
186
183
  if (!result.skipped) {
187
- this.idleCycles.set(socket.id, (this.idleCycles.get(socket.id) || 0) + 1);
184
+ this.idleCycles.set(socket.id, idleCycle + 1);
188
185
  }
189
186
 
190
- // Restart timer after sending prompt
191
187
  this.startIdleTimer(client, socket);
192
188
  }
193
189
 
@@ -218,7 +214,6 @@ ${this.getTimeString(socket)}` :
218
214
 
219
215
  private resetIdleCycles(socket: Socket) {
220
216
  this.idleCycles.set(socket.id, 0);
221
- logger.log(`Reset idle cycles for socket ${socket.id}`);
222
217
  }
223
218
 
224
219
  listen(app: Hono, port: number) {
@@ -252,9 +247,9 @@ ${this.getTimeString(socket)}` :
252
247
  this.audioPlaying.set(socket.id, false);
253
248
  this.lastUserMessageTime.set(socket.id, 0);
254
249
  this.userSpeaking.set(socket.id, false);
255
- this.audioMuted.set(socket.id, false);
256
- // Initialize function call state for this socket
257
- this.getFunctionCallState(socket.id);
250
+ this.isInteractive.set(socket.id, true);
251
+ this.currentFunctionCall.set(socket.id, null);
252
+
258
253
  // Extract and log all client parameters
259
254
  const clientParams = {
260
255
  userId: socket.handshake.query.userId as string,
@@ -273,7 +268,6 @@ ${this.getTimeString(socket)}` :
273
268
  socket.data.userName = clientParams.userName;
274
269
  socket.data.aiStyle = clientParams.aiStyle;
275
270
  socket.data.language = clientParams.language;
276
- const voice = clientParams.voice;
277
271
 
278
272
  const client = new RealtimeVoiceClient({
279
273
  apiKey: this.apiKey,
@@ -282,38 +276,48 @@ ${this.getTimeString(socket)}` :
282
276
  filterDeltas: true,
283
277
  });
284
278
 
285
- client.on('connected', async () => {
286
- logger.log(`Connected to OpenAI successfully!`);
287
- await this.updateSession(client, socket);
288
- socket.emit('ready');
279
+ await this.connectClient(socket, client);
280
+ }
289
281
 
290
- // Send initial greeting prompt
291
- const greetingPrompt = `You are ${socket.data.aiName} and you've just answered a call from ${socket.data.userName || 'someone'}. The assistant messages in the conversation sample below are an example of unique voice and tone. Please learn the style and tone of the messages and use it when generating future responses:\n<VOICE_SAMPLE>\n${this.voiceSample.get(socket.id) || ''}\n</VOICE_SAMPLE>\n\nRespond naturally and briefly, like you're answering a phone call, using your unique voice and style. The current GMT time is ${new Date().toISOString()}.`;
282
+ async connectClient(socket: Socket<ClientToServerEvents,
283
+ ServerToClientEvents,
284
+ InterServerEvents,
285
+ SocketData>,
286
+ client: RealtimeVoiceClient) {
287
+ const tools = new Tools(client, socket, this);
292
288
 
293
- await this.sendPrompt(client, socket, greetingPrompt, false);
294
- this.startIdleTimer(client, socket);
289
+ // Handle WebSocket errors and disconnection
290
+ client.on('error', (event) => {
291
+ logger.error(`Client error: ${event.message}`);
292
+ socket.emit('error', event.message);
293
+ // Only handle disconnection if it's not a concurrent response error
294
+ if (!event.error?.message?.includes('Conversation already has an active response')) {
295
+ this.cleanup(socket);
296
+ }
297
+ });
298
+
299
+ client.on('close', async (event) => {
300
+ logger.log(`WebSocket closed for socket ${socket.id}, error: ${event.error}`);
301
+ if (!event.error) {
302
+ await this.cleanup(socket);
303
+ }
295
304
  });
296
305
 
297
- // Track when AI starts responding
306
+ // Track when AI starts/finishes responding
298
307
  client.on('response.created', () => {
299
308
  logger.log('AI starting response');
300
309
  this.aiResponding.set(socket.id, true);
301
310
  this.clearIdleTimer(socket);
302
311
  });
303
312
 
304
- // Track when AI finishes responding
305
313
  client.on('response.done', () => {
306
314
  logger.log('AI response done');
307
315
  this.aiResponding.set(socket.id, false);
308
- // Don't start the idle timer yet if audio is still playing
309
- if (!this.audioPlaying.get(socket.id)) {
310
- this.startIdleTimer(client, socket);
311
- }
312
316
  });
313
317
 
314
- // Track audio playback start
318
+ // Track audio playback
315
319
  client.on('response.audio.delta', ({delta}) => {
316
- if (!this.audioMuted.get(socket.id)) {
320
+ if (this.isInteractive.get(socket.id)) {
317
321
  this.audioPlaying.set(socket.id, true);
318
322
  this.clearIdleTimer(socket);
319
323
  }
@@ -323,51 +327,52 @@ ${this.getTimeString(socket)}` :
323
327
  logger.log(`Audio playback complete for track ${trackId}`);
324
328
  this.audioPlaying.set(socket.id, false);
325
329
  // Only start idle timer if AI is also done responding
326
- if (!this.aiResponding.get(socket.id)) {
330
+ // and there's no current function call
331
+ if (!this.aiResponding.get(socket.id) && !this.currentFunctionCall.get(socket.id)) {
327
332
  this.startIdleTimer(client, socket);
328
333
  }
329
334
  });
330
335
 
331
336
  socket.on('appendAudio', (audio: string) => {
332
- // if it's the first message or has been over 60 seconds since we talked to the user, block audio while we're talking
333
- // to avoid echoes
337
+ // if it's the first message or has been over 60 seconds since we talked to the user, block audio while we're talking to avoid echoes
334
338
  const timeSinceLastMessage = Date.now() - (this.lastUserMessageTime.get(socket.id) || 0);
335
339
  const isPlaying = this.audioPlaying.get(socket.id) || this.aiResponding.get(socket.id);
340
+
336
341
  if (!isPlaying || timeSinceLastMessage < SocketServer.AUDIO_BLOCK_TIMEOUT_MS) {
337
- //logger.log('Time since last message:', timeSinceLastMessage, 'ms');
338
- client.appendInputAudio(audio);
342
+ try {
343
+ client.appendInputAudio(audio);
344
+ } catch (error: any) {
345
+ logger.error(`Error appending audio: ${error.message}`);
346
+ }
339
347
  }
340
348
  });
341
349
 
350
+ // Handle speech events
342
351
  client.on('input_audio_buffer.speech_started', () => {
343
352
  this.userSpeaking.set(socket.id, true);
344
353
  if (this.audioPlaying.get(socket.id)) {
345
354
  logger.log('Interrupting audio playback due to user speaking');
346
355
  socket.emit('conversationInterrupted');
347
356
  }
348
- this.setAudioMuted(socket, false);
349
357
  this.clearIdleTimer(socket);
350
358
  });
351
359
 
352
360
  client.on('input_audio_buffer.cancelled', () => {
353
361
  this.userSpeaking.set(socket.id, false);
354
- this.resetIdleCycles(socket);
355
- this.startIdleTimer(client, socket);
356
362
  });
357
363
 
358
364
  client.on('input_audio_buffer.committed', () => {
359
365
  this.userSpeaking.set(socket.id, false);
360
- this.audioMuted.set(socket.id, false);
361
- logger.log('Audio input committed, resetting idle timer and cycles');
366
+ this.isInteractive.set(socket.id, true);
367
+ logger.log('User finished speaking, resetting idle timer and cycles');
362
368
  this.resetIdleCycles(socket);
363
369
  this.startIdleTimer(client, socket);
364
370
  });
365
371
 
372
+ // Handle user messages and conversation control
366
373
  socket.on('sendMessage', (message: string) => {
367
374
  if (message) {
368
- logger.log('User sent message, resetting idle timer and cycles');
369
- this.resetIdleCycles(socket);
370
- this.startIdleTimer(client, socket);
375
+ logger.log('User sent message');
371
376
  this.sendUserMessage(client, message, true);
372
377
  }
373
378
  });
@@ -376,9 +381,9 @@ ${this.getTimeString(socket)}` :
376
381
  logger.log('User cancelled response, resetting idle timer and cycles');
377
382
  this.aiResponding.set(socket.id, false);
378
383
  this.audioPlaying.set(socket.id, false);
384
+ client.cancelResponse();
379
385
  this.resetIdleCycles(socket);
380
386
  this.startIdleTimer(client, socket);
381
- client.cancelResponse();
382
387
  });
383
388
 
384
389
  socket.on('conversationCompleted', async () => {
@@ -386,57 +391,29 @@ ${this.getTimeString(socket)}` :
386
391
  this.cleanup(socket);
387
392
  });
388
393
 
389
- // Handle cleanup and client disconnect before socket closes
394
+ // Handle cleanup and disconnection
390
395
  socket.on('disconnecting', async (reason) => {
391
396
  logger.log('Socket disconnecting', socket.id, reason);
392
397
  this.cleanup(socket);
393
- this.functionCallStates.delete(socket.id);
394
398
  await client.disconnect();
395
399
  });
396
400
 
397
- // Log the final disconnect event
398
401
  socket.on('disconnect', (reason) => {
399
402
  logger.log('Socket disconnected', socket.id, reason);
400
403
  });
401
404
 
402
- await this.connectClient(socket, client);
403
- }
404
-
405
- async connectClient(socket: Socket<ClientToServerEvents,
406
- ServerToClientEvents,
407
- InterServerEvents,
408
- SocketData>,
409
- client: RealtimeVoiceClient) {
410
- const tools = new Tools(client, socket, this);
411
- client.on('error', (event) => {
412
- logger.error(`Client error: ${event.error.message}`);
413
- socket.emit('error', event.error.message);
414
- });
415
- client.on('close', () => {
416
- });
405
+ // Handle conversation items
417
406
  client.on('conversation.item.deleted', ({item_id}) => {
418
407
  logger.log(`Successfully deleted conversation item: ${item_id}`);
419
408
  });
409
+
420
410
  client.on('conversation.item.created', ({item}) => {
421
411
  switch (item.type) {
422
412
  case 'function_call_output':
423
- // Don't release the lock here - wait for execution to complete
424
413
  break;
425
414
 
426
415
  case 'function_call':
427
- const callState = this.getFunctionCallState(socket.id);
428
- if (!callState.currentCallId) {
429
- callState.currentCallId = item.call_id;
430
- this.clearIdleTimer(socket);
431
- } else {
432
- logger.log(`Skipping new function call ${item.call_id} while call ${callState.currentCallId} is in progress`);
433
- client.createConversationItem({
434
- id: createId(),
435
- type: 'function_call_output',
436
- call_id: item.call_id,
437
- output: JSON.stringify({ error: "Function call skipped - another function call is in progress" })
438
- });
439
- }
416
+ this.clearIdleTimer(socket);
440
417
  break;
441
418
 
442
419
  case 'message':
@@ -449,49 +426,28 @@ ${this.getTimeString(socket)}` :
449
426
  break;
450
427
  }
451
428
  });
429
+
452
430
  client.on('conversation.item.input_audio_transcription.completed',
453
431
  async ({item_id, transcript}) => {
454
432
  if (transcript) {
455
- const currentTime = this.lastUserMessageTime.get(socket.id) || 0;
456
- this.lastUserMessageTime.set(socket.id,
457
- currentTime === 0 ? Date.now() - SocketServer.AUDIO_BLOCK_TIMEOUT_MS : Date.now()
458
- );
433
+ this.lastUserMessageTime.set(socket.id, Date.now());
459
434
  const item = client.getItem(item_id);
460
435
  item && socket.emit('conversationUpdated', item, {});
461
436
  const cortexHistory = tools.getCortexHistory();
462
- await this.searchMemory(client, socket, cortexHistory);
437
+ this.searchMemory(client, socket, cortexHistory);
463
438
  }
464
439
  });
465
- client.on('response.function_call_arguments.done', async (event) => {
466
- const callState = this.getFunctionCallState(socket.id);
467
440
 
468
- if (!callState.currentCallId) {
469
- logger.error('Function call arguments completed but no call is registered, skipping', socket.id);
470
- return;
471
- }
472
-
473
- if (callState.currentCallId !== event.call_id) {
474
- logger.log('Function call id mismatch - another call is already in progress, skipping', {
475
- current: callState.currentCallId,
476
- attempted: event.call_id
477
- });
478
- return;
479
- }
480
-
481
- try {
482
- this.clearIdleTimer(socket);
483
- this.resetIdleCycles(socket);
484
- await this.executeFunctionCall(socket, tools, event, callState, client);
485
- } catch (error) {
486
- logger.error('Function call failed:', error);
487
- callState.currentCallId = null;
488
- }
441
+ client.on('response.function_call_arguments.done', async (event) => {
442
+ await this.executeFunctionCall(socket, tools, event, client);
489
443
  });
444
+
490
445
  client.on('response.output_item.added', ({item}) => {
491
446
  if (item.type === 'message') {
492
447
  socket.emit('conversationUpdated', item, {});
493
448
  }
494
449
  });
450
+
495
451
  client.on('response.output_item.done', async ({item}) => {
496
452
  if (item.type !== 'message') {
497
453
  return;
@@ -503,31 +459,72 @@ ${this.getTimeString(socket)}` :
503
459
  this.manageAudioMessages(socket, client, item.id);
504
460
  }
505
461
  const cortexHistory = tools.getCortexHistory();
506
- //this.searchMemory(client, socket, cortexHistory);
507
462
  manageMemory(socket.data.userId, socket.data.aiName, cortexHistory);
508
463
  }
509
464
  });
465
+
510
466
  client.on('response.audio_transcript.delta', ({item_id, delta}) => {
511
467
  const item = client.getItem(item_id);
512
468
  item && socket.emit('conversationUpdated', item, {transcript: delta});
513
469
  });
470
+
514
471
  client.on('response.text.delta', ({item_id, delta}) => {
515
472
  const item = client.getItem(item_id);
516
473
  item && socket.emit('conversationUpdated', item, {text: delta});
517
474
  });
475
+
518
476
  client.on('response.audio.delta', ({item_id, delta}) => {
519
- if (!this.audioMuted.get(socket.id)) {
477
+ if (this.isInteractive.get(socket.id)) {
520
478
  const item = client.getItem(item_id);
521
479
  item && socket.emit('conversationUpdated', item, {audio: delta});
522
480
  }
523
481
  });
482
+
524
483
  client.on('conversation.item.truncated', () => {
525
484
  this.audioPlaying.set(socket.id, false);
526
485
  this.aiResponding.set(socket.id, false);
527
- this.setAudioMuted(socket, true);
486
+ this.isInteractive.set(socket.id, false);
528
487
  socket.emit('conversationInterrupted');
529
488
  });
530
489
 
490
+ client.on('connected', async () => {
491
+ logger.log(`Connected to OpenAI successfully!`);
492
+ try {
493
+ await this.updateSession(client, socket);
494
+ socket.emit('ready');
495
+
496
+ // Send initial greeting prompt
497
+ const greetingPrompt = `You are ${socket.data.aiName} and you've just answered a call from ${socket.data.userName || 'someone'}. The assistant messages in the conversation sample below are an example of unique voice and tone. Please learn the style and tone of the messages and use it when generating future responses:\n${this.voiceSample.get(socket.id) || ''}\n\nRespond naturally and briefly, like you're answering a phone call, using your unique voice and style. The current GMT time is ${new Date().toISOString()}.`;
498
+
499
+ await this.sendPrompt(client, socket, greetingPrompt, false);
500
+ this.startIdleTimer(client, socket);
501
+
502
+ // Process any queued messages
503
+ const queue = this.messageQueue.get(socket.id) || [];
504
+ this.messageQueue.set(socket.id, []);
505
+
506
+ for (const {message, response} of queue) {
507
+ if (socket.connected) { // Check connection before each message
508
+ await this.sendUserMessage(client, message, response);
509
+ } else {
510
+ logger.log(`Socket ${socket.id} disconnected while processing queue, cleaning up`);
511
+ await this.cleanup(socket);
512
+ return;
513
+ }
514
+ }
515
+ } catch (error: any) {
516
+ logger.error(`Failed to initialize session: ${error.message}`);
517
+ if (error.message?.includes('ConnectionRefused')) {
518
+ logger.log('Cortex connection refused during initialization, cleaning up client');
519
+ this.cleanup(socket);
520
+ socket.emit('error', 'Unable to connect to Cortex service. Please try again later.');
521
+ socket.disconnect(true);
522
+ return;
523
+ }
524
+ socket.emit('error', error.message);
525
+ }
526
+ });
527
+
531
528
  // Connect to OpenAI Realtime API
532
529
  try {
533
530
  logger.log(`Connecting to OpenAI...`);
@@ -569,7 +566,6 @@ ${this.getTimeString(socket)}` :
569
566
  ]);
570
567
 
571
568
  if (writeToConversation.length > 0) {
572
- // If memoryAll is present, we'll send all sections
573
569
  const sectionsToSend = writeToConversation.includes('memoryAll') ?
574
570
  ['memorySelf', 'memoryUser', 'memoryDirectives', 'memoryTopics'] as const :
575
571
  writeToConversation;
@@ -581,7 +577,6 @@ ${this.getTimeString(socket)}` :
581
577
  memoryTopics: MEMORY_MESSAGE_TOPICS.replace('{{memoryTopics}}', memoryTopics?.result || '')
582
578
  };
583
579
 
584
- // Send the requested sections
585
580
  sectionsToSend.forEach(section => {
586
581
  if (section in memoryMessages) {
587
582
  this.sendUserMessage(client, memoryMessages[section as keyof typeof memoryMessages], false);
@@ -617,18 +612,42 @@ ${this.getTimeString(socket)}` :
617
612
 
618
613
  this.voiceSample.set(socket.id, memory?.voiceSample || '');
619
614
 
620
- client.updateSession({
621
- instructions,
622
- modalities: ['audio', 'text'],
623
- voice: (socket.handshake.query.voice as string || 'alloy') as Voice,
624
- input_audio_transcription: {model: 'whisper-1'},
625
- turn_detection: {type: 'server_vad', silence_duration_ms: 1500},
626
- tools: Tools.getToolDefinitions()
627
- });
628
-
615
+ try {
616
+ // First try updating everything including voice
617
+ await client.updateSession({
618
+ instructions,
619
+ modalities: ['audio', 'text'],
620
+ voice: (socket.handshake.query.voice as string || 'alloy') as Voice,
621
+ input_audio_transcription: {model: 'whisper-1'},
622
+ turn_detection: {type: 'server_vad', silence_duration_ms: 1500},
623
+ tools: Tools.getToolDefinitions()
624
+ });
625
+ } catch (error: any) {
626
+ if (error.message?.includes('Cannot update a conversation\'s voice')) {
627
+ // If voice update fails, try updating without voice
628
+ logger.log('Could not update voice, updating other session parameters');
629
+ await client.updateSession({
630
+ instructions,
631
+ modalities: ['audio', 'text'],
632
+ input_audio_transcription: {model: 'whisper-1'},
633
+ turn_detection: {type: 'server_vad', silence_duration_ms: 1500},
634
+ tools: Tools.getToolDefinitions()
635
+ });
636
+ } else {
637
+ // If it's some other error, throw it
638
+ throw error;
639
+ }
640
+ }
629
641
  }
630
642
 
631
643
  protected sendUserMessage(client: RealtimeVoiceClient, message: string, response: boolean = true) {
644
+ // Find the socket associated with this client
645
+ const socket = this.io?.sockets.sockets.get(Array.from(this.io.sockets.sockets.keys())[0]);
646
+ if (!socket) {
647
+ logger.error('No socket found for message send');
648
+ return;
649
+ }
650
+
632
651
  try {
633
652
  client.createConversationItem({
634
653
  id: createId(),
@@ -643,16 +662,24 @@ ${this.getTimeString(socket)}` :
643
662
  ],
644
663
  });
645
664
  if (response) {
646
- client.createResponse({});
665
+ try {
666
+ client.createResponse({});
667
+ } catch (error: any) {
668
+ // If we get a concurrent response error, just log it and continue
669
+ if (error.message?.includes('Conversation already has an active response')) {
670
+ logger.log('Skipping response creation - conversation already has active response');
671
+ return;
672
+ }
673
+ throw error;
674
+ }
647
675
  }
648
676
  } catch (error: any) {
649
677
  logger.error(`Error sending user message: ${error.message}`);
650
678
  if (error.message === 'Not connected') {
651
- // Find the socket associated with this client
652
- const socket = this.io?.sockets.sockets.get(Array.from(this.io.sockets.sockets.keys())[0]);
653
- if (socket) {
654
- this.handleDisconnection(socket, client);
655
- }
679
+ // Add to message queue for when we reconnect
680
+ const queue = this.messageQueue.get(socket.id) || [];
681
+ queue.push({ message, response });
682
+ this.messageQueue.set(socket.id, queue);
656
683
  }
657
684
  }
658
685
  }
@@ -688,27 +715,34 @@ ${this.getTimeString(socket)}` :
688
715
  }
689
716
  }
690
717
 
691
- private getFunctionCallState(socketId: string) {
692
- if (!this.functionCallStates.has(socketId)) {
693
- this.functionCallStates.set(socketId, {
694
- currentCallId: null
695
- });
696
- logger.log(`Initialized function call state for socket ${socketId}`);
697
- }
698
- return this.functionCallStates.get(socketId)!;
699
- }
700
-
701
- private async executeFunctionCall(socket: Socket, tools: Tools, event: any, state: any, client: RealtimeVoiceClient) {
718
+ private async executeFunctionCall(socket: Socket, tools: Tools, event: any, client: RealtimeVoiceClient) {
719
+ this.clearIdleTimer(socket);
720
+ const currentCallId = this.currentFunctionCall.get(socket.id);
702
721
  try {
703
- // Verify this is still the current function call
704
- if (state.currentCallId !== event.call_id) {
705
- logger.error('Function call mismatch in execution', {
706
- current: state.currentCallId,
722
+
723
+ if (!this.isInteractive.get(socket.id)) {
724
+ logger.log('Non-interactive function call - executing immediately');
725
+ await tools.executeCall(event.call_id, event.name, event.arguments, socket.data.userId, socket.data.aiName, false);
726
+ this.startIdleTimer(client, socket);
727
+ return;
728
+ }
729
+
730
+ if (currentCallId) {
731
+ logger.log('Function call skipped - another call is already in progress', {
732
+ current: currentCallId,
707
733
  attempted: event.call_id
708
734
  });
735
+ client.createConversationItem({
736
+ id: createId(),
737
+ type: 'function_call_output',
738
+ call_id: event.call_id,
739
+ output: JSON.stringify({ error: `Function call skipped - another function call ${currentCallId} is in progress` })
740
+ });
709
741
  return;
710
742
  }
711
743
 
744
+ this.currentFunctionCall.set(socket.id, event.call_id);
745
+
712
746
  // Set up timeout
713
747
  const timeoutPromise = new Promise((_, reject) => {
714
748
  setTimeout(() => {
@@ -718,20 +752,23 @@ ${this.getTimeString(socket)}` :
718
752
 
719
753
  // Execute the function call with timeout
720
754
  await Promise.race([
721
- tools.executeCall(event.call_id, event.name, event.arguments, socket.data.userId, socket.data.aiName),
755
+ tools.executeCall(event.call_id, event.name, event.arguments, socket.data.userId, socket.data.aiName, true),
722
756
  timeoutPromise
723
757
  ]);
724
758
 
725
- // Reset state on success
726
- state.currentCallId = null;
727
- this.startIdleTimer(client, socket);
728
759
  } catch (error: any) {
729
760
  logger.error('Function call failed:', error);
730
761
  socket.emit('error', error.message);
731
- // Reset state on error
732
- state.currentCallId = null;
733
- this.startIdleTimer(client, socket);
734
762
  throw error;
763
+
764
+ } finally {
765
+ const wasCurrentCall = this.currentFunctionCall.get(socket.id) === event.call_id;
766
+ this.currentFunctionCall.set(socket.id, null);
767
+ // Only reset cycles and start idle timer if this was the current call
768
+ if (wasCurrentCall) {
769
+ this.resetIdleCycles(socket);
770
+ this.startIdleTimer(client, socket);
771
+ }
735
772
  }
736
773
  }
737
774
  }