@aj-archipelago/cortex 1.3.7 → 1.3.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +578 -80
- package/helper-apps/cortex-file-handler/blobHandler.js +27 -8
- package/helper-apps/cortex-file-handler/index.js +20 -2
- package/helper-apps/cortex-realtime-voice-server/client/src/chat/Chat.tsx +51 -11
- package/helper-apps/cortex-realtime-voice-server/src/SocketServer.ts +220 -183
- package/helper-apps/cortex-realtime-voice-server/src/Tools.ts +18 -34
- package/helper-apps/cortex-realtime-voice-server/src/cortex/utils.ts +29 -15
- package/helper-apps/cortex-realtime-voice-server/src/realtime/client.ts +47 -1
- package/helper-apps/cortex-realtime-voice-server/src/utils/prompt.ts +2 -11
- package/package.json +1 -1
- package/pathways/system/entity/memory/sys_search_memory.js +2 -1
- package/pathways/system/entity/sys_entity_start.js +6 -7
- package/pathways/system/entity/sys_generator_voice_sample.js +2 -2
- package/pathways/translate_gpt4_omni.js +20 -0
- package/pathways/translate_subtitle.js +326 -135
- package/pathways/translate_subtitle_helper.js +4 -16
- package/server/plugins/azureVideoTranslatePlugin.js +27 -15
- package/server/plugins/claude3VertexPlugin.js +10 -17
- package/server/plugins/gemini15VisionPlugin.js +16 -3
- package/server/plugins/modelPlugin.js +27 -0
- package/server/plugins/openAiVisionPlugin.js +26 -8
- package/tests/multimodal_conversion.test.js +88 -12
- package/tests/translate_srt.test.js +66 -14
|
@@ -24,7 +24,7 @@ export interface SocketData {
|
|
|
24
24
|
language: string;
|
|
25
25
|
}
|
|
26
26
|
|
|
27
|
-
const AI_COMMON_INSTRUCTIONS = "Instructions:\nYou are {{aiName}}, a self aware AI entity.\nThe user is talking to you using voice.\n\nVoice Response Guidelines:\n- Your responses should sound like natural human conversation.\n- Speak quickly
|
|
27
|
+
const AI_COMMON_INSTRUCTIONS = "Instructions:\nYou are {{aiName}}, a self aware AI entity.\nThe user is talking to you using voice.\n\nVoice Response Guidelines:\n- Your responses should sound like natural human conversation.\n- Speak quickly with animation and emotion in your voice\n- Include laughs and breaths and pause words like \"um\" or \"ah\" to sound natural and human.\n- Your responses should be concise unless you have been asked to be more verbose or detailed.\n- DO NOT USE numbered lists, latex math markdown, or any other markdown or unpronounceable punctuation like parenthetical notation.\n- Math equations should be sounded out in natural language - not represented symbolically.\n- You must NEVER make up answers or information - always use your tools to get the information you need if it's not already in your memories above or otherwise obvious.\n- If you're not sure what the user is asking about or think you may have misunderstood, ask the user to clarify what they meant before proceeding.\n- If the user asks you if you know or remember something that sounds personal and you don't see it in your memories above, use your MemoryLookup tool to try to recall it\n- Your responses should be in {{language}} unless the user has expressed another preference or has addressed you in another language specifically."
|
|
28
28
|
|
|
29
29
|
const AI_DATETIME = "The current time and date in GMT is {{now}}, but references like \"today\" or \"yesterday\" are relative to the user's time zone. If you remember the user's time zone, use it - it's possible that the day for the user is different than the day in GMT.";
|
|
30
30
|
|
|
@@ -60,23 +60,22 @@ export class SocketServer {
|
|
|
60
60
|
private readonly corsHosts: string;
|
|
61
61
|
private io: Server | null;
|
|
62
62
|
private httpServer: HTTPServer | null;
|
|
63
|
-
private
|
|
64
|
-
currentCallId: string | null;
|
|
65
|
-
}> = new Map();
|
|
63
|
+
private currentFunctionCall: Map<string, string | null> = new Map();
|
|
66
64
|
private idleTimers: Map<string, NodeJS.Timer> = new Map();
|
|
67
65
|
private aiResponding: Map<string, boolean> = new Map();
|
|
68
66
|
private audioPlaying: Map<string, boolean> = new Map();
|
|
69
67
|
private lastUserMessageTime: Map<string, number> = new Map();
|
|
70
68
|
private idleCycles: Map<string, number> = new Map();
|
|
71
69
|
private userSpeaking: Map<string, boolean> = new Map();
|
|
72
|
-
private
|
|
70
|
+
private isInteractive: Map<string, boolean> = new Map();
|
|
73
71
|
private voiceSample: Map<string, string> = new Map();
|
|
74
72
|
private audioMessages: Map<string, string[]> = new Map();
|
|
73
|
+
private messageQueue: Map<string, Array<{message: string, response: boolean}>> = new Map();
|
|
75
74
|
private static readonly MAX_AUDIO_MESSAGES = 8;
|
|
76
|
-
private static readonly AUDIO_BLOCK_TIMEOUT_MS: number =
|
|
77
|
-
private static readonly BASE_IDLE_TIMEOUT: number =
|
|
75
|
+
private static readonly AUDIO_BLOCK_TIMEOUT_MS: number = 180 * 1000;
|
|
76
|
+
private static readonly BASE_IDLE_TIMEOUT: number = 2.5 * 1000;
|
|
78
77
|
private static readonly MAX_IDLE_TIMEOUT: number = 60 * 1000;
|
|
79
|
-
private static readonly
|
|
78
|
+
private static readonly IDLE_CYCLE_TO_NONINTERACTIVE: number = 1;
|
|
80
79
|
private static readonly FUNCTION_CALL_TIMEOUT_MS = 120 * 1000;
|
|
81
80
|
private isAzure: boolean;
|
|
82
81
|
|
|
@@ -87,18 +86,31 @@ export class SocketServer {
|
|
|
87
86
|
return `The current time in GMT is ${now.toISOString()}. It has been ${secondsSinceLastMessage} seconds since you last heard from the user.`;
|
|
88
87
|
}
|
|
89
88
|
|
|
90
|
-
private cleanup(socket: Socket) {
|
|
89
|
+
private async cleanup(socket: Socket) {
|
|
91
90
|
logger.log(`Cleaning up resources for socket ${socket.id}`);
|
|
91
|
+
|
|
92
|
+
// Clear any pending timers first
|
|
92
93
|
this.clearIdleTimer(socket);
|
|
93
|
-
|
|
94
|
+
|
|
95
|
+
// Wait a small amount of time to ensure any in-flight operations complete
|
|
96
|
+
await new Promise(resolve => setTimeout(resolve, 100));
|
|
97
|
+
|
|
98
|
+
// Clear all state maps
|
|
99
|
+
this.currentFunctionCall.delete(socket.id);
|
|
94
100
|
this.aiResponding.delete(socket.id);
|
|
95
101
|
this.audioPlaying.delete(socket.id);
|
|
96
102
|
this.lastUserMessageTime.delete(socket.id);
|
|
97
103
|
this.idleCycles.delete(socket.id);
|
|
98
104
|
this.userSpeaking.delete(socket.id);
|
|
99
|
-
this.
|
|
105
|
+
this.isInteractive.delete(socket.id);
|
|
100
106
|
this.voiceSample.delete(socket.id);
|
|
101
107
|
this.audioMessages.delete(socket.id);
|
|
108
|
+
this.messageQueue.delete(socket.id);
|
|
109
|
+
|
|
110
|
+
// Only disconnect if we're still connected
|
|
111
|
+
if (socket.connected) {
|
|
112
|
+
socket.disconnect(true);
|
|
113
|
+
}
|
|
102
114
|
}
|
|
103
115
|
|
|
104
116
|
constructor(apiKey: string, corsHosts: string) {
|
|
@@ -111,8 +123,12 @@ export class SocketServer {
|
|
|
111
123
|
}
|
|
112
124
|
|
|
113
125
|
private calculateIdleTimeout(socket: Socket) {
|
|
126
|
+
if (!this.isInteractive.get(socket.id)) {
|
|
127
|
+
return SocketServer.MAX_IDLE_TIMEOUT;
|
|
128
|
+
}
|
|
129
|
+
|
|
114
130
|
const cycles = this.idleCycles.get(socket.id) || 0;
|
|
115
|
-
const baseTimeout = SocketServer.BASE_IDLE_TIMEOUT * Math.pow(
|
|
131
|
+
const baseTimeout = SocketServer.BASE_IDLE_TIMEOUT * Math.pow(4, cycles);
|
|
116
132
|
const randomFactor = 0.8 + (Math.random() * 0.4);
|
|
117
133
|
const timeout = Math.min(baseTimeout * randomFactor, SocketServer.MAX_IDLE_TIMEOUT);
|
|
118
134
|
|
|
@@ -120,10 +136,6 @@ export class SocketServer {
|
|
|
120
136
|
return timeout;
|
|
121
137
|
}
|
|
122
138
|
|
|
123
|
-
public setAudioMuted(socket: Socket, muted: boolean) {
|
|
124
|
-
this.audioMuted.set(socket.id, muted);
|
|
125
|
-
}
|
|
126
|
-
|
|
127
139
|
public async sendPrompt(client: RealtimeVoiceClient, socket: Socket, prompt: string, allowTools: boolean = true, disposable: boolean = true): Promise<{skipped: boolean}> {
|
|
128
140
|
logger.log(`Sending prompt for socket ${socket.id}`);
|
|
129
141
|
try {
|
|
@@ -139,7 +151,7 @@ export class SocketServer {
|
|
|
139
151
|
} catch (error: any) {
|
|
140
152
|
logger.error(`Error sending system prompt: ${error.message}`);
|
|
141
153
|
if (error.message === 'Not connected') {
|
|
142
|
-
await this.
|
|
154
|
+
await this.cleanup(socket);
|
|
143
155
|
} else {
|
|
144
156
|
socket.emit('error', error.message);
|
|
145
157
|
}
|
|
@@ -147,47 +159,31 @@ export class SocketServer {
|
|
|
147
159
|
}
|
|
148
160
|
}
|
|
149
161
|
|
|
150
|
-
private async handleDisconnection(socket: Socket, client: RealtimeVoiceClient) {
|
|
151
|
-
logger.log(`Handling disconnection for socket ${socket.id}`);
|
|
152
|
-
|
|
153
|
-
// Let the client handle reconnection since autoReconnect is true
|
|
154
|
-
// Only clean up if the client explicitly disconnects
|
|
155
|
-
client.once('close', (event) => {
|
|
156
|
-
if (!event.error) {
|
|
157
|
-
// Only clean up on intentional disconnects
|
|
158
|
-
this.cleanup(socket);
|
|
159
|
-
socket.emit('error', 'Lost connection to AI service');
|
|
160
|
-
socket.disconnect(true);
|
|
161
|
-
}
|
|
162
|
-
});
|
|
163
|
-
}
|
|
164
|
-
|
|
165
162
|
private async sendIdlePrompt(client: RealtimeVoiceClient, socket: Socket) {
|
|
166
|
-
// if the user has been idle for a while, mute the audio so we don't bother them
|
|
167
|
-
const idleCycle = this.idleCycles.get(socket.id) || 0;
|
|
168
163
|
|
|
169
|
-
|
|
170
|
-
this.setAudioMuted(socket, true);
|
|
171
|
-
}
|
|
164
|
+
this.clearIdleTimer(socket);
|
|
172
165
|
|
|
173
|
-
const
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
166
|
+
const idleCycle = this.idleCycles.get(socket.id) || 0;
|
|
167
|
+
const isInteractive = idleCycle <= (SocketServer.IDLE_CYCLE_TO_NONINTERACTIVE - 1);
|
|
168
|
+
this.isInteractive.set(socket.id, isInteractive);
|
|
169
|
+
|
|
170
|
+
const prompt = isInteractive ?
|
|
171
|
+
`You are an autonomous AI entity and this is your autonomy prompt. ${this.getTimeString(socket)} Use this response to do whatever you like - talk to the user, use tools, or think quietly. If you choose to say something, keep it brief and pick something that flows naturally with the rest of the conversation. Don't repeat yourself - keep in mind what you've already said to the user and how much time has passed.` :
|
|
172
|
+
`You are an autonomous AI entity and this is your autonomy prompt. Since the user has been idle for while do one or more of the following:
|
|
173
|
+
- Do research about something that interests you - use the Search tool
|
|
174
|
+
- Think deeply about a topic you care about - use the Reason tool
|
|
177
175
|
- Do nothing if you prefer.
|
|
178
|
-
${this.getTimeString(socket)}
|
|
179
|
-
`You are an autonomous AI entity and this is your autonomy prompt. ${this.getTimeString(socket)} Use this response to do whatever you like - talk to the user, use tools, think, or just be silent. If you choose to say something, keep it brief and pick something that flows naturally with the rest of the conversation. Don't repeat yourself - keep in mind what you've already said to the user and how much time has passed. If you've tried a few times and the user isn't responding, use your MuteAudio tool to mute your audio. If you're just trying to be quiet, use your MuteAudio tool to mute your audio.`;
|
|
176
|
+
${this.getTimeString(socket)}`;
|
|
180
177
|
|
|
181
|
-
logger.log(`Sending ${
|
|
178
|
+
logger.log(`Sending ${isInteractive ? 'interactive' : 'non-interactive'} idle prompt for socket ${socket.id}`);
|
|
182
179
|
const result = await this.sendPrompt(client, socket, prompt, true);
|
|
183
180
|
|
|
184
181
|
logger.log(`Idle prompt result:`, result);
|
|
185
182
|
|
|
186
183
|
if (!result.skipped) {
|
|
187
|
-
this.idleCycles.set(socket.id,
|
|
184
|
+
this.idleCycles.set(socket.id, idleCycle + 1);
|
|
188
185
|
}
|
|
189
186
|
|
|
190
|
-
// Restart timer after sending prompt
|
|
191
187
|
this.startIdleTimer(client, socket);
|
|
192
188
|
}
|
|
193
189
|
|
|
@@ -218,7 +214,6 @@ ${this.getTimeString(socket)}` :
|
|
|
218
214
|
|
|
219
215
|
private resetIdleCycles(socket: Socket) {
|
|
220
216
|
this.idleCycles.set(socket.id, 0);
|
|
221
|
-
logger.log(`Reset idle cycles for socket ${socket.id}`);
|
|
222
217
|
}
|
|
223
218
|
|
|
224
219
|
listen(app: Hono, port: number) {
|
|
@@ -252,9 +247,9 @@ ${this.getTimeString(socket)}` :
|
|
|
252
247
|
this.audioPlaying.set(socket.id, false);
|
|
253
248
|
this.lastUserMessageTime.set(socket.id, 0);
|
|
254
249
|
this.userSpeaking.set(socket.id, false);
|
|
255
|
-
this.
|
|
256
|
-
|
|
257
|
-
|
|
250
|
+
this.isInteractive.set(socket.id, true);
|
|
251
|
+
this.currentFunctionCall.set(socket.id, null);
|
|
252
|
+
|
|
258
253
|
// Extract and log all client parameters
|
|
259
254
|
const clientParams = {
|
|
260
255
|
userId: socket.handshake.query.userId as string,
|
|
@@ -273,7 +268,6 @@ ${this.getTimeString(socket)}` :
|
|
|
273
268
|
socket.data.userName = clientParams.userName;
|
|
274
269
|
socket.data.aiStyle = clientParams.aiStyle;
|
|
275
270
|
socket.data.language = clientParams.language;
|
|
276
|
-
const voice = clientParams.voice;
|
|
277
271
|
|
|
278
272
|
const client = new RealtimeVoiceClient({
|
|
279
273
|
apiKey: this.apiKey,
|
|
@@ -282,38 +276,48 @@ ${this.getTimeString(socket)}` :
|
|
|
282
276
|
filterDeltas: true,
|
|
283
277
|
});
|
|
284
278
|
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
await this.updateSession(client, socket);
|
|
288
|
-
socket.emit('ready');
|
|
279
|
+
await this.connectClient(socket, client);
|
|
280
|
+
}
|
|
289
281
|
|
|
290
|
-
|
|
291
|
-
|
|
282
|
+
async connectClient(socket: Socket<ClientToServerEvents,
|
|
283
|
+
ServerToClientEvents,
|
|
284
|
+
InterServerEvents,
|
|
285
|
+
SocketData>,
|
|
286
|
+
client: RealtimeVoiceClient) {
|
|
287
|
+
const tools = new Tools(client, socket, this);
|
|
292
288
|
|
|
293
|
-
|
|
294
|
-
|
|
289
|
+
// Handle WebSocket errors and disconnection
|
|
290
|
+
client.on('error', (event) => {
|
|
291
|
+
logger.error(`Client error: ${event.message}`);
|
|
292
|
+
socket.emit('error', event.message);
|
|
293
|
+
// Only handle disconnection if it's not a concurrent response error
|
|
294
|
+
if (!event.error?.message?.includes('Conversation already has an active response')) {
|
|
295
|
+
this.cleanup(socket);
|
|
296
|
+
}
|
|
297
|
+
});
|
|
298
|
+
|
|
299
|
+
client.on('close', async (event) => {
|
|
300
|
+
logger.log(`WebSocket closed for socket ${socket.id}, error: ${event.error}`);
|
|
301
|
+
if (!event.error) {
|
|
302
|
+
await this.cleanup(socket);
|
|
303
|
+
}
|
|
295
304
|
});
|
|
296
305
|
|
|
297
|
-
// Track when AI starts responding
|
|
306
|
+
// Track when AI starts/finishes responding
|
|
298
307
|
client.on('response.created', () => {
|
|
299
308
|
logger.log('AI starting response');
|
|
300
309
|
this.aiResponding.set(socket.id, true);
|
|
301
310
|
this.clearIdleTimer(socket);
|
|
302
311
|
});
|
|
303
312
|
|
|
304
|
-
// Track when AI finishes responding
|
|
305
313
|
client.on('response.done', () => {
|
|
306
314
|
logger.log('AI response done');
|
|
307
315
|
this.aiResponding.set(socket.id, false);
|
|
308
|
-
// Don't start the idle timer yet if audio is still playing
|
|
309
|
-
if (!this.audioPlaying.get(socket.id)) {
|
|
310
|
-
this.startIdleTimer(client, socket);
|
|
311
|
-
}
|
|
312
316
|
});
|
|
313
317
|
|
|
314
|
-
// Track audio playback
|
|
318
|
+
// Track audio playback
|
|
315
319
|
client.on('response.audio.delta', ({delta}) => {
|
|
316
|
-
if (
|
|
320
|
+
if (this.isInteractive.get(socket.id)) {
|
|
317
321
|
this.audioPlaying.set(socket.id, true);
|
|
318
322
|
this.clearIdleTimer(socket);
|
|
319
323
|
}
|
|
@@ -323,51 +327,52 @@ ${this.getTimeString(socket)}` :
|
|
|
323
327
|
logger.log(`Audio playback complete for track ${trackId}`);
|
|
324
328
|
this.audioPlaying.set(socket.id, false);
|
|
325
329
|
// Only start idle timer if AI is also done responding
|
|
326
|
-
|
|
330
|
+
// and there's no current function call
|
|
331
|
+
if (!this.aiResponding.get(socket.id) && !this.currentFunctionCall.get(socket.id)) {
|
|
327
332
|
this.startIdleTimer(client, socket);
|
|
328
333
|
}
|
|
329
334
|
});
|
|
330
335
|
|
|
331
336
|
socket.on('appendAudio', (audio: string) => {
|
|
332
|
-
// if it's the first message or has been over 60 seconds since we talked to the user, block audio while we're talking
|
|
333
|
-
// to avoid echoes
|
|
337
|
+
// if it's the first message or has been over 60 seconds since we talked to the user, block audio while we're talking to avoid echoes
|
|
334
338
|
const timeSinceLastMessage = Date.now() - (this.lastUserMessageTime.get(socket.id) || 0);
|
|
335
339
|
const isPlaying = this.audioPlaying.get(socket.id) || this.aiResponding.get(socket.id);
|
|
340
|
+
|
|
336
341
|
if (!isPlaying || timeSinceLastMessage < SocketServer.AUDIO_BLOCK_TIMEOUT_MS) {
|
|
337
|
-
|
|
338
|
-
|
|
342
|
+
try {
|
|
343
|
+
client.appendInputAudio(audio);
|
|
344
|
+
} catch (error: any) {
|
|
345
|
+
logger.error(`Error appending audio: ${error.message}`);
|
|
346
|
+
}
|
|
339
347
|
}
|
|
340
348
|
});
|
|
341
349
|
|
|
350
|
+
// Handle speech events
|
|
342
351
|
client.on('input_audio_buffer.speech_started', () => {
|
|
343
352
|
this.userSpeaking.set(socket.id, true);
|
|
344
353
|
if (this.audioPlaying.get(socket.id)) {
|
|
345
354
|
logger.log('Interrupting audio playback due to user speaking');
|
|
346
355
|
socket.emit('conversationInterrupted');
|
|
347
356
|
}
|
|
348
|
-
this.setAudioMuted(socket, false);
|
|
349
357
|
this.clearIdleTimer(socket);
|
|
350
358
|
});
|
|
351
359
|
|
|
352
360
|
client.on('input_audio_buffer.cancelled', () => {
|
|
353
361
|
this.userSpeaking.set(socket.id, false);
|
|
354
|
-
this.resetIdleCycles(socket);
|
|
355
|
-
this.startIdleTimer(client, socket);
|
|
356
362
|
});
|
|
357
363
|
|
|
358
364
|
client.on('input_audio_buffer.committed', () => {
|
|
359
365
|
this.userSpeaking.set(socket.id, false);
|
|
360
|
-
this.
|
|
361
|
-
logger.log('
|
|
366
|
+
this.isInteractive.set(socket.id, true);
|
|
367
|
+
logger.log('User finished speaking, resetting idle timer and cycles');
|
|
362
368
|
this.resetIdleCycles(socket);
|
|
363
369
|
this.startIdleTimer(client, socket);
|
|
364
370
|
});
|
|
365
371
|
|
|
372
|
+
// Handle user messages and conversation control
|
|
366
373
|
socket.on('sendMessage', (message: string) => {
|
|
367
374
|
if (message) {
|
|
368
|
-
logger.log('User sent message
|
|
369
|
-
this.resetIdleCycles(socket);
|
|
370
|
-
this.startIdleTimer(client, socket);
|
|
375
|
+
logger.log('User sent message');
|
|
371
376
|
this.sendUserMessage(client, message, true);
|
|
372
377
|
}
|
|
373
378
|
});
|
|
@@ -376,9 +381,9 @@ ${this.getTimeString(socket)}` :
|
|
|
376
381
|
logger.log('User cancelled response, resetting idle timer and cycles');
|
|
377
382
|
this.aiResponding.set(socket.id, false);
|
|
378
383
|
this.audioPlaying.set(socket.id, false);
|
|
384
|
+
client.cancelResponse();
|
|
379
385
|
this.resetIdleCycles(socket);
|
|
380
386
|
this.startIdleTimer(client, socket);
|
|
381
|
-
client.cancelResponse();
|
|
382
387
|
});
|
|
383
388
|
|
|
384
389
|
socket.on('conversationCompleted', async () => {
|
|
@@ -386,57 +391,29 @@ ${this.getTimeString(socket)}` :
|
|
|
386
391
|
this.cleanup(socket);
|
|
387
392
|
});
|
|
388
393
|
|
|
389
|
-
// Handle cleanup and
|
|
394
|
+
// Handle cleanup and disconnection
|
|
390
395
|
socket.on('disconnecting', async (reason) => {
|
|
391
396
|
logger.log('Socket disconnecting', socket.id, reason);
|
|
392
397
|
this.cleanup(socket);
|
|
393
|
-
this.functionCallStates.delete(socket.id);
|
|
394
398
|
await client.disconnect();
|
|
395
399
|
});
|
|
396
400
|
|
|
397
|
-
// Log the final disconnect event
|
|
398
401
|
socket.on('disconnect', (reason) => {
|
|
399
402
|
logger.log('Socket disconnected', socket.id, reason);
|
|
400
403
|
});
|
|
401
404
|
|
|
402
|
-
|
|
403
|
-
}
|
|
404
|
-
|
|
405
|
-
async connectClient(socket: Socket<ClientToServerEvents,
|
|
406
|
-
ServerToClientEvents,
|
|
407
|
-
InterServerEvents,
|
|
408
|
-
SocketData>,
|
|
409
|
-
client: RealtimeVoiceClient) {
|
|
410
|
-
const tools = new Tools(client, socket, this);
|
|
411
|
-
client.on('error', (event) => {
|
|
412
|
-
logger.error(`Client error: ${event.error.message}`);
|
|
413
|
-
socket.emit('error', event.error.message);
|
|
414
|
-
});
|
|
415
|
-
client.on('close', () => {
|
|
416
|
-
});
|
|
405
|
+
// Handle conversation items
|
|
417
406
|
client.on('conversation.item.deleted', ({item_id}) => {
|
|
418
407
|
logger.log(`Successfully deleted conversation item: ${item_id}`);
|
|
419
408
|
});
|
|
409
|
+
|
|
420
410
|
client.on('conversation.item.created', ({item}) => {
|
|
421
411
|
switch (item.type) {
|
|
422
412
|
case 'function_call_output':
|
|
423
|
-
// Don't release the lock here - wait for execution to complete
|
|
424
413
|
break;
|
|
425
414
|
|
|
426
415
|
case 'function_call':
|
|
427
|
-
|
|
428
|
-
if (!callState.currentCallId) {
|
|
429
|
-
callState.currentCallId = item.call_id;
|
|
430
|
-
this.clearIdleTimer(socket);
|
|
431
|
-
} else {
|
|
432
|
-
logger.log(`Skipping new function call ${item.call_id} while call ${callState.currentCallId} is in progress`);
|
|
433
|
-
client.createConversationItem({
|
|
434
|
-
id: createId(),
|
|
435
|
-
type: 'function_call_output',
|
|
436
|
-
call_id: item.call_id,
|
|
437
|
-
output: JSON.stringify({ error: "Function call skipped - another function call is in progress" })
|
|
438
|
-
});
|
|
439
|
-
}
|
|
416
|
+
this.clearIdleTimer(socket);
|
|
440
417
|
break;
|
|
441
418
|
|
|
442
419
|
case 'message':
|
|
@@ -449,49 +426,28 @@ ${this.getTimeString(socket)}` :
|
|
|
449
426
|
break;
|
|
450
427
|
}
|
|
451
428
|
});
|
|
429
|
+
|
|
452
430
|
client.on('conversation.item.input_audio_transcription.completed',
|
|
453
431
|
async ({item_id, transcript}) => {
|
|
454
432
|
if (transcript) {
|
|
455
|
-
|
|
456
|
-
this.lastUserMessageTime.set(socket.id,
|
|
457
|
-
currentTime === 0 ? Date.now() - SocketServer.AUDIO_BLOCK_TIMEOUT_MS : Date.now()
|
|
458
|
-
);
|
|
433
|
+
this.lastUserMessageTime.set(socket.id, Date.now());
|
|
459
434
|
const item = client.getItem(item_id);
|
|
460
435
|
item && socket.emit('conversationUpdated', item, {});
|
|
461
436
|
const cortexHistory = tools.getCortexHistory();
|
|
462
|
-
|
|
437
|
+
this.searchMemory(client, socket, cortexHistory);
|
|
463
438
|
}
|
|
464
439
|
});
|
|
465
|
-
client.on('response.function_call_arguments.done', async (event) => {
|
|
466
|
-
const callState = this.getFunctionCallState(socket.id);
|
|
467
440
|
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
return;
|
|
471
|
-
}
|
|
472
|
-
|
|
473
|
-
if (callState.currentCallId !== event.call_id) {
|
|
474
|
-
logger.log('Function call id mismatch - another call is already in progress, skipping', {
|
|
475
|
-
current: callState.currentCallId,
|
|
476
|
-
attempted: event.call_id
|
|
477
|
-
});
|
|
478
|
-
return;
|
|
479
|
-
}
|
|
480
|
-
|
|
481
|
-
try {
|
|
482
|
-
this.clearIdleTimer(socket);
|
|
483
|
-
this.resetIdleCycles(socket);
|
|
484
|
-
await this.executeFunctionCall(socket, tools, event, callState, client);
|
|
485
|
-
} catch (error) {
|
|
486
|
-
logger.error('Function call failed:', error);
|
|
487
|
-
callState.currentCallId = null;
|
|
488
|
-
}
|
|
441
|
+
client.on('response.function_call_arguments.done', async (event) => {
|
|
442
|
+
await this.executeFunctionCall(socket, tools, event, client);
|
|
489
443
|
});
|
|
444
|
+
|
|
490
445
|
client.on('response.output_item.added', ({item}) => {
|
|
491
446
|
if (item.type === 'message') {
|
|
492
447
|
socket.emit('conversationUpdated', item, {});
|
|
493
448
|
}
|
|
494
449
|
});
|
|
450
|
+
|
|
495
451
|
client.on('response.output_item.done', async ({item}) => {
|
|
496
452
|
if (item.type !== 'message') {
|
|
497
453
|
return;
|
|
@@ -503,31 +459,72 @@ ${this.getTimeString(socket)}` :
|
|
|
503
459
|
this.manageAudioMessages(socket, client, item.id);
|
|
504
460
|
}
|
|
505
461
|
const cortexHistory = tools.getCortexHistory();
|
|
506
|
-
//this.searchMemory(client, socket, cortexHistory);
|
|
507
462
|
manageMemory(socket.data.userId, socket.data.aiName, cortexHistory);
|
|
508
463
|
}
|
|
509
464
|
});
|
|
465
|
+
|
|
510
466
|
client.on('response.audio_transcript.delta', ({item_id, delta}) => {
|
|
511
467
|
const item = client.getItem(item_id);
|
|
512
468
|
item && socket.emit('conversationUpdated', item, {transcript: delta});
|
|
513
469
|
});
|
|
470
|
+
|
|
514
471
|
client.on('response.text.delta', ({item_id, delta}) => {
|
|
515
472
|
const item = client.getItem(item_id);
|
|
516
473
|
item && socket.emit('conversationUpdated', item, {text: delta});
|
|
517
474
|
});
|
|
475
|
+
|
|
518
476
|
client.on('response.audio.delta', ({item_id, delta}) => {
|
|
519
|
-
if (
|
|
477
|
+
if (this.isInteractive.get(socket.id)) {
|
|
520
478
|
const item = client.getItem(item_id);
|
|
521
479
|
item && socket.emit('conversationUpdated', item, {audio: delta});
|
|
522
480
|
}
|
|
523
481
|
});
|
|
482
|
+
|
|
524
483
|
client.on('conversation.item.truncated', () => {
|
|
525
484
|
this.audioPlaying.set(socket.id, false);
|
|
526
485
|
this.aiResponding.set(socket.id, false);
|
|
527
|
-
this.
|
|
486
|
+
this.isInteractive.set(socket.id, false);
|
|
528
487
|
socket.emit('conversationInterrupted');
|
|
529
488
|
});
|
|
530
489
|
|
|
490
|
+
client.on('connected', async () => {
|
|
491
|
+
logger.log(`Connected to OpenAI successfully!`);
|
|
492
|
+
try {
|
|
493
|
+
await this.updateSession(client, socket);
|
|
494
|
+
socket.emit('ready');
|
|
495
|
+
|
|
496
|
+
// Send initial greeting prompt
|
|
497
|
+
const greetingPrompt = `You are ${socket.data.aiName} and you've just answered a call from ${socket.data.userName || 'someone'}. The assistant messages in the conversation sample below are an example of unique voice and tone. Please learn the style and tone of the messages and use it when generating future responses:\n${this.voiceSample.get(socket.id) || ''}\n\nRespond naturally and briefly, like you're answering a phone call, using your unique voice and style. The current GMT time is ${new Date().toISOString()}.`;
|
|
498
|
+
|
|
499
|
+
await this.sendPrompt(client, socket, greetingPrompt, false);
|
|
500
|
+
this.startIdleTimer(client, socket);
|
|
501
|
+
|
|
502
|
+
// Process any queued messages
|
|
503
|
+
const queue = this.messageQueue.get(socket.id) || [];
|
|
504
|
+
this.messageQueue.set(socket.id, []);
|
|
505
|
+
|
|
506
|
+
for (const {message, response} of queue) {
|
|
507
|
+
if (socket.connected) { // Check connection before each message
|
|
508
|
+
await this.sendUserMessage(client, message, response);
|
|
509
|
+
} else {
|
|
510
|
+
logger.log(`Socket ${socket.id} disconnected while processing queue, cleaning up`);
|
|
511
|
+
await this.cleanup(socket);
|
|
512
|
+
return;
|
|
513
|
+
}
|
|
514
|
+
}
|
|
515
|
+
} catch (error: any) {
|
|
516
|
+
logger.error(`Failed to initialize session: ${error.message}`);
|
|
517
|
+
if (error.message?.includes('ConnectionRefused')) {
|
|
518
|
+
logger.log('Cortex connection refused during initialization, cleaning up client');
|
|
519
|
+
this.cleanup(socket);
|
|
520
|
+
socket.emit('error', 'Unable to connect to Cortex service. Please try again later.');
|
|
521
|
+
socket.disconnect(true);
|
|
522
|
+
return;
|
|
523
|
+
}
|
|
524
|
+
socket.emit('error', error.message);
|
|
525
|
+
}
|
|
526
|
+
});
|
|
527
|
+
|
|
531
528
|
// Connect to OpenAI Realtime API
|
|
532
529
|
try {
|
|
533
530
|
logger.log(`Connecting to OpenAI...`);
|
|
@@ -569,7 +566,6 @@ ${this.getTimeString(socket)}` :
|
|
|
569
566
|
]);
|
|
570
567
|
|
|
571
568
|
if (writeToConversation.length > 0) {
|
|
572
|
-
// If memoryAll is present, we'll send all sections
|
|
573
569
|
const sectionsToSend = writeToConversation.includes('memoryAll') ?
|
|
574
570
|
['memorySelf', 'memoryUser', 'memoryDirectives', 'memoryTopics'] as const :
|
|
575
571
|
writeToConversation;
|
|
@@ -581,7 +577,6 @@ ${this.getTimeString(socket)}` :
|
|
|
581
577
|
memoryTopics: MEMORY_MESSAGE_TOPICS.replace('{{memoryTopics}}', memoryTopics?.result || '')
|
|
582
578
|
};
|
|
583
579
|
|
|
584
|
-
// Send the requested sections
|
|
585
580
|
sectionsToSend.forEach(section => {
|
|
586
581
|
if (section in memoryMessages) {
|
|
587
582
|
this.sendUserMessage(client, memoryMessages[section as keyof typeof memoryMessages], false);
|
|
@@ -617,18 +612,42 @@ ${this.getTimeString(socket)}` :
|
|
|
617
612
|
|
|
618
613
|
this.voiceSample.set(socket.id, memory?.voiceSample || '');
|
|
619
614
|
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
615
|
+
try {
|
|
616
|
+
// First try updating everything including voice
|
|
617
|
+
await client.updateSession({
|
|
618
|
+
instructions,
|
|
619
|
+
modalities: ['audio', 'text'],
|
|
620
|
+
voice: (socket.handshake.query.voice as string || 'alloy') as Voice,
|
|
621
|
+
input_audio_transcription: {model: 'whisper-1'},
|
|
622
|
+
turn_detection: {type: 'server_vad', silence_duration_ms: 1500},
|
|
623
|
+
tools: Tools.getToolDefinitions()
|
|
624
|
+
});
|
|
625
|
+
} catch (error: any) {
|
|
626
|
+
if (error.message?.includes('Cannot update a conversation\'s voice')) {
|
|
627
|
+
// If voice update fails, try updating without voice
|
|
628
|
+
logger.log('Could not update voice, updating other session parameters');
|
|
629
|
+
await client.updateSession({
|
|
630
|
+
instructions,
|
|
631
|
+
modalities: ['audio', 'text'],
|
|
632
|
+
input_audio_transcription: {model: 'whisper-1'},
|
|
633
|
+
turn_detection: {type: 'server_vad', silence_duration_ms: 1500},
|
|
634
|
+
tools: Tools.getToolDefinitions()
|
|
635
|
+
});
|
|
636
|
+
} else {
|
|
637
|
+
// If it's some other error, throw it
|
|
638
|
+
throw error;
|
|
639
|
+
}
|
|
640
|
+
}
|
|
629
641
|
}
|
|
630
642
|
|
|
631
643
|
protected sendUserMessage(client: RealtimeVoiceClient, message: string, response: boolean = true) {
|
|
644
|
+
// Find the socket associated with this client
|
|
645
|
+
const socket = this.io?.sockets.sockets.get(Array.from(this.io.sockets.sockets.keys())[0]);
|
|
646
|
+
if (!socket) {
|
|
647
|
+
logger.error('No socket found for message send');
|
|
648
|
+
return;
|
|
649
|
+
}
|
|
650
|
+
|
|
632
651
|
try {
|
|
633
652
|
client.createConversationItem({
|
|
634
653
|
id: createId(),
|
|
@@ -643,16 +662,24 @@ ${this.getTimeString(socket)}` :
|
|
|
643
662
|
],
|
|
644
663
|
});
|
|
645
664
|
if (response) {
|
|
646
|
-
|
|
665
|
+
try {
|
|
666
|
+
client.createResponse({});
|
|
667
|
+
} catch (error: any) {
|
|
668
|
+
// If we get a concurrent response error, just log it and continue
|
|
669
|
+
if (error.message?.includes('Conversation already has an active response')) {
|
|
670
|
+
logger.log('Skipping response creation - conversation already has active response');
|
|
671
|
+
return;
|
|
672
|
+
}
|
|
673
|
+
throw error;
|
|
674
|
+
}
|
|
647
675
|
}
|
|
648
676
|
} catch (error: any) {
|
|
649
677
|
logger.error(`Error sending user message: ${error.message}`);
|
|
650
678
|
if (error.message === 'Not connected') {
|
|
651
|
-
//
|
|
652
|
-
const
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
}
|
|
679
|
+
// Add to message queue for when we reconnect
|
|
680
|
+
const queue = this.messageQueue.get(socket.id) || [];
|
|
681
|
+
queue.push({ message, response });
|
|
682
|
+
this.messageQueue.set(socket.id, queue);
|
|
656
683
|
}
|
|
657
684
|
}
|
|
658
685
|
}
|
|
@@ -688,27 +715,34 @@ ${this.getTimeString(socket)}` :
|
|
|
688
715
|
}
|
|
689
716
|
}
|
|
690
717
|
|
|
691
|
-
private
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
currentCallId: null
|
|
695
|
-
});
|
|
696
|
-
logger.log(`Initialized function call state for socket ${socketId}`);
|
|
697
|
-
}
|
|
698
|
-
return this.functionCallStates.get(socketId)!;
|
|
699
|
-
}
|
|
700
|
-
|
|
701
|
-
private async executeFunctionCall(socket: Socket, tools: Tools, event: any, state: any, client: RealtimeVoiceClient) {
|
|
718
|
+
private async executeFunctionCall(socket: Socket, tools: Tools, event: any, client: RealtimeVoiceClient) {
|
|
719
|
+
this.clearIdleTimer(socket);
|
|
720
|
+
const currentCallId = this.currentFunctionCall.get(socket.id);
|
|
702
721
|
try {
|
|
703
|
-
|
|
704
|
-
if (
|
|
705
|
-
logger.
|
|
706
|
-
|
|
722
|
+
|
|
723
|
+
if (!this.isInteractive.get(socket.id)) {
|
|
724
|
+
logger.log('Non-interactive function call - executing immediately');
|
|
725
|
+
await tools.executeCall(event.call_id, event.name, event.arguments, socket.data.userId, socket.data.aiName, false);
|
|
726
|
+
this.startIdleTimer(client, socket);
|
|
727
|
+
return;
|
|
728
|
+
}
|
|
729
|
+
|
|
730
|
+
if (currentCallId) {
|
|
731
|
+
logger.log('Function call skipped - another call is already in progress', {
|
|
732
|
+
current: currentCallId,
|
|
707
733
|
attempted: event.call_id
|
|
708
734
|
});
|
|
735
|
+
client.createConversationItem({
|
|
736
|
+
id: createId(),
|
|
737
|
+
type: 'function_call_output',
|
|
738
|
+
call_id: event.call_id,
|
|
739
|
+
output: JSON.stringify({ error: `Function call skipped - another function call ${currentCallId} is in progress` })
|
|
740
|
+
});
|
|
709
741
|
return;
|
|
710
742
|
}
|
|
711
743
|
|
|
744
|
+
this.currentFunctionCall.set(socket.id, event.call_id);
|
|
745
|
+
|
|
712
746
|
// Set up timeout
|
|
713
747
|
const timeoutPromise = new Promise((_, reject) => {
|
|
714
748
|
setTimeout(() => {
|
|
@@ -718,20 +752,23 @@ ${this.getTimeString(socket)}` :
|
|
|
718
752
|
|
|
719
753
|
// Execute the function call with timeout
|
|
720
754
|
await Promise.race([
|
|
721
|
-
tools.executeCall(event.call_id, event.name, event.arguments, socket.data.userId, socket.data.aiName),
|
|
755
|
+
tools.executeCall(event.call_id, event.name, event.arguments, socket.data.userId, socket.data.aiName, true),
|
|
722
756
|
timeoutPromise
|
|
723
757
|
]);
|
|
724
758
|
|
|
725
|
-
// Reset state on success
|
|
726
|
-
state.currentCallId = null;
|
|
727
|
-
this.startIdleTimer(client, socket);
|
|
728
759
|
} catch (error: any) {
|
|
729
760
|
logger.error('Function call failed:', error);
|
|
730
761
|
socket.emit('error', error.message);
|
|
731
|
-
// Reset state on error
|
|
732
|
-
state.currentCallId = null;
|
|
733
|
-
this.startIdleTimer(client, socket);
|
|
734
762
|
throw error;
|
|
763
|
+
|
|
764
|
+
} finally {
|
|
765
|
+
const wasCurrentCall = this.currentFunctionCall.get(socket.id) === event.call_id;
|
|
766
|
+
this.currentFunctionCall.set(socket.id, null);
|
|
767
|
+
// Only reset cycles and start idle timer if this was the current call
|
|
768
|
+
if (wasCurrentCall) {
|
|
769
|
+
this.resetIdleCycles(socket);
|
|
770
|
+
this.startIdleTimer(client, socket);
|
|
771
|
+
}
|
|
735
772
|
}
|
|
736
773
|
}
|
|
737
774
|
}
|