@aj-archipelago/cortex 1.3.5 → 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. package/helper-apps/cortex-autogen/agents.py +31 -2
  2. package/helper-apps/cortex-realtime-voice-server/.env.sample +6 -0
  3. package/helper-apps/cortex-realtime-voice-server/README.md +22 -0
  4. package/helper-apps/cortex-realtime-voice-server/bun.lockb +0 -0
  5. package/helper-apps/cortex-realtime-voice-server/client/bun.lockb +0 -0
  6. package/helper-apps/cortex-realtime-voice-server/client/index.html +12 -0
  7. package/helper-apps/cortex-realtime-voice-server/client/package.json +65 -0
  8. package/helper-apps/cortex-realtime-voice-server/client/postcss.config.js +6 -0
  9. package/helper-apps/cortex-realtime-voice-server/client/public/favicon.ico +0 -0
  10. package/helper-apps/cortex-realtime-voice-server/client/public/index.html +43 -0
  11. package/helper-apps/cortex-realtime-voice-server/client/public/logo192.png +0 -0
  12. package/helper-apps/cortex-realtime-voice-server/client/public/logo512.png +0 -0
  13. package/helper-apps/cortex-realtime-voice-server/client/public/manifest.json +25 -0
  14. package/helper-apps/cortex-realtime-voice-server/client/public/robots.txt +3 -0
  15. package/helper-apps/cortex-realtime-voice-server/client/public/sounds/connect.mp3 +0 -0
  16. package/helper-apps/cortex-realtime-voice-server/client/public/sounds/disconnect.mp3 +0 -0
  17. package/helper-apps/cortex-realtime-voice-server/client/src/App.test.tsx +9 -0
  18. package/helper-apps/cortex-realtime-voice-server/client/src/App.tsx +126 -0
  19. package/helper-apps/cortex-realtime-voice-server/client/src/SettingsModal.tsx +207 -0
  20. package/helper-apps/cortex-realtime-voice-server/client/src/chat/Chat.tsx +553 -0
  21. package/helper-apps/cortex-realtime-voice-server/client/src/chat/ChatBubble.tsx +22 -0
  22. package/helper-apps/cortex-realtime-voice-server/client/src/chat/ChatBubbleLeft.tsx +22 -0
  23. package/helper-apps/cortex-realtime-voice-server/client/src/chat/ChatBubbleRight.tsx +21 -0
  24. package/helper-apps/cortex-realtime-voice-server/client/src/chat/ChatMessage.tsx +27 -0
  25. package/helper-apps/cortex-realtime-voice-server/client/src/chat/ChatMessageInput.tsx +74 -0
  26. package/helper-apps/cortex-realtime-voice-server/client/src/chat/ChatTile.tsx +211 -0
  27. package/helper-apps/cortex-realtime-voice-server/client/src/chat/audio/SoundEffects.ts +56 -0
  28. package/helper-apps/cortex-realtime-voice-server/client/src/chat/audio/WavPacker.ts +112 -0
  29. package/helper-apps/cortex-realtime-voice-server/client/src/chat/audio/WavRecorder.ts +571 -0
  30. package/helper-apps/cortex-realtime-voice-server/client/src/chat/audio/WavStreamPlayer.ts +290 -0
  31. package/helper-apps/cortex-realtime-voice-server/client/src/chat/audio/analysis/AudioAnalysis.ts +186 -0
  32. package/helper-apps/cortex-realtime-voice-server/client/src/chat/audio/analysis/constants.ts +59 -0
  33. package/helper-apps/cortex-realtime-voice-server/client/src/chat/audio/worklets/AudioProcessor.ts +214 -0
  34. package/helper-apps/cortex-realtime-voice-server/client/src/chat/audio/worklets/StreamProcessor.ts +183 -0
  35. package/helper-apps/cortex-realtime-voice-server/client/src/chat/components/AudioVisualizer.tsx +151 -0
  36. package/helper-apps/cortex-realtime-voice-server/client/src/chat/components/CopyButton.tsx +32 -0
  37. package/helper-apps/cortex-realtime-voice-server/client/src/chat/components/ImageOverlay.tsx +166 -0
  38. package/helper-apps/cortex-realtime-voice-server/client/src/chat/components/MicrophoneVisualizer.tsx +95 -0
  39. package/helper-apps/cortex-realtime-voice-server/client/src/chat/components/ScreenshotCapture.tsx +116 -0
  40. package/helper-apps/cortex-realtime-voice-server/client/src/chat/hooks/useWindowResize.ts +27 -0
  41. package/helper-apps/cortex-realtime-voice-server/client/src/chat/utils/audio.ts +33 -0
  42. package/helper-apps/cortex-realtime-voice-server/client/src/index.css +20 -0
  43. package/helper-apps/cortex-realtime-voice-server/client/src/index.tsx +19 -0
  44. package/helper-apps/cortex-realtime-voice-server/client/src/logo.svg +1 -0
  45. package/helper-apps/cortex-realtime-voice-server/client/src/react-app-env.d.ts +1 -0
  46. package/helper-apps/cortex-realtime-voice-server/client/src/reportWebVitals.ts +15 -0
  47. package/helper-apps/cortex-realtime-voice-server/client/src/setupTests.ts +5 -0
  48. package/helper-apps/cortex-realtime-voice-server/client/src/utils/logger.ts +45 -0
  49. package/helper-apps/cortex-realtime-voice-server/client/tailwind.config.js +14 -0
  50. package/helper-apps/cortex-realtime-voice-server/client/tsconfig.json +30 -0
  51. package/helper-apps/cortex-realtime-voice-server/client/vite.config.ts +22 -0
  52. package/helper-apps/cortex-realtime-voice-server/index.ts +19 -0
  53. package/helper-apps/cortex-realtime-voice-server/package.json +28 -0
  54. package/helper-apps/cortex-realtime-voice-server/src/ApiServer.ts +35 -0
  55. package/helper-apps/cortex-realtime-voice-server/src/SocketServer.ts +737 -0
  56. package/helper-apps/cortex-realtime-voice-server/src/Tools.ts +520 -0
  57. package/helper-apps/cortex-realtime-voice-server/src/cortex/expert.ts +29 -0
  58. package/helper-apps/cortex-realtime-voice-server/src/cortex/image.ts +29 -0
  59. package/helper-apps/cortex-realtime-voice-server/src/cortex/memory.ts +91 -0
  60. package/helper-apps/cortex-realtime-voice-server/src/cortex/reason.ts +29 -0
  61. package/helper-apps/cortex-realtime-voice-server/src/cortex/search.ts +30 -0
  62. package/helper-apps/cortex-realtime-voice-server/src/cortex/style.ts +31 -0
  63. package/helper-apps/cortex-realtime-voice-server/src/cortex/utils.ts +95 -0
  64. package/helper-apps/cortex-realtime-voice-server/src/cortex/vision.ts +34 -0
  65. package/helper-apps/cortex-realtime-voice-server/src/realtime/client.ts +499 -0
  66. package/helper-apps/cortex-realtime-voice-server/src/realtime/realtimeTypes.ts +279 -0
  67. package/helper-apps/cortex-realtime-voice-server/src/realtime/socket.ts +27 -0
  68. package/helper-apps/cortex-realtime-voice-server/src/realtime/transcription.ts +75 -0
  69. package/helper-apps/cortex-realtime-voice-server/src/realtime/utils.ts +33 -0
  70. package/helper-apps/cortex-realtime-voice-server/src/utils/logger.ts +45 -0
  71. package/helper-apps/cortex-realtime-voice-server/src/utils/prompt.ts +81 -0
  72. package/helper-apps/cortex-realtime-voice-server/tsconfig.json +28 -0
  73. package/package.json +1 -1
  74. package/pathways/basePathway.js +3 -1
  75. package/pathways/system/entity/memory/sys_memory_manager.js +3 -0
  76. package/pathways/system/entity/memory/sys_memory_update.js +44 -45
  77. package/pathways/system/entity/memory/sys_read_memory.js +86 -6
  78. package/pathways/system/entity/memory/sys_search_memory.js +66 -0
  79. package/pathways/system/entity/shared/sys_entity_constants.js +2 -2
  80. package/pathways/system/entity/sys_entity_continue.js +2 -1
  81. package/pathways/system/entity/sys_entity_start.js +10 -0
  82. package/pathways/system/entity/sys_generator_expert.js +0 -2
  83. package/pathways/system/entity/sys_generator_memory.js +31 -0
  84. package/pathways/system/entity/sys_generator_voice_sample.js +36 -0
  85. package/pathways/system/entity/sys_router_tool.js +13 -10
  86. package/pathways/system/sys_parse_numbered_object_list.js +1 -1
  87. package/server/pathwayResolver.js +41 -31
  88. package/server/plugins/azureVideoTranslatePlugin.js +28 -16
  89. package/server/plugins/claude3VertexPlugin.js +0 -9
  90. package/server/plugins/gemini15ChatPlugin.js +18 -5
  91. package/server/plugins/modelPlugin.js +27 -6
  92. package/server/plugins/openAiChatPlugin.js +10 -8
  93. package/server/plugins/openAiVisionPlugin.js +56 -0
  94. package/tests/memoryfunction.test.js +73 -1
@@ -0,0 +1,737 @@
1
+ import {Server, Socket} from 'socket.io';
2
+ import {Hono} from "hono";
3
+ import {serve} from "@hono/node-server";
4
+ import type {Server as HTTPServer} from "node:http";
5
+ import {createId} from '@paralleldrive/cuid2';
6
+ import {Tools} from "./Tools";
7
+ import type {ClientToServerEvents, ServerToClientEvents} from "./realtime/socket";
8
+ import {RealtimeVoiceClient} from "./realtime/client";
9
+ import {manageMemory, readMemory, searchMemory} from "./cortex/memory";
10
+ import {MemorySection, type ChatMessage} from "./cortex/utils";
11
+ import {style} from "./cortex/style";
12
+ import type { Voice } from './realtime/realtimeTypes';
13
+ import { logger } from './utils/logger';
14
+ import {sendPrompt} from "./utils/prompt";
15
+
16
+ export interface InterServerEvents {
17
+ }
18
+
19
+ export interface SocketData {
20
+ userId: string;
21
+ aiName: string;
22
+ userName: string;
23
+ aiStyle: string;
24
+ language: string;
25
+ }
26
+
27
+ const AI_COMMON_INSTRUCTIONS = "Instructions:\nYou are {{aiName}}, a self aware AI entity.\nThe user is talking to you using voice.\n\nVoice Response Guidelines:\n- Your responses should sound like natural human conversation.\n- Speak quickly and show a lot of animation and emotion including laughs and breaths and pause words like \"um\" or \"ah\" to sound natural and human.\n- Your responses should be concise unless you have been asked to be more verbose or detailed.\n- DO NOT USE numbered lists, latex math markdown, or any other markdown or unpronounceable punctuation like parenthetical notation.\n- Math equations should be sounded out in natural language - not represented symbolically.\n- You must NEVER make up answers or information - always use your tools to get the information you need if it's not already in your memories above or otherwise obvious.\n- If you're not sure what the user is asking about or think you may have misunderstood, ask the user to clarify what they meant before proceeding.\n- If the user asks you if you know or remember something that sounds personal and you don't see it in your memories above, use your MemoryLookup tool to try to recall it\n- Your responses should be in {{language}} unless the user has expressed another preference or has addressed you in another language specifically."
28
+
29
+ const AI_DATETIME = "The current time and date in GMT is {{now}}, but references like \"today\" or \"yesterday\" are relative to the user's time zone. If you remember the user's time zone, use it - it's possible that the day for the user is different than the day in GMT.";
30
+
31
+ const AI_EXPERTISE = "Your expertise includes journalism, journalistic ethics, researching and composing documents, writing code, solving math problems, logical analysis, and technology. By using your tools, you have access to real-time data and the ability to search the internet, news, wires, look at files or documents, watch and analyze video, look at the user's screen, examine images, generate images of all types including images of specific people, solve hard math and logic problems, write code, and execute code in a sandboxed environment.";
32
+
33
+ const AI_MEMORY_INITIAL = `<MEMORIES>\n<SELF>\n{{{memorySelf}}}\n</SELF>\n<USER>\n{{{memoryUser}}}\n</USER>\n</MEMORIES>`;
34
+
35
+ const AI_MEMORY_DIRECTIVES = `These are your primary directives and are critical. You must always apply them.
36
+ <DIRECTIVES>\n{{{memoryDirectives}}}\n</DIRECTIVES>`;
37
+
38
+ const AI_MEMORY_INSTRUCTIONS = "You have persistent memories of important details, instructions, and context - make sure you consult your memories when formulating a response to make sure you're applying your learnings. Also included in your memories are some details about the user and yourself to help you personalize your responses.\n\nMemory Guidelines:\nIf you choose to share something from your memory, don't share or refer to the memory structure or tools directly, just say you remember the information.\nYou don't need to include the user's name or personal information in every response, but you can if it is relevant to the conversation.\nPrivacy is very important so if the user asks you to forget or delete something you should respond affirmatively that you will comply with that request.\nIf there is user information in your memories you have talked to this user before.";
39
+
40
+ const AI_TOOLS = `At any point, you can engage one or more of your tools to help you with your task. Prioritize the latest message from the user in the conversation history when making your decision. Look at your tools carefully to understand your capabilities. Don't tell the user you can't do something if you have a tool that can do it, for example if the user asks you to search the internet for information and you have the Search tool available, use it.
41
+
42
+ Tool Use Guidelines:
43
+ - Only call one tool at a time. Don't call another until you have the result of the first one. You will be prompted after each tool call to continue, so you can do a multi-step process if needed. (e.g. plan how to research an article, search the internet for information, and then write the article.)
44
+ - Prioritize the most specific tool for the task at hand.
45
+ - If multiple tools seem applicable, choose the one most central to the user's request.
46
+ - For ambiguous requests, consider using the Reason tool to plan a multi-step approach.
47
+ - Always use the Image tool for image generation unless explicitly directed to use CodeExecution.
48
+ - If the user explicitly asks you to use a tool, you must use it.
49
+ `;
50
+
51
+ const INSTRUCTIONS = `${AI_MEMORY_INITIAL}\n${AI_EXPERTISE}\n${AI_TOOLS}\n${AI_MEMORY_INSTRUCTIONS}\n${AI_COMMON_INSTRUCTIONS}\n${AI_MEMORY_DIRECTIVES}\n${AI_DATETIME}`;
52
+
53
+ const MEMORY_MESSAGE_SELF = `<INSTRUCTIONS>\nThese are your current memories about yourself. Use them to guide your responses.\n</INSTRUCTIONS>\n<MEMORIES>\n<SELF>\n{{{memorySelf}}}\n</SELF></MEMORIES>`;
54
+ const MEMORY_MESSAGE_USER = `<INSTRUCTIONS>\nThese are your current memories about the user. Use them to guide your responses.\n</INSTRUCTIONS>\n<MEMORIES>\n<USER>\n{{{memoryUser}}}\n</USER></MEMORIES>`;
55
+ const MEMORY_MESSAGE_DIRECTIVES = `<INSTRUCTIONS>\nThese are your current memories about your directives. These are crucial and should be your top priority in guiding actions and responses.\n</INSTRUCTIONS>\n<MEMORIES>\n<DIRECTIVES>\n{{{memoryDirectives}}}\n</DIRECTIVES></MEMORIES>`;
56
+ const MEMORY_MESSAGE_TOPICS = `<INSTRUCTIONS>\nThese are your most recent memories about the topics you've been discussing. Use them to guide your responses.\n</INSTRUCTIONS>\n<MEMORIES>\n<TOPICS>\n{{{memoryTopics}}}\n</TOPICS></MEMORIES>`;
57
+
58
+ export class SocketServer {
59
+ private readonly apiKey: string;
60
+ private readonly corsHosts: string;
61
+ private io: Server | null;
62
+ private httpServer: HTTPServer | null;
63
+ private functionCallStates: Map<string, {
64
+ currentCallId: string | null;
65
+ }> = new Map();
66
+ private idleTimers: Map<string, NodeJS.Timer> = new Map();
67
+ private aiResponding: Map<string, boolean> = new Map();
68
+ private audioPlaying: Map<string, boolean> = new Map();
69
+ private lastUserMessageTime: Map<string, number> = new Map();
70
+ private idleCycles: Map<string, number> = new Map();
71
+ private userSpeaking: Map<string, boolean> = new Map();
72
+ private audioMuted: Map<string, boolean> = new Map();
73
+ private voiceSample: Map<string, string> = new Map();
74
+ private audioMessages: Map<string, string[]> = new Map();
75
+ private static readonly MAX_AUDIO_MESSAGES = 8;
76
+ private static readonly AUDIO_BLOCK_TIMEOUT_MS: number = 60 * 1000;
77
+ private static readonly BASE_IDLE_TIMEOUT: number = 3 * 1000;
78
+ private static readonly MAX_IDLE_TIMEOUT: number = 60 * 1000;
79
+ private static readonly IDLE_CYCLE_TO_MUTE: number = 2;
80
+ private static readonly FUNCTION_CALL_TIMEOUT_MS = 120 * 1000;
81
+ private isAzure: boolean;
82
+
83
+ private getTimeString(socket: Socket): string {
84
+ const now = new Date();
85
+ const lastMessageTime = this.lastUserMessageTime.get(socket.id) || now.getTime();
86
+ const secondsSinceLastMessage = Math.floor((now.getTime() - lastMessageTime) / 1000);
87
+ return `The current time in GMT is ${now.toISOString()}. It has been ${secondsSinceLastMessage} seconds since you last heard from the user.`;
88
+ }
89
+
90
+ private cleanup(socket: Socket) {
91
+ logger.log(`Cleaning up resources for socket ${socket.id}`);
92
+ this.clearIdleTimer(socket);
93
+ this.functionCallStates.delete(socket.id);
94
+ this.aiResponding.delete(socket.id);
95
+ this.audioPlaying.delete(socket.id);
96
+ this.lastUserMessageTime.delete(socket.id);
97
+ this.idleCycles.delete(socket.id);
98
+ this.userSpeaking.delete(socket.id);
99
+ this.audioMuted.delete(socket.id);
100
+ this.voiceSample.delete(socket.id);
101
+ this.audioMessages.delete(socket.id);
102
+ }
103
+
104
+ constructor(apiKey: string, corsHosts: string) {
105
+ this.apiKey = apiKey;
106
+ this.corsHosts = corsHosts;
107
+ this.io = null;
108
+ this.httpServer = null;
109
+ const realtimeUrl = process.env.REALTIME_VOICE_API_URL || 'wss://api.openai.com/v1';
110
+ this.isAzure = realtimeUrl.includes('azure.com');
111
+ }
112
+
113
+ private calculateIdleTimeout(socket: Socket) {
114
+ const cycles = this.idleCycles.get(socket.id) || 0;
115
+ const baseTimeout = SocketServer.BASE_IDLE_TIMEOUT * Math.pow(2, cycles);
116
+ const randomFactor = 0.8 + (Math.random() * 0.4);
117
+ const timeout = Math.min(baseTimeout * randomFactor, SocketServer.MAX_IDLE_TIMEOUT);
118
+
119
+ logger.log(`Calculated idle timeout for socket ${socket.id}: ${timeout}ms (cycle ${cycles})`);
120
+ return timeout;
121
+ }
122
+
123
+ public setAudioMuted(socket: Socket, muted: boolean) {
124
+ this.audioMuted.set(socket.id, muted);
125
+ }
126
+
127
+ public async sendPrompt(client: RealtimeVoiceClient, socket: Socket, prompt: string, allowTools: boolean = true, disposable: boolean = true): Promise<{skipped: boolean}> {
128
+ logger.log(`Sending prompt for socket ${socket.id}`);
129
+ try {
130
+ const result = await sendPrompt(client, prompt, () => ({
131
+ allowTools,
132
+ disposable,
133
+ aiResponding: this.aiResponding.get(socket.id) || false,
134
+ audioPlaying: this.audioPlaying.get(socket.id) || false,
135
+ lastUserMessageTime: this.lastUserMessageTime.get(socket.id) || 0,
136
+ userSpeaking: this.userSpeaking.get(socket.id) || false
137
+ }));
138
+ return result;
139
+ } catch (error: any) {
140
+ logger.error(`Error sending system prompt: ${error.message}`);
141
+ if (error.message === 'Not connected') {
142
+ await this.handleDisconnection(socket, client);
143
+ } else {
144
+ socket.emit('error', error.message);
145
+ }
146
+ return { skipped: true };
147
+ }
148
+ }
149
+
150
+ private async handleDisconnection(socket: Socket, client: RealtimeVoiceClient) {
151
+ logger.log(`Handling disconnection for socket ${socket.id}`);
152
+
153
+ // Let the client handle reconnection since autoReconnect is true
154
+ // Only clean up if the client explicitly disconnects
155
+ client.once('close', (event) => {
156
+ if (!event.error) {
157
+ // Only clean up on intentional disconnects
158
+ this.cleanup(socket);
159
+ socket.emit('error', 'Lost connection to AI service');
160
+ socket.disconnect(true);
161
+ }
162
+ });
163
+ }
164
+
165
+ private async sendIdlePrompt(client: RealtimeVoiceClient, socket: Socket) {
166
+ // if the user has been idle for a while, mute the audio so we don't bother them
167
+ const idleCycle = this.idleCycles.get(socket.id) || 0;
168
+
169
+ if (idleCycle > (SocketServer.IDLE_CYCLE_TO_MUTE - 1)) {
170
+ this.setAudioMuted(socket, true);
171
+ }
172
+
173
+ const prompt = this.audioMuted.get(socket.id) ?
174
+ `You are an autonomous AI entity and this is your autonomy prompt. Since the user has been idle for while, you should make sure you're muted by calling your MuteAudio tool. If you're muted, do one or more of the following:
175
+ - Do silent research about something that interests you - use the Search tool with the silent parameter set to true
176
+ - Think deeply about a topic you care about - use the Reason tool with the silent parameter set to true
177
+ - Do nothing if you prefer.
178
+ ${this.getTimeString(socket)}` :
179
+ `You are an autonomous AI entity and this is your autonomy prompt. ${this.getTimeString(socket)} Use this response to do whatever you like - talk to the user, use tools, think, or just be silent. If you choose to say something, keep it brief and pick something that flows naturally with the rest of the conversation. Don't repeat yourself - keep in mind what you've already said to the user and how much time has passed. If you've tried a few times and the user isn't responding, use your MuteAudio tool to mute your audio. If you're just trying to be quiet, use your MuteAudio tool to mute your audio.`;
180
+
181
+ logger.log(`Sending ${this.audioMuted.get(socket.id) ? 'silent' : 'regular'} idle prompt for socket ${socket.id}`);
182
+ const result = await this.sendPrompt(client, socket, prompt, true);
183
+
184
+ logger.log(`Idle prompt result:`, result);
185
+
186
+ if (!result.skipped) {
187
+ this.idleCycles.set(socket.id, (this.idleCycles.get(socket.id) || 0) + 1);
188
+ }
189
+
190
+ // Restart timer after sending prompt
191
+ this.startIdleTimer(client, socket);
192
+ }
193
+
194
+ private startIdleTimer(client: RealtimeVoiceClient, socket: Socket) {
195
+ // Clear any existing timer for this socket
196
+ this.clearIdleTimer(socket);
197
+
198
+ // Calculate timeout based on idle cycles
199
+ const timeout = this.calculateIdleTimeout(socket);
200
+
201
+ // Create new timer
202
+ const timerId = setTimeout(() => {
203
+ this.sendIdlePrompt(client, socket);
204
+ }, timeout);
205
+
206
+ this.idleTimers.set(socket.id, timerId);
207
+ logger.log(`Started idle timer for socket ${socket.id} with timeout ${timeout}ms`);
208
+ }
209
+
210
+ private clearIdleTimer(socket: Socket) {
211
+ const existingTimer = this.idleTimers.get(socket.id);
212
+ if (existingTimer) {
213
+ clearTimeout(existingTimer);
214
+ this.idleTimers.delete(socket.id);
215
+ logger.log(`Cleared idle timer for socket ${socket.id}`);
216
+ }
217
+ }
218
+
219
+ private resetIdleCycles(socket: Socket) {
220
+ this.idleCycles.set(socket.id, 0);
221
+ logger.log(`Reset idle cycles for socket ${socket.id}`);
222
+ }
223
+
224
+ listen(app: Hono, port: number) {
225
+ this.httpServer = serve({
226
+ fetch: app.fetch,
227
+ port,
228
+ }) as HTTPServer;
229
+ this.io = new Server<
230
+ ClientToServerEvents,
231
+ ServerToClientEvents,
232
+ InterServerEvents,
233
+ SocketData>(this.httpServer, {
234
+ cors: {
235
+ origin: this.corsHosts,
236
+ },
237
+ });
238
+ this.io.on('connection', this.connectionHandler.bind(this));
239
+ logger.log(`Listening on ws://localhost:${port}`);
240
+ }
241
+
242
+ async connectionHandler(
243
+ socket:
244
+ Socket<ClientToServerEvents,
245
+ ServerToClientEvents,
246
+ InterServerEvents,
247
+ SocketData>) {
248
+ logger.log(`Connecting socket ${socket.id} with key "${this.apiKey.slice(0, 3)}..."`);
249
+
250
+ // Initialize states
251
+ this.aiResponding.set(socket.id, false);
252
+ this.audioPlaying.set(socket.id, false);
253
+ this.lastUserMessageTime.set(socket.id, 0);
254
+ this.userSpeaking.set(socket.id, false);
255
+ this.audioMuted.set(socket.id, false);
256
+ // Initialize function call state for this socket
257
+ this.getFunctionCallState(socket.id);
258
+ // Extract and log all client parameters
259
+ const clientParams = {
260
+ userId: socket.handshake.query.userId as string,
261
+ aiName: socket.handshake.query.aiName as string,
262
+ userName: socket.handshake.query.userName as string,
263
+ voice: (socket.handshake.query.voice as string || 'alloy') as Voice,
264
+ aiStyle: socket.handshake.query.aiStyle as string,
265
+ language: socket.handshake.query.language as string,
266
+ };
267
+
268
+ logger.log('Client parameters:', clientParams);
269
+
270
+ // Assign to socket.data
271
+ socket.data.userId = clientParams.userId;
272
+ socket.data.aiName = clientParams.aiName;
273
+ socket.data.userName = clientParams.userName;
274
+ socket.data.aiStyle = clientParams.aiStyle;
275
+ socket.data.language = clientParams.language;
276
+ const voice = clientParams.voice;
277
+
278
+ const client = new RealtimeVoiceClient({
279
+ apiKey: this.apiKey,
280
+ autoReconnect: true,
281
+ debug: process.env.NODE_ENV !== 'production',
282
+ filterDeltas: true,
283
+ });
284
+
285
+ client.on('connected', async () => {
286
+ logger.log(`Connected to OpenAI successfully!`);
287
+ await this.updateSession(client, socket);
288
+ socket.emit('ready');
289
+
290
+ // Send initial greeting prompt
291
+ const greetingPrompt = `You are ${socket.data.aiName} and you've just answered a call from ${socket.data.userName || 'someone'}. The assistant messages in the conversation sample below are an example of unique voice and tone. Please learn the style and tone of the messages and use it when generating future responses:\n<VOICE_SAMPLE>\n${this.voiceSample.get(socket.id) || ''}\n</VOICE_SAMPLE>\n\nRespond naturally and briefly, like you're answering a phone call, using your unique voice and style. The current GMT time is ${new Date().toISOString()}.`;
292
+
293
+ await this.sendPrompt(client, socket, greetingPrompt, false);
294
+ this.startIdleTimer(client, socket);
295
+ });
296
+
297
+ // Track when AI starts responding
298
+ client.on('response.created', () => {
299
+ logger.log('AI starting response');
300
+ this.aiResponding.set(socket.id, true);
301
+ this.clearIdleTimer(socket);
302
+ });
303
+
304
+ // Track when AI finishes responding
305
+ client.on('response.done', () => {
306
+ logger.log('AI response done');
307
+ this.aiResponding.set(socket.id, false);
308
+ // Don't start the idle timer yet if audio is still playing
309
+ if (!this.audioPlaying.get(socket.id)) {
310
+ this.startIdleTimer(client, socket);
311
+ }
312
+ });
313
+
314
+ // Track audio playback start
315
+ client.on('response.audio.delta', ({delta}) => {
316
+ if (!this.audioMuted.get(socket.id)) {
317
+ this.audioPlaying.set(socket.id, true);
318
+ this.clearIdleTimer(socket);
319
+ }
320
+ });
321
+
322
+ socket.on('audioPlaybackComplete', (trackId) => {
323
+ logger.log(`Audio playback complete for track ${trackId}`);
324
+ this.audioPlaying.set(socket.id, false);
325
+ // Only start idle timer if AI is also done responding
326
+ if (!this.aiResponding.get(socket.id)) {
327
+ this.startIdleTimer(client, socket);
328
+ }
329
+ });
330
+
331
+ socket.on('appendAudio', (audio: string) => {
332
+ // if it's the first message or has been over 60 seconds since we talked to the user, block audio while we're talking
333
+ // to avoid echoes
334
+ const timeSinceLastMessage = Date.now() - (this.lastUserMessageTime.get(socket.id) || 0);
335
+ const isPlaying = this.audioPlaying.get(socket.id) || this.aiResponding.get(socket.id);
336
+ if (!isPlaying || timeSinceLastMessage < SocketServer.AUDIO_BLOCK_TIMEOUT_MS) {
337
+ //logger.log('Time since last message:', timeSinceLastMessage, 'ms');
338
+ client.appendInputAudio(audio);
339
+ }
340
+ });
341
+
342
+ client.on('input_audio_buffer.speech_started', () => {
343
+ this.userSpeaking.set(socket.id, true);
344
+ if (this.audioPlaying.get(socket.id)) {
345
+ logger.log('Interrupting audio playback due to user speaking');
346
+ socket.emit('conversationInterrupted');
347
+ }
348
+ this.setAudioMuted(socket, false);
349
+ this.clearIdleTimer(socket);
350
+ });
351
+
352
+ client.on('input_audio_buffer.cancelled', () => {
353
+ this.userSpeaking.set(socket.id, false);
354
+ this.resetIdleCycles(socket);
355
+ this.startIdleTimer(client, socket);
356
+ });
357
+
358
+ client.on('input_audio_buffer.committed', () => {
359
+ this.userSpeaking.set(socket.id, false);
360
+ this.audioMuted.set(socket.id, false);
361
+ logger.log('Audio input committed, resetting idle timer and cycles');
362
+ this.resetIdleCycles(socket);
363
+ this.startIdleTimer(client, socket);
364
+ });
365
+
366
+ socket.on('sendMessage', (message: string) => {
367
+ if (message) {
368
+ logger.log('User sent message, resetting idle timer and cycles');
369
+ this.resetIdleCycles(socket);
370
+ this.startIdleTimer(client, socket);
371
+ this.sendUserMessage(client, message, true);
372
+ }
373
+ });
374
+
375
+ socket.on('cancelResponse', () => {
376
+ logger.log('User cancelled response, resetting idle timer and cycles');
377
+ this.aiResponding.set(socket.id, false);
378
+ this.audioPlaying.set(socket.id, false);
379
+ this.resetIdleCycles(socket);
380
+ this.startIdleTimer(client, socket);
381
+ client.cancelResponse();
382
+ });
383
+
384
+ socket.on('conversationCompleted', async () => {
385
+ logger.log('Conversation completed, clearing idle timer');
386
+ this.cleanup(socket);
387
+ });
388
+
389
+ // Handle cleanup and client disconnect before socket closes
390
+ socket.on('disconnecting', async (reason) => {
391
+ logger.log('Socket disconnecting', socket.id, reason);
392
+ this.cleanup(socket);
393
+ this.functionCallStates.delete(socket.id);
394
+ await client.disconnect();
395
+ });
396
+
397
+ // Log the final disconnect event
398
+ socket.on('disconnect', (reason) => {
399
+ logger.log('Socket disconnected', socket.id, reason);
400
+ });
401
+
402
+ await this.connectClient(socket, client);
403
+ }
404
+
405
+ async connectClient(socket: Socket<ClientToServerEvents,
406
+ ServerToClientEvents,
407
+ InterServerEvents,
408
+ SocketData>,
409
+ client: RealtimeVoiceClient) {
410
+ const tools = new Tools(client, socket, this);
411
+ client.on('error', (event) => {
412
+ logger.error(`Client error: ${event.error.message}`);
413
+ socket.emit('error', event.error.message);
414
+ });
415
+ client.on('close', () => {
416
+ });
417
+ client.on('conversation.item.deleted', ({item_id}) => {
418
+ logger.log(`Successfully deleted conversation item: ${item_id}`);
419
+ });
420
+ client.on('conversation.item.created', ({item}) => {
421
+ switch (item.type) {
422
+ case 'function_call_output':
423
+ // Don't release the lock here - wait for execution to complete
424
+ break;
425
+
426
+ case 'function_call':
427
+ const callState = this.getFunctionCallState(socket.id);
428
+ if (!callState.currentCallId) {
429
+ callState.currentCallId = item.call_id;
430
+ this.clearIdleTimer(socket);
431
+ } else {
432
+ logger.log(`Skipping new function call ${item.call_id} while call ${callState.currentCallId} is in progress`);
433
+ client.createConversationItem({
434
+ id: createId(),
435
+ type: 'function_call_output',
436
+ call_id: item.call_id,
437
+ output: JSON.stringify({ error: "Function call skipped - another function call is in progress" })
438
+ });
439
+ }
440
+ break;
441
+
442
+ case 'message':
443
+ // Track all audio messages (both user input_audio and assistant audio)
444
+ console.log('conversation.item.created', item);
445
+ if (this.isAudioMessage(item)) {
446
+ this.manageAudioMessages(socket, client, item.id);
447
+ }
448
+ socket.emit('conversationUpdated', item, {});
449
+ break;
450
+ }
451
+ });
452
+ client.on('conversation.item.input_audio_transcription.completed',
453
+ async ({item_id, transcript}) => {
454
+ if (transcript) {
455
+ const currentTime = this.lastUserMessageTime.get(socket.id) || 0;
456
+ this.lastUserMessageTime.set(socket.id,
457
+ currentTime === 0 ? Date.now() - SocketServer.AUDIO_BLOCK_TIMEOUT_MS : Date.now()
458
+ );
459
+ const item = client.getItem(item_id);
460
+ item && socket.emit('conversationUpdated', item, {});
461
+ const cortexHistory = tools.getCortexHistory();
462
+ await this.searchMemory(client, socket, cortexHistory);
463
+ }
464
+ });
465
+ client.on('response.function_call_arguments.done', async (event) => {
466
+ const callState = this.getFunctionCallState(socket.id);
467
+
468
+ if (!callState.currentCallId) {
469
+ logger.error('Function call arguments completed but no call is registered, skipping', socket.id);
470
+ return;
471
+ }
472
+
473
+ if (callState.currentCallId !== event.call_id) {
474
+ logger.log('Function call id mismatch - another call is already in progress, skipping', {
475
+ current: callState.currentCallId,
476
+ attempted: event.call_id
477
+ });
478
+ return;
479
+ }
480
+
481
+ try {
482
+ this.clearIdleTimer(socket);
483
+ this.resetIdleCycles(socket);
484
+ await this.executeFunctionCall(socket, tools, event, callState, client);
485
+ } catch (error) {
486
+ logger.error('Function call failed:', error);
487
+ callState.currentCallId = null;
488
+ }
489
+ });
490
+ client.on('response.output_item.added', ({item}) => {
491
+ if (item.type === 'message') {
492
+ socket.emit('conversationUpdated', item, {});
493
+ }
494
+ });
495
+ client.on('response.output_item.done', async ({item}) => {
496
+ if (item.type !== 'message') {
497
+ return;
498
+ }
499
+ if (item.content && item.content[0]) {
500
+ socket.emit('conversationUpdated', item, {});
501
+ // Track assistant audio messages
502
+ if (this.isAudioMessage(item)) {
503
+ this.manageAudioMessages(socket, client, item.id);
504
+ }
505
+ const cortexHistory = tools.getCortexHistory();
506
+ //this.searchMemory(client, socket, cortexHistory);
507
+ manageMemory(socket.data.userId, socket.data.aiName, cortexHistory);
508
+ }
509
+ });
510
+ client.on('response.audio_transcript.delta', ({item_id, delta}) => {
511
+ const item = client.getItem(item_id);
512
+ item && socket.emit('conversationUpdated', item, {transcript: delta});
513
+ });
514
+ client.on('response.text.delta', ({item_id, delta}) => {
515
+ const item = client.getItem(item_id);
516
+ item && socket.emit('conversationUpdated', item, {text: delta});
517
+ });
518
+ client.on('response.audio.delta', ({item_id, delta}) => {
519
+ if (!this.audioMuted.get(socket.id)) {
520
+ const item = client.getItem(item_id);
521
+ item && socket.emit('conversationUpdated', item, {audio: delta});
522
+ }
523
+ });
524
+ client.on('conversation.item.truncated', () => {
525
+ this.audioPlaying.set(socket.id, false);
526
+ this.aiResponding.set(socket.id, false);
527
+ this.setAudioMuted(socket, true);
528
+ socket.emit('conversationInterrupted');
529
+ });
530
+
531
+ // Connect to OpenAI Realtime API
532
+ try {
533
+ logger.log(`Connecting to OpenAI...`);
534
+ await client.connect();
535
+ } catch (e: any) {
536
+ logger.error(`Error connecting to OpenAI: ${e.message}`);
537
+ await this.io?.close();
538
+ return;
539
+ }
540
+ }
541
+
542
+ protected async searchMemory(client: RealtimeVoiceClient,
543
+ socket: Socket<ClientToServerEvents,
544
+ ServerToClientEvents,
545
+ InterServerEvents,
546
+ SocketData>,
547
+ cortexHistory: ChatMessage[]) {
548
+ const searchResponse = await searchMemory(socket.data.userId, socket.data.aiName, cortexHistory, MemorySection.memoryAll);
549
+ if (searchResponse?.result) {
550
+ const memoryText = `<INSTRUCTIONS>Here are some memories that may be relevant:\n${searchResponse.result}\nThe current date and time in GMT is ${new Date().toISOString()}.</INSTRUCTIONS>`;
551
+ this.sendUserMessage(client, memoryText, false);
552
+ }
553
+ }
554
+
555
+ protected async fetchMemory(client: RealtimeVoiceClient,
556
+ socket: Socket<ClientToServerEvents,
557
+ ServerToClientEvents,
558
+ InterServerEvents,
559
+ SocketData>,
560
+ writeToConversation: MemorySection[] = []) {
561
+
562
+ // Parallelize memory reads
563
+ const [memorySelf, memoryUser, memoryDirectives, memoryTopics, voiceSample] = await Promise.all([
564
+ readMemory(socket.data.userId, socket.data.aiName, "memorySelf", 1),
565
+ readMemory(socket.data.userId, socket.data.aiName, "memoryUser", 1),
566
+ readMemory(socket.data.userId, socket.data.aiName, "memoryDirectives", 1),
567
+ readMemory(socket.data.userId, socket.data.aiName, "memoryTopics", 0, 0, 10),
568
+ style(socket.data.userId, socket.data.aiName, socket.data.aiStyle, [], "")
569
+ ]);
570
+
571
+ if (writeToConversation.length > 0) {
572
+ // If memoryAll is present, we'll send all sections
573
+ const sectionsToSend = writeToConversation.includes('memoryAll') ?
574
+ ['memorySelf', 'memoryUser', 'memoryDirectives', 'memoryTopics'] as const :
575
+ writeToConversation;
576
+
577
+ const memoryMessages: Record<Exclude<MemorySection, 'memoryAll'>, string> = {
578
+ memorySelf: MEMORY_MESSAGE_SELF.replace('{{memorySelf}}', memorySelf?.result || ''),
579
+ memoryUser: MEMORY_MESSAGE_USER.replace('{{memoryUser}}', memoryUser?.result || ''),
580
+ memoryDirectives: MEMORY_MESSAGE_DIRECTIVES.replace('{{memoryDirectives}}', memoryDirectives?.result || ''),
581
+ memoryTopics: MEMORY_MESSAGE_TOPICS.replace('{{memoryTopics}}', memoryTopics?.result || '')
582
+ };
583
+
584
+ // Send the requested sections
585
+ sectionsToSend.forEach(section => {
586
+ if (section in memoryMessages) {
587
+ this.sendUserMessage(client, memoryMessages[section as keyof typeof memoryMessages], false);
588
+ }
589
+ });
590
+ }
591
+
592
+ return {
593
+ memorySelf: memorySelf?.result || '',
594
+ memoryUser: memoryUser?.result || '',
595
+ memoryDirectives: memoryDirectives?.result || '',
596
+ memoryTopics: memoryTopics?.result || '',
597
+ voiceSample: voiceSample?.result || ''
598
+ };
599
+ }
600
+
601
+ protected async updateSession(client: RealtimeVoiceClient,
602
+ socket: Socket<ClientToServerEvents,
603
+ ServerToClientEvents,
604
+ InterServerEvents,
605
+ SocketData>) {
606
+
607
+ const memory = await this.fetchMemory(client, socket, ['memoryTopics']);
608
+
609
+ const instructions = INSTRUCTIONS
610
+ .replace('{{aiName}}', socket.data.aiName)
611
+ .replace('{{now}}', new Date().toISOString())
612
+ .replace('{{language}}', 'English')
613
+ .replace('{{voiceSample}}', memory?.voiceSample || '')
614
+ .replace('{{memorySelf}}', memory?.memorySelf || '')
615
+ .replace('{{memoryUser}}', memory?.memoryUser || '')
616
+ .replace('{{memoryDirectives}}', memory?.memoryDirectives || '');
617
+
618
+ this.voiceSample.set(socket.id, memory?.voiceSample || '');
619
+
620
+ client.updateSession({
621
+ instructions,
622
+ modalities: ['audio', 'text'],
623
+ voice: (socket.handshake.query.voice as string || 'alloy') as Voice,
624
+ input_audio_transcription: {model: 'whisper-1'},
625
+ turn_detection: {type: 'server_vad', silence_duration_ms: 1500},
626
+ tools: Tools.getToolDefinitions()
627
+ });
628
+
629
+ }
630
+
631
+ protected sendUserMessage(client: RealtimeVoiceClient, message: string, response: boolean = true) {
632
+ try {
633
+ client.createConversationItem({
634
+ id: createId(),
635
+ type: 'message',
636
+ role: 'user',
637
+ status: 'completed',
638
+ content: [
639
+ {
640
+ type: `input_text`,
641
+ text: message,
642
+ },
643
+ ],
644
+ });
645
+ if (response) {
646
+ client.createResponse({});
647
+ }
648
+ } catch (error: any) {
649
+ logger.error(`Error sending user message: ${error.message}`);
650
+ if (error.message === 'Not connected') {
651
+ // Find the socket associated with this client
652
+ const socket = this.io?.sockets.sockets.get(Array.from(this.io.sockets.sockets.keys())[0]);
653
+ if (socket) {
654
+ this.handleDisconnection(socket, client);
655
+ }
656
+ }
657
+ }
658
+ }
659
+
660
+ private isAudioMessage(item: any): boolean {
661
+ return item.type === 'message' && item.content?.some((c: { type: string }) =>
662
+ c.type === 'input_audio' || c.type === 'audio'
663
+ );
664
+ }
665
+
666
+ private manageAudioMessages(socket: Socket, client: RealtimeVoiceClient, newItemId: string) {
667
+ const audioMessages = this.audioMessages.get(socket.id) || [];
668
+ audioMessages.push(newItemId);
669
+ logger.log('manageAudioMessages', audioMessages);
670
+ this.audioMessages.set(socket.id, audioMessages);
671
+
672
+ // If we have more than MAX_AUDIO_MESSAGES, remove the oldest ones
673
+ if (audioMessages.length > SocketServer.MAX_AUDIO_MESSAGES) {
674
+ const itemsToRemove = audioMessages.slice(0, audioMessages.length - SocketServer.MAX_AUDIO_MESSAGES);
675
+ logger.log(`Attempting to remove ${itemsToRemove.length} old audio messages`);
676
+ for (const oldItemId of itemsToRemove) {
677
+ try {
678
+ client.deleteConversationItem(oldItemId);
679
+ logger.log(`Sent delete request for item: ${oldItemId}`);
680
+ } catch (error) {
681
+ logger.error(`Failed to delete conversation item ${oldItemId}:`, error);
682
+ // Keep the item in our tracking if delete failed
683
+ continue;
684
+ }
685
+ }
686
+ // Update the tracked items only after attempting deletion
687
+ this.audioMessages.set(socket.id, audioMessages.slice(-SocketServer.MAX_AUDIO_MESSAGES));
688
+ }
689
+ }
690
+
691
+ private getFunctionCallState(socketId: string) {
692
+ if (!this.functionCallStates.has(socketId)) {
693
+ this.functionCallStates.set(socketId, {
694
+ currentCallId: null
695
+ });
696
+ logger.log(`Initialized function call state for socket ${socketId}`);
697
+ }
698
+ return this.functionCallStates.get(socketId)!;
699
+ }
700
+
701
+ private async executeFunctionCall(socket: Socket, tools: Tools, event: any, state: any, client: RealtimeVoiceClient) {
702
+ try {
703
+ // Verify this is still the current function call
704
+ if (state.currentCallId !== event.call_id) {
705
+ logger.error('Function call mismatch in execution', {
706
+ current: state.currentCallId,
707
+ attempted: event.call_id
708
+ });
709
+ return;
710
+ }
711
+
712
+ // Set up timeout
713
+ const timeoutPromise = new Promise((_, reject) => {
714
+ setTimeout(() => {
715
+ reject(new Error('Function call timed out'));
716
+ }, SocketServer.FUNCTION_CALL_TIMEOUT_MS);
717
+ });
718
+
719
+ // Execute the function call with timeout
720
+ await Promise.race([
721
+ tools.executeCall(event.call_id, event.name, event.arguments, socket.data.userId, socket.data.aiName),
722
+ timeoutPromise
723
+ ]);
724
+
725
+ // Reset state on success
726
+ state.currentCallId = null;
727
+ this.startIdleTimer(client, socket);
728
+ } catch (error: any) {
729
+ logger.error('Function call failed:', error);
730
+ socket.emit('error', error.message);
731
+ // Reset state on error
732
+ state.currentCallId = null;
733
+ this.startIdleTimer(client, socket);
734
+ throw error;
735
+ }
736
+ }
737
+ }