@juspay/neurolink 9.53.0 → 9.54.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/adapters/tts/cartesiaHandler.d.ts +12 -0
- package/dist/adapters/tts/cartesiaHandler.js +130 -0
- package/dist/browser/neurolink.min.js +1 -1
- package/dist/cli/commands/voiceServer.d.ts +6 -0
- package/dist/cli/commands/voiceServer.js +17 -0
- package/dist/cli/parser.js +4 -1
- package/dist/lib/adapters/tts/cartesiaHandler.d.ts +12 -0
- package/dist/lib/adapters/tts/cartesiaHandler.js +131 -0
- package/dist/lib/providers/azureOpenai.d.ts +4 -1
- package/dist/lib/providers/azureOpenai.js +9 -3
- package/dist/lib/server/voice/frameBus.d.ts +8 -0
- package/dist/lib/server/voice/frameBus.js +25 -0
- package/dist/lib/server/voice/turnManager.d.ts +15 -0
- package/dist/lib/server/voice/turnManager.js +36 -0
- package/dist/lib/server/voice/types.d.ts +20 -0
- package/dist/lib/server/voice/types.js +2 -0
- package/dist/lib/server/voice/voiceServerApp.d.ts +1 -0
- package/dist/lib/server/voice/voiceServerApp.js +118 -0
- package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +11 -0
- package/dist/lib/server/voice/voiceWebSocketHandler.js +536 -0
- package/dist/providers/azureOpenai.d.ts +4 -1
- package/dist/providers/azureOpenai.js +9 -3
- package/dist/server/voice/frameBus.d.ts +8 -0
- package/dist/server/voice/frameBus.js +24 -0
- package/dist/server/voice/public/app.js +275 -0
- package/dist/server/voice/public/index.html +18 -0
- package/dist/server/voice/public/pcm-worklet.js +67 -0
- package/dist/server/voice/public/styles.css +102 -0
- package/dist/server/voice/turnManager.d.ts +15 -0
- package/dist/server/voice/turnManager.js +35 -0
- package/dist/server/voice/types.d.ts +20 -0
- package/dist/server/voice/types.js +1 -0
- package/dist/server/voice/voiceServerApp.d.ts +1 -0
- package/dist/server/voice/voiceServerApp.js +117 -0
- package/dist/server/voice/voiceWebSocketHandler.d.ts +11 -0
- package/dist/server/voice/voiceWebSocketHandler.js +535 -0
- package/package.json +2 -1
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
// Browser globals are read off globalThis once and given Browser*-prefixed
// aliases; the rest of this script refers only to the aliases.
const BrowserWebSocket = globalThis.WebSocket;
const BrowserHTMLElement = globalThis.HTMLElement;
const BrowserHTMLButtonElement = globalThis.HTMLButtonElement;
const BrowserAudioContext = globalThis.AudioContext;
const BrowserBlob = globalThis.Blob;
const browserAlert = globalThis.alert;

// The socket targets the page's own host; https pages use wss:, others ws:.
const socketProtocol =
  "https:" === globalThis.location.protocol ? "wss:" : "ws:";
const socketUrl = socketProtocol + "//" + globalThis.location.host;

/** @type {WebSocket | null} */
let socket = null;

// Look up all three UI elements first, then validate them in the same order.
const orb = document.getElementById("orb");
const statusEl = document.getElementById("status");
const toggleBtn = document.getElementById("toggleBtn");

if (!(orb instanceof BrowserHTMLElement)) {
  throw new Error("Missing #orb element");
}
if (!(statusEl instanceof BrowserHTMLElement)) {
  throw new Error("Missing #status element");
}
if (!(toggleBtn instanceof BrowserHTMLButtonElement)) {
  throw new Error("Missing #toggleBtn element");
}

// Aliases bound after the instanceof checks; later code uses these names.
const orbEl = orb;
const statusNode = statusEl;
const toggleBtnEl = toggleBtn;

// Cobra VAD on the server requires 16kHz raw PCM
const CAPTURE_SAMPLE_RATE = 16000;

// True only while a conversation is running (mic capture active).
let isActive = false;

/* ---- PLAYBACK (TTS arrives as 24kHz raw PCM from server) ---- */

const playbackCtx = new BrowserAudioContext({ sampleRate: 24000 });

// Context time at which the next TTS chunk should begin playing.
let playbackTime = 0;

/** @type {AudioBufferSourceNode[]} */
let activeSources = []; // every scheduled source, so an interrupt can stop them all

// Set when playback is deliberately cut short (interrupt / stop).
let playbackCanceled = false;
|
|
47
|
+
|
|
48
|
+
/* ---- WEBSOCKET ---- */
|
|
49
|
+
|
|
50
|
+
/**
 * Create (or recreate) the WebSocket connection.
 * Called at page load and on every startConversation so the page
 * is recoverable after a disconnect.
 *
 * Does nothing when the current socket is still CONNECTING or OPEN.
 * Otherwise a new socket replaces the module-level `socket`.
 *
 * The close and message handlers are bound to the specific socket they were
 * installed on and are ignored once that socket is no longer the current
 * one. Without this, a previous socket that was still CLOSING when it got
 * replaced would later fire `onclose`, stop the new conversation and
 * overwrite the status with "Disconnected".
 */
function connectSocket() {
  if (socket && socket.readyState <= BrowserWebSocket.OPEN) {
    return; // already connected or connecting
  }

  const ws = new BrowserWebSocket(socketUrl);
  ws.binaryType = "blob";
  socket = ws;

  ws.onopen = () => {
    console.log("Connected");
    statusNode.textContent = "Connected";
  };

  /** @param {Event} e */
  ws.onerror = (e) => console.error("WS error", e);

  ws.onclose = () => {
    if (socket !== ws) {
      return; // stale socket: a newer connection owns the UI now
    }
    stopConversation();
    statusNode.textContent = "Disconnected";
  };

  /** @param {MessageEvent<string | Blob>} event */
  ws.onmessage = (event) => {
    if (socket !== ws) {
      return; // late frame from a replaced socket
    }
    return onSocketMessage(event);
  };
}
|
|
75
|
+
|
|
76
|
+
/* ---- INCOMING: TTS audio + control messages ---- */
|
|
77
|
+
|
|
78
|
+
/**
 * Incremented on every "interrupt" control message. A binary chunk records
 * the value when it arrives and compares it again after its asynchronous
 * decode, so a chunk that was in flight during an interrupt is discarded.
 */
let interruptEpoch = 0;

/**
 * Handle one incoming WebSocket message.
 *
 * - String payloads are parsed as JSON control messages. Only
 *   `{ type: "interrupt" }` is acted on: it stops every scheduled source and
 *   resets the playback clock. Anything else, including invalid JSON, is
 *   ignored.
 * - Blob payloads are treated as raw 16-bit PCM, 24kHz, mono, and are
 *   scheduled back-to-back on `playbackCtx`.
 *
 * When the last scheduled chunk ends without having been canceled, a
 * `playback_done` message is sent to the server and the UI is reset.
 *
 * @param {MessageEvent<string | Blob>} event
 */
async function onSocketMessage(event) {
  // JSON control message
  if (typeof event.data === "string") {
    let msg;
    try {
      msg = JSON.parse(event.data);
    } catch {
      return; // not JSON: ignore
    }

    // JSON.parse may legally yield null (payload "null"); reading .type on it
    // would throw, so check for a value first.
    if (msg != null && msg.type === "interrupt") {
      console.log("Interrupt — stopping playback");
      interruptEpoch += 1;
      playbackCanceled = true;
      activeSources.forEach((s) => {
        try {
          s.stop(0);
        } catch {
          /* already stopped */
        }
      });
      activeSources = [];
      playbackTime = playbackCtx.currentTime;
      orbEl.className = isActive ? "listening" : "idle";
      statusNode.textContent = isActive ? "Listening..." : "Stopped.";
    }
    return;
  }

  // Binary: raw 16-bit PCM, 24kHz, mono
  if (!(event.data instanceof BrowserBlob)) {
    return;
  }

  // A new chunk re-arms playback; remember which interrupt epoch it belongs to.
  playbackCanceled = false;
  const epochAtArrival = interruptEpoch;

  const arrayBuffer = await event.data.arrayBuffer();

  // An interrupt or stop may have happened while the blob was being read.
  // Scheduling this chunk now would play audio after the barge-in.
  if (playbackCanceled || epochAtArrival !== interruptEpoch) {
    return;
  }

  // Int16 samples need whole 2-byte pairs: a trailing odd byte would make the
  // Int16Array constructor throw, and a zero-length buffer is rejected by
  // createBuffer. Drop the stray byte and skip empty chunks.
  const sampleCount = Math.floor(arrayBuffer.byteLength / 2);
  if (sampleCount === 0) {
    return;
  }

  orbEl.className = "speaking";
  statusNode.textContent = "Assistant speaking...";

  const pcm16 = new Int16Array(arrayBuffer, 0, sampleCount);
  const float32 = new Float32Array(sampleCount);
  for (let i = 0; i < sampleCount; i++) {
    float32[i] = pcm16[i] / 32768;
  }

  const audioBuffer = playbackCtx.createBuffer(1, sampleCount, 24000);
  audioBuffer.getChannelData(0).set(float32);

  const source = playbackCtx.createBufferSource();
  source.buffer = audioBuffer;
  source.connect(playbackCtx.destination);

  // Queue the chunk right after the previously scheduled audio, or start
  // immediately if the queue has already drained.
  const nowT = playbackCtx.currentTime;
  if (playbackTime < nowT) {
    playbackTime = nowT;
  }
  source.start(playbackTime);
  playbackTime += audioBuffer.duration;
  activeSources.push(source);

  source.onended = () => {
    activeSources = activeSources.filter((s) => s !== source);
    // When the last chunk finishes, notify server and reset UI.
    // Skip if playback was intentionally canceled (interrupt/stop) —
    // stale onended callbacks must not send playback_done mid-barge-in.
    if (
      !playbackCanceled &&
      activeSources.length === 0 &&
      playbackTime <= playbackCtx.currentTime + 0.05
    ) {
      if (socket && socket.readyState === BrowserWebSocket.OPEN) {
        socket.send(JSON.stringify({ type: "playback_done" }));
      }
      orbEl.className = isActive ? "listening" : "idle";
      statusNode.textContent = isActive ? "Listening..." : "Stopped.";
    }
  };
}
|
|
156
|
+
|
|
157
|
+
// Connect immediately on page load
|
|
158
|
+
connectSocket();
|
|
159
|
+
|
|
160
|
+
/* ---- CAPTURE ---- */
|
|
161
|
+
|
|
162
|
+
/** @type {AudioContext | null} */
let captureCtx = null;
/** @type {ScriptProcessorNode | null} */
let scriptProcessor = null;
/** @type {MediaStream | null} */
let micStream = null;

/**
 * True while startConversation() is between its first statement and its
 * completion. `isActive` only becomes true at the very end, so without this
 * flag a second click during the microphone permission prompt would start a
 * second capture pipeline and leak the first MediaStream.
 */
let isStarting = false;

/**
 * Start a conversation: make sure the WebSocket exists, resume the playback
 * context, open the microphone, and stream 16-bit PCM frames to the server.
 *
 * On any failure every partially-created resource is released, the error is
 * logged, and an alert is shown to the user. The function itself never
 * rejects. Calls made while a start is already in progress, or while a
 * conversation is active, are ignored.
 *
 * @returns {Promise<void>}
 */
async function startConversation() {
  if (isStarting || isActive) {
    return;
  }
  isStarting = true;

  try {
    // Ensure we have a live WebSocket (reconnects after previous disconnect)
    connectSocket();
    await playbackCtx.resume();

    micStream = await navigator.mediaDevices.getUserMedia({
      audio: {
        echoCancellation: true,
        noiseSuppression: true,
        autoGainControl: true,
        channelCount: 1,
      },
    });

    // Separate AudioContext at 16kHz — keeps capture and playback sample rates independent
    captureCtx = new BrowserAudioContext({ sampleRate: CAPTURE_SAMPLE_RATE });
    const micSource = captureCtx.createMediaStreamSource(micStream);

    // 1024 samples = 64ms per callback; server splits into 512-sample Cobra frames
    scriptProcessor = captureCtx.createScriptProcessor(1024, 1, 1);
    scriptProcessor.onaudioprocess = (e) => {
      if (!isActive || !socket || socket.readyState !== BrowserWebSocket.OPEN) {
        return;
      }
      // Clamp each float sample to [-1, 1] and scale it to signed 16-bit.
      const input = e.inputBuffer.getChannelData(0);
      const int16 = new Int16Array(input.length);
      for (let i = 0; i < input.length; i++) {
        const s = Math.max(-1, Math.min(1, input[i]));
        int16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
      }
      socket.send(int16.buffer);
    };

    micSource.connect(scriptProcessor);
    // ScriptProcessor requires a destination in the graph to fire onaudioprocess,
    // but we must NOT route mic audio to speakers — that feeds back into the mic,
    // defeats browser AEC, and causes Soniox to transcribe TTS audio as user speech.
    // A zero-gain node keeps the graph alive while staying completely silent.
    const silentGain = captureCtx.createGain();
    silentGain.gain.value = 0;
    scriptProcessor.connect(silentGain);
    silentGain.connect(captureCtx.destination);

    isActive = true;
    orbEl.className = "listening";
    statusNode.textContent = "Listening...";
    toggleBtnEl.textContent = "Stop Conversation";
    toggleBtnEl.classList.add("active");
  } catch (err) {
    // Release any partially-initialized resources
    if (scriptProcessor) {
      scriptProcessor.disconnect();
      scriptProcessor = null;
    }
    if (micStream) {
      micStream.getTracks().forEach((t) => t.stop());
      micStream = null;
    }
    if (captureCtx) {
      // close() returns a promise; a failure here must not become an
      // unhandled rejection on top of the error already being reported.
      captureCtx.close().catch((closeErr) => {
        console.warn("Failed to close capture context:", closeErr);
      });
      captureCtx = null;
    }
    isActive = false;
    console.error("Failed to start:", err);
    const message = err instanceof Error ? err.message : String(err);
    browserAlert("Error: " + message);
  } finally {
    isStarting = false;
  }
}
|
|
238
|
+
|
|
239
|
+
/**
 * End the current conversation: cancel any scheduled TTS playback, tear down
 * the microphone capture pipeline, and put the UI back in its idle state.
 * Safe to call when nothing is running; each resource is released only if
 * it exists.
 */
function stopConversation() {
  isActive = false;
  playbackCanceled = true;

  // Silence everything that is playing or queued.
  for (const scheduled of activeSources) {
    try {
      scheduled.stop(0);
    } catch {
      /* already stopped */
    }
  }
  activeSources = [];
  playbackTime = playbackCtx.currentTime;

  // Capture pipeline: processor node, then mic tracks, then the context.
  if (scriptProcessor) {
    scriptProcessor.disconnect();
    scriptProcessor = null;
  }
  if (micStream) {
    for (const track of micStream.getTracks()) {
      track.stop();
    }
    micStream = null;
  }
  if (captureCtx) {
    captureCtx.close();
    captureCtx = null;
  }

  // UI back to idle.
  orbEl.className = "idle";
  statusNode.textContent = "Stopped.";
  toggleBtnEl.textContent = "Start Conversation";
  toggleBtnEl.classList.remove("active");
}
|
|
268
|
+
|
|
269
|
+
// One button toggles the conversation: stop when active, otherwise start.
toggleBtnEl.onclick = async () => {
  if (isActive) {
    stopConversation();
    return;
  }
  await startConversation();
};
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
<!doctype html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="UTF-8" />
|
|
5
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
6
|
+
<title>Voice Assistant</title>
|
|
7
|
+
<link rel="stylesheet" href="styles.css" />
|
|
8
|
+
</head>
|
|
9
|
+
|
|
10
|
+
<body>
|
|
11
|
+
<div id="orb" class="idle"></div>
|
|
12
|
+
|
|
13
|
+
<button id="toggleBtn">Start Conversation</button>
|
|
14
|
+
<p id="status" aria-live="polite">Ready</p>
|
|
15
|
+
|
|
16
|
+
<script type="module" src="app.js"></script>
|
|
17
|
+
</body>
|
|
18
|
+
</html>
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
/** @type {any} */
const g = globalThis;

/** @type {any} */
const AudioWorkletProcessorBase = g["AudioWorkletProcessor"];

/** @type {(name: string, ctor: any) => void} */
const registerProcessorFn = g["registerProcessor"];

/** Max queued chunks before dropping oldest (~10s of 24kHz audio). */
const HIGH_WATER_MARK = 500;

/**
 * Audio worklet processor that plays Float32 sample chunks posted to its
 * message port. Chunks are queued in arrival order and copied into the first
 * channel of the first output; whenever the queue runs dry the remainder of
 * the output block is filled with silence.
 */
class PCMProcessor extends AudioWorkletProcessorBase {
  constructor() {
    super();
    /** @type {Float32Array[]} Pending chunks, oldest first. */
    this.queue = [];
    /** @type {number} Read position inside `this.queue[0]`. */
    this.offset = 0;

    /** @param {MessageEvent<ArrayLike<number>>} event */
    this.port.onmessage = (event) => {
      if (this.queue.length >= HIGH_WATER_MARK) {
        // Drop oldest chunks to prevent unbounded memory growth
        this.queue.splice(0, this.queue.length - HIGH_WATER_MARK + 1);
        // The chunk that `offset` pointed into is gone; restart at the new head.
        this.offset = 0;
      }
      this.queue.push(new Float32Array(event.data));
    };
  }

  /**
   * Fill one output block from the queue.
   *
   * Exhausted and empty chunks are skipped in a loop. Checking only once
   * after a shift would read index 0 of an empty chunk and write `undefined`
   * (NaN) into the output.
   *
   * @param {Float32Array[][]} _inputs
   * @param {Float32Array[][]} outputs
   * @returns {boolean} Always true.
   */
  process(_inputs, outputs) {
    const output = outputs[0][0];
    let written = 0;

    while (written < output.length && this.queue.length) {
      const chunk = this.queue[0];

      if (this.offset >= chunk.length) {
        // Head chunk is used up (or was empty to begin with): move on.
        this.queue.shift();
        this.offset = 0;
        continue;
      }

      // Copy as much of the head chunk as still fits in this output block.
      const take = Math.min(output.length - written, chunk.length - this.offset);
      output.set(chunk.subarray(this.offset, this.offset + take), written);
      this.offset += take;
      written += take;
    }

    if (written < output.length) {
      output.fill(0, written); // queue ran dry: pad with silence
    }

    return true;
  }
}

registerProcessorFn("pcm-processor", PCMProcessor);
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
/* Base layout */
|
|
2
|
+
body {
|
|
3
|
+
background: radial-gradient(circle at center, #0b0f19 0%, #05070d 70%);
|
|
4
|
+
color: white;
|
|
5
|
+
display: flex;
|
|
6
|
+
flex-direction: column;
|
|
7
|
+
align-items: center;
|
|
8
|
+
justify-content: center;
|
|
9
|
+
height: 100vh;
|
|
10
|
+
margin: 0;
|
|
11
|
+
font-family:
|
|
12
|
+
-apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
/* Voice orb */
|
|
16
|
+
#orb {
|
|
17
|
+
width: 140px;
|
|
18
|
+
height: 140px;
|
|
19
|
+
border-radius: 50%;
|
|
20
|
+
margin-bottom: 32px;
|
|
21
|
+
transition: all 0.35s ease;
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
/* Idle */
|
|
25
|
+
#orb.idle {
|
|
26
|
+
background: radial-gradient(circle, #222, #0a0a0a);
|
|
27
|
+
box-shadow: 0 0 18px rgba(255, 255, 255, 0.08);
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
/* Listening */
|
|
31
|
+
#orb.listening {
|
|
32
|
+
background: radial-gradient(circle, #00ff99, #006644);
|
|
33
|
+
box-shadow: 0 0 70px rgba(0, 255, 153, 0.9);
|
|
34
|
+
animation: pulse 1.6s infinite;
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
/* Speaking */
|
|
38
|
+
#orb.speaking {
|
|
39
|
+
background: radial-gradient(circle, #00e5ff, #003b44);
|
|
40
|
+
box-shadow: 0 0 90px rgba(0, 229, 255, 1);
|
|
41
|
+
animation: speak 0.35s infinite alternate;
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
/* Toggle button */
|
|
45
|
+
#toggleBtn {
|
|
46
|
+
padding: 14px 32px;
|
|
47
|
+
font-size: 16px;
|
|
48
|
+
font-weight: 700;
|
|
49
|
+
border-radius: 999px;
|
|
50
|
+
border: none;
|
|
51
|
+
cursor: pointer;
|
|
52
|
+
background: linear-gradient(135deg, #2979ff, #00e5ff);
|
|
53
|
+
color: #001018;
|
|
54
|
+
letter-spacing: 0.6px;
|
|
55
|
+
box-shadow: 0 8px 30px rgba(0, 229, 255, 0.45);
|
|
56
|
+
transition: all 0.3s ease;
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
#toggleBtn:hover {
|
|
60
|
+
transform: translateY(-2px);
|
|
61
|
+
box-shadow: 0 12px 40px rgba(0, 229, 255, 0.7);
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
#toggleBtn.active {
|
|
65
|
+
background: linear-gradient(135deg, #ff1744, #ff616f);
|
|
66
|
+
color: white;
|
|
67
|
+
box-shadow: 0 8px 30px rgba(255, 23, 68, 0.5);
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
/* Status text */
|
|
71
|
+
#status {
|
|
72
|
+
margin-top: 18px;
|
|
73
|
+
font-size: 15px;
|
|
74
|
+
color: #9aa4b2;
|
|
75
|
+
min-height: 20px;
|
|
76
|
+
text-align: center;
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
/* Animations */
|
|
80
|
+
@keyframes pulse {
|
|
81
|
+
0% {
|
|
82
|
+
transform: scale(1);
|
|
83
|
+
opacity: 1;
|
|
84
|
+
}
|
|
85
|
+
50% {
|
|
86
|
+
transform: scale(1.08);
|
|
87
|
+
opacity: 0.85;
|
|
88
|
+
}
|
|
89
|
+
100% {
|
|
90
|
+
transform: scale(1);
|
|
91
|
+
opacity: 1;
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
@keyframes speak {
|
|
96
|
+
from {
|
|
97
|
+
transform: scale(1);
|
|
98
|
+
}
|
|
99
|
+
to {
|
|
100
|
+
transform: scale(1.05);
|
|
101
|
+
}
|
|
102
|
+
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import type { FrameBus } from "./frameBus.js";
|
|
2
|
+
export declare enum TurnState {
|
|
3
|
+
IDLE = 0,
|
|
4
|
+
USER_SPEAKING = 1,
|
|
5
|
+
PROCESSING = 2,
|
|
6
|
+
ASSISTANT_SPEAKING = 3
|
|
7
|
+
}
|
|
8
|
+
export declare class TurnManager {
|
|
9
|
+
state: TurnState;
|
|
10
|
+
constructor(bus: FrameBus);
|
|
11
|
+
private onVadStart;
|
|
12
|
+
private onVadStop;
|
|
13
|
+
assistantSpeaking(): void;
|
|
14
|
+
reset(): void;
|
|
15
|
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
 * Turn phases as a two-way lookup object: each name maps to its ordinal
 * (IDLE = 0 ... ASSISTANT_SPEAKING = 3) and each ordinal maps back to its name.
 */
export var TurnState;
(function (states) {
    const names = ["IDLE", "USER_SPEAKING", "PROCESSING", "ASSISTANT_SPEAKING"];
    names.forEach((label, ordinal) => {
        states[label] = ordinal;
        states[ordinal] = label;
    });
})(TurnState || (TurnState = {}));
|
|
8
|
+
export class TurnManager {
|
|
9
|
+
state = TurnState.IDLE;
|
|
10
|
+
constructor(bus) {
|
|
11
|
+
bus.subscribe("vad_start", () => this.onVadStart());
|
|
12
|
+
bus.subscribe("vad_stop", () => this.onVadStop());
|
|
13
|
+
}
|
|
14
|
+
onVadStart() {
|
|
15
|
+
// Only update state if TTS is NOT playing. During ASSISTANT_SPEAKING, the
|
|
16
|
+
// barge-in interrupt is triggered by Soniox non-final tokens — which arrive
|
|
17
|
+
// after a network round-trip. If we let VAD immediately flip state to
|
|
18
|
+
// USER_SPEAKING, the state check in handleSonioxMessage fails and the
|
|
19
|
+
// interrupt never fires.
|
|
20
|
+
if (this.state !== TurnState.ASSISTANT_SPEAKING) {
|
|
21
|
+
this.state = TurnState.USER_SPEAKING;
|
|
22
|
+
}
|
|
23
|
+
}
|
|
24
|
+
onVadStop() {
|
|
25
|
+
if (this.state === TurnState.USER_SPEAKING) {
|
|
26
|
+
this.state = TurnState.PROCESSING;
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
assistantSpeaking() {
|
|
30
|
+
this.state = TurnState.ASSISTANT_SPEAKING;
|
|
31
|
+
}
|
|
32
|
+
reset() {
|
|
33
|
+
this.state = TurnState.IDLE;
|
|
34
|
+
}
|
|
35
|
+
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
export type Frame = {
|
|
2
|
+
type: "audio";
|
|
3
|
+
data: Int16Array;
|
|
4
|
+
} | {
|
|
5
|
+
type: "vad_start";
|
|
6
|
+
} | {
|
|
7
|
+
type: "vad_stop";
|
|
8
|
+
} | {
|
|
9
|
+
type: "transcript";
|
|
10
|
+
text: string;
|
|
11
|
+
final: boolean;
|
|
12
|
+
} | {
|
|
13
|
+
type: "llm_token";
|
|
14
|
+
text: string;
|
|
15
|
+
} | {
|
|
16
|
+
type: "tts_audio";
|
|
17
|
+
data: Buffer;
|
|
18
|
+
} | {
|
|
19
|
+
type: "interrupt";
|
|
20
|
+
};
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export declare function startVoiceServer(port?: number): Promise<void>;
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import express from "express";
|
|
2
|
+
import fs from "fs";
|
|
3
|
+
import http from "http";
|
|
4
|
+
import path from "path";
|
|
5
|
+
import { fileURLToPath } from "url";
|
|
6
|
+
import { setupWebSocket } from "./voiceWebSocketHandler.js";
|
|
7
|
+
import { NeuroLink } from "../../neurolink.js";
|
|
8
|
+
import { logger } from "../../utils/logger.js";
|
|
9
|
+
import { withTimeout } from "../../utils/async/withTimeout.js";
|
|
10
|
+
import { getCartesiaWsUrl } from "../../adapters/tts/cartesiaHandler.js";
|
|
11
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
12
|
+
const __dirname = path.dirname(__filename);
|
|
13
|
+
/**
 * Resolve the public/ directory containing static assets.
 * The CLI build (tsc) only emits .ts → .js and does NOT copy non-TS assets,
 * so __dirname/public may not exist when running from dist/.
 * Fall back to the original source path in that case.
 *
 * The project root is a different number of levels above this file
 * depending on which build output it runs from: four levels from a
 * `lib/server/voice` directory, three from a `server/voice` directory. Both
 * depths are probed, in that order, so the original candidate keeps
 * priority.
 *
 * @returns {string} The first existing candidate, or the compiled path when
 *   none exists (express.static then simply serves nothing).
 */
function resolvePublicPath() {
    const compiled = path.join(__dirname, "public");
    if (fs.existsSync(compiled)) {
        return compiled;
    }
    // Resolve from project root → src/lib/server/voice/public
    const sourceCandidates = [
        path.resolve(__dirname, "../../../../src/lib/server/voice/public"),
        path.resolve(__dirname, "../../../src/lib/server/voice/public"),
    ];
    const source = sourceCandidates.find((candidate) => fs.existsSync(candidate));
    return source ?? compiled; // let express.static handle the 404
}
|
|
31
|
+
/**
 * Start the voice server: serve the static UI, expose `/health`, attach the
 * voice WebSocket handler, and begin listening.
 *
 * The returned promise resolves once the HTTP server is listening and
 * rejects if listening fails (the server's "error" event before it starts).
 * Warmup is started afterwards in the background and never affects the
 * result.
 *
 * @param {number} [port=3000] TCP port to listen on.
 * @returns {Promise<void>}
 */
export async function startVoiceServer(port = 3000) {
    const app = express();
    /* ---------- STATIC FILES ---------- */
    const publicPath = resolvePublicPath();
    logger.info("[SERVER] Serving static from:", publicPath);
    app.use(express.static(publicPath));
    app.get("/", (_, res) => {
        res.sendFile(path.join(publicPath, "index.html"));
    });
    /* ---------- HEALTH CHECK ---------- */
    app.get("/health", (_, res) => {
        res.json({ status: "ok" });
    });
    const server = http.createServer(app);
    /* ---------- WS ---------- */
    setupWebSocket(server);
    /* ---------- START ---------- */
    await new Promise((resolve, reject) => {
        server.once("error", reject);
        server.listen(port, () => {
            // Listening succeeded: the startup-only rejection hook is no longer needed.
            server.removeListener("error", reject);
            logger.info(`[SERVER] Voice server running at http://localhost:${port}`);
            resolve();
        });
    });
    /* ---------- WARMUP ---------- */
    // Pre-warm NeuroLink + Azure on startup so the first real user request isn't
    // slow. NeuroLink's MCP init + Azure's connection pool both have cold-start
    // overhead that shows up as 3-4s on the very first call. We also open and
    // immediately close a Cartesia WS to prime the TLS handshake.
    warmup().catch((err) => {
        // A rejection value is not guaranteed to be an Error. Reading .message
        // on null/undefined would throw inside this handler and turn a
        // non-fatal warmup failure into an unhandled rejection.
        const reason = err instanceof Error ? err.message : String(err);
        logger.warn("[WARMUP] Failed (non-fatal):", reason);
    });
}
|
|
65
|
+
/**
 * Best-effort warmup run once after the server starts listening.
 *
 * 1. Issues a tiny streaming LLM request and drains it. The provider comes
 *    from VOICE_LLM_PROVIDER (default "azure") and the model from
 *    VOICE_LLM_MODEL (default "gpt-4o-automatic").
 * 2. Opens a WebSocket to the Cartesia endpoint and closes it as soon as it
 *    connects.
 *
 * Failures in either step are logged as warnings and do not stop the other
 * step.
 *
 * @returns {Promise<void>}
 */
async function warmup() {
    const t = Date.now();
    logger.info("[WARMUP] Warming up LLM + TTS...");
    const neurolink = new NeuroLink();
    const provider = process.env.VOICE_LLM_PROVIDER ?? "azure";
    const model = process.env.VOICE_LLM_MODEL ?? "gpt-4o-automatic";
    /** Rejection values are not guaranteed to be Error instances. */
    const describe = (err) => (err instanceof Error ? err.message : String(err));
    try {
        // The timeout covers BOTH opening the stream and draining it. If only
        // the open were guarded, a stream that stalls mid-response would block
        // the Cartesia step below indefinitely.
        // NOTE(review): assumes withTimeout rejects with the given message once
        // the delay elapses, as the original call site implies — confirm.
        const openAndDrain = async () => {
            const result = await neurolink.stream({
                provider,
                model,
                input: { text: "hi" },
                maxTokens: 3,
                disableTools: true,
                enableAnalytics: false,
                enableEvaluation: false,
            });
            // Drain the stream so the connection is fully exercised.
            for await (const _chunk of result.stream) {
                /* drain */
            }
        };
        await withTimeout(openAndDrain(), 15000, "LLM warmup timed out");
        logger.info(`[WARMUP] LLM warmup done in ${Date.now() - t}ms`);
    }
    catch (err) {
        logger.warn("[WARMUP] LLM warmup failed (non-fatal):", describe(err));
    }
    // Cartesia TLS warmup — open WS, wait for connect, then close.
    try {
        const { default: WebSocket } = await import("ws");
        const apiKey = process.env.CARTESIA_API_KEY;
        // Resolves (never rejects) with how the attempt ended, so the log below
        // can tell a real connection apart from a timeout or an error.
        const outcome = await new Promise((resolve) => {
            const ws = new WebSocket(getCartesiaWsUrl(), {
                headers: apiKey ? { "X-API-Key": apiKey } : undefined,
            });
            const timeout = setTimeout(() => {
                ws.terminate();
                resolve("timed out"); // non-fatal, just move on
            }, 5000);
            ws.once("open", () => {
                clearTimeout(timeout);
                ws.close();
                resolve("connected");
            });
            // `on`, not `once`: the listener stays attached so a later "error"
            // event on this socket always has a handler.
            ws.on("error", () => {
                clearTimeout(timeout);
                resolve("connection error"); // non-fatal
            });
        });
        if (outcome === "connected") {
            logger.info(`[WARMUP] Cartesia warmup done in ${Date.now() - t}ms`);
        }
        else {
            logger.warn("[WARMUP] Cartesia warmup skipped (non-fatal):", outcome);
        }
    }
    catch (err) {
        // non-fatal, but report it instead of swallowing it silently
        logger.warn("[WARMUP] Cartesia warmup failed (non-fatal):", describe(err));
    }
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { Server as HttpServer } from "http";
|
|
2
|
+
/**
|
|
3
|
+
* Call from the voice-server command handler BEFORE importing anything else
|
|
4
|
+
* so the env change is scoped to voice mode only.
|
|
5
|
+
*/
|
|
6
|
+
export declare function configureVoiceServerEnvironment(): void;
|
|
7
|
+
export type Message = {
|
|
8
|
+
role: "system" | "user" | "assistant";
|
|
9
|
+
content: string;
|
|
10
|
+
};
|
|
11
|
+
export declare function setupWebSocket(server: HttpServer): void;
|