@cognidesk/voice-websocket 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,841 @@
1
+ // src/index.ts
2
import { randomUUID } from "node:crypto";
import { WebSocketServer } from "ws";
3
/** Protocol identifier reported to clients in socket descriptors and the ready event. */
const COGNIDESK_VOICE_PROTOCOL = "cognidesk.voice.v1";
4
/**
 * Creates a voice session store that keeps sessions and socket tokens in two
 * process-local Maps.
 *
 * Tokens are single-use: a token is removed from the map as soon as it is
 * consumed, found expired during a claim, or its session is ended, so the
 * token map does not grow without bound over the lifetime of the process.
 *
 * @param {object} [options]
 * @param {() => string} [options.createToken] Token value factory; defaults to
 *   `createId("voice_socket_token")`.
 * @returns {object} Store exposing createSession, claimToken,
 *   issueReconnectToken, acknowledgeAudio, markConnected, markReconnecting,
 *   markEnded and getSession (all async).
 */
function createInMemoryVoiceSessionStore(options = {}) {
  const sessions = new Map();
  const tokens = new Map();
  const createToken = options.createToken ?? (() => createId("voice_socket_token"));

  // Removes every outstanding token that belongs to the given session.
  const dropTokensForSession = (sessionId) => {
    for (const [value, record] of tokens) {
      if (record.sessionId === sessionId) tokens.delete(value);
    }
  };

  return {
    /** Registers a pending session for `input.result` and issues its "start" token. */
    async createSession(input) {
      const now = input.now ?? new Date();
      const result = input.result;
      const session = {
        id: result.connection.id,
        conversation: result.conversation,
        channelSegment: result.channelSegment,
        connection: result.connection,
        events: result.events,
        createdAt: now.toISOString(),
        updatedAt: now.toISOString(),
        status: "pending",
        lastAckSequence: 0
      };
      sessions.set(session.id, session);
      const token = createTokenRecord({
        createToken,
        connectionId: session.connection.id,
        sessionId: session.id,
        purpose: "start",
        ttlMs: input.tokenTtlMs,
        now
      });
      tokens.set(token.token, token);
      return {
        session,
        socket: {
          // The URL is left empty here; the handshake layer fills it in.
          url: "",
          token: token.token,
          expiresAt: token.expiresAt,
          protocol: COGNIDESK_VOICE_PROTOCOL
        }
      };
    },

    /**
     * Consumes a token. Resolves to `{ session, token, reconnect }`, or `null`
     * when the token is unknown, bound to another connection, already used,
     * expired, or its session is missing/ended.
     */
    async claimToken(input) {
      const now = input.now ?? new Date();
      const token = tokens.get(input.token);
      if (!token) return null;
      // A mismatched connection id leaves the token untouched for its owner.
      if (token.connectionId !== input.connectionId) return null;
      if (token.consumedAt) return null;
      if (Date.parse(token.expiresAt) <= now.getTime()) {
        tokens.delete(token.token);
        return null;
      }
      const session = sessions.get(token.sessionId);
      if (!session || session.status === "ended") {
        tokens.delete(token.token);
        return null;
      }
      token.consumedAt = now.toISOString();
      // A consumed token can never be claimed again, so stop tracking it.
      tokens.delete(token.token);
      return {
        session,
        token,
        reconnect: token.purpose === "reconnect"
      };
    },

    /** Issues a "reconnect" token for an existing session; throws if the session is unknown. */
    async issueReconnectToken(input) {
      const session = sessions.get(input.sessionId);
      if (!session) throw new Error(`Voice session '${input.sessionId}' was not found.`);
      const token = createTokenRecord({
        createToken,
        connectionId: session.connection.id,
        sessionId: session.id,
        purpose: "reconnect",
        ttlMs: input.ttlMs,
        now: input.now ?? new Date()
      });
      tokens.set(token.token, token);
      return token;
    },

    /** Raises the session's `lastAckSequence` when `input.sequence` is higher than the stored value. */
    async acknowledgeAudio(input) {
      const session = requireSession(sessions, input.sessionId);
      if (input.sequence > session.lastAckSequence) {
        session.lastAckSequence = input.sequence;
        session.updatedAt = (input.now ?? new Date()).toISOString();
      }
      return session;
    },

    /** Marks the session "connected" and clears any reconnect grace deadline. */
    async markConnected(sessionId, now = new Date()) {
      const session = requireSession(sessions, sessionId);
      session.status = "connected";
      session.updatedAt = now.toISOString();
      delete session.reconnectGraceUntil;
      return session;
    },

    /** Marks the session "reconnecting" and records `now + graceMs` as `reconnectGraceUntil`. */
    async markReconnecting(sessionId, now = new Date(), graceMs = 3e4) {
      const session = requireSession(sessions, sessionId);
      session.status = "reconnecting";
      session.updatedAt = now.toISOString();
      session.reconnectGraceUntil = new Date(now.getTime() + graceMs).toISOString();
      return session;
    },

    /** Marks the session "ended" and discards its outstanding tokens. */
    async markEnded(sessionId, now = new Date()) {
      const session = requireSession(sessions, sessionId);
      session.status = "ended";
      session.updatedAt = now.toISOString();
      // claimToken rejects tokens of ended sessions, so they are dead weight.
      dropTokensForSession(session.id);
      return session;
    },

    /** Resolves to the stored session, or `null` when the id is unknown. */
    async getSession(sessionId) {
      return sessions.get(sessionId) ?? null;
    }
  };
}
111
/**
 * Builds the handshake helper that turns a freshly created voice connection
 * into a socket descriptor (token, expiry, protocol and a ready-to-use URL).
 *
 * @param {object} options Reads `store`, and optionally `tokenTtlMs`
 *   (default 60000), `pathPrefix` (default "/voice/connections") and `baseUrl`.
 * @returns {{ createSocket: (input: object) => Promise<object> }}
 */
function createVoiceSocketHandshake(options) {
  const tokenTtlMs = options.tokenTtlMs ?? 6e4;
  const pathPrefix = normalizePathPrefix(options.pathPrefix ?? "/voice/connections");

  // Registers the session in the store, then swaps the store's empty URL for the real one.
  async function createSocket(input) {
    const created = await options.store.createSession({ result: input.result, tokenTtlMs });
    const descriptor = created.socket;
    const urlInput = {
      requestUrl: input.request.url,
      basePath: input.basePath,
      pathPrefix,
      connectionId: input.result.connection.id,
      token: descriptor.token
    };
    if (options.baseUrl) urlInput.baseUrl = options.baseUrl;
    return { ...descriptor, url: buildSocketUrl(urlInput) };
  }

  return { createSocket };
}
134
/**
 * Runs one client voice socket from token claim until the socket closes.
 *
 * Flow: claim the token (close with 4401 on failure), mark the session
 * connected, connect the voice provider (close with 1011 on failure), announce
 * readiness plus a reconnect token, then relay client messages to the provider
 * and provider events back to the client. On close the session is marked ended
 * (close code undefined/1000/1001) or reconnecting (any other code).
 *
 * @param {object} options Reads `socket`, `store`, `runtime`, `provider`,
 *   `connectionId`, `token`, and optionally `control`, `profile`, `recorder`,
 *   `signal`, `initialGreeting`, `inputTranscriptDebounceMs` (default 350),
 *   `turnPreambleMs` (default 1200), `reconnectTokenTtlMs` (default 30000) and
 *   `reconnectGraceMs` (default 30000).
 * @returns {Promise<void>} Resolves once listeners are attached, or after an
 *   early close caused by a bad token or a provider connect failure.
 */
async function handleVoiceSocket(options) {
  const errorMessage = (error, fallback) => (error instanceof Error ? error.message : fallback);
  // Sends a protocol "error" frame to the client.
  const sendError = (code, message, extra = {}) => {
    send(options.socket, {
      type: "error",
      event_id: createId("voice_event"),
      error: { code, message, ...extra }
    });
  };

  const claimed = await options.store.claimToken({
    connectionId: options.connectionId,
    token: options.token
  });
  if (!claimed) {
    sendError("invalid_voice_socket_token", "Voice socket token is invalid, expired, or already used.");
    options.socket.close(4401, "Invalid voice socket token");
    return;
  }

  const controller = new AbortController();
  const abort = () => controller.abort();
  // An "abort" listener never fires for a signal that is already aborted.
  if (options.signal?.aborted) controller.abort();
  options.signal?.addEventListener("abort", abort, { once: true });

  const session = await options.store.markConnected(claimed.session.id);
  let providerSession = null;
  let closed = false;
  const inputTranscriptDebounceMs = Math.max(0, options.inputTranscriptDebounceMs ?? 350);
  const turnPreambleMs = Math.max(0, options.turnPreambleMs ?? 1200);
  const useRealtimeControl = Boolean(options.control);
  let pendingInputTranscript = null;
  let pendingInputTranscriptTimer = null;
  let turnPreambleTimer = null;
  let inputTranscriptQueue = Promise.resolve();
  let speechQueue = Promise.resolve();
  // Bumped on every new turn, cancel and close; queued speech from an older generation is skipped.
  let speechGeneration = 0;

  // Best-effort report for failures in detached cleanup work.
  const reportCleanupFailure = (error) => {
    try {
      sendError("voice_socket_cleanup_failed", errorMessage(error, "Voice socket cleanup failed."));
    } catch {
      // The socket is normally closed at this point; there is no other channel to report on.
    }
  };
  // Starts fire-and-forget work without ever producing an unhandled rejection.
  const runDetached = (task) => {
    try {
      Promise.resolve(task()).catch(reportCleanupFailure);
    } catch (error) {
      reportCleanupFailure(error);
    }
  };

  const sendRuntimeEvents = (events) => {
    for (const event of events) {
      send(options.socket, {
        type: "cognidesk.runtime_event",
        event_id: createId("voice_event"),
        event
      });
    }
  };

  const issueReconnect = async () => {
    const token = await options.store.issueReconnectToken({
      sessionId: session.id,
      ttlMs: options.reconnectTokenTtlMs ?? 3e4
    });
    send(options.socket, {
      type: "cognidesk.connection.reconnect_token",
      event_id: createId("voice_event"),
      token: token.token,
      expiresAt: token.expiresAt
    });
  };

  const clearTurnPreambleTimer = () => {
    if (!turnPreambleTimer) return;
    clearTimeout(turnPreambleTimer);
    turnPreambleTimer = null;
  };

  // Serializes speech actions; an action is dropped if the socket closed or a newer generation started.
  const queueSpeechAction = (generation, action) => {
    const queued = speechQueue.catch(() => void 0).then(async () => {
      if (closed || generation !== speechGeneration) return;
      await action();
    });
    speechQueue = queued.catch((error) => {
      sendError("voice_speech_failed", errorMessage(error, "Failed to queue voice speech."));
    });
  };

  // Arms a timer that asks the provider for a preamble if the turn takes longer than turnPreambleMs.
  const startTurnPreambleTimer = (text, generation) => {
    clearTurnPreambleTimer();
    if (!providerSession?.preamble) return;
    if (turnPreambleMs === 0) return;
    turnPreambleTimer = setTimeout(() => {
      turnPreambleTimer = null;
      queueSpeechAction(generation, () => providerSession?.preamble?.({ text }));
    }, turnPreambleMs);
  };

  const sendInputTranscriptCompleted = (event, text) => {
    send(options.socket, {
      type: "input_audio_transcription.completed",
      event_id: createId("voice_event"),
      text,
      ...optionalStringField("item_id", event.itemId),
      ...optionalNumberField("startedAtMs", event.startedAtMs),
      ...optionalNumberField("endedAtMs", event.endedAtMs),
      ...optionalStringField("transcriptionSource", event.transcriptionSource),
      ...event.metadata !== void 0 ? { metadata: event.metadata } : {}
    });
  };

  // Appends a transcript commit to the serialized queue; failures become error frames.
  const queueInputTranscript = (event) => {
    inputTranscriptQueue = inputTranscriptQueue
      .then(() => (useRealtimeControl ? commitControlInputTranscript(event) : commitInputTranscript(event)))
      .catch((error) => {
        sendError("voice_transcript_commit_failed", errorMessage(error, "Failed to commit voice transcript."));
      });
  };

  // Echoes the transcript to the client immediately, then debounces and merges fragments before committing.
  const scheduleInputTranscript = (event) => {
    // Ignore provider events that carry no transcript text instead of crashing on `.trim()`.
    if (typeof event.text !== "string") return;
    const text = event.text.trim();
    if (!text) return;
    sendInputTranscriptCompleted(event, text);
    pendingInputTranscript = mergeInputTranscript(pendingInputTranscript, { ...event, text });
    if (pendingInputTranscriptTimer) clearTimeout(pendingInputTranscriptTimer);
    if (inputTranscriptDebounceMs === 0) {
      const transcript = pendingInputTranscript;
      pendingInputTranscript = null;
      if (transcript) queueInputTranscript(transcript);
      return;
    }
    const waitMs = debounceMsForTranscript(pendingInputTranscript.text, inputTranscriptDebounceMs);
    pendingInputTranscriptTimer = setTimeout(() => {
      const transcript = pendingInputTranscript;
      pendingInputTranscript = null;
      pendingInputTranscriptTimer = null;
      if (transcript) queueInputTranscript(transcript);
    }, waitMs);
  };

  // Commits any debounced transcript right away and waits for the commit queue to drain.
  const flushPendingInputTranscript = async () => {
    if (pendingInputTranscriptTimer) {
      clearTimeout(pendingInputTranscriptTimer);
      pendingInputTranscriptTimer = null;
    }
    const transcript = pendingInputTranscript;
    pendingInputTranscript = null;
    if (transcript) queueInputTranscript(transcript);
    await inputTranscriptQueue;
  };

  // Control mode: records the user transcript only; the provider produces the reply itself.
  const commitControlInputTranscript = async (event) => {
    if (!options.runtime.commitVoiceTranscript) return;
    const committed = await options.runtime.commitVoiceTranscript({
      conversationId: session.conversation.id,
      channelSegmentId: session.channelSegment.id,
      speaker: "user",
      text: event.text,
      transcriptionSource: event.transcriptionSource ?? "provider",
      ...optionalNumberField("startedAtMs", event.startedAtMs),
      ...optionalNumberField("endedAtMs", event.endedAtMs),
      ...event.metadata !== void 0 ? { metadata: event.metadata } : {}
    });
    sendRuntimeEvents(committed.events);
    await options.recorder?.onTranscript?.({
      session,
      speaker: "user",
      text: event.text,
      runtimeEvent: committed.event
    });
  };

  // Control mode: records the assistant transcript after any pending user transcript, then signals turn completion.
  const commitControlAssistantTranscript = async (text, transcriptionSource) => {
    const normalized = normalizeSpeechText(text ?? "");
    if (!normalized || !options.runtime.commitVoiceTranscript) return;
    await flushPendingInputTranscript();
    const committed = await options.runtime.commitVoiceTranscript({
      conversationId: session.conversation.id,
      channelSegmentId: session.channelSegment.id,
      speaker: "assistant",
      text: normalized,
      transcriptionSource
    });
    sendRuntimeEvents(committed.events);
    await options.recorder?.onTranscript?.({
      session,
      speaker: "assistant",
      text: normalized,
      runtimeEvent: committed.event
    });
    send(options.socket, {
      type: "cognidesk.turn.completed",
      event_id: createId("voice_event"),
      text: normalized
    });
  };

  // Wraps the caller's control surface so tool calls see the user transcript committed first.
  const controlSurface = options.control ? {
    ...options.control,
    handleToolCall: async (call) => {
      await flushPendingInputTranscript();
      return options.control.handleToolCall(call);
    }
  } : void 0;

  // Non-control mode: runs a full runtime turn and streams the reply to the provider as speech.
  const commitInputTranscript = async (event) => {
    const generation = ++speechGeneration;
    let assistantSpeechBuffer = "";
    let assistantSpeechQueued = false;
    const queueAssistantSpeech = (text, result2) => {
      const normalized = normalizeSpeechText(text);
      if (!normalized) return;
      clearTurnPreambleTimer();
      assistantSpeechQueued = true;
      queueSpeechAction(generation, () => providerSession?.speak({ text: normalized, ...result2 ? { result: result2 } : {} }));
    };
    const flushAssistantSpeech = (force) => {
      while (true) {
        const chunk = takeSpeakablePrefix(assistantSpeechBuffer, force);
        if (!chunk) return;
        assistantSpeechBuffer = assistantSpeechBuffer.slice(chunk.consumed).trimStart();
        queueAssistantSpeech(chunk.text);
        if (!force) return;
      }
    };
    startTurnPreambleTimer(event.text, generation);
    let result;
    try {
      result = await options.runtime.handleVoiceUserMessage({
        conversationId: session.conversation.id,
        channelSegmentId: session.channelSegment.id,
        connectionId: session.connection.id,
        text: event.text,
        transcriptionSource: event.transcriptionSource ?? "provider",
        ...optionalNumberField("startedAtMs", event.startedAtMs),
        ...optionalNumberField("endedAtMs", event.endedAtMs),
        ...event.metadata !== void 0 ? { metadata: event.metadata } : {},
        onAssistantTextDelta: (textDelta) => {
          assistantSpeechBuffer += textDelta;
          flushAssistantSpeech(false);
        }
      });
    } catch (error) {
      // Do not let a preamble play for a turn that has already failed.
      clearTurnPreambleTimer();
      throw error;
    }
    clearTurnPreambleTimer();
    flushAssistantSpeech(true);
    // Nothing was streamed: speak the final text in one piece.
    if (!assistantSpeechQueued) {
      queueAssistantSpeech(result.text, result);
    }
    sendRuntimeEvents(result.events);
    const userRuntimeEvent = result.voiceEvents.find(
      (candidate) => candidate.type === "voice.transcript.committed" && candidate.data.speaker === "user"
    );
    await options.recorder?.onTranscript?.({
      session,
      speaker: "user",
      text: event.text,
      ...userRuntimeEvent ? { runtimeEvent: userRuntimeEvent } : {}
    });
    const assistantRuntimeEvent = result.voiceEvents.find(
      (candidate) => candidate.type === "voice.transcript.committed" && candidate.data.speaker === "assistant"
    );
    await options.recorder?.onTranscript?.({
      session,
      speaker: "assistant",
      text: result.text,
      ...assistantRuntimeEvent ? { runtimeEvent: assistantRuntimeEvent } : {}
    });
    send(options.socket, {
      type: "cognidesk.turn.completed",
      event_id: createId("voice_event"),
      text: result.text,
      ...result.activeJourneyId ? { activeJourneyId: result.activeJourneyId } : {}
    });
  };

  // Routes provider events: runtime events and server events are relayed, errors become
  // error frames, everything else is treated as a completed input transcript.
  const handleProviderEvent = async (event) => {
    if (event.kind === "runtime_events") {
      sendRuntimeEvents(event.events);
      return;
    }
    if (event.kind === "server_event") {
      if (isAgentResponseSignal(event.event)) clearTurnPreambleTimer();
      send(options.socket, event.event);
      if (event.event.type === "response.output_audio.delta") {
        await options.recorder?.onAudio?.({
          session,
          speaker: "assistant",
          audio: event.event.delta
        });
      }
      if (useRealtimeControl && event.event.type === "response.output_audio_transcript.done") {
        await commitControlAssistantTranscript(event.event.transcript, "openai-realtime");
      }
      return;
    }
    if (event.kind === "error") {
      sendError(event.code ?? "voice_provider_error", event.message, {
        ...event.retryable !== void 0 ? { retryable: event.retryable } : {},
        ...event.details !== void 0 ? { details: event.details } : {}
      });
      return;
    }
    scheduleInputTranscript(event);
  };

  try {
    const controlInstructions = await options.control?.createSessionInstructions?.({ session });
    providerSession = await options.provider.connect({
      session,
      ...options.profile ? { profile: options.profile } : {},
      ...controlSurface ? {
        control: {
          ...controlSurface,
          instructions: [
            controlSurface.instructions,
            controlInstructions
          ].filter(Boolean).join("\n\n")
        }
      } : {},
      signal: controller.signal,
      onEvent: handleProviderEvent
    });
  } catch (error) {
    // Release the caller's signal listener and cancel anything the provider started.
    options.signal?.removeEventListener("abort", abort);
    controller.abort();
    // NOTE(review): the store session stays "connected" after this failure — confirm whether it should be ended.
    sendError("voice_provider_connect_failed", errorMessage(error, "Voice provider connection failed."));
    options.socket.close(1011, "Voice provider connection failed");
    return;
  }

  send(options.socket, {
    type: "cognidesk.connection.ready",
    event_id: createId("voice_event"),
    protocol: COGNIDESK_VOICE_PROTOCOL,
    conversation: session.conversation,
    channelSegment: session.channelSegment,
    connection: session.connection,
    lastAckSequence: session.lastAckSequence
  });
  await issueReconnect();
  if (options.initialGreeting?.trim()) {
    queueSpeechAction(speechGeneration, () => providerSession?.speak({ text: options.initialGreeting.trim() }));
  }

  options.socket.on("message", (data) => {
    void handleClientMessage(String(data)).catch((error) => {
      sendError("voice_socket_message_failed", errorMessage(error, "Failed to handle voice socket message."));
    });
  });

  options.socket.on("error", (error) => {
    sendError("voice_socket_error", errorMessage(error, "Voice socket error."));
  });

  options.socket.on("close", (code) => {
    if (closed) return;
    closed = true;
    speechGeneration++;
    if (pendingInputTranscriptTimer) clearTimeout(pendingInputTranscriptTimer);
    clearTurnPreambleTimer();
    pendingInputTranscript = null;
    controller.abort();
    options.signal?.removeEventListener("abort", abort);
    runDetached(() => providerSession?.close());
    const normalClose = code === void 0 || code === 1e3 || code === 1001;
    if (normalClose) {
      runDetached(() => options.store.markEnded(session.id).then(
        () => options.runtime.endVoiceSegment({
          conversationId: session.conversation.id,
          channelSegmentId: session.channelSegment.id,
          connectionId: session.connection.id,
          reason: "socket_closed"
        })
      ));
    } else {
      // Abnormal close: keep the session around so the client can come back with a reconnect token.
      runDetached(() => options.store.markReconnecting(
        session.id,
        new Date(),
        options.reconnectGraceMs ?? 3e4
      ));
    }
  });

  // Parses one client frame and applies it: audio append (ack + record + forward),
  // response cancel (record interruption), or plain pass-through to the provider.
  async function handleClientMessage(raw) {
    const event = parseClientEvent(raw);
    if (event.type === "input_audio_buffer.append") {
      assertBase64Audio(event.audio);
      const sequence = event.sequence;
      const previousAckSequence = session.lastAckSequence;
      if (sequence !== void 0) {
        await options.store.acknowledgeAudio({ sessionId: session.id, sequence });
        send(options.socket, {
          type: "cognidesk.audio.ack",
          event_id: createId("voice_event"),
          sequence
        });
      }
      await options.recorder?.onAudio?.({
        session,
        speaker: "user",
        audio: event.audio,
        ...sequence !== void 0 ? { sequence } : {}
      });
      // Sequenced audio at or below the last acknowledged sequence is not forwarded again.
      // NOTE(review): with lastAckSequence starting at 0, a chunk numbered 0 is never forwarded — confirm sequences start at 1.
      if (sequence === void 0 || sequence > previousAckSequence) {
        await providerSession?.send(event);
      }
      return;
    }
    if (event.type === "response.cancel") {
      // Invalidate queued speech before telling the provider to stop.
      speechGeneration++;
      clearTurnPreambleTimer();
      await providerSession?.send(event);
      const interruption = await options.runtime.recordVoiceInterruption({
        conversationId: session.conversation.id,
        channelSegmentId: session.channelSegment.id,
        connectionId: session.connection.id,
        source: "userSpeech",
        reason: event.reason ?? "client_cancelled_response",
        ...optionalStringField("interruptedMessageId", event.interruptedMessageId),
        ...optionalNumberField("offsetMs", event.playedUntilMs ?? event.audioEndMs)
      });
      send(options.socket, {
        type: "cognidesk.interruption.recorded",
        event_id: createId("voice_event"),
        event: interruption
      });
      sendRuntimeEvents([interruption]);
      return;
    }
    await providerSession?.send(event);
  }
}
570
/**
 * Attaches voice socket handling to a Node HTTP server's "upgrade" event.
 *
 * Upgrade requests that do not parse as a voice socket request are left
 * untouched so other upgrade listeners can handle them.
 *
 * @param {object} options Reads `server`, `store`, `runtime`, `provider`, and
 *   optionally `pathPrefix` (default "/voice/connections"), `control`,
 *   `profile`, `recorder`, `initialGreeting`, `reconnectTokenTtlMs`,
 *   `reconnectGraceMs`, `turnPreambleMs` and `inputTranscriptDebounceMs`.
 * @returns {{ close(): void, webSocketServer: object }} `close` detaches the
 *   upgrade listener and closes the underlying WebSocketServer.
 */
function attachNodeVoiceWebSocketAdapter(options) {
  const pathPrefix = normalizePathPrefix(options.pathPrefix ?? "/voice/connections");
  const webSocketServer = new WebSocketServer({ noServer: true });
  const upgradeListener = (request, socket, head) => {
    const parsed = parseVoiceSocketRequest(request, pathPrefix);
    if (!parsed) return;
    webSocketServer.handleUpgrade(request, socket, head, (webSocket) => {
      webSocketServer.emit("connection", webSocket, request, parsed);
    });
  };
  options.server.on("upgrade", upgradeListener);
  webSocketServer.on("connection", (webSocket, _request, parsed) => {
    const adapted = adaptNodeWebSocket(webSocket);
    handleVoiceSocket({
      socket: adapted,
      connectionId: parsed.connectionId,
      token: parsed.token,
      store: options.store,
      runtime: options.runtime,
      provider: options.provider,
      ...options.control ? { control: options.control } : {},
      ...options.profile ? { profile: options.profile } : {},
      ...options.recorder ? { recorder: options.recorder } : {},
      ...options.initialGreeting !== void 0 ? { initialGreeting: options.initialGreeting } : {},
      ...options.reconnectTokenTtlMs !== void 0 ? { reconnectTokenTtlMs: options.reconnectTokenTtlMs } : {},
      ...options.reconnectGraceMs !== void 0 ? { reconnectGraceMs: options.reconnectGraceMs } : {},
      ...options.turnPreambleMs !== void 0 ? { turnPreambleMs: options.turnPreambleMs } : {},
      // Forwarded like the other tunables so adapter users can configure the debounce too.
      ...options.inputTranscriptDebounceMs !== void 0 ? { inputTranscriptDebounceMs: options.inputTranscriptDebounceMs } : {}
    }).catch((error) => {
      // A rejected handler (for example a failing store) must not become an
      // unhandled rejection; tell the client and drop the connection instead.
      try {
        send(adapted, {
          type: "error",
          event_id: createId("voice_event"),
          error: {
            code: "voice_socket_failed",
            message: error instanceof Error ? error.message : "Voice socket handler failed."
          }
        });
        adapted.close(1011, "Voice socket handler failed");
      } catch {
        // The socket is already unusable; nothing more can be done here.
      }
    });
  });
  return {
    close() {
      options.server.off("upgrade", upgradeListener);
      webSocketServer.close();
    },
    webSocketServer
  };
}
606
/**
 * Extracts the connection id and token from an upgrade request whose path is
 * `<pathPrefix>/<connectionId>/socket`.
 *
 * The token is read from the `token` query parameter, falling back to the
 * `sec-websocket-protocol` header. The request URL is attacker-controlled, so
 * an unparsable URL or malformed percent-encoding yields `null` rather than
 * an exception escaping into the server's "upgrade" listener.
 *
 * @param {{ url?: string, headers: object }} request
 * @param {string} pathPrefix Normalized prefix without a trailing slash.
 * @returns {{ connectionId: string, token: string } | null}
 */
function parseVoiceSocketRequest(request, pathPrefix) {
  if (!request.url) return null;
  const expectedPrefix = `${pathPrefix}/`;
  const suffix = "/socket";
  let url;
  let connectionId;
  try {
    // The base only makes the relative request target parseable; its host is never used.
    url = new URL(request.url, "http://localhost");
    if (!url.pathname.startsWith(expectedPrefix) || !url.pathname.endsWith(suffix)) return null;
    connectionId = decodeURIComponent(url.pathname.slice(expectedPrefix.length, -suffix.length));
  } catch {
    // `new URL` throws TypeError and `decodeURIComponent` throws URIError on bad input.
    return null;
  }
  if (!connectionId) return null;
  const token = url.searchParams.get("token") ?? parseTokenFromProtocol(request.headers["sec-websocket-protocol"]);
  if (!token) return null;
  return { connectionId, token };
}
617
/**
 * Wraps a `ws` socket in the small `{ send, close, on }` surface used by the
 * voice socket handler. Message payloads are converted to strings and close
 * reasons are decoded as UTF-8; any event name other than "message" or
 * "close" is registered as an "error" listener.
 */
function adaptNodeWebSocket(socket) {
  const on = (event, listener) => {
    switch (event) {
      case "message":
        socket.on("message", (data) => {
          listener(rawDataToString(data));
        });
        break;
      case "close":
        socket.on("close", (code, reason) => {
          listener(code, reason.toString("utf8"));
        });
        break;
      default:
        socket.on("error", listener);
    }
  };
  return {
    send: (data) => {
      socket.send(data);
    },
    close: (code, reason) => {
      socket.close(code, reason);
    },
    on
  };
}
642
/**
 * Parses and validates one raw client frame into a typed event object.
 *
 * Only the fields known for each supported event type are copied over.
 * Throws an Error when the frame is not a JSON object, has no string `type`,
 * has an unsupported type, or carries a field of the wrong shape.
 */
function parseClientEvent(raw) {
  let message;
  try {
    message = JSON.parse(raw);
  } catch {
    throw new Error("Voice socket message must be valid JSON.");
  }
  if (!isRecord(message)) throw new Error("Voice socket message must be a JSON object.");
  const { type } = message;
  if (typeof type !== "string") throw new Error("Voice socket message type is required.");

  // Copy optional fields in a fixed order so the resulting key order is stable.
  const withStrings = (event, keys) => {
    for (const key of keys) Object.assign(event, optionalStringField(key, optionalString(message, key)));
    return event;
  };
  const withIntegers = (event, keys) => {
    for (const key of keys) Object.assign(event, optionalNumberField(key, optionalInteger(message, key)));
    return event;
  };

  if (type === "session.update") {
    const event = { type, ...optionalEventId(message) };
    if (isRecord(message.session)) event.session = message.session;
    return event;
  }
  if (type === "input_audio_buffer.append") {
    const audio = requiredString(message, "audio");
    const sequence = optionalInteger(message, "sequence");
    const event = { type, audio, ...optionalEventId(message) };
    if (sequence !== void 0) event.sequence = sequence;
    return event;
  }
  if (type === "input_audio_buffer.commit" || type === "input_audio_buffer.clear") {
    return { type, ...optionalEventId(message) };
  }
  if (type === "response.cancel") {
    const event = { type, ...optionalEventId(message) };
    withStrings(event, ["response_id", "interruptedMessageId", "reason"]);
    return withIntegers(event, ["playedUntilMs", "audioEndMs"]);
  }
  if (type === "conversation.item.truncate") {
    const event = { type, ...optionalEventId(message) };
    withStrings(event, ["item_id"]);
    return withIntegers(event, ["content_index", "audio_end_ms"]);
  }
  throw new Error(`Unsupported voice socket event '${type}'.`);
}
685
/**
 * Builds the WebSocket URL a client should connect to:
 * `<base><basePath><pathPrefix>/<connectionId>/socket?token=<token>`.
 *
 * The base is `input.baseUrl` when given, otherwise `input.requestUrl`. Secure
 * bases (`https:` or `wss:`) produce a `wss:` URL, everything else `ws:`.
 *
 * @param {object} input Reads `requestUrl`, `pathPrefix`, `connectionId`,
 *   `token`, and optionally `basePath` (default "") and `baseUrl`.
 * @returns {string} The absolute socket URL.
 * @throws {TypeError} When the base URL that is actually used cannot be parsed.
 */
function buildSocketUrl(input) {
  // Only parse the request URL when it is the base, so a relative request
  // target cannot break a handshake that has an explicit baseUrl.
  const base = input.baseUrl ? new URL(input.baseUrl) : new URL(input.requestUrl);
  // A base that is already wss: must stay secure instead of being downgraded to ws:.
  const isSecure = base.protocol === "https:" || base.protocol === "wss:";
  const basePath = input.basePath ?? "";
  const url = new URL(`${basePath}${input.pathPrefix}/${encodeURIComponent(input.connectionId)}/socket`, base);
  url.protocol = isSecure ? "wss:" : "ws:";
  url.searchParams.set("token", input.token);
  return url.toString();
}
694
/**
 * Normalizes a route prefix so it starts with "/" and has one trailing "/"
 * removed (so "/" and "" both become "").
 */
function normalizePathPrefix(path) {
  let prefix = path;
  if (!prefix.startsWith("/")) prefix = `/${prefix}`;
  if (prefix.endsWith("/")) prefix = prefix.slice(0, -1);
  return prefix;
}
698
/**
 * Builds a token record expiring `ttlMs` milliseconds after `input.now`.
 *
 * A missing or non-finite `ttlMs` falls back to 60 seconds; without the
 * fallback the expiry would be an Invalid Date and `toISOString()` would
 * throw an opaque RangeError.
 *
 * @param {object} input Reads `createToken`, `connectionId`, `sessionId`,
 *   `purpose`, `now` (Date) and optionally `ttlMs`.
 * @returns {{ token: string, connectionId: string, sessionId: string, purpose: string, expiresAt: string }}
 */
function createTokenRecord(input) {
  const fallbackTtlMs = 6e4;
  const ttlMs = Number.isFinite(input.ttlMs) ? input.ttlMs : fallbackTtlMs;
  return {
    token: input.createToken(),
    connectionId: input.connectionId,
    sessionId: input.sessionId,
    purpose: input.purpose,
    expiresAt: new Date(input.now.getTime() + ttlMs).toISOString()
  };
}
707
/** Looks up a session by id in the given Map, throwing when it is absent. */
function requireSession(sessions, sessionId) {
  const found = sessions.get(sessionId);
  if (found) return found;
  throw new Error(`Voice session '${sessionId}' was not found.`);
}
712
/** Serializes `event` as JSON and writes it to the socket. */
function send(socket, event) {
  const payload = JSON.stringify(event);
  socket.send(payload);
}
715
/**
 * Combines a pending transcript with a newer fragment.
 *
 * Texts are joined with a space. The start time comes from the earlier
 * fragment when present; item id, end time and transcription source prefer
 * the newer fragment; metadata objects are shallow-merged with the newer
 * keys winning. With no pending transcript the newer fragment is returned
 * unchanged.
 */
function mergeInputTranscript(current, next) {
  if (!current) return next;

  const combined = {
    kind: "input_transcript.completed",
    text: `${current.text} ${next.text}`.trim()
  };

  const latestItemId = next.itemId ?? current.itemId;
  if (latestItemId) combined.itemId = latestItemId;

  const firstStart = current.startedAtMs ?? next.startedAtMs;
  if (firstStart !== void 0) combined.startedAtMs = firstStart;

  const lastEnd = next.endedAtMs ?? current.endedAtMs;
  if (lastEnd !== void 0) combined.endedAtMs = lastEnd;

  const source = next.transcriptionSource ?? current.transcriptionSource;
  if (source) combined.transcriptionSource = source;

  const mergedMetadata = Object.assign({}, current.metadata ?? {}, next.metadata ?? {});
  if (Object.keys(mergedMetadata).length > 0) combined.metadata = mergedMetadata;

  return combined;
}
736
/** Returns true for the four server event types listed below, false for any other type. */
function isAgentResponseSignal(event) {
  switch (event.type) {
    case "response.output_audio.delta":
    case "response.output_audio_transcript.delta":
    case "response.output_audio_transcript.done":
    case "response.done":
      return true;
    default:
      return false;
  }
}
739
/**
 * Picks the next chunk of buffered assistant text that can be spoken.
 *
 * - `force`: the whole buffer is taken.
 * - otherwise: everything up to the last sentence boundary, or, once the
 *   buffer holds at least 180 characters, up to a soft boundary at index 140
 *   or later.
 *
 * @returns {{ text: string, consumed: number } | null} Normalized chunk text
 *   and the number of raw characters it covers, or null when nothing is ready.
 */
function takeSpeakablePrefix(text, force) {
  if (text.trim() === "") return null;

  let cut;
  if (force) {
    cut = text.length;
  } else {
    const sentenceEnd = findLastSentenceBoundary(text);
    if (sentenceEnd > 0) {
      cut = sentenceEnd;
    } else {
      if (text.length < 180) return null;
      cut = findSoftBoundary(text, 140);
      if (cut <= 0) return null;
    }
  }

  return { text: normalizeSpeechText(text.slice(0, cut)), consumed: cut };
}
757
/**
 * Returns the index just past the last sentence terminator that is followed by
 * whitespace (optionally with closing quotes/brackets in between), or -1 when
 * the text contains no such boundary.
 *
 * Terminators are `.`, `!`, `?`, the ideographic full stop (U+3002) and the
 * fullwidth `！` (U+FF01) and `？` (U+FF1F). The original character class
 * listed ASCII `!?` twice next to U+3002, which looks like the fullwidth forms
 * were lost to compatibility normalization; they are spelled as escapes here
 * so they cannot be folded again.
 */
function findLastSentenceBoundary(text) {
  const pattern = /[.!?\u3002\uFF01\uFF1F](?:["')\]]+)?\s+/g;
  let boundary = -1;
  for (const match of text.matchAll(pattern)) {
    boundary = match.index + match[0].length;
  }
  return boundary;
}
766
/**
 * Finds a place to break text when no sentence boundary exists.
 *
 * Separators are tried in priority order (", ", "; ", ": ", newline, space);
 * the first one whose last occurrence lies at or after `minIndex` wins.
 *
 * @returns {number} Index just past the chosen separator, or -1 if none fits.
 */
function findSoftBoundary(text, minIndex) {
  const separators = [", ", "; ", ": ", "\n", " "];
  const chosen = separators.find((separator) => text.lastIndexOf(separator) >= minIndex);
  if (chosen === undefined) return -1;
  return text.lastIndexOf(chosen) + chosen.length;
}
774
/** Collapses every run of whitespace to a single space and strips both ends. */
function normalizeSpeechText(text) {
  const words = text.split(/\s+/).filter(Boolean);
  return words.join(" ");
}
777
/**
 * Chooses how long to wait for more speech before committing a transcript.
 *
 * Transcripts of at most two words wait at least 900 ms; longer ones use
 * `baseMs` as is. Words are separated by any whitespace, so fragments joined
 * by tabs or newlines are not mistaken for a single word.
 *
 * @param {string} text Transcript text.
 * @param {number} baseMs Configured debounce in milliseconds.
 * @returns {number} Debounce to apply, in milliseconds.
 */
function debounceMsForTranscript(text, baseMs) {
  const wordCount = text.split(/\s+/).filter(Boolean).length;
  return wordCount <= 2 ? Math.max(baseMs, 900) : baseMs;
}
781
/**
 * Converts a WebSocket message payload to a UTF-8 string. Strings pass
 * through; a Buffer, an array of Buffers, or anything `Buffer.from` accepts
 * is decoded as UTF-8.
 */
function rawDataToString(data) {
  if (typeof data === "string") return data;

  let bytes;
  if (Buffer.isBuffer(data)) {
    bytes = data;
  } else if (Array.isArray(data)) {
    bytes = Buffer.concat(data);
  } else {
    bytes = Buffer.from(data);
  }
  return bytes.toString("utf8");
}
787
/**
 * Extracts a token from a `Sec-WebSocket-Protocol` header value: the first
 * listed subprotocol starting with "cognidesk.voice.token." is returned with
 * that prefix removed. Returns undefined when no such subprotocol is present.
 */
function parseTokenFromProtocol(value) {
  const header = Array.isArray(value) ? value.join(",") : value;
  if (!header) return void 0;

  const prefix = "cognidesk.voice.token.";
  for (const part of header.split(",")) {
    const protocol = part.trim();
    if (protocol.startsWith(prefix)) return protocol.slice(prefix.length);
  }
  return void 0;
}
794
/**
 * Throws unless `value` is a non-empty run of base64 alphabet characters
 * followed by at most two "=" padding characters. Length is not checked
 * against a multiple of four.
 */
function assertBase64Audio(value) {
  if (value.length === 0) {
    throw new Error("audio must not be empty.");
  }
  const looksLikeBase64 = /^[A-Za-z0-9+/]+={0,2}$/.test(value);
  if (!looksLikeBase64) {
    throw new Error("audio must be base64 encoded.");
  }
}
798
/** Returns `{ event_id }` when the record carries a non-blank string `event_id`, otherwise `{}`. */
function optionalEventId(value) {
  const eventId = optionalString(value, "event_id");
  return optionalStringField("event_id", eventId);
}
801
/** Reads `value[key]` as a trimmed string, throwing when it is missing or blank. */
function requiredString(value, key) {
  const text = optionalString(value, key);
  if (text) return text;
  throw new Error(`${key} must be a non-empty string.`);
}
806
/**
 * Reads `value[key]` as a trimmed string. Missing, null and blank values
 * yield undefined; any other non-string value throws.
 */
function optionalString(value, key) {
  const raw = value[key];
  if (raw == null) return void 0;
  if (typeof raw !== "string") throw new Error(`${key} must be a string.`);
  return raw.trim() || void 0;
}
813
/**
 * Reads `value[key]` as a non-negative safe integer. Missing and null values
 * yield undefined; anything else that is not such an integer throws.
 */
function optionalInteger(value, key) {
  const raw = value[key];
  if (raw == null) return void 0;
  const isValid = typeof raw === "number" && Number.isSafeInteger(raw) && raw >= 0;
  if (!isValid) {
    throw new Error(`${key} must be a non-negative integer.`);
  }
  return raw;
}
821
/** Returns `{ [key]: value }` for a truthy value and `{}` otherwise, for use with object spread. */
function optionalStringField(key, value) {
  if (!value) return {};
  return { [key]: value };
}
824
/** Returns `{ [key]: value }` unless the value is undefined (0 and null are kept), for use with object spread. */
function optionalNumberField(key, value) {
  if (value === void 0) return {};
  return { [key]: value };
}
827
/** True for non-null objects that are not arrays. */
function isRecord(value) {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}
830
/**
 * Creates a unique identifier of the form `<prefix>_<uuid>`.
 *
 * The same helper produces voice socket tokens, so the random part must be
 * unpredictable. When the global Web Crypto `randomUUID` is unavailable the
 * value comes from `node:crypto` rather than from `Math.random()`, which is
 * not suitable for secrets.
 *
 * @param {string} prefix Label placed before the random part.
 * @returns {string}
 */
function createId(prefix) {
  const random = globalThis.crypto?.randomUUID?.() ?? randomUUID();
  return `${prefix}_${random}`;
}
834
+ export {
835
+ COGNIDESK_VOICE_PROTOCOL,
836
+ attachNodeVoiceWebSocketAdapter,
837
+ createInMemoryVoiceSessionStore,
838
+ createVoiceSocketHandshake,
839
+ handleVoiceSocket
840
+ };
841
+ //# sourceMappingURL=index.js.map