@cognidesk/voice-websocket 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,375 @@
1
+ import * as ws from 'ws';
2
+ import { WebSocket } from 'ws';
3
+ import { Server, IncomingMessage } from 'node:http';
4
+ import { StartVoiceResult, RuntimeEvent, VoiceSocketMetadata, HandleVoiceUserMessageInput, HandleVoiceUserMessageResult, VoiceProfile } from '@cognidesk/core';
5
+
6
+ declare const COGNIDESK_VOICE_PROTOCOL: "cognidesk.voice.v1"; // Ambient constant whose type is the literal protocol identifier string.
7
+ type VoiceProtocol = typeof COGNIDESK_VOICE_PROTOCOL; // Literal type derived from the constant, so the type always tracks the declared value.
8
+ type VoiceBrowserClientEvent = { // Union of event shapes discriminated by the `type` literal; every variant has an optional `event_id`.
9
+ type: "session.update";
10
+ event_id?: string;
11
+ session?: Record<string, unknown>; // Free-form object; its keys are not constrained by this declaration.
12
+ } | {
13
+ type: "input_audio_buffer.append";
14
+ event_id?: string;
15
+ audio: string; // Audio carried as a string. NOTE(review): the encoding (e.g. base64) is not visible here — confirm.
16
+ sequence?: number; // NOTE(review): presumably the counter echoed by the `cognidesk.audio.ack` server event — confirm.
17
+ } | {
18
+ type: "input_audio_buffer.commit";
19
+ event_id?: string;
20
+ } | {
21
+ type: "input_audio_buffer.clear";
22
+ event_id?: string;
23
+ } | {
24
+ type: "response.cancel"; // NOTE(review): this variant mixes snake_case (`response_id`) and camelCase field names; both spellings are part of the declared shape.
25
+ event_id?: string;
26
+ response_id?: string;
27
+ interruptedMessageId?: string;
28
+ playedUntilMs?: number;
29
+ audioEndMs?: number;
30
+ reason?: string;
31
+ } | {
32
+ type: "conversation.item.truncate"; // All payload fields of this variant are optional.
33
+ event_id?: string;
34
+ item_id?: string;
35
+ content_index?: number;
36
+ audio_end_ms?: number;
37
+ };
38
+ type VoiceBrowserServerEvent = { // Union of event shapes discriminated by `type`. The `cognidesk.*` variants and `input_audio_transcription.completed` require `event_id`; the other variants leave it optional.
39
+ type: "cognidesk.connection.ready";
40
+ event_id: string;
41
+ protocol: VoiceProtocol; // Typed as the literal protocol identifier.
42
+ conversation: StartVoiceResult["conversation"]; // Indexed from StartVoiceResult, so the shape follows @cognidesk/core.
43
+ channelSegment: StartVoiceResult["channelSegment"];
44
+ connection: StartVoiceResult["connection"];
45
+ lastAckSequence: number; // Same field name and type as VoiceSocketSession.lastAckSequence.
46
+ } | {
47
+ type: "cognidesk.connection.reconnect_token";
48
+ event_id: string;
49
+ token: string;
50
+ expiresAt: string; // Expiry carried as a string. NOTE(review): format (presumably ISO 8601) is not visible here — confirm.
51
+ } | {
52
+ type: "cognidesk.audio.ack";
53
+ event_id: string;
54
+ sequence: number; // Required here, whereas `sequence` on `input_audio_buffer.append` is optional.
55
+ } | {
56
+ type: "cognidesk.runtime_event"; // Wraps a single RuntimeEvent from @cognidesk/core.
57
+ event_id: string;
58
+ event: RuntimeEvent;
59
+ } | {
60
+ type: "cognidesk.turn.completed";
61
+ event_id: string;
62
+ text: string;
63
+ activeJourneyId?: string;
64
+ } | {
65
+ type: "cognidesk.interruption.recorded"; // Same payload shape as `cognidesk.runtime_event`, distinguished only by `type`.
66
+ event_id: string;
67
+ event: RuntimeEvent;
68
+ } | {
69
+ type: "cognidesk.voice.preamble";
70
+ event_id: string;
71
+ text: string;
72
+ elapsedMs: number;
73
+ } | {
74
+ type: "input_audio_buffer.speech_started";
75
+ event_id?: string;
76
+ audio_start_ms?: number;
77
+ item_id?: string;
78
+ } | {
79
+ type: "input_audio_buffer.speech_stopped";
80
+ event_id?: string;
81
+ audio_end_ms?: number;
82
+ item_id?: string;
83
+ } | {
84
+ type: "input_audio_transcription.completed"; // Carries the same data fields as the `input_transcript.completed` provider event, with `item_id` spelled in snake_case.
85
+ event_id: string;
86
+ text: string;
87
+ item_id?: string;
88
+ startedAtMs?: number;
89
+ endedAtMs?: number;
90
+ transcriptionSource?: string;
91
+ metadata?: Record<string, unknown>;
92
+ } | {
93
+ type: "response.output_audio.delta";
94
+ event_id?: string;
95
+ response_id?: string;
96
+ item_id?: string;
97
+ output_index?: number;
98
+ content_index?: number;
99
+ delta: string; // The only required payload field. NOTE(review): presumably an encoded audio chunk — confirm encoding.
100
+ } | {
101
+ type: "response.output_audio.done"; // Same identifiers as the delta variant, without a `delta` payload.
102
+ event_id?: string;
103
+ response_id?: string;
104
+ item_id?: string;
105
+ output_index?: number;
106
+ content_index?: number;
107
+ } | {
108
+ type: "response.output_audio_transcript.delta";
109
+ event_id?: string;
110
+ response_id?: string;
111
+ item_id?: string;
112
+ output_index?: number;
113
+ content_index?: number;
114
+ delta: string;
115
+ } | {
116
+ type: "response.output_audio_transcript.done";
117
+ event_id?: string;
118
+ response_id?: string;
119
+ item_id?: string;
120
+ output_index?: number;
121
+ content_index?: number;
122
+ transcript: string; // Required on the `done` variant, where the delta variant has `delta` instead.
123
+ } | {
124
+ type: "response.done";
125
+ event_id?: string;
126
+ response?: unknown; // Untyped; consumers must narrow before use.
127
+ } | {
128
+ type: "error";
129
+ event_id?: string;
130
+ error: { // Nested error object; `code` and `message` are required.
131
+ code: string;
132
+ message: string;
133
+ retryable?: boolean;
134
+ details?: unknown;
135
+ };
136
+ };
137
+ type VoiceProviderEvent = { // Union of event shapes discriminated by `kind`; this is the parameter type of VoiceProviderConnectInput.onEvent.
138
+ kind: "input_transcript.completed";
139
+ text: string;
140
+ itemId?: string; // camelCase here, whereas the `input_audio_transcription.completed` server event uses `item_id`.
141
+ startedAtMs?: number;
142
+ endedAtMs?: number;
143
+ transcriptionSource?: string;
144
+ metadata?: Record<string, unknown>;
145
+ } | {
146
+ kind: "server_event"; // Wraps one VoiceBrowserServerEvent.
147
+ event: VoiceBrowserServerEvent;
148
+ } | {
149
+ kind: "runtime_events"; // Carries an array of RuntimeEvent values.
150
+ events: RuntimeEvent[];
151
+ } | {
152
+ kind: "error";
153
+ code?: string; // Optional here, whereas `error.code` on the `error` server event is required.
154
+ message: string;
155
+ retryable?: boolean;
156
+ details?: unknown;
157
+ };
158
+ interface VoiceProviderConnectInput { // Argument object accepted by VoiceProvider.connect.
159
+ session: VoiceSocketSession;
160
+ profile?: VoiceProfile; // Optional; type comes from @cognidesk/core.
161
+ control?: VoiceControlSurface; // Optional tool/instruction surface.
162
+ signal: AbortSignal; // Required. NOTE(review): presumably aborted when the connection should be torn down — confirm.
163
+ onEvent(event: VoiceProviderEvent): Promise<void> | void; // Callback for provider events; may be sync or async.
164
+ }
165
+ interface VoiceProviderSession { // Handle resolved by VoiceProvider.connect; every method may be sync or async.
166
+ send(event: VoiceBrowserClientEvent): Promise<void> | void; // Accepts any variant of the client event union.
167
+ speak(input: {
168
+ text: string;
169
+ result?: HandleVoiceUserMessageResult; // Optional result object typed by @cognidesk/core.
170
+ }): Promise<void> | void;
171
+ preamble?(input: { // Optional method: implementations may omit it, so callers must check for its presence.
172
+ text: string;
173
+ }): Promise<void> | void;
174
+ close(): Promise<void> | void;
175
+ }
176
+ interface VoiceProvider { // Contract for a provider that can open a VoiceProviderSession.
177
+ readonly id: string; // Read-only identifier.
178
+ connect(input: VoiceProviderConnectInput): Promise<VoiceProviderSession>; // Always returns a Promise, unlike the session methods that may also be synchronous.
179
+ }
180
+ interface VoiceControlTool { // Descriptor for one tool listed in VoiceControlSurface.tools.
181
+ name: string;
182
+ description?: string;
183
+ parameters?: unknown; // Untyped. NOTE(review): presumably a schema describing the tool arguments — the format is not constrained here.
184
+ }
185
+ interface VoiceControlToolCall { // Argument object accepted by VoiceControlSurface.handleToolCall.
186
+ session: VoiceSocketSession;
187
+ name: string;
188
+ arguments: unknown; // Untyped; handlers must validate/narrow before use.
189
+ callId: string;
190
+ itemId?: string;
191
+ responseId?: string;
192
+ signal: AbortSignal;
193
+ notify?(notification: VoiceControlNotification): Promise<void>; // Optional callback; returns Promise<void> only (no synchronous `void` alternative).
194
+ }
195
+ interface VoiceControlToolResult { // Value returned (or resolved) by VoiceControlSurface.handleToolCall.
196
+ output: unknown; // Required but untyped.
197
+ events?: RuntimeEvent[]; // Optional list of runtime events accompanying the output.
198
+ }
199
+ interface VoiceControlNotification { // Payload passed to VoiceControlToolCall.notify.
200
+ message: string; // The only required field.
201
+ events?: RuntimeEvent[];
202
+ responseInstructions?: string;
203
+ createResponse?: boolean; // NOTE(review): the default when omitted is not visible in this declaration — confirm.
204
+ }
205
+ interface VoiceControlSurface { // Bundle of tool descriptors, optional instructions, and the tool-call handler.
206
+ tools: VoiceControlTool[];
207
+ instructions?: string;
208
+ createSessionInstructions?(input: { // Optional method returning instructions for a given session, sync or async. NOTE(review): its precedence relative to `instructions` is not visible here.
209
+ session: VoiceSocketSession;
210
+ }): Promise<string> | string;
211
+ handleToolCall(input: VoiceControlToolCall): Promise<VoiceControlToolResult> | VoiceControlToolResult; // Required; may answer synchronously or asynchronously.
212
+ }
213
+ interface VoiceSocketLike { // Minimal structural socket type accepted by handleVoiceSocket.
214
+ send(data: string): void; // Only string payloads are accepted by this signature.
215
+ close(code?: number, reason?: string): void;
216
+ on(event: "message", listener: (data: string | ArrayBuffer | Uint8Array) => void): void; // Overload 1 of 3: incoming data may be text or binary.
217
+ on(event: "close", listener: (code?: number, reason?: string) => void): void; // Overload 2 of 3: both arguments are optional.
218
+ on(event: "error", listener: (error: unknown) => void): void; // Overload 3 of 3: the error is untyped.
219
+ }
220
+ interface VoiceRecorder { // Optional hooks for audio and transcripts; both are optional and may be sync or async.
221
+ onAudio?(input: {
222
+ session: VoiceSocketSession;
223
+ speaker: "user" | "assistant";
224
+ audio: string; // Audio carried as a string. NOTE(review): encoding is not visible here — confirm.
225
+ sequence?: number;
226
+ }): Promise<void> | void;
227
+ onTranscript?(input: {
228
+ session: VoiceSocketSession;
229
+ speaker: "user" | "assistant";
230
+ text: string;
231
+ runtimeEvent?: RuntimeEvent; // Optional associated runtime event.
232
+ }): Promise<void> | void;
233
+ }
234
+ interface VoiceSocketSession { // Session record returned by the VoiceSessionStore methods.
235
+ id: string;
236
+ conversation: StartVoiceResult["conversation"]; // These three fields reuse the corresponding StartVoiceResult member types.
237
+ channelSegment: StartVoiceResult["channelSegment"];
238
+ connection: StartVoiceResult["connection"];
239
+ events: RuntimeEvent[];
240
+ createdAt: string; // Timestamps are strings. NOTE(review): format (presumably ISO 8601) is not visible here — confirm.
241
+ updatedAt: string;
242
+ status: "pending" | "connected" | "reconnecting" | "ended"; // One of four literal states. NOTE(review): presumably updated by the store's mark* methods — confirm.
243
+ lastAckSequence: number; // NOTE(review): presumably updated through VoiceSessionStore.acknowledgeAudio — confirm.
244
+ reconnectGraceUntil?: string; // Optional. NOTE(review): presumably only set while reconnecting — confirm.
245
+ }
246
+ interface VoiceSocketToken { // Token record returned by claimToken and issueReconnectToken.
247
+ token: string;
248
+ connectionId: string;
249
+ sessionId: string;
250
+ purpose: "start" | "reconnect"; // Exactly two literal purposes.
251
+ expiresAt: string;
252
+ consumedAt?: string; // Optional. NOTE(review): presumably set once the token has been claimed — confirm.
253
+ }
254
+ interface VoiceSessionStore { // Asynchronous persistence contract for sessions and tokens; every method returns a Promise.
255
+ createSession(input: {
256
+ result: StartVoiceResult;
257
+ tokenTtlMs: number;
258
+ now?: Date; // Optional clock override; NOTE(review): the default when omitted is not visible here.
259
+ }): Promise<{ // Resolves with both the session record and the socket metadata.
260
+ session: VoiceSocketSession;
261
+ socket: VoiceSocketMetadata;
262
+ }>;
263
+ claimToken(input: {
264
+ connectionId: string;
265
+ token: string;
266
+ now?: Date;
267
+ }): Promise<{
268
+ session: VoiceSocketSession;
269
+ token: VoiceSocketToken;
270
+ reconnect: boolean;
271
+ } | null>; // May resolve to null. NOTE(review): presumably for an unknown, expired or already-consumed token — confirm.
272
+ issueReconnectToken(input: {
273
+ sessionId: string;
274
+ ttlMs: number;
275
+ now?: Date;
276
+ }): Promise<VoiceSocketToken>;
277
+ acknowledgeAudio(input: {
278
+ sessionId: string;
279
+ sequence: number;
280
+ now?: Date;
281
+ }): Promise<VoiceSocketSession>;
282
+ markConnected(sessionId: string, now?: Date): Promise<VoiceSocketSession>; // The mark* methods take positional arguments, unlike the object-argument methods above.
283
+ markReconnecting(sessionId: string, now?: Date, graceMs?: number): Promise<VoiceSocketSession>; // Only this mark* method takes the extra optional `graceMs`.
284
+ markEnded(sessionId: string, now?: Date): Promise<VoiceSocketSession>;
285
+ getSession(sessionId: string): Promise<VoiceSocketSession | null>; // Resolves to null rather than rejecting when no session is returned.
286
+ }
287
+ interface InMemoryVoiceSessionStoreOptions { // Options accepted by createInMemoryVoiceSessionStore.
288
+ createToken?: () => string; // Optional zero-argument factory returning a token string; NOTE(review): the fallback generator is not visible here.
289
+ }
290
+ declare function createInMemoryVoiceSessionStore(options?: InMemoryVoiceSessionStoreOptions): VoiceSessionStore; // Returns a VoiceSessionStore synchronously; `options` may be omitted entirely.
291
+ interface VoiceSocketHandshakeOptions { // Options accepted by createVoiceSocketHandshake; only `store` is required.
292
+ store: VoiceSessionStore;
293
+ tokenTtlMs?: number; // NOTE(review): defaults for the optional fields are not visible in this declaration.
294
+ pathPrefix?: string;
295
+ baseUrl?: string;
296
+ }
297
+ declare function createVoiceSocketHandshake(options: VoiceSocketHandshakeOptions): { // Returns an object exposing a single async `createSocket` method.
298
+ createSocket(input: {
299
+ result: StartVoiceResult;
300
+ request: Request; // Not imported in this file, so this resolves to the ambient/global `Request` type.
301
+ basePath: string;
302
+ }): Promise<VoiceSocketMetadata>;
303
+ };
304
+ interface VoiceRuntime { // Runtime contract consumed by handleVoiceSocket and the Node adapter; every method returns a Promise.
305
+ handleVoiceUserMessage<TTurn = unknown>(input: HandleVoiceUserMessageInput<TTurn>): Promise<HandleVoiceUserMessageResult>; // Generic over the turn payload, defaulting to `unknown`.
306
+ commitVoiceTranscript?(input: { // The only optional method on this interface; callers must check for its presence.
307
+ conversationId: string;
308
+ channelSegmentId: string;
309
+ speaker: "user" | "assistant";
310
+ text: string;
311
+ recordingReferenceId?: string;
312
+ startedAtMs?: number;
313
+ endedAtMs?: number;
314
+ transcriptionSource?: string;
315
+ metadata?: Record<string, unknown>;
316
+ }): Promise<{ // NOTE(review): how `event` and `message` relate to the `events` array is not visible here — confirm.
317
+ events: RuntimeEvent[];
318
+ event: RuntimeEvent;
319
+ message: RuntimeEvent;
320
+ }>;
321
+ recordVoiceInterruption(input: {
322
+ conversationId: string;
323
+ channelSegmentId: string;
324
+ connectionId?: string;
325
+ interruptedMessageId?: string;
326
+ source?: "userSpeech" | "adapter" | "provider"; // Optional; restricted to these three literals.
327
+ reason?: string;
328
+ recordingReferenceId?: string;
329
+ offsetMs?: number;
330
+ }): Promise<RuntimeEvent>;
331
+ endVoiceSegment(input: {
332
+ conversationId: string;
333
+ channelSegmentId: string;
334
+ connectionId?: string;
335
+ reason?: string;
336
+ }): Promise<RuntimeEvent>;
337
+ }
338
+ interface HandleVoiceSocketOptions { // Options accepted by handleVoiceSocket; the first six fields are required, the rest optional.
339
+ socket: VoiceSocketLike;
340
+ connectionId: string; // `connectionId` and `token` match the input field names of VoiceSessionStore.claimToken.
341
+ token: string;
342
+ store: VoiceSessionStore;
343
+ runtime: VoiceRuntime;
344
+ provider: VoiceProvider;
345
+ control?: VoiceControlSurface;
346
+ profile?: VoiceProfile;
347
+ recorder?: VoiceRecorder;
348
+ initialGreeting?: string;
349
+ reconnectTokenTtlMs?: number; // NOTE(review): defaults for the optional timing fields are not visible in this declaration.
350
+ reconnectGraceMs?: number;
351
+ inputTranscriptDebounceMs?: number; // Present here but not on AttachNodeVoiceWebSocketAdapterOptions.
352
+ turnPreambleMs?: number;
353
+ signal?: AbortSignal;
354
+ }
355
+ declare function handleVoiceSocket(options: HandleVoiceSocketOptions): Promise<void>; // Async entry point taking a single options object. NOTE(review): when the promise settles (e.g. on socket close) is not visible here.
356
+ interface AttachNodeVoiceWebSocketAdapterOptions { // Options accepted by attachNodeVoiceWebSocketAdapter; the first four fields are required.
357
+ server: Server; // `Server` imported from node:http.
358
+ store: VoiceSessionStore;
359
+ runtime: VoiceRuntime;
360
+ provider: VoiceProvider;
361
+ control?: VoiceControlSurface;
362
+ profile?: VoiceProfile;
363
+ recorder?: VoiceRecorder;
364
+ pathPrefix?: string;
365
+ initialGreeting?: string;
366
+ reconnectTokenTtlMs?: number;
367
+ reconnectGraceMs?: number;
368
+ turnPreambleMs?: number; // NOTE(review): unlike HandleVoiceSocketOptions there is no `inputTranscriptDebounceMs` or `signal` here — confirm this omission is intentional.
369
+ }
370
+ declare function attachNodeVoiceWebSocketAdapter(options: AttachNodeVoiceWebSocketAdapterOptions): { // Returns a handle synchronously, with a `close` method and the underlying `ws` server.
371
+ close(): void; // Synchronous. NOTE(review): whether it also closes the HTTP server passed in `options.server` is not visible here.
372
+ webSocketServer: ws.Server<typeof WebSocket, typeof IncomingMessage>; // `ws` server typed with the `ws` WebSocket class and the node:http IncomingMessage class.
373
+ };
374
+
375
+ export { type AttachNodeVoiceWebSocketAdapterOptions, COGNIDESK_VOICE_PROTOCOL, type HandleVoiceSocketOptions, type InMemoryVoiceSessionStoreOptions, type VoiceBrowserClientEvent, type VoiceBrowserServerEvent, type VoiceControlNotification, type VoiceControlSurface, type VoiceControlTool, type VoiceControlToolCall, type VoiceControlToolResult, type VoiceProtocol, type VoiceProvider, type VoiceProviderConnectInput, type VoiceProviderEvent, type VoiceProviderSession, type VoiceRecorder, type VoiceRuntime, type VoiceSessionStore, type VoiceSocketHandshakeOptions, type VoiceSocketLike, type VoiceSocketSession, type VoiceSocketToken, attachNodeVoiceWebSocketAdapter, createInMemoryVoiceSessionStore, createVoiceSocketHandshake, handleVoiceSocket }; // Value exports: COGNIDESK_VOICE_PROTOCOL and the four functions; every other name is exported type-only.