@drawdream/livespeech 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +51 -17
- package/dist/index.d.mts +108 -87
- package/dist/index.d.ts +108 -87
- package/dist/index.js +91 -74
- package/dist/index.mjs +91 -74
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -21,6 +21,12 @@ declare function getEndpointForRegion(region: Region): string;
|
|
|
21
21
|
*/
|
|
22
22
|
declare function isValidRegion(value: string): value is Region;
|
|
23
23
|
|
|
24
|
+
/**
|
|
25
|
+
* Pipeline mode for audio processing
|
|
26
|
+
* - 'live': Uses Gemini Live API for end-to-end audio conversation (default)
|
|
27
|
+
* - 'composed': Uses separate STT + LLM + TTS services
|
|
28
|
+
*/
|
|
29
|
+
type PipelineMode = 'live' | 'composed';
|
|
24
30
|
/**
|
|
25
31
|
* Configuration options for the LiveSpeech client
|
|
26
32
|
*
|
|
@@ -74,41 +80,20 @@ interface SessionConfig {
|
|
|
74
80
|
/**
|
|
75
81
|
* System prompt for the AI assistant
|
|
76
82
|
*/
|
|
77
|
-
prePrompt
|
|
78
|
-
/**
|
|
79
|
-
* Voice ID for text-to-speech output
|
|
80
|
-
* @default 'en-US-Standard-A'
|
|
81
|
-
*/
|
|
82
|
-
voiceId?: string;
|
|
83
|
-
/**
|
|
84
|
-
* Language code for speech recognition
|
|
85
|
-
* @default 'en-US'
|
|
86
|
-
*/
|
|
87
|
-
languageCode?: string;
|
|
88
|
-
/**
|
|
89
|
-
* Audio encoding format for input
|
|
90
|
-
* @default 'pcm16'
|
|
91
|
-
*/
|
|
92
|
-
inputFormat?: AudioFormat;
|
|
93
|
-
/**
|
|
94
|
-
* Audio encoding format for output
|
|
95
|
-
* @default 'pcm16'
|
|
96
|
-
*/
|
|
97
|
-
outputFormat?: AudioFormat;
|
|
83
|
+
prePrompt?: string;
|
|
98
84
|
/**
|
|
99
|
-
*
|
|
100
|
-
* @default
|
|
85
|
+
* Language code for speech recognition (e.g., "en-US", "ko-KR")
|
|
86
|
+
* @default "en-US"
|
|
101
87
|
*/
|
|
102
|
-
|
|
88
|
+
language?: string;
|
|
103
89
|
/**
|
|
104
|
-
*
|
|
90
|
+
* Pipeline mode for audio processing
|
|
91
|
+
* - 'live': Uses Gemini Live API for end-to-end audio conversation (default, lower latency)
|
|
92
|
+
* - 'composed': Uses separate STT + LLM + TTS services (more customizable)
|
|
93
|
+
* @default "live"
|
|
105
94
|
*/
|
|
106
|
-
|
|
95
|
+
pipelineMode?: PipelineMode;
|
|
107
96
|
}
|
|
108
|
-
/**
|
|
109
|
-
* Supported audio formats
|
|
110
|
-
*/
|
|
111
|
-
type AudioFormat = 'pcm16' | 'opus' | 'wav';
|
|
112
97
|
/**
|
|
113
98
|
* Internal resolved configuration with defaults applied
|
|
114
99
|
*/
|
|
@@ -125,7 +110,7 @@ interface ResolvedConfig {
|
|
|
125
110
|
/**
|
|
126
111
|
* Event types emitted by the LiveSpeech client
|
|
127
112
|
*/
|
|
128
|
-
type LiveSpeechEventType = 'connected' | 'disconnected' | 'sessionStarted' | 'sessionEnded' | '
|
|
113
|
+
type LiveSpeechEventType = 'connected' | 'disconnected' | 'reconnecting' | 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'error';
|
|
129
114
|
/**
|
|
130
115
|
* Event payload for 'connected' event
|
|
131
116
|
*/
|
|
@@ -164,13 +149,19 @@ interface SessionEndedEvent {
|
|
|
164
149
|
timestamp: string;
|
|
165
150
|
}
|
|
166
151
|
/**
|
|
167
|
-
* Event payload for '
|
|
152
|
+
* Event payload for 'ready' event
|
|
153
|
+
*/
|
|
154
|
+
interface ReadyEvent {
|
|
155
|
+
type: 'ready';
|
|
156
|
+
timestamp: string;
|
|
157
|
+
}
|
|
158
|
+
/**
|
|
159
|
+
* Event payload for 'userTranscript' event
|
|
160
|
+
* User's speech transcription
|
|
168
161
|
*/
|
|
169
|
-
interface
|
|
170
|
-
type: '
|
|
162
|
+
interface UserTranscriptEvent {
|
|
163
|
+
type: 'userTranscript';
|
|
171
164
|
text: string;
|
|
172
|
-
isFinal: boolean;
|
|
173
|
-
confidence?: number;
|
|
174
165
|
timestamp: string;
|
|
175
166
|
}
|
|
176
167
|
/**
|
|
@@ -205,7 +196,7 @@ interface ErrorEvent {
|
|
|
205
196
|
/**
|
|
206
197
|
* Error codes
|
|
207
198
|
*/
|
|
208
|
-
type ErrorCode = 'connection_failed' | 'connection_timeout' | 'authentication_failed' | 'session_error' | 'audio_error' | 'stt_error' | 'llm_error' | 'tts_error' | 'rate_limit' | 'internal_error' | 'invalid_message';
|
|
199
|
+
type ErrorCode = 'connection_failed' | 'connection_timeout' | 'authentication_failed' | 'session_error' | 'audio_error' | 'streaming_error' | 'stt_error' | 'llm_error' | 'tts_error' | 'rate_limit' | 'internal_error' | 'invalid_message';
|
|
209
200
|
/**
|
|
210
201
|
* Event payload for 'reconnecting' event
|
|
211
202
|
*/
|
|
@@ -216,14 +207,22 @@ interface ReconnectingEvent {
|
|
|
216
207
|
delay: number;
|
|
217
208
|
timestamp: string;
|
|
218
209
|
}
|
|
210
|
+
/**
|
|
211
|
+
* Event payload for 'turnComplete' event (both modes)
|
|
212
|
+
* Indicates the AI has finished its response turn
|
|
213
|
+
*/
|
|
214
|
+
interface TurnCompleteEvent {
|
|
215
|
+
type: 'turnComplete';
|
|
216
|
+
timestamp: string;
|
|
217
|
+
}
|
|
219
218
|
/**
|
|
220
219
|
* Union type of all event payloads
|
|
221
220
|
*/
|
|
222
|
-
type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | SessionStartedEvent | SessionEndedEvent |
|
|
221
|
+
type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | ReconnectingEvent | SessionStartedEvent | SessionEndedEvent | ReadyEvent | UserTranscriptEvent | ResponseEvent | AudioEvent | TurnCompleteEvent | ErrorEvent;
|
|
223
222
|
/**
|
|
224
223
|
* Simplified event handlers for common use cases
|
|
225
224
|
*/
|
|
226
|
-
type
|
|
225
|
+
type UserTranscriptHandler = (text: string) => void;
|
|
227
226
|
type ResponseHandler = (text: string, isFinal: boolean) => void;
|
|
228
227
|
type AudioHandler = (data: Uint8Array) => void;
|
|
229
228
|
type ErrorHandler = (error: ErrorEvent) => void;
|
|
@@ -231,30 +230,25 @@ type ErrorHandler = (error: ErrorEvent) => void;
|
|
|
231
230
|
/**
|
|
232
231
|
* WebSocket message types sent from client to server
|
|
233
232
|
*/
|
|
234
|
-
type ClientMessageType = 'startSession' | 'endSession' | '
|
|
233
|
+
type ClientMessageType = 'startSession' | 'endSession' | 'audioStart' | 'audioChunk' | 'audioEnd' | 'ping';
|
|
235
234
|
/**
|
|
236
235
|
* WebSocket message types received from server
|
|
237
236
|
*/
|
|
238
|
-
type ServerMessageType = '
|
|
237
|
+
type ServerMessageType = 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'error' | 'pong';
|
|
239
238
|
/**
|
|
240
239
|
* Base interface for client messages
|
|
241
240
|
*/
|
|
242
241
|
interface BaseClientMessage {
|
|
243
242
|
action: ClientMessageType;
|
|
244
|
-
requestId?: string;
|
|
245
243
|
}
|
|
246
244
|
/**
|
|
247
245
|
* Start session message
|
|
248
246
|
*/
|
|
249
247
|
interface StartSessionMessage extends BaseClientMessage {
|
|
250
248
|
action: 'startSession';
|
|
251
|
-
prePrompt
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
inputFormat?: string;
|
|
255
|
-
outputFormat?: string;
|
|
256
|
-
sampleRate?: number;
|
|
257
|
-
metadata?: Record<string, string>;
|
|
249
|
+
prePrompt?: string;
|
|
250
|
+
language?: string;
|
|
251
|
+
pipelineMode?: 'live' | 'composed';
|
|
258
252
|
}
|
|
259
253
|
/**
|
|
260
254
|
* End session message
|
|
@@ -263,14 +257,23 @@ interface EndSessionMessage extends BaseClientMessage {
|
|
|
263
257
|
action: 'endSession';
|
|
264
258
|
}
|
|
265
259
|
/**
|
|
266
|
-
* Audio
|
|
260
|
+
* Audio start message - begin streaming session
|
|
261
|
+
*/
|
|
262
|
+
interface AudioStartMessage extends BaseClientMessage {
|
|
263
|
+
action: 'audioStart';
|
|
264
|
+
}
|
|
265
|
+
/**
|
|
266
|
+
* Audio chunk message - send audio data
|
|
267
267
|
*/
|
|
268
|
-
interface
|
|
269
|
-
action: '
|
|
268
|
+
interface AudioChunkMessage extends BaseClientMessage {
|
|
269
|
+
action: 'audioChunk';
|
|
270
270
|
data: string;
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
271
|
+
}
|
|
272
|
+
/**
|
|
273
|
+
* Audio end message - end streaming session
|
|
274
|
+
*/
|
|
275
|
+
interface AudioEndMessage extends BaseClientMessage {
|
|
276
|
+
action: 'audioEnd';
|
|
274
277
|
}
|
|
275
278
|
/**
|
|
276
279
|
* Ping message for keep-alive
|
|
@@ -281,22 +284,14 @@ interface PingMessage extends BaseClientMessage {
|
|
|
281
284
|
/**
|
|
282
285
|
* Union type of all client messages
|
|
283
286
|
*/
|
|
284
|
-
type ClientMessage = StartSessionMessage | EndSessionMessage |
|
|
287
|
+
type ClientMessage = StartSessionMessage | EndSessionMessage | AudioStartMessage | AudioChunkMessage | AudioEndMessage | PingMessage;
|
|
285
288
|
/**
|
|
286
289
|
* Base interface for server messages
|
|
287
290
|
*/
|
|
288
291
|
interface BaseServerMessage {
|
|
289
292
|
type: ServerMessageType;
|
|
290
|
-
requestId?: string;
|
|
291
293
|
timestamp: string;
|
|
292
294
|
}
|
|
293
|
-
/**
|
|
294
|
-
* Connected message from server
|
|
295
|
-
*/
|
|
296
|
-
interface ServerConnectedMessage extends BaseServerMessage {
|
|
297
|
-
type: 'connected';
|
|
298
|
-
connectionId: string;
|
|
299
|
-
}
|
|
300
295
|
/**
|
|
301
296
|
* Session started message from server
|
|
302
297
|
*/
|
|
@@ -312,13 +307,11 @@ interface ServerSessionEndedMessage extends BaseServerMessage {
|
|
|
312
307
|
sessionId: string;
|
|
313
308
|
}
|
|
314
309
|
/**
|
|
315
|
-
*
|
|
310
|
+
* User transcript message from server (user's speech transcription)
|
|
316
311
|
*/
|
|
317
|
-
interface
|
|
318
|
-
type: '
|
|
312
|
+
interface ServerUserTranscriptMessage extends BaseServerMessage {
|
|
313
|
+
type: 'userTranscript';
|
|
319
314
|
text: string;
|
|
320
|
-
isFinal: boolean;
|
|
321
|
-
confidence?: number;
|
|
322
315
|
}
|
|
323
316
|
/**
|
|
324
317
|
* Response message from server
|
|
@@ -344,7 +337,6 @@ interface ServerErrorMessage extends BaseServerMessage {
|
|
|
344
337
|
type: 'error';
|
|
345
338
|
code: string;
|
|
346
339
|
message: string;
|
|
347
|
-
details?: unknown;
|
|
348
340
|
}
|
|
349
341
|
/**
|
|
350
342
|
* Pong message from server
|
|
@@ -352,10 +344,24 @@ interface ServerErrorMessage extends BaseServerMessage {
|
|
|
352
344
|
interface ServerPongMessage extends BaseServerMessage {
|
|
353
345
|
type: 'pong';
|
|
354
346
|
}
|
|
347
|
+
/**
|
|
348
|
+
* Turn complete message from server
|
|
349
|
+
* Indicates the AI has finished its response turn
|
|
350
|
+
*/
|
|
351
|
+
interface ServerTurnCompleteMessage extends BaseServerMessage {
|
|
352
|
+
type: 'turnComplete';
|
|
353
|
+
}
|
|
354
|
+
/**
|
|
355
|
+
* Ready message from server
|
|
356
|
+
* Indicates the Gemini Live session is ready for audio input
|
|
357
|
+
*/
|
|
358
|
+
interface ServerReadyMessage extends BaseServerMessage {
|
|
359
|
+
type: 'ready';
|
|
360
|
+
}
|
|
355
361
|
/**
|
|
356
362
|
* Union type of all server messages
|
|
357
363
|
*/
|
|
358
|
-
type ServerMessage =
|
|
364
|
+
type ServerMessage = ServerSessionStartedMessage | ServerSessionEndedMessage | ServerReadyMessage | ServerUserTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerTurnCompleteMessage | ServerErrorMessage | ServerPongMessage;
|
|
359
365
|
|
|
360
366
|
/**
|
|
361
367
|
* Connection state
|
|
@@ -368,13 +374,15 @@ type ConnectionState = 'disconnected' | 'connecting' | 'connected' | 'reconnecti
|
|
|
368
374
|
type LiveSpeechEventMap = {
|
|
369
375
|
connected: ConnectedEvent;
|
|
370
376
|
disconnected: DisconnectedEvent;
|
|
377
|
+
reconnecting: ReconnectingEvent;
|
|
371
378
|
sessionStarted: SessionStartedEvent;
|
|
372
379
|
sessionEnded: SessionEndedEvent;
|
|
373
|
-
|
|
380
|
+
ready: ReadyEvent;
|
|
381
|
+
userTranscript: UserTranscriptEvent;
|
|
374
382
|
response: ResponseEvent;
|
|
375
383
|
audio: AudioEvent;
|
|
384
|
+
turnComplete: TurnCompleteEvent;
|
|
376
385
|
error: ErrorEvent;
|
|
377
|
-
reconnecting: ReconnectingEvent;
|
|
378
386
|
};
|
|
379
387
|
/**
|
|
380
388
|
* LiveSpeech client for real-time speech-to-speech AI conversations
|
|
@@ -385,9 +393,9 @@ declare class LiveSpeechClient {
|
|
|
385
393
|
private readonly audioEncoder;
|
|
386
394
|
private readonly logger;
|
|
387
395
|
private sessionId;
|
|
388
|
-
private
|
|
396
|
+
private isStreaming;
|
|
389
397
|
private readonly eventListeners;
|
|
390
|
-
private
|
|
398
|
+
private userTranscriptHandler;
|
|
391
399
|
private responseHandler;
|
|
392
400
|
private audioHandler;
|
|
393
401
|
private errorHandler;
|
|
@@ -412,6 +420,10 @@ declare class LiveSpeechClient {
|
|
|
412
420
|
* Check if session is active
|
|
413
421
|
*/
|
|
414
422
|
get hasActiveSession(): boolean;
|
|
423
|
+
/**
|
|
424
|
+
* Check if audio streaming is active
|
|
425
|
+
*/
|
|
426
|
+
get isAudioStreaming(): boolean;
|
|
415
427
|
/**
|
|
416
428
|
* Connect to the server
|
|
417
429
|
*/
|
|
@@ -423,18 +435,23 @@ declare class LiveSpeechClient {
|
|
|
423
435
|
/**
|
|
424
436
|
* Start a new session
|
|
425
437
|
*/
|
|
426
|
-
startSession(config
|
|
438
|
+
startSession(config?: SessionConfig): Promise<string>;
|
|
427
439
|
/**
|
|
428
440
|
* End the current session
|
|
429
441
|
*/
|
|
430
442
|
endSession(): Promise<void>;
|
|
431
443
|
/**
|
|
432
|
-
*
|
|
444
|
+
* Start audio streaming session
|
|
445
|
+
*/
|
|
446
|
+
audioStart(): void;
|
|
447
|
+
/**
|
|
448
|
+
* Send audio chunk (PCM16 base64 encoded)
|
|
449
|
+
*/
|
|
450
|
+
sendAudioChunk(data: Uint8Array): void;
|
|
451
|
+
/**
|
|
452
|
+
* End audio streaming session
|
|
433
453
|
*/
|
|
434
|
-
|
|
435
|
-
format?: AudioFormat;
|
|
436
|
-
isFinal?: boolean;
|
|
437
|
-
}): void;
|
|
454
|
+
audioEnd(): void;
|
|
438
455
|
/**
|
|
439
456
|
* Add event listener
|
|
440
457
|
*/
|
|
@@ -444,17 +461,17 @@ declare class LiveSpeechClient {
|
|
|
444
461
|
*/
|
|
445
462
|
off<K extends keyof LiveSpeechEventMap>(event: K, listener: (event: LiveSpeechEventMap[K]) => void): void;
|
|
446
463
|
/**
|
|
447
|
-
* Set
|
|
448
|
-
*/
|
|
449
|
-
setTranscriptHandler(handler: TranscriptHandler): void;
|
|
450
|
-
/**
|
|
451
|
-
* Set response handler (simplified)
|
|
464
|
+
* Set response handler
|
|
452
465
|
*/
|
|
453
466
|
setResponseHandler(handler: ResponseHandler): void;
|
|
454
467
|
/**
|
|
455
468
|
* Set audio handler (simplified)
|
|
456
469
|
*/
|
|
457
470
|
setAudioHandler(handler: AudioHandler): void;
|
|
471
|
+
/**
|
|
472
|
+
* Set user transcript handler
|
|
473
|
+
*/
|
|
474
|
+
setUserTranscriptHandler(handler: UserTranscriptHandler): void;
|
|
458
475
|
/**
|
|
459
476
|
* Set error handler (simplified)
|
|
460
477
|
*/
|
|
@@ -467,6 +484,10 @@ declare class LiveSpeechClient {
|
|
|
467
484
|
private handleMessage;
|
|
468
485
|
}
|
|
469
486
|
|
|
487
|
+
/**
|
|
488
|
+
* Audio format type
|
|
489
|
+
*/
|
|
490
|
+
type AudioFormat = 'pcm16' | 'opus' | 'wav';
|
|
470
491
|
/**
|
|
471
492
|
* Audio encoder options
|
|
472
493
|
*/
|
|
@@ -552,4 +573,4 @@ declare class AudioEncoder {
|
|
|
552
573
|
wrapWav(data: Uint8Array): Uint8Array;
|
|
553
574
|
}
|
|
554
575
|
|
|
555
|
-
export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type
|
|
576
|
+
export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type AudioHandler, type ClientMessage, type ClientMessageType, type ConnectedEvent, type ConnectionState, type DisconnectReason, type DisconnectedEvent, type ErrorCode, type ErrorEvent, type ErrorHandler, LiveSpeechClient, type LiveSpeechConfig, type LiveSpeechEvent, type LiveSpeechEventMap, type LiveSpeechEventType, type PipelineMode, type ReadyEvent, type ReconnectingEvent, Region, Region as RegionType, type ResolvedConfig, type ResponseEvent, type ResponseHandler, type ServerMessage, type ServerMessageType, type SessionConfig, type SessionEndedEvent, type SessionStartedEvent, type TurnCompleteEvent, type UserTranscriptEvent, type UserTranscriptHandler, createWavHeader, decodeBase64ToAudio, encodeAudioToBase64, extractPcmFromWav, float32ToInt16, getEndpointForRegion, int16ToFloat32, int16ToUint8, isValidRegion, uint8ToInt16, wrapPcmInWav };
|
package/dist/index.js
CHANGED
|
@@ -46,7 +46,7 @@ var Region = {
|
|
|
46
46
|
};
|
|
47
47
|
var REGION_ENDPOINTS = {
|
|
48
48
|
"ap-northeast-2": "wss://talk.drawdream.co.kr",
|
|
49
|
-
"us-west-2": "wss://talk
|
|
49
|
+
"us-west-2": "wss://talk.drawdream.ca"
|
|
50
50
|
// Coming soon
|
|
51
51
|
};
|
|
52
52
|
function getEndpointForRegion(region) {
|
|
@@ -332,22 +332,13 @@ var WebSocketConnection = class {
|
|
|
332
332
|
/**
|
|
333
333
|
* Handle incoming message
|
|
334
334
|
*/
|
|
335
|
-
handleMessage(data
|
|
335
|
+
handleMessage(data) {
|
|
336
336
|
const message = parseServerMessage(data);
|
|
337
337
|
if (!message) {
|
|
338
338
|
this.logger.warn("Invalid message received:", data);
|
|
339
339
|
return;
|
|
340
340
|
}
|
|
341
341
|
this.logger.debug("Received message:", message.type);
|
|
342
|
-
if (message.type === "connected") {
|
|
343
|
-
this.connectionId = message.connectionId;
|
|
344
|
-
this.state = "connected";
|
|
345
|
-
this.retryController.reset();
|
|
346
|
-
this.startPingInterval();
|
|
347
|
-
this.events.onOpen?.(message.connectionId);
|
|
348
|
-
onFirstConnect?.();
|
|
349
|
-
return;
|
|
350
|
-
}
|
|
351
342
|
if (message.type === "pong") {
|
|
352
343
|
this.logger.debug("Pong received");
|
|
353
344
|
return;
|
|
@@ -613,24 +604,17 @@ var CONFIG_DEFAULTS = {
|
|
|
613
604
|
reconnectDelay: 1e3,
|
|
614
605
|
debug: false
|
|
615
606
|
};
|
|
616
|
-
var SESSION_DEFAULTS = {
|
|
617
|
-
voiceId: "en-US-Standard-A",
|
|
618
|
-
languageCode: "en-US",
|
|
619
|
-
inputFormat: "pcm16",
|
|
620
|
-
outputFormat: "pcm16",
|
|
621
|
-
sampleRate: 16e3
|
|
622
|
-
};
|
|
623
607
|
var LiveSpeechClient = class {
|
|
624
608
|
config;
|
|
625
609
|
connection;
|
|
626
610
|
audioEncoder;
|
|
627
611
|
logger;
|
|
628
612
|
sessionId = null;
|
|
629
|
-
|
|
613
|
+
isStreaming = false;
|
|
630
614
|
// Event listeners using a simple map
|
|
631
615
|
eventListeners = /* @__PURE__ */ new Map();
|
|
632
616
|
// Simplified handlers
|
|
633
|
-
|
|
617
|
+
userTranscriptHandler = null;
|
|
634
618
|
responseHandler = null;
|
|
635
619
|
audioHandler = null;
|
|
636
620
|
errorHandler = null;
|
|
@@ -692,6 +676,12 @@ var LiveSpeechClient = class {
|
|
|
692
676
|
get hasActiveSession() {
|
|
693
677
|
return this.sessionId !== null;
|
|
694
678
|
}
|
|
679
|
+
/**
|
|
680
|
+
* Check if audio streaming is active
|
|
681
|
+
*/
|
|
682
|
+
get isAudioStreaming() {
|
|
683
|
+
return this.isStreaming;
|
|
684
|
+
}
|
|
695
685
|
/**
|
|
696
686
|
* Connect to the server
|
|
697
687
|
*/
|
|
@@ -705,7 +695,7 @@ var LiveSpeechClient = class {
|
|
|
705
695
|
disconnect() {
|
|
706
696
|
this.logger.info("Disconnecting...");
|
|
707
697
|
this.sessionId = null;
|
|
708
|
-
this.
|
|
698
|
+
this.isStreaming = false;
|
|
709
699
|
this.connection.disconnect();
|
|
710
700
|
}
|
|
711
701
|
/**
|
|
@@ -718,16 +708,6 @@ var LiveSpeechClient = class {
|
|
|
718
708
|
if (this.sessionId) {
|
|
719
709
|
throw new Error("Session already active. Call endSession() first.");
|
|
720
710
|
}
|
|
721
|
-
const resolvedConfig = {
|
|
722
|
-
prePrompt: config.prePrompt,
|
|
723
|
-
voiceId: config.voiceId ?? SESSION_DEFAULTS.voiceId,
|
|
724
|
-
languageCode: config.languageCode ?? SESSION_DEFAULTS.languageCode,
|
|
725
|
-
inputFormat: config.inputFormat ?? SESSION_DEFAULTS.inputFormat,
|
|
726
|
-
outputFormat: config.outputFormat ?? SESSION_DEFAULTS.outputFormat,
|
|
727
|
-
sampleRate: config.sampleRate ?? SESSION_DEFAULTS.sampleRate,
|
|
728
|
-
metadata: config.metadata ?? {}
|
|
729
|
-
};
|
|
730
|
-
this.sessionConfig = resolvedConfig;
|
|
731
711
|
this.logger.info("Starting session...");
|
|
732
712
|
return new Promise((resolve, reject) => {
|
|
733
713
|
const onSessionStarted = (event) => {
|
|
@@ -744,16 +724,17 @@ var LiveSpeechClient = class {
|
|
|
744
724
|
};
|
|
745
725
|
this.on("sessionStarted", onSessionStarted);
|
|
746
726
|
this.on("error", onError);
|
|
747
|
-
|
|
748
|
-
action: "startSession"
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
727
|
+
const startMessage = {
|
|
728
|
+
action: "startSession"
|
|
729
|
+
};
|
|
730
|
+
if (config?.prePrompt) {
|
|
731
|
+
startMessage.prePrompt = config.prePrompt;
|
|
732
|
+
}
|
|
733
|
+
if (config?.language) {
|
|
734
|
+
startMessage.language = config.language;
|
|
735
|
+
}
|
|
736
|
+
startMessage.pipelineMode = config?.pipelineMode ?? "live";
|
|
737
|
+
this.connection.send(startMessage);
|
|
757
738
|
});
|
|
758
739
|
}
|
|
759
740
|
/**
|
|
@@ -765,6 +746,9 @@ var LiveSpeechClient = class {
|
|
|
765
746
|
return;
|
|
766
747
|
}
|
|
767
748
|
this.logger.info("Ending session...");
|
|
749
|
+
if (this.isStreaming) {
|
|
750
|
+
this.audioEnd();
|
|
751
|
+
}
|
|
768
752
|
return new Promise((resolve) => {
|
|
769
753
|
const onSessionEnded = () => {
|
|
770
754
|
this.off("sessionEnded", onSessionEnded);
|
|
@@ -775,28 +759,49 @@ var LiveSpeechClient = class {
|
|
|
775
759
|
});
|
|
776
760
|
}
|
|
777
761
|
/**
|
|
778
|
-
*
|
|
762
|
+
* Start audio streaming session
|
|
779
763
|
*/
|
|
780
|
-
|
|
764
|
+
audioStart() {
|
|
781
765
|
if (!this.isConnected) {
|
|
782
766
|
throw new Error("Not connected");
|
|
783
767
|
}
|
|
784
768
|
if (!this.sessionId) {
|
|
785
769
|
throw new Error("No active session. Call startSession() first.");
|
|
786
770
|
}
|
|
771
|
+
if (this.isStreaming) {
|
|
772
|
+
throw new Error("Already streaming. Call audioEnd() first.");
|
|
773
|
+
}
|
|
774
|
+
this.logger.info("Starting audio stream...");
|
|
775
|
+
this.connection.send({ action: "audioStart" });
|
|
776
|
+
this.isStreaming = true;
|
|
777
|
+
}
|
|
778
|
+
/**
|
|
779
|
+
* Send audio chunk (PCM16 base64 encoded)
|
|
780
|
+
*/
|
|
781
|
+
sendAudioChunk(data) {
|
|
782
|
+
if (!this.isConnected) {
|
|
783
|
+
throw new Error("Not connected");
|
|
784
|
+
}
|
|
785
|
+
if (!this.isStreaming) {
|
|
786
|
+
throw new Error("Not streaming. Call audioStart() first.");
|
|
787
|
+
}
|
|
787
788
|
const base64Data = this.audioEncoder.encode(data);
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
789
|
+
this.connection.send({
|
|
790
|
+
action: "audioChunk",
|
|
791
|
+
data: base64Data
|
|
792
|
+
});
|
|
793
|
+
}
|
|
794
|
+
/**
|
|
795
|
+
* End audio streaming session
|
|
796
|
+
*/
|
|
797
|
+
audioEnd() {
|
|
798
|
+
if (!this.isStreaming) {
|
|
799
|
+
this.logger.warn("Not streaming");
|
|
800
|
+
return;
|
|
798
801
|
}
|
|
799
|
-
this.
|
|
802
|
+
this.logger.info("Ending audio stream...");
|
|
803
|
+
this.connection.send({ action: "audioEnd" });
|
|
804
|
+
this.isStreaming = false;
|
|
800
805
|
}
|
|
801
806
|
// ==================== Event System ====================
|
|
802
807
|
/**
|
|
@@ -818,13 +823,7 @@ var LiveSpeechClient = class {
|
|
|
818
823
|
}
|
|
819
824
|
}
|
|
820
825
|
/**
|
|
821
|
-
* Set
|
|
822
|
-
*/
|
|
823
|
-
setTranscriptHandler(handler) {
|
|
824
|
-
this.transcriptHandler = handler;
|
|
825
|
-
}
|
|
826
|
-
/**
|
|
827
|
-
* Set response handler (simplified)
|
|
826
|
+
* Set response handler
|
|
828
827
|
*/
|
|
829
828
|
setResponseHandler(handler) {
|
|
830
829
|
this.responseHandler = handler;
|
|
@@ -835,6 +834,12 @@ var LiveSpeechClient = class {
|
|
|
835
834
|
setAudioHandler(handler) {
|
|
836
835
|
this.audioHandler = handler;
|
|
837
836
|
}
|
|
837
|
+
/**
|
|
838
|
+
* Set user transcript handler
|
|
839
|
+
*/
|
|
840
|
+
setUserTranscriptHandler(handler) {
|
|
841
|
+
this.userTranscriptHandler = handler;
|
|
842
|
+
}
|
|
838
843
|
/**
|
|
839
844
|
* Set error handler (simplified)
|
|
840
845
|
*/
|
|
@@ -864,7 +869,7 @@ var LiveSpeechClient = class {
|
|
|
864
869
|
}
|
|
865
870
|
handleDisconnected(code, _reason) {
|
|
866
871
|
this.sessionId = null;
|
|
867
|
-
this.
|
|
872
|
+
this.isStreaming = false;
|
|
868
873
|
const event = {
|
|
869
874
|
type: "disconnected",
|
|
870
875
|
reason: code === 1e3 ? "normal" : "error",
|
|
@@ -906,25 +911,19 @@ var LiveSpeechClient = class {
|
|
|
906
911
|
break;
|
|
907
912
|
case "sessionEnded":
|
|
908
913
|
this.sessionId = null;
|
|
909
|
-
this.
|
|
914
|
+
this.isStreaming = false;
|
|
910
915
|
this.emit("sessionEnded", {
|
|
911
916
|
type: "sessionEnded",
|
|
912
917
|
sessionId: message.sessionId,
|
|
913
918
|
timestamp: message.timestamp
|
|
914
919
|
});
|
|
915
920
|
break;
|
|
916
|
-
case "
|
|
917
|
-
const
|
|
918
|
-
type: "
|
|
919
|
-
text: message.text,
|
|
920
|
-
isFinal: message.isFinal,
|
|
921
|
+
case "ready": {
|
|
922
|
+
const readyEvent = {
|
|
923
|
+
type: "ready",
|
|
921
924
|
timestamp: message.timestamp
|
|
922
925
|
};
|
|
923
|
-
|
|
924
|
-
transcriptEvent.confidence = message.confidence;
|
|
925
|
-
}
|
|
926
|
-
this.emit("transcript", transcriptEvent);
|
|
927
|
-
this.transcriptHandler?.(message.text, message.isFinal);
|
|
926
|
+
this.emit("ready", readyEvent);
|
|
928
927
|
break;
|
|
929
928
|
}
|
|
930
929
|
case "response": {
|
|
@@ -951,8 +950,26 @@ var LiveSpeechClient = class {
|
|
|
951
950
|
this.audioHandler?.(audioData);
|
|
952
951
|
break;
|
|
953
952
|
}
|
|
953
|
+
case "userTranscript": {
|
|
954
|
+
const userTranscriptEvent = {
|
|
955
|
+
type: "userTranscript",
|
|
956
|
+
text: message.text,
|
|
957
|
+
timestamp: message.timestamp
|
|
958
|
+
};
|
|
959
|
+
this.emit("userTranscript", userTranscriptEvent);
|
|
960
|
+
this.userTranscriptHandler?.(message.text);
|
|
961
|
+
break;
|
|
962
|
+
}
|
|
963
|
+
case "turnComplete": {
|
|
964
|
+
const turnCompleteEvent = {
|
|
965
|
+
type: "turnComplete",
|
|
966
|
+
timestamp: message.timestamp
|
|
967
|
+
};
|
|
968
|
+
this.emit("turnComplete", turnCompleteEvent);
|
|
969
|
+
break;
|
|
970
|
+
}
|
|
954
971
|
case "error":
|
|
955
|
-
this.handleError(message.code, message.message
|
|
972
|
+
this.handleError(message.code, message.message);
|
|
956
973
|
break;
|
|
957
974
|
default:
|
|
958
975
|
this.logger.warn("Unknown message type:", message.type);
|