@drawdream/livespeech 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -21,6 +21,12 @@ declare function getEndpointForRegion(region: Region): string;
  */
 declare function isValidRegion(value: string): value is Region;
 
+/**
+ * Pipeline mode for audio processing
+ * - 'live': Uses Gemini Live API for end-to-end audio conversation (default)
+ * - 'composed': Uses separate STT + LLM + TTS services
+ */
+type PipelineMode = 'live' | 'composed';
 /**
  * Configuration options for the LiveSpeech client
  *
@@ -74,41 +80,20 @@ interface SessionConfig {
     /**
      * System prompt for the AI assistant
      */
-    prePrompt: string;
-    /**
-     * Voice ID for text-to-speech output
-     * @default 'en-US-Standard-A'
-     */
-    voiceId?: string;
-    /**
-     * Language code for speech recognition
-     * @default 'en-US'
-     */
-    languageCode?: string;
-    /**
-     * Audio encoding format for input
-     * @default 'pcm16'
-     */
-    inputFormat?: AudioFormat;
-    /**
-     * Audio encoding format for output
-     * @default 'pcm16'
-     */
-    outputFormat?: AudioFormat;
+    prePrompt?: string;
     /**
-     * Sample rate for audio in Hz
-     * @default 16000
+     * Language code for speech recognition (e.g., "en-US", "ko-KR")
+     * @default "en-US"
      */
-    sampleRate?: number;
+    language?: string;
     /**
-     * Custom metadata to attach to the session
+     * Pipeline mode for audio processing
+     * - 'live': Uses Gemini Live API for end-to-end audio conversation (default, lower latency)
+     * - 'composed': Uses separate STT + LLM + TTS services (more customizable)
+     * @default "live"
      */
-    metadata?: Record<string, string>;
+    pipelineMode?: PipelineMode;
 }
-/**
- * Supported audio formats
- */
-type AudioFormat = 'pcm16' | 'opus' | 'wav';
 /**
  * Internal resolved configuration with defaults applied
  */
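
Note: SessionConfig in 0.1.2 drops the per-session audio options (voiceId, languageCode, inputFormat, outputFormat, sampleRate, metadata) and makes every remaining field optional. A minimal sketch of a 0.1.2 session config, based on the declarations above; the prompt and language values are illustrative, not package defaults:

// Sketch against the 0.1.2 SessionConfig above; values are illustrative.
const config: SessionConfig = {
  prePrompt: 'You are a concise voice assistant.', // optional as of 0.1.2
  language: 'ko-KR',                               // replaces 0.1.0's languageCode
  pipelineMode: 'composed'                         // omit for the default 'live' pipeline
};
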
@@ -125,7 +110,7 @@ interface ResolvedConfig {
 /**
  * Event types emitted by the LiveSpeech client
  */
-type LiveSpeechEventType = 'connected' | 'disconnected' | 'sessionStarted' | 'sessionEnded' | 'transcript' | 'response' | 'audio' | 'error' | 'reconnecting';
+type LiveSpeechEventType = 'connected' | 'disconnected' | 'reconnecting' | 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'error';
 /**
  * Event payload for 'connected' event
  */
@@ -164,13 +149,19 @@ interface SessionEndedEvent {
     timestamp: string;
 }
 /**
- * Event payload for 'transcript' event
+ * Event payload for 'ready' event
+ */
+interface ReadyEvent {
+    type: 'ready';
+    timestamp: string;
+}
+/**
+ * Event payload for 'userTranscript' event
+ * User's speech transcription
  */
-interface TranscriptEvent {
-    type: 'transcript';
+interface UserTranscriptEvent {
+    type: 'userTranscript';
     text: string;
-    isFinal: boolean;
-    confidence?: number;
     timestamp: string;
 }
 /**
@@ -205,7 +196,7 @@ interface ErrorEvent {
 /**
  * Error codes
  */
-type ErrorCode = 'connection_failed' | 'connection_timeout' | 'authentication_failed' | 'session_error' | 'audio_error' | 'stt_error' | 'llm_error' | 'tts_error' | 'rate_limit' | 'internal_error' | 'invalid_message';
+type ErrorCode = 'connection_failed' | 'connection_timeout' | 'authentication_failed' | 'session_error' | 'audio_error' | 'streaming_error' | 'stt_error' | 'llm_error' | 'tts_error' | 'rate_limit' | 'internal_error' | 'invalid_message';
 /**
  * Event payload for 'reconnecting' event
  */
@@ -216,14 +207,22 @@ interface ReconnectingEvent {
     delay: number;
     timestamp: string;
 }
+/**
+ * Event payload for 'turnComplete' event (both modes)
+ * Indicates the AI has finished its response turn
+ */
+interface TurnCompleteEvent {
+    type: 'turnComplete';
+    timestamp: string;
+}
 /**
  * Union type of all event payloads
  */
-type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | SessionStartedEvent | SessionEndedEvent | TranscriptEvent | ResponseEvent | AudioEvent | ErrorEvent | ReconnectingEvent;
+type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | ReconnectingEvent | SessionStartedEvent | SessionEndedEvent | ReadyEvent | UserTranscriptEvent | ResponseEvent | AudioEvent | TurnCompleteEvent | ErrorEvent;
 /**
  * Simplified event handlers for common use cases
  */
-type TranscriptHandler = (text: string, isFinal: boolean) => void;
+type UserTranscriptHandler = (text: string) => void;
 type ResponseHandler = (text: string, isFinal: boolean) => void;
 type AudioHandler = (data: Uint8Array) => void;
 type ErrorHandler = (error: ErrorEvent) => void;
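
Note: code that listened for the old 'transcript' event moves to 'userTranscript', whose payload no longer carries isFinal or confidence, and can additionally observe the new 'ready' and 'turnComplete' events. A sketch against the payloads above, assuming client is an already-constructed LiveSpeechClient:

// Sketch using the 0.1.2 event payloads; `client` is assumed to exist.
client.on('ready', () => console.log('pipeline ready for audio'));
client.on('userTranscript', (e) => console.log('user:', e.text));
client.on('turnComplete', () => console.log('assistant turn finished'));
client.on('error', (e) => console.error(e.code, e.message));
client.setResponseHandler((text, isFinal) => {
  if (isFinal) console.log('assistant:', text);
});
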
@@ -231,30 +230,25 @@ type ErrorHandler = (error: ErrorEvent) => void;
 /**
  * WebSocket message types sent from client to server
  */
-type ClientMessageType = 'startSession' | 'endSession' | 'audio' | 'ping';
+type ClientMessageType = 'startSession' | 'endSession' | 'audioStart' | 'audioChunk' | 'audioEnd' | 'ping';
 /**
  * WebSocket message types received from server
  */
-type ServerMessageType = 'connected' | 'sessionStarted' | 'sessionEnded' | 'transcript' | 'response' | 'audio' | 'error' | 'pong';
+type ServerMessageType = 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'error' | 'pong';
 /**
  * Base interface for client messages
  */
 interface BaseClientMessage {
     action: ClientMessageType;
-    requestId?: string;
 }
 /**
  * Start session message
  */
 interface StartSessionMessage extends BaseClientMessage {
     action: 'startSession';
-    prePrompt: string;
-    voiceId?: string;
-    languageCode?: string;
-    inputFormat?: string;
-    outputFormat?: string;
-    sampleRate?: number;
-    metadata?: Record<string, string>;
+    prePrompt?: string;
+    language?: string;
+    pipelineMode?: 'live' | 'composed';
 }
 /**
  * End session message
@@ -263,14 +257,23 @@ interface EndSessionMessage extends BaseClientMessage {
     action: 'endSession';
 }
 /**
- * Audio data message
+ * Audio start message - begin streaming session
+ */
+interface AudioStartMessage extends BaseClientMessage {
+    action: 'audioStart';
+}
+/**
+ * Audio chunk message - send audio data
  */
-interface AudioMessage extends BaseClientMessage {
-    action: 'audio';
+interface AudioChunkMessage extends BaseClientMessage {
+    action: 'audioChunk';
     data: string;
-    format?: string;
-    sampleRate?: number;
-    isFinal?: boolean;
+}
+/**
+ * Audio end message - end streaming session
+ */
+interface AudioEndMessage extends BaseClientMessage {
+    action: 'audioEnd';
 }
 /**
  * Ping message for keep-alive
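
Note: on the wire, the single 'audio' action is replaced by an explicit audioStart / audioChunk / audioEnd envelope, and the per-chunk format, sampleRate, and isFinal fields are gone (input is implicitly PCM16). A plausible client-to-server frame sequence under the new types; the base64 payloads are shortened placeholders and ws is an assumed open WebSocket:

// Hypothetical frame sequence typed against the 0.1.2 ClientMessage union.
const frames: ClientMessage[] = [
  { action: 'audioStart' },
  { action: 'audioChunk', data: 'AAAA//8BAP7/AgD9...' }, // base64 PCM16 (truncated)
  { action: 'audioChunk', data: '/P8EAPv/BQD6/wYA...' },
  { action: 'audioEnd' }
];
for (const frame of frames) ws.send(JSON.stringify(frame));
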
@@ -281,22 +284,14 @@ interface PingMessage extends BaseClientMessage {
 /**
  * Union type of all client messages
  */
-type ClientMessage = StartSessionMessage | EndSessionMessage | AudioMessage | PingMessage;
+type ClientMessage = StartSessionMessage | EndSessionMessage | AudioStartMessage | AudioChunkMessage | AudioEndMessage | PingMessage;
 /**
  * Base interface for server messages
  */
 interface BaseServerMessage {
     type: ServerMessageType;
-    requestId?: string;
     timestamp: string;
 }
-/**
- * Connected message from server
- */
-interface ServerConnectedMessage extends BaseServerMessage {
-    type: 'connected';
-    connectionId: string;
-}
 /**
  * Session started message from server
  */
@@ -312,13 +307,11 @@ interface ServerSessionEndedMessage extends BaseServerMessage {
     sessionId: string;
 }
 /**
- * Transcript message from server
+ * User transcript message from server (user's speech transcription)
  */
-interface ServerTranscriptMessage extends BaseServerMessage {
-    type: 'transcript';
+interface ServerUserTranscriptMessage extends BaseServerMessage {
+    type: 'userTranscript';
     text: string;
-    isFinal: boolean;
-    confidence?: number;
 }
 /**
  * Response message from server
@@ -344,7 +337,6 @@ interface ServerErrorMessage extends BaseServerMessage {
     type: 'error';
     code: string;
     message: string;
-    details?: unknown;
 }
 /**
  * Pong message from server
@@ -352,10 +344,24 @@ interface ServerErrorMessage extends BaseServerMessage {
 interface ServerPongMessage extends BaseServerMessage {
     type: 'pong';
 }
+/**
+ * Turn complete message from server
+ * Indicates the AI has finished its response turn
+ */
+interface ServerTurnCompleteMessage extends BaseServerMessage {
+    type: 'turnComplete';
+}
+/**
+ * Ready message from server
+ * Indicates the Gemini Live session is ready for audio input
+ */
+interface ServerReadyMessage extends BaseServerMessage {
+    type: 'ready';
+}
 /**
  * Union type of all server messages
  */
-type ServerMessage = ServerConnectedMessage | ServerSessionStartedMessage | ServerSessionEndedMessage | ServerTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerErrorMessage | ServerPongMessage;
+type ServerMessage = ServerSessionStartedMessage | ServerSessionEndedMessage | ServerReadyMessage | ServerUserTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerTurnCompleteMessage | ServerErrorMessage | ServerPongMessage;
 
 /**
  * Connection state
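
Note: ServerMessage is still a discriminated union on type, so handlers can narrow the reworked shape exhaustively. A small sketch:

// Sketch: narrowing the 0.1.2 ServerMessage union by its `type` discriminant.
function describe(msg: ServerMessage): string {
  switch (msg.type) {
    case 'ready': return `ready at ${msg.timestamp}`;
    case 'userTranscript': return `user said: ${msg.text}`;
    case 'turnComplete': return 'assistant turn complete';
    case 'error': return `${msg.code}: ${msg.message}`;
    default: return msg.type; // sessionStarted, sessionEnded, response, audio, pong
  }
}
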
@@ -368,13 +374,15 @@ type ConnectionState = 'disconnected' | 'connecting' | 'connected' | 'reconnecti
 type LiveSpeechEventMap = {
     connected: ConnectedEvent;
     disconnected: DisconnectedEvent;
+    reconnecting: ReconnectingEvent;
     sessionStarted: SessionStartedEvent;
     sessionEnded: SessionEndedEvent;
-    transcript: TranscriptEvent;
+    ready: ReadyEvent;
+    userTranscript: UserTranscriptEvent;
     response: ResponseEvent;
     audio: AudioEvent;
+    turnComplete: TurnCompleteEvent;
     error: ErrorEvent;
-    reconnecting: ReconnectingEvent;
 };
 /**
  * LiveSpeech client for real-time speech-to-speech AI conversations
@@ -385,9 +393,9 @@ declare class LiveSpeechClient {
     private readonly audioEncoder;
     private readonly logger;
     private sessionId;
-    private sessionConfig;
+    private isStreaming;
     private readonly eventListeners;
-    private transcriptHandler;
+    private userTranscriptHandler;
     private responseHandler;
     private audioHandler;
     private errorHandler;
@@ -412,6 +420,10 @@ declare class LiveSpeechClient {
      * Check if session is active
      */
     get hasActiveSession(): boolean;
+    /**
+     * Check if audio streaming is active
+     */
+    get isAudioStreaming(): boolean;
     /**
      * Connect to the server
      */
@@ -423,18 +435,23 @@ declare class LiveSpeechClient {
     /**
      * Start a new session
      */
-    startSession(config: SessionConfig): Promise<string>;
+    startSession(config?: SessionConfig): Promise<string>;
     /**
      * End the current session
      */
     endSession(): Promise<void>;
     /**
-     * Send audio data
+     * Start audio streaming session
+     */
+    audioStart(): void;
+    /**
+     * Send audio chunk (PCM16 base64 encoded)
+     */
+    sendAudioChunk(data: Uint8Array): void;
+    /**
+     * End audio streaming session
      */
-    sendAudio(data: Uint8Array, options?: {
-        format?: AudioFormat;
-        isFinal?: boolean;
-    }): void;
+    audioEnd(): void;
     /**
      * Add event listener
      */
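
Note: the one-shot sendAudio(data, options) call becomes a three-step streaming API: audioStart(), repeated sendAudioChunk() calls, then audioEnd(). A sketch of an utterance loop against the declarations above; getMicChunks() is a hypothetical async iterator yielding PCM16 Uint8Array frames, not a package API:

// Sketch; getMicChunks() is a hypothetical microphone source.
async function streamUtterance(client: LiveSpeechClient): Promise<void> {
  client.audioStart(); // throws if there is no session or a stream is already open
  try {
    for await (const chunk of getMicChunks()) {
      client.sendAudioChunk(chunk); // the client base64-encodes each chunk
    }
  } finally {
    client.audioEnd();
  }
}
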
@@ -444,17 +461,17 @@ declare class LiveSpeechClient {
      */
     off<K extends keyof LiveSpeechEventMap>(event: K, listener: (event: LiveSpeechEventMap[K]) => void): void;
     /**
-     * Set transcript handler (simplified)
-     */
-    setTranscriptHandler(handler: TranscriptHandler): void;
-    /**
-     * Set response handler (simplified)
+     * Set response handler
      */
     setResponseHandler(handler: ResponseHandler): void;
     /**
      * Set audio handler (simplified)
      */
     setAudioHandler(handler: AudioHandler): void;
+    /**
+     * Set user transcript handler
+     */
+    setUserTranscriptHandler(handler: UserTranscriptHandler): void;
     /**
      * Set error handler (simplified)
      */
@@ -467,6 +484,10 @@ declare class LiveSpeechClient {
     private handleMessage;
 }
 
+/**
+ * Audio format type
+ */
+type AudioFormat = 'pcm16' | 'opus' | 'wav';
 /**
  * Audio encoder options
  */
@@ -552,4 +573,4 @@ declare class AudioEncoder {
     wrapWav(data: Uint8Array): Uint8Array;
 }
 
-export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type AudioFormat, type AudioHandler, type ClientMessage, type ClientMessageType, type ConnectedEvent, type ConnectionState, type DisconnectReason, type DisconnectedEvent, type ErrorCode, type ErrorEvent, type ErrorHandler, LiveSpeechClient, type LiveSpeechConfig, type LiveSpeechEvent, type LiveSpeechEventMap, type LiveSpeechEventType, type ReconnectingEvent, Region, Region as RegionType, type ResolvedConfig, type ResponseEvent, type ResponseHandler, type ServerMessage, type ServerMessageType, type SessionConfig, type SessionEndedEvent, type SessionStartedEvent, type TranscriptEvent, type TranscriptHandler, createWavHeader, decodeBase64ToAudio, encodeAudioToBase64, extractPcmFromWav, float32ToInt16, getEndpointForRegion, int16ToFloat32, int16ToUint8, isValidRegion, uint8ToInt16, wrapPcmInWav };
+export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type AudioHandler, type ClientMessage, type ClientMessageType, type ConnectedEvent, type ConnectionState, type DisconnectReason, type DisconnectedEvent, type ErrorCode, type ErrorEvent, type ErrorHandler, LiveSpeechClient, type LiveSpeechConfig, type LiveSpeechEvent, type LiveSpeechEventMap, type LiveSpeechEventType, type PipelineMode, type ReadyEvent, type ReconnectingEvent, Region, Region as RegionType, type ResolvedConfig, type ResponseEvent, type ResponseHandler, type ServerMessage, type ServerMessageType, type SessionConfig, type SessionEndedEvent, type SessionStartedEvent, type TurnCompleteEvent, type UserTranscriptEvent, type UserTranscriptHandler, createWavHeader, decodeBase64ToAudio, encodeAudioToBase64, extractPcmFromWav, float32ToInt16, getEndpointForRegion, int16ToFloat32, int16ToUint8, isValidRegion, uint8ToInt16, wrapPcmInWav };
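
Note: taken together, an 0.1.0-to-0.1.2 migration touches three call sites: session setup, audio sending, and transcript handling. A hedged before/after sketch; the prompt and language values are illustrative:

// 0.1.0:
//   await client.startSession({ prePrompt: '...', voiceId: 'en-US-Standard-A', languageCode: 'en-US' });
//   client.setTranscriptHandler((text, isFinal) => { /* ... */ });
//   client.sendAudio(pcm, { isFinal: true });
// 0.1.2:
await client.startSession({ prePrompt: 'Be concise.', language: 'en-US', pipelineMode: 'live' });
client.setUserTranscriptHandler((text) => console.log('user:', text));
client.audioStart();
client.sendAudioChunk(pcm); // PCM16 Uint8Array
client.audioEnd();
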
package/dist/index.js CHANGED
@@ -46,7 +46,7 @@ var Region = {
 };
 var REGION_ENDPOINTS = {
   "ap-northeast-2": "wss://talk.drawdream.co.kr",
-  "us-west-2": "wss://talk..drawdream.ca"
+  "us-west-2": "wss://talk.drawdream.ca"
   // Coming soon
 };
 function getEndpointForRegion(region) {
@@ -332,22 +332,13 @@ var WebSocketConnection = class {
   /**
    * Handle incoming message
    */
-  handleMessage(data, onFirstConnect) {
+  handleMessage(data) {
     const message = parseServerMessage(data);
     if (!message) {
       this.logger.warn("Invalid message received:", data);
       return;
     }
     this.logger.debug("Received message:", message.type);
-    if (message.type === "connected") {
-      this.connectionId = message.connectionId;
-      this.state = "connected";
-      this.retryController.reset();
-      this.startPingInterval();
-      this.events.onOpen?.(message.connectionId);
-      onFirstConnect?.();
-      return;
-    }
     if (message.type === "pong") {
       this.logger.debug("Pong received");
       return;
@@ -613,24 +604,17 @@ var CONFIG_DEFAULTS = {
   reconnectDelay: 1e3,
   debug: false
 };
-var SESSION_DEFAULTS = {
-  voiceId: "en-US-Standard-A",
-  languageCode: "en-US",
-  inputFormat: "pcm16",
-  outputFormat: "pcm16",
-  sampleRate: 16e3
-};
 var LiveSpeechClient = class {
   config;
   connection;
   audioEncoder;
   logger;
   sessionId = null;
-  sessionConfig = null;
+  isStreaming = false;
   // Event listeners using a simple map
   eventListeners = /* @__PURE__ */ new Map();
   // Simplified handlers
-  transcriptHandler = null;
+  userTranscriptHandler = null;
   responseHandler = null;
   audioHandler = null;
   errorHandler = null;
@@ -692,6 +676,12 @@ var LiveSpeechClient = class {
   get hasActiveSession() {
     return this.sessionId !== null;
   }
+  /**
+   * Check if audio streaming is active
+   */
+  get isAudioStreaming() {
+    return this.isStreaming;
+  }
   /**
    * Connect to the server
    */
@@ -705,7 +695,7 @@ var LiveSpeechClient = class {
   disconnect() {
     this.logger.info("Disconnecting...");
     this.sessionId = null;
-    this.sessionConfig = null;
+    this.isStreaming = false;
     this.connection.disconnect();
   }
   /**
@@ -718,16 +708,6 @@ var LiveSpeechClient = class {
     if (this.sessionId) {
       throw new Error("Session already active. Call endSession() first.");
     }
-    const resolvedConfig = {
-      prePrompt: config.prePrompt,
-      voiceId: config.voiceId ?? SESSION_DEFAULTS.voiceId,
-      languageCode: config.languageCode ?? SESSION_DEFAULTS.languageCode,
-      inputFormat: config.inputFormat ?? SESSION_DEFAULTS.inputFormat,
-      outputFormat: config.outputFormat ?? SESSION_DEFAULTS.outputFormat,
-      sampleRate: config.sampleRate ?? SESSION_DEFAULTS.sampleRate,
-      metadata: config.metadata ?? {}
-    };
-    this.sessionConfig = resolvedConfig;
     this.logger.info("Starting session...");
     return new Promise((resolve, reject) => {
       const onSessionStarted = (event) => {
@@ -744,16 +724,17 @@ var LiveSpeechClient = class {
       };
       this.on("sessionStarted", onSessionStarted);
       this.on("error", onError);
-      this.connection.send({
-        action: "startSession",
-        prePrompt: resolvedConfig.prePrompt,
-        voiceId: resolvedConfig.voiceId,
-        languageCode: resolvedConfig.languageCode,
-        inputFormat: resolvedConfig.inputFormat,
-        outputFormat: resolvedConfig.outputFormat,
-        sampleRate: resolvedConfig.sampleRate,
-        metadata: resolvedConfig.metadata
-      });
+      const startMessage = {
+        action: "startSession"
+      };
+      if (config?.prePrompt) {
+        startMessage.prePrompt = config.prePrompt;
+      }
+      if (config?.language) {
+        startMessage.language = config.language;
+      }
+      startMessage.pipelineMode = config?.pipelineMode ?? "live";
+      this.connection.send(startMessage);
     });
   }
   /**
@@ -765,6 +746,9 @@ var LiveSpeechClient = class {
       return;
     }
     this.logger.info("Ending session...");
+    if (this.isStreaming) {
+      this.audioEnd();
+    }
     return new Promise((resolve) => {
       const onSessionEnded = () => {
         this.off("sessionEnded", onSessionEnded);
@@ -775,28 +759,49 @@ var LiveSpeechClient = class {
     });
   }
   /**
-   * Send audio data
+   * Start audio streaming session
    */
-  sendAudio(data, options) {
+  audioStart() {
     if (!this.isConnected) {
       throw new Error("Not connected");
     }
     if (!this.sessionId) {
       throw new Error("No active session. Call startSession() first.");
     }
+    if (this.isStreaming) {
+      throw new Error("Already streaming. Call audioEnd() first.");
+    }
+    this.logger.info("Starting audio stream...");
+    this.connection.send({ action: "audioStart" });
+    this.isStreaming = true;
+  }
+  /**
+   * Send audio chunk (PCM16 base64 encoded)
+   */
+  sendAudioChunk(data) {
+    if (!this.isConnected) {
+      throw new Error("Not connected");
+    }
+    if (!this.isStreaming) {
+      throw new Error("Not streaming. Call audioStart() first.");
+    }
     const base64Data = this.audioEncoder.encode(data);
-    const format = options?.format ?? this.sessionConfig?.inputFormat ?? SESSION_DEFAULTS.inputFormat;
-    const sampleRate = this.sessionConfig?.sampleRate ?? SESSION_DEFAULTS.sampleRate;
-    const audioMessage = {
-      action: "audio",
-      data: base64Data,
-      format,
-      sampleRate
-    };
-    if (options?.isFinal !== void 0) {
-      audioMessage.isFinal = options.isFinal;
+    this.connection.send({
+      action: "audioChunk",
+      data: base64Data
+    });
+  }
+  /**
+   * End audio streaming session
+   */
+  audioEnd() {
+    if (!this.isStreaming) {
+      this.logger.warn("Not streaming");
+      return;
     }
-    this.connection.send(audioMessage);
+    this.logger.info("Ending audio stream...");
+    this.connection.send({ action: "audioEnd" });
+    this.isStreaming = false;
   }
   // ==================== Event System ====================
   /**
@@ -818,13 +823,7 @@ var LiveSpeechClient = class {
     }
   }
   /**
-   * Set transcript handler (simplified)
-   */
-  setTranscriptHandler(handler) {
-    this.transcriptHandler = handler;
-  }
-  /**
-   * Set response handler (simplified)
+   * Set response handler
    */
   setResponseHandler(handler) {
     this.responseHandler = handler;
@@ -835,6 +834,12 @@ var LiveSpeechClient = class {
   setAudioHandler(handler) {
     this.audioHandler = handler;
   }
+  /**
+   * Set user transcript handler
+   */
+  setUserTranscriptHandler(handler) {
+    this.userTranscriptHandler = handler;
+  }
   /**
    * Set error handler (simplified)
    */
@@ -864,7 +869,7 @@ var LiveSpeechClient = class {
   }
   handleDisconnected(code, _reason) {
     this.sessionId = null;
-    this.sessionConfig = null;
+    this.isStreaming = false;
     const event = {
       type: "disconnected",
       reason: code === 1e3 ? "normal" : "error",
906
911
  break;
907
912
  case "sessionEnded":
908
913
  this.sessionId = null;
909
- this.sessionConfig = null;
914
+ this.isStreaming = false;
910
915
  this.emit("sessionEnded", {
911
916
  type: "sessionEnded",
912
917
  sessionId: message.sessionId,
913
918
  timestamp: message.timestamp
914
919
  });
915
920
  break;
916
- case "transcript": {
917
- const transcriptEvent = {
918
- type: "transcript",
919
- text: message.text,
920
- isFinal: message.isFinal,
921
+ case "ready": {
922
+ const readyEvent = {
923
+ type: "ready",
921
924
  timestamp: message.timestamp
922
925
  };
923
- if (message.confidence !== void 0) {
924
- transcriptEvent.confidence = message.confidence;
925
- }
926
- this.emit("transcript", transcriptEvent);
927
- this.transcriptHandler?.(message.text, message.isFinal);
926
+ this.emit("ready", readyEvent);
928
927
  break;
929
928
  }
930
929
  case "response": {
@@ -951,8 +950,26 @@ var LiveSpeechClient = class {
951
950
  this.audioHandler?.(audioData);
952
951
  break;
953
952
  }
953
+ case "userTranscript": {
954
+ const userTranscriptEvent = {
955
+ type: "userTranscript",
956
+ text: message.text,
957
+ timestamp: message.timestamp
958
+ };
959
+ this.emit("userTranscript", userTranscriptEvent);
960
+ this.userTranscriptHandler?.(message.text);
961
+ break;
962
+ }
963
+ case "turnComplete": {
964
+ const turnCompleteEvent = {
965
+ type: "turnComplete",
966
+ timestamp: message.timestamp
967
+ };
968
+ this.emit("turnComplete", turnCompleteEvent);
969
+ break;
970
+ }
954
971
  case "error":
955
- this.handleError(message.code, message.message, message.details);
972
+ this.handleError(message.code, message.message);
956
973
  break;
957
974
  default:
958
975
  this.logger.warn("Unknown message type:", message.type);