@drawdream/livespeech 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -26,12 +26,12 @@ const client = new LiveSpeechClient({
 });
 
 // Handle events
-client.setTranscriptHandler((text, isFinal) => {
-  console.log(`Transcript: ${text} (final: ${isFinal})`);
+client.setUserTranscriptHandler((text) => {
+  console.log(`You said: ${text}`);
 });
 
-client.setResponseHandler((text, isFinal) => {
-  console.log(`AI Response: ${text}`);
+client.setTranscriptHandler((text, isFinal) => {
+  console.log(`AI Transcript: ${text}`);
 });
 
 client.setAudioHandler((audioData) => {
@@ -42,10 +42,44 @@ client.setAudioHandler((audioData) => {
 await client.connect();
 await client.startSession({
   prePrompt: 'You are a helpful assistant.',
+  // pipelineMode: 'live' is the default
+});
+
+// Start streaming and send audio
+client.audioStart();
+client.sendAudioChunk(audioBuffer);
+```
+
+## Pipeline Modes
+
+The SDK supports two pipeline modes for audio processing:
+
+### Live Mode (Default)
+
+Uses Gemini 2.5 Flash Live API for end-to-end audio conversation. This provides:
+- **Lower latency** - Direct audio-to-audio processing
+- **Natural conversation** - Built-in voice activity detection and turn-taking
+- **Real-time transcription** - Both user and AI speech are transcribed
+
+```typescript
+await client.startSession({
+  prePrompt: 'You are a helpful assistant.',
+  pipelineMode: 'live', // Default, can be omitted
 });
+```
+
+### Composed Mode
 
-// Send audio
-client.sendAudio(audioBuffer);
+Uses separate STT + LLM + TTS services for more customization:
+- **More control** - Separate services for each step
+- **Custom voices** - Use different TTS voices
+- **Text responses** - Access to intermediate text responses
+
+```typescript
+await client.startSession({
+  prePrompt: 'You are a helpful assistant.',
+  pipelineMode: 'composed',
+});
 ```
 
 ## API Reference
@@ -87,8 +121,9 @@ The SDK provides built-in region support, so you don't need to remember endpoint
 
 ```typescript
 // Simple handlers
-client.setTranscriptHandler((text, isFinal) => {});
-client.setResponseHandler((text, isFinal) => {});
+client.setUserTranscriptHandler((text) => {}); // User's speech transcription
+client.setTranscriptHandler((text, isFinal) => {}); // AI's speech transcription (live mode)
+client.setResponseHandler((text, isFinal) => {}); // AI text response (composed mode)
 client.setAudioHandler((audioData) => {});
 client.setErrorHandler((error) => {});
 
@@ -97,24 +132,23 @@ client.on('connected', (event) => {});
 client.on('disconnected', (event) => {});
 client.on('sessionStarted', (event) => {});
 client.on('sessionEnded', (event) => {});
-client.on('transcript', (event) => {});
-client.on('response', (event) => {});
+client.on('userTranscript', (event) => {}); // User's speech transcription
+client.on('transcript', (event) => {}); // AI's speech transcription
+client.on('response', (event) => {}); // AI text response
 client.on('audio', (event) => {});
 client.on('error', (event) => {});
 client.on('reconnecting', (event) => {});
+client.on('ready', (event) => {}); // Gemini Live ready (live mode)
+client.on('turnComplete', (event) => {}); // AI finished speaking (live mode)
 ```
 
 ### SessionConfig
 
 | Option | Type | Default | Description |
 |--------|------|---------|-------------|
-| `prePrompt` | `string` | **required** | System prompt for the AI |
-| `voiceId` | `string` | `'en-US-Standard-A'` | TTS voice ID |
-| `languageCode` | `string` | `'en-US'` | Language for STT |
-| `inputFormat` | `AudioFormat` | `'pcm16'` | Input audio format |
-| `outputFormat` | `AudioFormat` | `'pcm16'` | Output audio format |
-| `sampleRate` | `number` | `16000` | Sample rate in Hz |
-| `metadata` | `Record<string,string>` | `{}` | Custom metadata |
+| `prePrompt` | `string` | - | System prompt for the AI |
+| `language` | `string` | `'en-US'` | Language code for speech (e.g., "ko-KR") |
+| `pipelineMode` | `'live' \| 'composed'` | `'live'` | Audio processing mode |
 
 ## Audio Utilities
 
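Taken together, the README changes above amount to a small migration for 0.1.1 users: the user-speech callback moves from `setTranscriptHandler(text, isFinal)` to the new `setUserTranscriptHandler(text)` (no finality flag), and the removed `sendAudio()` becomes an explicit `audioStart()` followed by `sendAudioChunk()`. A minimal sketch of the 0.1.2 shape, assuming a client configured as in the quick start; `audioBuffer` is a placeholder for your captured PCM data:

```typescript
import { LiveSpeechClient } from '@drawdream/livespeech';

declare const client: LiveSpeechClient; // configured as in the quick start
declare const audioBuffer: Uint8Array;  // PCM audio from your capture pipeline

// 0.1.1: client.setTranscriptHandler((text, isFinal) => ...) carried the
// user's speech; in 0.1.2 that moves here and drops the isFinal flag.
client.setUserTranscriptHandler((text) => {
  console.log(`You said: ${text}`);
});

// Unchanged signature; now documented as the composed-mode text response.
client.setResponseHandler((text, isFinal) => {
  console.log(`AI response: ${text} (final: ${isFinal})`);
});

await client.connect();
await client.startSession({ prePrompt: 'You are a helpful assistant.' });

// 0.1.1: client.sendAudio(audioBuffer);
client.audioStart();
client.sendAudioChunk(audioBuffer);
```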
package/dist/index.d.mts CHANGED
@@ -21,6 +21,12 @@ declare function getEndpointForRegion(region: Region): string;
  */
 declare function isValidRegion(value: string): value is Region;
 
+/**
+ * Pipeline mode for audio processing
+ * - 'live': Uses Gemini Live API for end-to-end audio conversation (default)
+ * - 'composed': Uses separate STT + LLM + TTS services
+ */
+type PipelineMode = 'live' | 'composed';
 /**
  * Configuration options for the LiveSpeech client
  *
@@ -75,6 +81,18 @@ interface SessionConfig {
      * System prompt for the AI assistant
      */
     prePrompt?: string;
+    /**
+     * Language code for speech recognition (e.g., "en-US", "ko-KR")
+     * @default "en-US"
+     */
+    language?: string;
+    /**
+     * Pipeline mode for audio processing
+     * - 'live': Uses Gemini Live API for end-to-end audio conversation (default, lower latency)
+     * - 'composed': Uses separate STT + LLM + TTS services (more customizable)
+     * @default "live"
+     */
+    pipelineMode?: PipelineMode;
 }
 /**
  * Internal resolved configuration with defaults applied
@@ -92,7 +110,7 @@ interface ResolvedConfig {
 /**
  * Event types emitted by the LiveSpeech client
  */
-type LiveSpeechEventType = 'connected' | 'disconnected' | 'sessionStarted' | 'sessionEnded' | 'streamingStarted' | 'speechStart' | 'speechEnd' | 'transcript' | 'response' | 'audio' | 'error' | 'reconnecting';
+type LiveSpeechEventType = 'connected' | 'disconnected' | 'reconnecting' | 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'error';
 /**
  * Event payload for 'connected' event
  */
@@ -131,34 +149,19 @@ interface SessionEndedEvent {
     timestamp: string;
 }
 /**
- * Event payload for 'streamingStarted' event - acknowledgment of audioStart
+ * Event payload for 'ready' event
  */
-interface StreamingStartedEvent {
-    type: 'streamingStarted';
+interface ReadyEvent {
+    type: 'ready';
     timestamp: string;
 }
 /**
- * Event payload for 'speechStart' event - VAD detected speech begin
+ * Event payload for 'userTranscript' event
+ * User's speech transcription
  */
-interface SpeechStartEvent {
-    type: 'speechStart';
-    timestamp: string;
-}
-/**
- * Event payload for 'speechEnd' event - VAD detected speech end
- */
-interface SpeechEndEvent {
-    type: 'speechEnd';
-    timestamp: string;
-}
-/**
- * Event payload for 'transcript' event
- */
-interface TranscriptEvent {
-    type: 'transcript';
+interface UserTranscriptEvent {
+    type: 'userTranscript';
     text: string;
-    isFinal: boolean;
-    confidence?: number;
     timestamp: string;
 }
 /**
@@ -204,14 +207,22 @@ interface ReconnectingEvent {
     delay: number;
     timestamp: string;
 }
+/**
+ * Event payload for 'turnComplete' event (both modes)
+ * Indicates the AI has finished its response turn
+ */
+interface TurnCompleteEvent {
+    type: 'turnComplete';
+    timestamp: string;
+}
 /**
  * Union type of all event payloads
  */
-type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | SessionStartedEvent | SessionEndedEvent | StreamingStartedEvent | SpeechStartEvent | SpeechEndEvent | TranscriptEvent | ResponseEvent | AudioEvent | ErrorEvent | ReconnectingEvent;
+type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | ReconnectingEvent | SessionStartedEvent | SessionEndedEvent | ReadyEvent | UserTranscriptEvent | ResponseEvent | AudioEvent | TurnCompleteEvent | ErrorEvent;
 /**
  * Simplified event handlers for common use cases
  */
-type TranscriptHandler = (text: string, isFinal: boolean) => void;
+type UserTranscriptHandler = (text: string) => void;
 type ResponseHandler = (text: string, isFinal: boolean) => void;
 type AudioHandler = (data: Uint8Array) => void;
 type ErrorHandler = (error: ErrorEvent) => void;
@@ -223,7 +234,7 @@ type ClientMessageType = 'startSession' | 'endSession' | 'audioStart' | 'audioCh
 /**
  * WebSocket message types received from server
  */
-type ServerMessageType = 'sessionStarted' | 'sessionEnded' | 'streamingStarted' | 'speechStart' | 'speechEnd' | 'transcript' | 'response' | 'audio' | 'error' | 'pong';
+type ServerMessageType = 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'error' | 'pong';
 /**
  * Base interface for client messages
  */
@@ -236,6 +247,8 @@ interface BaseClientMessage {
 interface StartSessionMessage extends BaseClientMessage {
     action: 'startSession';
     prePrompt?: string;
+    language?: string;
+    pipelineMode?: 'live' | 'composed';
 }
 /**
  * End session message
@@ -294,31 +307,11 @@ interface ServerSessionEndedMessage extends BaseServerMessage {
     sessionId: string;
 }
 /**
- * Streaming started message - acknowledgment of audioStart
+ * User transcript message from server (user's speech transcription)
  */
-interface ServerStreamingStartedMessage extends BaseServerMessage {
-    type: 'streamingStarted';
-}
-/**
- * Speech start message - VAD detected speech begin
- */
-interface ServerSpeechStartMessage extends BaseServerMessage {
-    type: 'speechStart';
-}
-/**
- * Speech end message - VAD detected speech end
- */
-interface ServerSpeechEndMessage extends BaseServerMessage {
-    type: 'speechEnd';
-}
-/**
- * Transcript message from server
- */
-interface ServerTranscriptMessage extends BaseServerMessage {
-    type: 'transcript';
+interface ServerUserTranscriptMessage extends BaseServerMessage {
+    type: 'userTranscript';
     text: string;
-    isFinal: boolean;
-    confidence?: number;
 }
 /**
  * Response message from server
@@ -351,10 +344,24 @@ interface ServerErrorMessage extends BaseServerMessage {
 interface ServerPongMessage extends BaseServerMessage {
     type: 'pong';
 }
+/**
+ * Turn complete message from server
+ * Indicates the AI has finished its response turn
+ */
+interface ServerTurnCompleteMessage extends BaseServerMessage {
+    type: 'turnComplete';
+}
+/**
+ * Ready message from server
+ * Indicates the Gemini Live session is ready for audio input
+ */
+interface ServerReadyMessage extends BaseServerMessage {
+    type: 'ready';
+}
 /**
  * Union type of all server messages
  */
-type ServerMessage = ServerSessionStartedMessage | ServerSessionEndedMessage | ServerStreamingStartedMessage | ServerSpeechStartMessage | ServerSpeechEndMessage | ServerTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerErrorMessage | ServerPongMessage;
+type ServerMessage = ServerSessionStartedMessage | ServerSessionEndedMessage | ServerReadyMessage | ServerUserTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerTurnCompleteMessage | ServerErrorMessage | ServerPongMessage;
 
 /**
  * Connection state
@@ -367,16 +374,15 @@ type ConnectionState = 'disconnected' | 'connecting' | 'connected' | 'reconnecti
 type LiveSpeechEventMap = {
     connected: ConnectedEvent;
     disconnected: DisconnectedEvent;
+    reconnecting: ReconnectingEvent;
     sessionStarted: SessionStartedEvent;
     sessionEnded: SessionEndedEvent;
-    streamingStarted: StreamingStartedEvent;
-    speechStart: SpeechStartEvent;
-    speechEnd: SpeechEndEvent;
-    transcript: TranscriptEvent;
+    ready: ReadyEvent;
+    userTranscript: UserTranscriptEvent;
     response: ResponseEvent;
     audio: AudioEvent;
+    turnComplete: TurnCompleteEvent;
     error: ErrorEvent;
-    reconnecting: ReconnectingEvent;
 };
 /**
  * LiveSpeech client for real-time speech-to-speech AI conversations
@@ -389,7 +395,7 @@ declare class LiveSpeechClient {
     private sessionId;
     private isStreaming;
     private readonly eventListeners;
-    private transcriptHandler;
+    private userTranscriptHandler;
     private responseHandler;
     private audioHandler;
     private errorHandler;
@@ -455,17 +461,17 @@ declare class LiveSpeechClient {
      */
    off<K extends keyof LiveSpeechEventMap>(event: K, listener: (event: LiveSpeechEventMap[K]) => void): void;
     /**
-     * Set transcript handler (simplified)
-     */
-    setTranscriptHandler(handler: TranscriptHandler): void;
-    /**
-     * Set response handler (simplified)
+     * Set response handler
      */
     setResponseHandler(handler: ResponseHandler): void;
     /**
      * Set audio handler (simplified)
      */
     setAudioHandler(handler: AudioHandler): void;
+    /**
+     * Set user transcript handler
+     */
+    setUserTranscriptHandler(handler: UserTranscriptHandler): void;
     /**
      * Set error handler (simplified)
      */
@@ -567,4 +573,4 @@ declare class AudioEncoder {
     wrapWav(data: Uint8Array): Uint8Array;
 }
 
-export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type AudioHandler, type ClientMessage, type ClientMessageType, type ConnectedEvent, type ConnectionState, type DisconnectReason, type DisconnectedEvent, type ErrorCode, type ErrorEvent, type ErrorHandler, LiveSpeechClient, type LiveSpeechConfig, type LiveSpeechEvent, type LiveSpeechEventMap, type LiveSpeechEventType, type ReconnectingEvent, Region, Region as RegionType, type ResolvedConfig, type ResponseEvent, type ResponseHandler, type ServerMessage, type ServerMessageType, type SessionConfig, type SessionEndedEvent, type SessionStartedEvent, type SpeechEndEvent, type SpeechStartEvent, type StreamingStartedEvent, type TranscriptEvent, type TranscriptHandler, createWavHeader, decodeBase64ToAudio, encodeAudioToBase64, extractPcmFromWav, float32ToInt16, getEndpointForRegion, int16ToFloat32, int16ToUint8, isValidRegion, uint8ToInt16, wrapPcmInWav };
+export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type AudioHandler, type ClientMessage, type ClientMessageType, type ConnectedEvent, type ConnectionState, type DisconnectReason, type DisconnectedEvent, type ErrorCode, type ErrorEvent, type ErrorHandler, LiveSpeechClient, type LiveSpeechConfig, type LiveSpeechEvent, type LiveSpeechEventMap, type LiveSpeechEventType, type PipelineMode, type ReadyEvent, type ReconnectingEvent, Region, Region as RegionType, type ResolvedConfig, type ResponseEvent, type ResponseHandler, type ServerMessage, type ServerMessageType, type SessionConfig, type SessionEndedEvent, type SessionStartedEvent, type TurnCompleteEvent, type UserTranscriptEvent, type UserTranscriptHandler, createWavHeader, decodeBase64ToAudio, encodeAudioToBase64, extractPcmFromWav, float32ToInt16, getEndpointForRegion, int16ToFloat32, int16ToUint8, isValidRegion, uint8ToInt16, wrapPcmInWav };
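Since `on()` is keyed by `LiveSpeechEventMap`, the events introduced in these declarations are typed at the call site. A short sketch wiring the three new live-mode signals, using only the payload fields declared above (`client` is assumed to be an already-constructed instance):

```typescript
import { LiveSpeechClient } from '@drawdream/livespeech';

declare const client: LiveSpeechClient;

// 'ready': the Gemini Live session will now accept audio input.
client.on('ready', (event) => {
  console.log(`ready at ${event.timestamp}`);
});

// 'userTranscript': replaces the old 'transcript' event for user speech;
// the payload carries only text and timestamp (isFinal/confidence were dropped).
client.on('userTranscript', (event) => {
  console.log(`user: ${event.text}`);
});

// 'turnComplete': the AI finished its response turn.
client.on('turnComplete', (event) => {
  console.log(`turn complete at ${event.timestamp}`);
});
```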
package/dist/index.d.ts CHANGED
@@ -21,6 +21,12 @@ declare function getEndpointForRegion(region: Region): string;
  */
 declare function isValidRegion(value: string): value is Region;
 
+/**
+ * Pipeline mode for audio processing
+ * - 'live': Uses Gemini Live API for end-to-end audio conversation (default)
+ * - 'composed': Uses separate STT + LLM + TTS services
+ */
+type PipelineMode = 'live' | 'composed';
 /**
  * Configuration options for the LiveSpeech client
  *
@@ -75,6 +81,18 @@ interface SessionConfig {
      * System prompt for the AI assistant
      */
     prePrompt?: string;
+    /**
+     * Language code for speech recognition (e.g., "en-US", "ko-KR")
+     * @default "en-US"
+     */
+    language?: string;
+    /**
+     * Pipeline mode for audio processing
+     * - 'live': Uses Gemini Live API for end-to-end audio conversation (default, lower latency)
+     * - 'composed': Uses separate STT + LLM + TTS services (more customizable)
+     * @default "live"
+     */
+    pipelineMode?: PipelineMode;
 }
 /**
  * Internal resolved configuration with defaults applied
@@ -92,7 +110,7 @@ interface ResolvedConfig {
 /**
  * Event types emitted by the LiveSpeech client
  */
-type LiveSpeechEventType = 'connected' | 'disconnected' | 'sessionStarted' | 'sessionEnded' | 'streamingStarted' | 'speechStart' | 'speechEnd' | 'transcript' | 'response' | 'audio' | 'error' | 'reconnecting';
+type LiveSpeechEventType = 'connected' | 'disconnected' | 'reconnecting' | 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'error';
 /**
  * Event payload for 'connected' event
  */
@@ -131,34 +149,19 @@ interface SessionEndedEvent {
     timestamp: string;
 }
 /**
- * Event payload for 'streamingStarted' event - acknowledgment of audioStart
+ * Event payload for 'ready' event
  */
-interface StreamingStartedEvent {
-    type: 'streamingStarted';
+interface ReadyEvent {
+    type: 'ready';
     timestamp: string;
 }
 /**
- * Event payload for 'speechStart' event - VAD detected speech begin
+ * Event payload for 'userTranscript' event
+ * User's speech transcription
  */
-interface SpeechStartEvent {
-    type: 'speechStart';
-    timestamp: string;
-}
-/**
- * Event payload for 'speechEnd' event - VAD detected speech end
- */
-interface SpeechEndEvent {
-    type: 'speechEnd';
-    timestamp: string;
-}
-/**
- * Event payload for 'transcript' event
- */
-interface TranscriptEvent {
-    type: 'transcript';
+interface UserTranscriptEvent {
+    type: 'userTranscript';
     text: string;
-    isFinal: boolean;
-    confidence?: number;
     timestamp: string;
 }
 /**
@@ -204,14 +207,22 @@ interface ReconnectingEvent {
     delay: number;
     timestamp: string;
 }
+/**
+ * Event payload for 'turnComplete' event (both modes)
+ * Indicates the AI has finished its response turn
+ */
+interface TurnCompleteEvent {
+    type: 'turnComplete';
+    timestamp: string;
+}
 /**
  * Union type of all event payloads
  */
-type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | SessionStartedEvent | SessionEndedEvent | StreamingStartedEvent | SpeechStartEvent | SpeechEndEvent | TranscriptEvent | ResponseEvent | AudioEvent | ErrorEvent | ReconnectingEvent;
+type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | ReconnectingEvent | SessionStartedEvent | SessionEndedEvent | ReadyEvent | UserTranscriptEvent | ResponseEvent | AudioEvent | TurnCompleteEvent | ErrorEvent;
 /**
  * Simplified event handlers for common use cases
  */
-type TranscriptHandler = (text: string, isFinal: boolean) => void;
+type UserTranscriptHandler = (text: string) => void;
 type ResponseHandler = (text: string, isFinal: boolean) => void;
 type AudioHandler = (data: Uint8Array) => void;
 type ErrorHandler = (error: ErrorEvent) => void;
@@ -223,7 +234,7 @@ type ClientMessageType = 'startSession' | 'endSession' | 'audioStart' | 'audioCh
 /**
  * WebSocket message types received from server
  */
-type ServerMessageType = 'sessionStarted' | 'sessionEnded' | 'streamingStarted' | 'speechStart' | 'speechEnd' | 'transcript' | 'response' | 'audio' | 'error' | 'pong';
+type ServerMessageType = 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'error' | 'pong';
 /**
  * Base interface for client messages
  */
@@ -236,6 +247,8 @@ interface BaseClientMessage {
 interface StartSessionMessage extends BaseClientMessage {
     action: 'startSession';
     prePrompt?: string;
+    language?: string;
+    pipelineMode?: 'live' | 'composed';
 }
 /**
  * End session message
@@ -294,31 +307,11 @@ interface ServerSessionEndedMessage extends BaseServerMessage {
     sessionId: string;
 }
 /**
- * Streaming started message - acknowledgment of audioStart
+ * User transcript message from server (user's speech transcription)
  */
-interface ServerStreamingStartedMessage extends BaseServerMessage {
-    type: 'streamingStarted';
-}
-/**
- * Speech start message - VAD detected speech begin
- */
-interface ServerSpeechStartMessage extends BaseServerMessage {
-    type: 'speechStart';
-}
-/**
- * Speech end message - VAD detected speech end
- */
-interface ServerSpeechEndMessage extends BaseServerMessage {
-    type: 'speechEnd';
-}
-/**
- * Transcript message from server
- */
-interface ServerTranscriptMessage extends BaseServerMessage {
-    type: 'transcript';
+interface ServerUserTranscriptMessage extends BaseServerMessage {
+    type: 'userTranscript';
     text: string;
-    isFinal: boolean;
-    confidence?: number;
 }
 /**
  * Response message from server
@@ -351,10 +344,24 @@ interface ServerErrorMessage extends BaseServerMessage {
 interface ServerPongMessage extends BaseServerMessage {
     type: 'pong';
 }
+/**
+ * Turn complete message from server
+ * Indicates the AI has finished its response turn
+ */
+interface ServerTurnCompleteMessage extends BaseServerMessage {
+    type: 'turnComplete';
+}
+/**
+ * Ready message from server
+ * Indicates the Gemini Live session is ready for audio input
+ */
+interface ServerReadyMessage extends BaseServerMessage {
+    type: 'ready';
+}
 /**
  * Union type of all server messages
  */
-type ServerMessage = ServerSessionStartedMessage | ServerSessionEndedMessage | ServerStreamingStartedMessage | ServerSpeechStartMessage | ServerSpeechEndMessage | ServerTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerErrorMessage | ServerPongMessage;
+type ServerMessage = ServerSessionStartedMessage | ServerSessionEndedMessage | ServerReadyMessage | ServerUserTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerTurnCompleteMessage | ServerErrorMessage | ServerPongMessage;
 
 /**
  * Connection state
@@ -367,16 +374,15 @@ type ConnectionState = 'disconnected' | 'connecting' | 'connected' | 'reconnecti
 type LiveSpeechEventMap = {
     connected: ConnectedEvent;
     disconnected: DisconnectedEvent;
+    reconnecting: ReconnectingEvent;
     sessionStarted: SessionStartedEvent;
     sessionEnded: SessionEndedEvent;
-    streamingStarted: StreamingStartedEvent;
-    speechStart: SpeechStartEvent;
-    speechEnd: SpeechEndEvent;
-    transcript: TranscriptEvent;
+    ready: ReadyEvent;
+    userTranscript: UserTranscriptEvent;
     response: ResponseEvent;
     audio: AudioEvent;
+    turnComplete: TurnCompleteEvent;
     error: ErrorEvent;
-    reconnecting: ReconnectingEvent;
 };
 /**
  * LiveSpeech client for real-time speech-to-speech AI conversations
@@ -389,7 +395,7 @@ declare class LiveSpeechClient {
     private sessionId;
     private isStreaming;
     private readonly eventListeners;
-    private transcriptHandler;
+    private userTranscriptHandler;
     private responseHandler;
     private audioHandler;
     private errorHandler;
@@ -455,17 +461,17 @@ declare class LiveSpeechClient {
      */
    off<K extends keyof LiveSpeechEventMap>(event: K, listener: (event: LiveSpeechEventMap[K]) => void): void;
     /**
-     * Set transcript handler (simplified)
-     */
-    setTranscriptHandler(handler: TranscriptHandler): void;
-    /**
-     * Set response handler (simplified)
+     * Set response handler
      */
     setResponseHandler(handler: ResponseHandler): void;
     /**
      * Set audio handler (simplified)
      */
     setAudioHandler(handler: AudioHandler): void;
+    /**
+     * Set user transcript handler
+     */
+    setUserTranscriptHandler(handler: UserTranscriptHandler): void;
     /**
      * Set error handler (simplified)
      */
@@ -567,4 +573,4 @@ declare class AudioEncoder {
     wrapWav(data: Uint8Array): Uint8Array;
 }
 
-export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type AudioHandler, type ClientMessage, type ClientMessageType, type ConnectedEvent, type ConnectionState, type DisconnectReason, type DisconnectedEvent, type ErrorCode, type ErrorEvent, type ErrorHandler, LiveSpeechClient, type LiveSpeechConfig, type LiveSpeechEvent, type LiveSpeechEventMap, type LiveSpeechEventType, type ReconnectingEvent, Region, Region as RegionType, type ResolvedConfig, type ResponseEvent, type ResponseHandler, type ServerMessage, type ServerMessageType, type SessionConfig, type SessionEndedEvent, type SessionStartedEvent, type SpeechEndEvent, type SpeechStartEvent, type StreamingStartedEvent, type TranscriptEvent, type TranscriptHandler, createWavHeader, decodeBase64ToAudio, encodeAudioToBase64, extractPcmFromWav, float32ToInt16, getEndpointForRegion, int16ToFloat32, int16ToUint8, isValidRegion, uint8ToInt16, wrapPcmInWav };
+export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type AudioHandler, type ClientMessage, type ClientMessageType, type ConnectedEvent, type ConnectionState, type DisconnectReason, type DisconnectedEvent, type ErrorCode, type ErrorEvent, type ErrorHandler, LiveSpeechClient, type LiveSpeechConfig, type LiveSpeechEvent, type LiveSpeechEventMap, type LiveSpeechEventType, type PipelineMode, type ReadyEvent, type ReconnectingEvent, Region, Region as RegionType, type ResolvedConfig, type ResponseEvent, type ResponseHandler, type ServerMessage, type ServerMessageType, type SessionConfig, type SessionEndedEvent, type SessionStartedEvent, type TurnCompleteEvent, type UserTranscriptEvent, type UserTranscriptHandler, createWavHeader, decodeBase64ToAudio, encodeAudioToBase64, extractPcmFromWav, float32ToInt16, getEndpointForRegion, int16ToFloat32, int16ToUint8, isValidRegion, uint8ToInt16, wrapPcmInWav };
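The new `SessionConfig` fields compose as below; a minimal sketch, with `'ko-KR'` borrowed from the JSDoc example above:

```typescript
import type { SessionConfig } from '@drawdream/livespeech';

// Korean speech recognition over the composed STT + LLM + TTS pipeline;
// omitting pipelineMode would fall back to the 'live' default.
const config: SessionConfig = {
  prePrompt: 'You are a helpful assistant.',
  language: 'ko-KR',
  pipelineMode: 'composed',
};
```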
package/dist/index.js CHANGED
@@ -46,7 +46,7 @@ var Region = {
 };
 var REGION_ENDPOINTS = {
   "ap-northeast-2": "wss://talk.drawdream.co.kr",
-  "us-west-2": "wss://talk..drawdream.ca"
+  "us-west-2": "wss://talk.drawdream.ca"
   // Coming soon
 };
 function getEndpointForRegion(region) {
@@ -614,7 +614,7 @@ var LiveSpeechClient = class {
   // Event listeners using a simple map
   eventListeners = /* @__PURE__ */ new Map();
   // Simplified handlers
-  transcriptHandler = null;
+  userTranscriptHandler = null;
   responseHandler = null;
   audioHandler = null;
   errorHandler = null;
@@ -730,6 +730,10 @@ var LiveSpeechClient = class {
       if (config?.prePrompt) {
         startMessage.prePrompt = config.prePrompt;
       }
+      if (config?.language) {
+        startMessage.language = config.language;
+      }
+      startMessage.pipelineMode = config?.pipelineMode ?? "live";
       this.connection.send(startMessage);
     });
   }
@@ -819,13 +823,7 @@ var LiveSpeechClient = class {
     }
   }
   /**
-   * Set transcript handler (simplified)
-   */
-  setTranscriptHandler(handler) {
-    this.transcriptHandler = handler;
-  }
-  /**
-   * Set response handler (simplified)
+   * Set response handler
    */
   setResponseHandler(handler) {
     this.responseHandler = handler;
@@ -836,6 +834,12 @@ var LiveSpeechClient = class {
   setAudioHandler(handler) {
     this.audioHandler = handler;
   }
+  /**
+   * Set user transcript handler
+   */
+  setUserTranscriptHandler(handler) {
+    this.userTranscriptHandler = handler;
+  }
   /**
    * Set error handler (simplified)
    */
@@ -914,36 +918,12 @@ var LiveSpeechClient = class {
           timestamp: message.timestamp
         });
         break;
-      case "streamingStarted":
-        this.emit("streamingStarted", {
-          type: "streamingStarted",
-          timestamp: message.timestamp
-        });
-        break;
-      case "speechStart":
-        this.emit("speechStart", {
-          type: "speechStart",
-          timestamp: message.timestamp
-        });
-        break;
-      case "speechEnd":
-        this.emit("speechEnd", {
-          type: "speechEnd",
-          timestamp: message.timestamp
-        });
-        break;
-      case "transcript": {
-        const transcriptEvent = {
-          type: "transcript",
-          text: message.text,
-          isFinal: message.isFinal,
+      case "ready": {
+        const readyEvent = {
+          type: "ready",
           timestamp: message.timestamp
         };
-        if (message.confidence !== void 0) {
-          transcriptEvent.confidence = message.confidence;
-        }
-        this.emit("transcript", transcriptEvent);
-        this.transcriptHandler?.(message.text, message.isFinal);
+        this.emit("ready", readyEvent);
         break;
       }
       case "response": {
@@ -970,6 +950,24 @@ var LiveSpeechClient = class {
         this.audioHandler?.(audioData);
         break;
       }
+      case "userTranscript": {
+        const userTranscriptEvent = {
+          type: "userTranscript",
+          text: message.text,
+          timestamp: message.timestamp
+        };
+        this.emit("userTranscript", userTranscriptEvent);
+        this.userTranscriptHandler?.(message.text);
+        break;
+      }
+      case "turnComplete": {
+        const turnCompleteEvent = {
+          type: "turnComplete",
+          timestamp: message.timestamp
+        };
+        this.emit("turnComplete", turnCompleteEvent);
+        break;
+      }
       case "error":
         this.handleError(message.code, message.message);
         break;
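The `startSession` change in this bundle means every outgoing start message now carries an explicit `pipelineMode`, defaulting to `"live"` when the caller omits it, while `prePrompt` and `language` stay optional. A standalone sketch of that logic, with field names taken from `StartSessionMessage` in the typings (an illustration, not the SDK's internal code verbatim):

```typescript
import type { SessionConfig } from '@drawdream/livespeech';

interface StartSessionWirePayload {
  action: 'startSession';
  prePrompt?: string;
  language?: string;
  pipelineMode: 'live' | 'composed';
}

// Mirrors the compiled logic above: optional fields are copied only when
// present, pipelineMode is always stamped with a "live" fallback.
function buildStartSessionMessage(config?: SessionConfig): StartSessionWirePayload {
  const startMessage: StartSessionWirePayload = {
    action: 'startSession',
    pipelineMode: config?.pipelineMode ?? 'live',
  };
  if (config?.prePrompt) startMessage.prePrompt = config.prePrompt;
  if (config?.language) startMessage.language = config.language;
  return startMessage;
}

// buildStartSessionMessage({ language: 'ko-KR' })
// -> { action: 'startSession', pipelineMode: 'live', language: 'ko-KR' }
```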
package/dist/index.mjs CHANGED
@@ -7,7 +7,7 @@ var Region = {
 };
 var REGION_ENDPOINTS = {
   "ap-northeast-2": "wss://talk.drawdream.co.kr",
-  "us-west-2": "wss://talk..drawdream.ca"
+  "us-west-2": "wss://talk.drawdream.ca"
   // Coming soon
 };
 function getEndpointForRegion(region) {
@@ -575,7 +575,7 @@ var LiveSpeechClient = class {
   // Event listeners using a simple map
   eventListeners = /* @__PURE__ */ new Map();
   // Simplified handlers
-  transcriptHandler = null;
+  userTranscriptHandler = null;
   responseHandler = null;
   audioHandler = null;
   errorHandler = null;
@@ -691,6 +691,10 @@ var LiveSpeechClient = class {
       if (config?.prePrompt) {
         startMessage.prePrompt = config.prePrompt;
       }
+      if (config?.language) {
+        startMessage.language = config.language;
+      }
+      startMessage.pipelineMode = config?.pipelineMode ?? "live";
       this.connection.send(startMessage);
     });
   }
@@ -780,13 +784,7 @@ var LiveSpeechClient = class {
     }
   }
   /**
-   * Set transcript handler (simplified)
-   */
-  setTranscriptHandler(handler) {
-    this.transcriptHandler = handler;
-  }
-  /**
-   * Set response handler (simplified)
+   * Set response handler
    */
   setResponseHandler(handler) {
     this.responseHandler = handler;
@@ -797,6 +795,12 @@ var LiveSpeechClient = class {
   setAudioHandler(handler) {
     this.audioHandler = handler;
   }
+  /**
+   * Set user transcript handler
+   */
+  setUserTranscriptHandler(handler) {
+    this.userTranscriptHandler = handler;
+  }
   /**
    * Set error handler (simplified)
    */
@@ -875,36 +879,12 @@ var LiveSpeechClient = class {
           timestamp: message.timestamp
         });
         break;
-      case "streamingStarted":
-        this.emit("streamingStarted", {
-          type: "streamingStarted",
-          timestamp: message.timestamp
-        });
-        break;
-      case "speechStart":
-        this.emit("speechStart", {
-          type: "speechStart",
-          timestamp: message.timestamp
-        });
-        break;
-      case "speechEnd":
-        this.emit("speechEnd", {
-          type: "speechEnd",
-          timestamp: message.timestamp
-        });
-        break;
-      case "transcript": {
-        const transcriptEvent = {
-          type: "transcript",
-          text: message.text,
-          isFinal: message.isFinal,
+      case "ready": {
+        const readyEvent = {
+          type: "ready",
           timestamp: message.timestamp
         };
-        if (message.confidence !== void 0) {
-          transcriptEvent.confidence = message.confidence;
-        }
-        this.emit("transcript", transcriptEvent);
-        this.transcriptHandler?.(message.text, message.isFinal);
+        this.emit("ready", readyEvent);
         break;
       }
       case "response": {
@@ -931,6 +911,24 @@ var LiveSpeechClient = class {
         this.audioHandler?.(audioData);
         break;
       }
+      case "userTranscript": {
+        const userTranscriptEvent = {
+          type: "userTranscript",
+          text: message.text,
+          timestamp: message.timestamp
+        };
+        this.emit("userTranscript", userTranscriptEvent);
+        this.userTranscriptHandler?.(message.text);
+        break;
+      }
+      case "turnComplete": {
+        const turnCompleteEvent = {
+          type: "turnComplete",
+          timestamp: message.timestamp
+        };
+        this.emit("turnComplete", turnCompleteEvent);
+        break;
+      }
       case "error":
         this.handleError(message.code, message.message);
         break;
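Both bundles also pick up the `us-west-2` endpoint fix; 0.1.1 shipped a doubled dot (`wss://talk..drawdream.ca`). Assuming `Region` accepts the region string literal, the corrected value is observable through the exported `getEndpointForRegion` (note the source still marks this region as coming soon):

```typescript
import { getEndpointForRegion } from '@drawdream/livespeech';

// 0.1.1 returned the malformed "wss://talk..drawdream.ca" here.
const endpoint = getEndpointForRegion('us-west-2');
console.log(endpoint); // "wss://talk.drawdream.ca"
```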
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@drawdream/livespeech",
-  "version": "0.1.1",
+  "version": "0.1.2",
   "description": "Real-time speech-to-speech AI conversation SDK",
   "main": "dist/index.js",
   "module": "dist/index.mjs",