@drawdream/livespeech 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -26,12 +26,12 @@ const client = new LiveSpeechClient({
 });
 
 // Handle events
-client.setTranscriptHandler((text, isFinal) => {
-  console.log(`Transcript: ${text} (final: ${isFinal})`);
+client.setUserTranscriptHandler((text) => {
+  console.log(`You said: ${text}`);
 });
 
-client.setResponseHandler((text, isFinal) => {
-  console.log(`AI Response: ${text}`);
+client.setTranscriptHandler((text, isFinal) => {
+  console.log(`AI Transcript: ${text}`);
 });
 
 client.setAudioHandler((audioData) => {
@@ -42,10 +42,44 @@ client.setAudioHandler((audioData) => {
 await client.connect();
 await client.startSession({
   prePrompt: 'You are a helpful assistant.',
+  // pipelineMode: 'live' is the default
+});
+
+// Start streaming and send audio
+client.audioStart();
+client.sendAudioChunk(audioBuffer);
+```
+
+## Pipeline Modes
+
+The SDK supports two pipeline modes for audio processing:
+
+### Live Mode (Default)
+
+Uses Gemini 2.5 Flash Live API for end-to-end audio conversation. This provides:
+- **Lower latency** - Direct audio-to-audio processing
+- **Natural conversation** - Built-in voice activity detection and turn-taking
+- **Real-time transcription** - Both user and AI speech are transcribed
+
+```typescript
+await client.startSession({
+  prePrompt: 'You are a helpful assistant.',
+  pipelineMode: 'live', // Default, can be omitted
 });
+```
+
+### Composed Mode
 
-// Send audio
-client.sendAudio(audioBuffer);
+Uses separate STT + LLM + TTS services for more customization:
+- **More control** - Separate services for each step
+- **Custom voices** - Use different TTS voices
+- **Text responses** - Access to intermediate text responses
+
+```typescript
+await client.startSession({
+  prePrompt: 'You are a helpful assistant.',
+  pipelineMode: 'composed',
+});
 ```
 
 ## API Reference
@@ -87,8 +121,9 @@ The SDK provides built-in region support, so you don't need to remember endpoint
 
 ```typescript
 // Simple handlers
-client.setTranscriptHandler((text, isFinal) => {});
-client.setResponseHandler((text, isFinal) => {});
+client.setUserTranscriptHandler((text) => {}); // User's speech transcription
+client.setTranscriptHandler((text, isFinal) => {}); // AI's speech transcription (live mode)
+client.setResponseHandler((text, isFinal) => {}); // AI text response (composed mode)
 client.setAudioHandler((audioData) => {});
 client.setErrorHandler((error) => {});
 
@@ -97,24 +132,23 @@ client.on('connected', (event) => {});
 client.on('disconnected', (event) => {});
 client.on('sessionStarted', (event) => {});
 client.on('sessionEnded', (event) => {});
-client.on('transcript', (event) => {});
-client.on('response', (event) => {});
+client.on('userTranscript', (event) => {}); // User's speech transcription
+client.on('transcript', (event) => {}); // AI's speech transcription
+client.on('response', (event) => {}); // AI text response
 client.on('audio', (event) => {});
 client.on('error', (event) => {});
 client.on('reconnecting', (event) => {});
+client.on('ready', (event) => {}); // Gemini Live ready (live mode)
+client.on('turnComplete', (event) => {}); // AI finished speaking (live mode)
 ```
 
 ### SessionConfig
 
 | Option | Type | Default | Description |
 |--------|------|---------|-------------|
-| `prePrompt` | `string` | **required** | System prompt for the AI |
-| `voiceId` | `string` | `'en-US-Standard-A'` | TTS voice ID |
-| `languageCode` | `string` | `'en-US'` | Language for STT |
-| `inputFormat` | `AudioFormat` | `'pcm16'` | Input audio format |
-| `outputFormat` | `AudioFormat` | `'pcm16'` | Output audio format |
-| `sampleRate` | `number` | `16000` | Sample rate in Hz |
-| `metadata` | `Record<string,string>` | `{}` | Custom metadata |
+| `prePrompt` | `string` | - | System prompt for the AI |
+| `language` | `string` | `'en-US'` | Language code for speech (e.g., "ko-KR") |
+| `pipelineMode` | `'live' \| 'composed'` | `'live'` | Audio processing mode |
 
 ## Audio Utilities
 
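Read together, the README changes describe a small but breaking migration for 0.1.0 integrations: user and AI transcripts get separate handlers, and the single `sendAudio()` call becomes an explicit `audioStart()` / `sendAudioChunk()` / `audioEnd()` lifecycle. The sketch below pairs each removed 0.1.0 call with its 0.1.2 replacement. It is a minimal sketch, not official migration guidance: the client construction and `audioBuffer` are placeholders, and it sticks to methods the 0.1.2 declaration file below also declares (note the updated README still shows `setTranscriptHandler`, which the typings no longer include).

```typescript
import { LiveSpeechClient } from '@drawdream/livespeech';

// Placeholders: construction options and the PCM16 buffer are the caller's own.
declare const client: LiveSpeechClient;
declare const audioBuffer: Uint8Array;

// 0.1.0: client.setTranscriptHandler((text, isFinal) => ...) carried user speech.
// 0.1.2: user speech has a dedicated handler with no isFinal flag.
client.setUserTranscriptHandler((text) => console.log(`You said: ${text}`));

// 0.1.0: client.setResponseHandler((text, isFinal) => ...) carried all AI text.
// 0.1.2: still declared, but documented as the composed-mode text response.
client.setResponseHandler((text, isFinal) => console.log(`AI: ${text} (final: ${isFinal})`));

// 0.1.0: client.sendAudio(audioBuffer, { isFinal: true })
// 0.1.2: explicit streaming lifecycle.
client.audioStart();
client.sendAudioChunk(audioBuffer);
client.audioEnd();
```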
package/dist/index.d.mts CHANGED
@@ -21,6 +21,12 @@ declare function getEndpointForRegion(region: Region): string;
  */
 declare function isValidRegion(value: string): value is Region;
 
+/**
+ * Pipeline mode for audio processing
+ * - 'live': Uses Gemini Live API for end-to-end audio conversation (default)
+ * - 'composed': Uses separate STT + LLM + TTS services
+ */
+type PipelineMode = 'live' | 'composed';
 /**
  * Configuration options for the LiveSpeech client
  *
@@ -74,41 +80,20 @@ interface SessionConfig {
     /**
      * System prompt for the AI assistant
      */
-    prePrompt: string;
-    /**
-     * Voice ID for text-to-speech output
-     * @default 'en-US-Standard-A'
-     */
-    voiceId?: string;
-    /**
-     * Language code for speech recognition
-     * @default 'en-US'
-     */
-    languageCode?: string;
-    /**
-     * Audio encoding format for input
-     * @default 'pcm16'
-     */
-    inputFormat?: AudioFormat;
-    /**
-     * Audio encoding format for output
-     * @default 'pcm16'
-     */
-    outputFormat?: AudioFormat;
+    prePrompt?: string;
     /**
-     * Sample rate for audio in Hz
-     * @default 16000
+     * Language code for speech recognition (e.g., "en-US", "ko-KR")
+     * @default "en-US"
      */
-    sampleRate?: number;
+    language?: string;
     /**
-     * Custom metadata to attach to the session
+     * Pipeline mode for audio processing
+     * - 'live': Uses Gemini Live API for end-to-end audio conversation (default, lower latency)
+     * - 'composed': Uses separate STT + LLM + TTS services (more customizable)
+     * @default "live"
      */
-    metadata?: Record<string, string>;
+    pipelineMode?: PipelineMode;
 }
-/**
- * Supported audio formats
- */
-type AudioFormat = 'pcm16' | 'opus' | 'wav';
 /**
  * Internal resolved configuration with defaults applied
  */
@@ -125,7 +110,7 @@ interface ResolvedConfig {
 /**
  * Event types emitted by the LiveSpeech client
  */
-type LiveSpeechEventType = 'connected' | 'disconnected' | 'sessionStarted' | 'sessionEnded' | 'transcript' | 'response' | 'audio' | 'error' | 'reconnecting';
+type LiveSpeechEventType = 'connected' | 'disconnected' | 'reconnecting' | 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'error';
 /**
  * Event payload for 'connected' event
  */
@@ -164,13 +149,19 @@ interface SessionEndedEvent {
     timestamp: string;
 }
 /**
- * Event payload for 'transcript' event
+ * Event payload for 'ready' event
+ */
+interface ReadyEvent {
+    type: 'ready';
+    timestamp: string;
+}
+/**
+ * Event payload for 'userTranscript' event
+ * User's speech transcription
  */
-interface TranscriptEvent {
-    type: 'transcript';
+interface UserTranscriptEvent {
+    type: 'userTranscript';
     text: string;
-    isFinal: boolean;
-    confidence?: number;
     timestamp: string;
 }
 /**
@@ -205,7 +196,7 @@ interface ErrorEvent {
 /**
  * Error codes
  */
-type ErrorCode = 'connection_failed' | 'connection_timeout' | 'authentication_failed' | 'session_error' | 'audio_error' | 'stt_error' | 'llm_error' | 'tts_error' | 'rate_limit' | 'internal_error' | 'invalid_message';
+type ErrorCode = 'connection_failed' | 'connection_timeout' | 'authentication_failed' | 'session_error' | 'audio_error' | 'streaming_error' | 'stt_error' | 'llm_error' | 'tts_error' | 'rate_limit' | 'internal_error' | 'invalid_message';
 /**
  * Event payload for 'reconnecting' event
  */
@@ -216,14 +207,22 @@ interface ReconnectingEvent {
     delay: number;
     timestamp: string;
 }
+/**
+ * Event payload for 'turnComplete' event (both modes)
+ * Indicates the AI has finished its response turn
+ */
+interface TurnCompleteEvent {
+    type: 'turnComplete';
+    timestamp: string;
+}
 /**
  * Union type of all event payloads
  */
-type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | SessionStartedEvent | SessionEndedEvent | TranscriptEvent | ResponseEvent | AudioEvent | ErrorEvent | ReconnectingEvent;
+type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | ReconnectingEvent | SessionStartedEvent | SessionEndedEvent | ReadyEvent | UserTranscriptEvent | ResponseEvent | AudioEvent | TurnCompleteEvent | ErrorEvent;
 /**
  * Simplified event handlers for common use cases
  */
-type TranscriptHandler = (text: string, isFinal: boolean) => void;
+type UserTranscriptHandler = (text: string) => void;
 type ResponseHandler = (text: string, isFinal: boolean) => void;
 type AudioHandler = (data: Uint8Array) => void;
 type ErrorHandler = (error: ErrorEvent) => void;
@@ -231,30 +230,25 @@ type ErrorHandler = (error: ErrorEvent) => void;
 /**
  * WebSocket message types sent from client to server
  */
-type ClientMessageType = 'startSession' | 'endSession' | 'audio' | 'ping';
+type ClientMessageType = 'startSession' | 'endSession' | 'audioStart' | 'audioChunk' | 'audioEnd' | 'ping';
 /**
  * WebSocket message types received from server
 */
-type ServerMessageType = 'connected' | 'sessionStarted' | 'sessionEnded' | 'transcript' | 'response' | 'audio' | 'error' | 'pong';
+type ServerMessageType = 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'error' | 'pong';
 /**
  * Base interface for client messages
 */
 interface BaseClientMessage {
     action: ClientMessageType;
-    requestId?: string;
 }
 /**
  * Start session message
 */
 interface StartSessionMessage extends BaseClientMessage {
     action: 'startSession';
-    prePrompt: string;
-    voiceId?: string;
-    languageCode?: string;
-    inputFormat?: string;
-    outputFormat?: string;
-    sampleRate?: number;
-    metadata?: Record<string, string>;
+    prePrompt?: string;
+    language?: string;
+    pipelineMode?: 'live' | 'composed';
 }
 /**
  * End session message
@@ -263,14 +257,23 @@ interface EndSessionMessage extends BaseClientMessage {
     action: 'endSession';
 }
 /**
- * Audio data message
+ * Audio start message - begin streaming session
+ */
+interface AudioStartMessage extends BaseClientMessage {
+    action: 'audioStart';
+}
+/**
+ * Audio chunk message - send audio data
  */
-interface AudioMessage extends BaseClientMessage {
-    action: 'audio';
+interface AudioChunkMessage extends BaseClientMessage {
+    action: 'audioChunk';
     data: string;
-    format?: string;
-    sampleRate?: number;
-    isFinal?: boolean;
+}
+/**
+ * Audio end message - end streaming session
+ */
+interface AudioEndMessage extends BaseClientMessage {
+    action: 'audioEnd';
 }
 /**
  * Ping message for keep-alive
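The three message interfaces above replace the single `AudioMessage` and spell out the streaming lifecycle at the wire level. The sketch below shows the frames a client would emit, under two assumptions the diff does not state: messages travel as JSON text frames over the WebSocket, and the exported `encodeAudioToBase64` helper has the obvious `(data: Uint8Array) => string` shape.

```typescript
import { encodeAudioToBase64 } from '@drawdream/livespeech';
import type { AudioStartMessage, AudioChunkMessage, AudioEndMessage } from '@drawdream/livespeech';

// Emit one audio turn as an explicit start / chunk / end sequence.
function sendAudioTurn(ws: WebSocket, pcm16: Uint8Array): void {
  const start: AudioStartMessage = { action: 'audioStart' };
  const chunk: AudioChunkMessage = {
    action: 'audioChunk',
    data: encodeAudioToBase64(pcm16), // assumed: Uint8Array -> base64 string
  };
  const end: AudioEndMessage = { action: 'audioEnd' };
  for (const message of [start, chunk, end]) {
    ws.send(JSON.stringify(message));
  }
}
```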
@@ -281,22 +284,14 @@ interface PingMessage extends BaseClientMessage {
 /**
  * Union type of all client messages
  */
-type ClientMessage = StartSessionMessage | EndSessionMessage | AudioMessage | PingMessage;
+type ClientMessage = StartSessionMessage | EndSessionMessage | AudioStartMessage | AudioChunkMessage | AudioEndMessage | PingMessage;
 /**
  * Base interface for server messages
 */
 interface BaseServerMessage {
     type: ServerMessageType;
-    requestId?: string;
     timestamp: string;
 }
-/**
- * Connected message from server
- */
-interface ServerConnectedMessage extends BaseServerMessage {
-    type: 'connected';
-    connectionId: string;
-}
 /**
  * Session started message from server
  */
@@ -312,13 +307,11 @@ interface ServerSessionEndedMessage extends BaseServerMessage {
     sessionId: string;
 }
 /**
- * Transcript message from server
+ * User transcript message from server (user's speech transcription)
 */
-interface ServerTranscriptMessage extends BaseServerMessage {
-    type: 'transcript';
+interface ServerUserTranscriptMessage extends BaseServerMessage {
+    type: 'userTranscript';
     text: string;
-    isFinal: boolean;
-    confidence?: number;
 }
 /**
  * Response message from server
@@ -344,7 +337,6 @@ interface ServerErrorMessage extends BaseServerMessage {
     type: 'error';
     code: string;
     message: string;
-    details?: unknown;
 }
 /**
  * Pong message from server
@@ -352,10 +344,24 @@ interface ServerErrorMessage extends BaseServerMessage {
 interface ServerPongMessage extends BaseServerMessage {
     type: 'pong';
 }
+/**
+ * Turn complete message from server
+ * Indicates the AI has finished its response turn
+ */
+interface ServerTurnCompleteMessage extends BaseServerMessage {
+    type: 'turnComplete';
+}
+/**
+ * Ready message from server
+ * Indicates the Gemini Live session is ready for audio input
+ */
+interface ServerReadyMessage extends BaseServerMessage {
+    type: 'ready';
+}
 /**
  * Union type of all server messages
 */
-type ServerMessage = ServerConnectedMessage | ServerSessionStartedMessage | ServerSessionEndedMessage | ServerTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerErrorMessage | ServerPongMessage;
+type ServerMessage = ServerSessionStartedMessage | ServerSessionEndedMessage | ServerReadyMessage | ServerUserTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerTurnCompleteMessage | ServerErrorMessage | ServerPongMessage;
 
 /**
  * Connection state
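Because every server message carries a `type` discriminant, the reshaped `ServerMessage` union narrows cleanly in a `switch`. Below is a sketch of consumer-side dispatch over the new union, not the SDK's internal `handleMessage`; it only touches fields visible in this diff.

```typescript
import type { ServerMessage } from '@drawdream/livespeech';

// TypeScript narrows each case from the `type` discriminant.
function dispatch(message: ServerMessage): void {
  switch (message.type) {
    case 'ready': // Gemini Live session ready for audio input
      break;
    case 'userTranscript':
      console.log(`user: ${message.text}`);
      break;
    case 'turnComplete': // AI finished its response turn
      break;
    case 'error':
      console.error(`${message.code}: ${message.message}`);
      break;
    default:
      break; // sessionStarted, sessionEnded, response, audio, pong
  }
}
```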
@@ -368,13 +374,15 @@ type ConnectionState = 'disconnected' | 'connecting' | 'connected' | 'reconnecti
 type LiveSpeechEventMap = {
     connected: ConnectedEvent;
     disconnected: DisconnectedEvent;
+    reconnecting: ReconnectingEvent;
     sessionStarted: SessionStartedEvent;
     sessionEnded: SessionEndedEvent;
-    transcript: TranscriptEvent;
+    ready: ReadyEvent;
+    userTranscript: UserTranscriptEvent;
     response: ResponseEvent;
     audio: AudioEvent;
+    turnComplete: TurnCompleteEvent;
     error: ErrorEvent;
-    reconnecting: ReconnectingEvent;
 };
 /**
  * LiveSpeech client for real-time speech-to-speech AI conversations
@@ -385,9 +393,9 @@ declare class LiveSpeechClient {
     private readonly audioEncoder;
     private readonly logger;
     private sessionId;
-    private sessionConfig;
+    private isStreaming;
     private readonly eventListeners;
-    private transcriptHandler;
+    private userTranscriptHandler;
     private responseHandler;
     private audioHandler;
     private errorHandler;
@@ -412,6 +420,10 @@ declare class LiveSpeechClient {
      * Check if session is active
      */
     get hasActiveSession(): boolean;
+    /**
+     * Check if audio streaming is active
+     */
+    get isAudioStreaming(): boolean;
     /**
      * Connect to the server
      */
@@ -423,18 +435,23 @@ declare class LiveSpeechClient {
     /**
      * Start a new session
      */
-    startSession(config: SessionConfig): Promise<string>;
+    startSession(config?: SessionConfig): Promise<string>;
     /**
      * End the current session
      */
     endSession(): Promise<void>;
     /**
-     * Send audio data
+     * Start audio streaming session
+     */
+    audioStart(): void;
+    /**
+     * Send audio chunk (PCM16 base64 encoded)
+     */
+    sendAudioChunk(data: Uint8Array): void;
+    /**
+     * End audio streaming session
      */
-    sendAudio(data: Uint8Array, options?: {
-        format?: AudioFormat;
-        isFinal?: boolean;
-    }): void;
+    audioEnd(): void;
     /**
      * Add event listener
      */
@@ -444,17 +461,17 @@ declare class LiveSpeechClient {
      */
     off<K extends keyof LiveSpeechEventMap>(event: K, listener: (event: LiveSpeechEventMap[K]) => void): void;
     /**
-     * Set transcript handler (simplified)
-     */
-    setTranscriptHandler(handler: TranscriptHandler): void;
-    /**
-     * Set response handler (simplified)
+     * Set response handler
      */
     setResponseHandler(handler: ResponseHandler): void;
     /**
      * Set audio handler (simplified)
      */
     setAudioHandler(handler: AudioHandler): void;
+    /**
+     * Set user transcript handler
+     */
+    setUserTranscriptHandler(handler: UserTranscriptHandler): void;
     /**
      * Set error handler (simplified)
      */
@@ -467,6 +484,10 @@ declare class LiveSpeechClient {
     private handleMessage;
 }
 
+/**
+ * Audio format type
+ */
+type AudioFormat = 'pcm16' | 'opus' | 'wav';
 /**
  * Audio encoder options
  */
@@ -552,4 +573,4 @@ declare class AudioEncoder {
     wrapWav(data: Uint8Array): Uint8Array;
 }
 
-export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type AudioFormat, type AudioHandler, type ClientMessage, type ClientMessageType, type ConnectedEvent, type ConnectionState, type DisconnectReason, type DisconnectedEvent, type ErrorCode, type ErrorEvent, type ErrorHandler, LiveSpeechClient, type LiveSpeechConfig, type LiveSpeechEvent, type LiveSpeechEventMap, type LiveSpeechEventType, type ReconnectingEvent, Region, Region as RegionType, type ResolvedConfig, type ResponseEvent, type ResponseHandler, type ServerMessage, type ServerMessageType, type SessionConfig, type SessionEndedEvent, type SessionStartedEvent, type TranscriptEvent, type TranscriptHandler, createWavHeader, decodeBase64ToAudio, encodeAudioToBase64, extractPcmFromWav, float32ToInt16, getEndpointForRegion, int16ToFloat32, int16ToUint8, isValidRegion, uint8ToInt16, wrapPcmInWav };
+export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type AudioHandler, type ClientMessage, type ClientMessageType, type ConnectedEvent, type ConnectionState, type DisconnectReason, type DisconnectedEvent, type ErrorCode, type ErrorEvent, type ErrorHandler, LiveSpeechClient, type LiveSpeechConfig, type LiveSpeechEvent, type LiveSpeechEventMap, type LiveSpeechEventType, type PipelineMode, type ReadyEvent, type ReconnectingEvent, Region, Region as RegionType, type ResolvedConfig, type ResponseEvent, type ResponseHandler, type ServerMessage, type ServerMessageType, type SessionConfig, type SessionEndedEvent, type SessionStartedEvent, type TurnCompleteEvent, type UserTranscriptEvent, type UserTranscriptHandler, createWavHeader, decodeBase64ToAudio, encodeAudioToBase64, extractPcmFromWav, float32ToInt16, getEndpointForRegion, int16ToFloat32, int16ToUint8, isValidRegion, uint8ToInt16, wrapPcmInWav };
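Taken together, the 0.1.2 declarations give a live-mode session a natural shape: connect, start the session, wait for `ready`, stream chunks, and watch for `turnComplete`. Below is a minimal end-to-end sketch against these declarations; the client construction and the `nextMicChunk` microphone source are hypothetical placeholders, since neither appears in this diff.

```typescript
import type { LiveSpeechClient } from '@drawdream/livespeech';

// Placeholders: LiveSpeechConfig fields and audio capture are outside this diff.
declare const client: LiveSpeechClient;
declare function nextMicChunk(): Promise<Uint8Array | null>; // hypothetical PCM16 source

async function run(): Promise<void> {
  await client.connect();
  // All SessionConfig fields are now optional; pipelineMode defaults to 'live'.
  await client.startSession({ prePrompt: 'You are a helpful assistant.', language: 'en-US' });

  client.on('ready', async () => {
    client.audioStart();
    for (let chunk = await nextMicChunk(); chunk !== null; chunk = await nextMicChunk()) {
      client.sendAudioChunk(chunk);
    }
    client.audioEnd();
  });

  client.on('userTranscript', (event) => console.log(`you: ${event.text}`));
  client.on('turnComplete', () => console.log('AI finished its turn'));
}
```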