@drawdream/livespeech 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -74,41 +74,8 @@ interface SessionConfig {
  /**
  * System prompt for the AI assistant
  */
- prePrompt: string;
- /**
- * Voice ID for text-to-speech output
- * @default 'en-US-Standard-A'
- */
- voiceId?: string;
- /**
- * Language code for speech recognition
- * @default 'en-US'
- */
- languageCode?: string;
- /**
- * Audio encoding format for input
- * @default 'pcm16'
- */
- inputFormat?: AudioFormat;
- /**
- * Audio encoding format for output
- * @default 'pcm16'
- */
- outputFormat?: AudioFormat;
- /**
- * Sample rate for audio in Hz
- * @default 16000
- */
- sampleRate?: number;
- /**
- * Custom metadata to attach to the session
- */
- metadata?: Record<string, string>;
+ prePrompt?: string;
  }
- /**
- * Supported audio formats
- */
- type AudioFormat = 'pcm16' | 'opus' | 'wav';
  /**
  * Internal resolved configuration with defaults applied
  */
@@ -125,7 +92,7 @@ interface ResolvedConfig {
  /**
  * Event types emitted by the LiveSpeech client
  */
- type LiveSpeechEventType = 'connected' | 'disconnected' | 'sessionStarted' | 'sessionEnded' | 'transcript' | 'response' | 'audio' | 'error' | 'reconnecting';
+ type LiveSpeechEventType = 'connected' | 'disconnected' | 'sessionStarted' | 'sessionEnded' | 'streamingStarted' | 'speechStart' | 'speechEnd' | 'transcript' | 'response' | 'audio' | 'error' | 'reconnecting';
  /**
  * Event payload for 'connected' event
  */
@@ -163,6 +130,27 @@ interface SessionEndedEvent {
  sessionId: string;
  timestamp: string;
  }
+ /**
+ * Event payload for 'streamingStarted' event - acknowledgment of audioStart
+ */
+ interface StreamingStartedEvent {
+ type: 'streamingStarted';
+ timestamp: string;
+ }
+ /**
+ * Event payload for 'speechStart' event - VAD detected speech begin
+ */
+ interface SpeechStartEvent {
+ type: 'speechStart';
+ timestamp: string;
+ }
+ /**
+ * Event payload for 'speechEnd' event - VAD detected speech end
+ */
+ interface SpeechEndEvent {
+ type: 'speechEnd';
+ timestamp: string;
+ }
  /**
  * Event payload for 'transcript' event
  */
@@ -205,7 +193,7 @@ interface ErrorEvent {
  /**
  * Error codes
  */
- type ErrorCode = 'connection_failed' | 'connection_timeout' | 'authentication_failed' | 'session_error' | 'audio_error' | 'stt_error' | 'llm_error' | 'tts_error' | 'rate_limit' | 'internal_error' | 'invalid_message';
+ type ErrorCode = 'connection_failed' | 'connection_timeout' | 'authentication_failed' | 'session_error' | 'audio_error' | 'streaming_error' | 'stt_error' | 'llm_error' | 'tts_error' | 'rate_limit' | 'internal_error' | 'invalid_message';
  /**
  * Event payload for 'reconnecting' event
  */
@@ -219,7 +207,7 @@ interface ReconnectingEvent {
  /**
  * Union type of all event payloads
  */
- type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | SessionStartedEvent | SessionEndedEvent | TranscriptEvent | ResponseEvent | AudioEvent | ErrorEvent | ReconnectingEvent;
+ type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | SessionStartedEvent | SessionEndedEvent | StreamingStartedEvent | SpeechStartEvent | SpeechEndEvent | TranscriptEvent | ResponseEvent | AudioEvent | ErrorEvent | ReconnectingEvent;
  /**
  * Simplified event handlers for common use cases
  */
@@ -231,30 +219,23 @@ type ErrorHandler = (error: ErrorEvent) => void;
  /**
  * WebSocket message types sent from client to server
  */
- type ClientMessageType = 'startSession' | 'endSession' | 'audio' | 'ping';
+ type ClientMessageType = 'startSession' | 'endSession' | 'audioStart' | 'audioChunk' | 'audioEnd' | 'ping';
  /**
  * WebSocket message types received from server
  */
- type ServerMessageType = 'connected' | 'sessionStarted' | 'sessionEnded' | 'transcript' | 'response' | 'audio' | 'error' | 'pong';
+ type ServerMessageType = 'sessionStarted' | 'sessionEnded' | 'streamingStarted' | 'speechStart' | 'speechEnd' | 'transcript' | 'response' | 'audio' | 'error' | 'pong';
  /**
  * Base interface for client messages
  */
  interface BaseClientMessage {
  action: ClientMessageType;
- requestId?: string;
  }
  /**
  * Start session message
  */
  interface StartSessionMessage extends BaseClientMessage {
  action: 'startSession';
- prePrompt: string;
- voiceId?: string;
- languageCode?: string;
- inputFormat?: string;
- outputFormat?: string;
- sampleRate?: number;
- metadata?: Record<string, string>;
+ prePrompt?: string;
  }
  /**
  * End session message
@@ -263,14 +244,23 @@ interface EndSessionMessage extends BaseClientMessage {
  action: 'endSession';
  }
  /**
- * Audio data message
+ * Audio start message - begin streaming session
  */
- interface AudioMessage extends BaseClientMessage {
- action: 'audio';
+ interface AudioStartMessage extends BaseClientMessage {
+ action: 'audioStart';
+ }
+ /**
+ * Audio chunk message - send audio data
+ */
+ interface AudioChunkMessage extends BaseClientMessage {
+ action: 'audioChunk';
  data: string;
- format?: string;
- sampleRate?: number;
- isFinal?: boolean;
+ }
+ /**
+ * Audio end message - end streaming session
+ */
+ interface AudioEndMessage extends BaseClientMessage {
+ action: 'audioEnd';
  }
  /**
  * Ping message for keep-alive
@@ -281,22 +271,14 @@ interface PingMessage extends BaseClientMessage {
  /**
  * Union type of all client messages
  */
- type ClientMessage = StartSessionMessage | EndSessionMessage | AudioMessage | PingMessage;
+ type ClientMessage = StartSessionMessage | EndSessionMessage | AudioStartMessage | AudioChunkMessage | AudioEndMessage | PingMessage;
  /**
  * Base interface for server messages
  */
  interface BaseServerMessage {
  type: ServerMessageType;
- requestId?: string;
  timestamp: string;
  }
- /**
- * Connected message from server
- */
- interface ServerConnectedMessage extends BaseServerMessage {
- type: 'connected';
- connectionId: string;
- }
  /**
  * Session started message from server
  */
@@ -311,6 +293,24 @@ interface ServerSessionEndedMessage extends BaseServerMessage {
  type: 'sessionEnded';
  sessionId: string;
  }
+ /**
+ * Streaming started message - acknowledgment of audioStart
+ */
+ interface ServerStreamingStartedMessage extends BaseServerMessage {
+ type: 'streamingStarted';
+ }
+ /**
+ * Speech start message - VAD detected speech begin
+ */
+ interface ServerSpeechStartMessage extends BaseServerMessage {
+ type: 'speechStart';
+ }
+ /**
+ * Speech end message - VAD detected speech end
+ */
+ interface ServerSpeechEndMessage extends BaseServerMessage {
+ type: 'speechEnd';
+ }
  /**
  * Transcript message from server
  */
@@ -344,7 +344,6 @@ interface ServerErrorMessage extends BaseServerMessage {
  type: 'error';
  code: string;
  message: string;
- details?: unknown;
  }
  /**
  * Pong message from server
@@ -355,7 +354,7 @@ interface ServerPongMessage extends BaseServerMessage {
  /**
  * Union type of all server messages
  */
- type ServerMessage = ServerConnectedMessage | ServerSessionStartedMessage | ServerSessionEndedMessage | ServerTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerErrorMessage | ServerPongMessage;
+ type ServerMessage = ServerSessionStartedMessage | ServerSessionEndedMessage | ServerStreamingStartedMessage | ServerSpeechStartMessage | ServerSpeechEndMessage | ServerTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerErrorMessage | ServerPongMessage;

  /**
  * Connection state
@@ -370,6 +369,9 @@ type LiveSpeechEventMap = {
  disconnected: DisconnectedEvent;
  sessionStarted: SessionStartedEvent;
  sessionEnded: SessionEndedEvent;
+ streamingStarted: StreamingStartedEvent;
+ speechStart: SpeechStartEvent;
+ speechEnd: SpeechEndEvent;
  transcript: TranscriptEvent;
  response: ResponseEvent;
  audio: AudioEvent;
@@ -385,7 +387,7 @@ declare class LiveSpeechClient {
  private readonly audioEncoder;
  private readonly logger;
  private sessionId;
- private sessionConfig;
+ private isStreaming;
  private readonly eventListeners;
  private transcriptHandler;
  private responseHandler;
@@ -412,6 +414,10 @@ declare class LiveSpeechClient {
  * Check if session is active
  */
  get hasActiveSession(): boolean;
+ /**
+ * Check if audio streaming is active
+ */
+ get isAudioStreaming(): boolean;
  /**
  * Connect to the server
  */
@@ -423,18 +429,23 @@ declare class LiveSpeechClient {
  /**
  * Start a new session
  */
- startSession(config: SessionConfig): Promise<string>;
+ startSession(config?: SessionConfig): Promise<string>;
  /**
  * End the current session
  */
  endSession(): Promise<void>;
  /**
- * Send audio data
+ * Start audio streaming session
  */
- sendAudio(data: Uint8Array, options?: {
- format?: AudioFormat;
- isFinal?: boolean;
- }): void;
+ audioStart(): void;
+ /**
+ * Send audio chunk (PCM16 base64 encoded)
+ */
+ sendAudioChunk(data: Uint8Array): void;
+ /**
+ * End audio streaming session
+ */
+ audioEnd(): void;
  /**
  * Add event listener
  */
@@ -467,6 +478,10 @@ declare class LiveSpeechClient {
  private handleMessage;
  }

+ /**
+ * Audio format type
+ */
+ type AudioFormat = 'pcm16' | 'opus' | 'wav';
  /**
  * Audio encoder options
  */
@@ -552,4 +567,4 @@ declare class AudioEncoder {
  wrapWav(data: Uint8Array): Uint8Array;
  }

- export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type AudioFormat, type AudioHandler, type ClientMessage, type ClientMessageType, type ConnectedEvent, type ConnectionState, type DisconnectReason, type DisconnectedEvent, type ErrorCode, type ErrorEvent, type ErrorHandler, LiveSpeechClient, type LiveSpeechConfig, type LiveSpeechEvent, type LiveSpeechEventMap, type LiveSpeechEventType, type ReconnectingEvent, Region, Region as RegionType, type ResolvedConfig, type ResponseEvent, type ResponseHandler, type ServerMessage, type ServerMessageType, type SessionConfig, type SessionEndedEvent, type SessionStartedEvent, type TranscriptEvent, type TranscriptHandler, createWavHeader, decodeBase64ToAudio, encodeAudioToBase64, extractPcmFromWav, float32ToInt16, getEndpointForRegion, int16ToFloat32, int16ToUint8, isValidRegion, uint8ToInt16, wrapPcmInWav };
+ export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type AudioHandler, type ClientMessage, type ClientMessageType, type ConnectedEvent, type ConnectionState, type DisconnectReason, type DisconnectedEvent, type ErrorCode, type ErrorEvent, type ErrorHandler, LiveSpeechClient, type LiveSpeechConfig, type LiveSpeechEvent, type LiveSpeechEventMap, type LiveSpeechEventType, type ReconnectingEvent, Region, Region as RegionType, type ResolvedConfig, type ResponseEvent, type ResponseHandler, type ServerMessage, type ServerMessageType, type SessionConfig, type SessionEndedEvent, type SessionStartedEvent, type SpeechEndEvent, type SpeechStartEvent, type StreamingStartedEvent, type TranscriptEvent, type TranscriptHandler, createWavHeader, decodeBase64ToAudio, encodeAudioToBase64, extractPcmFromWav, float32ToInt16, getEndpointForRegion, int16ToFloat32, int16ToUint8, isValidRegion, uint8ToInt16, wrapPcmInWav };
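
Taken together, the 0.1.1 declarations above replace the one-shot sendAudio(data, options) call with an explicit streaming lifecycle (audioStart → sendAudioChunk → audioEnd), make startSession's config optional, and add streamingStarted/speechStart/speechEnd events. A minimal TypeScript usage sketch based only on the types shown in this diff follows; the contents of LiveSpeechConfig and the return type of connect() are not visible here, so those parts are assumptions rather than confirmed API.

import { LiveSpeechClient, type LiveSpeechConfig } from '@drawdream/livespeech';

// The fields of LiveSpeechConfig are not shown in this diff; treat `config`
// as a placeholder for whatever the constructor actually requires.
declare const config: LiveSpeechConfig;

async function converse(pcm16Chunks: Uint8Array[]): Promise<void> {
  const client = new LiveSpeechClient(config);

  // New in 0.1.1: streaming acknowledgment and VAD events.
  client.on('streamingStarted', (e) => console.log('server accepted audioStart at', e.timestamp));
  client.on('speechStart', () => console.log('voice activity detected'));
  client.on('speechEnd', () => console.log('voice activity ended'));

  await client.connect(); // assumed to resolve once the socket is open

  // prePrompt is now optional; voiceId, languageCode, formats and metadata are gone.
  const sessionId = await client.startSession({ prePrompt: 'You are a helpful assistant.' });
  console.log('session started:', sessionId);

  // Streaming lifecycle that replaces sendAudio(data, { isFinal: true }).
  client.audioStart();
  for (const chunk of pcm16Chunks) {
    client.sendAudioChunk(chunk); // raw PCM16 bytes; base64 encoding happens inside the client
  }
  client.audioEnd();

  await client.endSession(); // per dist/index.js below, this also calls audioEnd() if still streaming
  client.disconnect();
}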
package/dist/index.d.ts CHANGED
@@ -74,41 +74,8 @@ interface SessionConfig {
  /**
  * System prompt for the AI assistant
  */
- prePrompt: string;
- /**
- * Voice ID for text-to-speech output
- * @default 'en-US-Standard-A'
- */
- voiceId?: string;
- /**
- * Language code for speech recognition
- * @default 'en-US'
- */
- languageCode?: string;
- /**
- * Audio encoding format for input
- * @default 'pcm16'
- */
- inputFormat?: AudioFormat;
- /**
- * Audio encoding format for output
- * @default 'pcm16'
- */
- outputFormat?: AudioFormat;
- /**
- * Sample rate for audio in Hz
- * @default 16000
- */
- sampleRate?: number;
- /**
- * Custom metadata to attach to the session
- */
- metadata?: Record<string, string>;
+ prePrompt?: string;
  }
- /**
- * Supported audio formats
- */
- type AudioFormat = 'pcm16' | 'opus' | 'wav';
  /**
  * Internal resolved configuration with defaults applied
  */
@@ -125,7 +92,7 @@ interface ResolvedConfig {
  /**
  * Event types emitted by the LiveSpeech client
  */
- type LiveSpeechEventType = 'connected' | 'disconnected' | 'sessionStarted' | 'sessionEnded' | 'transcript' | 'response' | 'audio' | 'error' | 'reconnecting';
+ type LiveSpeechEventType = 'connected' | 'disconnected' | 'sessionStarted' | 'sessionEnded' | 'streamingStarted' | 'speechStart' | 'speechEnd' | 'transcript' | 'response' | 'audio' | 'error' | 'reconnecting';
  /**
  * Event payload for 'connected' event
  */
@@ -163,6 +130,27 @@ interface SessionEndedEvent {
  sessionId: string;
  timestamp: string;
  }
+ /**
+ * Event payload for 'streamingStarted' event - acknowledgment of audioStart
+ */
+ interface StreamingStartedEvent {
+ type: 'streamingStarted';
+ timestamp: string;
+ }
+ /**
+ * Event payload for 'speechStart' event - VAD detected speech begin
+ */
+ interface SpeechStartEvent {
+ type: 'speechStart';
+ timestamp: string;
+ }
+ /**
+ * Event payload for 'speechEnd' event - VAD detected speech end
+ */
+ interface SpeechEndEvent {
+ type: 'speechEnd';
+ timestamp: string;
+ }
  /**
  * Event payload for 'transcript' event
  */
@@ -205,7 +193,7 @@ interface ErrorEvent {
  /**
  * Error codes
  */
- type ErrorCode = 'connection_failed' | 'connection_timeout' | 'authentication_failed' | 'session_error' | 'audio_error' | 'stt_error' | 'llm_error' | 'tts_error' | 'rate_limit' | 'internal_error' | 'invalid_message';
+ type ErrorCode = 'connection_failed' | 'connection_timeout' | 'authentication_failed' | 'session_error' | 'audio_error' | 'streaming_error' | 'stt_error' | 'llm_error' | 'tts_error' | 'rate_limit' | 'internal_error' | 'invalid_message';
  /**
  * Event payload for 'reconnecting' event
  */
@@ -219,7 +207,7 @@ interface ReconnectingEvent {
  /**
  * Union type of all event payloads
  */
- type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | SessionStartedEvent | SessionEndedEvent | TranscriptEvent | ResponseEvent | AudioEvent | ErrorEvent | ReconnectingEvent;
+ type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | SessionStartedEvent | SessionEndedEvent | StreamingStartedEvent | SpeechStartEvent | SpeechEndEvent | TranscriptEvent | ResponseEvent | AudioEvent | ErrorEvent | ReconnectingEvent;
  /**
  * Simplified event handlers for common use cases
  */
@@ -231,30 +219,23 @@ type ErrorHandler = (error: ErrorEvent) => void;
  /**
  * WebSocket message types sent from client to server
  */
- type ClientMessageType = 'startSession' | 'endSession' | 'audio' | 'ping';
+ type ClientMessageType = 'startSession' | 'endSession' | 'audioStart' | 'audioChunk' | 'audioEnd' | 'ping';
  /**
  * WebSocket message types received from server
  */
- type ServerMessageType = 'connected' | 'sessionStarted' | 'sessionEnded' | 'transcript' | 'response' | 'audio' | 'error' | 'pong';
+ type ServerMessageType = 'sessionStarted' | 'sessionEnded' | 'streamingStarted' | 'speechStart' | 'speechEnd' | 'transcript' | 'response' | 'audio' | 'error' | 'pong';
  /**
  * Base interface for client messages
  */
  interface BaseClientMessage {
  action: ClientMessageType;
- requestId?: string;
  }
  /**
  * Start session message
  */
  interface StartSessionMessage extends BaseClientMessage {
  action: 'startSession';
- prePrompt: string;
- voiceId?: string;
- languageCode?: string;
- inputFormat?: string;
- outputFormat?: string;
- sampleRate?: number;
- metadata?: Record<string, string>;
+ prePrompt?: string;
  }
  /**
  * End session message
@@ -263,14 +244,23 @@ interface EndSessionMessage extends BaseClientMessage {
  action: 'endSession';
  }
  /**
- * Audio data message
+ * Audio start message - begin streaming session
  */
- interface AudioMessage extends BaseClientMessage {
- action: 'audio';
+ interface AudioStartMessage extends BaseClientMessage {
+ action: 'audioStart';
+ }
+ /**
+ * Audio chunk message - send audio data
+ */
+ interface AudioChunkMessage extends BaseClientMessage {
+ action: 'audioChunk';
  data: string;
- format?: string;
- sampleRate?: number;
- isFinal?: boolean;
+ }
+ /**
+ * Audio end message - end streaming session
+ */
+ interface AudioEndMessage extends BaseClientMessage {
+ action: 'audioEnd';
  }
  /**
  * Ping message for keep-alive
@@ -281,22 +271,14 @@ interface PingMessage extends BaseClientMessage {
  /**
  * Union type of all client messages
  */
- type ClientMessage = StartSessionMessage | EndSessionMessage | AudioMessage | PingMessage;
+ type ClientMessage = StartSessionMessage | EndSessionMessage | AudioStartMessage | AudioChunkMessage | AudioEndMessage | PingMessage;
  /**
  * Base interface for server messages
  */
  interface BaseServerMessage {
  type: ServerMessageType;
- requestId?: string;
  timestamp: string;
  }
- /**
- * Connected message from server
- */
- interface ServerConnectedMessage extends BaseServerMessage {
- type: 'connected';
- connectionId: string;
- }
  /**
  * Session started message from server
  */
@@ -311,6 +293,24 @@ interface ServerSessionEndedMessage extends BaseServerMessage {
  type: 'sessionEnded';
  sessionId: string;
  }
+ /**
+ * Streaming started message - acknowledgment of audioStart
+ */
+ interface ServerStreamingStartedMessage extends BaseServerMessage {
+ type: 'streamingStarted';
+ }
+ /**
+ * Speech start message - VAD detected speech begin
+ */
+ interface ServerSpeechStartMessage extends BaseServerMessage {
+ type: 'speechStart';
+ }
+ /**
+ * Speech end message - VAD detected speech end
+ */
+ interface ServerSpeechEndMessage extends BaseServerMessage {
+ type: 'speechEnd';
+ }
  /**
  * Transcript message from server
  */
@@ -344,7 +344,6 @@ interface ServerErrorMessage extends BaseServerMessage {
  type: 'error';
  code: string;
  message: string;
- details?: unknown;
  }
  /**
  * Pong message from server
@@ -355,7 +354,7 @@ interface ServerPongMessage extends BaseServerMessage {
  /**
  * Union type of all server messages
  */
- type ServerMessage = ServerConnectedMessage | ServerSessionStartedMessage | ServerSessionEndedMessage | ServerTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerErrorMessage | ServerPongMessage;
+ type ServerMessage = ServerSessionStartedMessage | ServerSessionEndedMessage | ServerStreamingStartedMessage | ServerSpeechStartMessage | ServerSpeechEndMessage | ServerTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerErrorMessage | ServerPongMessage;

  /**
  * Connection state
@@ -370,6 +369,9 @@ type LiveSpeechEventMap = {
  disconnected: DisconnectedEvent;
  sessionStarted: SessionStartedEvent;
  sessionEnded: SessionEndedEvent;
+ streamingStarted: StreamingStartedEvent;
+ speechStart: SpeechStartEvent;
+ speechEnd: SpeechEndEvent;
  transcript: TranscriptEvent;
  response: ResponseEvent;
  audio: AudioEvent;
@@ -385,7 +387,7 @@ declare class LiveSpeechClient {
  private readonly audioEncoder;
  private readonly logger;
  private sessionId;
- private sessionConfig;
+ private isStreaming;
  private readonly eventListeners;
  private transcriptHandler;
  private responseHandler;
@@ -412,6 +414,10 @@ declare class LiveSpeechClient {
  * Check if session is active
  */
  get hasActiveSession(): boolean;
+ /**
+ * Check if audio streaming is active
+ */
+ get isAudioStreaming(): boolean;
  /**
  * Connect to the server
  */
@@ -423,18 +429,23 @@ declare class LiveSpeechClient {
  /**
  * Start a new session
  */
- startSession(config: SessionConfig): Promise<string>;
+ startSession(config?: SessionConfig): Promise<string>;
  /**
  * End the current session
  */
  endSession(): Promise<void>;
  /**
- * Send audio data
+ * Start audio streaming session
  */
- sendAudio(data: Uint8Array, options?: {
- format?: AudioFormat;
- isFinal?: boolean;
- }): void;
+ audioStart(): void;
+ /**
+ * Send audio chunk (PCM16 base64 encoded)
+ */
+ sendAudioChunk(data: Uint8Array): void;
+ /**
+ * End audio streaming session
+ */
+ audioEnd(): void;
  /**
  * Add event listener
  */
@@ -467,6 +478,10 @@ declare class LiveSpeechClient {
  private handleMessage;
  }

+ /**
+ * Audio format type
+ */
+ type AudioFormat = 'pcm16' | 'opus' | 'wav';
  /**
  * Audio encoder options
  */
@@ -552,4 +567,4 @@ declare class AudioEncoder {
  wrapWav(data: Uint8Array): Uint8Array;
  }

- export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type AudioFormat, type AudioHandler, type ClientMessage, type ClientMessageType, type ConnectedEvent, type ConnectionState, type DisconnectReason, type DisconnectedEvent, type ErrorCode, type ErrorEvent, type ErrorHandler, LiveSpeechClient, type LiveSpeechConfig, type LiveSpeechEvent, type LiveSpeechEventMap, type LiveSpeechEventType, type ReconnectingEvent, Region, Region as RegionType, type ResolvedConfig, type ResponseEvent, type ResponseHandler, type ServerMessage, type ServerMessageType, type SessionConfig, type SessionEndedEvent, type SessionStartedEvent, type TranscriptEvent, type TranscriptHandler, createWavHeader, decodeBase64ToAudio, encodeAudioToBase64, extractPcmFromWav, float32ToInt16, getEndpointForRegion, int16ToFloat32, int16ToUint8, isValidRegion, uint8ToInt16, wrapPcmInWav };
+ export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type AudioHandler, type ClientMessage, type ClientMessageType, type ConnectedEvent, type ConnectionState, type DisconnectReason, type DisconnectedEvent, type ErrorCode, type ErrorEvent, type ErrorHandler, LiveSpeechClient, type LiveSpeechConfig, type LiveSpeechEvent, type LiveSpeechEventMap, type LiveSpeechEventType, type ReconnectingEvent, Region, Region as RegionType, type ResolvedConfig, type ResponseEvent, type ResponseHandler, type ServerMessage, type ServerMessageType, type SessionConfig, type SessionEndedEvent, type SessionStartedEvent, type SpeechEndEvent, type SpeechStartEvent, type StreamingStartedEvent, type TranscriptEvent, type TranscriptHandler, createWavHeader, decodeBase64ToAudio, encodeAudioToBase64, extractPcmFromWav, float32ToInt16, getEndpointForRegion, int16ToFloat32, int16ToUint8, isValidRegion, uint8ToInt16, wrapPcmInWav };
package/dist/index.js CHANGED
@@ -332,22 +332,13 @@ var WebSocketConnection = class {
  /**
  * Handle incoming message
  */
- handleMessage(data, onFirstConnect) {
+ handleMessage(data) {
  const message = parseServerMessage(data);
  if (!message) {
  this.logger.warn("Invalid message received:", data);
  return;
  }
  this.logger.debug("Received message:", message.type);
- if (message.type === "connected") {
- this.connectionId = message.connectionId;
- this.state = "connected";
- this.retryController.reset();
- this.startPingInterval();
- this.events.onOpen?.(message.connectionId);
- onFirstConnect?.();
- return;
- }
  if (message.type === "pong") {
  this.logger.debug("Pong received");
  return;
@@ -613,20 +604,13 @@ var CONFIG_DEFAULTS = {
  reconnectDelay: 1e3,
  debug: false
  };
- var SESSION_DEFAULTS = {
- voiceId: "en-US-Standard-A",
- languageCode: "en-US",
- inputFormat: "pcm16",
- outputFormat: "pcm16",
- sampleRate: 16e3
- };
  var LiveSpeechClient = class {
  config;
  connection;
  audioEncoder;
  logger;
  sessionId = null;
- sessionConfig = null;
+ isStreaming = false;
  // Event listeners using a simple map
  eventListeners = /* @__PURE__ */ new Map();
  // Simplified handlers
@@ -692,6 +676,12 @@ var LiveSpeechClient = class {
  get hasActiveSession() {
  return this.sessionId !== null;
  }
+ /**
+ * Check if audio streaming is active
+ */
+ get isAudioStreaming() {
+ return this.isStreaming;
+ }
  /**
  * Connect to the server
  */
@@ -705,7 +695,7 @@ var LiveSpeechClient = class {
  disconnect() {
  this.logger.info("Disconnecting...");
  this.sessionId = null;
- this.sessionConfig = null;
+ this.isStreaming = false;
  this.connection.disconnect();
  }
  /**
@@ -718,16 +708,6 @@ var LiveSpeechClient = class {
  if (this.sessionId) {
  throw new Error("Session already active. Call endSession() first.");
  }
- const resolvedConfig = {
- prePrompt: config.prePrompt,
- voiceId: config.voiceId ?? SESSION_DEFAULTS.voiceId,
- languageCode: config.languageCode ?? SESSION_DEFAULTS.languageCode,
- inputFormat: config.inputFormat ?? SESSION_DEFAULTS.inputFormat,
- outputFormat: config.outputFormat ?? SESSION_DEFAULTS.outputFormat,
- sampleRate: config.sampleRate ?? SESSION_DEFAULTS.sampleRate,
- metadata: config.metadata ?? {}
- };
- this.sessionConfig = resolvedConfig;
  this.logger.info("Starting session...");
  return new Promise((resolve, reject) => {
  const onSessionStarted = (event) => {
@@ -744,16 +724,13 @@ var LiveSpeechClient = class {
  };
  this.on("sessionStarted", onSessionStarted);
  this.on("error", onError);
- this.connection.send({
- action: "startSession",
- prePrompt: resolvedConfig.prePrompt,
- voiceId: resolvedConfig.voiceId,
- languageCode: resolvedConfig.languageCode,
- inputFormat: resolvedConfig.inputFormat,
- outputFormat: resolvedConfig.outputFormat,
- sampleRate: resolvedConfig.sampleRate,
- metadata: resolvedConfig.metadata
- });
+ const startMessage = {
+ action: "startSession"
+ };
+ if (config?.prePrompt) {
+ startMessage.prePrompt = config.prePrompt;
+ }
+ this.connection.send(startMessage);
  });
  }
  /**
@@ -765,6 +742,9 @@ var LiveSpeechClient = class {
  return;
  }
  this.logger.info("Ending session...");
+ if (this.isStreaming) {
+ this.audioEnd();
+ }
  return new Promise((resolve) => {
  const onSessionEnded = () => {
  this.off("sessionEnded", onSessionEnded);
@@ -775,28 +755,49 @@ var LiveSpeechClient = class {
  });
  }
  /**
- * Send audio data
+ * Start audio streaming session
  */
- sendAudio(data, options) {
+ audioStart() {
  if (!this.isConnected) {
  throw new Error("Not connected");
  }
  if (!this.sessionId) {
  throw new Error("No active session. Call startSession() first.");
  }
+ if (this.isStreaming) {
+ throw new Error("Already streaming. Call audioEnd() first.");
+ }
+ this.logger.info("Starting audio stream...");
+ this.connection.send({ action: "audioStart" });
+ this.isStreaming = true;
+ }
+ /**
+ * Send audio chunk (PCM16 base64 encoded)
+ */
+ sendAudioChunk(data) {
+ if (!this.isConnected) {
+ throw new Error("Not connected");
+ }
+ if (!this.isStreaming) {
+ throw new Error("Not streaming. Call audioStart() first.");
+ }
  const base64Data = this.audioEncoder.encode(data);
- const format = options?.format ?? this.sessionConfig?.inputFormat ?? SESSION_DEFAULTS.inputFormat;
- const sampleRate = this.sessionConfig?.sampleRate ?? SESSION_DEFAULTS.sampleRate;
- const audioMessage = {
- action: "audio",
- data: base64Data,
- format,
- sampleRate
- };
- if (options?.isFinal !== void 0) {
- audioMessage.isFinal = options.isFinal;
+ this.connection.send({
+ action: "audioChunk",
+ data: base64Data
+ });
+ }
+ /**
+ * End audio streaming session
+ */
+ audioEnd() {
+ if (!this.isStreaming) {
+ this.logger.warn("Not streaming");
+ return;
  }
- this.connection.send(audioMessage);
+ this.logger.info("Ending audio stream...");
+ this.connection.send({ action: "audioEnd" });
+ this.isStreaming = false;
  }
  // ==================== Event System ====================
  /**
@@ -864,7 +865,7 @@ var LiveSpeechClient = class {
  }
  handleDisconnected(code, _reason) {
  this.sessionId = null;
- this.sessionConfig = null;
+ this.isStreaming = false;
  const event = {
  type: "disconnected",
  reason: code === 1e3 ? "normal" : "error",
@@ -906,13 +907,31 @@ var LiveSpeechClient = class {
  break;
  case "sessionEnded":
  this.sessionId = null;
- this.sessionConfig = null;
+ this.isStreaming = false;
  this.emit("sessionEnded", {
  type: "sessionEnded",
  sessionId: message.sessionId,
  timestamp: message.timestamp
  });
  break;
+ case "streamingStarted":
+ this.emit("streamingStarted", {
+ type: "streamingStarted",
+ timestamp: message.timestamp
+ });
+ break;
+ case "speechStart":
+ this.emit("speechStart", {
+ type: "speechStart",
+ timestamp: message.timestamp
+ });
+ break;
+ case "speechEnd":
+ this.emit("speechEnd", {
+ type: "speechEnd",
+ timestamp: message.timestamp
+ });
+ break;
  case "transcript": {
  const transcriptEvent = {
  type: "transcript",
@@ -952,7 +971,7 @@ var LiveSpeechClient = class {
  break;
  }
  case "error":
- this.handleError(message.code, message.message, message.details);
+ this.handleError(message.code, message.message);
  break;
  default:
  this.logger.warn("Unknown message type:", message.type);
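
At the wire level, the runtime changes above drop the server's initial 'connected' handshake and the single 'audio' action in favour of audioStart/audioChunk/audioEnd client actions answered by streamingStarted, speechStart and speechEnd server messages. The sketch below drives that exchange over a raw WebSocket purely to illustrate the 0.1.1 message shapes declared earlier in this diff; the endpoint URL and base64 payload are placeholders, and in normal use LiveSpeechClient manages this protocol for you.

import type { ClientMessage, ServerMessage } from '@drawdream/livespeech';

const ws = new WebSocket('wss://example.invalid/livespeech'); // placeholder endpoint, not the real service
const send = (msg: ClientMessage) => ws.send(JSON.stringify(msg));

ws.onopen = () => {
  // 0.1.1 removed the server 'connected' message, so the client starts the session directly.
  send({ action: 'startSession', prePrompt: 'Keep answers short.' }); // prePrompt is optional
};

ws.onmessage = (event) => {
  const msg = JSON.parse(event.data) as ServerMessage;
  switch (msg.type) {
    case 'sessionStarted':
      send({ action: 'audioStart' }); // open a streaming segment
      break;
    case 'streamingStarted':
      send({ action: 'audioChunk', data: '<base64-encoded PCM16>' }); // repeat per captured chunk
      send({ action: 'audioEnd' }); // close the segment when capture stops
      break;
    case 'speechStart':
    case 'speechEnd':
      console.log('VAD notification:', msg.type, msg.timestamp);
      break;
    case 'error':
      console.error(msg.code, msg.message); // the 'details' field was removed in 0.1.1
      break;
  }
};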
package/dist/index.mjs CHANGED
@@ -293,22 +293,13 @@ var WebSocketConnection = class {
  /**
  * Handle incoming message
  */
- handleMessage(data, onFirstConnect) {
+ handleMessage(data) {
  const message = parseServerMessage(data);
  if (!message) {
  this.logger.warn("Invalid message received:", data);
  return;
  }
  this.logger.debug("Received message:", message.type);
- if (message.type === "connected") {
- this.connectionId = message.connectionId;
- this.state = "connected";
- this.retryController.reset();
- this.startPingInterval();
- this.events.onOpen?.(message.connectionId);
- onFirstConnect?.();
- return;
- }
  if (message.type === "pong") {
  this.logger.debug("Pong received");
  return;
@@ -574,20 +565,13 @@ var CONFIG_DEFAULTS = {
  reconnectDelay: 1e3,
  debug: false
  };
- var SESSION_DEFAULTS = {
- voiceId: "en-US-Standard-A",
- languageCode: "en-US",
- inputFormat: "pcm16",
- outputFormat: "pcm16",
- sampleRate: 16e3
- };
  var LiveSpeechClient = class {
  config;
  connection;
  audioEncoder;
  logger;
  sessionId = null;
- sessionConfig = null;
+ isStreaming = false;
  // Event listeners using a simple map
  eventListeners = /* @__PURE__ */ new Map();
  // Simplified handlers
@@ -653,6 +637,12 @@ var LiveSpeechClient = class {
  get hasActiveSession() {
  return this.sessionId !== null;
  }
+ /**
+ * Check if audio streaming is active
+ */
+ get isAudioStreaming() {
+ return this.isStreaming;
+ }
  /**
  * Connect to the server
  */
@@ -666,7 +656,7 @@ var LiveSpeechClient = class {
  disconnect() {
  this.logger.info("Disconnecting...");
  this.sessionId = null;
- this.sessionConfig = null;
+ this.isStreaming = false;
  this.connection.disconnect();
  }
  /**
@@ -679,16 +669,6 @@ var LiveSpeechClient = class {
  if (this.sessionId) {
  throw new Error("Session already active. Call endSession() first.");
  }
- const resolvedConfig = {
- prePrompt: config.prePrompt,
- voiceId: config.voiceId ?? SESSION_DEFAULTS.voiceId,
- languageCode: config.languageCode ?? SESSION_DEFAULTS.languageCode,
- inputFormat: config.inputFormat ?? SESSION_DEFAULTS.inputFormat,
- outputFormat: config.outputFormat ?? SESSION_DEFAULTS.outputFormat,
- sampleRate: config.sampleRate ?? SESSION_DEFAULTS.sampleRate,
- metadata: config.metadata ?? {}
- };
- this.sessionConfig = resolvedConfig;
  this.logger.info("Starting session...");
  return new Promise((resolve, reject) => {
  const onSessionStarted = (event) => {
@@ -705,16 +685,13 @@ var LiveSpeechClient = class {
  };
  this.on("sessionStarted", onSessionStarted);
  this.on("error", onError);
- this.connection.send({
- action: "startSession",
- prePrompt: resolvedConfig.prePrompt,
- voiceId: resolvedConfig.voiceId,
- languageCode: resolvedConfig.languageCode,
- inputFormat: resolvedConfig.inputFormat,
- outputFormat: resolvedConfig.outputFormat,
- sampleRate: resolvedConfig.sampleRate,
- metadata: resolvedConfig.metadata
- });
+ const startMessage = {
+ action: "startSession"
+ };
+ if (config?.prePrompt) {
+ startMessage.prePrompt = config.prePrompt;
+ }
+ this.connection.send(startMessage);
  });
  }
  /**
@@ -726,6 +703,9 @@ var LiveSpeechClient = class {
  return;
  }
  this.logger.info("Ending session...");
+ if (this.isStreaming) {
+ this.audioEnd();
+ }
  return new Promise((resolve) => {
  const onSessionEnded = () => {
  this.off("sessionEnded", onSessionEnded);
@@ -736,28 +716,49 @@ var LiveSpeechClient = class {
  });
  }
  /**
- * Send audio data
+ * Start audio streaming session
  */
- sendAudio(data, options) {
+ audioStart() {
  if (!this.isConnected) {
  throw new Error("Not connected");
  }
  if (!this.sessionId) {
  throw new Error("No active session. Call startSession() first.");
  }
+ if (this.isStreaming) {
+ throw new Error("Already streaming. Call audioEnd() first.");
+ }
+ this.logger.info("Starting audio stream...");
+ this.connection.send({ action: "audioStart" });
+ this.isStreaming = true;
+ }
+ /**
+ * Send audio chunk (PCM16 base64 encoded)
+ */
+ sendAudioChunk(data) {
+ if (!this.isConnected) {
+ throw new Error("Not connected");
+ }
+ if (!this.isStreaming) {
+ throw new Error("Not streaming. Call audioStart() first.");
+ }
  const base64Data = this.audioEncoder.encode(data);
- const format = options?.format ?? this.sessionConfig?.inputFormat ?? SESSION_DEFAULTS.inputFormat;
- const sampleRate = this.sessionConfig?.sampleRate ?? SESSION_DEFAULTS.sampleRate;
- const audioMessage = {
- action: "audio",
- data: base64Data,
- format,
- sampleRate
- };
- if (options?.isFinal !== void 0) {
- audioMessage.isFinal = options.isFinal;
+ this.connection.send({
+ action: "audioChunk",
+ data: base64Data
+ });
+ }
+ /**
+ * End audio streaming session
+ */
+ audioEnd() {
+ if (!this.isStreaming) {
+ this.logger.warn("Not streaming");
+ return;
  }
- this.connection.send(audioMessage);
+ this.logger.info("Ending audio stream...");
+ this.connection.send({ action: "audioEnd" });
+ this.isStreaming = false;
  }
  // ==================== Event System ====================
  /**
@@ -825,7 +826,7 @@ var LiveSpeechClient = class {
  }
  handleDisconnected(code, _reason) {
  this.sessionId = null;
- this.sessionConfig = null;
+ this.isStreaming = false;
  const event = {
  type: "disconnected",
  reason: code === 1e3 ? "normal" : "error",
@@ -867,13 +868,31 @@ var LiveSpeechClient = class {
  break;
  case "sessionEnded":
  this.sessionId = null;
- this.sessionConfig = null;
+ this.isStreaming = false;
  this.emit("sessionEnded", {
  type: "sessionEnded",
  sessionId: message.sessionId,
  timestamp: message.timestamp
  });
  break;
+ case "streamingStarted":
+ this.emit("streamingStarted", {
+ type: "streamingStarted",
+ timestamp: message.timestamp
+ });
+ break;
+ case "speechStart":
+ this.emit("speechStart", {
+ type: "speechStart",
+ timestamp: message.timestamp
+ });
+ break;
+ case "speechEnd":
+ this.emit("speechEnd", {
+ type: "speechEnd",
+ timestamp: message.timestamp
+ });
+ break;
  case "transcript": {
  const transcriptEvent = {
  type: "transcript",
@@ -913,7 +932,7 @@ var LiveSpeechClient = class {
  break;
  }
  case "error":
- this.handleError(message.code, message.message, message.details);
+ this.handleError(message.code, message.message);
  break;
  default:
  this.logger.warn("Unknown message type:", message.type);
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@drawdream/livespeech",
- "version": "0.1.0",
+ "version": "0.1.1",
  "description": "Real-time speech-to-speech AI conversation SDK",
  "main": "dist/index.js",
  "module": "dist/index.mjs",