@drawdream/livespeech 0.1.10 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -10,7 +10,7 @@ A TypeScript/JavaScript SDK for real-time speech-to-speech AI conversations.
10
10
  - 🎙️ **Real-time Voice Conversations** - Natural, low-latency voice interactions
11
11
  - 🌐 **Multi-language Support** - Korean, English, Japanese, Chinese, and more
12
12
  - 🔊 **Streaming Audio** - Send and receive audio in real-time
13
- - 📝 **Live Transcription** - Get transcriptions of both user and AI speech
13
+ - âšī¸ **Barge-in Support** - Interrupt AI mid-speech by talking or programmatically
14
14
  - 🔄 **Auto-reconnection** - Automatic recovery from network issues
15
15
  - 🌐 **Browser & Node.js** - Works in both environments
16
16
 
@@ -18,13 +18,9 @@ A TypeScript/JavaScript SDK for real-time speech-to-speech AI conversations.
18
18
 
19
19
  ```bash
20
20
  npm install @drawdream/livespeech
21
- # or
22
- yarn add @drawdream/livespeech
23
- # or
24
- pnpm add @drawdream/livespeech
25
21
  ```
26
22
 
27
- ## Quick Start
23
+ ## Quick Start (5 minutes)
28
24
 
29
25
  ```typescript
30
26
  import { LiveSpeechClient } from '@drawdream/livespeech';
@@ -34,31 +30,28 @@ const client = new LiveSpeechClient({
34
30
  apiKey: 'your-api-key',
35
31
  });
36
32
 
37
- // Set up event handlers
38
- client.setUserTranscriptHandler((text) => {
39
- console.log('You:', text);
33
+ // Handle only 4 essential events!
34
+ client.setAudioHandler((audioData) => {
35
+ audioPlayer.queue(audioData); // PCM16 @ 24kHz
40
36
  });
41
37
 
42
- client.setResponseHandler((text, isFinal) => {
43
- console.log('AI:', text);
38
+ client.on('interrupted', () => {
39
+ audioPlayer.clear(); // CRITICAL: Clear buffer on interrupt!
44
40
  });
45
41
 
46
- client.setAudioHandler((audioData) => {
47
- playAudio(audioData); // PCM16 @ 24kHz
42
+ client.on('turnComplete', () => {
43
+ console.log('AI finished');
48
44
  });
49
45
 
50
46
  client.setErrorHandler((error) => {
51
47
  console.error('Error:', error.message);
52
48
  });
53
49
 
54
- // Connect and start conversation
50
+ // Connect and start
55
51
  await client.connect();
56
- await client.startSession({
57
- prePrompt: 'You are a helpful assistant.',
58
- language: 'ko-KR',
59
- });
52
+ await client.startSession({ prePrompt: 'You are a helpful assistant.' });
60
53
 
61
- // Stream audio
54
+ // Send audio
62
55
  client.audioStart();
63
56
  client.sendAudioChunk(pcmData); // PCM16 @ 16kHz
64
57
  client.audioEnd();
@@ -68,380 +61,224 @@ await client.endSession();
68
61
  client.disconnect();
69
62
  ```
70
63
 
71
- ## Audio Flow
64
+ ---
72
65
 
73
- ```
74
- connect() → startSession() → audioStart() → sendAudioChunk()* → audioEnd() → endSession()
75
- ↓
76
- sendSystemMessage() (optional, during live session)
77
- sendToolResponse() (when toolCall received)
78
- ```
66
+ # Core API
79
67
 
80
- | Step | Description |
81
- |------|-------------|
82
- | `connect()` | Establish WebSocket connection |
83
- | `startSession(config)` | Start conversation with optional system prompt |
84
- | `audioStart()` | Begin audio streaming |
85
- | `sendAudioChunk(data)` | Send PCM16 audio (call multiple times) |
86
- | `sendSystemMessage(msg)` | Inject context or trigger AI response (optional) |
87
- | `sendToolResponse(id, result)` | Send function result back to AI (after toolCall) |
88
- | `updateUserId(userId)` | Migrate guest session to user account |
89
- | `audioEnd()` | End streaming, triggers AI response |
90
- | `endSession()` | End conversation |
68
+ Everything you need for basic voice conversations.
69
+
70
+ ## Methods
71
+
72
+ | Method | Description |
73
+ |--------|-------------|
74
+ | `connect()` | Establish connection |
91
75
  | `disconnect()` | Close connection |
76
+ | `startSession(config)` | Start conversation with system prompt |
77
+ | `endSession()` | End conversation |
78
+ | `sendAudioChunk(data)` | Send PCM16 audio (16kHz) |
79
+
80
+ ## Events
81
+
82
+ | Event | Description | Action Required |
83
+ |-------|-------------|-----------------|
84
+ | `audio` | AI's audio output | Play audio (PCM16 @ 24kHz) |
85
+ | `turnComplete` | AI finished speaking | Ready for next input |
86
+ | `interrupted` | User barged in | **Clear audio buffer!** |
87
+ | `error` | Error occurred | Handle/log error |
88
+
89
+ ### ⚠️ Critical: Handle `interrupted`
90
+
91
+ When the user speaks while AI is responding, **you must clear your audio buffer**:
92
+
93
+ ```typescript
94
+ client.on('interrupted', () => {
95
+ audioPlayer.clear(); // Stop buffered audio immediately
96
+ audioPlayer.stop();
97
+ });
98
+ ```
99
+
100
+ Without this, 2-3 seconds of buffered audio continues playing after the user interrupts.
101
+
102
+ ## Audio Format
103
+
104
+ | Direction | Format | Sample Rate |
105
+ |-----------|--------|-------------|
106
+ | Input (mic) | PCM16 | 16,000 Hz |
107
+ | Output (AI) | PCM16 | 24,000 Hz |
92
108
 
93
109
  ## Configuration
94
110
 
95
111
  ```typescript
96
112
  const client = new LiveSpeechClient({
97
- region: 'ap-northeast-2', // Required: Seoul region
98
- apiKey: 'your-api-key', // Required: Your API key
99
- userId: 'user-123', // Optional: Enable conversation memory
100
- autoReconnect: true, // Auto-reconnect on disconnect
101
- maxReconnectAttempts: 5, // Maximum reconnection attempts
102
- debug: false, // Enable debug logging
113
+ region: 'ap-northeast-2', // Required
114
+ apiKey: 'your-api-key', // Required
103
115
  });
104
116
 
105
117
  await client.startSession({
106
118
  prePrompt: 'You are a helpful assistant.',
107
- language: 'ko-KR', // Language: ko-KR, en-US, ja-JP, etc.
108
- pipelineMode: 'live', // 'live' (default) or 'composed'
109
- aiSpeaksFirst: false, // AI speaks first (live mode only)
110
- allowHarmCategory: false, // Disable safety filtering (use with caution)
111
- tools: [{ name: 'func', description: 'desc', parameters: {...} }], // Function calling
119
+ language: 'ko-KR', // Optional: ko-KR, en-US, ja-JP, etc.
112
120
  });
113
121
  ```
114
122
 
115
- ## Session Options
123
+ ---
116
124
 
117
- | Option | Type | Default | Description |
118
- |--------|------|---------|-------------|
119
- | `prePrompt` | `string` | - | System prompt for the AI assistant |
120
- | `language` | `string` | `'en-US'` | Language code (e.g., `ko-KR`, `ja-JP`) |
121
- | `pipelineMode` | `'live' \| 'composed'` | `'live'` | Audio processing mode |
122
- | `aiSpeaksFirst` | `boolean` | `false` | AI initiates conversation (live mode only) |
123
- | `allowHarmCategory` | `boolean` | `false` | Disable content safety filtering |
124
- | `tools` | `Tool[]` | `undefined` | Function definitions for AI to call |
125
+ # Advanced API
125
126
 
126
- ### Pipeline Modes
127
+ Optional features for power users.
127
128
 
128
- | Mode | Latency | Description |
129
- |------|---------|-------------|
130
- | `live` | Lower (~300ms) | Direct audio-to-audio via Live API |
131
- | `composed` | Higher (~1-2s) | Separate STT → LLM → TTS pipeline |
129
+ ## Additional Methods
132
130
 
133
- ### AI Speaks First
131
+ | Method | Description |
132
+ |--------|-------------|
133
+ | `audioStart()` / `audioEnd()` | Manual audio stream control |
134
+ | `interrupt()` | Explicitly stop AI response (for Stop button) |
135
+ | `sendSystemMessage(msg)` | Inject context during conversation |
136
+ | `sendToolResponse(id, result)` | Reply to function calls |
137
+ | `updateUserId(userId)` | Migrate guest to authenticated user |
134
138
 
135
- When `aiSpeaksFirst: true`, the AI will immediately speak a greeting based on your `prePrompt`:
139
+ ## Additional Events
136
140
 
137
- ```typescript
138
- await client.startSession({
139
- prePrompt: 'You are a customer service agent. Greet the customer warmly and ask how you can help.',
140
- aiSpeaksFirst: true,
141
- });
141
+ | Event | Description |
142
+ |-------|-------------|
143
+ | `connected` / `disconnected` | Connection lifecycle |
144
+ | `sessionStarted` / `sessionEnded` | Session lifecycle |
145
+ | `ready` | Session ready for audio |
146
+ | `userTranscript` | User's speech transcribed |
147
+ | `response` | AI's response text |
148
+ | `toolCall` | AI wants to call a function |
149
+ | `userIdUpdated` | Guest-to-user migration complete |
150
+
151
+ ---
152
+
153
+ ## Explicit Interrupt (Stop Button)
154
+
155
+ For UI "Stop" buttons or programmatic control:
142
156
 
143
- client.audioStart(); // AI greeting plays immediately
157
+ ```typescript
158
+ // User clicks Stop button
159
+ client.interrupt();
144
160
  ```
145
161
 
146
- > ⚠️ **Note**: Only works with `pipelineMode: 'live'`
162
+ Note: Voice barge-in works automatically via Gemini's VAD. This method is for explicit control.
163
+
164
+ ---
147
165
 
148
- ### Content Safety
166
+ ## System Messages
149
167
 
150
- By default, LLM applies content safety filtering. Set `allowHarmCategory: true` to disable:
168
+ Inject text context during live sessions (game events, app state, etc.):
151
169
 
152
170
  ```typescript
153
- await client.startSession({
154
- allowHarmCategory: true, // ⚠️ Disables all safety filters
155
- });
171
+ // AI responds immediately
172
+ client.sendSystemMessage("User completed level 5. Congratulate them!");
173
+
174
+ // Context only, no response
175
+ client.sendSystemMessage({ text: "User is browsing", triggerResponse: false });
156
176
  ```
157
177
 
158
- > ⚠️ **Warning**: Only use in controlled environments where content moderation is handled by other means.
178
+ > Requires active live session (`audioStart()` called). Max 500 characters.
179
+
180
+ ---
159
181
 
160
182
  ## Function Calling (Tool Use)
161
183
 
162
- Define functions that the AI can call during conversation. When the AI decides to call a function, you receive a `toolCall` event and must respond with `sendToolResponse()`.
184
+ Let AI call functions in your app:
163
185
 
164
- ### Define Tools
186
+ ### 1. Define Tools
165
187
 
166
188
  ```typescript
167
- const tools = [
168
- {
169
- name: 'open_login',
170
- description: 'Opens Google Login popup when user wants to sign in',
171
- parameters: { type: 'OBJECT', properties: {}, required: [] }
172
- },
173
- {
174
- name: 'get_price',
175
- description: 'Gets product price by ID',
176
- parameters: {
177
- type: 'OBJECT',
178
- properties: {
179
- productId: { type: 'string', description: 'Product ID' }
180
- },
181
- required: ['productId']
182
- }
189
+ const tools = [{
190
+ name: 'get_price',
191
+ description: 'Gets product price by ID',
192
+ parameters: {
193
+ type: 'OBJECT',
194
+ properties: { productId: { type: 'string' } },
195
+ required: ['productId']
183
196
  }
184
- ];
197
+ }];
185
198
 
186
199
  await client.startSession({
187
- prePrompt: 'You are a helpful assistant. Use tools when appropriate.',
200
+ prePrompt: 'You are helpful.',
188
201
  tools,
189
202
  });
190
203
  ```
191
204
 
192
- ### Handle Tool Calls
205
+ ### 2. Handle toolCall Events
193
206
 
194
207
  ```typescript
195
208
  client.on('toolCall', (event) => {
196
- console.log('AI wants to call:', event.name);
197
- console.log('With arguments:', event.args);
198
-
199
- if (event.name === 'open_login') {
200
- showLoginModal();
201
- client.sendToolResponse(event.id, { success: true });
202
- }
203
-
204
209
  if (event.name === 'get_price') {
205
- const price = getProductPrice(event.args.productId);
206
- client.sendToolResponse(event.id, { price, currency: 'USD' });
210
+ const price = lookupPrice(event.args.productId);
211
+ client.sendToolResponse(event.id, { price });
207
212
  }
208
213
  });
209
214
  ```
210
215
 
211
- ### Tool Interface
212
-
213
- ```typescript
214
- interface Tool {
215
- name: string; // Function name
216
- description: string; // When AI should use this
217
- parameters?: {
218
- type: 'OBJECT';
219
- properties: Record<string, unknown>;
220
- required?: string[];
221
- };
222
- }
223
- ```
224
-
225
- > ⚠️ **Note**: Function calling only works with `pipelineMode: 'live'`
226
-
227
- ## System Messages
228
-
229
- During an active live session, you can inject text messages to the AI using `sendSystemMessage()`. This is useful for:
230
- - Game events ("User completed level 5, congratulate them!")
231
- - App state changes ("User opened the cart with 3 items")
232
- - Timer/engagement triggers ("User has been quiet, engage them")
233
- - External data updates ("Weather changed to rainy")
234
-
235
- ### Usage
236
-
237
- ```typescript
238
- // Simple usage - AI responds immediately
239
- client.sendSystemMessage("User just completed level 5. Congratulate them!");
240
-
241
- // With options - context only, no immediate response
242
- client.sendSystemMessage({
243
- text: "User is browsing the cart",
244
- triggerResponse: false
245
- });
246
- ```
247
-
248
- ### Parameters
249
-
250
- | Parameter | Type | Required | Default | Description |
251
- |-----------|------|----------|---------|-------------|
252
- | `text` | `string` | Yes | - | Message text (max 500 chars) |
253
- | `triggerResponse` | `boolean` | No | `true` | AI responds immediately if `true` |
254
-
255
- > ⚠️ **Note**: Requires an active live session (`audioStart()` must have been called). Only works with `pipelineMode: 'live'`.
216
+ ---
256
217
 
257
218
  ## Conversation Memory
258
219
 
259
- When you provide a `userId`, the SDK enables persistent conversation memory:
260
-
261
- - **Entity Memory**: AI remembers facts shared in previous sessions (names, preferences, relationships)
262
- - **Session Summaries**: Recent conversation summaries are available to the AI
263
- - **Cross-Session**: Memory persists across sessions for the same `userId`
220
+ Enable persistent memory across sessions:
264
221
 
265
222
  ```typescript
266
- // With memory (authenticated user)
267
223
  const client = new LiveSpeechClient({
268
224
  region: 'ap-northeast-2',
269
225
  apiKey: 'your-api-key',
270
- userId: 'user-123', // Enables conversation memory
271
- });
272
-
273
- // Without memory (guest)
274
- const client = new LiveSpeechClient({
275
- region: 'ap-northeast-2',
276
- apiKey: 'your-api-key',
277
- // No userId = guest mode, no persistent memory
226
+ userId: 'user-123', // Enables memory
278
227
  });
279
228
  ```
280
229
 
281
- | Mode | Memory Persistence | Use Case |
282
- |------|-------------------|----------|
283
- | With `userId` | Permanent | Authenticated users |
284
- | Without `userId` | Session only | Guests, anonymous users |
230
+ | Mode | Memory |
231
+ |------|--------|
232
+ | With `userId` | Permanent (entities, summaries) |
233
+ | Without `userId` | Session only (guest) |
285
234
 
286
235
  ### Guest-to-User Migration
287
236
 
288
- When a guest user logs in during a session, you can migrate their conversation history to their user account:
289
-
290
237
  ```typescript
291
- // User logs in after chatting as guest
292
- client.on('userIdUpdated', (event) => {
293
- console.log(`Migrated ${event.migratedMessages} messages to user ${event.userId}`);
294
- });
295
-
296
- // After authentication
238
+ // User logs in during session
297
239
  await client.updateUserId('authenticated-user-123');
298
- ```
299
-
300
- This enables:
301
- - Entity extraction on guest conversation history
302
- - Conversation continuity across sessions
303
- - Personalization based on past interactions
304
-
305
- ## Events
306
-
307
- | Event | Description | Key Properties |
308
- |-------|-------------|----------------|
309
- | `connected` | Connection established | `connectionId` |
310
- | `disconnected` | Connection closed | `reason`, `code` |
311
- | `sessionStarted` | Session created | `sessionId` |
312
- | `ready` | Ready for audio input | `timestamp` |
313
- | `userTranscript` | Your speech transcribed | `text` |
314
- | `response` | AI's response text | `text`, `isFinal` |
315
- | `audio` | AI's audio output | `data`, `sampleRate` |
316
- | `turnComplete` | AI finished speaking | `timestamp` |
317
- | `toolCall` | AI wants to call a function | `id`, `name`, `args` |
318
- | `userIdUpdated` | Guest migrated to user account | `userId`, `migratedMessages` |
319
- | `error` | Error occurred | `code`, `message` |
320
-
321
- ### Simple Handlers
322
-
323
- ```typescript
324
- // Your speech transcription
325
- client.setUserTranscriptHandler((text) => {
326
- console.log('You said:', text);
327
- });
328
-
329
- // AI's text response
330
- client.setResponseHandler((text, isFinal) => {
331
- console.log('AI:', text, isFinal ? '(done)' : '...');
332
- });
333
-
334
- // AI's audio output
335
- client.setAudioHandler((data: Uint8Array) => {
336
- // data: PCM16 audio
337
- // Sample rate: 24000 Hz
338
- playAudio(data);
339
- });
340
-
341
- // Error handling
342
- client.setErrorHandler((error) => {
343
- console.error(`Error [${error.code}]: ${error.message}`);
344
- });
345
-
346
- // Tool calls (function calling)
347
- client.on('toolCall', (event) => {
348
- // Execute function and send result
349
- const result = executeFunction(event.name, event.args);
350
- client.sendToolResponse(event.id, result);
351
- });
352
240
 
353
- // Guest-to-user migration
241
+ // Listen for confirmation
354
242
  client.on('userIdUpdated', (event) => {
355
- console.log(`Logged in as ${event.userId}, migrated ${event.migratedMessages} messages`);
243
+ console.log(`Migrated ${event.migratedMessages} messages`);
356
244
  });
357
245
  ```
358
246
 
359
- ### Full Event API
247
+ ---
360
248
 
361
- ```typescript
362
- client.on('connected', (event) => {
363
- console.log('Connected:', event.connectionId);
364
- });
249
+ ## AI Speaks First
365
250
 
366
- client.on('ready', () => {
367
- console.log('Ready for audio');
368
- });
369
-
370
- client.on('userTranscript', (event) => {
371
- console.log('You:', event.text);
372
- });
373
-
374
- client.on('response', (event) => {
375
- console.log('AI:', event.text, event.isFinal);
376
- });
377
-
378
- client.on('audio', (event) => {
379
- // event.data: Uint8Array (PCM16)
380
- // event.sampleRate: 24000
381
- playAudio(event.data);
382
- });
383
-
384
- client.on('turnComplete', () => {
385
- console.log('AI finished speaking');
386
- });
387
-
388
- client.on('error', (event) => {
389
- console.error('Error:', event.code, event.message);
390
- });
251
+ AI initiates the conversation:
391
252
 
392
- client.on('toolCall', (event) => {
393
- // event.id: string - use with sendToolResponse
394
- // event.name: string - function name
395
- // event.args: object - function arguments
396
- const result = handleToolCall(event.name, event.args);
397
- client.sendToolResponse(event.id, result);
253
+ ```typescript
254
+ await client.startSession({
255
+ prePrompt: 'Greet the customer warmly.',
256
+ aiSpeaksFirst: true,
398
257
  });
399
258
 
400
- client.on('userIdUpdated', (event) => {
401
- // event.userId: string - the new user ID
402
- // event.migratedMessages: number - count of migrated messages
403
- console.log(`Migrated ${event.migratedMessages} messages to ${event.userId}`);
404
- });
259
+ client.audioStart(); // AI speaks immediately
405
260
  ```
406
261
 
407
- ## Audio Format
408
-
409
- ### Input (Your Microphone)
262
+ ---
410
263
 
411
- | Property | Value |
412
- |----------|-------|
413
- | Format | PCM16 (16-bit signed, little-endian) |
414
- | Sample Rate | 16,000 Hz |
415
- | Channels | 1 (Mono) |
416
- | Chunk Size | ~3200 bytes (100ms) |
264
+ ## Session Options
417
265
 
418
- ### Output (AI Response)
266
+ | Option | Default | Description |
267
+ |--------|---------|-------------|
268
+ | `prePrompt` | - | System prompt |
269
+ | `language` | `'en-US'` | Language code |
270
+ | `pipelineMode` | `'live'` | `'live'` (~300ms) or `'composed'` (~1-2s) |
271
+ | `aiSpeaksFirst` | `false` | AI initiates (live mode only) |
272
+ | `allowHarmCategory` | `false` | Disable safety filters |
273
+ | `tools` | `[]` | Function definitions |
419
274
 
420
- | Property | Value |
421
- |----------|-------|
422
- | Format | PCM16 (16-bit signed, little-endian) |
423
- | Sample Rate | 24,000 Hz |
424
- | Channels | 1 (Mono) |
275
+ ---
425
276
 
426
277
  ## Browser Example
427
278
 
428
279
  ```typescript
429
280
  import { LiveSpeechClient, float32ToInt16, int16ToUint8 } from '@drawdream/livespeech';
430
281
 
431
- const client = new LiveSpeechClient({
432
- region: 'ap-northeast-2',
433
- apiKey: 'your-api-key',
434
- });
435
-
436
- // Handlers
437
- client.setUserTranscriptHandler((text) => console.log('You:', text));
438
- client.setResponseHandler((text) => console.log('AI:', text));
439
- client.setAudioHandler((data) => playAudioChunk(data));
440
-
441
- // Connect
442
- await client.connect();
443
- await client.startSession({ prePrompt: 'You are a helpful assistant.' });
444
-
445
282
  // Capture microphone
446
283
  const stream = await navigator.mediaDevices.getUserMedia({
447
284
  audio: { sampleRate: 16000, channelCount: 1 }
@@ -460,60 +297,30 @@ processor.onaudioprocess = (e) => {
460
297
 
461
298
  source.connect(processor);
462
299
  processor.connect(audioContext.destination);
463
-
464
- // Start streaming
465
- client.audioStart();
466
-
467
- // Stop later
468
- client.audioEnd();
469
- stream.getTracks().forEach(track => track.stop());
470
300
  ```
471
301
 
302
+ ---
303
+
472
304
  ## Audio Utilities
473
305
 
474
306
  ```typescript
475
- import {
476
- float32ToInt16, // Web Audio Float32 → PCM16
477
- int16ToFloat32, // PCM16 → Float32
478
- int16ToUint8, // Int16Array → Uint8Array
479
- uint8ToInt16, // Uint8Array → Int16Array
480
- wrapPcmInWav, // Create WAV file
481
- AudioEncoder, // Base64 encoding/decoding
482
- } from '@drawdream/livespeech';
483
-
484
- // Convert Web Audio to PCM16 for sending
485
- const float32 = audioBuffer.getChannelData(0);
486
- const int16 = float32ToInt16(float32);
487
- const pcmBytes = int16ToUint8(int16);
488
- client.sendAudioChunk(pcmBytes);
489
-
490
- // Convert received PCM16 to Web Audio
491
- const receivedInt16 = uint8ToInt16(audioEvent.data);
492
- const float32Data = int16ToFloat32(receivedInt16);
307
+ import { float32ToInt16, int16ToUint8, wrapPcmInWav } from '@drawdream/livespeech';
308
+
309
+ const int16 = float32ToInt16(float32Data);
310
+ const bytes = int16ToUint8(int16);
311
+ const wav = wrapPcmInWav(bytes, 16000, 1, 16);
493
312
  ```
494
313
 
314
+ ---
315
+
495
316
  ## Error Handling
496
317
 
497
318
  ```typescript
498
319
  client.on('error', (event) => {
499
320
  switch (event.code) {
500
- case 'authentication_failed':
501
- console.error('Invalid API key');
502
- break;
503
- case 'connection_timeout':
504
- console.error('Connection timed out');
505
- break;
506
- case 'rate_limit':
507
- console.error('Rate limit exceeded');
508
- break;
509
- default:
510
- console.error(`Error: ${event.message}`);
511
- }
512
- });
513
-
514
- client.on('disconnected', (event) => {
515
- if (event.reason === 'error') {
516
- console.log('Will auto-reconnect...');
321
+ case 'authentication_failed': console.error('Invalid API key'); break;
322
+ case 'connection_timeout': console.error('Timed out'); break;
323
+ default: console.error(`Error: ${event.message}`);
517
324
  }
518
325
  });
519
326
 
@@ -522,44 +329,13 @@ client.on('reconnecting', (event) => {
522
329
  });
523
330
  ```
524
331
 
525
- ## Client Properties
526
-
527
- | Property | Type | Description |
528
- |----------|------|-------------|
529
- | `isConnected` | `boolean` | Connection status |
530
- | `hasActiveSession` | `boolean` | Session status |
531
- | `isAudioStreaming` | `boolean` | Streaming status |
532
- | `connectionId` | `string \| null` | Current connection ID |
533
- | `currentSessionId` | `string \| null` | Current session ID |
332
+ ---
534
333
 
535
334
  ## Regions
536
335
 
537
- | Region | Code | Location |
538
- |--------|------|----------|
539
- | Asia Pacific (Seoul) | `ap-northeast-2` | Korea |
540
-
541
- ## TypeScript Types
542
-
543
- ```typescript
544
- import type {
545
- LiveSpeechConfig,
546
- SessionConfig,
547
- LiveSpeechEvent,
548
- ConnectedEvent,
549
- DisconnectedEvent,
550
- SessionStartedEvent,
551
- ReadyEvent,
552
- UserTranscriptEvent,
553
- ResponseEvent,
554
- AudioEvent,
555
- TurnCompleteEvent,
556
- ToolCallEvent,
557
- UserIdUpdatedEvent,
558
- ErrorEvent,
559
- ErrorCode,
560
- Tool,
561
- } from '@drawdream/livespeech';
562
- ```
336
+ | Region | Code |
337
+ |--------|------|
338
+ | Seoul (Korea) | `ap-northeast-2` |
563
339
 
564
340
  ## License
565
341
 
package/dist/index.d.mts CHANGED
@@ -201,7 +201,7 @@ interface ResolvedConfig {
201
201
  /**
202
202
  * Event types emitted by the LiveSpeech client
203
203
  */
204
- type LiveSpeechEventType = 'connected' | 'disconnected' | 'reconnecting' | 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'toolCall' | 'userIdUpdated' | 'error';
204
+ type LiveSpeechEventType = 'connected' | 'disconnected' | 'reconnecting' | 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'toolCall' | 'userIdUpdated' | 'interrupted' | 'error';
205
205
  /**
206
206
  * Event payload for 'connected' event
207
207
  */
@@ -357,10 +357,30 @@ interface UserIdUpdatedEvent {
357
357
  migratedMessages: number;
358
358
  timestamp: string;
359
359
  }
360
+ /**
361
+ * Event payload for 'interrupted' event (barge-in)
362
+ * Indicates the AI response was interrupted because the user started speaking.
363
+ *
364
+ * **Critical**: When you receive this event, immediately clear your audio playback
365
+ * buffer to stop the AI audio from continuing to play. This enables natural
366
+ * barge-in behavior like a real phone conversation.
367
+ *
368
+ * @example
369
+ * client.on('interrupted', (event) => {
370
+ * // Stop playing AI audio immediately
371
+ * audioPlayer.clearBuffer();
372
+ * audioPlayer.stop();
373
+ * console.log('AI interrupted - ready for user input');
374
+ * });
375
+ */
376
+ interface InterruptedEvent {
377
+ type: 'interrupted';
378
+ timestamp: string;
379
+ }
360
380
  /**
361
381
  * Union type of all event payloads
362
382
  */
363
- type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | ReconnectingEvent | SessionStartedEvent | SessionEndedEvent | ReadyEvent | UserTranscriptEvent | ResponseEvent | AudioEvent | TurnCompleteEvent | ToolCallEvent | UserIdUpdatedEvent | ErrorEvent;
383
+ type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | ReconnectingEvent | SessionStartedEvent | SessionEndedEvent | ReadyEvent | UserTranscriptEvent | ResponseEvent | AudioEvent | TurnCompleteEvent | ToolCallEvent | UserIdUpdatedEvent | InterruptedEvent | ErrorEvent;
364
384
  /**
365
385
  * Simplified event handlers for common use cases
366
386
  */
@@ -372,11 +392,11 @@ type ErrorHandler = (error: ErrorEvent) => void;
372
392
  /**
373
393
  * WebSocket message types sent from client to server
374
394
  */
375
- type ClientMessageType = 'startSession' | 'endSession' | 'audioStart' | 'audioChunk' | 'audioEnd' | 'systemMessage' | 'toolResponse' | 'updateUserId' | 'ping';
395
+ type ClientMessageType = 'startSession' | 'endSession' | 'audioStart' | 'audioChunk' | 'audioEnd' | 'systemMessage' | 'toolResponse' | 'updateUserId' | 'interrupt' | 'ping';
376
396
  /**
377
397
  * WebSocket message types received from server
378
398
  */
379
- type ServerMessageType = 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'toolCall' | 'userIdUpdated' | 'error' | 'pong';
399
+ type ServerMessageType = 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'toolCall' | 'userIdUpdated' | 'interrupted' | 'error' | 'pong';
380
400
  /**
381
401
  * Base interface for client messages
382
402
  */
@@ -466,10 +486,16 @@ interface UpdateUserIdMessage extends BaseClientMessage {
466
486
  /** The authenticated user's unique identifier */
467
487
  userId: string;
468
488
  }
489
+ /**
490
+ * Interrupt message - explicitly stop AI response (for Stop button)
491
+ */
492
+ interface InterruptMessage extends BaseClientMessage {
493
+ action: 'interrupt';
494
+ }
469
495
  /**
470
496
  * Union type of all client messages
471
497
  */
472
- type ClientMessage = StartSessionMessage | EndSessionMessage | AudioStartMessage | AudioChunkMessage | AudioEndMessage | SystemMessageMessage | ToolResponseMessage | UpdateUserIdMessage | PingMessage;
498
+ type ClientMessage = StartSessionMessage | EndSessionMessage | AudioStartMessage | AudioChunkMessage | AudioEndMessage | SystemMessageMessage | ToolResponseMessage | UpdateUserIdMessage | InterruptMessage | PingMessage;
473
499
  /**
474
500
  * Base interface for server messages
475
501
  */
@@ -567,10 +593,18 @@ interface ServerUserIdUpdatedMessage extends BaseServerMessage {
567
593
  /** Number of messages migrated from guest to user partition */
568
594
  migratedMessages: number;
569
595
  }
596
+ /**
597
+ * Interrupted message from server (barge-in)
598
+ * Indicates the AI response was interrupted because the user started speaking.
599
+ * Clients should immediately clear their audio playback buffer when receiving this.
600
+ */
601
+ interface ServerInterruptedMessage extends BaseServerMessage {
602
+ type: 'interrupted';
603
+ }
570
604
  /**
571
605
  * Union type of all server messages
572
606
  */
573
- type ServerMessage = ServerSessionStartedMessage | ServerSessionEndedMessage | ServerReadyMessage | ServerUserTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerTurnCompleteMessage | ServerToolCallMessage | ServerUserIdUpdatedMessage | ServerErrorMessage | ServerPongMessage;
607
+ type ServerMessage = ServerSessionStartedMessage | ServerSessionEndedMessage | ServerReadyMessage | ServerUserTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerTurnCompleteMessage | ServerToolCallMessage | ServerUserIdUpdatedMessage | ServerInterruptedMessage | ServerErrorMessage | ServerPongMessage;
574
608
 
575
609
  /**
576
610
  * Connection state
@@ -593,6 +627,7 @@ type LiveSpeechEventMap = {
593
627
  turnComplete: TurnCompleteEvent;
594
628
  toolCall: ToolCallEvent;
595
629
  userIdUpdated: UserIdUpdatedEvent;
630
+ interrupted: InterruptedEvent;
596
631
  error: ErrorEvent;
597
632
  };
598
633
  /**
@@ -710,6 +745,26 @@ declare class LiveSpeechClient {
710
745
  * });
711
746
  */
712
747
  sendToolResponse(id: string, response?: unknown): void;
748
+ /**
749
+ * Explicitly interrupt the current AI response
750
+ *
751
+ * Use this method for:
752
+ * - UI "Stop" button functionality
753
+ * - Programmatic control to stop AI mid-response
754
+ *
755
+ * Note: In most cases, simply speaking will trigger automatic
756
+ * interruption via Gemini's voice activity detection (VAD).
757
+ * This method is for explicit programmatic control.
758
+ *
759
+ * @example
760
+ * // User clicks "Stop" button
761
+ * client.interrupt();
762
+ *
763
+ * @example
764
+ * // Stop AI after a certain time
765
+ * setTimeout(() => client.interrupt(), 10000);
766
+ */
767
+ interrupt(): void;
713
768
  /**
714
769
  * Update the user ID for the current connection (guest-to-user migration)
715
770
  *
package/dist/index.d.ts CHANGED
@@ -201,7 +201,7 @@ interface ResolvedConfig {
201
201
  /**
202
202
  * Event types emitted by the LiveSpeech client
203
203
  */
204
- type LiveSpeechEventType = 'connected' | 'disconnected' | 'reconnecting' | 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'toolCall' | 'userIdUpdated' | 'error';
204
+ type LiveSpeechEventType = 'connected' | 'disconnected' | 'reconnecting' | 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'toolCall' | 'userIdUpdated' | 'interrupted' | 'error';
205
205
  /**
206
206
  * Event payload for 'connected' event
207
207
  */
@@ -357,10 +357,30 @@ interface UserIdUpdatedEvent {
357
357
  migratedMessages: number;
358
358
  timestamp: string;
359
359
  }
360
+ /**
361
+ * Event payload for 'interrupted' event (barge-in)
362
+ * Indicates the AI response was interrupted because the user started speaking.
363
+ *
364
+ * **Critical**: When you receive this event, immediately clear your audio playback
365
+ * buffer to stop the AI audio from continuing to play. This enables natural
366
+ * barge-in behavior like a real phone conversation.
367
+ *
368
+ * @example
369
+ * client.on('interrupted', (event) => {
370
+ * // Stop playing AI audio immediately
371
+ * audioPlayer.clearBuffer();
372
+ * audioPlayer.stop();
373
+ * console.log('AI interrupted - ready for user input');
374
+ * });
375
+ */
376
+ interface InterruptedEvent {
377
+ type: 'interrupted';
378
+ timestamp: string;
379
+ }
360
380
  /**
361
381
  * Union type of all event payloads
362
382
  */
363
- type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | ReconnectingEvent | SessionStartedEvent | SessionEndedEvent | ReadyEvent | UserTranscriptEvent | ResponseEvent | AudioEvent | TurnCompleteEvent | ToolCallEvent | UserIdUpdatedEvent | ErrorEvent;
383
+ type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | ReconnectingEvent | SessionStartedEvent | SessionEndedEvent | ReadyEvent | UserTranscriptEvent | ResponseEvent | AudioEvent | TurnCompleteEvent | ToolCallEvent | UserIdUpdatedEvent | InterruptedEvent | ErrorEvent;
364
384
  /**
365
385
  * Simplified event handlers for common use cases
366
386
  */
@@ -372,11 +392,11 @@ type ErrorHandler = (error: ErrorEvent) => void;
372
392
  /**
373
393
  * WebSocket message types sent from client to server
374
394
  */
375
- type ClientMessageType = 'startSession' | 'endSession' | 'audioStart' | 'audioChunk' | 'audioEnd' | 'systemMessage' | 'toolResponse' | 'updateUserId' | 'ping';
395
+ type ClientMessageType = 'startSession' | 'endSession' | 'audioStart' | 'audioChunk' | 'audioEnd' | 'systemMessage' | 'toolResponse' | 'updateUserId' | 'interrupt' | 'ping';
376
396
  /**
377
397
  * WebSocket message types received from server
378
398
  */
379
- type ServerMessageType = 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'toolCall' | 'userIdUpdated' | 'error' | 'pong';
399
+ type ServerMessageType = 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'toolCall' | 'userIdUpdated' | 'interrupted' | 'error' | 'pong';
380
400
  /**
381
401
  * Base interface for client messages
382
402
  */
@@ -466,10 +486,16 @@ interface UpdateUserIdMessage extends BaseClientMessage {
466
486
  /** The authenticated user's unique identifier */
467
487
  userId: string;
468
488
  }
489
+ /**
490
+ * Interrupt message - explicitly stop AI response (for Stop button)
491
+ */
492
+ interface InterruptMessage extends BaseClientMessage {
493
+ action: 'interrupt';
494
+ }
469
495
  /**
470
496
  * Union type of all client messages
471
497
  */
472
- type ClientMessage = StartSessionMessage | EndSessionMessage | AudioStartMessage | AudioChunkMessage | AudioEndMessage | SystemMessageMessage | ToolResponseMessage | UpdateUserIdMessage | PingMessage;
498
+ type ClientMessage = StartSessionMessage | EndSessionMessage | AudioStartMessage | AudioChunkMessage | AudioEndMessage | SystemMessageMessage | ToolResponseMessage | UpdateUserIdMessage | InterruptMessage | PingMessage;
473
499
  /**
474
500
  * Base interface for server messages
475
501
  */
@@ -567,10 +593,18 @@ interface ServerUserIdUpdatedMessage extends BaseServerMessage {
567
593
  /** Number of messages migrated from guest to user partition */
568
594
  migratedMessages: number;
569
595
  }
596
+ /**
597
+ * Interrupted message from server (barge-in)
598
+ * Indicates the AI response was interrupted because the user started speaking.
599
+ * Clients should immediately clear their audio playback buffer when receiving this.
600
+ */
601
+ interface ServerInterruptedMessage extends BaseServerMessage {
602
+ type: 'interrupted';
603
+ }
570
604
  /**
571
605
  * Union type of all server messages
572
606
  */
573
- type ServerMessage = ServerSessionStartedMessage | ServerSessionEndedMessage | ServerReadyMessage | ServerUserTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerTurnCompleteMessage | ServerToolCallMessage | ServerUserIdUpdatedMessage | ServerErrorMessage | ServerPongMessage;
607
+ type ServerMessage = ServerSessionStartedMessage | ServerSessionEndedMessage | ServerReadyMessage | ServerUserTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerTurnCompleteMessage | ServerToolCallMessage | ServerUserIdUpdatedMessage | ServerInterruptedMessage | ServerErrorMessage | ServerPongMessage;
574
608
 
575
609
  /**
576
610
  * Connection state
@@ -593,6 +627,7 @@ type LiveSpeechEventMap = {
593
627
  turnComplete: TurnCompleteEvent;
594
628
  toolCall: ToolCallEvent;
595
629
  userIdUpdated: UserIdUpdatedEvent;
630
+ interrupted: InterruptedEvent;
596
631
  error: ErrorEvent;
597
632
  };
598
633
  /**
@@ -710,6 +745,26 @@ declare class LiveSpeechClient {
710
745
  * });
711
746
  */
712
747
  sendToolResponse(id: string, response?: unknown): void;
748
+ /**
749
+ * Explicitly interrupt the current AI response
750
+ *
751
+ * Use this method for:
752
+ * - UI "Stop" button functionality
753
+ * - Programmatic control to stop AI mid-response
754
+ *
755
+ * Note: In most cases, simply speaking will trigger automatic
756
+ * interruption via Gemini's voice activity detection (VAD).
757
+ * This method is for explicit programmatic control.
758
+ *
759
+ * @example
760
+ * // User clicks "Stop" button
761
+ * client.interrupt();
762
+ *
763
+ * @example
764
+ * // Stop AI after a certain time
765
+ * setTimeout(() => client.interrupt(), 10000);
766
+ */
767
+ interrupt(): void;
713
768
  /**
714
769
  * Update the user ID for the current connection (guest-to-user migration)
715
770
  *
package/dist/index.js CHANGED
@@ -877,6 +877,35 @@ var LiveSpeechClient = class {
877
877
  payload: { id, response }
878
878
  });
879
879
  }
880
+ /**
881
+ * Explicitly interrupt the current AI response
882
+ *
883
+ * Use this method for:
884
+ * - UI "Stop" button functionality
885
+ * - Programmatic control to stop AI mid-response
886
+ *
887
+ * Note: In most cases, simply speaking will trigger automatic
888
+ * interruption via Gemini's voice activity detection (VAD).
889
+ * This method is for explicit programmatic control.
890
+ *
891
+ * @example
892
+ * // User clicks "Stop" button
893
+ * client.interrupt();
894
+ *
895
+ * @example
896
+ * // Stop AI after a certain time
897
+ * setTimeout(() => client.interrupt(), 10000);
898
+ */
899
+ interrupt() {
900
+ if (!this.isConnected) {
901
+ throw new Error("Not connected");
902
+ }
903
+ if (!this.isStreaming) {
904
+ throw new Error("No active Live session. Call audioStart() first.");
905
+ }
906
+ this.logger.info("Sending explicit interrupt");
907
+ this.connection.send({ action: "interrupt" });
908
+ }
880
909
  /**
881
910
  * Update the user ID for the current connection (guest-to-user migration)
882
911
  *
@@ -1119,6 +1148,15 @@ var LiveSpeechClient = class {
1119
1148
  this.emit("userIdUpdated", userIdUpdatedEvent);
1120
1149
  break;
1121
1150
  }
1151
+ case "interrupted": {
1152
+ const interruptedEvent = {
1153
+ type: "interrupted",
1154
+ timestamp: message.timestamp
1155
+ };
1156
+ this.logger.info("AI response interrupted (barge-in)");
1157
+ this.emit("interrupted", interruptedEvent);
1158
+ break;
1159
+ }
1122
1160
  case "error":
1123
1161
  this.handleError(message.code, message.message);
1124
1162
  break;
package/dist/index.mjs CHANGED
@@ -838,6 +838,35 @@ var LiveSpeechClient = class {
838
838
  payload: { id, response }
839
839
  });
840
840
  }
841
+ /**
842
+ * Explicitly interrupt the current AI response
843
+ *
844
+ * Use this method for:
845
+ * - UI "Stop" button functionality
846
+ * - Programmatic control to stop AI mid-response
847
+ *
848
+ * Note: In most cases, simply speaking will trigger automatic
849
+ * interruption via Gemini's voice activity detection (VAD).
850
+ * This method is for explicit programmatic control.
851
+ *
852
+ * @example
853
+ * // User clicks "Stop" button
854
+ * client.interrupt();
855
+ *
856
+ * @example
857
+ * // Stop AI after a certain time
858
+ * setTimeout(() => client.interrupt(), 10000);
859
+ */
860
+ interrupt() {
861
+ if (!this.isConnected) {
862
+ throw new Error("Not connected");
863
+ }
864
+ if (!this.isStreaming) {
865
+ throw new Error("No active Live session. Call audioStart() first.");
866
+ }
867
+ this.logger.info("Sending explicit interrupt");
868
+ this.connection.send({ action: "interrupt" });
869
+ }
841
870
  /**
842
871
  * Update the user ID for the current connection (guest-to-user migration)
843
872
  *
@@ -1080,6 +1109,15 @@ var LiveSpeechClient = class {
1080
1109
  this.emit("userIdUpdated", userIdUpdatedEvent);
1081
1110
  break;
1082
1111
  }
1112
+ case "interrupted": {
1113
+ const interruptedEvent = {
1114
+ type: "interrupted",
1115
+ timestamp: message.timestamp
1116
+ };
1117
+ this.logger.info("AI response interrupted (barge-in)");
1118
+ this.emit("interrupted", interruptedEvent);
1119
+ break;
1120
+ }
1083
1121
  case "error":
1084
1122
  this.handleError(message.code, message.message);
1085
1123
  break;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@drawdream/livespeech",
3
- "version": "0.1.10",
3
+ "version": "0.1.12",
4
4
  "description": "Real-time speech-to-speech AI conversation SDK",
5
5
  "main": "dist/index.js",
6
6
  "module": "dist/index.mjs",