@drawdream/livespeech 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -5,6 +5,15 @@
 
  A TypeScript/JavaScript SDK for real-time speech-to-speech AI conversations.
 
+ ## Features
+
+ - 🎙️ **Real-time Voice Conversations** - Natural, low-latency voice interactions
+ - 🌐 **Multi-language Support** - Korean, English, Japanese, Chinese, and more
+ - 🔊 **Streaming Audio** - Send and receive audio in real-time
+ - 📝 **Live Transcription** - Get transcriptions of both user and AI speech
+ - 🔄 **Auto-reconnection** - Automatic recovery from network issues
+ - 🌐 **Browser & Node.js** - Works in both environments
+
  ## Installation
 
  ```bash
@@ -18,171 +27,306 @@ pnpm add @drawdream/livespeech
  ## Quick Start
 
  ```typescript
- import { LiveSpeechClient, Region } from '@drawdream/livespeech';
+ import { LiveSpeechClient } from '@drawdream/livespeech';
 
  const client = new LiveSpeechClient({
-   region: 'ap-northeast-2', // or Region.AP_NORTHEAST_2
+   region: 'ap-northeast-2',
    apiKey: 'your-api-key',
  });
 
- // Handle events
+ // Set up event handlers
  client.setUserTranscriptHandler((text) => {
-   console.log(`You said: ${text}`);
+   console.log('You:', text);
  });
 
- client.setTranscriptHandler((text, isFinal) => {
-   console.log(`AI Transcript: ${text}`);
+ client.setResponseHandler((text, isFinal) => {
+   console.log('AI:', text);
  });
 
  client.setAudioHandler((audioData) => {
-   // Play audio through speakers
+   playAudio(audioData); // PCM16 @ 24kHz
+ });
+
+ client.setErrorHandler((error) => {
+   console.error('Error:', error.message);
  });
 
- // Connect and start session
+ // Connect and start conversation
  await client.connect();
  await client.startSession({
    prePrompt: 'You are a helpful assistant.',
-   // pipelineMode: 'live' is the default
+   language: 'ko-KR',
  });
 
- // Start streaming and send audio
+ // Stream audio
  client.audioStart();
- client.sendAudioChunk(audioBuffer);
+ client.sendAudioChunk(pcmData); // PCM16 @ 16kHz
+ client.audioEnd();
+
+ // Cleanup
+ await client.endSession();
+ client.disconnect();
  ```
 
- ## Pipeline Modes
+ ## Audio Flow
 
- The SDK supports two pipeline modes for audio processing:
+ ```
+ connect() → startSession() → audioStart() → sendAudioChunk()* → audioEnd() → endSession()
+ ```
 
- ### Live Mode (Default)
+ | Step | Description |
+ |------|-------------|
+ | `connect()` | Establish WebSocket connection |
+ | `startSession(config)` | Start conversation with optional system prompt |
+ | `audioStart()` | Begin audio streaming |
+ | `sendAudioChunk(data)` | Send PCM16 audio (call multiple times) |
+ | `audioEnd()` | End streaming, triggers AI response |
+ | `endSession()` | End conversation |
+ | `disconnect()` | Close connection |
 
- Uses Gemini 2.5 Flash Live API for end-to-end audio conversation. This provides:
- - **Lower latency** - Direct audio-to-audio processing
- - **Natural conversation** - Built-in voice activity detection and turn-taking
- - **Real-time transcription** - Both user and AI speech are transcribed
+ ## Configuration
 
  ```typescript
+ const client = new LiveSpeechClient({
+   region: 'ap-northeast-2',  // Required: Seoul region
+   apiKey: 'your-api-key',    // Required: Your API key
+   autoReconnect: true,       // Auto-reconnect on disconnect
+   maxReconnectAttempts: 5,   // Maximum reconnection attempts
+   debug: false,              // Enable debug logging
+ });
+
  await client.startSession({
    prePrompt: 'You are a helpful assistant.',
-   pipelineMode: 'live', // Default, can be omitted
+   language: 'ko-KR', // Language: ko-KR, en-US, ja-JP, etc.
  });
  ```
 
- ### Composed Mode
+ ## Events
 
- Uses separate STT + LLM + TTS services for more customization:
- - **More control** - Separate services for each step
- - **Custom voices** - Use different TTS voices
- - **Text responses** - Access to intermediate text responses
+ | Event | Description | Key Properties |
+ |-------|-------------|----------------|
+ | `connected` | Connection established | `connectionId` |
+ | `disconnected` | Connection closed | `reason`, `code` |
+ | `sessionStarted` | Session created | `sessionId` |
+ | `ready` | Ready for audio input | `timestamp` |
+ | `userTranscript` | Your speech transcribed | `text` |
+ | `response` | AI's response text | `text`, `isFinal` |
+ | `audio` | AI's audio output | `data`, `sampleRate` |
+ | `turnComplete` | AI finished speaking | `timestamp` |
+ | `error` | Error occurred | `code`, `message` |
+
+ ### Simple Handlers
 
  ```typescript
- await client.startSession({
-   prePrompt: 'You are a helpful assistant.',
-   pipelineMode: 'composed',
+ // Your speech transcription
+ client.setUserTranscriptHandler((text) => {
+   console.log('You said:', text);
+ });
+
+ // AI's text response
+ client.setResponseHandler((text, isFinal) => {
+   console.log('AI:', text, isFinal ? '(done)' : '...');
+ });
+
+ // AI's audio output
+ client.setAudioHandler((data: Uint8Array) => {
+   // data: PCM16 audio
+   // Sample rate: 24000 Hz
+   playAudio(data);
+ });
+
+ // Error handling
+ client.setErrorHandler((error) => {
+   console.error(`Error [${error.code}]: ${error.message}`);
  });
  ```
 
- ## API Reference
+ ### Full Event API
+
+ ```typescript
+ client.on('connected', (event) => {
+   console.log('Connected:', event.connectionId);
+ });
+
+ client.on('ready', () => {
+   console.log('Ready for audio');
+ });
+
+ client.on('userTranscript', (event) => {
+   console.log('You:', event.text);
+ });
+
+ client.on('response', (event) => {
+   console.log('AI:', event.text, event.isFinal);
+ });
 
- ### Regions
+ client.on('audio', (event) => {
+   // event.data: Uint8Array (PCM16)
+   // event.sampleRate: 24000
+   playAudio(event.data);
+ });
 
- The SDK provides built-in region support, so you don't need to remember endpoint URLs:
+ client.on('turnComplete', () => {
+   console.log('AI finished speaking');
+ });
 
- | Region | Identifier | Location |
- |--------|------------|----------|
- | `ap-northeast-2` | `Region.AP_NORTHEAST_2` | Asia Pacific (Seoul) |
- | `us-west-2` | `Region.US_WEST_2` | US West (Oregon) - Coming soon |
+ client.on('error', (event) => {
+   console.error('Error:', event.code, event.message);
+ });
+ ```
 
- ### LiveSpeechClient
+ ## Audio Format
 
- #### Constructor Options
+ ### Input (Your Microphone)
 
- | Option | Type | Default | Description |
- |--------|------|---------|-------------|
- | `region` | `string` | **required** | Region identifier |
- | `apiKey` | `string` | **required** | API key for authentication |
- | `connectionTimeout` | `number` | `30000` | Connection timeout in ms |
- | `autoReconnect` | `boolean` | `true` | Auto-reconnect on disconnect |
- | `maxReconnectAttempts` | `number` | `5` | Max reconnection attempts |
- | `reconnectDelay` | `number` | `1000` | Base reconnection delay in ms |
- | `debug` | `boolean` | `false` | Enable debug logging |
+ | Property | Value |
+ |----------|-------|
+ | Format | PCM16 (16-bit signed, little-endian) |
+ | Sample Rate | 16,000 Hz |
+ | Channels | 1 (Mono) |
+ | Chunk Size | ~3200 bytes (100ms) |
 
- #### Methods
+ ### Output (AI Response)
 
- | Method | Description |
- |--------|-------------|
- | `connect()` | Connect to the server |
- | `disconnect()` | Disconnect from the server |
- | `startSession(config)` | Start a conversation session |
- | `endSession()` | End the current session |
- | `sendAudio(data, options?)` | Send audio data to be transcribed |
+ | Property | Value |
+ |----------|-------|
+ | Format | PCM16 (16-bit signed, little-endian) |
+ | Sample Rate | 24,000 Hz |
+ | Channels | 1 (Mono) |
 
- #### Event Handlers
+ ## Browser Example
 
  ```typescript
- // Simple handlers
- client.setUserTranscriptHandler((text) => {}); // User's speech transcription
- client.setTranscriptHandler((text, isFinal) => {}); // AI's speech transcription (live mode)
- client.setResponseHandler((text, isFinal) => {}); // AI text response (composed mode)
- client.setAudioHandler((audioData) => {});
- client.setErrorHandler((error) => {});
-
- // Full event API
- client.on('connected', (event) => {});
- client.on('disconnected', (event) => {});
- client.on('sessionStarted', (event) => {});
- client.on('sessionEnded', (event) => {});
- client.on('userTranscript', (event) => {}); // User's speech transcription
- client.on('transcript', (event) => {}); // AI's speech transcription
- client.on('response', (event) => {}); // AI text response
- client.on('audio', (event) => {});
- client.on('error', (event) => {});
- client.on('reconnecting', (event) => {});
- client.on('ready', (event) => {}); // Gemini Live ready (live mode)
- client.on('turnComplete', (event) => {}); // AI finished speaking (live mode)
- ```
+ import { LiveSpeechClient, float32ToInt16, int16ToUint8 } from '@drawdream/livespeech';
 
- ### SessionConfig
+ const client = new LiveSpeechClient({
+   region: 'ap-northeast-2',
+   apiKey: 'your-api-key',
+ });
 
- | Option | Type | Default | Description |
- |--------|------|---------|-------------|
- | `prePrompt` | `string` | - | System prompt for the AI |
- | `language` | `string` | `'en-US'` | Language code for speech (e.g., "ko-KR") |
- | `pipelineMode` | `'live' \| 'composed'` | `'live'` | Audio processing mode |
+ // Handlers
+ client.setUserTranscriptHandler((text) => console.log('You:', text));
+ client.setResponseHandler((text) => console.log('AI:', text));
+ client.setAudioHandler((data) => playAudioChunk(data));
 
- ## Audio Utilities
+ // Connect
+ await client.connect();
+ await client.startSession({ prePrompt: 'You are a helpful assistant.' });
 
- The SDK includes audio encoding/decoding utilities:
+ // Capture microphone
+ const stream = await navigator.mediaDevices.getUserMedia({
+   audio: { sampleRate: 16000, channelCount: 1 }
+ });
+
+ const audioContext = new AudioContext({ sampleRate: 16000 });
+ const source = audioContext.createMediaStreamSource(stream);
+ const processor = audioContext.createScriptProcessor(4096, 1, 1);
+
+ processor.onaudioprocess = (e) => {
+   const float32 = e.inputBuffer.getChannelData(0);
+   const int16 = float32ToInt16(float32);
+   const pcm = int16ToUint8(int16);
+   client.sendAudioChunk(pcm);
+ };
+
+ source.connect(processor);
+ processor.connect(audioContext.destination);
+
+ // Start streaming
+ client.audioStart();
+
+ // Stop later
+ client.audioEnd();
+ stream.getTracks().forEach(track => track.stop());
+ ```
+
+ ## Audio Utilities
 
  ```typescript
  import {
-   encodeAudioToBase64,
-   decodeBase64ToAudio,
-   float32ToInt16,
-   int16ToFloat32,
-   wrapPcmInWav,
+   float32ToInt16, // Web Audio Float32 → PCM16
+   int16ToFloat32, // PCM16 → Float32
+   int16ToUint8,   // Int16Array → Uint8Array
+   uint8ToInt16,   // Uint8Array → Int16Array
+   wrapPcmInWav,   // Create WAV file
+   AudioEncoder,   // Base64 encoding/decoding
  } from '@drawdream/livespeech';
 
- // Convert Float32 audio samples to PCM16
- const pcmData = float32ToInt16(float32Samples);
+ // Convert Web Audio to PCM16 for sending
+ const float32 = audioBuffer.getChannelData(0);
+ const int16 = float32ToInt16(float32);
+ const pcmBytes = int16ToUint8(int16);
+ client.sendAudioChunk(pcmBytes);
 
- // Create WAV file from PCM data
- const wavFile = wrapPcmInWav(pcmData, { sampleRate: 16000 });
+ // Convert received PCM16 to Web Audio
+ const receivedInt16 = uint8ToInt16(audioEvent.data);
+ const float32Data = int16ToFloat32(receivedInt16);
  ```
 
- ## Browser Usage
+ ## Error Handling
 
- The SDK works in both Node.js and browser environments:
+ ```typescript
+ client.on('error', (event) => {
+   switch (event.code) {
+     case 'authentication_failed':
+       console.error('Invalid API key');
+       break;
+     case 'connection_timeout':
+       console.error('Connection timed out');
+       break;
+     case 'rate_limit':
+       console.error('Rate limit exceeded');
+       break;
+     default:
+       console.error(`Error: ${event.message}`);
+   }
+ });
 
- ```html
- <script type="module">
-   import { LiveSpeechClient } from '@drawdream/livespeech';
+ client.on('disconnected', (event) => {
+   if (event.reason === 'error') {
+     console.log('Will auto-reconnect...');
+   }
+ });
 
-   // Use the Web Audio API to capture microphone
-   const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-   const audioContext = new AudioContext({ sampleRate: 16000 });
-   // ... process audio and send to client
- </script>
+ client.on('reconnecting', (event) => {
+   console.log(`Reconnecting ${event.attempt}/${event.maxAttempts}`);
+ });
+ ```
+
+ ## Client Properties
+
+ | Property | Type | Description |
+ |----------|------|-------------|
+ | `isConnected` | `boolean` | Connection status |
+ | `hasActiveSession` | `boolean` | Session status |
+ | `isAudioStreaming` | `boolean` | Streaming status |
+ | `connectionId` | `string \| null` | Current connection ID |
+ | `currentSessionId` | `string \| null` | Current session ID |
+
+ ## Regions
+
+ | Region | Code | Location |
+ |--------|------|----------|
+ | Asia Pacific (Seoul) | `ap-northeast-2` | Korea |
+
+ ## TypeScript Types
+
+ ```typescript
+ import type {
+   LiveSpeechConfig,
+   SessionConfig,
+   LiveSpeechEvent,
+   ConnectedEvent,
+   DisconnectedEvent,
+   SessionStartedEvent,
+   ReadyEvent,
+   UserTranscriptEvent,
+   ResponseEvent,
+   AudioEvent,
+   TurnCompleteEvent,
+   ErrorEvent,
+   ErrorCode,
+ } from '@drawdream/livespeech';
  ```
 
  ## License
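
> Note: the README's examples call `playAudio` / `playAudioChunk` without defining them. Below is a minimal sketch of such a helper (not part of the SDK), assuming the documented output format (PCM16, 24,000 Hz, mono) and the `uint8ToInt16` / `int16ToFloat32` utilities from the Audio Utilities section. The same arithmetic explains the input table's chunk size: 16,000 samples/s × 2 bytes/sample × 0.1 s = 3,200 bytes per 100 ms chunk.

```typescript
// Hypothetical playback helper, not part of the SDK.
// Assumes int16ToFloat32 returns a Float32Array of samples in [-1, 1].
import { uint8ToInt16, int16ToFloat32 } from '@drawdream/livespeech';

const playbackCtx = new AudioContext({ sampleRate: 24000 });
let playbackTime = 0; // where the next chunk should start

function playAudio(data: Uint8Array): void {
  // PCM16 bytes → Float32 samples
  const float32 = int16ToFloat32(uint8ToInt16(data));
  const buffer = playbackCtx.createBuffer(1, float32.length, 24000);
  buffer.copyToChannel(float32, 0);

  const source = playbackCtx.createBufferSource();
  source.buffer = buffer;
  source.connect(playbackCtx.destination);

  // Schedule chunks back-to-back so streamed audio plays without gaps.
  playbackTime = Math.max(playbackTime, playbackCtx.currentTime);
  source.start(playbackTime);
  playbackTime += buffer.duration;
}
```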
package/dist/index.d.mts CHANGED
@@ -23,8 +23,8 @@ declare function isValidRegion(value: string): value is Region;
 
  /**
   * Pipeline mode for audio processing
-  * - 'live': Uses Gemini Live API for end-to-end audio conversation (default)
-  * - 'composed': Uses separate STT + LLM + TTS services
+  * - 'live': Direct audio-to-audio conversation (default, lower latency)
+  * - 'composed': Uses separate STT + LLM + TTS services (more customizable)
   */
  type PipelineMode = 'live' | 'composed';
  /**
@@ -88,11 +88,18 @@ interface SessionConfig {
      language?: string;
      /**
       * Pipeline mode for audio processing
-      * - 'live': Uses Gemini Live API for end-to-end audio conversation (default, lower latency)
+      * - 'live': Direct audio-to-audio conversation (default, lower latency)
       * - 'composed': Uses separate STT + LLM + TTS services (more customizable)
       * @default "live"
       */
      pipelineMode?: PipelineMode;
+     /**
+      * Enable AI to speak first before user input (live mode only)
+      * When enabled, the AI will initiate the conversation based on the prePrompt.
+      * Make sure your prePrompt includes instructions for how the AI should greet the user.
+      * @default false
+      */
+     aiSpeaksFirst?: boolean;
  }
  /**
   * Internal resolved configuration with defaults applied
@@ -353,7 +360,7 @@ interface ServerTurnCompleteMessage extends BaseServerMessage {
  }
  /**
   * Ready message from server
-  * Indicates the Gemini Live session is ready for audio input
+  * Indicates the session is ready for audio input
   */
  interface ServerReadyMessage extends BaseServerMessage {
      type: 'ready';
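
> Note: the user-visible change in 0.1.3 is the `aiSpeaksFirst` flag added to `SessionConfig` above. A minimal usage sketch, combining that declaration with the client setup from the README's Quick Start (`playAudio` is the hypothetical helper sketched earlier):

```typescript
import { LiveSpeechClient } from '@drawdream/livespeech';

// Hypothetical playback helper from the earlier sketch.
declare function playAudio(data: Uint8Array): void;

const client = new LiveSpeechClient({
  region: 'ap-northeast-2',
  apiKey: 'your-api-key',
});

client.setAudioHandler((data) => playAudio(data)); // PCM16 @ 24kHz

await client.connect();
await client.startSession({
  // Per the doc comment above, greeting instructions belong in prePrompt.
  prePrompt: 'You are a helpful assistant. Greet the user warmly.',
  pipelineMode: 'live', // aiSpeaksFirst is live-mode only
  aiSpeaksFirst: true,  // the AI opens the conversation before any user audio
});
```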
package/dist/index.d.ts CHANGED
@@ -23,8 +23,8 @@ declare function isValidRegion(value: string): value is Region;
 
  /**
   * Pipeline mode for audio processing
-  * - 'live': Uses Gemini Live API for end-to-end audio conversation (default)
-  * - 'composed': Uses separate STT + LLM + TTS services
+  * - 'live': Direct audio-to-audio conversation (default, lower latency)
+  * - 'composed': Uses separate STT + LLM + TTS services (more customizable)
   */
  type PipelineMode = 'live' | 'composed';
  /**
@@ -88,11 +88,18 @@ interface SessionConfig {
      language?: string;
      /**
       * Pipeline mode for audio processing
-      * - 'live': Uses Gemini Live API for end-to-end audio conversation (default, lower latency)
+      * - 'live': Direct audio-to-audio conversation (default, lower latency)
       * - 'composed': Uses separate STT + LLM + TTS services (more customizable)
       * @default "live"
       */
      pipelineMode?: PipelineMode;
+     /**
+      * Enable AI to speak first before user input (live mode only)
+      * When enabled, the AI will initiate the conversation based on the prePrompt.
+      * Make sure your prePrompt includes instructions for how the AI should greet the user.
+      * @default false
+      */
+     aiSpeaksFirst?: boolean;
  }
  /**
   * Internal resolved configuration with defaults applied
@@ -353,7 +360,7 @@ interface ServerTurnCompleteMessage extends BaseServerMessage {
  }
  /**
   * Ready message from server
-  * Indicates the Gemini Live session is ready for audio input
+  * Indicates the session is ready for audio input
   */
  interface ServerReadyMessage extends BaseServerMessage {
      type: 'ready';
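
> Note: both declaration files export the same types, so the per-event payload types listed in the README's TypeScript Types section can annotate handlers explicitly. A sketch, assuming the `client.on` callbacks line up with these exported event types:

```typescript
import { LiveSpeechClient } from '@drawdream/livespeech';
import type { ResponseEvent, AudioEvent, ErrorEvent } from '@drawdream/livespeech';

declare const client: LiveSpeechClient;             // as constructed in earlier sketches
declare function playAudio(data: Uint8Array): void; // hypothetical helper from above

client.on('response', (event: ResponseEvent) => {
  // text/isFinal per the `response` row of the Events table
  if (event.isFinal) console.log('AI (final):', event.text);
});

client.on('audio', (event: AudioEvent) => {
  // event.data: Uint8Array (PCM16), event.sampleRate: 24000
  playAudio(event.data);
});

client.on('error', (event: ErrorEvent) => {
  console.error(`[${event.code}] ${event.message}`);
});
```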
package/dist/index.js CHANGED
@@ -734,6 +734,9 @@ var LiveSpeechClient = class {
        startMessage.language = config.language;
      }
      startMessage.pipelineMode = config?.pipelineMode ?? "live";
+     if (config?.aiSpeaksFirst) {
+       startMessage.aiSpeaksFirst = config.aiSpeaksFirst;
+     }
      this.connection.send(startMessage);
    });
  }
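
> Note: the flag is attached to the start message only when truthy, so `aiSpeaksFirst: false` never appears on the wire, which presumably keeps the message compatible with servers that predate the field. A hypothetical reconstruction of the message shape from the fields visible in this hunk (the ESM build below carries the identical change):

```typescript
// Hypothetical: reconstructed from the fields visible in this hunk;
// the SDK's actual internal type may differ.
interface StartSessionMessage {
  type: 'startSession';              // assumed discriminant, not shown in the hunk
  prePrompt?: string;                // set elsewhere from SessionConfig
  language?: string;                 // only set when config.language is given
  pipelineMode: 'live' | 'composed'; // defaults to 'live'
  aiSpeaksFirst?: boolean;           // omitted unless true
}
```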
package/dist/index.mjs CHANGED
@@ -695,6 +695,9 @@ var LiveSpeechClient = class {
        startMessage.language = config.language;
      }
      startMessage.pipelineMode = config?.pipelineMode ?? "live";
+     if (config?.aiSpeaksFirst) {
+       startMessage.aiSpeaksFirst = config.aiSpeaksFirst;
+     }
      this.connection.send(startMessage);
    });
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@drawdream/livespeech",
-   "version": "0.1.2",
+   "version": "0.1.3",
    "description": "Real-time speech-to-speech AI conversation SDK",
    "main": "dist/index.js",
    "module": "dist/index.mjs",