@drawdream/livespeech 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +262 -84
- package/dist/index.d.mts +75 -62
- package/dist/index.d.ts +75 -62
- package/dist/index.js +38 -37
- package/dist/index.mjs +38 -37
- package/package.json +1 -1
package/README.md
CHANGED
````diff
@@ -5,6 +5,15 @@
 
 A TypeScript/JavaScript SDK for real-time speech-to-speech AI conversations.
 
+## Features
+
+- 🎙️ **Real-time Voice Conversations** - Natural, low-latency voice interactions
+- 🌐 **Multi-language Support** - Korean, English, Japanese, Chinese, and more
+- 🔊 **Streaming Audio** - Send and receive audio in real-time
+- 📝 **Live Transcription** - Get transcriptions of both user and AI speech
+- 🔄 **Auto-reconnection** - Automatic recovery from network issues
+- 🌐 **Browser & Node.js** - Works in both environments
+
 ## Installation
 
 ```bash
@@ -18,137 +27,306 @@ pnpm add @drawdream/livespeech
 ## Quick Start
 
 ```typescript
-import { LiveSpeechClient
+import { LiveSpeechClient } from '@drawdream/livespeech';
 
 const client = new LiveSpeechClient({
-region: 'ap-northeast-2',
+  region: 'ap-northeast-2',
   apiKey: 'your-api-key',
 });
 
-//
-client.
-console.log(
+// Set up event handlers
+client.setUserTranscriptHandler((text) => {
+  console.log('You:', text);
 });
 
 client.setResponseHandler((text, isFinal) => {
-console.log(
+  console.log('AI:', text);
 });
 
 client.setAudioHandler((audioData) => {
-//
+  playAudio(audioData); // PCM16 @ 24kHz
+});
+
+client.setErrorHandler((error) => {
+  console.error('Error:', error.message);
 });
 
-// Connect and start
+// Connect and start conversation
 await client.connect();
 await client.startSession({
   prePrompt: 'You are a helpful assistant.',
+  language: 'ko-KR',
 });
 
-//
-client.
+// Stream audio
+client.audioStart();
+client.sendAudioChunk(pcmData); // PCM16 @ 16kHz
+client.audioEnd();
+
+// Cleanup
+await client.endSession();
+client.disconnect();
 ```
 
-##
+## Audio Flow
 
-
+```
+connect() → startSession() → audioStart() → sendAudioChunk()* → audioEnd() → endSession()
+```
 
-
+| Step | Description |
+|------|-------------|
+| `connect()` | Establish WebSocket connection |
+| `startSession(config)` | Start conversation with optional system prompt |
+| `audioStart()` | Begin audio streaming |
+| `sendAudioChunk(data)` | Send PCM16 audio (call multiple times) |
+| `audioEnd()` | End streaming, triggers AI response |
+| `endSession()` | End conversation |
+| `disconnect()` | Close connection |
 
-
-
-
-
+## Configuration
+
+```typescript
+const client = new LiveSpeechClient({
+  region: 'ap-northeast-2',    // Required: Seoul region
+  apiKey: 'your-api-key',      // Required: Your API key
+  autoReconnect: true,         // Auto-reconnect on disconnect
+  maxReconnectAttempts: 5,     // Maximum reconnection attempts
+  debug: false,                // Enable debug logging
+});
 
-
+await client.startSession({
+  prePrompt: 'You are a helpful assistant.',
+  language: 'ko-KR',  // Language: ko-KR, en-US, ja-JP, etc.
+});
+```
 
-
+## Events
 
-
-
-| `
-| `
-| `
-| `
-| `
-| `
-| `
+| Event | Description | Key Properties |
+|-------|-------------|----------------|
+| `connected` | Connection established | `connectionId` |
+| `disconnected` | Connection closed | `reason`, `code` |
+| `sessionStarted` | Session created | `sessionId` |
+| `ready` | Ready for audio input | `timestamp` |
+| `userTranscript` | Your speech transcribed | `text` |
+| `response` | AI's response text | `text`, `isFinal` |
+| `audio` | AI's audio output | `data`, `sampleRate` |
+| `turnComplete` | AI finished speaking | `timestamp` |
+| `error` | Error occurred | `code`, `message` |
 
-
+### Simple Handlers
 
-
-
-
-
-
-| `endSession()` | End the current session |
-| `sendAudio(data, options?)` | Send audio data to be transcribed |
+```typescript
+// Your speech transcription
+client.setUserTranscriptHandler((text) => {
+  console.log('You said:', text);
+});
 
-
+// AI's text response
+client.setResponseHandler((text, isFinal) => {
+  console.log('AI:', text, isFinal ? '(done)' : '...');
+});
+
+// AI's audio output
+client.setAudioHandler((data: Uint8Array) => {
+  // data: PCM16 audio
+  // Sample rate: 24000 Hz
+  playAudio(data);
+});
+
+// Error handling
+client.setErrorHandler((error) => {
+  console.error(`Error [${error.code}]: ${error.message}`);
+});
+```
+
+### Full Event API
 
 ```typescript
-
-
-
-
-client.
-
-
-
-client.on('
-
-
-
-client.on('response', (event) => {
-
-
-
+client.on('connected', (event) => {
+  console.log('Connected:', event.connectionId);
+});
+
+client.on('ready', () => {
+  console.log('Ready for audio');
+});
+
+client.on('userTranscript', (event) => {
+  console.log('You:', event.text);
+});
+
+client.on('response', (event) => {
+  console.log('AI:', event.text, event.isFinal);
+});
+
+client.on('audio', (event) => {
+  // event.data: Uint8Array (PCM16)
+  // event.sampleRate: 24000
+  playAudio(event.data);
+});
+
+client.on('turnComplete', () => {
+  console.log('AI finished speaking');
+});
+
+client.on('error', (event) => {
+  console.error('Error:', event.code, event.message);
+});
 ```
 
-
+## Audio Format
 
-
-|--------|------|---------|-------------|
-| `prePrompt` | `string` | **required** | System prompt for the AI |
-| `voiceId` | `string` | `'en-US-Standard-A'` | TTS voice ID |
-| `languageCode` | `string` | `'en-US'` | Language for STT |
-| `inputFormat` | `AudioFormat` | `'pcm16'` | Input audio format |
-| `outputFormat` | `AudioFormat` | `'pcm16'` | Output audio format |
-| `sampleRate` | `number` | `16000` | Sample rate in Hz |
-| `metadata` | `Record<string,string>` | `{}` | Custom metadata |
+### Input (Your Microphone)
 
-
+| Property | Value |
+|----------|-------|
+| Format | PCM16 (16-bit signed, little-endian) |
+| Sample Rate | 16,000 Hz |
+| Channels | 1 (Mono) |
+| Chunk Size | ~3200 bytes (100ms) |
+
+### Output (AI Response)
+
+| Property | Value |
+|----------|-------|
+| Format | PCM16 (16-bit signed, little-endian) |
+| Sample Rate | 24,000 Hz |
+| Channels | 1 (Mono) |
+
+## Browser Example
+
+```typescript
+import { LiveSpeechClient, float32ToInt16, int16ToUint8 } from '@drawdream/livespeech';
+
+const client = new LiveSpeechClient({
+  region: 'ap-northeast-2',
+  apiKey: 'your-api-key',
+});
+
+// Handlers
+client.setUserTranscriptHandler((text) => console.log('You:', text));
+client.setResponseHandler((text) => console.log('AI:', text));
+client.setAudioHandler((data) => playAudioChunk(data));
+
+// Connect
+await client.connect();
+await client.startSession({ prePrompt: 'You are a helpful assistant.' });
+
+// Capture microphone
+const stream = await navigator.mediaDevices.getUserMedia({
+  audio: { sampleRate: 16000, channelCount: 1 }
+});
+
+const audioContext = new AudioContext({ sampleRate: 16000 });
+const source = audioContext.createMediaStreamSource(stream);
+const processor = audioContext.createScriptProcessor(4096, 1, 1);
+
+processor.onaudioprocess = (e) => {
+  const float32 = e.inputBuffer.getChannelData(0);
+  const int16 = float32ToInt16(float32);
+  const pcm = int16ToUint8(int16);
+  client.sendAudioChunk(pcm);
+};
+
+source.connect(processor);
+processor.connect(audioContext.destination);
 
-
+// Start streaming
+client.audioStart();
+
+// Stop later
+client.audioEnd();
+stream.getTracks().forEach(track => track.stop());
+```
+
+## Audio Utilities
 
 ```typescript
 import {
-
-
-
-
-  wrapPcmInWav,
+  float32ToInt16,  // Web Audio Float32 → PCM16
+  int16ToFloat32,  // PCM16 → Float32
+  int16ToUint8,    // Int16Array → Uint8Array
+  uint8ToInt16,    // Uint8Array → Int16Array
+  wrapPcmInWav,    // Create WAV file
+  AudioEncoder,    // Base64 encoding/decoding
 } from '@drawdream/livespeech';
 
-// Convert
-const
+// Convert Web Audio to PCM16 for sending
+const float32 = audioBuffer.getChannelData(0);
+const int16 = float32ToInt16(float32);
+const pcmBytes = int16ToUint8(int16);
+client.sendAudioChunk(pcmBytes);
 
-//
-const
+// Convert received PCM16 to Web Audio
+const receivedInt16 = uint8ToInt16(audioEvent.data);
+const float32Data = int16ToFloat32(receivedInt16);
 ```
 
-##
+## Error Handling
 
-
+```typescript
+client.on('error', (event) => {
+  switch (event.code) {
+    case 'authentication_failed':
+      console.error('Invalid API key');
+      break;
+    case 'connection_timeout':
+      console.error('Connection timed out');
+      break;
+    case 'rate_limit':
+      console.error('Rate limit exceeded');
+      break;
+    default:
+      console.error(`Error: ${event.message}`);
+  }
+});
 
-
-
-
+client.on('disconnected', (event) => {
+  if (event.reason === 'error') {
+    console.log('Will auto-reconnect...');
+  }
+});
 
-
-
-
-
-
+client.on('reconnecting', (event) => {
+  console.log(`Reconnecting ${event.attempt}/${event.maxAttempts}`);
+});
+```
+
+## Client Properties
+
+| Property | Type | Description |
+|----------|------|-------------|
+| `isConnected` | `boolean` | Connection status |
+| `hasActiveSession` | `boolean` | Session status |
+| `isAudioStreaming` | `boolean` | Streaming status |
+| `connectionId` | `string \| null` | Current connection ID |
+| `currentSessionId` | `string \| null` | Current session ID |
+
+## Regions
+
+| Region | Code | Location |
+|--------|------|----------|
+| Asia Pacific (Seoul) | `ap-northeast-2` | Korea |
+
+## TypeScript Types
+
+```typescript
+import type {
+  LiveSpeechConfig,
+  SessionConfig,
+  LiveSpeechEvent,
+  ConnectedEvent,
+  DisconnectedEvent,
+  SessionStartedEvent,
+  ReadyEvent,
+  UserTranscriptEvent,
+  ResponseEvent,
+  AudioEvent,
+  TurnCompleteEvent,
+  ErrorEvent,
+  ErrorCode,
+} from '@drawdream/livespeech';
 ```
 
 ## License
````
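The README examples above call `playAudio` / `playAudioChunk` without defining them. A minimal sketch of what such a helper could look like for the documented output format (PCM16 mono at 24 kHz), built on the package's own `uint8ToInt16` and `int16ToFloat32` converters; the gapless-scheduling logic is an assumption of this sketch, not part of the SDK:

```typescript
import { uint8ToInt16, int16ToFloat32 } from '@drawdream/livespeech';

// Hypothetical playback helper for the README's playAudio(data) calls.
// Assumes PCM16 mono at 24 kHz, per the "Audio Format" output table.
const playbackCtx = new AudioContext({ sampleRate: 24000 });
let playheadTime = 0;

function playAudio(data: Uint8Array): void {
  // Decode the raw PCM16 bytes into Float32 samples for Web Audio
  const float32 = int16ToFloat32(uint8ToInt16(data));
  const buffer = playbackCtx.createBuffer(1, float32.length, 24000);
  buffer.copyToChannel(float32, 0);

  const source = playbackCtx.createBufferSource();
  source.buffer = buffer;
  source.connect(playbackCtx.destination);

  // Schedule chunks back-to-back so streamed audio plays without gaps
  playheadTime = Math.max(playheadTime, playbackCtx.currentTime);
  source.start(playheadTime);
  playheadTime += buffer.duration;
}
```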
package/dist/index.d.mts
CHANGED
```diff
@@ -21,6 +21,12 @@ declare function getEndpointForRegion(region: Region): string;
  */
 declare function isValidRegion(value: string): value is Region;
 
+/**
+ * Pipeline mode for audio processing
+ * - 'live': Direct audio-to-audio conversation (default, lower latency)
+ * - 'composed': Uses separate STT + LLM + TTS services (more customizable)
+ */
+type PipelineMode = 'live' | 'composed';
 /**
  * Configuration options for the LiveSpeech client
  *
@@ -75,6 +81,25 @@ interface SessionConfig {
      * System prompt for the AI assistant
      */
     prePrompt?: string;
+    /**
+     * Language code for speech recognition (e.g., "en-US", "ko-KR")
+     * @default "en-US"
+     */
+    language?: string;
+    /**
+     * Pipeline mode for audio processing
+     * - 'live': Direct audio-to-audio conversation (default, lower latency)
+     * - 'composed': Uses separate STT + LLM + TTS services (more customizable)
+     * @default "live"
+     */
+    pipelineMode?: PipelineMode;
+    /**
+     * Enable AI to speak first before user input (live mode only)
+     * When enabled, the AI will initiate the conversation based on the prePrompt.
+     * Make sure your prePrompt includes instructions for how the AI should greet the user.
+     * @default false
+     */
+    aiSpeaksFirst?: boolean;
 }
 /**
  * Internal resolved configuration with defaults applied
@@ -92,7 +117,7 @@ interface ResolvedConfig {
 /**
  * Event types emitted by the LiveSpeech client
  */
-type LiveSpeechEventType = 'connected' | 'disconnected' | '
+type LiveSpeechEventType = 'connected' | 'disconnected' | 'reconnecting' | 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'error';
 /**
  * Event payload for 'connected' event
  */
@@ -131,34 +156,19 @@ interface SessionEndedEvent {
     timestamp: string;
 }
 /**
- * Event payload for 'streamingStarted' event
- */
-interface StreamingStartedEvent {
-    type: 'streamingStarted';
-    timestamp: string;
-}
-/**
- * Event payload for 'speechStart' event - VAD detected speech begin
- */
-interface SpeechStartEvent {
-    type: 'speechStart';
-    timestamp: string;
-}
-/**
- * Event payload for 'speechEnd' event - VAD detected speech end
+ * Event payload for 'ready' event
  */
-interface SpeechEndEvent {
-    type: 'speechEnd';
+interface ReadyEvent {
+    type: 'ready';
     timestamp: string;
 }
 /**
- * Event payload for 'transcript' event
+ * Event payload for 'userTranscript' event
+ * User's speech transcription
  */
-interface TranscriptEvent {
-    type: 'transcript';
+interface UserTranscriptEvent {
+    type: 'userTranscript';
     text: string;
-    isFinal: boolean;
-    confidence?: number;
     timestamp: string;
 }
 /**
@@ -204,14 +214,22 @@ interface ReconnectingEvent {
     delay: number;
     timestamp: string;
 }
+/**
+ * Event payload for 'turnComplete' event (both modes)
+ * Indicates the AI has finished its response turn
+ */
+interface TurnCompleteEvent {
+    type: 'turnComplete';
+    timestamp: string;
+}
 /**
  * Union type of all event payloads
  */
-type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent |
+type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | ReconnectingEvent | SessionStartedEvent | SessionEndedEvent | ReadyEvent | UserTranscriptEvent | ResponseEvent | AudioEvent | TurnCompleteEvent | ErrorEvent;
 /**
  * Simplified event handlers for common use cases
  */
-type
+type UserTranscriptHandler = (text: string) => void;
 type ResponseHandler = (text: string, isFinal: boolean) => void;
 type AudioHandler = (data: Uint8Array) => void;
 type ErrorHandler = (error: ErrorEvent) => void;
@@ -223,7 +241,7 @@ type ClientMessageType = 'startSession' | 'endSession' | 'audioStart' | 'audioCh
 /**
  * WebSocket message types received from server
  */
-type ServerMessageType = 'sessionStarted' | 'sessionEnded' | '
+type ServerMessageType = 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'error' | 'pong';
 /**
  * Base interface for client messages
  */
@@ -236,6 +254,8 @@ interface BaseClientMessage {
 interface StartSessionMessage extends BaseClientMessage {
     action: 'startSession';
     prePrompt?: string;
+    language?: string;
+    pipelineMode?: 'live' | 'composed';
 }
 /**
  * End session message
@@ -294,31 +314,11 @@ interface ServerSessionEndedMessage extends BaseServerMessage {
     sessionId: string;
 }
 /**
- *
+ * User transcript message from server (user's speech transcription)
  */
-interface
-    type: '
-}
-/**
- * Speech start message - VAD detected speech begin
- */
-interface ServerSpeechStartMessage extends BaseServerMessage {
-    type: 'speechStart';
-}
-/**
- * Speech end message - VAD detected speech end
- */
-interface ServerSpeechEndMessage extends BaseServerMessage {
-    type: 'speechEnd';
-}
-/**
- * Transcript message from server
- */
-interface ServerTranscriptMessage extends BaseServerMessage {
-    type: 'transcript';
+interface ServerUserTranscriptMessage extends BaseServerMessage {
+    type: 'userTranscript';
     text: string;
-    isFinal: boolean;
-    confidence?: number;
 }
 /**
  * Response message from server
@@ -351,10 +351,24 @@ interface ServerErrorMessage extends BaseServerMessage {
 interface ServerPongMessage extends BaseServerMessage {
     type: 'pong';
 }
+/**
+ * Turn complete message from server
+ * Indicates the AI has finished its response turn
+ */
+interface ServerTurnCompleteMessage extends BaseServerMessage {
+    type: 'turnComplete';
+}
+/**
+ * Ready message from server
+ * Indicates the session is ready for audio input
+ */
+interface ServerReadyMessage extends BaseServerMessage {
+    type: 'ready';
+}
 /**
  * Union type of all server messages
  */
-type ServerMessage = ServerSessionStartedMessage | ServerSessionEndedMessage |
+type ServerMessage = ServerSessionStartedMessage | ServerSessionEndedMessage | ServerReadyMessage | ServerUserTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerTurnCompleteMessage | ServerErrorMessage | ServerPongMessage;
 
 /**
  * Connection state
@@ -367,16 +381,15 @@ type ConnectionState = 'disconnected' | 'connecting' | 'connected' | 'reconnecti
 type LiveSpeechEventMap = {
     connected: ConnectedEvent;
     disconnected: DisconnectedEvent;
+    reconnecting: ReconnectingEvent;
     sessionStarted: SessionStartedEvent;
     sessionEnded: SessionEndedEvent;
-    streamingStarted: StreamingStartedEvent;
-    speechStart: SpeechStartEvent;
-    speechEnd: SpeechEndEvent;
-    transcript: TranscriptEvent;
+    ready: ReadyEvent;
+    userTranscript: UserTranscriptEvent;
     response: ResponseEvent;
     audio: AudioEvent;
+    turnComplete: TurnCompleteEvent;
     error: ErrorEvent;
-    reconnecting: ReconnectingEvent;
 };
 /**
  * LiveSpeech client for real-time speech-to-speech AI conversations
@@ -389,7 +402,7 @@ declare class LiveSpeechClient {
     private sessionId;
     private isStreaming;
     private readonly eventListeners;
-    private transcriptHandler;
+    private userTranscriptHandler;
     private responseHandler;
     private audioHandler;
     private errorHandler;
@@ -455,17 +468,17 @@ declare class LiveSpeechClient {
      */
    off<K extends keyof LiveSpeechEventMap>(event: K, listener: (event: LiveSpeechEventMap[K]) => void): void;
    /**
-     * Set transcript handler (simplified)
-     */
-    setTranscriptHandler(handler: TranscriptHandler): void;
-    /**
-     * Set response handler (simplified)
+     * Set response handler
      */
    setResponseHandler(handler: ResponseHandler): void;
    /**
     * Set audio handler (simplified)
     */
    setAudioHandler(handler: AudioHandler): void;
+    /**
+     * Set user transcript handler
+     */
+    setUserTranscriptHandler(handler: UserTranscriptHandler): void;
    /**
     * Set error handler (simplified)
     */
@@ -567,4 +580,4 @@ declare class AudioEncoder {
     wrapWav(data: Uint8Array): Uint8Array;
 }
 
-export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type AudioHandler, type ClientMessage, type ClientMessageType, type ConnectedEvent, type ConnectionState, type DisconnectReason, type DisconnectedEvent, type ErrorCode, type ErrorEvent, type ErrorHandler, LiveSpeechClient, type LiveSpeechConfig, type LiveSpeechEvent, type LiveSpeechEventMap, type LiveSpeechEventType, type ReconnectingEvent, Region, Region as RegionType, type ResolvedConfig, type ResponseEvent, type ResponseHandler, type ServerMessage, type ServerMessageType, type SessionConfig, type SessionEndedEvent, type SessionStartedEvent, type
+export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type AudioHandler, type ClientMessage, type ClientMessageType, type ConnectedEvent, type ConnectionState, type DisconnectReason, type DisconnectedEvent, type ErrorCode, type ErrorEvent, type ErrorHandler, LiveSpeechClient, type LiveSpeechConfig, type LiveSpeechEvent, type LiveSpeechEventMap, type LiveSpeechEventType, type PipelineMode, type ReadyEvent, type ReconnectingEvent, Region, Region as RegionType, type ResolvedConfig, type ResponseEvent, type ResponseHandler, type ServerMessage, type ServerMessageType, type SessionConfig, type SessionEndedEvent, type SessionStartedEvent, type TurnCompleteEvent, type UserTranscriptEvent, type UserTranscriptHandler, createWavHeader, decodeBase64ToAudio, encodeAudioToBase64, extractPcmFromWav, float32ToInt16, getEndpointForRegion, int16ToFloat32, int16ToUint8, isValidRegion, uint8ToInt16, wrapPcmInWav };
```
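Read together, the new `SessionConfig` fields compose as follows; a usage sketch based on the declarations above, with illustrative values:

```typescript
import { LiveSpeechClient, type SessionConfig } from '@drawdream/livespeech';

// Exercises the SessionConfig fields added in 0.1.3 (values illustrative).
const session: SessionConfig = {
  prePrompt: 'You are a helpful assistant. Greet the user warmly in Korean.',
  language: 'ko-KR',     // speech-recognition language; defaults to "en-US"
  pipelineMode: 'live',  // 'live' (default, lower latency) or 'composed'
  aiSpeaksFirst: true,   // live mode only; the greeting comes from prePrompt
};

const client = new LiveSpeechClient({ region: 'ap-northeast-2', apiKey: 'your-api-key' });
await client.connect();
await client.startSession(session);
```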
package/dist/index.d.ts
CHANGED
Identical to the package/dist/index.d.mts changes above; the CJS and ESM type declaration files carry the same content, so the same hunks apply.
package/dist/index.js
CHANGED
```diff
@@ -46,7 +46,7 @@ var Region = {
 };
 var REGION_ENDPOINTS = {
   "ap-northeast-2": "wss://talk.drawdream.co.kr",
-  "us-west-2": "wss://talk
+  "us-west-2": "wss://talk.drawdream.ca"
   // Coming soon
 };
 function getEndpointForRegion(region) {
@@ -614,7 +614,7 @@ var LiveSpeechClient = class {
   // Event listeners using a simple map
   eventListeners = /* @__PURE__ */ new Map();
   // Simplified handlers
-  transcriptHandler = null;
+  userTranscriptHandler = null;
   responseHandler = null;
   audioHandler = null;
   errorHandler = null;
@@ -730,6 +730,13 @@ var LiveSpeechClient = class {
       if (config?.prePrompt) {
         startMessage.prePrompt = config.prePrompt;
       }
+      if (config?.language) {
+        startMessage.language = config.language;
+      }
+      startMessage.pipelineMode = config?.pipelineMode ?? "live";
+      if (config?.aiSpeaksFirst) {
+        startMessage.aiSpeaksFirst = config.aiSpeaksFirst;
+      }
       this.connection.send(startMessage);
     });
   }
@@ -819,13 +826,7 @@ var LiveSpeechClient = class {
     }
   }
   /**
-   * Set transcript handler (simplified)
-   */
-  setTranscriptHandler(handler) {
-    this.transcriptHandler = handler;
-  }
-  /**
-   * Set response handler (simplified)
+   * Set response handler
    */
   setResponseHandler(handler) {
     this.responseHandler = handler;
@@ -836,6 +837,12 @@ var LiveSpeechClient = class {
   setAudioHandler(handler) {
     this.audioHandler = handler;
   }
+  /**
+   * Set user transcript handler
+   */
+  setUserTranscriptHandler(handler) {
+    this.userTranscriptHandler = handler;
+  }
   /**
    * Set error handler (simplified)
    */
@@ -914,36 +921,12 @@ var LiveSpeechClient = class {
           timestamp: message.timestamp
         });
         break;
-      case "streamingStarted":
-        this.emit("streamingStarted", {
-          type: "streamingStarted",
-          timestamp: message.timestamp
-        });
-        break;
-      case "speechStart":
-        this.emit("speechStart", {
-          type: "speechStart",
-          timestamp: message.timestamp
-        });
-        break;
-      case "speechEnd":
-        this.emit("speechEnd", {
-          type: "speechEnd",
-          timestamp: message.timestamp
-        });
-        break;
-      case "transcript": {
-        const transcriptEvent = {
-          type: "transcript",
-          text: message.text,
-          isFinal: message.isFinal,
+      case "ready": {
+        const readyEvent = {
+          type: "ready",
           timestamp: message.timestamp
         };
-        if (message.confidence !== undefined) {
-          transcriptEvent.confidence = message.confidence;
-        }
-        this.emit("transcript", transcriptEvent);
-        this.transcriptHandler?.(message.text, message.isFinal);
+        this.emit("ready", readyEvent);
         break;
       }
       case "response": {
@@ -970,6 +953,24 @@ var LiveSpeechClient = class {
         this.audioHandler?.(audioData);
         break;
       }
+      case "userTranscript": {
+        const userTranscriptEvent = {
+          type: "userTranscript",
+          text: message.text,
+          timestamp: message.timestamp
+        };
+        this.emit("userTranscript", userTranscriptEvent);
+        this.userTranscriptHandler?.(message.text);
+        break;
+      }
+      case "turnComplete": {
+        const turnCompleteEvent = {
+          type: "turnComplete",
+          timestamp: message.timestamp
+        };
+        this.emit("turnComplete", turnCompleteEvent);
+        break;
+      }
       case "error":
         this.handleError(message.code, message.message);
         break;
```
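The new `ready` and `turnComplete` cases make simple turn-taking possible on the client. A sketch, under the assumption (stated in the README's step table) that `audioEnd()` triggers the AI response; `captureOneUtterance()` is a hypothetical helper, not an SDK function:

```typescript
// Turn-taking sketch built on the events added in 0.1.3.
// captureOneUtterance() is hypothetical: record the microphone until the
// user stops speaking, then resolve with PCM16 @ 16 kHz bytes.
declare function captureOneUtterance(): Promise<Uint8Array>;

async function sendUserTurn(): Promise<void> {
  client.audioStart();
  client.sendAudioChunk(await captureOneUtterance());
  client.audioEnd(); // per the README, this triggers the AI response
}

client.on('ready', () => void sendUserTurn());        // session ready: first user turn
client.on('turnComplete', () => void sendUserTurn()); // AI finished: next user turn
```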
package/dist/index.mjs
CHANGED
Identical to the package/dist/index.js changes above, applied to the ESM build; the hunks differ only in their offsets (@@ -7,7 +7,7 @@, @@ -575,7 +575,7 @@, @@ -691,6 +691,13 @@, @@ -780,13 +787,7 @@, @@ -797,6 +798,12 @@, @@ -875,36 +882,12 @@, @@ -931,6 +914,24 @@).