@drawdream/livespeech 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +51 -17
- package/dist/index.d.mts +68 -62
- package/dist/index.d.ts +68 -62
- package/dist/index.js +35 -37
- package/dist/index.mjs +35 -37
- package/package.json +1 -1
package/README.md  CHANGED

@@ -26,12 +26,12 @@ const client = new LiveSpeechClient({
 });
 
 // Handle events
-client.
-  console.log(`
+client.setUserTranscriptHandler((text) => {
+  console.log(`You said: ${text}`);
 });
 
-client.
-  console.log(`AI
+client.setTranscriptHandler((text, isFinal) => {
+  console.log(`AI Transcript: ${text}`);
 });
 
 client.setAudioHandler((audioData) => {
@@ -42,10 +42,44 @@ client.setAudioHandler((audioData) => {
 await client.connect();
 await client.startSession({
   prePrompt: 'You are a helpful assistant.',
+  // pipelineMode: 'live' is the default
+});
+
+// Start streaming and send audio
+client.audioStart();
+client.sendAudioChunk(audioBuffer);
+```
+
+## Pipeline Modes
+
+The SDK supports two pipeline modes for audio processing:
+
+### Live Mode (Default)
+
+Uses Gemini 2.5 Flash Live API for end-to-end audio conversation. This provides:
+- **Lower latency** - Direct audio-to-audio processing
+- **Natural conversation** - Built-in voice activity detection and turn-taking
+- **Real-time transcription** - Both user and AI speech are transcribed
+
+```typescript
+await client.startSession({
+  prePrompt: 'You are a helpful assistant.',
+  pipelineMode: 'live', // Default, can be omitted
 });
+```
+
+### Composed Mode
 
-
-
+Uses separate STT + LLM + TTS services for more customization:
+- **More control** - Separate services for each step
+- **Custom voices** - Use different TTS voices
+- **Text responses** - Access to intermediate text responses
+
+```typescript
+await client.startSession({
+  prePrompt: 'You are a helpful assistant.',
+  pipelineMode: 'composed',
+});
 ```
 
 ## API Reference
@@ -87,8 +121,9 @@ The SDK provides built-in region support, so you don't need to remember endpoint
 
 ```typescript
 // Simple handlers
-client.
-client.
+client.setUserTranscriptHandler((text) => {}); // User's speech transcription
+client.setTranscriptHandler((text, isFinal) => {}); // AI's speech transcription (live mode)
+client.setResponseHandler((text, isFinal) => {}); // AI text response (composed mode)
 client.setAudioHandler((audioData) => {});
 client.setErrorHandler((error) => {});
 
@@ -97,24 +132,23 @@ client.on('connected', (event) => {});
 client.on('disconnected', (event) => {});
 client.on('sessionStarted', (event) => {});
 client.on('sessionEnded', (event) => {});
-client.on('
-client.on('
+client.on('userTranscript', (event) => {}); // User's speech transcription
+client.on('transcript', (event) => {}); // AI's speech transcription
+client.on('response', (event) => {}); // AI text response
 client.on('audio', (event) => {});
 client.on('error', (event) => {});
 client.on('reconnecting', (event) => {});
+client.on('ready', (event) => {}); // Gemini Live ready (live mode)
+client.on('turnComplete', (event) => {}); // AI finished speaking (live mode)
 ```
 
 ### SessionConfig
 
 | Option | Type | Default | Description |
 |--------|------|---------|-------------|
-| `prePrompt` | `string` |
-| `
-| `
-| `inputFormat` | `AudioFormat` | `'pcm16'` | Input audio format |
-| `outputFormat` | `AudioFormat` | `'pcm16'` | Output audio format |
-| `sampleRate` | `number` | `16000` | Sample rate in Hz |
-| `metadata` | `Record<string,string>` | `{}` | Custom metadata |
+| `prePrompt` | `string` | - | System prompt for the AI |
+| `language` | `string` | `'en-US'` | Language code for speech (e.g., "ko-KR") |
+| `pipelineMode` | `'live' \| 'composed'` | `'live'` | Audio processing mode |
 
 ## Audio Utilities
 
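Taken together, the README changes describe the 0.1.2 call flow below. This sketch only assembles calls documented in the diff above; client construction is elided, and the `main` wrapper is illustrative rather than part of the package:

```typescript
import { LiveSpeechClient } from '@drawdream/livespeech';

declare const client: LiveSpeechClient; // construction elided; see the README setup

async function main(): Promise<void> {
  // New in 0.1.2: the user's speech arrives via its own handler (no isFinal).
  client.setUserTranscriptHandler((text) => console.log(`You said: ${text}`));
  // Composed mode surfaces intermediate text responses.
  client.setResponseHandler((text, isFinal) => {
    if (isFinal) console.log(`AI: ${text}`);
  });
  client.on('turnComplete', () => console.log('AI finished its turn'));

  await client.connect();
  await client.startSession({
    prePrompt: 'You are a helpful assistant.',
    language: 'ko-KR',        // new in 0.1.2; defaults to 'en-US'
    pipelineMode: 'composed', // new in 0.1.2; defaults to 'live'
  });

  client.audioStart();
  // Feed microphone data here, e.g. client.sendAudioChunk(chunk);
}
```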
package/dist/index.d.mts  CHANGED

@@ -21,6 +21,12 @@ declare function getEndpointForRegion(region: Region): string;
  */
 declare function isValidRegion(value: string): value is Region;
 
+/**
+ * Pipeline mode for audio processing
+ * - 'live': Uses Gemini Live API for end-to-end audio conversation (default)
+ * - 'composed': Uses separate STT + LLM + TTS services
+ */
+type PipelineMode = 'live' | 'composed';
 /**
  * Configuration options for the LiveSpeech client
  *
@@ -75,6 +81,18 @@ interface SessionConfig {
      * System prompt for the AI assistant
      */
     prePrompt?: string;
+    /**
+     * Language code for speech recognition (e.g., "en-US", "ko-KR")
+     * @default "en-US"
+     */
+    language?: string;
+    /**
+     * Pipeline mode for audio processing
+     * - 'live': Uses Gemini Live API for end-to-end audio conversation (default, lower latency)
+     * - 'composed': Uses separate STT + LLM + TTS services (more customizable)
+     * @default "live"
+     */
+    pipelineMode?: PipelineMode;
 }
 /**
  * Internal resolved configuration with defaults applied
@@ -92,7 +110,7 @@ interface ResolvedConfig {
 /**
  * Event types emitted by the LiveSpeech client
  */
-type LiveSpeechEventType = 'connected' | 'disconnected' | '
+type LiveSpeechEventType = 'connected' | 'disconnected' | 'reconnecting' | 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'error';
 /**
  * Event payload for 'connected' event
  */
@@ -131,34 +149,19 @@ interface SessionEndedEvent {
     timestamp: string;
 }
 /**
- * Event payload for '
+ * Event payload for 'ready' event
  */
-interface
-    type: '
+interface ReadyEvent {
+    type: 'ready';
     timestamp: string;
 }
 /**
- * Event payload for '
+ * Event payload for 'userTranscript' event
+ * User's speech transcription
  */
-interface
-    type: '
-    timestamp: string;
-}
-/**
- * Event payload for 'speechEnd' event - VAD detected speech end
- */
-interface SpeechEndEvent {
-    type: 'speechEnd';
-    timestamp: string;
-}
-/**
- * Event payload for 'transcript' event
- */
-interface TranscriptEvent {
-    type: 'transcript';
+interface UserTranscriptEvent {
+    type: 'userTranscript';
     text: string;
-    isFinal: boolean;
-    confidence?: number;
     timestamp: string;
 }
 /**
@@ -204,14 +207,22 @@ interface ReconnectingEvent {
     delay: number;
     timestamp: string;
 }
+/**
+ * Event payload for 'turnComplete' event (both modes)
+ * Indicates the AI has finished its response turn
+ */
+interface TurnCompleteEvent {
+    type: 'turnComplete';
+    timestamp: string;
+}
 /**
  * Union type of all event payloads
  */
-type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent |
+type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | ReconnectingEvent | SessionStartedEvent | SessionEndedEvent | ReadyEvent | UserTranscriptEvent | ResponseEvent | AudioEvent | TurnCompleteEvent | ErrorEvent;
 /**
  * Simplified event handlers for common use cases
  */
-type
+type UserTranscriptHandler = (text: string) => void;
 type ResponseHandler = (text: string, isFinal: boolean) => void;
 type AudioHandler = (data: Uint8Array) => void;
 type ErrorHandler = (error: ErrorEvent) => void;
@@ -223,7 +234,7 @@ type ClientMessageType = 'startSession' | 'endSession' | 'audioStart' | 'audioCh
 /**
  * WebSocket message types received from server
  */
-type ServerMessageType = 'sessionStarted' | 'sessionEnded' | '
+type ServerMessageType = 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'error' | 'pong';
 /**
  * Base interface for client messages
  */
@@ -236,6 +247,8 @@ interface BaseClientMessage {
 interface StartSessionMessage extends BaseClientMessage {
     action: 'startSession';
     prePrompt?: string;
+    language?: string;
+    pipelineMode?: 'live' | 'composed';
 }
 /**
  * End session message
@@ -294,31 +307,11 @@ interface ServerSessionEndedMessage extends BaseServerMessage {
     sessionId: string;
 }
 /**
- *
+ * User transcript message from server (user's speech transcription)
  */
-interface
-    type: '
-}
-/**
- * Speech start message - VAD detected speech begin
- */
-interface ServerSpeechStartMessage extends BaseServerMessage {
-    type: 'speechStart';
-}
-/**
- * Speech end message - VAD detected speech end
- */
-interface ServerSpeechEndMessage extends BaseServerMessage {
-    type: 'speechEnd';
-}
-/**
- * Transcript message from server
- */
-interface ServerTranscriptMessage extends BaseServerMessage {
-    type: 'transcript';
+interface ServerUserTranscriptMessage extends BaseServerMessage {
+    type: 'userTranscript';
     text: string;
-    isFinal: boolean;
-    confidence?: number;
 }
 /**
  * Response message from server
@@ -351,10 +344,24 @@ interface ServerErrorMessage extends BaseServerMessage {
 interface ServerPongMessage extends BaseServerMessage {
     type: 'pong';
 }
+/**
+ * Turn complete message from server
+ * Indicates the AI has finished its response turn
+ */
+interface ServerTurnCompleteMessage extends BaseServerMessage {
+    type: 'turnComplete';
+}
+/**
+ * Ready message from server
+ * Indicates the Gemini Live session is ready for audio input
+ */
+interface ServerReadyMessage extends BaseServerMessage {
+    type: 'ready';
+}
 /**
  * Union type of all server messages
  */
-type ServerMessage = ServerSessionStartedMessage | ServerSessionEndedMessage |
+type ServerMessage = ServerSessionStartedMessage | ServerSessionEndedMessage | ServerReadyMessage | ServerUserTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerTurnCompleteMessage | ServerErrorMessage | ServerPongMessage;
 
 /**
  * Connection state
@@ -367,16 +374,15 @@ type ConnectionState = 'disconnected' | 'connecting' | 'connected' | 'reconnecti
 type LiveSpeechEventMap = {
     connected: ConnectedEvent;
     disconnected: DisconnectedEvent;
+    reconnecting: ReconnectingEvent;
     sessionStarted: SessionStartedEvent;
     sessionEnded: SessionEndedEvent;
-
-
-    speechEnd: SpeechEndEvent;
-    transcript: TranscriptEvent;
+    ready: ReadyEvent;
+    userTranscript: UserTranscriptEvent;
     response: ResponseEvent;
     audio: AudioEvent;
+    turnComplete: TurnCompleteEvent;
     error: ErrorEvent;
-    reconnecting: ReconnectingEvent;
 };
 /**
  * LiveSpeech client for real-time speech-to-speech AI conversations
@@ -389,7 +395,7 @@ declare class LiveSpeechClient {
     private sessionId;
     private isStreaming;
     private readonly eventListeners;
-    private
+    private userTranscriptHandler;
     private responseHandler;
     private audioHandler;
     private errorHandler;
@@ -455,17 +461,17 @@ declare class LiveSpeechClient {
      */
     off<K extends keyof LiveSpeechEventMap>(event: K, listener: (event: LiveSpeechEventMap[K]) => void): void;
     /**
-     * Set
-     */
-    setTranscriptHandler(handler: TranscriptHandler): void;
-    /**
-     * Set response handler (simplified)
+     * Set response handler
      */
     setResponseHandler(handler: ResponseHandler): void;
     /**
      * Set audio handler (simplified)
      */
     setAudioHandler(handler: AudioHandler): void;
+    /**
+     * Set user transcript handler
+     */
+    setUserTranscriptHandler(handler: UserTranscriptHandler): void;
     /**
      * Set error handler (simplified)
      */
@@ -567,4 +573,4 @@ declare class AudioEncoder {
     wrapWav(data: Uint8Array): Uint8Array;
 }
 
-export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type AudioHandler, type ClientMessage, type ClientMessageType, type ConnectedEvent, type ConnectionState, type DisconnectReason, type DisconnectedEvent, type ErrorCode, type ErrorEvent, type ErrorHandler, LiveSpeechClient, type LiveSpeechConfig, type LiveSpeechEvent, type LiveSpeechEventMap, type LiveSpeechEventType, type ReconnectingEvent, Region, Region as RegionType, type ResolvedConfig, type ResponseEvent, type ResponseHandler, type ServerMessage, type ServerMessageType, type SessionConfig, type SessionEndedEvent, type SessionStartedEvent, type
+export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type AudioHandler, type ClientMessage, type ClientMessageType, type ConnectedEvent, type ConnectionState, type DisconnectReason, type DisconnectedEvent, type ErrorCode, type ErrorEvent, type ErrorHandler, LiveSpeechClient, type LiveSpeechConfig, type LiveSpeechEvent, type LiveSpeechEventMap, type LiveSpeechEventType, type PipelineMode, type ReadyEvent, type ReconnectingEvent, Region, Region as RegionType, type ResolvedConfig, type ResponseEvent, type ResponseHandler, type ServerMessage, type ServerMessageType, type SessionConfig, type SessionEndedEvent, type SessionStartedEvent, type TurnCompleteEvent, type UserTranscriptEvent, type UserTranscriptHandler, createWavHeader, decodeBase64ToAudio, encodeAudioToBase64, extractPcmFromWav, float32ToInt16, getEndpointForRegion, int16ToFloat32, int16ToUint8, isValidRegion, uint8ToInt16, wrapPcmInWav };
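One practical effect of the reworked declarations: the `LiveSpeechEventMap` keys and the renamed payload interfaces stay in sync, so `on()` listeners remain fully typed against the new event names. A small type-level sketch (the `logEvent` helper is illustrative, not part of the package):

```typescript
import type { LiveSpeechEventMap, UserTranscriptEvent } from '@drawdream/livespeech';

// Mirrors the declared on<K> signature: the payload type follows the event name.
function logEvent<K extends keyof LiveSpeechEventMap>(
  name: K,
  event: LiveSpeechEventMap[K],
): void {
  console.log(name, event);
}

const sample: UserTranscriptEvent = {
  type: 'userTranscript',
  text: 'hello',
  timestamp: new Date().toISOString(),
};
logEvent('userTranscript', sample); // compiles
// logEvent('transcript', sample); // no longer compiles: 'transcript' left the map
```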
package/dist/index.d.ts  CHANGED

@@ -21,6 +21,12 @@ declare function getEndpointForRegion(region: Region): string;
  */
 declare function isValidRegion(value: string): value is Region;
 
+/**
+ * Pipeline mode for audio processing
+ * - 'live': Uses Gemini Live API for end-to-end audio conversation (default)
+ * - 'composed': Uses separate STT + LLM + TTS services
+ */
+type PipelineMode = 'live' | 'composed';
 /**
  * Configuration options for the LiveSpeech client
  *
@@ -75,6 +81,18 @@ interface SessionConfig {
      * System prompt for the AI assistant
      */
     prePrompt?: string;
+    /**
+     * Language code for speech recognition (e.g., "en-US", "ko-KR")
+     * @default "en-US"
+     */
+    language?: string;
+    /**
+     * Pipeline mode for audio processing
+     * - 'live': Uses Gemini Live API for end-to-end audio conversation (default, lower latency)
+     * - 'composed': Uses separate STT + LLM + TTS services (more customizable)
+     * @default "live"
+     */
+    pipelineMode?: PipelineMode;
 }
 /**
  * Internal resolved configuration with defaults applied
@@ -92,7 +110,7 @@ interface ResolvedConfig {
 /**
  * Event types emitted by the LiveSpeech client
  */
-type LiveSpeechEventType = 'connected' | 'disconnected' | '
+type LiveSpeechEventType = 'connected' | 'disconnected' | 'reconnecting' | 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'error';
 /**
  * Event payload for 'connected' event
  */
@@ -131,34 +149,19 @@ interface SessionEndedEvent {
     timestamp: string;
 }
 /**
- * Event payload for '
+ * Event payload for 'ready' event
  */
-interface
-    type: '
+interface ReadyEvent {
+    type: 'ready';
     timestamp: string;
 }
 /**
- * Event payload for '
+ * Event payload for 'userTranscript' event
+ * User's speech transcription
  */
-interface
-    type: '
-    timestamp: string;
-}
-/**
- * Event payload for 'speechEnd' event - VAD detected speech end
- */
-interface SpeechEndEvent {
-    type: 'speechEnd';
-    timestamp: string;
-}
-/**
- * Event payload for 'transcript' event
- */
-interface TranscriptEvent {
-    type: 'transcript';
+interface UserTranscriptEvent {
+    type: 'userTranscript';
     text: string;
-    isFinal: boolean;
-    confidence?: number;
     timestamp: string;
 }
 /**
@@ -204,14 +207,22 @@ interface ReconnectingEvent {
     delay: number;
     timestamp: string;
 }
+/**
+ * Event payload for 'turnComplete' event (both modes)
+ * Indicates the AI has finished its response turn
+ */
+interface TurnCompleteEvent {
+    type: 'turnComplete';
+    timestamp: string;
+}
 /**
  * Union type of all event payloads
  */
-type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent |
+type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | ReconnectingEvent | SessionStartedEvent | SessionEndedEvent | ReadyEvent | UserTranscriptEvent | ResponseEvent | AudioEvent | TurnCompleteEvent | ErrorEvent;
 /**
  * Simplified event handlers for common use cases
  */
-type
+type UserTranscriptHandler = (text: string) => void;
 type ResponseHandler = (text: string, isFinal: boolean) => void;
 type AudioHandler = (data: Uint8Array) => void;
 type ErrorHandler = (error: ErrorEvent) => void;
@@ -223,7 +234,7 @@ type ClientMessageType = 'startSession' | 'endSession' | 'audioStart' | 'audioCh
 /**
  * WebSocket message types received from server
  */
-type ServerMessageType = 'sessionStarted' | 'sessionEnded' | '
+type ServerMessageType = 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'error' | 'pong';
 /**
  * Base interface for client messages
  */
@@ -236,6 +247,8 @@ interface BaseClientMessage {
 interface StartSessionMessage extends BaseClientMessage {
     action: 'startSession';
     prePrompt?: string;
+    language?: string;
+    pipelineMode?: 'live' | 'composed';
 }
 /**
  * End session message
@@ -294,31 +307,11 @@ interface ServerSessionEndedMessage extends BaseServerMessage {
     sessionId: string;
 }
 /**
- *
+ * User transcript message from server (user's speech transcription)
  */
-interface
-    type: '
-}
-/**
- * Speech start message - VAD detected speech begin
- */
-interface ServerSpeechStartMessage extends BaseServerMessage {
-    type: 'speechStart';
-}
-/**
- * Speech end message - VAD detected speech end
- */
-interface ServerSpeechEndMessage extends BaseServerMessage {
-    type: 'speechEnd';
-}
-/**
- * Transcript message from server
- */
-interface ServerTranscriptMessage extends BaseServerMessage {
-    type: 'transcript';
+interface ServerUserTranscriptMessage extends BaseServerMessage {
+    type: 'userTranscript';
     text: string;
-    isFinal: boolean;
-    confidence?: number;
 }
 /**
  * Response message from server
@@ -351,10 +344,24 @@ interface ServerErrorMessage extends BaseServerMessage {
 interface ServerPongMessage extends BaseServerMessage {
     type: 'pong';
 }
+/**
+ * Turn complete message from server
+ * Indicates the AI has finished its response turn
+ */
+interface ServerTurnCompleteMessage extends BaseServerMessage {
+    type: 'turnComplete';
+}
+/**
+ * Ready message from server
+ * Indicates the Gemini Live session is ready for audio input
+ */
+interface ServerReadyMessage extends BaseServerMessage {
+    type: 'ready';
+}
 /**
  * Union type of all server messages
  */
-type ServerMessage = ServerSessionStartedMessage | ServerSessionEndedMessage |
+type ServerMessage = ServerSessionStartedMessage | ServerSessionEndedMessage | ServerReadyMessage | ServerUserTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerTurnCompleteMessage | ServerErrorMessage | ServerPongMessage;
 
 /**
  * Connection state
@@ -367,16 +374,15 @@ type ConnectionState = 'disconnected' | 'connecting' | 'connected' | 'reconnecti
 type LiveSpeechEventMap = {
     connected: ConnectedEvent;
     disconnected: DisconnectedEvent;
+    reconnecting: ReconnectingEvent;
     sessionStarted: SessionStartedEvent;
     sessionEnded: SessionEndedEvent;
-
-
-    speechEnd: SpeechEndEvent;
-    transcript: TranscriptEvent;
+    ready: ReadyEvent;
+    userTranscript: UserTranscriptEvent;
     response: ResponseEvent;
     audio: AudioEvent;
+    turnComplete: TurnCompleteEvent;
     error: ErrorEvent;
-    reconnecting: ReconnectingEvent;
 };
 /**
  * LiveSpeech client for real-time speech-to-speech AI conversations
@@ -389,7 +395,7 @@ declare class LiveSpeechClient {
     private sessionId;
     private isStreaming;
     private readonly eventListeners;
-    private
+    private userTranscriptHandler;
     private responseHandler;
     private audioHandler;
     private errorHandler;
@@ -455,17 +461,17 @@ declare class LiveSpeechClient {
      */
     off<K extends keyof LiveSpeechEventMap>(event: K, listener: (event: LiveSpeechEventMap[K]) => void): void;
     /**
-     * Set
-     */
-    setTranscriptHandler(handler: TranscriptHandler): void;
-    /**
-     * Set response handler (simplified)
+     * Set response handler
     */
     setResponseHandler(handler: ResponseHandler): void;
     /**
      * Set audio handler (simplified)
      */
     setAudioHandler(handler: AudioHandler): void;
+    /**
+     * Set user transcript handler
+     */
+    setUserTranscriptHandler(handler: UserTranscriptHandler): void;
     /**
      * Set error handler (simplified)
      */
@@ -567,4 +573,4 @@ declare class AudioEncoder {
     wrapWav(data: Uint8Array): Uint8Array;
 }
 
-export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type AudioHandler, type ClientMessage, type ClientMessageType, type ConnectedEvent, type ConnectionState, type DisconnectReason, type DisconnectedEvent, type ErrorCode, type ErrorEvent, type ErrorHandler, LiveSpeechClient, type LiveSpeechConfig, type LiveSpeechEvent, type LiveSpeechEventMap, type LiveSpeechEventType, type ReconnectingEvent, Region, Region as RegionType, type ResolvedConfig, type ResponseEvent, type ResponseHandler, type ServerMessage, type ServerMessageType, type SessionConfig, type SessionEndedEvent, type SessionStartedEvent, type
+export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type AudioHandler, type ClientMessage, type ClientMessageType, type ConnectedEvent, type ConnectionState, type DisconnectReason, type DisconnectedEvent, type ErrorCode, type ErrorEvent, type ErrorHandler, LiveSpeechClient, type LiveSpeechConfig, type LiveSpeechEvent, type LiveSpeechEventMap, type LiveSpeechEventType, type PipelineMode, type ReadyEvent, type ReconnectingEvent, Region, Region as RegionType, type ResolvedConfig, type ResponseEvent, type ResponseHandler, type ServerMessage, type ServerMessageType, type SessionConfig, type SessionEndedEvent, type SessionStartedEvent, type TurnCompleteEvent, type UserTranscriptEvent, type UserTranscriptHandler, createWavHeader, decodeBase64ToAudio, encodeAudioToBase64, extractPcmFromWav, float32ToInt16, getEndpointForRegion, int16ToFloat32, int16ToUint8, isValidRegion, uint8ToInt16, wrapPcmInWav };
package/dist/index.js  CHANGED

@@ -46,7 +46,7 @@ var Region = {
 };
 var REGION_ENDPOINTS = {
   "ap-northeast-2": "wss://talk.drawdream.co.kr",
-  "us-west-2": "wss://talk
+  "us-west-2": "wss://talk.drawdream.ca"
   // Coming soon
 };
 function getEndpointForRegion(region) {
@@ -614,7 +614,7 @@ var LiveSpeechClient = class {
   // Event listeners using a simple map
   eventListeners = /* @__PURE__ */ new Map();
   // Simplified handlers
-
+  userTranscriptHandler = null;
   responseHandler = null;
   audioHandler = null;
   errorHandler = null;
@@ -730,6 +730,10 @@ var LiveSpeechClient = class {
       if (config?.prePrompt) {
        startMessage.prePrompt = config.prePrompt;
      }
+      if (config?.language) {
+        startMessage.language = config.language;
+      }
+      startMessage.pipelineMode = config?.pipelineMode ?? "live";
       this.connection.send(startMessage);
     });
   }
@@ -819,13 +823,7 @@ var LiveSpeechClient = class {
     }
   }
   /**
-   * Set
-   */
-  setTranscriptHandler(handler) {
-    this.transcriptHandler = handler;
-  }
-  /**
-   * Set response handler (simplified)
+   * Set response handler
   */
   setResponseHandler(handler) {
     this.responseHandler = handler;
@@ -836,6 +834,12 @@ var LiveSpeechClient = class {
   setAudioHandler(handler) {
     this.audioHandler = handler;
   }
+  /**
+   * Set user transcript handler
+   */
+  setUserTranscriptHandler(handler) {
+    this.userTranscriptHandler = handler;
+  }
   /**
    * Set error handler (simplified)
    */
@@ -914,36 +918,12 @@ var LiveSpeechClient = class {
           timestamp: message.timestamp
         });
         break;
-      case "
-
-        type: "
-          timestamp: message.timestamp
-        });
-        break;
-      case "speechStart":
-        this.emit("speechStart", {
-          type: "speechStart",
-          timestamp: message.timestamp
-        });
-        break;
-      case "speechEnd":
-        this.emit("speechEnd", {
-          type: "speechEnd",
-          timestamp: message.timestamp
-        });
-        break;
-      case "transcript": {
-        const transcriptEvent = {
-          type: "transcript",
-          text: message.text,
-          isFinal: message.isFinal,
+      case "ready": {
+        const readyEvent = {
+          type: "ready",
           timestamp: message.timestamp
         };
-
-          transcriptEvent.confidence = message.confidence;
-        }
-        this.emit("transcript", transcriptEvent);
-        this.transcriptHandler?.(message.text, message.isFinal);
+        this.emit("ready", readyEvent);
         break;
       }
       case "response": {
@@ -970,6 +950,24 @@ var LiveSpeechClient = class {
         this.audioHandler?.(audioData);
         break;
       }
+      case "userTranscript": {
+        const userTranscriptEvent = {
+          type: "userTranscript",
+          text: message.text,
+          timestamp: message.timestamp
+        };
+        this.emit("userTranscript", userTranscriptEvent);
+        this.userTranscriptHandler?.(message.text);
+        break;
+      }
+      case "turnComplete": {
+        const turnCompleteEvent = {
+          type: "turnComplete",
+          timestamp: message.timestamp
+        };
+        this.emit("turnComplete", turnCompleteEvent);
+        break;
+      }
       case "error":
         this.handleError(message.code, message.message);
         break;
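The startSession change above means every 0.1.2 start message carries an explicit `pipelineMode`, while `language` is attached only when set. A sketch of the resulting payload; the field names follow `StartSessionMessage` from the declarations, any extra `BaseClientMessage` fields are omitted, and the WebSocket framing itself is not shown in this diff:

```typescript
// Example payload as built by startSession(config) in 0.1.2; values are illustrative.
const startMessage = {
  action: 'startSession' as const,
  prePrompt: 'You are a helpful assistant.', // only when config.prePrompt is set
  // language: 'ko-KR',                      // only when config.language is set
  pipelineMode: 'live' as const,             // always set; falls back to 'live'
};
console.log(JSON.stringify(startMessage));
```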
package/dist/index.mjs  CHANGED

@@ -7,7 +7,7 @@ var Region = {
 };
 var REGION_ENDPOINTS = {
   "ap-northeast-2": "wss://talk.drawdream.co.kr",
-  "us-west-2": "wss://talk
+  "us-west-2": "wss://talk.drawdream.ca"
   // Coming soon
 };
 function getEndpointForRegion(region) {
@@ -575,7 +575,7 @@ var LiveSpeechClient = class {
   // Event listeners using a simple map
   eventListeners = /* @__PURE__ */ new Map();
   // Simplified handlers
-
+  userTranscriptHandler = null;
   responseHandler = null;
   audioHandler = null;
   errorHandler = null;
@@ -691,6 +691,10 @@ var LiveSpeechClient = class {
       if (config?.prePrompt) {
         startMessage.prePrompt = config.prePrompt;
       }
+      if (config?.language) {
+        startMessage.language = config.language;
+      }
+      startMessage.pipelineMode = config?.pipelineMode ?? "live";
       this.connection.send(startMessage);
     });
   }
@@ -780,13 +784,7 @@ var LiveSpeechClient = class {
     }
   }
   /**
-   * Set
-   */
-  setTranscriptHandler(handler) {
-    this.transcriptHandler = handler;
-  }
-  /**
-   * Set response handler (simplified)
+   * Set response handler
   */
   setResponseHandler(handler) {
     this.responseHandler = handler;
@@ -797,6 +795,12 @@ var LiveSpeechClient = class {
   setAudioHandler(handler) {
     this.audioHandler = handler;
   }
+  /**
+   * Set user transcript handler
+   */
+  setUserTranscriptHandler(handler) {
+    this.userTranscriptHandler = handler;
+  }
   /**
    * Set error handler (simplified)
    */
@@ -875,36 +879,12 @@ var LiveSpeechClient = class {
           timestamp: message.timestamp
         });
         break;
-      case "
-
-        type: "
-          timestamp: message.timestamp
-        });
-        break;
-      case "speechStart":
-        this.emit("speechStart", {
-          type: "speechStart",
-          timestamp: message.timestamp
-        });
-        break;
-      case "speechEnd":
-        this.emit("speechEnd", {
-          type: "speechEnd",
-          timestamp: message.timestamp
-        });
-        break;
-      case "transcript": {
-        const transcriptEvent = {
-          type: "transcript",
-          text: message.text,
-          isFinal: message.isFinal,
+      case "ready": {
+        const readyEvent = {
+          type: "ready",
           timestamp: message.timestamp
         };
-
-          transcriptEvent.confidence = message.confidence;
-        }
-        this.emit("transcript", transcriptEvent);
-        this.transcriptHandler?.(message.text, message.isFinal);
+        this.emit("ready", readyEvent);
         break;
       }
       case "response": {
@@ -931,6 +911,24 @@ var LiveSpeechClient = class {
         this.audioHandler?.(audioData);
         break;
       }
+      case "userTranscript": {
+        const userTranscriptEvent = {
+          type: "userTranscript",
+          text: message.text,
+          timestamp: message.timestamp
+        };
+        this.emit("userTranscript", userTranscriptEvent);
+        this.userTranscriptHandler?.(message.text);
+        break;
+      }
+      case "turnComplete": {
+        const turnCompleteEvent = {
+          type: "turnComplete",
+          timestamp: message.timestamp
+        };
+        this.emit("turnComplete", turnCompleteEvent);
+        break;
+      }
       case "error":
         this.handleError(message.code, message.message);
         break;
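Migration note: the `speechStart`, `speechEnd`, and `transcript` branches are gone from the message dispatcher in both bundles, so 0.1.1 listeners for those events never fire against a 0.1.2 client. The nearest equivalents, sketched below, are not one-to-one: `turnComplete` marks the end of the AI's response turn rather than a VAD boundary, and `userTranscript` carries no `isFinal` or `confidence`. The caption helper is hypothetical:

```typescript
import { LiveSpeechClient } from '@drawdream/livespeech';

declare const client: LiveSpeechClient; // construction elided

// Hypothetical UI helper, for illustration only.
const showCaption = (text: string): void => console.log('caption:', text);

// 0.1.1 (removed in 0.1.2; these listeners would never fire):
// client.on('transcript', (e) => showCaption(e.text));
// client.on('speechEnd', () => { /* VAD detected end of user speech */ });

// 0.1.2 equivalents:
client.on('ready', () => client.audioStart());           // Gemini Live ready for audio
client.on('userTranscript', (e) => showCaption(e.text)); // user speech transcription
client.on('turnComplete', () => console.log('AI turn complete'));
```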