@drawdream/livespeech 0.1.12 → 0.1.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +48 -4
- package/dist/index.d.mts +82 -7
- package/dist/index.d.ts +82 -7
- package/dist/index.js +29 -1
- package/dist/index.mjs +29 -1
- package/package.json +5 -5
package/README.md
CHANGED
|
@@ -32,7 +32,7 @@ const client = new LiveSpeechClient({
|
|
|
32
32
|
|
|
33
33
|
// Handle only 4 essential events!
|
|
34
34
|
client.setAudioHandler((audioData) => {
|
|
35
|
-
audioPlayer.queue(audioData); // PCM16
|
|
35
|
+
audioPlayer.queue(audioData); // PCM16 — use event.sampleRate (24kHz Live, 16kHz Composed)
|
|
36
36
|
});
|
|
37
37
|
|
|
38
38
|
client.on('interrupted', () => {
|
|
@@ -81,7 +81,7 @@ Everything you need for basic voice conversations.
|
|
|
81
81
|
|
|
82
82
|
| Event | Description | Action Required |
|
|
83
83
|
|-------|-------------|-----------------|
|
|
84
|
-
| `audio` | AI's audio output | Play audio (PCM16
|
|
84
|
+
| `audio` | AI's audio output | Play audio (PCM16 — check `sampleRate`) |
|
|
85
85
|
| `turnComplete` | AI finished speaking | Ready for next input |
|
|
86
86
|
| `interrupted` | User barged in | **Clear audio buffer!** |
|
|
87
87
|
| `error` | Error occurred | Handle/log error |
|
|
@@ -104,7 +104,10 @@ Without this, 2-3 seconds of buffered audio continues playing after the user int
|
|
|
104
104
|
| Direction | Format | Sample Rate |
|
|
105
105
|
|-----------|--------|-------------|
|
|
106
106
|
| Input (mic) | PCM16 | 16,000 Hz |
|
|
107
|
-
| Output (AI) | PCM16 | 24,000 Hz |
|
|
107
|
+
| Output (AI) — Live mode | PCM16 | 24,000 Hz |
|
|
108
|
+
| Output (AI) — Composed mode | PCM16 | 16,000 Hz |
|
|
109
|
+
|
|
110
|
+
> **Important:** The `audio` event includes a `sampleRate` field. Always use it to configure your audio decoder rather than hardcoding a rate.
|
|
108
111
|
|
|
109
112
|
## Configuration
|
|
110
113
|
|
|
@@ -122,6 +125,37 @@ await client.startSession({
|
|
|
122
125
|
|
|
123
126
|
---
|
|
124
127
|
|
|
128
|
+
# Composed Mode
|
|
129
|
+
|
|
130
|
+
Use composed mode for higher accuracy with slightly more latency. It runs a separate STT → LLM → TTS pipeline instead of direct audio-to-audio.
|
|
131
|
+
|
|
132
|
+
```typescript
|
|
133
|
+
await client.startSession({
|
|
134
|
+
prePrompt: 'You are a helpful assistant.',
|
|
135
|
+
pipelineMode: 'composed',
|
|
136
|
+
language: 'ko-KR',
|
|
137
|
+
});
|
|
138
|
+
|
|
139
|
+
client.audioStart();
|
|
140
|
+
// Send/receive audio the same way as live mode
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### Live vs Composed
|
|
144
|
+
|
|
145
|
+
| | Live | Composed |
|
|
146
|
+
|---|---|---|
|
|
147
|
+
| **Latency** | ~300ms | ~1-2s |
|
|
148
|
+
| **Pipeline** | Direct audio-to-audio (Gemini Live) | STT → LLM → TTS |
|
|
149
|
+
| **Accuracy** | Good | Higher |
|
|
150
|
+
| **`aiSpeaksFirst`** | ✅ Supported | ❌ Not supported |
|
|
151
|
+
| **`tools` (function calling)** | ✅ Supported | ❌ Not supported |
|
|
152
|
+
| **Output sample rate** | 24,000 Hz | 16,000 Hz |
|
|
153
|
+
| **Barge-in** | Automatic (Gemini VAD) | Automatic |
|
|
154
|
+
|
|
155
|
+
> **Note:** All other SDK methods and events work identically in both modes. The only code change is adding `pipelineMode: 'composed'` to your session config.
|
|
156
|
+
|
|
157
|
+
---
|
|
158
|
+
|
|
125
159
|
# Advanced API
|
|
126
160
|
|
|
127
161
|
Optional features for power users.
|
|
@@ -146,7 +180,10 @@ Optional features for power users.
|
|
|
146
180
|
| `userTranscript` | User's speech transcribed |
|
|
147
181
|
| `response` | AI's response text |
|
|
148
182
|
| `toolCall` | AI wants to call a function |
|
|
183
|
+
| `reconnecting` | Auto-reconnection attempt |
|
|
149
184
|
| `userIdUpdated` | Guest-to-user migration complete |
|
|
185
|
+
| `sessionWarning` | Session nearing duration limit |
|
|
186
|
+
| `sessionGoodbye` | Session about to end |
|
|
150
187
|
|
|
151
188
|
---
|
|
152
189
|
|
|
@@ -267,10 +304,17 @@ client.audioStart(); // AI speaks immediately
|
|
|
267
304
|
|--------|---------|-------------|
|
|
268
305
|
| `prePrompt` | - | System prompt |
|
|
269
306
|
| `language` | `'en-US'` | Language code |
|
|
307
|
+
| `outputLanguage` | - | TTS voice language override (composed mode only) |
|
|
270
308
|
| `pipelineMode` | `'live'` | `'live'` (~300ms) or `'composed'` (~1-2s) |
|
|
271
309
|
| `aiSpeaksFirst` | `false` | AI initiates (live mode only) |
|
|
272
310
|
| `allowHarmCategory` | `false` | Disable safety filters |
|
|
273
311
|
| `tools` | `[]` | Function definitions |
|
|
312
|
+
| `sessionDuration` | - | Enables session duration limits when provided |
|
|
313
|
+
|
|
314
|
+
**Notes**
|
|
315
|
+
- Duration checks are **disabled by default**. They activate only when `sessionDuration` is provided.
|
|
316
|
+
- If only `sessionDuration.maxSeconds` is provided, `enableWarning`/`enableGoodbye` default to `false` in the SDK.
|
|
317
|
+
- Server limits take precedence in production.
|
|
274
318
|
|
|
275
319
|
---
|
|
276
320
|
|
|
@@ -308,7 +352,7 @@ import { float32ToInt16, int16ToUint8, wrapPcmInWav } from '@drawdream/livespeec
|
|
|
308
352
|
|
|
309
353
|
const int16 = float32ToInt16(float32Data);
|
|
310
354
|
const bytes = int16ToUint8(int16);
|
|
311
|
-
const wav = wrapPcmInWav(bytes, 16000, 1, 16);
|
|
355
|
+
const wav = wrapPcmInWav(bytes, { sampleRate: 16000, channels: 1, bitDepth: 16 });
|
|
312
356
|
```
|
|
313
357
|
|
|
314
358
|
---
|
package/dist/index.d.mts
CHANGED
|
@@ -39,7 +39,6 @@ type PipelineMode = 'live' | 'composed';
|
|
|
39
39
|
interface LiveSpeechConfig {
|
|
40
40
|
/**
|
|
41
41
|
* Region for the LiveSpeech service
|
|
42
|
-
* @example 'ap-northeast-2'
|
|
43
42
|
*/
|
|
44
43
|
region: Region;
|
|
45
44
|
/**
|
|
@@ -134,6 +133,23 @@ interface Tool {
|
|
|
134
133
|
*/
|
|
135
134
|
parameters?: FunctionParameters;
|
|
136
135
|
}
|
|
136
|
+
/**
|
|
137
|
+
* Session duration limit configuration (max length plus optional warning/goodbye notifications)
|
|
138
|
+
*/
|
|
139
|
+
interface SessionDurationConfig {
|
|
140
|
+
/**
|
|
141
|
+
* Max session duration in seconds (required)
|
|
142
|
+
*/
|
|
143
|
+
maxSeconds: number;
|
|
144
|
+
/**
|
|
145
|
+
* Enable session warning events/messages (default: false)
|
|
146
|
+
*/
|
|
147
|
+
enableWarning?: boolean;
|
|
148
|
+
/**
|
|
149
|
+
* Enable session goodbye events/messages (default: false)
|
|
150
|
+
*/
|
|
151
|
+
enableGoodbye?: boolean;
|
|
152
|
+
}
|
|
137
153
|
/**
|
|
138
154
|
* Session configuration options
|
|
139
155
|
*/
|
|
@@ -143,10 +159,29 @@ interface SessionConfig {
|
|
|
143
159
|
*/
|
|
144
160
|
prePrompt?: string;
|
|
145
161
|
/**
|
|
146
|
-
* Language code
|
|
162
|
+
* Language code (e.g., "en-US", "ko-KR").
|
|
163
|
+
*
|
|
164
|
+
* - **Composed mode:** Used for STT speech recognition language AND as fallback
|
|
165
|
+
* for TTS voice selection (if `outputLanguage` is not set).
|
|
166
|
+
* - **Live mode:** Used for TTS voice selection only. Gemini auto-detects
|
|
167
|
+
* the input language from the audio stream.
|
|
168
|
+
*
|
|
147
169
|
* @default "en-US"
|
|
148
170
|
*/
|
|
149
171
|
language?: string;
|
|
172
|
+
/**
|
|
173
|
+
* Output language for TTS voice selection (Composed mode only).
|
|
174
|
+
*
|
|
175
|
+
* Use this when the AI output language differs from the input language
|
|
176
|
+
* (e.g., input is "ko-KR" but AI responds in English via prePrompt translation).
|
|
177
|
+
* If not set, defaults to `language`.
|
|
178
|
+
*
|
|
179
|
+
* **Note:** This field is only used in Composed mode. In Live mode, voice
|
|
180
|
+
* selection is driven by `language` since Gemini handles the full pipeline natively.
|
|
181
|
+
*
|
|
182
|
+
* @example "en-US"
|
|
183
|
+
*/
|
|
184
|
+
outputLanguage?: string;
|
|
150
185
|
/**
|
|
151
186
|
* Pipeline mode for audio processing
|
|
152
187
|
* - 'live': Direct audio-to-audio conversation (default, lower latency)
|
|
@@ -183,6 +218,10 @@ interface SessionConfig {
|
|
|
183
218
|
* }]
|
|
184
219
|
*/
|
|
185
220
|
tools?: Tool[];
|
|
221
|
+
/**
|
|
222
|
+
* Session duration configuration (enables duration limits when set)
|
|
223
|
+
*/
|
|
224
|
+
sessionDuration?: SessionDurationConfig;
|
|
186
225
|
}
|
|
187
226
|
/**
|
|
188
227
|
* Internal resolved configuration with defaults applied
|
|
@@ -201,7 +240,7 @@ interface ResolvedConfig {
|
|
|
201
240
|
/**
|
|
202
241
|
* Event types emitted by the LiveSpeech client
|
|
203
242
|
*/
|
|
204
|
-
type LiveSpeechEventType = 'connected' | 'disconnected' | 'reconnecting' | 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'toolCall' | 'userIdUpdated' | 'interrupted' | 'error';
|
|
243
|
+
type LiveSpeechEventType = 'connected' | 'disconnected' | 'reconnecting' | 'sessionStarted' | 'sessionEnded' | 'sessionWarning' | 'sessionGoodbye' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'toolCall' | 'userIdUpdated' | 'interrupted' | 'error';
|
|
205
244
|
/**
|
|
206
245
|
* Event payload for 'connected' event
|
|
207
246
|
*/
|
|
@@ -239,6 +278,22 @@ interface SessionEndedEvent {
|
|
|
239
278
|
sessionId: string;
|
|
240
279
|
timestamp: string;
|
|
241
280
|
}
|
|
281
|
+
/**
|
|
282
|
+
* Event payload for 'sessionWarning' event
|
|
283
|
+
*/
|
|
284
|
+
interface SessionWarningEvent {
|
|
285
|
+
type: 'sessionWarning';
|
|
286
|
+
remainingSeconds: number;
|
|
287
|
+
timestamp: string;
|
|
288
|
+
}
|
|
289
|
+
/**
|
|
290
|
+
* Event payload for 'sessionGoodbye' event
|
|
291
|
+
*/
|
|
292
|
+
interface SessionGoodbyeEvent {
|
|
293
|
+
type: 'sessionGoodbye';
|
|
294
|
+
remainingSeconds: number;
|
|
295
|
+
timestamp: string;
|
|
296
|
+
}
|
|
242
297
|
/**
|
|
243
298
|
* Event payload for 'ready' event
|
|
244
299
|
*/
|
|
@@ -380,7 +435,7 @@ interface InterruptedEvent {
|
|
|
380
435
|
/**
|
|
381
436
|
* Union type of all event payloads
|
|
382
437
|
*/
|
|
383
|
-
type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | ReconnectingEvent | SessionStartedEvent | SessionEndedEvent | ReadyEvent | UserTranscriptEvent | ResponseEvent | AudioEvent | TurnCompleteEvent | ToolCallEvent | UserIdUpdatedEvent | InterruptedEvent | ErrorEvent;
|
|
438
|
+
type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | ReconnectingEvent | SessionStartedEvent | SessionEndedEvent | SessionWarningEvent | SessionGoodbyeEvent | ReadyEvent | UserTranscriptEvent | ResponseEvent | AudioEvent | TurnCompleteEvent | ToolCallEvent | UserIdUpdatedEvent | InterruptedEvent | ErrorEvent;
|
|
384
439
|
/**
|
|
385
440
|
* Simplified event handlers for common use cases
|
|
386
441
|
*/
|
|
@@ -396,7 +451,7 @@ type ClientMessageType = 'startSession' | 'endSession' | 'audioStart' | 'audioCh
|
|
|
396
451
|
/**
|
|
397
452
|
* WebSocket message types received from server
|
|
398
453
|
*/
|
|
399
|
-
type ServerMessageType = 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'toolCall' | 'userIdUpdated' | 'interrupted' | 'error' | 'pong';
|
|
454
|
+
type ServerMessageType = 'sessionStarted' | 'sessionEnded' | 'session_warning' | 'session_goodbye' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'toolCall' | 'userIdUpdated' | 'interrupted' | 'error' | 'pong';
|
|
400
455
|
/**
|
|
401
456
|
* Base interface for client messages
|
|
402
457
|
*/
|
|
@@ -410,10 +465,14 @@ interface StartSessionMessage extends BaseClientMessage {
|
|
|
410
465
|
action: 'startSession';
|
|
411
466
|
prePrompt?: string;
|
|
412
467
|
language?: string;
|
|
468
|
+
outputLanguage?: string;
|
|
413
469
|
pipelineMode?: 'live' | 'composed';
|
|
414
470
|
aiSpeaksFirst?: boolean;
|
|
415
471
|
allowHarmCategory?: boolean;
|
|
416
472
|
tools?: Tool[];
|
|
473
|
+
sessionMaxDurationSeconds?: number;
|
|
474
|
+
enableSessionWarning?: boolean;
|
|
475
|
+
enableSessionGoodbye?: boolean;
|
|
417
476
|
}
|
|
418
477
|
/**
|
|
419
478
|
* End session message
|
|
@@ -517,6 +576,20 @@ interface ServerSessionEndedMessage extends BaseServerMessage {
|
|
|
517
576
|
type: 'sessionEnded';
|
|
518
577
|
sessionId: string;
|
|
519
578
|
}
|
|
579
|
+
/**
|
|
580
|
+
* Session warning message from server
|
|
581
|
+
*/
|
|
582
|
+
interface ServerSessionWarningMessage extends BaseServerMessage {
|
|
583
|
+
type: 'session_warning';
|
|
584
|
+
remainingSeconds: number;
|
|
585
|
+
}
|
|
586
|
+
/**
|
|
587
|
+
* Session goodbye message from server
|
|
588
|
+
*/
|
|
589
|
+
interface ServerSessionGoodbyeMessage extends BaseServerMessage {
|
|
590
|
+
type: 'session_goodbye';
|
|
591
|
+
remainingSeconds: number;
|
|
592
|
+
}
|
|
520
593
|
/**
|
|
521
594
|
* User transcript message from server (user's speech transcription)
|
|
522
595
|
*/
|
|
@@ -604,7 +677,7 @@ interface ServerInterruptedMessage extends BaseServerMessage {
|
|
|
604
677
|
/**
|
|
605
678
|
* Union type of all server messages
|
|
606
679
|
*/
|
|
607
|
-
type ServerMessage = ServerSessionStartedMessage | ServerSessionEndedMessage | ServerReadyMessage | ServerUserTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerTurnCompleteMessage | ServerToolCallMessage | ServerUserIdUpdatedMessage | ServerInterruptedMessage | ServerErrorMessage | ServerPongMessage;
|
|
680
|
+
type ServerMessage = ServerSessionStartedMessage | ServerSessionEndedMessage | ServerSessionWarningMessage | ServerSessionGoodbyeMessage | ServerReadyMessage | ServerUserTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerTurnCompleteMessage | ServerToolCallMessage | ServerUserIdUpdatedMessage | ServerInterruptedMessage | ServerErrorMessage | ServerPongMessage;
|
|
608
681
|
|
|
609
682
|
/**
|
|
610
683
|
* Connection state
|
|
@@ -620,6 +693,8 @@ type LiveSpeechEventMap = {
|
|
|
620
693
|
reconnecting: ReconnectingEvent;
|
|
621
694
|
sessionStarted: SessionStartedEvent;
|
|
622
695
|
sessionEnded: SessionEndedEvent;
|
|
696
|
+
sessionWarning: SessionWarningEvent;
|
|
697
|
+
sessionGoodbye: SessionGoodbyeEvent;
|
|
623
698
|
ready: ReadyEvent;
|
|
624
699
|
userTranscript: UserTranscriptEvent;
|
|
625
700
|
response: ResponseEvent;
|
|
@@ -912,4 +987,4 @@ declare class AudioEncoder {
|
|
|
912
987
|
wrapWav(data: Uint8Array): Uint8Array;
|
|
913
988
|
}
|
|
914
989
|
|
|
915
|
-
export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type AudioHandler, type ClientMessage, type ClientMessageType, type ConnectedEvent, type ConnectionState, type DisconnectReason, type DisconnectedEvent, type ErrorCode, type ErrorEvent, type ErrorHandler, type FunctionParameters, LiveSpeechClient, type LiveSpeechConfig, type LiveSpeechEvent, type LiveSpeechEventMap, type LiveSpeechEventType, type PipelineMode, type ReadyEvent, type ReconnectingEvent, Region, Region as RegionType, type ResolvedConfig, type ResponseEvent, type ResponseHandler, type ServerMessage, type ServerMessageType, type SessionConfig, type SessionEndedEvent, type SessionStartedEvent, type Tool, type ToolCallEvent, type TurnCompleteEvent, type UserTranscriptEvent, type UserTranscriptHandler, createWavHeader, decodeBase64ToAudio, encodeAudioToBase64, extractPcmFromWav, float32ToInt16, getEndpointForRegion, int16ToFloat32, int16ToUint8, isValidRegion, uint8ToInt16, wrapPcmInWav };
|
|
990
|
+
export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type AudioHandler, type ClientMessage, type ClientMessageType, type ConnectedEvent, type ConnectionState, type DisconnectReason, type DisconnectedEvent, type ErrorCode, type ErrorEvent, type ErrorHandler, type FunctionParameters, LiveSpeechClient, type LiveSpeechConfig, type LiveSpeechEvent, type LiveSpeechEventMap, type LiveSpeechEventType, type PipelineMode, type ReadyEvent, type ReconnectingEvent, Region, Region as RegionType, type ResolvedConfig, type ResponseEvent, type ResponseHandler, type ServerMessage, type ServerMessageType, type SessionConfig, type SessionDurationConfig, type SessionEndedEvent, type SessionStartedEvent, type Tool, type ToolCallEvent, type TurnCompleteEvent, type UserTranscriptEvent, type UserTranscriptHandler, createWavHeader, decodeBase64ToAudio, encodeAudioToBase64, extractPcmFromWav, float32ToInt16, getEndpointForRegion, int16ToFloat32, int16ToUint8, isValidRegion, uint8ToInt16, wrapPcmInWav };
|
package/dist/index.d.ts
CHANGED
|
@@ -39,7 +39,6 @@ type PipelineMode = 'live' | 'composed';
|
|
|
39
39
|
interface LiveSpeechConfig {
|
|
40
40
|
/**
|
|
41
41
|
* Region for the LiveSpeech service
|
|
42
|
-
* @example 'ap-northeast-2'
|
|
43
42
|
*/
|
|
44
43
|
region: Region;
|
|
45
44
|
/**
|
|
@@ -134,6 +133,23 @@ interface Tool {
|
|
|
134
133
|
*/
|
|
135
134
|
parameters?: FunctionParameters;
|
|
136
135
|
}
|
|
136
|
+
/**
|
|
137
|
+
* Session duration limit configuration (max length plus optional warning/goodbye notifications)
|
|
138
|
+
*/
|
|
139
|
+
interface SessionDurationConfig {
|
|
140
|
+
/**
|
|
141
|
+
* Max session duration in seconds (required)
|
|
142
|
+
*/
|
|
143
|
+
maxSeconds: number;
|
|
144
|
+
/**
|
|
145
|
+
* Enable session warning events/messages (default: false)
|
|
146
|
+
*/
|
|
147
|
+
enableWarning?: boolean;
|
|
148
|
+
/**
|
|
149
|
+
* Enable session goodbye events/messages (default: false)
|
|
150
|
+
*/
|
|
151
|
+
enableGoodbye?: boolean;
|
|
152
|
+
}
|
|
137
153
|
/**
|
|
138
154
|
* Session configuration options
|
|
139
155
|
*/
|
|
@@ -143,10 +159,29 @@ interface SessionConfig {
|
|
|
143
159
|
*/
|
|
144
160
|
prePrompt?: string;
|
|
145
161
|
/**
|
|
146
|
-
* Language code
|
|
162
|
+
* Language code (e.g., "en-US", "ko-KR").
|
|
163
|
+
*
|
|
164
|
+
* - **Composed mode:** Used for STT speech recognition language AND as fallback
|
|
165
|
+
* for TTS voice selection (if `outputLanguage` is not set).
|
|
166
|
+
* - **Live mode:** Used for TTS voice selection only. Gemini auto-detects
|
|
167
|
+
* the input language from the audio stream.
|
|
168
|
+
*
|
|
147
169
|
* @default "en-US"
|
|
148
170
|
*/
|
|
149
171
|
language?: string;
|
|
172
|
+
/**
|
|
173
|
+
* Output language for TTS voice selection (Composed mode only).
|
|
174
|
+
*
|
|
175
|
+
* Use this when the AI output language differs from the input language
|
|
176
|
+
* (e.g., input is "ko-KR" but AI responds in English via prePrompt translation).
|
|
177
|
+
* If not set, defaults to `language`.
|
|
178
|
+
*
|
|
179
|
+
* **Note:** This field is only used in Composed mode. In Live mode, voice
|
|
180
|
+
* selection is driven by `language` since Gemini handles the full pipeline natively.
|
|
181
|
+
*
|
|
182
|
+
* @example "en-US"
|
|
183
|
+
*/
|
|
184
|
+
outputLanguage?: string;
|
|
150
185
|
/**
|
|
151
186
|
* Pipeline mode for audio processing
|
|
152
187
|
* - 'live': Direct audio-to-audio conversation (default, lower latency)
|
|
@@ -183,6 +218,10 @@ interface SessionConfig {
|
|
|
183
218
|
* }]
|
|
184
219
|
*/
|
|
185
220
|
tools?: Tool[];
|
|
221
|
+
/**
|
|
222
|
+
* Session duration configuration (enables duration limits when set)
|
|
223
|
+
*/
|
|
224
|
+
sessionDuration?: SessionDurationConfig;
|
|
186
225
|
}
|
|
187
226
|
/**
|
|
188
227
|
* Internal resolved configuration with defaults applied
|
|
@@ -201,7 +240,7 @@ interface ResolvedConfig {
|
|
|
201
240
|
/**
|
|
202
241
|
* Event types emitted by the LiveSpeech client
|
|
203
242
|
*/
|
|
204
|
-
type LiveSpeechEventType = 'connected' | 'disconnected' | 'reconnecting' | 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'toolCall' | 'userIdUpdated' | 'interrupted' | 'error';
|
|
243
|
+
type LiveSpeechEventType = 'connected' | 'disconnected' | 'reconnecting' | 'sessionStarted' | 'sessionEnded' | 'sessionWarning' | 'sessionGoodbye' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'toolCall' | 'userIdUpdated' | 'interrupted' | 'error';
|
|
205
244
|
/**
|
|
206
245
|
* Event payload for 'connected' event
|
|
207
246
|
*/
|
|
@@ -239,6 +278,22 @@ interface SessionEndedEvent {
|
|
|
239
278
|
sessionId: string;
|
|
240
279
|
timestamp: string;
|
|
241
280
|
}
|
|
281
|
+
/**
|
|
282
|
+
* Event payload for 'sessionWarning' event
|
|
283
|
+
*/
|
|
284
|
+
interface SessionWarningEvent {
|
|
285
|
+
type: 'sessionWarning';
|
|
286
|
+
remainingSeconds: number;
|
|
287
|
+
timestamp: string;
|
|
288
|
+
}
|
|
289
|
+
/**
|
|
290
|
+
* Event payload for 'sessionGoodbye' event
|
|
291
|
+
*/
|
|
292
|
+
interface SessionGoodbyeEvent {
|
|
293
|
+
type: 'sessionGoodbye';
|
|
294
|
+
remainingSeconds: number;
|
|
295
|
+
timestamp: string;
|
|
296
|
+
}
|
|
242
297
|
/**
|
|
243
298
|
* Event payload for 'ready' event
|
|
244
299
|
*/
|
|
@@ -380,7 +435,7 @@ interface InterruptedEvent {
|
|
|
380
435
|
/**
|
|
381
436
|
* Union type of all event payloads
|
|
382
437
|
*/
|
|
383
|
-
type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | ReconnectingEvent | SessionStartedEvent | SessionEndedEvent | ReadyEvent | UserTranscriptEvent | ResponseEvent | AudioEvent | TurnCompleteEvent | ToolCallEvent | UserIdUpdatedEvent | InterruptedEvent | ErrorEvent;
|
|
438
|
+
type LiveSpeechEvent = ConnectedEvent | DisconnectedEvent | ReconnectingEvent | SessionStartedEvent | SessionEndedEvent | SessionWarningEvent | SessionGoodbyeEvent | ReadyEvent | UserTranscriptEvent | ResponseEvent | AudioEvent | TurnCompleteEvent | ToolCallEvent | UserIdUpdatedEvent | InterruptedEvent | ErrorEvent;
|
|
384
439
|
/**
|
|
385
440
|
* Simplified event handlers for common use cases
|
|
386
441
|
*/
|
|
@@ -396,7 +451,7 @@ type ClientMessageType = 'startSession' | 'endSession' | 'audioStart' | 'audioCh
|
|
|
396
451
|
/**
|
|
397
452
|
* WebSocket message types received from server
|
|
398
453
|
*/
|
|
399
|
-
type ServerMessageType = 'sessionStarted' | 'sessionEnded' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'toolCall' | 'userIdUpdated' | 'interrupted' | 'error' | 'pong';
|
|
454
|
+
type ServerMessageType = 'sessionStarted' | 'sessionEnded' | 'session_warning' | 'session_goodbye' | 'ready' | 'userTranscript' | 'response' | 'audio' | 'turnComplete' | 'toolCall' | 'userIdUpdated' | 'interrupted' | 'error' | 'pong';
|
|
400
455
|
/**
|
|
401
456
|
* Base interface for client messages
|
|
402
457
|
*/
|
|
@@ -410,10 +465,14 @@ interface StartSessionMessage extends BaseClientMessage {
|
|
|
410
465
|
action: 'startSession';
|
|
411
466
|
prePrompt?: string;
|
|
412
467
|
language?: string;
|
|
468
|
+
outputLanguage?: string;
|
|
413
469
|
pipelineMode?: 'live' | 'composed';
|
|
414
470
|
aiSpeaksFirst?: boolean;
|
|
415
471
|
allowHarmCategory?: boolean;
|
|
416
472
|
tools?: Tool[];
|
|
473
|
+
sessionMaxDurationSeconds?: number;
|
|
474
|
+
enableSessionWarning?: boolean;
|
|
475
|
+
enableSessionGoodbye?: boolean;
|
|
417
476
|
}
|
|
418
477
|
/**
|
|
419
478
|
* End session message
|
|
@@ -517,6 +576,20 @@ interface ServerSessionEndedMessage extends BaseServerMessage {
|
|
|
517
576
|
type: 'sessionEnded';
|
|
518
577
|
sessionId: string;
|
|
519
578
|
}
|
|
579
|
+
/**
|
|
580
|
+
* Session warning message from server
|
|
581
|
+
*/
|
|
582
|
+
interface ServerSessionWarningMessage extends BaseServerMessage {
|
|
583
|
+
type: 'session_warning';
|
|
584
|
+
remainingSeconds: number;
|
|
585
|
+
}
|
|
586
|
+
/**
|
|
587
|
+
* Session goodbye message from server
|
|
588
|
+
*/
|
|
589
|
+
interface ServerSessionGoodbyeMessage extends BaseServerMessage {
|
|
590
|
+
type: 'session_goodbye';
|
|
591
|
+
remainingSeconds: number;
|
|
592
|
+
}
|
|
520
593
|
/**
|
|
521
594
|
* User transcript message from server (user's speech transcription)
|
|
522
595
|
*/
|
|
@@ -604,7 +677,7 @@ interface ServerInterruptedMessage extends BaseServerMessage {
|
|
|
604
677
|
/**
|
|
605
678
|
* Union type of all server messages
|
|
606
679
|
*/
|
|
607
|
-
type ServerMessage = ServerSessionStartedMessage | ServerSessionEndedMessage | ServerReadyMessage | ServerUserTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerTurnCompleteMessage | ServerToolCallMessage | ServerUserIdUpdatedMessage | ServerInterruptedMessage | ServerErrorMessage | ServerPongMessage;
|
|
680
|
+
type ServerMessage = ServerSessionStartedMessage | ServerSessionEndedMessage | ServerSessionWarningMessage | ServerSessionGoodbyeMessage | ServerReadyMessage | ServerUserTranscriptMessage | ServerResponseMessage | ServerAudioMessage | ServerTurnCompleteMessage | ServerToolCallMessage | ServerUserIdUpdatedMessage | ServerInterruptedMessage | ServerErrorMessage | ServerPongMessage;
|
|
608
681
|
|
|
609
682
|
/**
|
|
610
683
|
* Connection state
|
|
@@ -620,6 +693,8 @@ type LiveSpeechEventMap = {
|
|
|
620
693
|
reconnecting: ReconnectingEvent;
|
|
621
694
|
sessionStarted: SessionStartedEvent;
|
|
622
695
|
sessionEnded: SessionEndedEvent;
|
|
696
|
+
sessionWarning: SessionWarningEvent;
|
|
697
|
+
sessionGoodbye: SessionGoodbyeEvent;
|
|
623
698
|
ready: ReadyEvent;
|
|
624
699
|
userTranscript: UserTranscriptEvent;
|
|
625
700
|
response: ResponseEvent;
|
|
@@ -912,4 +987,4 @@ declare class AudioEncoder {
|
|
|
912
987
|
wrapWav(data: Uint8Array): Uint8Array;
|
|
913
988
|
}
|
|
914
989
|
|
|
915
|
-
export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type AudioHandler, type ClientMessage, type ClientMessageType, type ConnectedEvent, type ConnectionState, type DisconnectReason, type DisconnectedEvent, type ErrorCode, type ErrorEvent, type ErrorHandler, type FunctionParameters, LiveSpeechClient, type LiveSpeechConfig, type LiveSpeechEvent, type LiveSpeechEventMap, type LiveSpeechEventType, type PipelineMode, type ReadyEvent, type ReconnectingEvent, Region, Region as RegionType, type ResolvedConfig, type ResponseEvent, type ResponseHandler, type ServerMessage, type ServerMessageType, type SessionConfig, type SessionEndedEvent, type SessionStartedEvent, type Tool, type ToolCallEvent, type TurnCompleteEvent, type UserTranscriptEvent, type UserTranscriptHandler, createWavHeader, decodeBase64ToAudio, encodeAudioToBase64, extractPcmFromWav, float32ToInt16, getEndpointForRegion, int16ToFloat32, int16ToUint8, isValidRegion, uint8ToInt16, wrapPcmInWav };
|
|
990
|
+
export { AudioEncoder, type AudioEncoderOptions, type AudioEvent, type AudioHandler, type ClientMessage, type ClientMessageType, type ConnectedEvent, type ConnectionState, type DisconnectReason, type DisconnectedEvent, type ErrorCode, type ErrorEvent, type ErrorHandler, type FunctionParameters, LiveSpeechClient, type LiveSpeechConfig, type LiveSpeechEvent, type LiveSpeechEventMap, type LiveSpeechEventType, type PipelineMode, type ReadyEvent, type ReconnectingEvent, Region, Region as RegionType, type ResolvedConfig, type ResponseEvent, type ResponseHandler, type ServerMessage, type ServerMessageType, type SessionConfig, type SessionDurationConfig, type SessionEndedEvent, type SessionStartedEvent, type Tool, type ToolCallEvent, type TurnCompleteEvent, type UserTranscriptEvent, type UserTranscriptHandler, createWavHeader, decodeBase64ToAudio, encodeAudioToBase64, extractPcmFromWav, float32ToInt16, getEndpointForRegion, int16ToFloat32, int16ToUint8, isValidRegion, uint8ToInt16, wrapPcmInWav };
|
package/dist/index.js
CHANGED
|
@@ -728,14 +728,22 @@ var LiveSpeechClient = class {
|
|
|
728
728
|
};
|
|
729
729
|
this.on("sessionStarted", onSessionStarted);
|
|
730
730
|
this.on("error", onError);
|
|
731
|
+
const sessionDuration = config?.sessionDuration;
|
|
732
|
+
const hasSessionDuration = typeof sessionDuration?.maxSeconds === "number";
|
|
731
733
|
this.connection.send({
|
|
732
734
|
action: "startSession",
|
|
733
735
|
...config?.prePrompt && { prePrompt: config.prePrompt },
|
|
734
736
|
...config?.language && { language: config.language },
|
|
737
|
+
...config?.outputLanguage && { outputLanguage: config.outputLanguage },
|
|
735
738
|
pipelineMode: config?.pipelineMode ?? "live",
|
|
736
739
|
...config?.aiSpeaksFirst && { aiSpeaksFirst: config.aiSpeaksFirst },
|
|
737
740
|
allowHarmCategory: config?.allowHarmCategory ?? false,
|
|
738
|
-
...config?.tools && config.tools.length > 0 && { tools: config.tools }
|
|
741
|
+
...config?.tools && config.tools.length > 0 && { tools: config.tools },
|
|
742
|
+
...hasSessionDuration && {
|
|
743
|
+
sessionMaxDurationSeconds: sessionDuration.maxSeconds,
|
|
744
|
+
enableSessionWarning: sessionDuration.enableWarning ?? false,
|
|
745
|
+
enableSessionGoodbye: sessionDuration.enableGoodbye ?? false
|
|
746
|
+
}
|
|
739
747
|
});
|
|
740
748
|
});
|
|
741
749
|
}
|
|
@@ -1017,6 +1025,8 @@ var LiveSpeechClient = class {
|
|
|
1017
1025
|
}
|
|
1018
1026
|
}
|
|
1019
1027
|
handleConnected(connectionId) {
|
|
1028
|
+
this.sessionId = null;
|
|
1029
|
+
this.isStreaming = false;
|
|
1020
1030
|
const event = {
|
|
1021
1031
|
type: "connected",
|
|
1022
1032
|
connectionId,
|
|
@@ -1075,6 +1085,24 @@ var LiveSpeechClient = class {
|
|
|
1075
1085
|
timestamp: message.timestamp
|
|
1076
1086
|
});
|
|
1077
1087
|
break;
|
|
1088
|
+
case "session_warning": {
|
|
1089
|
+
const warningEvent = {
|
|
1090
|
+
type: "sessionWarning",
|
|
1091
|
+
remainingSeconds: message.remainingSeconds ?? 0,
|
|
1092
|
+
timestamp: message.timestamp
|
|
1093
|
+
};
|
|
1094
|
+
this.emit("sessionWarning", warningEvent);
|
|
1095
|
+
break;
|
|
1096
|
+
}
|
|
1097
|
+
case "session_goodbye": {
|
|
1098
|
+
const goodbyeEvent = {
|
|
1099
|
+
type: "sessionGoodbye",
|
|
1100
|
+
remainingSeconds: message.remainingSeconds ?? 0,
|
|
1101
|
+
timestamp: message.timestamp
|
|
1102
|
+
};
|
|
1103
|
+
this.emit("sessionGoodbye", goodbyeEvent);
|
|
1104
|
+
break;
|
|
1105
|
+
}
|
|
1078
1106
|
case "ready": {
|
|
1079
1107
|
const readyEvent = {
|
|
1080
1108
|
type: "ready",
|
package/dist/index.mjs
CHANGED
|
@@ -689,14 +689,22 @@ var LiveSpeechClient = class {
|
|
|
689
689
|
};
|
|
690
690
|
this.on("sessionStarted", onSessionStarted);
|
|
691
691
|
this.on("error", onError);
|
|
692
|
+
const sessionDuration = config?.sessionDuration;
|
|
693
|
+
const hasSessionDuration = typeof sessionDuration?.maxSeconds === "number";
|
|
692
694
|
this.connection.send({
|
|
693
695
|
action: "startSession",
|
|
694
696
|
...config?.prePrompt && { prePrompt: config.prePrompt },
|
|
695
697
|
...config?.language && { language: config.language },
|
|
698
|
+
...config?.outputLanguage && { outputLanguage: config.outputLanguage },
|
|
696
699
|
pipelineMode: config?.pipelineMode ?? "live",
|
|
697
700
|
...config?.aiSpeaksFirst && { aiSpeaksFirst: config.aiSpeaksFirst },
|
|
698
701
|
allowHarmCategory: config?.allowHarmCategory ?? false,
|
|
699
|
-
...config?.tools && config.tools.length > 0 && { tools: config.tools }
|
|
702
|
+
...config?.tools && config.tools.length > 0 && { tools: config.tools },
|
|
703
|
+
...hasSessionDuration && {
|
|
704
|
+
sessionMaxDurationSeconds: sessionDuration.maxSeconds,
|
|
705
|
+
enableSessionWarning: sessionDuration.enableWarning ?? false,
|
|
706
|
+
enableSessionGoodbye: sessionDuration.enableGoodbye ?? false
|
|
707
|
+
}
|
|
700
708
|
});
|
|
701
709
|
});
|
|
702
710
|
}
|
|
@@ -978,6 +986,8 @@ var LiveSpeechClient = class {
|
|
|
978
986
|
}
|
|
979
987
|
}
|
|
980
988
|
handleConnected(connectionId) {
|
|
989
|
+
this.sessionId = null;
|
|
990
|
+
this.isStreaming = false;
|
|
981
991
|
const event = {
|
|
982
992
|
type: "connected",
|
|
983
993
|
connectionId,
|
|
@@ -1036,6 +1046,24 @@ var LiveSpeechClient = class {
|
|
|
1036
1046
|
timestamp: message.timestamp
|
|
1037
1047
|
});
|
|
1038
1048
|
break;
|
|
1049
|
+
case "session_warning": {
|
|
1050
|
+
const warningEvent = {
|
|
1051
|
+
type: "sessionWarning",
|
|
1052
|
+
remainingSeconds: message.remainingSeconds ?? 0,
|
|
1053
|
+
timestamp: message.timestamp
|
|
1054
|
+
};
|
|
1055
|
+
this.emit("sessionWarning", warningEvent);
|
|
1056
|
+
break;
|
|
1057
|
+
}
|
|
1058
|
+
case "session_goodbye": {
|
|
1059
|
+
const goodbyeEvent = {
|
|
1060
|
+
type: "sessionGoodbye",
|
|
1061
|
+
remainingSeconds: message.remainingSeconds ?? 0,
|
|
1062
|
+
timestamp: message.timestamp
|
|
1063
|
+
};
|
|
1064
|
+
this.emit("sessionGoodbye", goodbyeEvent);
|
|
1065
|
+
break;
|
|
1066
|
+
}
|
|
1039
1067
|
case "ready": {
|
|
1040
1068
|
const readyEvent = {
|
|
1041
1069
|
type: "ready",
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@drawdream/livespeech",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.15",
|
|
4
4
|
"description": "Real-time speech-to-speech AI conversation SDK",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"module": "dist/index.mjs",
|
|
@@ -56,12 +56,12 @@
|
|
|
56
56
|
"devDependencies": {
|
|
57
57
|
"@types/node": "^20.10.0",
|
|
58
58
|
"@types/ws": "^8.5.10",
|
|
59
|
-
"@typescript-eslint/eslint-plugin": "^
|
|
60
|
-
"@typescript-eslint/parser": "^
|
|
61
|
-
"eslint": "^8.
|
|
59
|
+
"@typescript-eslint/eslint-plugin": "^7.18.0",
|
|
60
|
+
"@typescript-eslint/parser": "^7.18.0",
|
|
61
|
+
"eslint": "^8.56.0",
|
|
62
62
|
"tsup": "^8.0.1",
|
|
63
63
|
"typescript": "^5.3.0",
|
|
64
|
-
"vitest": "^
|
|
64
|
+
"vitest": "^4.0.0"
|
|
65
65
|
},
|
|
66
66
|
"peerDependencies": {
|
|
67
67
|
"typescript": ">=5.0.0"
|