@mastra/voice-inworld 0.3.0-alpha.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +69 -0
- package/dist/_types/@internal_voice/dist/_types/@internal_ai-sdk-v5/dist/index.d.ts +11 -8
- package/dist/docs/SKILL.md +1 -1
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/docs/references/docs-voice-overview.md +25 -25
- package/dist/docs/references/docs-voice-speech-to-speech.md +4 -4
- package/dist/docs/references/reference-voice-inworld-realtime.md +5 -5
- package/dist/docs/references/reference-voice-inworld.md +1 -1
- package/dist/index.cjs +2 -2
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +2 -2
- package/dist/index.js.map +1 -1
- package/package.json +10 -9
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,74 @@
|
|
|
1
1
|
# @mastra/voice-inworld
|
|
2
2
|
|
|
3
|
+
## 0.3.0
|
|
4
|
+
|
|
5
|
+
### Minor Changes
|
|
6
|
+
|
|
7
|
+
- `@mastra/voice-inworld` now ships `InworldRealtimeVoice` for full-duplex realtime voice — mic in, speakers out, server-side LLM routing, semantic VAD turn-taking, tool calling, barge-in, and live transcripts of both sides — alongside the existing streaming TTS and batch STT. No separate package needed; import both from the same entry point. ([#16865](https://github.com/mastra-ai/mastra/pull/16865))
|
|
8
|
+
|
|
9
|
+
```typescript
|
|
10
|
+
// Batch TTS / STT (unchanged)
|
|
11
|
+
import { InworldVoice } from '@mastra/voice-inworld';
|
|
12
|
+
|
|
13
|
+
// New: realtime full-duplex voice, from the same package
|
|
14
|
+
import { InworldRealtimeVoice } from '@mastra/voice-inworld';
|
|
15
|
+
|
|
16
|
+
const voice = new InworldRealtimeVoice({
|
|
17
|
+
apiKey: process.env.INWORLD_API_KEY,
|
|
18
|
+
// Defaults: model 'inworld/models/gemma-4-26b-a4b-it', speaker 'Sarah',
|
|
19
|
+
// STT 'inworld/inworld-stt-1', semantic-VAD turn detection.
|
|
20
|
+
});
|
|
21
|
+
|
|
22
|
+
await voice.connect();
|
|
23
|
+
voice.on('speaker', stream => playAudio(stream)); // PCM16 @ 24kHz
|
|
24
|
+
voice.on('writing', ({ text, role }) => console.log(role, text));
|
|
25
|
+
voice.on('interrupted', ({ response_id }) => stopAudio(response_id));
|
|
26
|
+
await voice.send(getMicrophoneStream());
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
**Typed `providerData` for Inworld realtime extensions**
|
|
30
|
+
|
|
31
|
+
`InworldRealtimeVoice` now accepts a typed `providerData` object for Inworld-specific extensions — STT tuning, TTS segmentation and steering, automatic memory, back-channel, and responsiveness — sent under `session.providerData`. The provider also surfaces inbound extension data: a `voiceProfile` on user `writing` events, a `memory` event for the rolling summary/facts state, and `backchannel` / `backchannel.done` / `backchannel.skipped` events for back-channel audio.
|
|
32
|
+
|
|
33
|
+
```typescript
|
|
34
|
+
const voice = new InworldRealtimeVoice({
|
|
35
|
+
providerData: {
|
|
36
|
+
stt: { voice_profile: true, language_hints: ['en-US'] },
|
|
37
|
+
tts: { delivery_mode: 'CREATIVE', segmenter_strategy: 'balanced' },
|
|
38
|
+
memory: { enabled: true, turn_interval: 4 },
|
|
39
|
+
backchannel: { enabled: true, max_per_turn: 1 },
|
|
40
|
+
},
|
|
41
|
+
});
|
|
42
|
+
|
|
43
|
+
voice.on('memory', state => console.log(state.summary, state.facts));
|
|
44
|
+
voice.on('backchannel', stream => playAudio(stream));
|
|
45
|
+
voice.on('writing', ({ role, voiceProfile }) => console.log(role, voiceProfile?.emotion));
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
**Realtime fixes and additions**
|
|
49
|
+
- Fixed the per-call `speak(text, { speaker })` voice override. It is now sent as the flat `response.voice` field, so the per-call speaker is no longer silently ignored by the server.
|
|
50
|
+
- Added manual turn-taking methods `commitInput()`, `clearInput()`, and `clearOutput()` for push-to-talk and manual turn control (use `clearOutput()` only to hard-stop all playback — it also stops in-flight back-channels).
|
|
51
|
+
- Added smart-turn and playback-state events: `turn-suggestion`, `turn-suggestion-revoked`, `input-committed`, `input-cleared`, `input-timeout`, and `output-audio-started` / `output-audio-stopped` / `output-audio-cleared`.
|
|
52
|
+
- Added richer typed session config: input noise reduction, telephony (8 kHz) and float32 audio formats, a server-VAD `idle_timeout_ms`, plus `tracing`, `include`, and `prompt`.
|
|
53
|
+
|
|
54
|
+
```typescript
|
|
55
|
+
// Push-to-talk with no auto-VAD
|
|
56
|
+
const voice = new InworldRealtimeVoice({
|
|
57
|
+
session: { audio: { input: { turn_detection: null } } },
|
|
58
|
+
});
|
|
59
|
+
|
|
60
|
+
await voice.send(getMicrophoneStream());
|
|
61
|
+
voice.commitInput(); // end the user turn manually
|
|
62
|
+
|
|
63
|
+
voice.on('output-audio-stopped', () => console.log('playback finished'));
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Patch Changes
|
|
67
|
+
|
|
68
|
+
- Moved shared voice primitives and route metadata into the new `@internal/voice` package so voice providers no longer depend on `@mastra/core` and server voice routes share the same route definitions. ([#16725](https://github.com/mastra-ai/mastra/pull/16725))
|
|
69
|
+
|
|
70
|
+
`@mastra/core/voice` continues to re-export the voice APIs for backwards compatibility.
|
|
71
|
+
|
|
3
72
|
## 0.3.0-alpha.1
|
|
4
73
|
|
|
5
74
|
### Minor Changes
|
|
@@ -1549,16 +1549,16 @@ declare interface EventSourceMessage {
|
|
|
1549
1549
|
* implementation in that browsers will default this to `message`, whereas this parser will
|
|
1550
1550
|
* leave this as `undefined` if not explicitly declared.
|
|
1551
1551
|
*/
|
|
1552
|
-
event?: string | undefined
|
|
1552
|
+
event?: string | undefined;
|
|
1553
1553
|
/**
|
|
1554
1554
|
* ID of the message, if any was provided by the server. Can be used by clients to keep the
|
|
1555
1555
|
* last received message ID in sync when reconnecting.
|
|
1556
1556
|
*/
|
|
1557
|
-
id?: string | undefined
|
|
1557
|
+
id?: string | undefined;
|
|
1558
1558
|
/**
|
|
1559
1559
|
* The data received for this message
|
|
1560
1560
|
*/
|
|
1561
|
-
data: string
|
|
1561
|
+
data: string;
|
|
1562
1562
|
}
|
|
1563
1563
|
|
|
1564
1564
|
/**
|
|
@@ -1582,8 +1582,11 @@ declare interface EventSourceMessage {
|
|
|
1582
1582
|
*
|
|
1583
1583
|
* @public
|
|
1584
1584
|
*/
|
|
1585
|
-
declare class EventSourceParserStream extends TransformStream<
|
|
1586
|
-
|
|
1585
|
+
declare class EventSourceParserStream extends TransformStream<
|
|
1586
|
+
string,
|
|
1587
|
+
EventSourceMessage
|
|
1588
|
+
> {
|
|
1589
|
+
constructor({ onError, onRetry, onComment }?: StreamOptions);
|
|
1587
1590
|
}
|
|
1588
1591
|
|
|
1589
1592
|
/**
|
|
@@ -6830,19 +6833,19 @@ declare interface StreamOptions {
|
|
|
6830
6833
|
*
|
|
6831
6834
|
* @defaultValue `undefined`
|
|
6832
6835
|
*/
|
|
6833
|
-
onError?: (
|
|
6836
|
+
onError?: ("terminate" | ((error: Error) => void)) | undefined;
|
|
6834
6837
|
/**
|
|
6835
6838
|
* Callback for when a reconnection interval is sent from the server.
|
|
6836
6839
|
*
|
|
6837
6840
|
* @param retry - The number of milliseconds to wait before reconnecting.
|
|
6838
6841
|
*/
|
|
6839
|
-
onRetry?: ((retry: number) => void) | undefined
|
|
6842
|
+
onRetry?: ((retry: number) => void) | undefined;
|
|
6840
6843
|
/**
|
|
6841
6844
|
* Callback for when a comment is encountered in the stream.
|
|
6842
6845
|
*
|
|
6843
6846
|
* @param comment - The comment encountered in the stream.
|
|
6844
6847
|
*/
|
|
6845
|
-
onComment?: ((comment: string) => void) | undefined
|
|
6848
|
+
onComment?: ((comment: string) => void) | undefined;
|
|
6846
6849
|
}
|
|
6847
6850
|
|
|
6848
6851
|
/**
|
package/dist/docs/SKILL.md
CHANGED
|
@@ -3,7 +3,7 @@ name: mastra-voice-inworld
|
|
|
3
3
|
description: Documentation for @mastra/voice-inworld. Use when working with @mastra/voice-inworld APIs, configuration, or implementation.
|
|
4
4
|
metadata:
|
|
5
5
|
package: "@mastra/voice-inworld"
|
|
6
|
-
version: "0.3.0-alpha.1"
|
|
6
|
+
version: "0.3.2"
|
|
7
7
|
---
|
|
8
8
|
|
|
9
9
|
## When to use
|
|
@@ -16,7 +16,7 @@ const voiceAgent = new Agent({
|
|
|
16
16
|
id: 'voice-agent',
|
|
17
17
|
name: 'Voice Agent',
|
|
18
18
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
19
|
-
model: 'openai/gpt-5.
|
|
19
|
+
model: 'openai/gpt-5.5',
|
|
20
20
|
voice: new OpenAIVoice(),
|
|
21
21
|
})
|
|
22
22
|
```
|
|
@@ -40,7 +40,7 @@ const voiceAgent = new Agent({
|
|
|
40
40
|
id: 'voice-agent',
|
|
41
41
|
name: 'Voice Agent',
|
|
42
42
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
43
|
-
model: 'openai/gpt-5.
|
|
43
|
+
model: 'openai/gpt-5.5',
|
|
44
44
|
voice: new OpenAIVoice(),
|
|
45
45
|
})
|
|
46
46
|
|
|
@@ -68,7 +68,7 @@ const voiceAgent = new Agent({
|
|
|
68
68
|
id: 'voice-agent',
|
|
69
69
|
name: 'Voice Agent',
|
|
70
70
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
71
|
-
model: 'openai/gpt-5.
|
|
71
|
+
model: 'openai/gpt-5.5',
|
|
72
72
|
voice: new AzureVoice(),
|
|
73
73
|
})
|
|
74
74
|
|
|
@@ -95,7 +95,7 @@ const voiceAgent = new Agent({
|
|
|
95
95
|
id: 'voice-agent',
|
|
96
96
|
name: 'Voice Agent',
|
|
97
97
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
98
|
-
model: 'openai/gpt-5.
|
|
98
|
+
model: 'openai/gpt-5.5',
|
|
99
99
|
voice: new ElevenLabsVoice(),
|
|
100
100
|
})
|
|
101
101
|
|
|
@@ -122,7 +122,7 @@ const voiceAgent = new Agent({
|
|
|
122
122
|
id: 'voice-agent',
|
|
123
123
|
name: 'Voice Agent',
|
|
124
124
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
125
|
-
model: 'openai/gpt-5.
|
|
125
|
+
model: 'openai/gpt-5.5',
|
|
126
126
|
voice: new PlayAIVoice(),
|
|
127
127
|
})
|
|
128
128
|
|
|
@@ -149,7 +149,7 @@ const voiceAgent = new Agent({
|
|
|
149
149
|
id: 'voice-agent',
|
|
150
150
|
name: 'Voice Agent',
|
|
151
151
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
152
|
-
model: 'openai/gpt-5.
|
|
152
|
+
model: 'openai/gpt-5.5',
|
|
153
153
|
voice: new GoogleVoice(),
|
|
154
154
|
})
|
|
155
155
|
|
|
@@ -176,7 +176,7 @@ const voiceAgent = new Agent({
|
|
|
176
176
|
id: 'voice-agent',
|
|
177
177
|
name: 'Voice Agent',
|
|
178
178
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
179
|
-
model: 'openai/gpt-5.
|
|
179
|
+
model: 'openai/gpt-5.5',
|
|
180
180
|
voice: new CloudflareVoice(),
|
|
181
181
|
})
|
|
182
182
|
|
|
@@ -203,7 +203,7 @@ const voiceAgent = new Agent({
|
|
|
203
203
|
id: 'voice-agent',
|
|
204
204
|
name: 'Voice Agent',
|
|
205
205
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
206
|
-
model: 'openai/gpt-5.
|
|
206
|
+
model: 'openai/gpt-5.5',
|
|
207
207
|
voice: new DeepgramVoice(),
|
|
208
208
|
})
|
|
209
209
|
|
|
@@ -230,7 +230,7 @@ const voiceAgent = new Agent({
|
|
|
230
230
|
id: 'voice-agent',
|
|
231
231
|
name: 'Voice Agent',
|
|
232
232
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
233
|
-
model: 'openai/gpt-5.
|
|
233
|
+
model: 'openai/gpt-5.5',
|
|
234
234
|
voice: new InworldVoice(),
|
|
235
235
|
})
|
|
236
236
|
|
|
@@ -257,7 +257,7 @@ const voiceAgent = new Agent({
|
|
|
257
257
|
id: 'voice-agent',
|
|
258
258
|
name: 'Voice Agent',
|
|
259
259
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
260
|
-
model: 'openai/gpt-5.
|
|
260
|
+
model: 'openai/gpt-5.5',
|
|
261
261
|
voice: new SpeechifyVoice(),
|
|
262
262
|
})
|
|
263
263
|
|
|
@@ -284,7 +284,7 @@ const voiceAgent = new Agent({
|
|
|
284
284
|
id: 'voice-agent',
|
|
285
285
|
name: 'Voice Agent',
|
|
286
286
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
287
|
-
model: 'openai/gpt-5.
|
|
287
|
+
model: 'openai/gpt-5.5',
|
|
288
288
|
voice: new SarvamVoice(),
|
|
289
289
|
})
|
|
290
290
|
|
|
@@ -311,7 +311,7 @@ const voiceAgent = new Agent({
|
|
|
311
311
|
id: 'voice-agent',
|
|
312
312
|
name: 'Voice Agent',
|
|
313
313
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
314
|
-
model: 'openai/gpt-5.
|
|
314
|
+
model: 'openai/gpt-5.5',
|
|
315
315
|
voice: new MurfVoice(),
|
|
316
316
|
})
|
|
317
317
|
|
|
@@ -346,7 +346,7 @@ const voiceAgent = new Agent({
|
|
|
346
346
|
id: 'voice-agent',
|
|
347
347
|
name: 'Voice Agent',
|
|
348
348
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
349
|
-
model: 'openai/gpt-5.
|
|
349
|
+
model: 'openai/gpt-5.5',
|
|
350
350
|
voice: new OpenAIVoice(),
|
|
351
351
|
})
|
|
352
352
|
|
|
@@ -375,7 +375,7 @@ const voiceAgent = new Agent({
|
|
|
375
375
|
id: 'voice-agent',
|
|
376
376
|
name: 'Voice Agent',
|
|
377
377
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
378
|
-
model: 'openai/gpt-5.
|
|
378
|
+
model: 'openai/gpt-5.5',
|
|
379
379
|
voice: new AzureVoice(),
|
|
380
380
|
})
|
|
381
381
|
|
|
@@ -403,7 +403,7 @@ const voiceAgent = new Agent({
|
|
|
403
403
|
id: 'voice-agent',
|
|
404
404
|
name: 'Voice Agent',
|
|
405
405
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
406
|
-
model: 'openai/gpt-5.
|
|
406
|
+
model: 'openai/gpt-5.5',
|
|
407
407
|
voice: new ElevenLabsVoice(),
|
|
408
408
|
})
|
|
409
409
|
|
|
@@ -431,7 +431,7 @@ const voiceAgent = new Agent({
|
|
|
431
431
|
id: 'voice-agent',
|
|
432
432
|
name: 'Voice Agent',
|
|
433
433
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
434
|
-
model: 'openai/gpt-5.
|
|
434
|
+
model: 'openai/gpt-5.5',
|
|
435
435
|
voice: new GoogleVoice(),
|
|
436
436
|
})
|
|
437
437
|
|
|
@@ -459,7 +459,7 @@ const voiceAgent = new Agent({
|
|
|
459
459
|
id: 'voice-agent',
|
|
460
460
|
name: 'Voice Agent',
|
|
461
461
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
462
|
-
model: 'openai/gpt-5.
|
|
462
|
+
model: 'openai/gpt-5.5',
|
|
463
463
|
voice: new CloudflareVoice(),
|
|
464
464
|
})
|
|
465
465
|
|
|
@@ -487,7 +487,7 @@ const voiceAgent = new Agent({
|
|
|
487
487
|
id: 'voice-agent',
|
|
488
488
|
name: 'Voice Agent',
|
|
489
489
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
490
|
-
model: 'openai/gpt-5.
|
|
490
|
+
model: 'openai/gpt-5.5',
|
|
491
491
|
voice: new DeepgramVoice(),
|
|
492
492
|
})
|
|
493
493
|
|
|
@@ -515,7 +515,7 @@ const voiceAgent = new Agent({
|
|
|
515
515
|
id: 'voice-agent',
|
|
516
516
|
name: 'Voice Agent',
|
|
517
517
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
518
|
-
model: 'openai/gpt-5.
|
|
518
|
+
model: 'openai/gpt-5.5',
|
|
519
519
|
voice: new InworldVoice(),
|
|
520
520
|
})
|
|
521
521
|
|
|
@@ -543,7 +543,7 @@ const voiceAgent = new Agent({
|
|
|
543
543
|
id: 'voice-agent',
|
|
544
544
|
name: 'Voice Agent',
|
|
545
545
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
546
|
-
model: 'openai/gpt-5.
|
|
546
|
+
model: 'openai/gpt-5.5',
|
|
547
547
|
voice: new SarvamVoice(),
|
|
548
548
|
})
|
|
549
549
|
|
|
@@ -575,7 +575,7 @@ const voiceAgent = new Agent({
|
|
|
575
575
|
id: 'voice-agent',
|
|
576
576
|
name: 'Voice Agent',
|
|
577
577
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
578
|
-
model: 'openai/gpt-5.
|
|
578
|
+
model: 'openai/gpt-5.5',
|
|
579
579
|
voice: new OpenAIRealtimeVoice(),
|
|
580
580
|
})
|
|
581
581
|
|
|
@@ -605,7 +605,7 @@ const voiceAgent = new Agent({
|
|
|
605
605
|
id: 'voice-agent',
|
|
606
606
|
name: 'Voice Agent',
|
|
607
607
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
608
|
-
model: 'openai/gpt-5.
|
|
608
|
+
model: 'openai/gpt-5.5',
|
|
609
609
|
voice: new GeminiLiveVoice({
|
|
610
610
|
// Live API mode
|
|
611
611
|
apiKey: process.env.GOOGLE_API_KEY,
|
|
@@ -654,7 +654,7 @@ const voiceAgent = new Agent({
|
|
|
654
654
|
id: 'voice-agent',
|
|
655
655
|
name: 'Voice Agent',
|
|
656
656
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
657
|
-
model: 'openai/gpt-5.
|
|
657
|
+
model: 'openai/gpt-5.5',
|
|
658
658
|
voice: new NovaSonicVoice({
|
|
659
659
|
region: 'us-east-1',
|
|
660
660
|
speaker: 'matthew',
|
|
@@ -697,7 +697,7 @@ const voiceAgent = new Agent({
|
|
|
697
697
|
id: 'voice-agent',
|
|
698
698
|
name: 'Voice Agent',
|
|
699
699
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
700
|
-
model: 'openai/gpt-5.
|
|
700
|
+
model: 'openai/gpt-5.5',
|
|
701
701
|
voice: new InworldRealtimeVoice({
|
|
702
702
|
apiKey: process.env.INWORLD_API_KEY,
|
|
703
703
|
model: 'inworld/models/gemma-4-26b-a4b-it',
|
|
@@ -1132,7 +1132,7 @@ const voiceAgent = new Agent({
|
|
|
1132
1132
|
id: 'aisdk-voice-agent',
|
|
1133
1133
|
name: 'AI SDK Voice Agent',
|
|
1134
1134
|
instructions: 'You are a helpful assistant with voice capabilities.',
|
|
1135
|
-
model: 'openai/gpt-5.
|
|
1135
|
+
model: 'openai/gpt-5.5',
|
|
1136
1136
|
voice,
|
|
1137
1137
|
})
|
|
1138
1138
|
```
|
|
@@ -32,7 +32,7 @@ const agent = new Agent({
|
|
|
32
32
|
id: 'agent',
|
|
33
33
|
name: 'OpenAI Realtime Agent',
|
|
34
34
|
instructions: `You are a helpful assistant with real-time voice capabilities.`,
|
|
35
|
-
model: 'openai/gpt-5.
|
|
35
|
+
model: 'openai/gpt-5.5',
|
|
36
36
|
voice: new OpenAIRealtimeVoice(),
|
|
37
37
|
})
|
|
38
38
|
|
|
@@ -66,7 +66,7 @@ const agent = new Agent({
|
|
|
66
66
|
name: 'Gemini Live Agent',
|
|
67
67
|
instructions: 'You are a helpful assistant with real-time voice capabilities.',
|
|
68
68
|
// Model used for text generation; voice provider handles realtime audio
|
|
69
|
-
model: 'openai/gpt-5.
|
|
69
|
+
model: 'openai/gpt-5.5',
|
|
70
70
|
voice: new GeminiLiveVoice({
|
|
71
71
|
apiKey: process.env.GOOGLE_API_KEY,
|
|
72
72
|
model: 'gemini-2.0-flash-exp',
|
|
@@ -113,7 +113,7 @@ const agent = new Agent({
|
|
|
113
113
|
name: 'Nova Sonic Agent',
|
|
114
114
|
instructions: 'You are a helpful assistant with real-time voice capabilities.',
|
|
115
115
|
// Model used for text generation; voice provider handles realtime audio
|
|
116
|
-
model: 'openai/gpt-5.
|
|
116
|
+
model: 'openai/gpt-5.5',
|
|
117
117
|
voice: new NovaSonicVoice({
|
|
118
118
|
region: 'us-east-1',
|
|
119
119
|
speaker: 'matthew',
|
|
@@ -157,7 +157,7 @@ const agent = new Agent({
|
|
|
157
157
|
name: 'Inworld Realtime Agent',
|
|
158
158
|
instructions: 'You are a helpful assistant with real-time voice capabilities.',
|
|
159
159
|
// Model used for text generation; voice provider handles realtime audio
|
|
160
|
-
model: 'openai/gpt-5.
|
|
160
|
+
model: 'openai/gpt-5.5',
|
|
161
161
|
voice: new InworldRealtimeVoice({
|
|
162
162
|
apiKey: process.env.INWORLD_API_KEY,
|
|
163
163
|
model: 'inworld/models/gemma-4-26b-a4b-it',
|
|
@@ -54,7 +54,7 @@ await voice.send(microphoneStream)
|
|
|
54
54
|
voice.close()
|
|
55
55
|
```
|
|
56
56
|
|
|
57
|
-
> Inworld API keys ship pre-Basic-encoded. Paste them verbatim into `INWORLD_API_KEY`; the package
|
|
57
|
+
> Inworld API keys ship pre-Basic-encoded. Paste them verbatim into `INWORLD_API_KEY`; the package doesn't re-encode them.
|
|
58
58
|
|
|
59
59
|
## Constructor parameters
|
|
60
60
|
|
|
@@ -116,7 +116,7 @@ Use the typed `session` field for documented Inworld realtime options. Fields co
|
|
|
116
116
|
|
|
117
117
|
### `providerData` (Inworld extensions)
|
|
118
118
|
|
|
119
|
-
`providerData` is a typed object for Inworld-specific realtime extensions. It
|
|
119
|
+
`providerData` is a typed object for Inworld-specific realtime extensions. It's sent under `session.providerData` on every `session.update`, and composes with any `session.providerData` you set via the `session` field — the constructor `providerData` wins on key collisions.
|
|
120
120
|
|
|
121
121
|
It has five branches plus two session-level fields:
|
|
122
122
|
|
|
@@ -342,12 +342,12 @@ Any voice ID from [Inworld's voice catalog](https://docs.inworld.ai/quickstart-t
|
|
|
342
342
|
|
|
343
343
|
## Notes
|
|
344
344
|
|
|
345
|
-
- API keys can be provided via constructor options or the `INWORLD_API_KEY` environment variable. Keys are pre-Basic-encoded;
|
|
345
|
+
- API keys can be provided via constructor options or the `INWORLD_API_KEY` environment variable. Keys are pre-Basic-encoded; don't re-encode them.
|
|
346
346
|
- The WebSocket URL appends `?key=<sessionId>&protocol=realtime`. The model is configured via the initial `session.update`, not the URL.
|
|
347
|
-
- Per-call `speak(input, { speaker })` scopes the voice override to a single response (via the flat `response.voice` field) and
|
|
347
|
+
- Per-call `speak(input, { speaker })` scopes the voice override to a single response (via the flat `response.voice` field) and doesn't mutate the session.
|
|
348
348
|
- Audio output defaults to PCM16 at 24 kHz. Telephony `audio/pcmu` and `audio/pcma` at 8 kHz, and `audio/float32`, are also supported via `session.audio.output.format`.
|
|
349
349
|
- Use `connect()` before any send, speak, or listen call. Events sent before the WebSocket is open are queued and flushed once the server acknowledges `session.updated`.
|
|
350
350
|
- The voice instance must be closed with `close()` or `disconnect()` to release the WebSocket.
|
|
351
|
-
- `audio.input.turn_detection` defaults to semantic VAD when `session`
|
|
351
|
+
- `audio.input.turn_detection` defaults to semantic VAD when `session` doesn't supply it. Override with your own object, or pass `null` to disable turn detection entirely.
|
|
352
352
|
- `audio.input.transcription` defaults to `{ model: 'inworld/inworld-stt-1' }`, so user-side `writing` events fire out of the box. Override with your own object, or pass `null` to disable user-side transcription.
|
|
353
353
|
- `on()` and `off()` are typed against `InworldVoiceEventMap` — known event names yield a typed callback payload, unknown names fall back to `unknown`.
|
|
@@ -130,6 +130,6 @@ const speakers = await voice.getSpeakers()
|
|
|
130
130
|
## Notes
|
|
131
131
|
|
|
132
132
|
- The TTS endpoint uses progressive NDJSON streaming, so audio playback can begin before the full response is received.
|
|
133
|
-
- An API key can be provided via the `speechModel` or `listeningModel` config, or the `INWORLD_API_KEY` environment variable. TTS and STT keys are resolved independently: passing distinct `speechModel.apiKey` and `listeningModel.apiKey` values lets each service use its own credential. If only one is provided, it
|
|
133
|
+
- An API key can be provided via the `speechModel` or `listeningModel` config, or the `INWORLD_API_KEY` environment variable. TTS and STT keys are resolved independently: passing distinct `speechModel.apiKey` and `listeningModel.apiKey` values lets each service use its own credential. If only one is provided, it's reused for both services as a fallback before the env var.
|
|
134
134
|
- `inworld-tts-2` is the default flagship model. Use `deliveryMode` (`STABLE` | `BALANCED` | `CREATIVE`) to steer delivery style on this model. The `temperature` option is ignored on `inworld-tts-2`.
|
|
135
135
|
- The `inworld-tts-1.5-mini` model offers lower latency at the cost of reduced voice quality compared to `inworld-tts-1.5-max`.
|
package/dist/index.cjs
CHANGED
|
@@ -7,7 +7,7 @@ var zodToJsonSchema = require('zod-to-json-schema');
|
|
|
7
7
|
|
|
8
8
|
// src/index.ts
|
|
9
9
|
|
|
10
|
-
// ../../packages/_internal-core/dist/chunk-
|
|
10
|
+
// ../../packages/_internal-core/dist/chunk-3M4SEWMI.js
|
|
11
11
|
var RegisteredLogger = {
|
|
12
12
|
LLM: "LLM"};
|
|
13
13
|
var LogLevel = {
|
|
@@ -104,7 +104,7 @@ var ConsoleLogger = class _ConsoleLogger extends MastraLogger {
|
|
|
104
104
|
}
|
|
105
105
|
warn(message, ...args) {
|
|
106
106
|
if ((this.level === LogLevel.WARN || this.level === LogLevel.INFO || this.level === LogLevel.DEBUG) && this.shouldLog(LogLevel.WARN, message, args)) {
|
|
107
|
-
console.
|
|
107
|
+
console.warn(`${this.prefix()}${message}`, ...args);
|
|
108
108
|
}
|
|
109
109
|
}
|
|
110
110
|
error(message, ...args) {
|