@mastra/voice-inworld 0.3.0-alpha.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,74 @@
|
|
|
1
1
|
# @mastra/voice-inworld
|
|
2
2
|
|
|
3
|
+
## 0.3.0
|
|
4
|
+
|
|
5
|
+
### Minor Changes
|
|
6
|
+
|
|
7
|
+
- `@mastra/voice-inworld` now ships `InworldRealtimeVoice` for full-duplex realtime voice — mic in, speakers out, server-side LLM routing, semantic VAD turn-taking, tool calling, barge-in, and live transcripts of both sides — alongside the existing streaming TTS and batch STT. No separate package needed; import both from the same entry point. ([#16865](https://github.com/mastra-ai/mastra/pull/16865))
|
|
8
|
+
|
|
9
|
+
```typescript
|
|
10
|
+
// Batch TTS / STT (unchanged)
|
|
11
|
+
import { InworldVoice } from '@mastra/voice-inworld';
|
|
12
|
+
|
|
13
|
+
// New: realtime full-duplex voice, from the same package
|
|
14
|
+
import { InworldRealtimeVoice } from '@mastra/voice-inworld';
|
|
15
|
+
|
|
16
|
+
const voice = new InworldRealtimeVoice({
|
|
17
|
+
apiKey: process.env.INWORLD_API_KEY,
|
|
18
|
+
// Defaults: model 'inworld/models/gemma-4-26b-a4b-it', speaker 'Sarah',
|
|
19
|
+
// STT 'inworld/inworld-stt-1', semantic-VAD turn detection.
|
|
20
|
+
});
|
|
21
|
+
|
|
22
|
+
await voice.connect();
|
|
23
|
+
voice.on('speaker', stream => playAudio(stream)); // PCM16 @ 24kHz
|
|
24
|
+
voice.on('writing', ({ text, role }) => console.log(role, text));
|
|
25
|
+
voice.on('interrupted', ({ response_id }) => stopAudio(response_id));
|
|
26
|
+
await voice.send(getMicrophoneStream());
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
**Typed `providerData` for Inworld realtime extensions**
|
|
30
|
+
|
|
31
|
+
`InworldRealtimeVoice` now accepts a typed `providerData` object for Inworld-specific extensions — STT tuning, TTS segmentation and steering, automatic memory, back-channel, and responsiveness — sent under `session.providerData`. The provider also surfaces inbound extension data: a `voiceProfile` on user `writing` events, a `memory` event for the rolling summary/facts state, and `backchannel` / `backchannel.done` / `backchannel.skipped` events for back-channel audio.
|
|
32
|
+
|
|
33
|
+
```typescript
|
|
34
|
+
const voice = new InworldRealtimeVoice({
|
|
35
|
+
providerData: {
|
|
36
|
+
stt: { voice_profile: true, language_hints: ['en-US'] },
|
|
37
|
+
tts: { delivery_mode: 'CREATIVE', segmenter_strategy: 'balanced' },
|
|
38
|
+
memory: { enabled: true, turn_interval: 4 },
|
|
39
|
+
backchannel: { enabled: true, max_per_turn: 1 },
|
|
40
|
+
},
|
|
41
|
+
});
|
|
42
|
+
|
|
43
|
+
voice.on('memory', state => console.log(state.summary, state.facts));
|
|
44
|
+
voice.on('backchannel', stream => playAudio(stream));
|
|
45
|
+
voice.on('writing', ({ role, voiceProfile }) => console.log(role, voiceProfile?.emotion));
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
**Realtime fixes and additions**
|
|
49
|
+
- Fixed the per-call `speak(text, { speaker })` voice override. It is now sent as the flat `response.voice` field, so the per-call speaker is no longer silently ignored by the server.
|
|
50
|
+
- Added manual turn-taking methods `commitInput()`, `clearInput()`, and `clearOutput()` for push-to-talk and manual turn control (use `clearOutput()` only to hard-stop all playback — it also stops in-flight back-channels).
|
|
51
|
+
- Added smart-turn and playback-state events: `turn-suggestion`, `turn-suggestion-revoked`, `input-committed`, `input-cleared`, `input-timeout`, and `output-audio-started` / `output-audio-stopped` / `output-audio-cleared`.
|
|
52
|
+
- Added richer typed session config: input noise reduction, telephony (8 kHz) and float32 audio formats, a server-VAD `idle_timeout_ms`, plus `tracing`, `include`, and `prompt`.
|
|
53
|
+
|
|
54
|
+
```typescript
|
|
55
|
+
// Push-to-talk with no auto-VAD
|
|
56
|
+
const voice = new InworldRealtimeVoice({
|
|
57
|
+
session: { audio: { input: { turn_detection: null } } },
|
|
58
|
+
});
|
|
59
|
+
|
|
60
|
+
await voice.send(getMicrophoneStream());
|
|
61
|
+
voice.commitInput(); // end the user turn manually
|
|
62
|
+
|
|
63
|
+
voice.on('output-audio-stopped', () => console.log('playback finished'));
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Patch Changes
|
|
67
|
+
|
|
68
|
+
- Moved shared voice primitives and route metadata into the new `@internal/voice` package so voice providers no longer depend on `@mastra/core` and server voice routes share the same route definitions. ([#16725](https://github.com/mastra-ai/mastra/pull/16725))
|
|
69
|
+
|
|
70
|
+
`@mastra/core/voice` continues to re-export the voice APIs for backwards compatibility.
|
|
71
|
+
|
|
3
72
|
## 0.3.0-alpha.1
|
|
4
73
|
|
|
5
74
|
### Minor Changes
|
package/dist/docs/SKILL.md
CHANGED
|
@@ -3,7 +3,7 @@ name: mastra-voice-inworld
|
|
|
3
3
|
description: Documentation for @mastra/voice-inworld. Use when working with @mastra/voice-inworld APIs, configuration, or implementation.
|
|
4
4
|
metadata:
|
|
5
5
|
package: "@mastra/voice-inworld"
|
|
6
|
-
version: "0.3.0-alpha.1"
|
|
6
|
+
version: "0.3.0"
|
|
7
7
|
---
|
|
8
8
|
|
|
9
9
|
## When to use
|
|
@@ -16,7 +16,7 @@ const voiceAgent = new Agent({
|
|
|
16
16
|
id: 'voice-agent',
|
|
17
17
|
name: 'Voice Agent',
|
|
18
18
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
19
|
-
model: 'openai/gpt-5.
|
|
19
|
+
model: 'openai/gpt-5.5',
|
|
20
20
|
voice: new OpenAIVoice(),
|
|
21
21
|
})
|
|
22
22
|
```
|
|
@@ -40,7 +40,7 @@ const voiceAgent = new Agent({
|
|
|
40
40
|
id: 'voice-agent',
|
|
41
41
|
name: 'Voice Agent',
|
|
42
42
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
43
|
-
model: 'openai/gpt-5.
|
|
43
|
+
model: 'openai/gpt-5.5',
|
|
44
44
|
voice: new OpenAIVoice(),
|
|
45
45
|
})
|
|
46
46
|
|
|
@@ -68,7 +68,7 @@ const voiceAgent = new Agent({
|
|
|
68
68
|
id: 'voice-agent',
|
|
69
69
|
name: 'Voice Agent',
|
|
70
70
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
71
|
-
model: 'openai/gpt-5.
|
|
71
|
+
model: 'openai/gpt-5.5',
|
|
72
72
|
voice: new AzureVoice(),
|
|
73
73
|
})
|
|
74
74
|
|
|
@@ -95,7 +95,7 @@ const voiceAgent = new Agent({
|
|
|
95
95
|
id: 'voice-agent',
|
|
96
96
|
name: 'Voice Agent',
|
|
97
97
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
98
|
-
model: 'openai/gpt-5.
|
|
98
|
+
model: 'openai/gpt-5.5',
|
|
99
99
|
voice: new ElevenLabsVoice(),
|
|
100
100
|
})
|
|
101
101
|
|
|
@@ -122,7 +122,7 @@ const voiceAgent = new Agent({
|
|
|
122
122
|
id: 'voice-agent',
|
|
123
123
|
name: 'Voice Agent',
|
|
124
124
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
125
|
-
model: 'openai/gpt-5.
|
|
125
|
+
model: 'openai/gpt-5.5',
|
|
126
126
|
voice: new PlayAIVoice(),
|
|
127
127
|
})
|
|
128
128
|
|
|
@@ -149,7 +149,7 @@ const voiceAgent = new Agent({
|
|
|
149
149
|
id: 'voice-agent',
|
|
150
150
|
name: 'Voice Agent',
|
|
151
151
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
152
|
-
model: 'openai/gpt-5.
|
|
152
|
+
model: 'openai/gpt-5.5',
|
|
153
153
|
voice: new GoogleVoice(),
|
|
154
154
|
})
|
|
155
155
|
|
|
@@ -176,7 +176,7 @@ const voiceAgent = new Agent({
|
|
|
176
176
|
id: 'voice-agent',
|
|
177
177
|
name: 'Voice Agent',
|
|
178
178
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
179
|
-
model: 'openai/gpt-5.
|
|
179
|
+
model: 'openai/gpt-5.5',
|
|
180
180
|
voice: new CloudflareVoice(),
|
|
181
181
|
})
|
|
182
182
|
|
|
@@ -203,7 +203,7 @@ const voiceAgent = new Agent({
|
|
|
203
203
|
id: 'voice-agent',
|
|
204
204
|
name: 'Voice Agent',
|
|
205
205
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
206
|
-
model: 'openai/gpt-5.
|
|
206
|
+
model: 'openai/gpt-5.5',
|
|
207
207
|
voice: new DeepgramVoice(),
|
|
208
208
|
})
|
|
209
209
|
|
|
@@ -230,7 +230,7 @@ const voiceAgent = new Agent({
|
|
|
230
230
|
id: 'voice-agent',
|
|
231
231
|
name: 'Voice Agent',
|
|
232
232
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
233
|
-
model: 'openai/gpt-5.
|
|
233
|
+
model: 'openai/gpt-5.5',
|
|
234
234
|
voice: new InworldVoice(),
|
|
235
235
|
})
|
|
236
236
|
|
|
@@ -257,7 +257,7 @@ const voiceAgent = new Agent({
|
|
|
257
257
|
id: 'voice-agent',
|
|
258
258
|
name: 'Voice Agent',
|
|
259
259
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
260
|
-
model: 'openai/gpt-5.
|
|
260
|
+
model: 'openai/gpt-5.5',
|
|
261
261
|
voice: new SpeechifyVoice(),
|
|
262
262
|
})
|
|
263
263
|
|
|
@@ -284,7 +284,7 @@ const voiceAgent = new Agent({
|
|
|
284
284
|
id: 'voice-agent',
|
|
285
285
|
name: 'Voice Agent',
|
|
286
286
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
287
|
-
model: 'openai/gpt-5.
|
|
287
|
+
model: 'openai/gpt-5.5',
|
|
288
288
|
voice: new SarvamVoice(),
|
|
289
289
|
})
|
|
290
290
|
|
|
@@ -311,7 +311,7 @@ const voiceAgent = new Agent({
|
|
|
311
311
|
id: 'voice-agent',
|
|
312
312
|
name: 'Voice Agent',
|
|
313
313
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
314
|
-
model: 'openai/gpt-5.
|
|
314
|
+
model: 'openai/gpt-5.5',
|
|
315
315
|
voice: new MurfVoice(),
|
|
316
316
|
})
|
|
317
317
|
|
|
@@ -346,7 +346,7 @@ const voiceAgent = new Agent({
|
|
|
346
346
|
id: 'voice-agent',
|
|
347
347
|
name: 'Voice Agent',
|
|
348
348
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
349
|
-
model: 'openai/gpt-5.
|
|
349
|
+
model: 'openai/gpt-5.5',
|
|
350
350
|
voice: new OpenAIVoice(),
|
|
351
351
|
})
|
|
352
352
|
|
|
@@ -375,7 +375,7 @@ const voiceAgent = new Agent({
|
|
|
375
375
|
id: 'voice-agent',
|
|
376
376
|
name: 'Voice Agent',
|
|
377
377
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
378
|
-
model: 'openai/gpt-5.
|
|
378
|
+
model: 'openai/gpt-5.5',
|
|
379
379
|
voice: new AzureVoice(),
|
|
380
380
|
})
|
|
381
381
|
|
|
@@ -403,7 +403,7 @@ const voiceAgent = new Agent({
|
|
|
403
403
|
id: 'voice-agent',
|
|
404
404
|
name: 'Voice Agent',
|
|
405
405
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
406
|
-
model: 'openai/gpt-5.
|
|
406
|
+
model: 'openai/gpt-5.5',
|
|
407
407
|
voice: new ElevenLabsVoice(),
|
|
408
408
|
})
|
|
409
409
|
|
|
@@ -431,7 +431,7 @@ const voiceAgent = new Agent({
|
|
|
431
431
|
id: 'voice-agent',
|
|
432
432
|
name: 'Voice Agent',
|
|
433
433
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
434
|
-
model: 'openai/gpt-5.
|
|
434
|
+
model: 'openai/gpt-5.5',
|
|
435
435
|
voice: new GoogleVoice(),
|
|
436
436
|
})
|
|
437
437
|
|
|
@@ -459,7 +459,7 @@ const voiceAgent = new Agent({
|
|
|
459
459
|
id: 'voice-agent',
|
|
460
460
|
name: 'Voice Agent',
|
|
461
461
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
462
|
-
model: 'openai/gpt-5.
|
|
462
|
+
model: 'openai/gpt-5.5',
|
|
463
463
|
voice: new CloudflareVoice(),
|
|
464
464
|
})
|
|
465
465
|
|
|
@@ -487,7 +487,7 @@ const voiceAgent = new Agent({
|
|
|
487
487
|
id: 'voice-agent',
|
|
488
488
|
name: 'Voice Agent',
|
|
489
489
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
490
|
-
model: 'openai/gpt-5.
|
|
490
|
+
model: 'openai/gpt-5.5',
|
|
491
491
|
voice: new DeepgramVoice(),
|
|
492
492
|
})
|
|
493
493
|
|
|
@@ -515,7 +515,7 @@ const voiceAgent = new Agent({
|
|
|
515
515
|
id: 'voice-agent',
|
|
516
516
|
name: 'Voice Agent',
|
|
517
517
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
518
|
-
model: 'openai/gpt-5.
|
|
518
|
+
model: 'openai/gpt-5.5',
|
|
519
519
|
voice: new InworldVoice(),
|
|
520
520
|
})
|
|
521
521
|
|
|
@@ -543,7 +543,7 @@ const voiceAgent = new Agent({
|
|
|
543
543
|
id: 'voice-agent',
|
|
544
544
|
name: 'Voice Agent',
|
|
545
545
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
546
|
-
model: 'openai/gpt-5.
|
|
546
|
+
model: 'openai/gpt-5.5',
|
|
547
547
|
voice: new SarvamVoice(),
|
|
548
548
|
})
|
|
549
549
|
|
|
@@ -575,7 +575,7 @@ const voiceAgent = new Agent({
|
|
|
575
575
|
id: 'voice-agent',
|
|
576
576
|
name: 'Voice Agent',
|
|
577
577
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
578
|
-
model: 'openai/gpt-5.
|
|
578
|
+
model: 'openai/gpt-5.5',
|
|
579
579
|
voice: new OpenAIRealtimeVoice(),
|
|
580
580
|
})
|
|
581
581
|
|
|
@@ -605,7 +605,7 @@ const voiceAgent = new Agent({
|
|
|
605
605
|
id: 'voice-agent',
|
|
606
606
|
name: 'Voice Agent',
|
|
607
607
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
608
|
-
model: 'openai/gpt-5.
|
|
608
|
+
model: 'openai/gpt-5.5',
|
|
609
609
|
voice: new GeminiLiveVoice({
|
|
610
610
|
// Live API mode
|
|
611
611
|
apiKey: process.env.GOOGLE_API_KEY,
|
|
@@ -654,7 +654,7 @@ const voiceAgent = new Agent({
|
|
|
654
654
|
id: 'voice-agent',
|
|
655
655
|
name: 'Voice Agent',
|
|
656
656
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
657
|
-
model: 'openai/gpt-5.
|
|
657
|
+
model: 'openai/gpt-5.5',
|
|
658
658
|
voice: new NovaSonicVoice({
|
|
659
659
|
region: 'us-east-1',
|
|
660
660
|
speaker: 'matthew',
|
|
@@ -697,7 +697,7 @@ const voiceAgent = new Agent({
|
|
|
697
697
|
id: 'voice-agent',
|
|
698
698
|
name: 'Voice Agent',
|
|
699
699
|
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
700
|
-
model: 'openai/gpt-5.
|
|
700
|
+
model: 'openai/gpt-5.5',
|
|
701
701
|
voice: new InworldRealtimeVoice({
|
|
702
702
|
apiKey: process.env.INWORLD_API_KEY,
|
|
703
703
|
model: 'inworld/models/gemma-4-26b-a4b-it',
|
|
@@ -1132,7 +1132,7 @@ const voiceAgent = new Agent({
|
|
|
1132
1132
|
id: 'aisdk-voice-agent',
|
|
1133
1133
|
name: 'AI SDK Voice Agent',
|
|
1134
1134
|
instructions: 'You are a helpful assistant with voice capabilities.',
|
|
1135
|
-
model: 'openai/gpt-5.
|
|
1135
|
+
model: 'openai/gpt-5.5',
|
|
1136
1136
|
voice,
|
|
1137
1137
|
})
|
|
1138
1138
|
```
|
|
@@ -32,7 +32,7 @@ const agent = new Agent({
|
|
|
32
32
|
id: 'agent',
|
|
33
33
|
name: 'OpenAI Realtime Agent',
|
|
34
34
|
instructions: `You are a helpful assistant with real-time voice capabilities.`,
|
|
35
|
-
model: 'openai/gpt-5.
|
|
35
|
+
model: 'openai/gpt-5.5',
|
|
36
36
|
voice: new OpenAIRealtimeVoice(),
|
|
37
37
|
})
|
|
38
38
|
|
|
@@ -66,7 +66,7 @@ const agent = new Agent({
|
|
|
66
66
|
name: 'Gemini Live Agent',
|
|
67
67
|
instructions: 'You are a helpful assistant with real-time voice capabilities.',
|
|
68
68
|
// Model used for text generation; voice provider handles realtime audio
|
|
69
|
-
model: 'openai/gpt-5.
|
|
69
|
+
model: 'openai/gpt-5.5',
|
|
70
70
|
voice: new GeminiLiveVoice({
|
|
71
71
|
apiKey: process.env.GOOGLE_API_KEY,
|
|
72
72
|
model: 'gemini-2.0-flash-exp',
|
|
@@ -113,7 +113,7 @@ const agent = new Agent({
|
|
|
113
113
|
name: 'Nova Sonic Agent',
|
|
114
114
|
instructions: 'You are a helpful assistant with real-time voice capabilities.',
|
|
115
115
|
// Model used for text generation; voice provider handles realtime audio
|
|
116
|
-
model: 'openai/gpt-5.
|
|
116
|
+
model: 'openai/gpt-5.5',
|
|
117
117
|
voice: new NovaSonicVoice({
|
|
118
118
|
region: 'us-east-1',
|
|
119
119
|
speaker: 'matthew',
|
|
@@ -157,7 +157,7 @@ const agent = new Agent({
|
|
|
157
157
|
name: 'Inworld Realtime Agent',
|
|
158
158
|
instructions: 'You are a helpful assistant with real-time voice capabilities.',
|
|
159
159
|
// Model used for text generation; voice provider handles realtime audio
|
|
160
|
-
model: 'openai/gpt-5.
|
|
160
|
+
model: 'openai/gpt-5.5',
|
|
161
161
|
voice: new InworldRealtimeVoice({
|
|
162
162
|
apiKey: process.env.INWORLD_API_KEY,
|
|
163
163
|
model: 'inworld/models/gemma-4-26b-a4b-it',
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mastra/voice-inworld",
|
|
3
|
-
"version": "0.3.0-alpha.1",
|
|
3
|
+
"version": "0.3.0",
|
|
4
4
|
"description": "Mastra Inworld AI voice integration — streaming TTS, batch STT, and realtime full-duplex voice",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"files": [
|
|
@@ -37,9 +37,9 @@
|
|
|
37
37
|
"typescript": "^6.0.3",
|
|
38
38
|
"vitest": "4.1.5",
|
|
39
39
|
"zod": "^4.4.3",
|
|
40
|
-
"@internal/lint": "0.0.
|
|
41
|
-
"@internal/types-builder": "0.0.
|
|
42
|
-
"@internal/voice": "0.0.
|
|
40
|
+
"@internal/lint": "0.0.100",
|
|
41
|
+
"@internal/types-builder": "0.0.75",
|
|
42
|
+
"@internal/voice": "0.0.1"
|
|
43
43
|
},
|
|
44
44
|
"peerDependencies": {
|
|
45
45
|
"zod": "^3.25.0 || ^4.0.0"
|