@mastra/voice-inworld 0.0.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +23 -0
- package/LICENSE.md +30 -0
- package/dist/docs/SKILL.md +26 -0
- package/dist/docs/assets/SOURCE_MAP.json +6 -0
- package/dist/docs/references/docs-voice-overview.md +1112 -0
- package/dist/docs/references/reference-voice-inworld.md +133 -0
- package/package.json +15 -16
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
# Inworld
|
|
2
|
+
|
|
3
|
+
The Inworld voice implementation in Mastra provides streaming text-to-speech (TTS) and batch speech-to-text (STT) capabilities using Inworld AI's API. It supports multiple TTS and STT models, configurable audio encodings, and progressive audio streaming.
|
|
4
|
+
|
|
5
|
+
## Usage example
|
|
6
|
+
|
|
7
|
+
```typescript
|
|
8
|
+
import { InworldVoice } from '@mastra/voice-inworld'
|
|
9
|
+
|
|
10
|
+
// Initialize with default configuration (uses INWORLD_API_KEY environment variable)
|
|
11
|
+
const voice = new InworldVoice()
|
|
12
|
+
|
|
13
|
+
// Initialize with custom configuration
|
|
14
|
+
const voice = new InworldVoice({
|
|
15
|
+
speechModel: {
|
|
16
|
+
name: 'inworld-tts-2',
|
|
17
|
+
apiKey: 'your-api-key',
|
|
18
|
+
},
|
|
19
|
+
listeningModel: {
|
|
20
|
+
name: 'groq/whisper-large-v3',
|
|
21
|
+
apiKey: 'your-api-key',
|
|
22
|
+
},
|
|
23
|
+
speaker: 'Dennis',
|
|
24
|
+
})
|
|
25
|
+
|
|
26
|
+
// Text-to-Speech (streaming)
|
|
27
|
+
const audioStream = await voice.speak('Hello, world!')
|
|
28
|
+
|
|
29
|
+
// Speech-to-Text
|
|
30
|
+
const transcript = await voice.listen(audioStream)
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Constructor parameters
|
|
34
|
+
|
|
35
|
+
**speechModel** (`InworldVoiceConfig`): Configuration for text-to-speech functionality. (Default: `{ name: 'inworld-tts-2' }`)
|
|
36
|
+
|
|
37
|
+
**speechModel.name** (`'inworld-tts-2' | 'inworld-tts-1.5-max' | 'inworld-tts-1.5-mini'`): The Inworld TTS model to use.
|
|
38
|
+
|
|
39
|
+
**speechModel.apiKey** (`string`): Inworld API key. Falls back to the `INWORLD_API_KEY` environment variable.
|
|
40
|
+
|
|
41
|
+
**listeningModel** (`InworldListeningConfig`): Configuration for speech-to-text functionality. (Default: `{ name: 'groq/whisper-large-v3' }`)
|
|
42
|
+
|
|
43
|
+
**listeningModel.name** (`'groq/whisper-large-v3'`): The Inworld STT model to use.
|
|
44
|
+
|
|
45
|
+
**listeningModel.apiKey** (`string`): Inworld API key. Falls back to the `INWORLD_API_KEY` environment variable.
|
|
46
|
+
|
|
47
|
+
**speaker** (`string`): Default voice ID to use for text-to-speech. (Default: `'Dennis'`)
|
|
48
|
+
|
|
49
|
+
**audioEncoding** (`'LINEAR16' | 'MP3' | 'OGG_OPUS' | 'ALAW' | 'MULAW' | 'FLAC' | 'PCM' | 'WAV'`): Default audio encoding for TTS output. (Default: `'MP3'`)
|
|
50
|
+
|
|
51
|
+
**sampleRateHertz** (`number`): Default sample rate for TTS output. (Default: `48000`)
|
|
52
|
+
|
|
53
|
+
**language** (`string`): Default BCP-47 language code for STT. (Default: `'en-US'`)
|
|
54
|
+
|
|
55
|
+
## Methods
|
|
56
|
+
|
|
57
|
+
### `speak(input, options?)`
|
|
58
|
+
|
|
59
|
+
Converts text to speech using Inworld's streaming TTS endpoint. Returns a readable stream that emits audio chunks progressively as they arrive.
|
|
60
|
+
|
|
61
|
+
```typescript
|
|
62
|
+
const audioStream = await voice.speak('Hello, world!', {
|
|
63
|
+
speaker: 'Olivia',
|
|
64
|
+
audioEncoding: 'WAV',
|
|
65
|
+
sampleRateHertz: 24000,
|
|
66
|
+
speakingRate: 1.2,
|
|
67
|
+
temperature: 0.8,
|
|
68
|
+
})
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
**input** (`string | NodeJS.ReadableStream`): Text to convert to speech. If a stream is provided, it will be converted to text first.
|
|
72
|
+
|
|
73
|
+
**options** (`InworldSpeakOptions`): Additional options for speech synthesis.
|
|
74
|
+
|
|
75
|
+
**options.speaker** (`string`): Override the default speaker for this request.
|
|
76
|
+
|
|
77
|
+
**options.audioEncoding** (`AudioEncoding`): Override the default audio encoding.
|
|
78
|
+
|
|
79
|
+
**options.sampleRateHertz** (`number`): Override the default sample rate.
|
|
80
|
+
|
|
81
|
+
**options.speakingRate** (`number`): Adjust the speaking rate.
|
|
82
|
+
|
|
83
|
+
**options.temperature** (`number`): Controls voice variability. Honored on `inworld-tts-1.5-*` models; ignored by `inworld-tts-2`.
|
|
84
|
+
|
|
85
|
+
**options.deliveryMode** (`'STABLE' | 'BALANCED' | 'CREATIVE'`): Steering control for delivery style. Only honored by `inworld-tts-2`.
|
|
86
|
+
|
|
87
|
+
**options.language** (`string`): BCP-47 language code for this request. Auto-detected when omitted.
|
|
88
|
+
|
|
89
|
+
**Returns:** `Promise<NodeJS.ReadableStream>`
|
|
90
|
+
|
|
91
|
+
### `listen(input, options?)`
|
|
92
|
+
|
|
93
|
+
Converts speech to text using Inworld's batch STT endpoint.
|
|
94
|
+
|
|
95
|
+
```typescript
|
|
96
|
+
const transcript = await voice.listen(audioStream, {
|
|
97
|
+
audioEncoding: 'MP3',
|
|
98
|
+
sampleRateHertz: 44100,
|
|
99
|
+
language: 'ja-JP',
|
|
100
|
+
})
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
**input** (`NodeJS.ReadableStream`): Audio stream to transcribe.
|
|
104
|
+
|
|
105
|
+
**options** (`InworldListenOptions`): Additional options for transcription.
|
|
106
|
+
|
|
107
|
+
**options.audioEncoding** (`'LINEAR16' | 'MP3' | 'OGG_OPUS' | 'FLAC' | 'AUTO_DETECT'`): Audio encoding of the input stream.
|
|
108
|
+
|
|
109
|
+
**options.sampleRateHertz** (`number`): Sample rate of the input audio.
|
|
110
|
+
|
|
111
|
+
**options.language** (`string`): BCP-47 language code for transcription.
|
|
112
|
+
|
|
113
|
+
**options.numberOfChannels** (`number`): Number of audio channels in the input.
|
|
114
|
+
|
|
115
|
+
**Returns:** `Promise<string>`
|
|
116
|
+
|
|
117
|
+
### `getSpeakers()`
|
|
118
|
+
|
|
119
|
+
Returns a list of available voices from the Inworld API.
|
|
120
|
+
|
|
121
|
+
```typescript
|
|
122
|
+
const speakers = await voice.getSpeakers()
|
|
123
|
+
// [{ voiceId: 'Dennis', name: 'Dennis', language: 'en', description: '...', tags: ['friendly'], source: 'SYSTEM' }, ...]
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
**Returns:** `Promise<Array<{ voiceId: string; name: string; language: string; description: string; tags: string[]; source: string }>>`
|
|
127
|
+
|
|
128
|
+
## Notes
|
|
129
|
+
|
|
130
|
+
- The TTS endpoint uses progressive NDJSON streaming, so audio playback can begin before the full response is received.
|
|
131
|
+
- An API key can be provided via the `speechModel` or `listeningModel` config, or the `INWORLD_API_KEY` environment variable. TTS and STT keys are resolved independently: passing distinct `speechModel.apiKey` and `listeningModel.apiKey` values lets each service use its own credential. If only one is provided, it is reused for both services as a fallback before the env var.
|
|
132
|
+
- `inworld-tts-2` is the default flagship model. Use `deliveryMode` (`STABLE` | `BALANCED` | `CREATIVE`) to steer delivery style on this model. The `temperature` option is ignored on `inworld-tts-2`.
|
|
133
|
+
- The `inworld-tts-1.5-mini` model offers lower latency at the cost of reduced voice quality compared to `inworld-tts-1.5-max`.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mastra/voice-inworld",
|
|
3
|
-
"version": "0.0.0",
|
|
3
|
+
"version": "0.2.0",
|
|
4
4
|
"description": "Mastra Inworld AI voice integration — streaming TTS and batch STT",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"files": [
|
|
@@ -22,26 +22,19 @@
|
|
|
22
22
|
},
|
|
23
23
|
"./package.json": "./package.json"
|
|
24
24
|
},
|
|
25
|
-
"scripts": {
|
|
26
|
-
"build": "tsup --silent --config tsup.config.ts",
|
|
27
|
-
"prepack": "pnpx tsx ../../scripts/generate-package-docs.ts",
|
|
28
|
-
"build:watch": "tsup --watch --silent --config tsup.config.ts",
|
|
29
|
-
"test": "vitest run",
|
|
30
|
-
"lint": "eslint ."
|
|
31
|
-
},
|
|
32
25
|
"license": "Apache-2.0",
|
|
33
26
|
"dependencies": {},
|
|
34
27
|
"devDependencies": {
|
|
35
|
-
"@internal/lint": "workspace:*",
|
|
36
|
-
"@internal/types-builder": "workspace:*",
|
|
37
|
-
"@mastra/core": "workspace:*",
|
|
38
28
|
"@types/node": "22.19.15",
|
|
39
|
-
"@vitest/coverage-v8": "
|
|
40
|
-
"@vitest/ui": "
|
|
29
|
+
"@vitest/coverage-v8": "4.1.5",
|
|
30
|
+
"@vitest/ui": "4.1.5",
|
|
41
31
|
"eslint": "^10.2.1",
|
|
42
32
|
"tsup": "^8.5.1",
|
|
43
|
-
"typescript": "
|
|
44
|
-
"vitest": "
|
|
33
|
+
"typescript": "^6.0.3",
|
|
34
|
+
"vitest": "4.1.5",
|
|
35
|
+
"@mastra/core": "1.33.0",
|
|
36
|
+
"@internal/lint": "0.0.93",
|
|
37
|
+
"@internal/types-builder": "0.0.68"
|
|
45
38
|
},
|
|
46
39
|
"peerDependencies": {
|
|
47
40
|
"@mastra/core": ">=1.0.0-0 <2.0.0-0",
|
|
@@ -58,5 +51,11 @@
|
|
|
58
51
|
},
|
|
59
52
|
"engines": {
|
|
60
53
|
"node": ">=22.13.0"
|
|
54
|
+
},
|
|
55
|
+
"scripts": {
|
|
56
|
+
"build": "tsup --silent --config tsup.config.ts",
|
|
57
|
+
"build:watch": "tsup --watch --silent --config tsup.config.ts",
|
|
58
|
+
"test": "vitest run",
|
|
59
|
+
"lint": "eslint ."
|
|
61
60
|
}
|
|
62
|
-
}
|
|
61
|
+
}
|