vision-agents-plugins-inworld 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agents_plugins_inworld-0.2.3/.gitignore +90 -0
- vision_agents_plugins_inworld-0.2.3/PKG-INFO +82 -0
- vision_agents_plugins_inworld-0.2.3/README.md +67 -0
- vision_agents_plugins_inworld-0.2.3/pyproject.toml +42 -0
- vision_agents_plugins_inworld-0.2.3/vision_agents/plugins/inworld/__init__.py +3 -0
- vision_agents_plugins_inworld-0.2.3/vision_agents/plugins/inworld/tts.py +172 -0
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.cursor/*
|
|
7
|
+
# Distribution / packaging
|
|
8
|
+
.Python
|
|
9
|
+
build/
|
|
10
|
+
dist/
|
|
11
|
+
downloads/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
eggs/
|
|
14
|
+
.eggs/
|
|
15
|
+
lib64/
|
|
16
|
+
parts/
|
|
17
|
+
sdist/
|
|
18
|
+
var/
|
|
19
|
+
wheels/
|
|
20
|
+
share/python-wheels/
|
|
21
|
+
pip-wheel-metadata/
|
|
22
|
+
MANIFEST
|
|
23
|
+
*.egg-info/
|
|
24
|
+
*.egg
|
|
25
|
+
|
|
26
|
+
# Installer logs
|
|
27
|
+
pip-log.txt
|
|
28
|
+
pip-delete-this-directory.txt
|
|
29
|
+
|
|
30
|
+
# Unit test / coverage reports
|
|
31
|
+
htmlcov/
|
|
32
|
+
.tox/
|
|
33
|
+
.nox/
|
|
34
|
+
.coverage
|
|
35
|
+
.coverage.*
|
|
36
|
+
.cache
|
|
37
|
+
coverage.xml
|
|
38
|
+
nosetests.xml
|
|
39
|
+
*.cover
|
|
40
|
+
*.py,cover
|
|
41
|
+
.hypothesis/
|
|
42
|
+
.pytest_cache/
|
|
43
|
+
|
|
44
|
+
# Type checker / lint caches
|
|
45
|
+
.mypy_cache/
|
|
46
|
+
.dmypy.json
|
|
47
|
+
dmypy.json
|
|
48
|
+
.pytype/
|
|
49
|
+
.pyre/
|
|
50
|
+
.ruff_cache/
|
|
51
|
+
|
|
52
|
+
# Environments
|
|
53
|
+
.venv
|
|
54
|
+
env/
|
|
55
|
+
venv/
|
|
56
|
+
ENV/
|
|
57
|
+
env.bak/
|
|
58
|
+
venv.bak/
|
|
59
|
+
.env
|
|
60
|
+
.env.local
|
|
61
|
+
.env.*.local
|
|
62
|
+
.env.bak
|
|
63
|
+
pyvenv.cfg
|
|
64
|
+
.python-version
|
|
65
|
+
|
|
66
|
+
# Editors / IDEs
|
|
67
|
+
.vscode/
|
|
68
|
+
.idea/
|
|
69
|
+
|
|
70
|
+
# Jupyter Notebook
|
|
71
|
+
.ipynb_checkpoints/
|
|
72
|
+
|
|
73
|
+
# OS / Misc
|
|
74
|
+
.DS_Store
|
|
75
|
+
*.log
|
|
76
|
+
|
|
77
|
+
# Tooling & repo-specific
|
|
78
|
+
pyrightconfig.json
|
|
79
|
+
shell.nix
|
|
80
|
+
bin/*
|
|
81
|
+
lib/*
|
|
82
|
+
stream-py/
|
|
83
|
+
|
|
84
|
+
# Artifacts / assets
|
|
85
|
+
*.pt
|
|
86
|
+
*.kef
|
|
87
|
+
*.onnx
|
|
88
|
+
profile.html
|
|
89
|
+
|
|
90
|
+
/opencode.json
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vision-agents-plugins-inworld
|
|
3
|
+
Version: 0.2.3
|
|
4
|
+
Summary: Inworld AI TTS integration for Vision Agents
|
|
5
|
+
Project-URL: Documentation, https://visionagents.ai/
|
|
6
|
+
Project-URL: Website, https://visionagents.ai/
|
|
7
|
+
Project-URL: Source, https://github.com/GetStream/Vision-Agents
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
Keywords: AI,TTS,agents,inworld,text-to-speech,voice agents
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Requires-Dist: av>=10.0.0
|
|
12
|
+
Requires-Dist: httpx>=0.27.0
|
|
13
|
+
Requires-Dist: vision-agents
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
# Inworld AI Text-to-Speech Plugin
|
|
17
|
+
|
|
18
|
+
A high-quality Text-to-Speech (TTS) plugin for Vision Agents that uses the Inworld AI API with streaming support.
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
uv add vision-agents[inworld]
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Usage
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
from vision_agents.plugins import inworld
|
|
30
|
+
|
|
31
|
+
# Initialize with API key from environment variable
|
|
32
|
+
tts = inworld.TTS()
|
|
33
|
+
|
|
34
|
+
# Or specify API key and other options directly
|
|
35
|
+
tts = inworld.TTS(
|
|
36
|
+
api_key="your_inworld_api_key",
|
|
37
|
+
voice_id="Dennis",
|
|
38
|
+
model_id="inworld-tts-1",
|
|
39
|
+
temperature=1.1
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
# Use with an Agent
|
|
43
|
+
from vision_agents.core import Agent
|
|
44
|
+
from vision_agents.plugins import getstream, gemini, smart_turn
|
|
45
|
+
|
|
46
|
+
agent = Agent(
|
|
47
|
+
edge=getstream.Edge(),
|
|
48
|
+
tts=inworld.TTS(),
|
|
49
|
+
llm=gemini.LLM("gemini-2.0-flash"),
|
|
50
|
+
turn_detection=smart_turn.TurnDetection(),
|
|
51
|
+
)
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Configuration Options
|
|
55
|
+
|
|
56
|
+
- `api_key`: Inworld AI API key (default: reads from `INWORLD_API_KEY` environment variable)
|
|
57
|
+
- `voice_id`: The voice ID to use for synthesis (default: "Dennis")
|
|
58
|
+
- `model_id`: The model ID to use for synthesis. Options: "inworld-tts-1", "inworld-tts-1-max" (default: "inworld-tts-1")
|
|
59
|
+
- `temperature`: Determines the degree of randomness when sampling audio tokens. Accepts values between 0 and 2 (default: 1.1)
|
|
60
|
+
|
|
61
|
+
## Requirements
|
|
62
|
+
|
|
63
|
+
- Python 3.10+
|
|
64
|
+
- httpx>=0.27.0
|
|
65
|
+
- av>=10.0.0
|
|
66
|
+
|
|
67
|
+
## Getting Started
|
|
68
|
+
|
|
69
|
+
1. Get your Inworld AI API key from the [Inworld Portal](https://studio.inworld.ai/)
|
|
70
|
+
2. Set the `INWORLD_API_KEY` environment variable:
|
|
71
|
+
```bash
|
|
72
|
+
export INWORLD_API_KEY="your_api_key_here"
|
|
73
|
+
```
|
|
74
|
+
3. Use the plugin in your Vision Agents application
|
|
75
|
+
|
|
76
|
+
## API Reference
|
|
77
|
+
|
|
78
|
+
The plugin implements the standard Vision Agents TTS interface:
|
|
79
|
+
|
|
80
|
+
- `stream_audio(text: str)`: Convert text to speech and return an async iterator of `PcmData` chunks
|
|
81
|
+
- `stop_audio()`: Stop audio playback (no-op for this plugin)
|
|
82
|
+
- `send(text: str)`: Send text to be converted to speech (inherited from base class)
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# Inworld AI Text-to-Speech Plugin
|
|
2
|
+
|
|
3
|
+
A high-quality Text-to-Speech (TTS) plugin for Vision Agents that uses the Inworld AI API with streaming support.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
uv add vision-agents[inworld]
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from vision_agents.plugins import inworld
|
|
15
|
+
|
|
16
|
+
# Initialize with API key from environment variable
|
|
17
|
+
tts = inworld.TTS()
|
|
18
|
+
|
|
19
|
+
# Or specify API key and other options directly
|
|
20
|
+
tts = inworld.TTS(
|
|
21
|
+
api_key="your_inworld_api_key",
|
|
22
|
+
voice_id="Dennis",
|
|
23
|
+
model_id="inworld-tts-1",
|
|
24
|
+
temperature=1.1
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
# Use with an Agent
|
|
28
|
+
from vision_agents.core import Agent
|
|
29
|
+
from vision_agents.plugins import getstream, gemini, smart_turn
|
|
30
|
+
|
|
31
|
+
agent = Agent(
|
|
32
|
+
edge=getstream.Edge(),
|
|
33
|
+
tts=inworld.TTS(),
|
|
34
|
+
llm=gemini.LLM("gemini-2.0-flash"),
|
|
35
|
+
turn_detection=smart_turn.TurnDetection(),
|
|
36
|
+
)
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Configuration Options
|
|
40
|
+
|
|
41
|
+
- `api_key`: Inworld AI API key (default: reads from `INWORLD_API_KEY` environment variable)
|
|
42
|
+
- `voice_id`: The voice ID to use for synthesis (default: "Dennis")
|
|
43
|
+
- `model_id`: The model ID to use for synthesis. Options: "inworld-tts-1", "inworld-tts-1-max" (default: "inworld-tts-1")
|
|
44
|
+
- `temperature`: Determines the degree of randomness when sampling audio tokens. Accepts values between 0 and 2 (default: 1.1)
|
|
45
|
+
|
|
46
|
+
## Requirements
|
|
47
|
+
|
|
48
|
+
- Python 3.10+
|
|
49
|
+
- httpx>=0.27.0
|
|
50
|
+
- av>=10.0.0
|
|
51
|
+
|
|
52
|
+
## Getting Started
|
|
53
|
+
|
|
54
|
+
1. Get your Inworld AI API key from the [Inworld Portal](https://studio.inworld.ai/)
|
|
55
|
+
2. Set the `INWORLD_API_KEY` environment variable:
|
|
56
|
+
```bash
|
|
57
|
+
export INWORLD_API_KEY="your_api_key_here"
|
|
58
|
+
```
|
|
59
|
+
3. Use the plugin in your Vision Agents application
|
|
60
|
+
|
|
61
|
+
## API Reference
|
|
62
|
+
|
|
63
|
+
The plugin implements the standard Vision Agents TTS interface:
|
|
64
|
+
|
|
65
|
+
- `stream_audio(text: str)`: Convert text to speech and return an async iterator of `PcmData` chunks
|
|
66
|
+
- `stop_audio()`: Stop audio playback (no-op for this plugin)
|
|
67
|
+
- `send(text: str)`: Send text to be converted to speech (inherited from base class)
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling", "hatch-vcs"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "vision-agents-plugins-inworld"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "Inworld AI TTS integration for Vision Agents"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
keywords = ["inworld", "TTS", "text-to-speech", "AI", "voice agents", "agents"]
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
license = "MIT"
|
|
13
|
+
dependencies = [
|
|
14
|
+
"vision-agents",
|
|
15
|
+
"httpx>=0.27.0",
|
|
16
|
+
"av>=10.0.0",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
[project.urls]
|
|
20
|
+
Documentation = "https://visionagents.ai/"
|
|
21
|
+
Website = "https://visionagents.ai/"
|
|
22
|
+
Source = "https://github.com/GetStream/Vision-Agents"
|
|
23
|
+
|
|
24
|
+
[tool.hatch.version]
|
|
25
|
+
source = "vcs"
|
|
26
|
+
raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }
|
|
27
|
+
|
|
28
|
+
[tool.hatch.build.targets.wheel]
|
|
29
|
+
packages = [".", "vision_agents"]
|
|
30
|
+
|
|
31
|
+
[tool.hatch.build.targets.sdist]
|
|
32
|
+
include = ["/vision_agents"]
|
|
33
|
+
|
|
34
|
+
[tool.uv.sources]
|
|
35
|
+
vision-agents = { workspace = true }
|
|
36
|
+
|
|
37
|
+
[dependency-groups]
|
|
38
|
+
dev = [
|
|
39
|
+
"pytest>=8.4.1",
|
|
40
|
+
"pytest-asyncio>=1.0.0",
|
|
41
|
+
]
|
|
42
|
+
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import io
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
from typing import AsyncIterator, Literal, Optional
|
|
7
|
+
|
|
8
|
+
import av
|
|
9
|
+
import httpx
|
|
10
|
+
from getstream.video.rtc.track_util import PcmData
|
|
11
|
+
|
|
12
|
+
from vision_agents.core import tts
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
INWORLD_API_BASE = "https://api.inworld.ai"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class TTS(tts.TTS):
    """
    Inworld AI Text-to-Speech implementation.

    Sends text to the Inworld AI streaming TTS endpoint and yields the
    returned audio as mono, 16-bit ``PcmData`` chunks.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        voice_id: str = "Dennis",
        model_id: Literal["inworld-tts-1", "inworld-tts-1-max"] = "inworld-tts-1",
        temperature: float = 1.1,
    ):
        """
        Initialize the Inworld AI TTS service.

        Args:
            api_key: Inworld AI API key. If not provided, the INWORLD_API_KEY
                environment variable will be used.
            voice_id: The voice ID to use for synthesis (default: "Dennis").
            model_id: The model ID to use for synthesis. Options: "inworld-tts-1",
                "inworld-tts-1-max" (default: "inworld-tts-1").
            temperature: Determines the degree of randomness when sampling audio
                tokens. Documented as accepting values between 0 and 2; the value
                is forwarded to the API unchecked. Default: 1.1.

        Raises:
            ValueError: If no API key is passed and INWORLD_API_KEY is unset or empty.
        """
        super().__init__(provider_name="inworld")

        api_key = api_key or os.getenv("INWORLD_API_KEY")
        if not api_key:
            raise ValueError(
                "INWORLD_API_KEY environment variable must be set or api_key must be provided"
            )

        self.api_key = api_key
        self.voice_id = voice_id
        self.model_id = model_id
        self.temperature = temperature
        self.base_url = INWORLD_API_BASE
        # One shared client for all requests; released in close().
        self.client = httpx.AsyncClient(timeout=60.0)

    async def stream_audio(self, text: str, *_, **__) -> AsyncIterator[PcmData]:
        """
        Convert text to speech using Inworld AI API.

        The HTTP request is not sent until the returned iterator is consumed.

        Args:
            text: The text to convert to speech (max 2,000 characters).

        Returns:
            An async iterator of audio chunks as PcmData objects.

        Raises:
            httpx.HTTPStatusError: Raised while iterating if the API answers
                with a status code >= 400.
            Exception: Any other error raised while streaming is logged and
                re-raised unchanged.
        """
        url = f"{self.base_url}/tts/v1/voice:stream"

        headers = {
            # The API key is sent verbatim as the Basic credential.
            "Authorization": f"Basic {self.api_key}",
            "Content-Type": "application/json",
        }

        payload = {
            "text": text,
            "voiceId": self.voice_id,
            "modelId": self.model_id,
            "audioConfig": {
                "temperature": self.temperature,
            },
        }

        async def _stream_audio() -> AsyncIterator[PcmData]:
            try:
                async with self.client.stream(
                    "POST", url, headers=headers, json=payload
                ) as response:
                    async for pcm in self._process_response(response):
                        yield pcm
            except httpx.HTTPStatusError:
                # _process_response already logged status and body before
                # raising; logging again here would duplicate the entry.
                raise
            except Exception as e:
                logger.error("Error streaming audio from Inworld AI: %s", e)
                raise

        # Return the async generator
        return _stream_audio()

    async def _process_response(
        self, response: httpx.Response
    ) -> AsyncIterator[PcmData]:
        """
        Turn a streaming Inworld AI response into PcmData chunks.

        Each non-blank response line is parsed as a JSON document. Lines that
        carry an "error" entry, fail to parse, or fail to decode are logged
        and skipped so that one bad chunk does not end the stream.

        Args:
            response: The open streaming HTTP response.

        Yields:
            One PcmData per line that contains ``result.audioContent``.

        Raises:
            httpx.HTTPStatusError: If the response status code is >= 400.
        """
        # Check status before processing streaming response
        if response.status_code >= 400:
            error_body = await response.aread()
            # errors="replace" keeps a non-UTF-8 body from masking the HTTP error.
            error_msg = (
                error_body.decode("utf-8", errors="replace")
                if error_body
                else "Unknown error"
            )
            logger.error(
                "Inworld AI API HTTP error: %s - %s",
                response.status_code,
                error_msg,
            )
            raise httpx.HTTPStatusError(
                f"HTTP {response.status_code}: {error_msg}",
                request=response.request,
                response=response,
            )

        async for line in response.aiter_lines():
            if not line.strip():
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                logger.warning("Failed to parse JSON line: %s", e)
                continue

            pcm: Optional[PcmData] = None
            try:
                if "error" in data:
                    error = data["error"]
                    # Accept both {"message": ...} objects and bare values.
                    if isinstance(error, dict):
                        error_msg = error.get("message", "Unknown error")
                    else:
                        error_msg = str(error)
                    logger.error("Inworld AI API error: %s", error_msg)
                    continue

                if "result" in data and "audioContent" in data["result"]:
                    wav_bytes = base64.b64decode(data["result"]["audioContent"])
                    pcm = self._decode_audio_chunk(wav_bytes)
            except Exception as e:
                logger.warning("Error processing audio chunk: %s", e)
                continue

            # Yield outside the try so the broad handler above only covers
            # parsing/decoding of this chunk.
            if pcm is not None:
                yield pcm

    @staticmethod
    def _decode_audio_chunk(wav_bytes: bytes) -> Optional[PcmData]:
        """Decode one audio container payload into mono int16 PcmData, or None if it has no usable audio."""
        container = av.open(io.BytesIO(wav_bytes))
        assert isinstance(container, av.container.InputContainer)
        with container:
            audio_stream = container.streams.audio[0]
            pcm: Optional[PcmData] = None
            # Merge every decoded frame of this payload into a single PcmData.
            for frame in container.decode(audio_stream):
                frame_pcm = PcmData.from_av_frame(frame)
                if pcm is None:
                    pcm = frame_pcm
                else:
                    pcm.append(frame_pcm)

            if not pcm:
                return None

            # Keep the source sample rate; only downmix to one channel and
            # convert the samples to int16.
            return pcm.resample(
                target_sample_rate=pcm.sample_rate,
                target_channels=1,
            ).to_int16()

    async def stop_audio(self) -> None:
        """
        Handle a request to stop audio playback.

        This implementation holds no playback queue of its own, so the method
        only logs that a stop was requested.

        Returns:
            None
        """
        logger.info("🎤 Inworld AI TTS stop requested (no-op)")

    async def close(self) -> None:
        """Close the underlying HTTP client."""
        if self.client:
            await self.client.aclose()
|