vision-agents-plugins-inworld 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
.gitignore ADDED
@@ -0,0 +1,90 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .cursor/*
7
+ # Distribution / packaging
8
+ .Python
9
+ build/
10
+ dist/
11
+ downloads/
12
+ develop-eggs/
13
+ eggs/
14
+ .eggs/
15
+ lib64/
16
+ parts/
17
+ sdist/
18
+ var/
19
+ wheels/
20
+ share/python-wheels/
21
+ pip-wheel-metadata/
22
+ MANIFEST
23
+ *.egg-info/
24
+ *.egg
25
+
26
+ # Installer logs
27
+ pip-log.txt
28
+ pip-delete-this-directory.txt
29
+
30
+ # Unit test / coverage reports
31
+ htmlcov/
32
+ .tox/
33
+ .nox/
34
+ .coverage
35
+ .coverage.*
36
+ .cache
37
+ coverage.xml
38
+ nosetests.xml
39
+ *.cover
40
+ *.py,cover
41
+ .hypothesis/
42
+ .pytest_cache/
43
+
44
+ # Type checker / lint caches
45
+ .mypy_cache/
46
+ .dmypy.json
47
+ dmypy.json
48
+ .pytype/
49
+ .pyre/
50
+ .ruff_cache/
51
+
52
+ # Environments
53
+ .venv
54
+ env/
55
+ venv/
56
+ ENV/
57
+ env.bak/
58
+ venv.bak/
59
+ .env
60
+ .env.local
61
+ .env.*.local
62
+ .env.bak
63
+ pyvenv.cfg
64
+ .python-version
65
+
66
+ # Editors / IDEs
67
+ .vscode/
68
+ .idea/
69
+
70
+ # Jupyter Notebook
71
+ .ipynb_checkpoints/
72
+
73
+ # OS / Misc
74
+ .DS_Store
75
+ *.log
76
+
77
+ # Tooling & repo-specific
78
+ pyrightconfig.json
79
+ shell.nix
80
+ bin/*
81
+ lib/*
82
+ stream-py/
83
+
84
+ # Artifacts / assets
85
+ *.pt
86
+ *.kef
87
+ *.onnx
88
+ profile.html
89
+
90
+ /opencode.json
PKG-INFO ADDED
@@ -0,0 +1,82 @@
1
+ Metadata-Version: 2.4
2
+ Name: vision-agents-plugins-inworld
3
+ Version: 0.2.9
4
+ Summary: Inworld AI TTS integration for Vision Agents
5
+ Project-URL: Documentation, https://visionagents.ai/
6
+ Project-URL: Website, https://visionagents.ai/
7
+ Project-URL: Source, https://github.com/GetStream/Vision-Agents
8
+ License-Expression: MIT
9
+ Keywords: AI,TTS,agents,inworld,text-to-speech,voice agents
10
+ Requires-Python: >=3.10
11
+ Requires-Dist: av>=10.0.0
12
+ Requires-Dist: httpx>=0.27.0
13
+ Requires-Dist: vision-agents
14
+ Description-Content-Type: text/markdown
15
+
16
+ # Inworld AI Text-to-Speech Plugin
17
+
18
+ A high-quality Text-to-Speech (TTS) plugin for Vision Agents that uses the Inworld AI API with streaming support.
19
+
20
+ ## Installation
21
+
22
+ ```bash
23
+ uv add "vision-agents[inworld]"
24
+ ```
25
+
26
+ ## Usage
27
+
28
+ ```python
29
+ from vision_agents.plugins import inworld
30
+
31
+ # Initialize with API key from environment variable
32
+ tts = inworld.TTS()
33
+
34
+ # Or specify API key and other options directly
35
+ tts = inworld.TTS(
36
+ api_key="your_inworld_api_key",
37
+ voice_id="Dennis",
38
+ model_id="inworld-tts-1",
39
+ temperature=1.1
40
+ )
41
+
42
+ # Use with an Agent
43
+ from vision_agents.core import Agent
44
+ from vision_agents.plugins import getstream, gemini, smart_turn
45
+
46
+ agent = Agent(
47
+ edge=getstream.Edge(),
48
+ tts=inworld.TTS(),
49
+ llm=gemini.LLM("gemini-2.0-flash"),
50
+ turn_detection=smart_turn.TurnDetection(),
51
+ )
52
+ ```
53
+
54
+ ## Configuration Options
55
+
56
+ - `api_key`: Inworld AI API key (default: reads from `INWORLD_API_KEY` environment variable)
57
+ - `voice_id`: The voice ID to use for synthesis (default: "Dennis")
58
+ - `model_id`: The model ID to use for synthesis. Options: "inworld-tts-1", "inworld-tts-1-max" (default: "inworld-tts-1")
59
+ - `temperature`: Determines the degree of randomness when sampling audio tokens. Accepts values between 0 and 2 (default: 1.1)
60
+
61
+ ## Requirements
62
+
63
+ - Python 3.10+
64
+ - httpx>=0.27.0
65
+ - av>=10.0.0
66
+
67
+ ## Getting Started
68
+
69
+ 1. Get your Inworld AI API key from the [Inworld Portal](https://studio.inworld.ai/)
70
+ 2. Set the `INWORLD_API_KEY` environment variable:
71
+ ```bash
72
+ export INWORLD_API_KEY="your_api_key_here"
73
+ ```
74
+ 3. Use the plugin in your Vision Agents application
75
+
76
+ ## API Reference
77
+
78
+ The plugin implements the standard Vision Agents TTS interface:
79
+
80
+ - `stream_audio(text: str)`: Convert text to speech and return an async iterator of `PcmData` chunks
81
+ - `stop_audio()`: Stop audio playback (no-op for this plugin)
82
+ - `send(text: str)`: Send text to be converted to speech (inherited from base class)
README.md ADDED
@@ -0,0 +1,67 @@
1
+ # Inworld AI Text-to-Speech Plugin
2
+
3
+ A high-quality Text-to-Speech (TTS) plugin for Vision Agents that uses the Inworld AI API with streaming support.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ uv add "vision-agents[inworld]"
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```python
14
+ from vision_agents.plugins import inworld
15
+
16
+ # Initialize with API key from environment variable
17
+ tts = inworld.TTS()
18
+
19
+ # Or specify API key and other options directly
20
+ tts = inworld.TTS(
21
+ api_key="your_inworld_api_key",
22
+ voice_id="Dennis",
23
+ model_id="inworld-tts-1",
24
+ temperature=1.1
25
+ )
26
+
27
+ # Use with an Agent
28
+ from vision_agents.core import Agent
29
+ from vision_agents.plugins import getstream, gemini, smart_turn
30
+
31
+ agent = Agent(
32
+ edge=getstream.Edge(),
33
+ tts=inworld.TTS(),
34
+ llm=gemini.LLM("gemini-2.0-flash"),
35
+ turn_detection=smart_turn.TurnDetection(),
36
+ )
37
+ ```
38
+
39
+ ## Configuration Options
40
+
41
+ - `api_key`: Inworld AI API key (default: reads from `INWORLD_API_KEY` environment variable)
42
+ - `voice_id`: The voice ID to use for synthesis (default: "Dennis")
43
+ - `model_id`: The model ID to use for synthesis. Options: "inworld-tts-1", "inworld-tts-1-max" (default: "inworld-tts-1")
44
+ - `temperature`: Determines the degree of randomness when sampling audio tokens. Accepts values between 0 and 2 (default: 1.1)
45
+
46
+ ## Requirements
47
+
48
+ - Python 3.10+
49
+ - httpx>=0.27.0
50
+ - av>=10.0.0
51
+
52
+ ## Getting Started
53
+
54
+ 1. Get your Inworld AI API key from the [Inworld Portal](https://studio.inworld.ai/)
55
+ 2. Set the `INWORLD_API_KEY` environment variable:
56
+ ```bash
57
+ export INWORLD_API_KEY="your_api_key_here"
58
+ ```
59
+ 3. Use the plugin in your Vision Agents application
60
+
61
+ ## API Reference
62
+
63
+ The plugin implements the standard Vision Agents TTS interface:
64
+
65
+ - `stream_audio(text: str)`: Convert text to speech and return an async iterator of `PcmData` chunks
66
+ - `stop_audio()`: Stop audio playback (no-op for this plugin)
67
+ - `send(text: str)`: Send text to be converted to speech (inherited from base class)
pyproject.toml ADDED
@@ -0,0 +1,42 @@
1
+ [build-system]
2
+ requires = ["hatchling", "hatch-vcs"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "vision-agents-plugins-inworld"
7
+ dynamic = ["version"]
8
+ description = "Inworld AI TTS integration for Vision Agents"
9
+ readme = "README.md"
10
+ keywords = ["inworld", "TTS", "text-to-speech", "AI", "voice agents", "agents"]
11
+ requires-python = ">=3.10"
12
+ license = "MIT"
13
+ dependencies = [
14
+ "vision-agents",
15
+ "httpx>=0.27.0",
16
+ "av>=10.0.0",
17
+ ]
18
+
19
+ [project.urls]
20
+ Documentation = "https://visionagents.ai/"
21
+ Website = "https://visionagents.ai/"
22
+ Source = "https://github.com/GetStream/Vision-Agents"
23
+
24
+ [tool.hatch.version]
25
+ source = "vcs"
26
+ raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }
27
+
28
+ [tool.hatch.build.targets.wheel]
29
+ packages = [".", "vision_agents"]
30
+
31
+ [tool.hatch.build.targets.sdist]
32
+ include = ["/vision_agents"]
33
+
34
+ [tool.uv.sources]
35
+ vision-agents = { workspace = true }
36
+
37
+ [dependency-groups]
38
+ dev = [
39
+ "pytest>=8.4.1",
40
+ "pytest-asyncio>=1.0.0",
41
+ ]
42
+
@@ -0,0 +1,3 @@
1
+ from vision_agents.plugins.inworld.tts import TTS
2
+
3
+ __all__ = ["TTS"]
@@ -0,0 +1,172 @@
1
+ import base64
2
+ import io
3
+ import json
4
+ import logging
5
+ import os
6
+ from typing import AsyncIterator, Literal, Optional
7
+
8
+ import av
9
+ import httpx
10
+ from getstream.video.rtc.track_util import PcmData
11
+
12
+ from vision_agents.core import tts
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ INWORLD_API_BASE = "https://api.inworld.ai"
17
+
18
+
19
+ class TTS(tts.TTS):
20
+ """
21
+ Inworld AI Text-to-Speech implementation.
22
+ Inworld AI provides high-quality text-to-speech synthesis with streaming support.
23
+ """
24
+
25
+ def __init__(
26
+ self,
27
+ api_key: Optional[str] = None,
28
+ voice_id: str = "Dennis",
29
+ model_id: Literal["inworld-tts-1", "inworld-tts-1-max"] = "inworld-tts-1",
30
+ temperature: float = 1.1,
31
+ ):
32
+ """
33
+ Initialize the Inworld AI TTS service.
34
+ Args:
35
+ api_key: Inworld AI API key. If not provided, the INWORLD_API_KEY
36
+ environment variable will be used.
37
+ voice_id: The voice ID to use for synthesis (default: "Dennis").
38
+ model_id: The model ID to use for synthesis. Options: "inworld-tts-1",
39
+ "inworld-tts-1-max" (default: "inworld-tts-1").
40
+ temperature: Determines the degree of randomness when sampling audio tokens.
41
+ Accepts values between 0 and 2. Default: 1.1.
42
+ """
43
+ super().__init__(provider_name="inworld")
44
+
45
+ api_key = api_key or os.getenv("INWORLD_API_KEY")
46
+ if not api_key:
47
+ raise ValueError(
48
+ "INWORLD_API_KEY environment variable must be set or api_key must be provided"
49
+ )
50
+
51
+ self.api_key = api_key
52
+ self.voice_id = voice_id
53
+ self.model_id = model_id
54
+ self.temperature = temperature
55
+ self.base_url = INWORLD_API_BASE
56
+ self.client = httpx.AsyncClient(timeout=60.0)
57
+
58
+ async def stream_audio(self, text: str, *_, **__) -> AsyncIterator[PcmData]:
59
+ """
60
+ Convert text to speech using Inworld AI API.
61
+ Args:
62
+ text: The text to convert to speech (max 2,000 characters).
63
+ Returns:
64
+ An async iterator of audio chunks as PcmData objects.
65
+ """
66
+ url = f"{self.base_url}/tts/v1/voice:stream"
67
+
68
+ credentials = f"Basic {self.api_key}"
69
+ headers = {
70
+ "Authorization": credentials,
71
+ "Content-Type": "application/json",
72
+ }
73
+
74
+ payload = {
75
+ "text": text,
76
+ "voiceId": self.voice_id,
77
+ "modelId": self.model_id,
78
+ "audioConfig": {
79
+ "temperature": self.temperature,
80
+ },
81
+ }
82
+
83
+ async def _stream_audio() -> AsyncIterator[PcmData]:
84
+ try:
85
+ async with self.client.stream(
86
+ "POST", url, headers=headers, json=payload
87
+ ) as response:
88
+ async for pcm in self._process_response(response):
89
+ yield pcm
90
+ except httpx.HTTPStatusError as e:
91
+ logger.error(
92
+ "Inworld AI API HTTP error: %s - %s",
93
+ e.response.status_code,
94
+ e.response.text,
95
+ )
96
+ raise
97
+ except Exception as e:
98
+ logger.error("Error streaming audio from Inworld AI: %s", e)
99
+ raise
100
+
101
+ # Return the async generator
102
+ return _stream_audio()
103
+
104
+ async def _process_response(
105
+ self, response: httpx.Response
106
+ ) -> AsyncIterator[PcmData]:
107
+ # Check status before processing streaming response
108
+ if response.status_code >= 400:
109
+ error_text = await response.aread()
110
+ error_msg = error_text.decode() if error_text else "Unknown error"
111
+ logger.error(
112
+ "Inworld AI API HTTP error: %s - %s",
113
+ response.status_code,
114
+ error_msg,
115
+ )
116
+ raise httpx.HTTPStatusError(
117
+ f"HTTP {response.status_code}: {error_msg}",
118
+ request=response.request,
119
+ response=response,
120
+ )
121
+
122
+ async for line in response.aiter_lines():
123
+ if not line.strip():
124
+ continue
125
+
126
+ try:
127
+ data = json.loads(line)
128
+ if "error" in data:
129
+ error_msg = data["error"].get("message", "Unknown error")
130
+ logger.error("Inworld AI API error: %s", error_msg)
131
+ continue
132
+
133
+ if "result" in data and "audioContent" in data["result"]:
134
+ wav_bytes = base64.b64decode(data["result"]["audioContent"])
135
+
136
+ container = av.open(io.BytesIO(wav_bytes))
137
+ assert isinstance(container, av.container.InputContainer)
138
+ with container:
139
+ audio_stream = container.streams.audio[0]
140
+ pcm: Optional[PcmData] = None
141
+ for frame in container.decode(audio_stream):
142
+ frame_pcm = PcmData.from_av_frame(frame)
143
+ if pcm is None:
144
+ pcm = frame_pcm
145
+ else:
146
+ pcm.append(frame_pcm)
147
+
148
+ if pcm:
149
+ pcm = pcm.resample(
150
+ target_sample_rate=pcm.sample_rate,
151
+ target_channels=1,
152
+ ).to_int16()
153
+ yield pcm
154
+ except json.JSONDecodeError as e:
155
+ logger.warning("Failed to parse JSON line: %s", e)
156
+ continue
157
+ except Exception as e:
158
+ logger.warning("Error processing audio chunk: %s", e)
159
+ continue
160
+
161
+ async def stop_audio(self) -> None:
162
+ """
163
+ Clears the queue and stops playing audio.
164
+ This method can be used manually or under the hood in response to turn events.
165
+ Returns:
166
+ None
167
+ """
168
+ logger.info("🎤 Inworld AI TTS stop requested (no-op)")
169
+
170
+ async def close(self) -> None:
171
+ if self.client:
172
+ await self.client.aclose()
@@ -0,0 +1,82 @@
1
+ Metadata-Version: 2.4
2
+ Name: vision-agents-plugins-inworld
3
+ Version: 0.2.9
4
+ Summary: Inworld AI TTS integration for Vision Agents
5
+ Project-URL: Documentation, https://visionagents.ai/
6
+ Project-URL: Website, https://visionagents.ai/
7
+ Project-URL: Source, https://github.com/GetStream/Vision-Agents
8
+ License-Expression: MIT
9
+ Keywords: AI,TTS,agents,inworld,text-to-speech,voice agents
10
+ Requires-Python: >=3.10
11
+ Requires-Dist: av>=10.0.0
12
+ Requires-Dist: httpx>=0.27.0
13
+ Requires-Dist: vision-agents
14
+ Description-Content-Type: text/markdown
15
+
16
+ # Inworld AI Text-to-Speech Plugin
17
+
18
+ A high-quality Text-to-Speech (TTS) plugin for Vision Agents that uses the Inworld AI API with streaming support.
19
+
20
+ ## Installation
21
+
22
+ ```bash
23
+ uv add "vision-agents[inworld]"
24
+ ```
25
+
26
+ ## Usage
27
+
28
+ ```python
29
+ from vision_agents.plugins import inworld
30
+
31
+ # Initialize with API key from environment variable
32
+ tts = inworld.TTS()
33
+
34
+ # Or specify API key and other options directly
35
+ tts = inworld.TTS(
36
+ api_key="your_inworld_api_key",
37
+ voice_id="Dennis",
38
+ model_id="inworld-tts-1",
39
+ temperature=1.1
40
+ )
41
+
42
+ # Use with an Agent
43
+ from vision_agents.core import Agent
44
+ from vision_agents.plugins import getstream, gemini, smart_turn
45
+
46
+ agent = Agent(
47
+ edge=getstream.Edge(),
48
+ tts=inworld.TTS(),
49
+ llm=gemini.LLM("gemini-2.0-flash"),
50
+ turn_detection=smart_turn.TurnDetection(),
51
+ )
52
+ ```
53
+
54
+ ## Configuration Options
55
+
56
+ - `api_key`: Inworld AI API key (default: reads from `INWORLD_API_KEY` environment variable)
57
+ - `voice_id`: The voice ID to use for synthesis (default: "Dennis")
58
+ - `model_id`: The model ID to use for synthesis. Options: "inworld-tts-1", "inworld-tts-1-max" (default: "inworld-tts-1")
59
+ - `temperature`: Determines the degree of randomness when sampling audio tokens. Accepts values between 0 and 2 (default: 1.1)
60
+
61
+ ## Requirements
62
+
63
+ - Python 3.10+
64
+ - httpx>=0.27.0
65
+ - av>=10.0.0
66
+
67
+ ## Getting Started
68
+
69
+ 1. Get your Inworld AI API key from the [Inworld Portal](https://studio.inworld.ai/)
70
+ 2. Set the `INWORLD_API_KEY` environment variable:
71
+ ```bash
72
+ export INWORLD_API_KEY="your_api_key_here"
73
+ ```
74
+ 3. Use the plugin in your Vision Agents application
75
+
76
+ ## API Reference
77
+
78
+ The plugin implements the standard Vision Agents TTS interface:
79
+
80
+ - `stream_audio(text: str)`: Convert text to speech and return an async iterator of `PcmData` chunks
81
+ - `stop_audio()`: Stop audio playback (no-op for this plugin)
82
+ - `send(text: str)`: Send text to be converted to speech (inherited from base class)
@@ -0,0 +1,11 @@
1
+ ./.gitignore,sha256=zrSq4X-Qh8j7QY0ukXt-RXj6StdhdbJdR3e8HoHbTTg,961
2
+ ./PKG-INFO,sha256=dsKkY3gMzgtPkbFDLzzuTIgWz0MJEikBriIl0Sz3P1M,2437
3
+ ./README.md,sha256=hSRtsarmdJny5KvVpSG1QTqWLJDh9cyWt_F_reUAhN4,1913
4
+ ./pyproject.toml,sha256=Q87omcO1BwfFv6hsPSL0LnaVZlzOusvZVO8QUU-Znwo,991
5
+ ./vision_agents/plugins/inworld/__init__.py,sha256=Ag-b3YN_aspZlNzEegXT8jTsVwq1M1_Ad0Y-v2puHL0,69
6
+ ./vision_agents/plugins/inworld/tts.py,sha256=QdWU4K0CS_V7J9mBVlcmo1oGzUJxZ2dBUhFkyLZsQ2E,6162
7
+ vision_agents/plugins/inworld/__init__.py,sha256=Ag-b3YN_aspZlNzEegXT8jTsVwq1M1_Ad0Y-v2puHL0,69
8
+ vision_agents/plugins/inworld/tts.py,sha256=QdWU4K0CS_V7J9mBVlcmo1oGzUJxZ2dBUhFkyLZsQ2E,6162
9
+ vision_agents_plugins_inworld-0.2.9.dist-info/METADATA,sha256=dsKkY3gMzgtPkbFDLzzuTIgWz0MJEikBriIl0Sz3P1M,2437
10
+ vision_agents_plugins_inworld-0.2.9.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
11
+ vision_agents_plugins_inworld-0.2.9.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any