vision-agents-plugins-kokoro 0.0.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of vision-agents-plugins-kokoro might be problematic.
- vision_agents_plugins_kokoro-0.0.17/.gitignore +32 -0
- vision_agents_plugins_kokoro-0.0.17/PKG-INFO +54 -0
- vision_agents_plugins_kokoro-0.0.17/README.md +38 -0
- vision_agents_plugins_kokoro-0.0.17/pyproject.toml +42 -0
- vision_agents_plugins_kokoro-0.0.17/vision_agents/plugins/kokoro/__init__.py +6 -0
- vision_agents_plugins_kokoro-0.0.17/vision_agents/plugins/kokoro/tts.py +92 -0
vision_agents_plugins_kokoro-0.0.17/.gitignore
@@ -0,0 +1,32 @@
+*/__pycache__
+*/chat/__pycache__
+*/video/__pycache__
+*/chat/sync/__pycache__
+*/chat/async_/__pycache__
+*/sync/__pycache__
+*/async_/__pycache__
+*/video/sync/__pycache__
+*/model/__pycache__/
+*/cli/__pycache__
+*/cli/__pycache__
+.env
+.venv
+.vscode/settings.json
+*.pyc
+dist/*
+dist/*
+*.log
+.python-version
+pyvenv.cfg
+.idea*
+bin/*
+lib/*
+shell.nix
+pyrightconfig.json
+.DS_Store
+
+*.egg-info/
+*.egg
+*.pt
+*.kef
+.env.bak
vision_agents_plugins_kokoro-0.0.17/PKG-INFO
@@ -0,0 +1,54 @@
+Metadata-Version: 2.4
+Name: vision-agents-plugins-kokoro
+Version: 0.0.17
+Summary: Kokoro TTS integration for Vision Agents
+Project-URL: Documentation, https://visionagents.ai/
+Project-URL: Website, https://visionagents.ai/
+Project-URL: Source, https://github.com/GetStream/Vision-Agents
+License-Expression: MIT
+Keywords: AI,TTS,agents,kokoro,text-to-speech,voice agents
+Requires-Python: >=3.10
+Requires-Dist: kokoro>=0.9.4
+Requires-Dist: misaki[en]>=0.9.4
+Requires-Dist: numpy<2.3,>=2.2.6
+Requires-Dist: vision-agents
+Description-Content-Type: text/markdown
+
+# GetStream Kokoro Plugin
+
+This package integrates the open-weight [Kokoro-82M TTS model](https://github.com/hexgrad/kokoro) with the GetStream audio/video SDK.
+
+It provides a drop-in `KokoroTTS` class that implements the common `getstream_common.tts.TTS` interface, allowing you to stream PCM audio generated by Kokoro directly into a WebRTC `AudioStreamTrack`.
+
+```py
+from getstream.plugins.kokoro import KokoroTTS
+from getstream.video.rtc.audio_track import AudioStreamTrack
+
+track = AudioStreamTrack(framerate=24_000)
+
+tts = KokoroTTS(lang_code="a", voice="af_heart")
+tts.set_output_track(track)
+
+await tts.send("Hello from Kokoro!")
+```
+
+## Installation
+
+```bash
+pip install getstream-plugins-kokoro
+```
+
+This will pull in the required `kokoro`, `numpy` and `getstream[webrtc]` dependencies. You also need `espeak-ng` **at runtime** for pronunciation fallback. On macOS you can install it with Homebrew:
+
+```bash
+brew install espeak-ng
+```
+
+## Configuration options
+
+| Parameter | Default | Description |
+|-----------|---------|-------------|
+| `lang_code` | `"a"` | Language group passed to `KPipeline` (`"a"` = American English, etc.) |
+| `voice` | `"af_heart"` | Kokoro voice preset. See the [model card](https://huggingface.co/NeuML/kokoro-int8-onnx#speaker-reference) for available options. |
+| `speed` | `1.0` | Playback speed multiplier. |
+| `sample_rate` | `24000` | Output sample-rate (fixed by Kokoro). **The attached `AudioStreamTrack` must use the same value.** |
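The PKG-INFO block above is the metadata that installers expose for this wheel. As a minimal sketch, assuming the package is installed in the current environment, the same fields can be read back at runtime with the standard library:

```py
# Reads the installed distribution's metadata via importlib.metadata (stdlib).
# Assumes vision-agents-plugins-kokoro 0.0.17 is installed in this environment.
from importlib.metadata import metadata, requires, version

dist = "vision-agents-plugins-kokoro"
print(version(dist))               # "0.0.17"
print(metadata(dist)["Summary"])   # "Kokoro TTS integration for Vision Agents"
print(requires(dist))              # ['kokoro>=0.9.4', 'misaki[en]>=0.9.4', ...]
```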
vision_agents_plugins_kokoro-0.0.17/README.md
@@ -0,0 +1,38 @@
+# GetStream Kokoro Plugin
+
+This package integrates the open-weight [Kokoro-82M TTS model](https://github.com/hexgrad/kokoro) with the GetStream audio/video SDK.
+
+It provides a drop-in `KokoroTTS` class that implements the common `getstream_common.tts.TTS` interface, allowing you to stream PCM audio generated by Kokoro directly into a WebRTC `AudioStreamTrack`.
+
+```py
+from getstream.plugins.kokoro import KokoroTTS
+from getstream.video.rtc.audio_track import AudioStreamTrack
+
+track = AudioStreamTrack(framerate=24_000)
+
+tts = KokoroTTS(lang_code="a", voice="af_heart")
+tts.set_output_track(track)
+
+await tts.send("Hello from Kokoro!")
+```
+
+## Installation
+
+```bash
+pip install getstream-plugins-kokoro
+```
+
+This will pull in the required `kokoro`, `numpy` and `getstream[webrtc]` dependencies. You also need `espeak-ng` **at runtime** for pronunciation fallback. On macOS you can install it with Homebrew:
+
+```bash
+brew install espeak-ng
+```
+
+## Configuration options
+
+| Parameter | Default | Description |
+|-----------|---------|-------------|
+| `lang_code` | `"a"` | Language group passed to `KPipeline` (`"a"` = American English, etc.) |
+| `voice` | `"af_heart"` | Kokoro voice preset. See the [model card](https://huggingface.co/NeuML/kokoro-int8-onnx#speaker-reference) for available options. |
+| `speed` | `1.0` | Playback speed multiplier. |
+| `sample_rate` | `24000` | Output sample-rate (fixed by Kokoro). **The attached `AudioStreamTrack` must use the same value.** |
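The `sample_rate` row in the table above is a hard constraint: `set_output_track` in `tts.py` (shown further down) rejects any track whose framerate is not 24000 Hz. If the rest of a pipeline runs at a different rate, one option is to resample the PCM chunks before forwarding them. The helper below is a hypothetical illustration using plain numpy linear interpolation; it is not part of the plugin.

```py
import numpy as np

def resample_pcm16(chunk: bytes, src_rate: int = 24_000, dst_rate: int = 48_000) -> bytes:
    """Hypothetical helper: linearly resample a mono 16-bit PCM chunk."""
    samples = np.frombuffer(chunk, dtype="<i2").astype(np.float32)
    if samples.size == 0 or src_rate == dst_rate:
        return chunk
    dst_len = int(round(samples.size * dst_rate / src_rate))
    # Map source and destination samples onto a [0, 1) time axis and interpolate.
    src_t = np.linspace(0.0, 1.0, num=samples.size, endpoint=False)
    dst_t = np.linspace(0.0, 1.0, num=dst_len, endpoint=False)
    resampled = np.interp(dst_t, src_t, samples)
    return np.clip(resampled, -32768, 32767).astype("<i2").tobytes()
```

For production audio, a dedicated resampler such as soxr or scipy.signal.resample_poly would give better quality than linear interpolation.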
vision_agents_plugins_kokoro-0.0.17/pyproject.toml
@@ -0,0 +1,42 @@
+[build-system]
+requires = ["hatchling", "hatch-vcs"]
+build-backend = "hatchling.build"
+
+[project]
+name = "vision-agents-plugins-kokoro"
+dynamic = ["version"]
+description = "Kokoro TTS integration for Vision Agents"
+readme = "README.md"
+keywords = ["kokoro", "TTS", "text-to-speech", "AI", "voice agents", "agents"]
+requires-python = ">=3.10"
+license = "MIT"
+dependencies = [
+    "vision-agents",
+    "kokoro>=0.9.4",
+    "misaki[en]>=0.9.4",
+    "numpy>=2.2.6,<2.3",
+]
+
+[project.urls]
+Documentation = "https://visionagents.ai/"
+Website = "https://visionagents.ai/"
+Source = "https://github.com/GetStream/Vision-Agents"
+
+[tool.hatch.version]
+source = "vcs"
+raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }
+
+[tool.hatch.build.targets.wheel]
+packages = ["."]
+
+[tool.hatch.build.targets.sdist]
+include = ["/vision_agents"]
+
+[tool.uv.sources]
+vision-agents = { workspace = true }
+
+[dependency-groups]
+dev = [
+    "pytest>=8.4.1",
+    "pytest-asyncio>=1.0.0",
+]
vision_agents_plugins_kokoro-0.0.17/vision_agents/plugins/kokoro/tts.py
@@ -0,0 +1,92 @@
+from __future__ import annotations
+
+import asyncio
+import logging
+
+import numpy as np
+from typing import AsyncIterator, List, Optional
+
+from vision_agents.core import tts
+from getstream.video.rtc.audio_track import AudioStreamTrack
+
+try:
+    from kokoro import KPipeline  # type: ignore
+except ModuleNotFoundError:  # pragma: no cover – mocked during CI
+    KPipeline = None  # type: ignore # noqa: N816
+
+
+class TTS(tts.TTS):
+    """Text-to-Speech plugin backed by the Kokoro-82M model."""
+
+    def __init__(
+        self,
+        lang_code: str = "a",  # American English
+        voice: str = "af_heart",
+        speed: float = 1.0,
+        sample_rate: int = 24_000,
+        device: Optional[str] = None,
+        client: Optional[KPipeline] = None,
+    ) -> None:
+        super().__init__()
+
+        if KPipeline is None:
+            raise ImportError(
+                "The 'kokoro' package is not installed. ``pip install kokoro`` first."
+            )
+
+        self._pipeline = (
+            KPipeline(lang_code=lang_code)
+            if device is None
+            else KPipeline(lang_code=lang_code, device=device)
+        )
+        self.voice = voice
+        self.speed = speed
+        self.sample_rate = sample_rate
+        self.client = client if client is not None else self._pipeline
+
+    def get_required_framerate(self) -> int:
+        """Get the required framerate for Kokoro TTS."""
+        return self.sample_rate
+
+    def get_required_stereo(self) -> bool:
+        """Get whether Kokoro TTS requires stereo audio."""
+        return False  # Kokoro returns mono audio
+
+    def set_output_track(self, track: AudioStreamTrack) -> None:  # noqa: D401
+        if track.framerate != self.sample_rate:
+            raise TypeError(
+                f"Invalid framerate {track.framerate}, Kokoro requires {self.sample_rate} Hz"
+            )
+        super().set_output_track(track)
+
+    async def stream_audio(self, text: str, *_, **__) -> AsyncIterator[bytes]:  # noqa: D401
+        loop = asyncio.get_event_loop()
+        chunks: List[bytes] = await loop.run_in_executor(
+            None, lambda: list(self._generate_chunks(text))
+        )
+
+        async def _aiter():
+            for chunk in chunks:
+                yield chunk
+
+        return _aiter()
+
+    async def stop_audio(self) -> None:
+        """
+        Clears the queue and stops playing audio.
+
+        """
+        try:
+            await self.track.flush()
+            return
+        except Exception as e:
+            logging.error(f"Error flushing audio track: {e}")
+
+    def _generate_chunks(self, text: str):
+        for _gs, _ps, audio in self._pipeline(
+            text, voice=self.voice, speed=self.speed, split_pattern=r"\n+"
+        ):
+            if not isinstance(audio, np.ndarray):
+                audio = np.asarray(audio)
+            pcm16 = (np.clip(audio, -1.0, 1.0) * 32767.0).astype("<i2")
+            yield pcm16.tobytes()
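The `TTS` class above exposes `stream_audio()` as a coroutine that offloads Kokoro synthesis to a thread pool and returns an async iterator of little-endian int16 mono PCM chunks at 24 kHz. A minimal usage sketch, assuming the `kokoro` package and its model weights are available locally and leaving out the agent and audio-track wiring:

```py
import asyncio

from vision_agents.plugins.kokoro.tts import TTS

async def main() -> None:
    # Defaults mirror the constructor shown above: American English, "af_heart" voice.
    tts = TTS(lang_code="a", voice="af_heart", speed=1.0)

    # stream_audio() is awaited to obtain the iterator, then iterated for PCM chunks.
    async for chunk in await tts.stream_audio("Hello from Kokoro!"):
        print(f"got {len(chunk)} bytes of 24 kHz mono PCM")

asyncio.run(main())
```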