openspeechapi 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openspeech/__init__.py +75 -0
- openspeech/__main__.py +5 -0
- openspeech/cli.py +413 -0
- openspeech/client/__init__.py +4 -0
- openspeech/client/client.py +145 -0
- openspeech/config.py +212 -0
- openspeech/core/__init__.py +0 -0
- openspeech/core/base.py +75 -0
- openspeech/core/enums.py +39 -0
- openspeech/core/models.py +61 -0
- openspeech/core/registry.py +37 -0
- openspeech/core/settings.py +8 -0
- openspeech/demo.py +675 -0
- openspeech/dispatch/__init__.py +0 -0
- openspeech/dispatch/context.py +34 -0
- openspeech/dispatch/dispatcher.py +661 -0
- openspeech/dispatch/executors/__init__.py +0 -0
- openspeech/dispatch/executors/base.py +34 -0
- openspeech/dispatch/executors/in_process.py +66 -0
- openspeech/dispatch/executors/remote.py +64 -0
- openspeech/dispatch/executors/subprocess_exec.py +446 -0
- openspeech/dispatch/fanout.py +95 -0
- openspeech/dispatch/filters.py +73 -0
- openspeech/dispatch/lifecycle.py +178 -0
- openspeech/dispatch/watcher.py +82 -0
- openspeech/engine_catalog.py +236 -0
- openspeech/engine_registry.yaml +347 -0
- openspeech/exceptions.py +51 -0
- openspeech/factory.py +325 -0
- openspeech/local_engines/__init__.py +12 -0
- openspeech/local_engines/aim_resolver.py +91 -0
- openspeech/local_engines/backends/__init__.py +1 -0
- openspeech/local_engines/backends/docker_backend.py +490 -0
- openspeech/local_engines/backends/native_backend.py +902 -0
- openspeech/local_engines/base.py +30 -0
- openspeech/local_engines/engines/__init__.py +1 -0
- openspeech/local_engines/engines/faster_whisper.py +36 -0
- openspeech/local_engines/engines/fish_speech.py +33 -0
- openspeech/local_engines/engines/sherpa_onnx.py +56 -0
- openspeech/local_engines/engines/whisper.py +41 -0
- openspeech/local_engines/engines/whisperlivekit.py +60 -0
- openspeech/local_engines/manager.py +208 -0
- openspeech/local_engines/models.py +50 -0
- openspeech/local_engines/progress.py +69 -0
- openspeech/local_engines/registry.py +19 -0
- openspeech/local_engines/task_store.py +52 -0
- openspeech/local_engines/tasks.py +71 -0
- openspeech/logging_config.py +607 -0
- openspeech/observe/__init__.py +0 -0
- openspeech/observe/base.py +79 -0
- openspeech/observe/debug.py +44 -0
- openspeech/observe/latency.py +19 -0
- openspeech/observe/metrics.py +47 -0
- openspeech/observe/tracing.py +44 -0
- openspeech/observe/usage.py +27 -0
- openspeech/providers/__init__.py +0 -0
- openspeech/providers/_template.py +101 -0
- openspeech/providers/stt/__init__.py +0 -0
- openspeech/providers/stt/alibaba.py +86 -0
- openspeech/providers/stt/assemblyai.py +135 -0
- openspeech/providers/stt/azure_speech.py +99 -0
- openspeech/providers/stt/baidu.py +135 -0
- openspeech/providers/stt/deepgram.py +311 -0
- openspeech/providers/stt/elevenlabs.py +385 -0
- openspeech/providers/stt/faster_whisper.py +211 -0
- openspeech/providers/stt/google_cloud.py +106 -0
- openspeech/providers/stt/iflytek.py +427 -0
- openspeech/providers/stt/macos_speech.py +226 -0
- openspeech/providers/stt/openai.py +84 -0
- openspeech/providers/stt/sherpa_onnx.py +353 -0
- openspeech/providers/stt/tencent.py +212 -0
- openspeech/providers/stt/volcengine.py +107 -0
- openspeech/providers/stt/whisper.py +153 -0
- openspeech/providers/stt/whisperlivekit.py +530 -0
- openspeech/providers/stt/windows_speech.py +249 -0
- openspeech/providers/tts/__init__.py +0 -0
- openspeech/providers/tts/alibaba.py +95 -0
- openspeech/providers/tts/azure_speech.py +123 -0
- openspeech/providers/tts/baidu.py +143 -0
- openspeech/providers/tts/coqui.py +64 -0
- openspeech/providers/tts/cosyvoice.py +90 -0
- openspeech/providers/tts/deepgram.py +174 -0
- openspeech/providers/tts/elevenlabs.py +311 -0
- openspeech/providers/tts/fish_speech.py +158 -0
- openspeech/providers/tts/google_cloud.py +107 -0
- openspeech/providers/tts/iflytek.py +209 -0
- openspeech/providers/tts/macos_say.py +251 -0
- openspeech/providers/tts/minimax.py +122 -0
- openspeech/providers/tts/openai.py +104 -0
- openspeech/providers/tts/piper.py +104 -0
- openspeech/providers/tts/tencent.py +189 -0
- openspeech/providers/tts/volcengine.py +117 -0
- openspeech/providers/tts/windows_sapi.py +234 -0
- openspeech/server/__init__.py +1 -0
- openspeech/server/app.py +72 -0
- openspeech/server/auth.py +42 -0
- openspeech/server/middleware.py +75 -0
- openspeech/server/routes/__init__.py +1 -0
- openspeech/server/routes/management.py +848 -0
- openspeech/server/routes/stt.py +121 -0
- openspeech/server/routes/tts.py +159 -0
- openspeech/server/routes/webui.py +29 -0
- openspeech/server/webui/app.js +2649 -0
- openspeech/server/webui/index.html +216 -0
- openspeech/server/webui/styles.css +617 -0
- openspeech/server/ws/__init__.py +1 -0
- openspeech/server/ws/stt_stream.py +263 -0
- openspeech/server/ws/tts_stream.py +207 -0
- openspeech/telemetry/__init__.py +21 -0
- openspeech/telemetry/perf.py +307 -0
- openspeech/utils/__init__.py +5 -0
- openspeech/utils/audio_converter.py +406 -0
- openspeech/utils/audio_playback.py +156 -0
- openspeech/vendor_registry.yaml +74 -0
- openspeechapi-0.1.0.dist-info/METADATA +101 -0
- openspeechapi-0.1.0.dist-info/RECORD +118 -0
- openspeechapi-0.1.0.dist-info/WHEEL +4 -0
- openspeechapi-0.1.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""CosyVoice TTS provider adapter (voice clone + multilingual, subprocess mode)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from openspeech.logging_config import logger
|
|
5
|
+
import time
|
|
6
|
+
from collections.abc import AsyncIterator
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from openspeech.core.base import TTSProvider
|
|
11
|
+
|
|
12
|
+
from openspeech.core.enums import AudioFormat, Capability, ExecMode, ProviderType
|
|
13
|
+
from openspeech.core.models import AudioChunk, AudioData, TTSOptions
|
|
14
|
+
from openspeech.core.settings import BaseSettings
|
|
15
|
+
|
|
16
|
+
@dataclass
class CosyVoiceTTSSettings(BaseSettings):
    """Configuration for the CosyVoice local TTS provider."""

    # Path to the downloaded CosyVoice model directory; passed to the model
    # constructor in CosyVoiceTTS.start().
    model_dir: str = ""
    # Compute device selector; valid values listed in CosyVoiceTTS.field_options.
    # NOTE(review): not currently forwarded to the model in start() — confirm intended.
    device: str = "auto"
    # Half-precision inference toggle.
    # NOTE(review): also not forwarded to the model in start() — confirm intended.
    fp16: bool = False
    # Built-in speaker id used for SFT inference; synthesize() falls back to
    # "中文女" when this is unset.
    spk_id: str | None = None
|
|
23
|
+
class CosyVoiceTTS(TTSProvider):
    """CosyVoice text-to-speech provider (SFT mode, subprocess execution).

    Loads the CosyVoice model from ``settings.model_dir`` on :meth:`start`
    and synthesizes 22.05 kHz mono WAV audio. Streaming synthesis is not
    implemented yet.
    """

    name = "cosyvoice"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.SUBPROCESS
    settings_cls = CosyVoiceTTSSettings
    capabilities = {Capability.VOICE_CLONE, Capability.MULTILINGUAL}
    field_options = {
        "device": ["auto", "cpu", "cuda", "mps"],
    }

    def __init__(self, settings: CosyVoiceTTSSettings | None = None) -> None:
        self.settings = settings or CosyVoiceTTSSettings()
        self._client: Any = None
        self._model: Any = None

    async def start(self) -> None:
        """Load the CosyVoice model.

        Raises:
            ImportError: if the ``cosyvoice`` package is not installed.
        """
        try:
            from cosyvoice.cli.cosyvoice import CosyVoice as CosyVoiceModel
        except ImportError as exc:
            # Chain the cause so the real import failure stays visible.
            raise ImportError(
                "Install cosyvoice: pip install openspeech[cosyvoice]"
            ) from exc
        # NOTE(review): settings.device / settings.fp16 are not forwarded to
        # the model constructor — confirm whether CosyVoiceModel reads them.
        self._model = CosyVoiceModel(self.settings.model_dir)
        self._client = self._model
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Drop model references; underlying memory is reclaimed by GC."""
        self._client = None
        self._model = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Healthy iff start() completed and the model is loaded."""
        return self._client is not None

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Synthesize *text* into a single WAV :class:`AudioData` blob.

        The speaker is resolved from ``opts.voice`` (per-request override),
        then ``settings.spk_id``, then the default "中文女".

        Raises:
            RuntimeError: if the provider was not started or the model
                produced no output.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()
        import io

        import torchaudio

        opts = opts or TTSOptions()
        # Honor per-request voice selection like the other TTS providers.
        spk_id = getattr(opts, "voice", None) or self.settings.spk_id or "中文女"
        # inference_sft yields segments lazily; materialize to detect empty output.
        output = list(self._model.inference_sft(text, spk_id))
        if not output:
            raise RuntimeError("CosyVoice produced no output")
        speech = output[0]["tts_speech"]
        buf = io.BytesIO()
        # CosyVoice emits 22.05 kHz mono tensors.
        torchaudio.save(buf, speech, 22050, format="wav")
        result = AudioData(
            data=buf.getvalue(),
            sample_rate=22050,
            channels=1,
            format=AudioFormat.WAV,
        )
        logger.info(
            "{}: completed in {:.0f}ms, output={} bytes",
            self.name,
            (time.perf_counter() - _t0) * 1000,
            len(result.data),
        )
        return result

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        """Streaming synthesis is not implemented for CosyVoice yet."""
        raise NotImplementedError(
            "CosyVoiceTTS.synthesize_stream() is not yet implemented"
        )
        # Unreachable yield keeps this function an async generator, so callers
        # receive NotImplementedError on first iteration, not at call time.
        yield  # pragma: no cover
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""Deepgram TTS provider adapter (Aura API, httpx-based, no SDK needed)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import time
|
|
5
|
+
from collections.abc import AsyncIterator
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import httpx
|
|
10
|
+
from openspeech.logging_config import logger
|
|
11
|
+
|
|
12
|
+
from openspeech.core.base import TTSProvider
|
|
13
|
+
from openspeech.core.enums import AudioFormat, Capability, ExecMode, ProviderType
|
|
14
|
+
from openspeech.core.models import AudioChunk, AudioData, TTSOptions
|
|
15
|
+
from openspeech.core.settings import BaseSettings
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class DeepgramTTSSettings(BaseSettings):
    """Configuration for the Deepgram Aura TTS provider."""

    # Deepgram API key; health_check() reports unhealthy while this is empty.
    api_key: str = ""
    # Aura voice model id; valid values listed in DeepgramTTS.field_options["model"].
    model: str = "aura-asteria-en"
|
24
|
+
class DeepgramTTS(TTSProvider):
    """Deepgram Aura TTS provider (raw HTTP API via httpx, no SDK needed).

    Calls ``POST https://api.deepgram.com/v1/speak`` requesting linear16 PCM
    at 24 kHz, for both one-shot and chunked streaming synthesis.
    """

    name = "deepgram-tts"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = DeepgramTTSSettings
    capabilities = {Capability.STREAMING, Capability.MULTILINGUAL}
    field_options = {
        "model": [
            # Aura-2 English
            "aura-2-asteria-en", "aura-2-athena-en", "aura-2-luna-en",
            "aura-2-hera-en", "aura-2-orion-en", "aura-2-orpheus-en",
            "aura-2-arcas-en", "aura-2-zeus-en", "aura-2-apollo-en",
            "aura-2-helena-en", "aura-2-andromeda-en", "aura-2-thalia-en",
            "aura-2-aurora-en", "aura-2-iris-en", "aura-2-electra-en",
            # Aura-2 Chinese / Japanese / Korean
            "aura-2-uzume-ja", "aura-2-ebisu-ja", "aura-2-fujin-ja",
            # Aura-2 European
            "aura-2-agathe-fr", "aura-2-hector-fr",
            "aura-2-elara-de", "aura-2-aurelia-de", "aura-2-julius-de",
            "aura-2-sirio-es", "aura-2-carina-es", "aura-2-diana-es",
            "aura-2-melia-it", "aura-2-elio-it",
            "aura-2-beatrix-nl", "aura-2-daphne-nl",
            # Aura-1 English (legacy)
            "aura-asteria-en", "aura-luna-en", "aura-stella-en",
            "aura-athena-en", "aura-hera-en", "aura-orion-en",
            "aura-arcas-en", "aura-perseus-en", "aura-angus-en",
            "aura-orpheus-en", "aura-helios-en", "aura-zeus-en",
        ],
    }

    # Single endpoint shared by one-shot and streaming synthesis.
    _SPEAK_URL = "https://api.deepgram.com/v1/speak"

    def __init__(self, settings: DeepgramTTSSettings | None = None) -> None:
        self.settings = settings or DeepgramTTSSettings()
        self._client: httpx.AsyncClient | None = None
        self._owns_client: bool = True

    def set_http_client(self, client: httpx.AsyncClient) -> None:
        """Inject a shared httpx client; stop() will then not close it."""
        self._client = client
        self._owns_client = False

    def _request_parts(self, text: str) -> tuple[dict, dict, dict]:
        """Build the (headers, params, payload) triple shared by both synthesis paths."""
        headers = {
            "Authorization": f"Token {self.settings.api_key}",
            "Content-Type": "application/json",
        }
        params = {"model": self.settings.model, "encoding": "linear16", "sample_rate": "24000"}
        payload = {"text": text}
        return headers, params, payload

    async def start(self) -> None:
        """Create an owned httpx client unless one was injected via set_http_client()."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=60.0)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the httpx client only if this provider created it."""
        if self._client and self._owns_client:
            await self._client.aclose()
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Healthy iff an API key is configured (no network round-trip)."""
        return bool(self.settings.api_key)

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Synthesize *text* in one request and return the full audio blob.

        Raises:
            RuntimeError: if the provider was not started or the API
                returned a non-200 status.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()
        opts = opts or TTSOptions()

        headers, params, payload = self._request_parts(text)
        resp = await self._client.post(
            self._SPEAK_URL,
            params=params,
            headers=headers,
            json=payload,
        )
        if resp.status_code != 200:
            raise RuntimeError(f"Deepgram TTS API error ({resp.status_code}): {resp.text}")

        audio_bytes = resp.content
        # NOTE(review): audio is requested as linear16 @ 24000 Hz but tagged
        # AudioFormat.PCM_16K — confirm that enum means 16-bit PCM, not 16 kHz.
        result = AudioData(
            data=audio_bytes,
            sample_rate=24000,
            channels=1,
            format=AudioFormat.PCM_16K,
        )
        logger.info(
            "{}: completed in {:.0f}ms, output={} bytes",
            self.name,
            (time.perf_counter() - _t0) * 1000,
            len(result.data),
        )
        return result

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        """Stream synthesized audio as 4 KiB AudioChunks, ending with an
        empty ``is_final`` chunk.

        Raises:
            RuntimeError: if the provider was not started or the API
                returned a non-200 status.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: stream request, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()
        opts = opts or TTSOptions()

        headers, params, payload = self._request_parts(text)
        async with self._client.stream(
            "POST",
            self._SPEAK_URL,
            params=params,
            headers=headers,
            json=payload,
        ) as response:
            if response.status_code != 200:
                body = await response.aread()
                raise RuntimeError(f"Deepgram TTS API error ({response.status_code}): {body.decode()}")
            sequence = 0
            async for chunk in response.aiter_bytes(chunk_size=4096):
                logger.debug("{}: chunk #{}, {} bytes", self.name, sequence, len(chunk))
                yield AudioChunk(data=chunk, sequence=sequence)
                sequence += 1
            # Sentinel chunk marks end-of-stream; `sequence` now equals the
            # number of data chunks emitted.
            yield AudioChunk(data=b"", sequence=sequence, is_final=True)
            logger.info(
                "{}: stream complete, {} chunks in {:.0f}ms",
                self.name,
                sequence,
                (time.perf_counter() - _t0) * 1000,
            )

    async def list_voices(self) -> list[dict]:
        """Return available Deepgram Aura voice models as {id, name} dicts."""
        voices = []
        for m in self.field_options["model"]:
            # Extract readable name: "aura-2-asteria-en" → "Asteria (en)"
            parts = m.split("-")
            if len(parts) > 1 and parts[0] == "aura" and parts[1] == "2":
                core = parts[2:]
            else:
                core = parts[1:]
            name = core[0].title() if core else m
            lang = core[1] if len(core) > 1 else ""
            label = f"{name} ({lang})" if lang else name
            voices.append({"id": m, "name": label})
        return voices
|
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
"""ElevenLabs TTS provider adapter (streaming + voice clone + emotion, in-process)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from openspeech.logging_config import logger
|
|
5
|
+
import base64
|
|
6
|
+
import json
|
|
7
|
+
import time
|
|
8
|
+
from collections.abc import AsyncIterator
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from typing import Any
|
|
11
|
+
from urllib.parse import urlencode
|
|
12
|
+
|
|
13
|
+
import httpx
|
|
14
|
+
|
|
15
|
+
from openspeech.core.base import TTSProvider
|
|
16
|
+
|
|
17
|
+
from openspeech.core.enums import AudioFormat, Capability, ExecMode, ProviderType
|
|
18
|
+
from openspeech.core.models import AudioChunk, AudioData, TTSOptions
|
|
19
|
+
from openspeech.core.settings import BaseSettings
|
|
20
|
+
|
|
21
|
+
@dataclass
class ElevenLabsTTSSettings(BaseSettings):
    """Configuration for the ElevenLabs TTS provider."""

    # ElevenLabs API key; health_check() reports unhealthy while empty.
    api_key: str = ""
    # Default voice used when TTSOptions.voice is not set.
    voice_id: str = "21m00Tcm4TlvDq8ikWAM"  # Rachel
    # Default model used when TTSOptions.model is not set.
    model_id: str = "eleven_multilingual_v2"
    # Encoded output container/rate/bitrate string as accepted by the API.
    output_format: str = "mp3_44100_128"
    # 0-4 latency/quality trade-off; None omits the parameter entirely.
    optimize_streaming_latency: int | None = None
    # Deterministic-generation seed; None omits the parameter.
    seed: int | None = None
    # ISO-639-1 language hint; empty string is translated to None in requests.
    language_code: str = ""
    # Text normalization mode: "auto" | "on" | "off".
    apply_text_normalization: str = "auto"
    apply_language_text_normalization: bool = False
    # Voice-settings knobs forwarded under "voice_settings" in every request.
    stability: float = 0.5
    similarity_boost: float = 0.75
    style: float = 0.0
    use_speaker_boost: bool = True
    # Streaming transport for synthesize_stream(): plain HTTP chunking or the
    # low-latency WebSocket stream-input protocol.
    stream_transport: str = "http"  # http | ws
    # Optional WS generation_config.chunk_length_schedule override.
    ws_chunk_length_schedule: list[int] | None = None
|
|
38
|
+
|
|
39
|
+
class ElevenLabsTTS(TTSProvider):
    """ElevenLabs TTS provider (streaming + voice clone + emotion, in-process).

    Uses the official ``elevenlabs`` async SDK for one-shot and HTTP-chunked
    streaming synthesis, and speaks the raw stream-input WebSocket protocol
    directly (via ``websockets``) when ``stream_transport == "ws"``.
    """

    name = "elevenlabs"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = ElevenLabsTTSSettings
    capabilities = {
        Capability.STREAMING,
        Capability.VOICE_CLONE,
        Capability.EMOTION,
    }
    field_options = {
        # Synced with ElevenLabs models overview and TTS API docs (Apr 2026):
        # https://elevenlabs.io/docs/overview/models
        # https://elevenlabs.io/docs/api-reference/text-to-speech
        "model_id": [
            "eleven_v3",
            "eleven_flash_v2_5",
            "eleven_flash_v2",
            "eleven_multilingual_v2",
            # Deprecated but still accepted by API in compatibility mode.
            "eleven_multilingual_v1",
            "eleven_turbo_v2_5",
            "eleven_turbo_v2",
            "eleven_monolingual_v1",
        ],
        "output_format": [
            "mp3_22050_32", "mp3_44100_32", "mp3_44100_64", "mp3_44100_96",
            "mp3_44100_128", "mp3_44100_192",
            "opus_48000_32", "opus_48000_64", "opus_48000_96", "opus_48000_128", "opus_48000_192",
            "pcm_16000", "pcm_22050", "pcm_24000", "pcm_32000", "pcm_44100", "pcm_48000", "pcm_8000",
            "wav_16000", "wav_22050", "wav_24000", "wav_32000", "wav_44100", "wav_48000", "wav_8000",
            "ulaw_8000", "alaw_8000",
        ],
        "optimize_streaming_latency": [0, 1, 2, 3, 4],
        "stream_transport": ["http", "ws"],
        "apply_text_normalization": ["auto", "on", "off"],
        # Common language_code options (ISO-639-1) for TTS endpoint.
        "language_code": [
            "",
            "en", "zh", "ja", "ko", "es", "fr", "de", "pt", "it", "hi",
            "id", "nl", "tr", "pl", "sv", "ar", "ru", "uk", "vi", "hu",
            "no", "da", "fi", "cs", "el", "ro", "bg", "hr", "sk", "ms",
            "ta", "fil",
        ],
    }

    def __init__(self, settings: ElevenLabsTTSSettings | None = None) -> None:
        self.settings = settings or ElevenLabsTTSSettings()
        self._client: Any = None
        self._http_client: httpx.AsyncClient | None = None

    async def start(self) -> None:
        """Create the SDK client over a dedicated httpx client.

        Raises:
            ImportError: if the ``elevenlabs`` SDK is not installed.
        """
        try:
            from elevenlabs.client import AsyncElevenLabs
        except ImportError:
            raise ImportError(
                "Install elevenlabs: pip install openspeech[elevenlabs-tts]"
            )
        # trust_env=False: ignore proxy env vars for this dedicated client.
        self._http_client = httpx.AsyncClient(timeout=240.0, trust_env=False)
        self._client = AsyncElevenLabs(
            api_key=self.settings.api_key,
            httpx_client=self._http_client,
        )
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Release the SDK client and close the owned httpx client."""
        self._client = None
        if self._http_client is not None:
            await self._http_client.aclose()
            self._http_client = None
        logger.info("{} provider stopped", self.name)

    async def list_voices(self) -> list[dict]:
        """Fetch available voices from the API, sorted by category then name.

        Returns an empty list if the provider is not started or the fetch
        fails (best-effort, logged as a warning).
        """
        if self._client is None:
            return []
        try:
            resp = await self._client.voices.get_all(show_legacy=False)
        except Exception as exc:
            logger.warning("{}: failed to fetch voices: {}", self.name, exc)
            return []

        voices: list[dict] = []
        # Defensive getattr access: SDK response shape may vary by version.
        for v in getattr(resp, "voices", []) or []:
            voice_id = str(getattr(v, "voice_id", "") or "").strip()
            if not voice_id:
                continue
            display_name = str(getattr(v, "name", "") or voice_id).strip()
            category = str(getattr(v, "category", "") or "").strip()
            labels = getattr(v, "labels", None) or {}
            language = ""
            if isinstance(labels, dict):
                language = str(labels.get("language") or labels.get("accent") or "").strip()
            item: dict[str, str] = {"id": voice_id, "name": display_name}
            if category:
                item["group"] = category.title()
            if language:
                item["language"] = language
            voices.append(item)
        voices.sort(key=lambda x: (x.get("group", ""), x.get("name", "").lower()))
        return voices

    async def health_check(self) -> bool:
        """Healthy iff an API key is configured (no network round-trip)."""
        return bool(self.settings.api_key)

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Synthesize *text* in one call and return the full audio blob.

        ``opts.voice`` / ``opts.model`` override the configured defaults.

        Raises:
            RuntimeError: if the provider was not started.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()
        opts = opts or TTSOptions()
        voice_id = opts.voice or self.settings.voice_id
        model_id = opts.model or self.settings.model_id
        request_kwargs = {
            "voice_id": voice_id,
            "text": text,
            "model_id": model_id,
            "output_format": self.settings.output_format,
            "optimize_streaming_latency": self.settings.optimize_streaming_latency,
            "seed": self.settings.seed,
            # Empty string means "unset"; API expects the key to be omitted/None.
            "language_code": (self.settings.language_code or None),
            "apply_text_normalization": self.settings.apply_text_normalization,
            "apply_language_text_normalization": self.settings.apply_language_text_normalization,
            "voice_settings": {
                "stability": self.settings.stability,
                "similarity_boost": self.settings.similarity_boost,
                "style": self.settings.style,
                "use_speaker_boost": self.settings.use_speaker_boost,
            },
        }
        # convert() returns an async byte iterator in the async SDK.
        response = self._client.text_to_speech.convert(
            **request_kwargs,
        )
        # Use bytearray to avoid quadratic copy cost on long streamed outputs.
        audio_buf = bytearray()
        async for chunk in response:
            audio_buf.extend(chunk)
        # NOTE(review): sample_rate/format are hard-coded to 44.1 kHz MP3 even
        # though settings.output_format can select PCM/Opus — confirm intended.
        result = AudioData(
            data=bytes(audio_buf),
            sample_rate=44100,
            channels=1,
            format=AudioFormat.MP3,
        )
        logger.info("{}: completed in {:.0f}ms, output={} bytes", self.name, (time.perf_counter() - _t0) * 1000, len(result.data))
        return result

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        """Stream synthesized audio as AudioChunks, ending with an empty
        ``is_final`` chunk.

        Transport is chosen by ``opts.stream_transport`` /
        ``settings.stream_transport``: "http" uses SDK chunked streaming,
        "ws" uses the stream-input WebSocket protocol.

        Raises:
            RuntimeError: if the provider was not started.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: stream request, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()
        opts = opts or TTSOptions()
        voice_id = opts.voice or self.settings.voice_id
        model_id = opts.model or self.settings.model_id
        stream_transport = (opts.stream_transport or self.settings.stream_transport or "http").strip().lower()
        request_kwargs = {
            "voice_id": voice_id,
            "text": text,
            "model_id": model_id,
            "output_format": self.settings.output_format,
            "optimize_streaming_latency": self.settings.optimize_streaming_latency,
            "seed": self.settings.seed,
            "language_code": (self.settings.language_code or None),
            "apply_text_normalization": self.settings.apply_text_normalization,
            "apply_language_text_normalization": self.settings.apply_language_text_normalization,
            "voice_settings": {
                "stability": self.settings.stability,
                "similarity_boost": self.settings.similarity_boost,
                "style": self.settings.style,
                "use_speaker_boost": self.settings.use_speaker_boost,
            },
        }
        # ElevenLabs docs: text-to-speech WS isn't available for eleven_v3.
        if stream_transport == "ws" and str(model_id) == "eleven_v3":
            logger.warning("{}: model {} does not support WS input streaming, fallback to HTTP streaming", self.name, model_id)
            stream_transport = "http"

        if stream_transport == "ws":
            async for chunk in self._synthesize_stream_ws(text=text, request_kwargs=request_kwargs):
                yield chunk
            return

        # HTTP path: same convert() call as synthesize(), but chunks are
        # forwarded to the caller as they arrive instead of being buffered.
        response = self._client.text_to_speech.convert(**request_kwargs)
        sequence = 0
        chunk_count = 0
        async for chunk in response:
            logger.debug("{}: chunk #{}, {} bytes", self.name, sequence, len(chunk))
            yield AudioChunk(data=chunk, sequence=sequence)
            sequence += 1
            chunk_count += 1
        # Sentinel chunk marks end-of-stream for consumers.
        yield AudioChunk(data=b"", sequence=sequence, is_final=True)
        logger.info("{}: stream complete, {} chunks in {:.0f}ms", self.name, chunk_count, (time.perf_counter() - _t0) * 1000)

    async def _synthesize_stream_ws(
        self, *, text: str, request_kwargs: dict[str, Any]
    ) -> AsyncIterator[AudioChunk]:
        """Low-latency input-streaming over ElevenLabs WebSocket API.

        Implements the stream-input protocol: an init message carrying the
        API key and voice settings, the text, a flush, then an empty-text
        EOS message; audio arrives base64-encoded in JSON frames.
        """
        import websockets

        voice_id = request_kwargs["voice_id"]
        params = {
            "model_id": request_kwargs.get("model_id"),
            "output_format": request_kwargs.get("output_format"),
        }
        if request_kwargs.get("optimize_streaming_latency") is not None:
            params["optimize_streaming_latency"] = request_kwargs["optimize_streaming_latency"]
        if request_kwargs.get("language_code"):
            params["language_code"] = request_kwargs.get("language_code")
        # Drop unset params so they are omitted from the query string entirely.
        qs = urlencode({k: v for k, v in params.items() if v not in (None, "")})
        url = f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input?{qs}"

        generation_config: dict[str, Any] = {}
        if self.settings.ws_chunk_length_schedule:
            generation_config["chunk_length_schedule"] = self.settings.ws_chunk_length_schedule

        # Init message: a single space primes the session without audible output.
        init_msg: dict[str, Any] = {
            "text": " ",
            "xi_api_key": self.settings.api_key,
            "voice_settings": request_kwargs.get("voice_settings") or {},
        }
        if generation_config:
            init_msg["generation_config"] = generation_config

        sequence = 0
        had_audio = False
        async with websockets.connect(url) as ws:
            await ws.send(json.dumps(init_msg))
            await ws.send(json.dumps({"text": text}))
            # Force pending audio generation, then explicitly terminate input
            # with EOS. Official protocol treats {"text": ""} as EOS.
            await ws.send(json.dumps({"text": "", "flush": True}))
            await ws.send(json.dumps({"text": ""}))

            async for raw in ws:
                try:
                    data = json.loads(raw)
                except Exception:
                    # Ignore frames that are not valid JSON.
                    continue

                if "error" in data:
                    raw_err = data.get("error")
                    if isinstance(raw_err, dict):
                        err_msg = str(raw_err.get("code") or raw_err.get("message") or raw_err)
                    else:
                        err_msg = str(raw_err or "")
                    # Some sessions return input timeout after all usable audio
                    # has already been emitted; treat it as graceful close.
                    if "input_timeout_exceeded" in err_msg and had_audio:
                        logger.warning(
                            "{}: ws ended with input timeout after audio output; treating as complete",
                            self.name,
                        )
                        break
                    raise RuntimeError(f"ElevenLabs TTS WS error: {raw_err}")

                # Audio payload key differs across protocol versions.
                audio_b64 = data.get("audio") or data.get("audio_base64") or ""
                if audio_b64:
                    try:
                        chunk = base64.b64decode(audio_b64)
                    except Exception:
                        # Undecodable payload: skip rather than abort the stream.
                        chunk = b""
                    if chunk:
                        had_audio = True
                        yield AudioChunk(data=chunk, sequence=sequence)
                        sequence += 1

                if data.get("isFinal") is True:
                    break

        # Sentinel chunk marks end-of-stream for consumers.
        yield AudioChunk(data=b"", sequence=sequence, is_final=True)
|