openspeechapi 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. openspeech/__init__.py +75 -0
  2. openspeech/__main__.py +5 -0
  3. openspeech/cli.py +413 -0
  4. openspeech/client/__init__.py +4 -0
  5. openspeech/client/client.py +145 -0
  6. openspeech/config.py +212 -0
  7. openspeech/core/__init__.py +0 -0
  8. openspeech/core/base.py +75 -0
  9. openspeech/core/enums.py +39 -0
  10. openspeech/core/models.py +61 -0
  11. openspeech/core/registry.py +37 -0
  12. openspeech/core/settings.py +8 -0
  13. openspeech/demo.py +675 -0
  14. openspeech/dispatch/__init__.py +0 -0
  15. openspeech/dispatch/context.py +34 -0
  16. openspeech/dispatch/dispatcher.py +661 -0
  17. openspeech/dispatch/executors/__init__.py +0 -0
  18. openspeech/dispatch/executors/base.py +34 -0
  19. openspeech/dispatch/executors/in_process.py +66 -0
  20. openspeech/dispatch/executors/remote.py +64 -0
  21. openspeech/dispatch/executors/subprocess_exec.py +446 -0
  22. openspeech/dispatch/fanout.py +95 -0
  23. openspeech/dispatch/filters.py +73 -0
  24. openspeech/dispatch/lifecycle.py +178 -0
  25. openspeech/dispatch/watcher.py +82 -0
  26. openspeech/engine_catalog.py +236 -0
  27. openspeech/engine_registry.yaml +347 -0
  28. openspeech/exceptions.py +51 -0
  29. openspeech/factory.py +325 -0
  30. openspeech/local_engines/__init__.py +12 -0
  31. openspeech/local_engines/aim_resolver.py +91 -0
  32. openspeech/local_engines/backends/__init__.py +1 -0
  33. openspeech/local_engines/backends/docker_backend.py +490 -0
  34. openspeech/local_engines/backends/native_backend.py +902 -0
  35. openspeech/local_engines/base.py +30 -0
  36. openspeech/local_engines/engines/__init__.py +1 -0
  37. openspeech/local_engines/engines/faster_whisper.py +36 -0
  38. openspeech/local_engines/engines/fish_speech.py +33 -0
  39. openspeech/local_engines/engines/sherpa_onnx.py +56 -0
  40. openspeech/local_engines/engines/whisper.py +41 -0
  41. openspeech/local_engines/engines/whisperlivekit.py +60 -0
  42. openspeech/local_engines/manager.py +208 -0
  43. openspeech/local_engines/models.py +50 -0
  44. openspeech/local_engines/progress.py +69 -0
  45. openspeech/local_engines/registry.py +19 -0
  46. openspeech/local_engines/task_store.py +52 -0
  47. openspeech/local_engines/tasks.py +71 -0
  48. openspeech/logging_config.py +607 -0
  49. openspeech/observe/__init__.py +0 -0
  50. openspeech/observe/base.py +79 -0
  51. openspeech/observe/debug.py +44 -0
  52. openspeech/observe/latency.py +19 -0
  53. openspeech/observe/metrics.py +47 -0
  54. openspeech/observe/tracing.py +44 -0
  55. openspeech/observe/usage.py +27 -0
  56. openspeech/providers/__init__.py +0 -0
  57. openspeech/providers/_template.py +101 -0
  58. openspeech/providers/stt/__init__.py +0 -0
  59. openspeech/providers/stt/alibaba.py +86 -0
  60. openspeech/providers/stt/assemblyai.py +135 -0
  61. openspeech/providers/stt/azure_speech.py +99 -0
  62. openspeech/providers/stt/baidu.py +135 -0
  63. openspeech/providers/stt/deepgram.py +311 -0
  64. openspeech/providers/stt/elevenlabs.py +385 -0
  65. openspeech/providers/stt/faster_whisper.py +211 -0
  66. openspeech/providers/stt/google_cloud.py +106 -0
  67. openspeech/providers/stt/iflytek.py +427 -0
  68. openspeech/providers/stt/macos_speech.py +226 -0
  69. openspeech/providers/stt/openai.py +84 -0
  70. openspeech/providers/stt/sherpa_onnx.py +353 -0
  71. openspeech/providers/stt/tencent.py +212 -0
  72. openspeech/providers/stt/volcengine.py +107 -0
  73. openspeech/providers/stt/whisper.py +153 -0
  74. openspeech/providers/stt/whisperlivekit.py +530 -0
  75. openspeech/providers/stt/windows_speech.py +249 -0
  76. openspeech/providers/tts/__init__.py +0 -0
  77. openspeech/providers/tts/alibaba.py +95 -0
  78. openspeech/providers/tts/azure_speech.py +123 -0
  79. openspeech/providers/tts/baidu.py +143 -0
  80. openspeech/providers/tts/coqui.py +64 -0
  81. openspeech/providers/tts/cosyvoice.py +90 -0
  82. openspeech/providers/tts/deepgram.py +174 -0
  83. openspeech/providers/tts/elevenlabs.py +311 -0
  84. openspeech/providers/tts/fish_speech.py +158 -0
  85. openspeech/providers/tts/google_cloud.py +107 -0
  86. openspeech/providers/tts/iflytek.py +209 -0
  87. openspeech/providers/tts/macos_say.py +251 -0
  88. openspeech/providers/tts/minimax.py +122 -0
  89. openspeech/providers/tts/openai.py +104 -0
  90. openspeech/providers/tts/piper.py +104 -0
  91. openspeech/providers/tts/tencent.py +189 -0
  92. openspeech/providers/tts/volcengine.py +117 -0
  93. openspeech/providers/tts/windows_sapi.py +234 -0
  94. openspeech/server/__init__.py +1 -0
  95. openspeech/server/app.py +72 -0
  96. openspeech/server/auth.py +42 -0
  97. openspeech/server/middleware.py +75 -0
  98. openspeech/server/routes/__init__.py +1 -0
  99. openspeech/server/routes/management.py +848 -0
  100. openspeech/server/routes/stt.py +121 -0
  101. openspeech/server/routes/tts.py +159 -0
  102. openspeech/server/routes/webui.py +29 -0
  103. openspeech/server/webui/app.js +2649 -0
  104. openspeech/server/webui/index.html +216 -0
  105. openspeech/server/webui/styles.css +617 -0
  106. openspeech/server/ws/__init__.py +1 -0
  107. openspeech/server/ws/stt_stream.py +263 -0
  108. openspeech/server/ws/tts_stream.py +207 -0
  109. openspeech/telemetry/__init__.py +21 -0
  110. openspeech/telemetry/perf.py +307 -0
  111. openspeech/utils/__init__.py +5 -0
  112. openspeech/utils/audio_converter.py +406 -0
  113. openspeech/utils/audio_playback.py +156 -0
  114. openspeech/vendor_registry.yaml +74 -0
  115. openspeechapi-0.1.0.dist-info/METADATA +101 -0
  116. openspeechapi-0.1.0.dist-info/RECORD +118 -0
  117. openspeechapi-0.1.0.dist-info/WHEEL +4 -0
  118. openspeechapi-0.1.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,90 @@
1
+ """CosyVoice TTS provider adapter (voice clone + multilingual, subprocess mode)."""
2
+ from __future__ import annotations
3
+
4
+ from openspeech.logging_config import logger
5
+ import time
6
+ from collections.abc import AsyncIterator
7
+ from dataclasses import dataclass, field
8
+ from typing import Any
9
+
10
+ from openspeech.core.base import TTSProvider
11
+
12
+ from openspeech.core.enums import AudioFormat, Capability, ExecMode, ProviderType
13
+ from openspeech.core.models import AudioChunk, AudioData, TTSOptions
14
+ from openspeech.core.settings import BaseSettings
15
+
16
@dataclass
class CosyVoiceTTSSettings(BaseSettings):
    """Settings for the CosyVoice local TTS provider."""

    # Filesystem path of the downloaded CosyVoice model directory; passed
    # directly to the CosyVoice constructor in start().
    model_dir: str = ""
    # Inference device hint ("auto" | "cpu" | "cuda" | "mps" per field_options).
    # NOTE(review): not referenced in this adapter's visible code — presumably
    # consumed by the subprocess backend; verify.
    device: str = "auto"
    # Half-precision toggle. NOTE(review): also not referenced in the visible
    # code of this adapter — confirm it is consumed downstream.
    fp16: bool = False
    # Built-in speaker id for SFT inference; synthesize() falls back to the
    # model's default speaker ("中文女") when this is None or empty.
    spk_id: str | None = None
22
+
23
class CosyVoiceTTS(TTSProvider):
    """CosyVoice text-to-speech adapter (voice clone + multilingual).

    Heavy imports (``cosyvoice``, ``torchaudio``) are deferred so this module
    stays importable when the optional dependency set is not installed. The
    provider is declared with ``ExecMode.SUBPROCESS``.
    """

    name = "cosyvoice"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.SUBPROCESS
    settings_cls = CosyVoiceTTSSettings
    capabilities = {Capability.VOICE_CLONE, Capability.MULTILINGUAL}
    field_options = {
        "device": ["auto", "cpu", "cuda", "mps"],
    }

    def __init__(self, settings: CosyVoiceTTSSettings | None = None) -> None:
        self.settings = settings or CosyVoiceTTSSettings()
        self._client: Any = None
        self._model: Any = None

    async def start(self) -> None:
        """Load the CosyVoice model from ``settings.model_dir``.

        Raises:
            ImportError: if the optional ``cosyvoice`` package is missing.
        """
        try:
            from cosyvoice.cli.cosyvoice import CosyVoice as CosyVoiceModel
        except ImportError as exc:
            # Chain the original error so the underlying import failure
            # (which module, why) stays visible in the traceback.
            raise ImportError(
                "Install cosyvoice: pip install openspeech[cosyvoice]"
            ) from exc
        self._model = CosyVoiceModel(self.settings.model_dir)
        self._client = self._model
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Drop the model references; no explicit teardown API is called."""
        self._client = None
        self._model = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Healthy once start() has loaded the model."""
        return self._client is not None

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Synthesize *text* into a single mono WAV ``AudioData``.

        Args:
            text: Input text to speak.
            opts: Accepted for interface compatibility; not consumed here.

        Raises:
            RuntimeError: if the provider was not started, or the model
                yields no output.

        NOTE(review): model inference runs synchronously inside this
        coroutine and will block the event loop — presumably acceptable
        because the provider runs under ExecMode.SUBPROCESS; confirm the
        dispatcher isolates it.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()
        import io

        import torchaudio

        # Fall back to CosyVoice's built-in default speaker when unset.
        spk_id = self.settings.spk_id or "中文女"
        output = list(self._model.inference_sft(text, spk_id))
        if not output:
            raise RuntimeError("CosyVoice produced no output")
        speech = output[0]["tts_speech"]
        buf = io.BytesIO()
        # 22050 Hz is assumed to be the CosyVoice output rate — TODO confirm
        # against the loaded model's own sample-rate attribute.
        torchaudio.save(buf, speech, 22050, format="wav")
        result = AudioData(
            data=buf.getvalue(),
            sample_rate=22050,
            channels=1,
            format=AudioFormat.WAV,
        )
        logger.info(
            "{}: completed in {:.0f}ms, output={} bytes",
            self.name,
            (time.perf_counter() - _t0) * 1000,
            len(result.data),
        )
        return result

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        """Streaming synthesis is not implemented for CosyVoice yet.

        Raises:
            NotImplementedError: always, on first iteration.
        """
        raise NotImplementedError(
            "CosyVoiceTTS.synthesize_stream() is not yet implemented"
        )
        # Unreachable yield keeps this function an async generator so callers
        # can uniformly use `async for` across providers.
        yield  # pragma: no cover
@@ -0,0 +1,174 @@
1
+ """Deepgram TTS provider adapter (Aura API, httpx-based, no SDK needed)."""
2
+ from __future__ import annotations
3
+
4
+ import time
5
+ from collections.abc import AsyncIterator
6
+ from dataclasses import dataclass
7
+ from typing import Any
8
+
9
+ import httpx
10
+ from openspeech.logging_config import logger
11
+
12
+ from openspeech.core.base import TTSProvider
13
+ from openspeech.core.enums import AudioFormat, Capability, ExecMode, ProviderType
14
+ from openspeech.core.models import AudioChunk, AudioData, TTSOptions
15
+ from openspeech.core.settings import BaseSettings
16
+
17
+
18
@dataclass
class DeepgramTTSSettings(BaseSettings):
    """Settings for the Deepgram Aura TTS provider."""

    # Deepgram API key; health_check() treats an empty key as unhealthy.
    api_key: str = ""
    # Aura voice model used for every request; see DeepgramTTS.field_options
    # for the accepted values.
    model: str = "aura-asteria-en"
22
+
23
+
24
class DeepgramTTS(TTSProvider):
    """Deepgram Aura TTS adapter using the plain REST API via httpx.

    No Deepgram SDK is required: both batch and chunked-streaming synthesis
    POST to ``https://api.deepgram.com/v1/speak`` requesting linear16 PCM
    at 24 kHz.
    """

    name = "deepgram-tts"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = DeepgramTTSSettings
    capabilities = {Capability.STREAMING, Capability.MULTILINGUAL}
    field_options = {
        "model": [
            # Aura-2 English
            "aura-2-asteria-en", "aura-2-athena-en", "aura-2-luna-en",
            "aura-2-hera-en", "aura-2-orion-en", "aura-2-orpheus-en",
            "aura-2-arcas-en", "aura-2-zeus-en", "aura-2-apollo-en",
            "aura-2-helena-en", "aura-2-andromeda-en", "aura-2-thalia-en",
            "aura-2-aurora-en", "aura-2-iris-en", "aura-2-electra-en",
            # Aura-2 Japanese
            "aura-2-uzume-ja", "aura-2-ebisu-ja", "aura-2-fujin-ja",
            # Aura-2 European
            "aura-2-agathe-fr", "aura-2-hector-fr",
            "aura-2-elara-de", "aura-2-aurelia-de", "aura-2-julius-de",
            "aura-2-sirio-es", "aura-2-carina-es", "aura-2-diana-es",
            "aura-2-melia-it", "aura-2-elio-it",
            "aura-2-beatrix-nl", "aura-2-daphne-nl",
            # Aura-1 English (legacy)
            "aura-asteria-en", "aura-luna-en", "aura-stella-en",
            "aura-athena-en", "aura-hera-en", "aura-orion-en",
            "aura-arcas-en", "aura-perseus-en", "aura-angus-en",
            "aura-orpheus-en", "aura-helios-en", "aura-zeus-en",
        ],
    }

    def __init__(self, settings: DeepgramTTSSettings | None = None) -> None:
        self.settings = settings or DeepgramTTSSettings()
        self._client: httpx.AsyncClient | None = None
        # True when this instance created the client and must close it in stop().
        self._owns_client: bool = True

    def set_http_client(self, client) -> None:
        """Inject a shared httpx client; this instance will not close it."""
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        """Create an httpx client unless one was injected via set_http_client()."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=60.0)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the httpx client when owned, then drop the reference."""
        if self._client and self._owns_client:
            await self._client.aclose()
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Healthy iff an API key is configured (no network round-trip)."""
        return bool(self.settings.api_key)

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Synthesize *text* in one request and return the full audio blob.

        Raises:
            RuntimeError: if the provider is not started or the API returns
                a non-200 status.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()
        opts = opts or TTSOptions()

        model = self.settings.model
        headers = {
            "Authorization": f"Token {self.settings.api_key}",
            "Content-Type": "application/json",
        }
        payload = {"text": text}

        resp = await self._client.post(
            "https://api.deepgram.com/v1/speak",
            params={"model": model, "encoding": "linear16", "sample_rate": "24000"},
            headers=headers,
            json=payload,
        )
        if resp.status_code != 200:
            raise RuntimeError(f"Deepgram TTS API error ({resp.status_code}): {resp.text}")

        audio_bytes = resp.content
        # NOTE(review): the request asks for linear16 @ 24000 Hz but the
        # format is tagged AudioFormat.PCM_16K — verify the enum denotes
        # "raw 16-bit PCM" rather than "16 kHz", or this is a mismatch.
        result = AudioData(
            data=audio_bytes,
            sample_rate=24000,
            channels=1,
            format=AudioFormat.PCM_16K,
        )
        logger.info(
            "{}: completed in {:.0f}ms, output={} bytes",
            self.name,
            (time.perf_counter() - _t0) * 1000,
            len(result.data),
        )
        return result

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        """Stream synthesized audio as 4 KiB chunks, ending with a final marker.

        Yields:
            AudioChunk: sequential audio chunks; the last has ``is_final=True``
            and empty data.

        Raises:
            RuntimeError: if the provider is not started or the API returns
                a non-200 status.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: stream request, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()
        opts = opts or TTSOptions()

        model = self.settings.model
        headers = {
            "Authorization": f"Token {self.settings.api_key}",
            "Content-Type": "application/json",
        }
        payload = {"text": text}

        async with self._client.stream(
            "POST",
            "https://api.deepgram.com/v1/speak",
            params={"model": model, "encoding": "linear16", "sample_rate": "24000"},
            headers=headers,
            json=payload,
        ) as response:
            if response.status_code != 200:
                body = await response.aread()
                # errors="replace" guards against non-UTF-8 error bodies
                # raising UnicodeDecodeError and masking the real failure.
                raise RuntimeError(
                    f"Deepgram TTS API error ({response.status_code}): "
                    f"{body.decode(errors='replace')}"
                )
            sequence = 0
            chunk_count = 0
            async for chunk in response.aiter_bytes(chunk_size=4096):
                logger.debug("{}: chunk #{}, {} bytes", self.name, sequence, len(chunk))
                yield AudioChunk(data=chunk, sequence=sequence)
                sequence += 1
                chunk_count += 1
        yield AudioChunk(data=b"", sequence=sequence, is_final=True)
        logger.info(
            "{}: stream complete, {} chunks in {:.0f}ms",
            self.name,
            chunk_count,
            (time.perf_counter() - _t0) * 1000,
        )

    async def list_voices(self) -> list[dict]:
        """Return available Deepgram Aura voice models as {id, name} dicts."""
        voices = []
        for m in self.field_options["model"]:
            # Extract readable name: "aura-2-asteria-en" → "Asteria (en)"
            parts = m.split("-")
            if parts[0] == "aura" and parts[1] == "2":
                name = parts[2].title()
                lang = parts[3] if len(parts) > 3 else ""
                label = f"{name} ({lang})" if lang else name
            else:
                name = parts[1].title() if len(parts) > 1 else m
                lang = parts[2] if len(parts) > 2 else ""
                label = f"{name} ({lang})" if lang else name
            voices.append({"id": m, "name": label})
        return voices
@@ -0,0 +1,311 @@
1
+ """ElevenLabs TTS provider adapter (streaming + voice clone + emotion, in-process)."""
2
+ from __future__ import annotations
3
+
4
+ from openspeech.logging_config import logger
5
+ import base64
6
+ import json
7
+ import time
8
+ from collections.abc import AsyncIterator
9
+ from dataclasses import dataclass
10
+ from typing import Any
11
+ from urllib.parse import urlencode
12
+
13
+ import httpx
14
+
15
+ from openspeech.core.base import TTSProvider
16
+
17
+ from openspeech.core.enums import AudioFormat, Capability, ExecMode, ProviderType
18
+ from openspeech.core.models import AudioChunk, AudioData, TTSOptions
19
+ from openspeech.core.settings import BaseSettings
20
+
21
@dataclass
class ElevenLabsTTSSettings(BaseSettings):
    """Settings for the ElevenLabs TTS provider."""

    # ElevenLabs API key; health_check() treats an empty key as unhealthy.
    api_key: str = ""
    # Default voice; overridable per-request via TTSOptions.voice.
    voice_id: str = "21m00Tcm4TlvDq8ikWAM"  # Rachel
    # Default model; overridable per-request via TTSOptions.model.
    model_id: str = "eleven_multilingual_v2"
    output_format: str = "mp3_44100_128"
    # 0–4 per field_options; None means "use the API default".
    optimize_streaming_latency: int | None = None
    # Deterministic sampling seed when set; None leaves it unset.
    seed: int | None = None
    # ISO-639-1 code; empty string is sent as None (API auto-detect).
    language_code: str = ""
    apply_text_normalization: str = "auto"  # auto | on | off
    apply_language_text_normalization: bool = False
    # Voice tuning knobs, forwarded as voice_settings on every request.
    stability: float = 0.5
    similarity_boost: float = 0.75
    style: float = 0.0
    use_speaker_boost: bool = True
    # Transport for synthesize_stream(): HTTP chunking or WS input streaming.
    stream_transport: str = "http"  # http | ws
    # Optional chunk_length_schedule for the WS generation_config.
    ws_chunk_length_schedule: list[int] | None = None
38
+
39
class ElevenLabsTTS(TTSProvider):
    """ElevenLabs TTS adapter (streaming + voice clone + emotion, in-process).

    Batch and HTTP-streaming synthesis go through the official SDK's
    ``text_to_speech.convert``; optional low-latency streaming uses the
    ElevenLabs WebSocket input-streaming endpoint directly.
    """

    name = "elevenlabs"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = ElevenLabsTTSSettings
    capabilities = {
        Capability.STREAMING,
        Capability.VOICE_CLONE,
        Capability.EMOTION,
    }
    field_options = {
        # Synced with ElevenLabs models overview and TTS API docs (Apr 2026):
        # https://elevenlabs.io/docs/overview/models
        # https://elevenlabs.io/docs/api-reference/text-to-speech
        "model_id": [
            "eleven_v3",
            "eleven_flash_v2_5",
            "eleven_flash_v2",
            "eleven_multilingual_v2",
            # Deprecated but still accepted by API in compatibility mode.
            "eleven_multilingual_v1",
            "eleven_turbo_v2_5",
            "eleven_turbo_v2",
            "eleven_monolingual_v1",
        ],
        "output_format": [
            "mp3_22050_32", "mp3_44100_32", "mp3_44100_64", "mp3_44100_96",
            "mp3_44100_128", "mp3_44100_192",
            "opus_48000_32", "opus_48000_64", "opus_48000_96", "opus_48000_128", "opus_48000_192",
            "pcm_16000", "pcm_22050", "pcm_24000", "pcm_32000", "pcm_44100", "pcm_48000", "pcm_8000",
            "wav_16000", "wav_22050", "wav_24000", "wav_32000", "wav_44100", "wav_48000", "wav_8000",
            "ulaw_8000", "alaw_8000",
        ],
        "optimize_streaming_latency": [0, 1, 2, 3, 4],
        "stream_transport": ["http", "ws"],
        "apply_text_normalization": ["auto", "on", "off"],
        # Common language_code options (ISO-639-1) for TTS endpoint.
        "language_code": [
            "",
            "en", "zh", "ja", "ko", "es", "fr", "de", "pt", "it", "hi",
            "id", "nl", "tr", "pl", "sv", "ar", "ru", "uk", "vi", "hu",
            "no", "da", "fi", "cs", "el", "ro", "bg", "hr", "sk", "ms",
            "ta", "fil",
        ],
    }

    def __init__(self, settings: ElevenLabsTTSSettings | None = None) -> None:
        self.settings = settings or ElevenLabsTTSSettings()
        self._client: Any = None
        self._http_client: httpx.AsyncClient | None = None

    def _build_request_kwargs(self, text: str, opts: TTSOptions) -> dict[str, Any]:
        """Assemble the kwargs shared by batch and streaming convert calls.

        Per-request voice/model overrides from *opts* take precedence over
        the configured settings. (Previously duplicated verbatim in
        synthesize() and synthesize_stream().)
        """
        return {
            "voice_id": opts.voice or self.settings.voice_id,
            "text": text,
            "model_id": opts.model or self.settings.model_id,
            "output_format": self.settings.output_format,
            "optimize_streaming_latency": self.settings.optimize_streaming_latency,
            "seed": self.settings.seed,
            "language_code": (self.settings.language_code or None),
            "apply_text_normalization": self.settings.apply_text_normalization,
            "apply_language_text_normalization": self.settings.apply_language_text_normalization,
            "voice_settings": {
                "stability": self.settings.stability,
                "similarity_boost": self.settings.similarity_boost,
                "style": self.settings.style,
                "use_speaker_boost": self.settings.use_speaker_boost,
            },
        }

    async def start(self) -> None:
        """Create the SDK client on a dedicated httpx client.

        Raises:
            ImportError: if the optional ``elevenlabs`` package is missing.
        """
        try:
            from elevenlabs.client import AsyncElevenLabs
        except ImportError as exc:
            # Chain so the underlying import failure stays visible.
            raise ImportError(
                "Install elevenlabs: pip install openspeech[elevenlabs-tts]"
            ) from exc
        self._http_client = httpx.AsyncClient(timeout=240.0, trust_env=False)
        self._client = AsyncElevenLabs(
            api_key=self.settings.api_key,
            httpx_client=self._http_client,
        )
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Drop the SDK client and close the owned httpx client."""
        self._client = None
        if self._http_client is not None:
            await self._http_client.aclose()
            self._http_client = None
        logger.info("{} provider stopped", self.name)

    async def list_voices(self) -> list[dict]:
        """Fetch available voices from the API, sorted by group then name.

        Returns an empty list when the provider is not started or the API
        call fails (best-effort; failures are logged, not raised).
        """
        if self._client is None:
            return []
        try:
            resp = await self._client.voices.get_all(show_legacy=False)
        except Exception as exc:
            logger.warning("{}: failed to fetch voices: {}", self.name, exc)
            return []

        voices: list[dict] = []
        for v in getattr(resp, "voices", []) or []:
            voice_id = str(getattr(v, "voice_id", "") or "").strip()
            if not voice_id:
                continue
            display_name = str(getattr(v, "name", "") or voice_id).strip()
            category = str(getattr(v, "category", "") or "").strip()
            labels = getattr(v, "labels", None) or {}
            language = ""
            if isinstance(labels, dict):
                language = str(labels.get("language") or labels.get("accent") or "").strip()
            item: dict[str, str] = {"id": voice_id, "name": display_name}
            if category:
                item["group"] = category.title()
            if language:
                item["language"] = language
            voices.append(item)
        voices.sort(key=lambda x: (x.get("group", ""), x.get("name", "").lower()))
        return voices

    async def health_check(self) -> bool:
        """Healthy iff an API key is configured (no network round-trip)."""
        return bool(self.settings.api_key)

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Synthesize *text* and return the concatenated audio.

        Raises:
            RuntimeError: if the provider is not started.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()
        opts = opts or TTSOptions()
        request_kwargs = self._build_request_kwargs(text, opts)
        response = self._client.text_to_speech.convert(
            **request_kwargs,
        )
        # Use bytearray to avoid quadratic copy cost on long streamed outputs.
        audio_buf = bytearray()
        async for chunk in response:
            audio_buf.extend(chunk)
        # NOTE(review): sample_rate/format are hard-coded for the default
        # mp3_44100_128 output; they will be wrong if settings.output_format
        # selects pcm/opus/wav — confirm downstream consumers tolerate this.
        result = AudioData(
            data=bytes(audio_buf),
            sample_rate=44100,
            channels=1,
            format=AudioFormat.MP3,
        )
        logger.info(
            "{}: completed in {:.0f}ms, output={} bytes",
            self.name,
            (time.perf_counter() - _t0) * 1000,
            len(result.data),
        )
        return result

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        """Stream synthesized audio chunks over HTTP or WebSocket transport.

        The transport comes from opts/settings; WS falls back to HTTP for
        eleven_v3, which has no WS input-streaming support.

        Raises:
            RuntimeError: if the provider is not started.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: stream request, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()
        opts = opts or TTSOptions()
        stream_transport = (opts.stream_transport or self.settings.stream_transport or "http").strip().lower()
        request_kwargs = self._build_request_kwargs(text, opts)
        model_id = request_kwargs["model_id"]
        # ElevenLabs docs: text-to-speech WS isn't available for eleven_v3.
        if stream_transport == "ws" and str(model_id) == "eleven_v3":
            logger.warning("{}: model {} does not support WS input streaming, fallback to HTTP streaming", self.name, model_id)
            stream_transport = "http"

        if stream_transport == "ws":
            async for chunk in self._synthesize_stream_ws(text=text, request_kwargs=request_kwargs):
                yield chunk
            return

        response = self._client.text_to_speech.convert(**request_kwargs)
        sequence = 0
        chunk_count = 0
        async for chunk in response:
            logger.debug("{}: chunk #{}, {} bytes", self.name, sequence, len(chunk))
            yield AudioChunk(data=chunk, sequence=sequence)
            sequence += 1
            chunk_count += 1
        yield AudioChunk(data=b"", sequence=sequence, is_final=True)
        logger.info("{}: stream complete, {} chunks in {:.0f}ms", self.name, chunk_count, (time.perf_counter() - _t0) * 1000)

    async def _synthesize_stream_ws(
        self, *, text: str, request_kwargs: dict[str, Any]
    ) -> AsyncIterator[AudioChunk]:
        """Low-latency input-streaming over ElevenLabs WebSocket API."""
        import websockets

        voice_id = request_kwargs["voice_id"]
        params = {
            "model_id": request_kwargs.get("model_id"),
            "output_format": request_kwargs.get("output_format"),
        }
        if request_kwargs.get("optimize_streaming_latency") is not None:
            params["optimize_streaming_latency"] = request_kwargs["optimize_streaming_latency"]
        if request_kwargs.get("language_code"):
            params["language_code"] = request_kwargs.get("language_code")
        qs = urlencode({k: v for k, v in params.items() if v not in (None, "")})
        url = f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input?{qs}"

        generation_config: dict[str, Any] = {}
        if self.settings.ws_chunk_length_schedule:
            generation_config["chunk_length_schedule"] = self.settings.ws_chunk_length_schedule

        # Protocol requires the first message to carry a single space plus
        # auth/voice config before any real text is sent.
        init_msg: dict[str, Any] = {
            "text": " ",
            "xi_api_key": self.settings.api_key,
            "voice_settings": request_kwargs.get("voice_settings") or {},
        }
        if generation_config:
            init_msg["generation_config"] = generation_config

        sequence = 0
        had_audio = False
        async with websockets.connect(url) as ws:
            await ws.send(json.dumps(init_msg))
            await ws.send(json.dumps({"text": text}))
            # Force pending audio generation, then explicitly terminate input
            # with EOS. Official protocol treats {"text": ""} as EOS.
            await ws.send(json.dumps({"text": "", "flush": True}))
            await ws.send(json.dumps({"text": ""}))

            async for raw in ws:
                try:
                    data = json.loads(raw)
                except Exception:
                    # Skip non-JSON frames rather than aborting the stream.
                    continue

                if "error" in data:
                    raw_err = data.get("error")
                    if isinstance(raw_err, dict):
                        err_msg = str(raw_err.get("code") or raw_err.get("message") or raw_err)
                    else:
                        err_msg = str(raw_err or "")
                    # Some sessions return input timeout after all usable audio
                    # has already been emitted; treat it as graceful close.
                    if "input_timeout_exceeded" in err_msg and had_audio:
                        logger.warning(
                            "{}: ws ended with input timeout after audio output; treating as complete",
                            self.name,
                        )
                        break
                    raise RuntimeError(f"ElevenLabs TTS WS error: {raw_err}")

                audio_b64 = data.get("audio") or data.get("audio_base64") or ""
                if audio_b64:
                    try:
                        chunk = base64.b64decode(audio_b64)
                    except Exception:
                        # Malformed base64: drop the frame, keep streaming.
                        chunk = b""
                    if chunk:
                        had_audio = True
                        yield AudioChunk(data=chunk, sequence=sequence)
                        sequence += 1

                if data.get("isFinal") is True:
                    break

        yield AudioChunk(data=b"", sequence=sequence, is_final=True)