openspeechapi-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. openspeech/__init__.py +75 -0
  2. openspeech/__main__.py +5 -0
  3. openspeech/cli.py +413 -0
  4. openspeech/client/__init__.py +4 -0
  5. openspeech/client/client.py +145 -0
  6. openspeech/config.py +212 -0
  7. openspeech/core/__init__.py +0 -0
  8. openspeech/core/base.py +75 -0
  9. openspeech/core/enums.py +39 -0
  10. openspeech/core/models.py +61 -0
  11. openspeech/core/registry.py +37 -0
  12. openspeech/core/settings.py +8 -0
  13. openspeech/demo.py +675 -0
  14. openspeech/dispatch/__init__.py +0 -0
  15. openspeech/dispatch/context.py +34 -0
  16. openspeech/dispatch/dispatcher.py +661 -0
  17. openspeech/dispatch/executors/__init__.py +0 -0
  18. openspeech/dispatch/executors/base.py +34 -0
  19. openspeech/dispatch/executors/in_process.py +66 -0
  20. openspeech/dispatch/executors/remote.py +64 -0
  21. openspeech/dispatch/executors/subprocess_exec.py +446 -0
  22. openspeech/dispatch/fanout.py +95 -0
  23. openspeech/dispatch/filters.py +73 -0
  24. openspeech/dispatch/lifecycle.py +178 -0
  25. openspeech/dispatch/watcher.py +82 -0
  26. openspeech/engine_catalog.py +236 -0
  27. openspeech/engine_registry.yaml +347 -0
  28. openspeech/exceptions.py +51 -0
  29. openspeech/factory.py +325 -0
  30. openspeech/local_engines/__init__.py +12 -0
  31. openspeech/local_engines/aim_resolver.py +91 -0
  32. openspeech/local_engines/backends/__init__.py +1 -0
  33. openspeech/local_engines/backends/docker_backend.py +490 -0
  34. openspeech/local_engines/backends/native_backend.py +902 -0
  35. openspeech/local_engines/base.py +30 -0
  36. openspeech/local_engines/engines/__init__.py +1 -0
  37. openspeech/local_engines/engines/faster_whisper.py +36 -0
  38. openspeech/local_engines/engines/fish_speech.py +33 -0
  39. openspeech/local_engines/engines/sherpa_onnx.py +56 -0
  40. openspeech/local_engines/engines/whisper.py +41 -0
  41. openspeech/local_engines/engines/whisperlivekit.py +60 -0
  42. openspeech/local_engines/manager.py +208 -0
  43. openspeech/local_engines/models.py +50 -0
  44. openspeech/local_engines/progress.py +69 -0
  45. openspeech/local_engines/registry.py +19 -0
  46. openspeech/local_engines/task_store.py +52 -0
  47. openspeech/local_engines/tasks.py +71 -0
  48. openspeech/logging_config.py +607 -0
  49. openspeech/observe/__init__.py +0 -0
  50. openspeech/observe/base.py +79 -0
  51. openspeech/observe/debug.py +44 -0
  52. openspeech/observe/latency.py +19 -0
  53. openspeech/observe/metrics.py +47 -0
  54. openspeech/observe/tracing.py +44 -0
  55. openspeech/observe/usage.py +27 -0
  56. openspeech/providers/__init__.py +0 -0
  57. openspeech/providers/_template.py +101 -0
  58. openspeech/providers/stt/__init__.py +0 -0
  59. openspeech/providers/stt/alibaba.py +86 -0
  60. openspeech/providers/stt/assemblyai.py +135 -0
  61. openspeech/providers/stt/azure_speech.py +99 -0
  62. openspeech/providers/stt/baidu.py +135 -0
  63. openspeech/providers/stt/deepgram.py +311 -0
  64. openspeech/providers/stt/elevenlabs.py +385 -0
  65. openspeech/providers/stt/faster_whisper.py +211 -0
  66. openspeech/providers/stt/google_cloud.py +106 -0
  67. openspeech/providers/stt/iflytek.py +427 -0
  68. openspeech/providers/stt/macos_speech.py +226 -0
  69. openspeech/providers/stt/openai.py +84 -0
  70. openspeech/providers/stt/sherpa_onnx.py +353 -0
  71. openspeech/providers/stt/tencent.py +212 -0
  72. openspeech/providers/stt/volcengine.py +107 -0
  73. openspeech/providers/stt/whisper.py +153 -0
  74. openspeech/providers/stt/whisperlivekit.py +530 -0
  75. openspeech/providers/stt/windows_speech.py +249 -0
  76. openspeech/providers/tts/__init__.py +0 -0
  77. openspeech/providers/tts/alibaba.py +95 -0
  78. openspeech/providers/tts/azure_speech.py +123 -0
  79. openspeech/providers/tts/baidu.py +143 -0
  80. openspeech/providers/tts/coqui.py +64 -0
  81. openspeech/providers/tts/cosyvoice.py +90 -0
  82. openspeech/providers/tts/deepgram.py +174 -0
  83. openspeech/providers/tts/elevenlabs.py +311 -0
  84. openspeech/providers/tts/fish_speech.py +158 -0
  85. openspeech/providers/tts/google_cloud.py +107 -0
  86. openspeech/providers/tts/iflytek.py +209 -0
  87. openspeech/providers/tts/macos_say.py +251 -0
  88. openspeech/providers/tts/minimax.py +122 -0
  89. openspeech/providers/tts/openai.py +104 -0
  90. openspeech/providers/tts/piper.py +104 -0
  91. openspeech/providers/tts/tencent.py +189 -0
  92. openspeech/providers/tts/volcengine.py +117 -0
  93. openspeech/providers/tts/windows_sapi.py +234 -0
  94. openspeech/server/__init__.py +1 -0
  95. openspeech/server/app.py +72 -0
  96. openspeech/server/auth.py +42 -0
  97. openspeech/server/middleware.py +75 -0
  98. openspeech/server/routes/__init__.py +1 -0
  99. openspeech/server/routes/management.py +848 -0
  100. openspeech/server/routes/stt.py +121 -0
  101. openspeech/server/routes/tts.py +159 -0
  102. openspeech/server/routes/webui.py +29 -0
  103. openspeech/server/webui/app.js +2649 -0
  104. openspeech/server/webui/index.html +216 -0
  105. openspeech/server/webui/styles.css +617 -0
  106. openspeech/server/ws/__init__.py +1 -0
  107. openspeech/server/ws/stt_stream.py +263 -0
  108. openspeech/server/ws/tts_stream.py +207 -0
  109. openspeech/telemetry/__init__.py +21 -0
  110. openspeech/telemetry/perf.py +307 -0
  111. openspeech/utils/__init__.py +5 -0
  112. openspeech/utils/audio_converter.py +406 -0
  113. openspeech/utils/audio_playback.py +156 -0
  114. openspeech/vendor_registry.yaml +74 -0
  115. openspeechapi-0.1.0.dist-info/METADATA +101 -0
  116. openspeechapi-0.1.0.dist-info/RECORD +118 -0
  117. openspeechapi-0.1.0.dist-info/WHEEL +4 -0
  118. openspeechapi-0.1.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,249 @@
1
+ """Windows native STT provider using System.Speech.Recognition via PowerShell.
2
+
3
+ Uses PowerShell to invoke the .NET System.Speech.Recognition.SpeechRecognitionEngine,
4
+ which is available on all Windows 10+ systems without extra dependencies.
5
+ Results are exchanged through a temporary JSON file.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import asyncio
10
+ import json
11
+ from openspeech.logging_config import logger
12
+ import shutil
13
+ import sys
14
+ import tempfile
15
+ import time
16
+ from collections.abc import AsyncIterator
17
+ from dataclasses import dataclass
18
+ from pathlib import Path
19
+ from typing import Any
20
+
21
+ from openspeech.core.base import STTProvider
22
+ from openspeech.core.enums import Capability, ExecMode, ProviderType
23
+ from openspeech.core.models import AudioData, STTOptions, Transcription
24
+ from openspeech.core.settings import BaseSettings
25
+ from openspeech.utils.audio_converter import AudioConverter
26
+
27
# PowerShell script that performs speech recognition and outputs JSON.
# Invoked as:
#   powershell -File <script.ps1> -AudioPath <wav> -OutputPath <json> [-Language <culture>]
# It drives the .NET System.Speech SpeechRecognitionEngine with a dictation
# grammar over the given WAV file and writes one compact JSON object:
#   success: {"status": "ok", "text": "...", "confidence": <0..1>, "language": "<culture>"}
#            (text is "" and confidence 0 when nothing was recognized)
#   failure: {"status": "error", "error": "<exception message>"}
# NOTE: Out-File -Encoding UTF8 emits a BOM under Windows PowerShell, which is
# why the Python caller reads the result back with the "utf-8-sig" codec.
_PS_RECOGNIZE_SCRIPT = r"""
param(
    [Parameter(Mandatory=$true)][string]$AudioPath,
    [Parameter(Mandatory=$true)][string]$OutputPath,
    [string]$Language = "zh-CN"
)

try {
    Add-Type -AssemblyName System.Speech

    $culture = New-Object System.Globalization.CultureInfo($Language)
    $engine = New-Object System.Speech.Recognition.SpeechRecognitionEngine($culture)
    $grammar = New-Object System.Speech.Recognition.DictationGrammar
    $engine.LoadGrammar($grammar)
    $engine.SetInputToWaveFile($AudioPath)

    $result = $engine.Recognize()
    $engine.Dispose()

    if ($result -ne $null) {
        $output = @{
            status = "ok"
            text = $result.Text
            confidence = [math]::Round($result.Confidence, 4)
            language = $Language
        }
    } else {
        $output = @{
            status = "ok"
            text = ""
            confidence = 0
            language = $Language
        }
    }
} catch {
    $output = @{
        status = "error"
        error = $_.Exception.Message
    }
}

$output | ConvertTo-Json -Compress | Out-File -FilePath $OutputPath -Encoding UTF8
"""
71
+
72
@dataclass
class WindowsSpeechSettings(BaseSettings):
    """Settings for :class:`WindowsSpeechSTT`."""

    # Culture name handed to System.Speech (e.g. "zh-CN", "en-US"); used as
    # the default recognition language when STTOptions.language is not set.
    language: str = "zh-CN"
75
+
76
class WindowsSpeechSTT(STTProvider):
    """Windows native STT via System.Speech.Recognition (PowerShell).

    Each :meth:`transcribe` call converts the audio to WAV, writes it plus the
    recognition script (``_PS_RECOGNIZE_SCRIPT``) to temp files, runs
    PowerShell, and reads the recognition result back from a temp JSON file.
    No persistent child process is kept between calls.
    """

    name = "windows-stt"
    provider_type = ProviderType.STT
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = WindowsSpeechSettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}
    # Cultures offered to the UI; presumably each requires the matching
    # Windows speech pack to be installed — TODO confirm.
    field_options = {"language": ["zh-CN", "en-US", "ja-JP", "ko-KR", "fr-FR", "de-DE", "es-ES"]}

    def __init__(self, settings: WindowsSpeechSettings | None = None) -> None:
        self.settings = settings or WindowsSpeechSettings()
        # Both are populated by start(); transcribe() refuses to run before that.
        self._available: bool = False
        self._powershell: str | None = None

    # -- lifecycle ------------------------------------------------------------

    @staticmethod
    def _find_powershell() -> str | None:
        """Locate PowerShell executable — try PATH first, then well-known paths.

        Returns the executable path (Windows PowerShell or pwsh), or None when
        neither can be found.
        """
        ps = shutil.which("powershell") or shutil.which("pwsh")
        if ps:
            return ps
        # Fallback: well-known Windows paths
        for candidate in (
            Path(r"C:\Windows\System32\WindowsPowerShell\v1.0\powershell.exe"),
            Path(r"C:\Windows\SysWOW64\WindowsPowerShell\v1.0\powershell.exe"),
        ):
            if candidate.exists():
                return str(candidate)
        return None

    async def start(self) -> None:
        """Validate the platform, locate PowerShell, and probe System.Speech.

        Raises:
            RuntimeError: when not running on Windows, PowerShell cannot be
                found, or the System.Speech assembly fails to load.
        """
        if sys.platform != "win32":
            raise RuntimeError(
                "Windows STT (System.Speech) is only available on Windows"
            )
        # Locate PowerShell
        ps = self._find_powershell()
        if ps is None:
            raise RuntimeError("PowerShell not found on this system")
        self._powershell = ps
        logger.debug("Using PowerShell: {}", ps)

        # Verify System.Speech is available
        ok = await self._check_system_speech()
        if not ok:
            raise RuntimeError(
                "System.Speech assembly not available. "
                "Ensure Windows Speech Recognition is installed."
            )
        self._available = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Mark the provider unavailable; there is no child process to tear down."""
        self._available = False
        self._powershell = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Cheap readiness probe: already started, or at least startable here."""
        if self._available:
            return True
        if sys.platform != "win32":
            return False
        return self._find_powershell() is not None

    # -- transcription --------------------------------------------------------

    async def transcribe(
        self, audio: AudioData, opts: STTOptions | None = None
    ) -> Transcription:
        """Transcribe a complete audio clip via a one-shot PowerShell run.

        Args:
            audio: Raw audio; converted to 16-bit PCM WAV before recognition.
            opts: Optional per-request options; only ``language`` is used,
                falling back to the configured default.

        Returns:
            Transcription with ``text``, ``language`` and ``confidence`` taken
            from the script's JSON output.

        Raises:
            RuntimeError: provider not started, the script produced no
                output file, or it reported ``status == "error"``.
        """
        if not self._available or self._powershell is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
        _t0 = time.perf_counter()

        opts = opts or STTOptions()
        language = opts.language or self.settings.language

        # Convert to WAV (16-bit PCM) for System.Speech
        wav_audio = AudioConverter.to_wav(audio)

        # Temp-file paths, tracked so the finally block can always clean up.
        tmp_wav: str | None = None
        tmp_out: str | None = None
        tmp_ps: str | None = None
        try:
            # Write WAV to temp file
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                f.write(wav_audio.data)
                tmp_wav = f.name

            # Output file for JSON result
            with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as f:
                tmp_out = f.name

            # Write PowerShell script to temp file
            with tempfile.NamedTemporaryFile(
                suffix=".ps1", delete=False, mode="w", encoding="utf-8"
            ) as f:
                f.write(_PS_RECOGNIZE_SCRIPT)
                tmp_ps = f.name

            # Run PowerShell
            cmd = [
                self._powershell,
                "-ExecutionPolicy", "Bypass",
                "-NoProfile",
                "-File", tmp_ps,
                "-AudioPath", tmp_wav,
                "-OutputPath", tmp_out,
                "-Language", language,
            ]
            proc = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            _, stderr = await proc.communicate()

            # Read JSON result. An absent/empty output file means the script
            # itself failed before it could write anything; surface stderr.
            out_path = Path(tmp_out)
            if not out_path.exists() or out_path.stat().st_size == 0:
                err_msg = stderr.decode(errors="replace").strip() if stderr else "no output"
                raise RuntimeError(f"Windows STT failed: {err_msg}")

            # Out-File -Encoding UTF8 writes a BOM; utf-8-sig strips it.
            raw = out_path.read_text(encoding="utf-8-sig").strip()
            result = json.loads(raw)

            if result.get("status") == "error":
                raise RuntimeError(
                    f"Windows STT error: {result.get('error', 'unknown')}"
                )

            transcription = Transcription(
                text=result.get("text", ""),
                language=result.get("language"),
                confidence=result.get("confidence"),
            )
            logger.info("{}: completed in {:.0f}ms, result={} chars", self.name, (time.perf_counter() - _t0) * 1000, len(transcription.text))
            return transcription
        finally:
            # Best-effort cleanup of all temp files, whether or not we succeeded.
            for p in (tmp_wav, tmp_out, tmp_ps):
                if p is not None:
                    try:
                        Path(p).unlink(missing_ok=True)
                    except OSError:
                        pass

    def transcribe_stream(
        self, stream: AsyncIterator[bytes]
    ) -> AsyncIterator[Any]:
        """Streaming recognition is not supported; raises immediately on call."""
        raise NotImplementedError(
            "Windows System.Speech does not support streaming via this provider"
        )

    # -- internal helpers -----------------------------------------------------

    async def _check_system_speech(self) -> bool:
        """Verify that System.Speech assembly is loadable.

        Runs a tiny inline PowerShell command; only called from start() after
        ``self._powershell`` has been set.
        """
        check_script = (
            'try { Add-Type -AssemblyName System.Speech; '
            'Write-Output "ok" } '
            'catch { Write-Output "fail" }'
        )
        proc = await asyncio.create_subprocess_exec(
            self._powershell,
            "-ExecutionPolicy", "Bypass",
            "-NoProfile",
            "-Command", check_script,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, _ = await proc.communicate()
        return stdout.decode(errors="replace").strip() == "ok"
File without changes
@@ -0,0 +1,95 @@
1
+ """Alibaba Cloud (Bailian/DashScope) TTS provider adapter — OpenAI-compatible."""
2
+ from __future__ import annotations
3
+
4
+ from openspeech.logging_config import logger
5
+ import time
6
+ from collections.abc import AsyncIterator
7
+ from dataclasses import dataclass
8
+ from typing import Any
9
+
10
+ import httpx
11
+
12
+ from openspeech.core.base import TTSProvider
13
+
14
+ from openspeech.core.enums import Capability, ExecMode, ProviderType
15
+ from openspeech.core.models import AudioChunk, AudioData, TTSOptions
16
+ from openspeech.core.settings import BaseSettings
17
+
18
@dataclass
class AlibabaTTSSettings(BaseSettings):
    """Settings for :class:`AlibabaTTS` (DashScope OpenAI-compatible endpoint)."""

    # DashScope API key; health_check() reports unhealthy while empty.
    api_key: str = ""
    # Model identifier sent in the request payload.
    model: str = "cosyvoice-v1"
    # Default voice, used when TTSOptions.voice is not set.
    voice: str = "longxiaochun"
    # OpenAI-compatible API root; "/audio/speech" is appended per request.
    base_url: str = "https://dashscope.aliyuncs.com/compatible-mode/v1"
24
+
25
class AlibabaTTS(TTSProvider):
    """Alibaba Cloud (Bailian/DashScope) batch TTS via the OpenAI-compatible REST API."""

    name = "alibaba-tts"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = AlibabaTTSSettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}
    field_options = {"model": ["cosyvoice-v1", "sambert-v1"], "voice": ["longxiaochun", "longhua", "longxiaoxia", "longlaotie", "longshu"]}

    def __init__(self, settings: AlibabaTTSSettings | None = None) -> None:
        self.settings = settings or AlibabaTTSSettings()
        self._client: httpx.AsyncClient | None = None
        self._owns_client: bool = True

    def set_http_client(self, client) -> None:
        """Adopt an externally managed httpx client; stop() will not close it."""
        self._owns_client = False
        self._client = client

    async def start(self) -> None:
        """Create the HTTP client unless one was injected via set_http_client()."""
        if self._client is None:
            self._owns_client = True
            self._client = httpx.AsyncClient(timeout=60.0)
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the HTTP client if this provider owns it, then drop the reference."""
        client = self._client
        if client is not None and self._owns_client:
            await client.aclose()
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Considered healthy as soon as an API key is configured."""
        return bool(self.settings.api_key)

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Synthesize *text* in a single request and return the raw audio bytes.

        Raises:
            RuntimeError: when the provider has not been started.
            httpx.HTTPStatusError: when the API responds with an error status.
        """
        client = self._client
        if client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, text={} chars", self.name, len(text))
        started = time.perf_counter()
        opts = opts or TTSOptions()

        settings = self.settings
        endpoint = f"{settings.base_url}/audio/speech"
        request_headers = {
            "Authorization": f"Bearer {settings.api_key}",
            "Content-Type": "application/json",
        }
        body = {
            "model": settings.model,
            "input": text,
            "voice": opts.voice or settings.voice,
        }

        response = await client.post(endpoint, json=body, headers=request_headers)
        response.raise_for_status()

        audio = AudioData(
            data=response.content,
            sample_rate=24000,  # fixed rate assumed by this adapter — TODO confirm vs API docs
            channels=1,
            format=opts.output_format,
        )
        elapsed_ms = (time.perf_counter() - started) * 1000
        logger.info("{}: completed in {:.0f}ms, output={} bytes", self.name, elapsed_ms, len(audio.data))
        return audio

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        """Streaming synthesis is not implemented for this adapter."""
        raise NotImplementedError("Alibaba TTS streaming not implemented")
        yield  # noqa: unreachable — makes this an async generator
@@ -0,0 +1,123 @@
1
+ """Azure Speech TTS provider adapter (batch, httpx)."""
2
+ from __future__ import annotations
3
+
4
+ from openspeech.logging_config import logger
5
+ import time
6
+ from collections.abc import AsyncIterator
7
+ from dataclasses import dataclass
8
+ from typing import Any
9
+
10
+ import httpx
11
+
12
+ from openspeech.core.base import TTSProvider
13
+
14
+ from openspeech.core.enums import AudioFormat, Capability, ExecMode, ProviderType
15
+ from openspeech.core.models import AudioChunk, AudioData, TTSOptions
16
+ from openspeech.core.settings import BaseSettings
17
+
18
@dataclass
class AzureSpeechTTSSettings(BaseSettings):
    """Settings for :class:`AzureSpeechTTS`."""

    # Azure Speech resource key, exchanged for a bearer token per request.
    subscription_key: str = ""
    # Azure region; determines both the token and synthesis endpoint hosts.
    region: str = "eastus"
    # Default neural voice, used when TTSOptions.voice is not set.
    voice: str = "en-US-JennyNeural"
    # Value of the SSML xml:lang attribute.
    language: str = "en-US"
24
+
25
class AzureSpeechTTS(TTSProvider):
    """Azure Speech TTS provider adapter (batch, httpx).

    Per request: exchange the subscription key for a short-lived bearer token
    at the regional STS endpoint, then POST SSML to the regional
    ``/cognitiveservices/v1`` endpoint and return the RIFF/WAV response body.
    """

    name = "azure-tts"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.REMOTE
    settings_cls = AzureSpeechTTSSettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}
    field_options = {"language": ["en-US", "zh-CN", "ja-JP", "ko-KR", "es-ES", "fr-FR", "de-DE"], "voice": ["en-US-JennyNeural", "en-US-GuyNeural", "en-US-AriaNeural", "zh-CN-XiaoxiaoNeural", "zh-CN-YunxiNeural", "zh-CN-XiaoyiNeural", "ja-JP-NanamiNeural", "ko-KR-SunHiNeural"], "region": ["eastus", "westus2", "westeurope", "eastasia", "southeastasia"]}

    def __init__(self, settings: AzureSpeechTTSSettings | None = None) -> None:
        self.settings = settings or AzureSpeechTTSSettings()
        self._client: httpx.AsyncClient | None = None
        self._owns_client: bool = True

    def set_http_client(self, client) -> None:
        """Adopt an externally managed httpx client; stop() will not close it."""
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        """Create the HTTP client unless one was injected via set_http_client()."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=60.0)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the HTTP client if this provider owns it, then drop the reference."""
        if self._client is not None and self._owns_client:
            await self._client.aclose()
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Considered healthy once a subscription key is configured (no network probe)."""
        return bool(self.settings.subscription_key)

    async def _get_access_token(self) -> str:
        """Fetch a short-lived access token from the Azure token endpoint.

        Raises:
            RuntimeError: provider not started, or the STS endpoint returned
                a non-200 status.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        url = (
            f"https://{self.settings.region}.api.cognitive.microsoft.com"
            f"/sts/v1.0/issueToken"
        )
        headers = {
            "Ocp-Apim-Subscription-Key": self.settings.subscription_key,
            "Content-Length": "0",
        }
        response = await self._client.post(url, headers=headers, content=b"")
        if response.status_code != 200:
            raise RuntimeError(
                f"Azure token endpoint error {response.status_code}: {response.text}"
            )
        return response.text

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Synthesize *text* and return 24 kHz mono WAV audio.

        Args:
            text: Plain text to speak; XML-escaped before being embedded in SSML.
            opts: Optional per-request options; only ``voice`` is honored here.

        Raises:
            RuntimeError: provider not started, or the API returned non-200.
        """
        # Stdlib escaping helper; imported locally so this fix is self-contained.
        from xml.sax.saxutils import escape

        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()
        opts = opts or TTSOptions()
        voice = opts.voice or self.settings.voice
        language = self.settings.language

        token = await self._get_access_token()

        url = (
            f"https://{self.settings.region}.tts.speech.microsoft.com"
            f"/cognitiveservices/v1"
        )
        headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/ssml+xml",
            "X-Microsoft-OutputFormat": "riff-24khz-16bit-mono-pcm",
        }
        # BUG FIX: text (and the voice attribute) used to be interpolated into
        # the SSML verbatim, so any '&', '<' or quote in the input produced
        # invalid XML (request failure) and allowed SSML injection. Escape both;
        # benign input is unchanged.
        safe_text = escape(text)
        safe_voice = escape(voice, {"'": "&apos;", '"': "&quot;"})
        ssml = (
            f"<speak version='1.0' xml:lang='{language}'>"
            f"<voice name='{safe_voice}'>{safe_text}</voice>"
            f"</speak>"
        )
        response = await self._client.post(url, headers=headers, content=ssml)
        if response.status_code != 200:
            raise RuntimeError(
                f"Azure Speech TTS API error {response.status_code}: {response.text}"
            )
        result = AudioData(
            data=response.content,
            sample_rate=24000,  # matches the riff-24khz-16bit-mono-pcm output format
            channels=1,
            format=AudioFormat.WAV,
        )
        logger.info("{}: completed in {:.0f}ms, output={} bytes", self.name, (time.perf_counter() - _t0) * 1000, len(result.data))
        return result

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        """Streaming output is not supported by this batch adapter."""
        raise NotImplementedError(
            "Azure Speech TTS batch provider does not support streaming output"
        )
        yield  # pragma: no cover
@@ -0,0 +1,143 @@
1
+ """Baidu Cloud TTS provider adapter."""
2
+ from __future__ import annotations
3
+
4
+ from openspeech.logging_config import logger
5
+ import time
6
+ from collections.abc import AsyncIterator
7
+ from dataclasses import dataclass
8
+ from typing import Any
9
+
10
+ import httpx
11
+
12
+ from openspeech.core.base import TTSProvider
13
+
14
+ from openspeech.core.enums import Capability, ExecMode, ProviderType
15
+ from openspeech.core.models import AudioChunk, AudioData, TTSOptions
16
+ from openspeech.core.settings import BaseSettings
17
+
18
@dataclass
class BaiduTTSSettings(BaseSettings):
    """Settings for :class:`BaiduTTS`."""

    # Baidu AI Cloud application credentials used for the OAuth token exchange.
    api_key: str = ""
    secret_key: str = ""
    per: int = 0  # voice persona: 0=female, 1=male, 3=Du Xiaoyao (male), 4=Du Yaya (child)
    spd: int = 5  # speed 0-15
    pit: int = 5  # pitch 0-15
    vol: int = 5  # volume 0-15
26
+
27
class BaiduTTS(TTSProvider):
    """Baidu Cloud batch TTS adapter (tsn.baidu.com/text2audio) with OAuth token caching."""

    name = "baidu-tts"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = BaiduTTSSettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}
    field_options = {"per": [0, 1, 3, 4, 5, 103, 106, 110, 111]}

    def __init__(self, settings: BaiduTTSSettings | None = None) -> None:
        self.settings = settings or BaiduTTSSettings()
        self._client: httpx.AsyncClient | None = None
        self._owns_client: bool = True
        # Cached OAuth token and its expiry (epoch seconds).
        self._token: str | None = None
        self._token_expires_at: float = 0.0

    def set_http_client(self, client) -> None:
        """Adopt an externally managed httpx client; stop() will not close it."""
        self._owns_client = False
        self._client = client

    async def start(self) -> None:
        """Create the HTTP client unless one was injected via set_http_client()."""
        if self._client is None:
            self._owns_client = True
            self._client = httpx.AsyncClient(timeout=60.0)
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the owned HTTP client and invalidate the cached OAuth token."""
        client, self._client = self._client, None
        if client is not None and self._owns_client:
            await client.aclose()
        self._token = None
        self._token_expires_at = 0.0
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Considered healthy once both OAuth credentials are configured."""
        return bool(self.settings.api_key) and bool(self.settings.secret_key)

    async def _get_token(self) -> str:
        """Fetch or return cached OAuth access token.

        Raises:
            RuntimeError: provider not started (only when a fetch is needed),
                or the OAuth endpoint returned no access token.
        """
        cached = self._token
        if cached and time.time() < self._token_expires_at:
            return cached

        client = self._client
        if client is None:
            raise RuntimeError("Provider not started — call start() first")

        oauth_params = {
            "grant_type": "client_credentials",
            "client_id": self.settings.api_key,
            "client_secret": self.settings.secret_key,
        }
        response = await client.get(
            "https://aip.baidubce.com/oauth/2.0/token", params=oauth_params
        )
        response.raise_for_status()
        payload = response.json()

        if "access_token" not in payload:
            raise RuntimeError(
                f"Baidu OAuth error: {payload.get('error_description', 'unknown')}"
            )

        token = payload["access_token"]
        # Refresh one minute before the server-side expiry.
        self._token_expires_at = time.time() + payload.get("expires_in", 2592000) - 60
        self._token = token
        return token

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Synthesize *text* via the text2audio endpoint (WAV requested, aue=6).

        Raises:
            RuntimeError: provider not started, or the endpoint returned a
                JSON error body instead of audio.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, text={} chars", self.name, len(text))
        started = time.perf_counter()
        opts = opts or TTSOptions()

        settings = self.settings
        request_form = {
            "tex": text,
            "tok": await self._get_token(),
            "cuid": "openspeech",
            "ctp": "1",
            "lan": "zh",
            "spd": str(settings.spd),
            "pit": str(settings.pit),
            "vol": str(settings.vol),
            "per": str(settings.per),
            "aue": "6",  # 6 = WAV format
        }

        response = await self._client.post(
            "https://tsn.baidu.com/text2audio",
            data=request_form,
        )
        response.raise_for_status()

        # The endpoint returns audio bytes on success and a JSON error otherwise.
        if "audio" not in response.headers.get("content-type", ""):
            failure = response.json()
            raise RuntimeError(
                f"Baidu TTS error [{failure.get('err_no')}]: "
                f"{failure.get('err_msg', 'unknown')}"
            )

        audio = AudioData(
            data=response.content,
            sample_rate=16000,
            channels=1,
            format=opts.output_format,
        )
        elapsed_ms = (time.perf_counter() - started) * 1000
        logger.info("{}: completed in {:.0f}ms, output={} bytes", self.name, elapsed_ms, len(audio.data))
        return audio

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        """Streaming synthesis is not implemented for this adapter."""
        raise NotImplementedError("Baidu TTS streaming not implemented")
        yield  # noqa: unreachable — makes this an async generator
@@ -0,0 +1,64 @@
1
+ """Coqui TTS provider adapter (voice clone + multilingual, in-process)."""
2
+ from __future__ import annotations
3
+
4
+ from openspeech.logging_config import logger
5
+ import time
6
+ from collections.abc import AsyncIterator
7
+ from dataclasses import dataclass
8
+ from typing import Any
9
+
10
+ from openspeech.core.base import TTSProvider
11
+
12
+ from openspeech.core.enums import Capability, ExecMode, ProviderType
13
+ from openspeech.core.models import AudioChunk, AudioData, TTSOptions
14
+ from openspeech.core.settings import BaseSettings
15
+
16
@dataclass
class CoquiTTSSettings(BaseSettings):
    """Settings for :class:`CoquiTTS`."""

    # Coqui model identifier — not yet referenced by the visible code;
    # presumably passed to TTS.api.TTS once synthesize() is implemented.
    model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"
    # Optional vocoder override; None keeps the model's default.
    vocoder_name: str | None = None
    # Run inference on GPU when True.
    use_cuda: bool = False
    # Reference WAV for voice cloning — unused by the visible code.
    speaker_wav: str | None = None
    # Target language for multilingual models — unused by the visible code.
    language: str | None = None
23
+
24
class CoquiTTS(TTSProvider):
    """Coqui TTS provider adapter (voice clone + multilingual, in-process).

    Currently only verifies that the optional ``TTS`` package is importable;
    actual synthesis is not implemented yet.
    """

    name = "coqui"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = CoquiTTSSettings
    capabilities = {Capability.VOICE_CLONE, Capability.MULTILINGUAL}

    def __init__(self, settings: CoquiTTSSettings | None = None) -> None:
        self.settings = settings or CoquiTTSSettings()
        # Readiness sentinel set by start(); no real client object exists yet.
        self._client: Any = None

    async def start(self) -> None:
        """Probe the optional coqui dependency and mark the provider ready.

        Raises:
            ImportError: when the ``TTS`` package is not installed.
        """
        try:
            from TTS.api import TTS  # noqa: F401
        except ImportError:
            raise ImportError(
                "Install coqui TTS: pip install openspeech[coqui]"
            )
        self._client = object()  # sentinel
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Drop the readiness sentinel."""
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Ready once start() has succeeded."""
        return self._client is not None

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Placeholder: always raises NotImplementedError after the readiness check."""
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, text={} chars", self.name, len(text))
        raise NotImplementedError("CoquiTTS.synthesize() is not yet implemented")

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        """Streaming synthesis is not supported."""
        raise NotImplementedError("CoquiTTS does not support streaming synthesis")
        yield  # pragma: no cover