openspeechapi 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openspeech/__init__.py +75 -0
- openspeech/__main__.py +5 -0
- openspeech/cli.py +413 -0
- openspeech/client/__init__.py +4 -0
- openspeech/client/client.py +145 -0
- openspeech/config.py +212 -0
- openspeech/core/__init__.py +0 -0
- openspeech/core/base.py +75 -0
- openspeech/core/enums.py +39 -0
- openspeech/core/models.py +61 -0
- openspeech/core/registry.py +37 -0
- openspeech/core/settings.py +8 -0
- openspeech/demo.py +675 -0
- openspeech/dispatch/__init__.py +0 -0
- openspeech/dispatch/context.py +34 -0
- openspeech/dispatch/dispatcher.py +661 -0
- openspeech/dispatch/executors/__init__.py +0 -0
- openspeech/dispatch/executors/base.py +34 -0
- openspeech/dispatch/executors/in_process.py +66 -0
- openspeech/dispatch/executors/remote.py +64 -0
- openspeech/dispatch/executors/subprocess_exec.py +446 -0
- openspeech/dispatch/fanout.py +95 -0
- openspeech/dispatch/filters.py +73 -0
- openspeech/dispatch/lifecycle.py +178 -0
- openspeech/dispatch/watcher.py +82 -0
- openspeech/engine_catalog.py +236 -0
- openspeech/engine_registry.yaml +347 -0
- openspeech/exceptions.py +51 -0
- openspeech/factory.py +325 -0
- openspeech/local_engines/__init__.py +12 -0
- openspeech/local_engines/aim_resolver.py +91 -0
- openspeech/local_engines/backends/__init__.py +1 -0
- openspeech/local_engines/backends/docker_backend.py +490 -0
- openspeech/local_engines/backends/native_backend.py +902 -0
- openspeech/local_engines/base.py +30 -0
- openspeech/local_engines/engines/__init__.py +1 -0
- openspeech/local_engines/engines/faster_whisper.py +36 -0
- openspeech/local_engines/engines/fish_speech.py +33 -0
- openspeech/local_engines/engines/sherpa_onnx.py +56 -0
- openspeech/local_engines/engines/whisper.py +41 -0
- openspeech/local_engines/engines/whisperlivekit.py +60 -0
- openspeech/local_engines/manager.py +208 -0
- openspeech/local_engines/models.py +50 -0
- openspeech/local_engines/progress.py +69 -0
- openspeech/local_engines/registry.py +19 -0
- openspeech/local_engines/task_store.py +52 -0
- openspeech/local_engines/tasks.py +71 -0
- openspeech/logging_config.py +607 -0
- openspeech/observe/__init__.py +0 -0
- openspeech/observe/base.py +79 -0
- openspeech/observe/debug.py +44 -0
- openspeech/observe/latency.py +19 -0
- openspeech/observe/metrics.py +47 -0
- openspeech/observe/tracing.py +44 -0
- openspeech/observe/usage.py +27 -0
- openspeech/providers/__init__.py +0 -0
- openspeech/providers/_template.py +101 -0
- openspeech/providers/stt/__init__.py +0 -0
- openspeech/providers/stt/alibaba.py +86 -0
- openspeech/providers/stt/assemblyai.py +135 -0
- openspeech/providers/stt/azure_speech.py +99 -0
- openspeech/providers/stt/baidu.py +135 -0
- openspeech/providers/stt/deepgram.py +311 -0
- openspeech/providers/stt/elevenlabs.py +385 -0
- openspeech/providers/stt/faster_whisper.py +211 -0
- openspeech/providers/stt/google_cloud.py +106 -0
- openspeech/providers/stt/iflytek.py +427 -0
- openspeech/providers/stt/macos_speech.py +226 -0
- openspeech/providers/stt/openai.py +84 -0
- openspeech/providers/stt/sherpa_onnx.py +353 -0
- openspeech/providers/stt/tencent.py +212 -0
- openspeech/providers/stt/volcengine.py +107 -0
- openspeech/providers/stt/whisper.py +153 -0
- openspeech/providers/stt/whisperlivekit.py +530 -0
- openspeech/providers/stt/windows_speech.py +249 -0
- openspeech/providers/tts/__init__.py +0 -0
- openspeech/providers/tts/alibaba.py +95 -0
- openspeech/providers/tts/azure_speech.py +123 -0
- openspeech/providers/tts/baidu.py +143 -0
- openspeech/providers/tts/coqui.py +64 -0
- openspeech/providers/tts/cosyvoice.py +90 -0
- openspeech/providers/tts/deepgram.py +174 -0
- openspeech/providers/tts/elevenlabs.py +311 -0
- openspeech/providers/tts/fish_speech.py +158 -0
- openspeech/providers/tts/google_cloud.py +107 -0
- openspeech/providers/tts/iflytek.py +209 -0
- openspeech/providers/tts/macos_say.py +251 -0
- openspeech/providers/tts/minimax.py +122 -0
- openspeech/providers/tts/openai.py +104 -0
- openspeech/providers/tts/piper.py +104 -0
- openspeech/providers/tts/tencent.py +189 -0
- openspeech/providers/tts/volcengine.py +117 -0
- openspeech/providers/tts/windows_sapi.py +234 -0
- openspeech/server/__init__.py +1 -0
- openspeech/server/app.py +72 -0
- openspeech/server/auth.py +42 -0
- openspeech/server/middleware.py +75 -0
- openspeech/server/routes/__init__.py +1 -0
- openspeech/server/routes/management.py +848 -0
- openspeech/server/routes/stt.py +121 -0
- openspeech/server/routes/tts.py +159 -0
- openspeech/server/routes/webui.py +29 -0
- openspeech/server/webui/app.js +2649 -0
- openspeech/server/webui/index.html +216 -0
- openspeech/server/webui/styles.css +617 -0
- openspeech/server/ws/__init__.py +1 -0
- openspeech/server/ws/stt_stream.py +263 -0
- openspeech/server/ws/tts_stream.py +207 -0
- openspeech/telemetry/__init__.py +21 -0
- openspeech/telemetry/perf.py +307 -0
- openspeech/utils/__init__.py +5 -0
- openspeech/utils/audio_converter.py +406 -0
- openspeech/utils/audio_playback.py +156 -0
- openspeech/vendor_registry.yaml +74 -0
- openspeechapi-0.1.0.dist-info/METADATA +101 -0
- openspeechapi-0.1.0.dist-info/RECORD +118 -0
- openspeechapi-0.1.0.dist-info/WHEEL +4 -0
- openspeechapi-0.1.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
"""Windows native STT provider using System.Speech.Recognition via PowerShell.
|
|
2
|
+
|
|
3
|
+
Uses PowerShell to invoke the .NET System.Speech.Recognition.SpeechRecognitionEngine,
|
|
4
|
+
which is available on all Windows 10+ systems without extra dependencies.
|
|
5
|
+
Results are exchanged through a temporary JSON file.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
import json
|
|
11
|
+
from openspeech.logging_config import logger
|
|
12
|
+
import shutil
|
|
13
|
+
import sys
|
|
14
|
+
import tempfile
|
|
15
|
+
import time
|
|
16
|
+
from collections.abc import AsyncIterator
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
from openspeech.core.base import STTProvider
|
|
22
|
+
from openspeech.core.enums import Capability, ExecMode, ProviderType
|
|
23
|
+
from openspeech.core.models import AudioData, STTOptions, Transcription
|
|
24
|
+
from openspeech.core.settings import BaseSettings
|
|
25
|
+
from openspeech.utils.audio_converter import AudioConverter
|
|
26
|
+
|
|
27
|
+
# PowerShell script that performs speech recognition and outputs JSON.
# Contract with WindowsSpeechSTT.transcribe():
#   - $AudioPath:  path to a 16-bit PCM WAV file to recognize.
#   - $OutputPath: path where a single-line, compact JSON object is written
#     (UTF-8 with BOM, hence utf-8-sig on the Python side).
#   - $Language:   .NET culture name for the recognizer (default "zh-CN").
# The JSON always carries a "status" key: "ok" (with text/confidence/language)
# or "error" (with an "error" message). A null Recognize() result — silence or
# unintelligible audio — is reported as status "ok" with empty text.
_PS_RECOGNIZE_SCRIPT = r"""
param(
    [Parameter(Mandatory=$true)][string]$AudioPath,
    [Parameter(Mandatory=$true)][string]$OutputPath,
    [string]$Language = "zh-CN"
)

try {
    Add-Type -AssemblyName System.Speech

    $culture = New-Object System.Globalization.CultureInfo($Language)
    $engine = New-Object System.Speech.Recognition.SpeechRecognitionEngine($culture)
    $grammar = New-Object System.Speech.Recognition.DictationGrammar
    $engine.LoadGrammar($grammar)
    $engine.SetInputToWaveFile($AudioPath)

    $result = $engine.Recognize()
    $engine.Dispose()

    if ($result -ne $null) {
        $output = @{
            status = "ok"
            text = $result.Text
            confidence = [math]::Round($result.Confidence, 4)
            language = $Language
        }
    } else {
        $output = @{
            status = "ok"
            text = ""
            confidence = 0
            language = $Language
        }
    }
} catch {
    $output = @{
        status = "error"
        error = $_.Exception.Message
    }
}

$output | ConvertTo-Json -Compress | Out-File -FilePath $OutputPath -Encoding UTF8
"""
|
|
71
|
+
|
|
72
|
+
@dataclass
class WindowsSpeechSettings(BaseSettings):
    """Configuration for the Windows System.Speech STT provider."""

    # .NET culture name handed to SpeechRecognitionEngine (e.g. "en-US").
    language: str = "zh-CN"
|
|
75
|
+
|
|
76
|
+
class WindowsSpeechSTT(STTProvider):
    """Windows native STT via System.Speech.Recognition (PowerShell).

    Each transcription shells out to PowerShell: the audio is written to a
    temporary WAV file, ``_PS_RECOGNIZE_SCRIPT`` is written to a temporary
    ``.ps1`` file and executed, and the JSON result is read back from a
    temporary output file. All three temp files are removed in ``finally``.
    """

    name = "windows-stt"
    provider_type = ProviderType.STT
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = WindowsSpeechSettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}
    field_options = {"language": ["zh-CN", "en-US", "ja-JP", "ko-KR", "fr-FR", "de-DE", "es-ES"]}

    def __init__(self, settings: WindowsSpeechSettings | None = None) -> None:
        self.settings = settings or WindowsSpeechSettings()
        self._available: bool = False  # flipped to True only after start() succeeds
        self._powershell: str | None = None  # resolved PowerShell executable path

    # -- lifecycle ------------------------------------------------------------

    @staticmethod
    def _find_powershell() -> str | None:
        """Locate PowerShell executable — try PATH first, then well-known paths."""
        ps = shutil.which("powershell") or shutil.which("pwsh")
        if ps:
            return ps
        # Fallback: well-known Windows paths
        for candidate in (
            Path(r"C:\Windows\System32\WindowsPowerShell\v1.0\powershell.exe"),
            Path(r"C:\Windows\SysWOW64\WindowsPowerShell\v1.0\powershell.exe"),
        ):
            if candidate.exists():
                return str(candidate)
        return None

    async def start(self) -> None:
        """Resolve PowerShell and verify System.Speech before accepting work.

        Raises:
            RuntimeError: if not running on Windows, if PowerShell cannot be
                found, or if the System.Speech assembly fails to load.
        """
        if sys.platform != "win32":
            raise RuntimeError(
                "Windows STT (System.Speech) is only available on Windows"
            )
        # Locate PowerShell
        ps = self._find_powershell()
        if ps is None:
            raise RuntimeError("PowerShell not found on this system")
        self._powershell = ps
        logger.debug("Using PowerShell: {}", ps)

        # Verify System.Speech is available
        ok = await self._check_system_speech()
        if not ok:
            raise RuntimeError(
                "System.Speech assembly not available. "
                "Ensure Windows Speech Recognition is installed."
            )
        self._available = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Reset availability; nothing external to release (subprocess per call)."""
        self._available = False
        self._powershell = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Return True if started, or if PowerShell is at least locatable on Windows."""
        if self._available:
            return True
        if sys.platform != "win32":
            return False
        return self._find_powershell() is not None

    # -- transcription --------------------------------------------------------

    async def transcribe(
        self, audio: AudioData, opts: STTOptions | None = None
    ) -> Transcription:
        """Transcribe a complete audio buffer via a one-shot PowerShell run.

        Args:
            audio: Input audio; converted to 16-bit PCM WAV before recognition.
            opts: Optional options; ``opts.language`` overrides the configured
                default language.

        Returns:
            Transcription with text, language, and confidence from the
            recognizer's JSON output.

        Raises:
            RuntimeError: if the provider was not started, the PowerShell run
                produced no output, or the script reported an error.
        """
        if not self._available or self._powershell is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
        _t0 = time.perf_counter()

        opts = opts or STTOptions()
        language = opts.language or self.settings.language

        # Convert to WAV (16-bit PCM) for System.Speech
        wav_audio = AudioConverter.to_wav(audio)

        tmp_wav: str | None = None
        tmp_out: str | None = None
        tmp_ps: str | None = None
        try:
            # Write WAV to temp file
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                f.write(wav_audio.data)
                tmp_wav = f.name

            # Output file for JSON result
            with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as f:
                tmp_out = f.name

            # Write PowerShell script to temp file
            with tempfile.NamedTemporaryFile(
                suffix=".ps1", delete=False, mode="w", encoding="utf-8"
            ) as f:
                f.write(_PS_RECOGNIZE_SCRIPT)
                tmp_ps = f.name

            # Run PowerShell
            cmd = [
                self._powershell,
                "-ExecutionPolicy", "Bypass",
                "-NoProfile",
                "-File", tmp_ps,
                "-AudioPath", tmp_wav,
                "-OutputPath", tmp_out,
                "-Language", language,
            ]
            proc = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            _, stderr = await proc.communicate()

            # Read JSON result. Success is judged by the output file, not the
            # exit code: an empty/missing file means the script never got far
            # enough to write its JSON (stderr is surfaced instead).
            out_path = Path(tmp_out)
            if not out_path.exists() or out_path.stat().st_size == 0:
                err_msg = stderr.decode(errors="replace").strip() if stderr else "no output"
                raise RuntimeError(f"Windows STT failed: {err_msg}")

            # utf-8-sig: Out-File -Encoding UTF8 writes a BOM on Windows PowerShell.
            raw = out_path.read_text(encoding="utf-8-sig").strip()
            result = json.loads(raw)

            if result.get("status") == "error":
                raise RuntimeError(
                    f"Windows STT error: {result.get('error', 'unknown')}"
                )

            transcription = Transcription(
                text=result.get("text", ""),
                language=result.get("language"),
                confidence=result.get("confidence"),
            )
            logger.info("{}: completed in {:.0f}ms, result={} chars", self.name, (time.perf_counter() - _t0) * 1000, len(transcription.text))
            return transcription
        finally:
            # Best-effort cleanup of all temp files, even on failure paths.
            for p in (tmp_wav, tmp_out, tmp_ps):
                if p is not None:
                    try:
                        Path(p).unlink(missing_ok=True)
                    except OSError:
                        pass

    def transcribe_stream(
        self, stream: AsyncIterator[bytes]
    ) -> AsyncIterator[Any]:
        raise NotImplementedError(
            "Windows System.Speech does not support streaming via this provider"
        )

    # -- internal helpers -----------------------------------------------------

    async def _check_system_speech(self) -> bool:
        """Verify that System.Speech assembly is loadable."""
        check_script = (
            'try { Add-Type -AssemblyName System.Speech; '
            'Write-Output "ok" } '
            'catch { Write-Output "fail" }'
        )
        proc = await asyncio.create_subprocess_exec(
            self._powershell,
            "-ExecutionPolicy", "Bypass",
            "-NoProfile",
            "-Command", check_script,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, _ = await proc.communicate()
        return stdout.decode(errors="replace").strip() == "ok"
|
File without changes
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""Alibaba Cloud (Bailian/DashScope) TTS provider adapter — OpenAI-compatible."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from openspeech.logging_config import logger
|
|
5
|
+
import time
|
|
6
|
+
from collections.abc import AsyncIterator
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import httpx
|
|
11
|
+
|
|
12
|
+
from openspeech.core.base import TTSProvider
|
|
13
|
+
|
|
14
|
+
from openspeech.core.enums import Capability, ExecMode, ProviderType
|
|
15
|
+
from openspeech.core.models import AudioChunk, AudioData, TTSOptions
|
|
16
|
+
from openspeech.core.settings import BaseSettings
|
|
17
|
+
|
|
18
|
+
@dataclass
class AlibabaTTSSettings(BaseSettings):
    """Configuration for the Alibaba Cloud (DashScope) TTS provider."""

    # DashScope API key; required for any request to succeed.
    api_key: str = ""
    # Model identifier on the OpenAI-compatible endpoint.
    model: str = "cosyvoice-v1"
    # Default voice, used when the request options carry none.
    voice: str = "longxiaochun"
    # OpenAI-compatible base URL; "/audio/speech" is appended per request.
    base_url: str = "https://dashscope.aliyuncs.com/compatible-mode/v1"
|
|
24
|
+
|
|
25
|
+
class AlibabaTTS(TTSProvider):
    """Batch text-to-speech via Alibaba DashScope's OpenAI-compatible API."""

    name = "alibaba-tts"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = AlibabaTTSSettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}
    field_options = {"model": ["cosyvoice-v1", "sambert-v1"], "voice": ["longxiaochun", "longhua", "longxiaoxia", "longlaotie", "longshu"]}

    def __init__(self, settings: AlibabaTTSSettings | None = None) -> None:
        self.settings = settings or AlibabaTTSSettings()
        self._client: httpx.AsyncClient | None = None
        self._owns_client: bool = True

    def set_http_client(self, client) -> None:
        """Adopt an externally managed httpx client; stop() will not close it."""
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        """Create an owned HTTP client unless one was injected."""
        if self._client is None:
            self._owns_client = True
            self._client = httpx.AsyncClient(timeout=60.0)
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the HTTP client only if this provider created it."""
        if self._owns_client and self._client is not None:
            await self._client.aclose()
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Healthy when an API key has been configured."""
        return bool(self.settings.api_key)

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Synthesize *text* in a single request and return the full audio blob.

        Raises:
            RuntimeError: if the provider was not started.
            httpx.HTTPStatusError: on a non-2xx API response.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, text={} chars", self.name, len(text))
        started_at = time.perf_counter()
        opts = opts or TTSOptions()

        chosen_voice = opts.voice or self.settings.voice
        endpoint = f"{self.settings.base_url}/audio/speech"

        request_headers = {
            "Authorization": f"Bearer {self.settings.api_key}",
            "Content-Type": "application/json",
        }
        body = {
            "model": self.settings.model,
            "input": text,
            "voice": chosen_voice,
        }

        response = await self._client.post(endpoint, json=body, headers=request_headers)
        response.raise_for_status()

        audio = AudioData(
            data=response.content,
            sample_rate=24000,
            channels=1,
            # NOTE(review): echoes the requested output_format; the request does
            # not ask the API for a specific format, so the actual encoding of
            # the returned bytes is not verified here — confirm against the API.
            format=opts.output_format,
        )
        logger.info("{}: completed in {:.0f}ms, output={} bytes", self.name, (time.perf_counter() - started_at) * 1000, len(audio.data))
        return audio

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        raise NotImplementedError("Alibaba TTS streaming not implemented")
        yield  # noqa: unreachable — makes this an async generator
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""Azure Speech TTS provider adapter (batch, httpx)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from openspeech.logging_config import logger
|
|
5
|
+
import time
|
|
6
|
+
from collections.abc import AsyncIterator
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import httpx
|
|
11
|
+
|
|
12
|
+
from openspeech.core.base import TTSProvider
|
|
13
|
+
|
|
14
|
+
from openspeech.core.enums import AudioFormat, Capability, ExecMode, ProviderType
|
|
15
|
+
from openspeech.core.models import AudioChunk, AudioData, TTSOptions
|
|
16
|
+
from openspeech.core.settings import BaseSettings
|
|
17
|
+
|
|
18
|
+
@dataclass
class AzureSpeechTTSSettings(BaseSettings):
    """Configuration for the Azure Cognitive Services Speech TTS provider."""

    # Azure Speech resource key, exchanged for a short-lived bearer token.
    subscription_key: str = ""
    # Azure region hosting the Speech resource (part of both endpoint URLs).
    region: str = "eastus"
    # Default neural voice, used when the request options carry none.
    voice: str = "en-US-JennyNeural"
    # xml:lang value placed on the SSML <speak> element.
    language: str = "en-US"
|
|
24
|
+
|
|
25
|
+
class AzureSpeechTTS(TTSProvider):
    """Azure Speech TTS via the regional REST endpoint (batch only).

    Exchanges the subscription key for a short-lived bearer token, then posts
    SSML to ``https://{region}.tts.speech.microsoft.com/cognitiveservices/v1``
    and returns 24 kHz 16-bit mono RIFF/WAV audio.
    """

    name = "azure-tts"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.REMOTE
    settings_cls = AzureSpeechTTSSettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}
    field_options = {"language": ["en-US", "zh-CN", "ja-JP", "ko-KR", "es-ES", "fr-FR", "de-DE"], "voice": ["en-US-JennyNeural", "en-US-GuyNeural", "en-US-AriaNeural", "zh-CN-XiaoxiaoNeural", "zh-CN-YunxiNeural", "zh-CN-XiaoyiNeural", "ja-JP-NanamiNeural", "ko-KR-SunHiNeural"], "region": ["eastus", "westus2", "westeurope", "eastasia", "southeastasia"]}

    def __init__(self, settings: AzureSpeechTTSSettings | None = None) -> None:
        self.settings = settings or AzureSpeechTTSSettings()
        self._client: httpx.AsyncClient | None = None
        self._owns_client: bool = True

    def set_http_client(self, client) -> None:
        """Adopt an externally managed httpx client; stop() will not close it."""
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        """Create an owned HTTP client unless one was injected."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=60.0)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the HTTP client only if this provider created it."""
        if self._client is not None and self._owns_client:
            await self._client.aclose()
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Healthy when a subscription key has been configured."""
        return bool(self.settings.subscription_key)

    async def _get_access_token(self) -> str:
        """Fetch a short-lived access token from the Azure token endpoint.

        Raises:
            RuntimeError: if the provider was not started or the token
                endpoint returns a non-200 status.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        url = (
            f"https://{self.settings.region}.api.cognitive.microsoft.com"
            f"/sts/v1.0/issueToken"
        )
        headers = {
            "Ocp-Apim-Subscription-Key": self.settings.subscription_key,
            "Content-Length": "0",
        }
        response = await self._client.post(url, headers=headers, content=b"")
        if response.status_code != 200:
            raise RuntimeError(
                f"Azure token endpoint error {response.status_code}: {response.text}"
            )
        return response.text

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Synthesize *text* and return 24 kHz mono WAV audio.

        The text is XML-escaped before being embedded in SSML, so input
        containing ``&``, ``<`` or ``>`` no longer produces malformed SSML
        (previously a guaranteed request failure / markup-injection vector).

        Raises:
            RuntimeError: if the provider was not started or the API responds
                with a non-200 status.
        """
        # Local import keeps the fix self-contained; stdlib only.
        from xml.sax.saxutils import escape

        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()
        opts = opts or TTSOptions()
        voice = opts.voice or self.settings.voice
        language = self.settings.language

        token = await self._get_access_token()

        url = (
            f"https://{self.settings.region}.tts.speech.microsoft.com"
            f"/cognitiveservices/v1"
        )
        headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/ssml+xml",
            "X-Microsoft-OutputFormat": "riff-24khz-16bit-mono-pcm",
        }
        # voice/language come from trusted configuration; only the caller's
        # text is escaped.
        ssml = (
            f"<speak version='1.0' xml:lang='{language}'>"
            f"<voice name='{voice}'>{escape(text)}</voice>"
            f"</speak>"
        )
        response = await self._client.post(url, headers=headers, content=ssml)
        if response.status_code != 200:
            raise RuntimeError(
                f"Azure Speech TTS API error {response.status_code}: {response.text}"
            )
        # riff-24khz-16bit-mono-pcm => 24 kHz, mono, WAV container.
        result = AudioData(
            data=response.content,
            sample_rate=24000,
            channels=1,
            format=AudioFormat.WAV,
        )
        logger.info("{}: completed in {:.0f}ms, output={} bytes", self.name, (time.perf_counter() - _t0) * 1000, len(result.data))
        return result

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        raise NotImplementedError(
            "Azure Speech TTS batch provider does not support streaming output"
        )
        yield  # pragma: no cover
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""Baidu Cloud TTS provider adapter."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from openspeech.logging_config import logger
|
|
5
|
+
import time
|
|
6
|
+
from collections.abc import AsyncIterator
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import httpx
|
|
11
|
+
|
|
12
|
+
from openspeech.core.base import TTSProvider
|
|
13
|
+
|
|
14
|
+
from openspeech.core.enums import Capability, ExecMode, ProviderType
|
|
15
|
+
from openspeech.core.models import AudioChunk, AudioData, TTSOptions
|
|
16
|
+
from openspeech.core.settings import BaseSettings
|
|
17
|
+
|
|
18
|
+
@dataclass
class BaiduTTSSettings(BaseSettings):
    """Configuration for the Baidu Cloud TTS provider."""

    # OAuth client credentials for aip.baidubce.com.
    api_key: str = ""
    secret_key: str = ""
    per: int = 0  # 0=female, 1=male, 3=度逍遥, 4=度丫丫
    spd: int = 5  # speed 0-15
    pit: int = 5  # pitch 0-15
    vol: int = 5  # volume 0-15
|
|
26
|
+
|
|
27
|
+
class BaiduTTS(TTSProvider):
    """Baidu Cloud TTS via the tsn.baidu.com REST API (batch only).

    Obtains an OAuth token with the configured client credentials (cached
    until shortly before expiry), then posts the form-encoded request and
    returns the audio bytes. The request pins ``aue=6`` (WAV).
    """

    name = "baidu-tts"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = BaiduTTSSettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}
    field_options = {"per": [0, 1, 3, 4, 5, 103, 106, 110, 111]}

    def __init__(self, settings: BaiduTTSSettings | None = None) -> None:
        self.settings = settings or BaiduTTSSettings()
        self._client: httpx.AsyncClient | None = None
        self._owns_client: bool = True
        self._token: str | None = None          # cached OAuth access token
        self._token_expires_at: float = 0.0     # epoch seconds; 0 = no token

    def set_http_client(self, client) -> None:
        """Adopt an externally managed httpx client; stop() will not close it."""
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        """Create an owned HTTP client unless one was injected."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=60.0)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the owned HTTP client and drop the cached token."""
        if self._client is not None and self._owns_client:
            await self._client.aclose()
        self._client = None
        self._token = None
        self._token_expires_at = 0.0
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Healthy when both OAuth credentials are configured."""
        return bool(self.settings.api_key) and bool(self.settings.secret_key)

    async def _get_token(self) -> str:
        """Fetch or return cached OAuth access token.

        Raises:
            RuntimeError: if the provider was not started or Baidu's OAuth
                endpoint returns no ``access_token``.
        """
        if self._token and time.time() < self._token_expires_at:
            return self._token

        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")

        resp = await self._client.get(
            "https://aip.baidubce.com/oauth/2.0/token",
            params={
                "grant_type": "client_credentials",
                "client_id": self.settings.api_key,
                "client_secret": self.settings.secret_key,
            },
        )
        resp.raise_for_status()
        data = resp.json()

        if "access_token" not in data:
            raise RuntimeError(
                f"Baidu OAuth error: {data.get('error_description', 'unknown')}"
            )

        self._token = data["access_token"]
        # Refresh 60 s early to avoid racing the server-side expiry.
        self._token_expires_at = time.time() + data.get("expires_in", 2592000) - 60
        return self._token  # type: ignore[return-value]

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Synthesize *text* and return 16 kHz mono WAV audio.

        The returned ``AudioData.format`` is now ``AudioFormat.WAV``: the
        request hard-codes ``aue=6`` (WAV), so labeling the payload with
        ``opts.output_format`` — which is never sent to the API — could
        misdescribe the actual bytes.

        Raises:
            RuntimeError: if the provider was not started or Baidu returns a
                JSON error body instead of audio.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()
        opts = opts or TTSOptions()

        token = await self._get_token()

        form_data = {
            "tex": text,
            "tok": token,
            "cuid": "openspeech",
            "ctp": "1",
            "lan": "zh",
            "spd": str(self.settings.spd),
            "pit": str(self.settings.pit),
            "vol": str(self.settings.vol),
            "per": str(self.settings.per),
            "aue": "6",  # 6 = WAV format
        }

        resp = await self._client.post(
            "https://tsn.baidu.com/text2audio",
            data=form_data,
        )
        resp.raise_for_status()

        # Baidu signals success by content type: audio/* on success, JSON on error.
        content_type = resp.headers.get("content-type", "")
        if "audio" in content_type:
            # Local import keeps this fix self-contained; the module is already
            # a dependency of this file.
            from openspeech.core.enums import AudioFormat

            result = AudioData(
                data=resp.content,
                sample_rate=16000,
                channels=1,
                format=AudioFormat.WAV,
            )
            logger.info("{}: completed in {:.0f}ms, output={} bytes", self.name, (time.perf_counter() - _t0) * 1000, len(result.data))
            return result

        # JSON response means error
        error_data = resp.json()
        raise RuntimeError(
            f"Baidu TTS error [{error_data.get('err_no')}]: "
            f"{error_data.get('err_msg', 'unknown')}"
        )

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        raise NotImplementedError("Baidu TTS streaming not implemented")
        yield  # noqa: unreachable — makes this an async generator
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Coqui TTS provider adapter (voice clone + multilingual, in-process)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from openspeech.logging_config import logger
|
|
5
|
+
import time
|
|
6
|
+
from collections.abc import AsyncIterator
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from openspeech.core.base import TTSProvider
|
|
11
|
+
|
|
12
|
+
from openspeech.core.enums import Capability, ExecMode, ProviderType
|
|
13
|
+
from openspeech.core.models import AudioChunk, AudioData, TTSOptions
|
|
14
|
+
from openspeech.core.settings import BaseSettings
|
|
15
|
+
|
|
16
|
+
@dataclass
class CoquiTTSSettings(BaseSettings):
    """Configuration for the Coqui TTS provider."""

    # Coqui model identifier to load.
    model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"
    # Optional explicit vocoder; None lets the library choose.
    vocoder_name: str | None = None
    # Run inference on CUDA when available.
    use_cuda: bool = False
    # Reference WAV for voice cloning, if any.
    speaker_wav: str | None = None
    # Target language for multilingual models, if any.
    language: str | None = None
|
|
23
|
+
|
|
24
|
+
class CoquiTTS(TTSProvider):
    """Coqui TTS provider (voice clone + multilingual, in-process).

    Currently a stub: start() only verifies that the optional ``TTS`` package
    is importable; synthesize() is not yet implemented.
    """

    name = "coqui"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = CoquiTTSSettings
    capabilities = {Capability.VOICE_CLONE, Capability.MULTILINGUAL}

    def __init__(self, settings: CoquiTTSSettings | None = None) -> None:
        self.settings = settings or CoquiTTSSettings()
        self._client: Any = None  # sentinel object once start() succeeds

    async def start(self) -> None:
        """Verify the optional Coqui dependency is installed.

        Raises:
            ImportError: if the ``TTS`` package is not installed; the original
                ImportError is chained as the cause for easier diagnosis.
        """
        try:
            from TTS.api import TTS  # noqa: F401
        except ImportError as err:
            # Chain the cause so the underlying import failure is visible.
            raise ImportError(
                "Install coqui TTS: pip install openspeech[coqui]"
            ) from err
        # Hoisted out of the try block: only the import can raise ImportError.
        self._client = object()  # sentinel
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Drop the sentinel; nothing external to release."""
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Healthy once start() has verified the dependency."""
        return self._client is not None

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Not yet implemented; raises after validating provider state.

        Raises:
            RuntimeError: if the provider was not started.
            NotImplementedError: always, once started.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, text={} chars", self.name, len(text))
        raise NotImplementedError("CoquiTTS.synthesize() is not yet implemented")

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        raise NotImplementedError("CoquiTTS does not support streaming synthesis")
        yield  # pragma: no cover
|