openspeechapi 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openspeech/__init__.py +75 -0
- openspeech/__main__.py +5 -0
- openspeech/cli.py +413 -0
- openspeech/client/__init__.py +4 -0
- openspeech/client/client.py +145 -0
- openspeech/config.py +212 -0
- openspeech/core/__init__.py +0 -0
- openspeech/core/base.py +75 -0
- openspeech/core/enums.py +39 -0
- openspeech/core/models.py +61 -0
- openspeech/core/registry.py +37 -0
- openspeech/core/settings.py +8 -0
- openspeech/demo.py +675 -0
- openspeech/dispatch/__init__.py +0 -0
- openspeech/dispatch/context.py +34 -0
- openspeech/dispatch/dispatcher.py +661 -0
- openspeech/dispatch/executors/__init__.py +0 -0
- openspeech/dispatch/executors/base.py +34 -0
- openspeech/dispatch/executors/in_process.py +66 -0
- openspeech/dispatch/executors/remote.py +64 -0
- openspeech/dispatch/executors/subprocess_exec.py +446 -0
- openspeech/dispatch/fanout.py +95 -0
- openspeech/dispatch/filters.py +73 -0
- openspeech/dispatch/lifecycle.py +178 -0
- openspeech/dispatch/watcher.py +82 -0
- openspeech/engine_catalog.py +236 -0
- openspeech/engine_registry.yaml +347 -0
- openspeech/exceptions.py +51 -0
- openspeech/factory.py +325 -0
- openspeech/local_engines/__init__.py +12 -0
- openspeech/local_engines/aim_resolver.py +91 -0
- openspeech/local_engines/backends/__init__.py +1 -0
- openspeech/local_engines/backends/docker_backend.py +490 -0
- openspeech/local_engines/backends/native_backend.py +902 -0
- openspeech/local_engines/base.py +30 -0
- openspeech/local_engines/engines/__init__.py +1 -0
- openspeech/local_engines/engines/faster_whisper.py +36 -0
- openspeech/local_engines/engines/fish_speech.py +33 -0
- openspeech/local_engines/engines/sherpa_onnx.py +56 -0
- openspeech/local_engines/engines/whisper.py +41 -0
- openspeech/local_engines/engines/whisperlivekit.py +60 -0
- openspeech/local_engines/manager.py +208 -0
- openspeech/local_engines/models.py +50 -0
- openspeech/local_engines/progress.py +69 -0
- openspeech/local_engines/registry.py +19 -0
- openspeech/local_engines/task_store.py +52 -0
- openspeech/local_engines/tasks.py +71 -0
- openspeech/logging_config.py +607 -0
- openspeech/observe/__init__.py +0 -0
- openspeech/observe/base.py +79 -0
- openspeech/observe/debug.py +44 -0
- openspeech/observe/latency.py +19 -0
- openspeech/observe/metrics.py +47 -0
- openspeech/observe/tracing.py +44 -0
- openspeech/observe/usage.py +27 -0
- openspeech/providers/__init__.py +0 -0
- openspeech/providers/_template.py +101 -0
- openspeech/providers/stt/__init__.py +0 -0
- openspeech/providers/stt/alibaba.py +86 -0
- openspeech/providers/stt/assemblyai.py +135 -0
- openspeech/providers/stt/azure_speech.py +99 -0
- openspeech/providers/stt/baidu.py +135 -0
- openspeech/providers/stt/deepgram.py +311 -0
- openspeech/providers/stt/elevenlabs.py +385 -0
- openspeech/providers/stt/faster_whisper.py +211 -0
- openspeech/providers/stt/google_cloud.py +106 -0
- openspeech/providers/stt/iflytek.py +427 -0
- openspeech/providers/stt/macos_speech.py +226 -0
- openspeech/providers/stt/openai.py +84 -0
- openspeech/providers/stt/sherpa_onnx.py +353 -0
- openspeech/providers/stt/tencent.py +212 -0
- openspeech/providers/stt/volcengine.py +107 -0
- openspeech/providers/stt/whisper.py +153 -0
- openspeech/providers/stt/whisperlivekit.py +530 -0
- openspeech/providers/stt/windows_speech.py +249 -0
- openspeech/providers/tts/__init__.py +0 -0
- openspeech/providers/tts/alibaba.py +95 -0
- openspeech/providers/tts/azure_speech.py +123 -0
- openspeech/providers/tts/baidu.py +143 -0
- openspeech/providers/tts/coqui.py +64 -0
- openspeech/providers/tts/cosyvoice.py +90 -0
- openspeech/providers/tts/deepgram.py +174 -0
- openspeech/providers/tts/elevenlabs.py +311 -0
- openspeech/providers/tts/fish_speech.py +158 -0
- openspeech/providers/tts/google_cloud.py +107 -0
- openspeech/providers/tts/iflytek.py +209 -0
- openspeech/providers/tts/macos_say.py +251 -0
- openspeech/providers/tts/minimax.py +122 -0
- openspeech/providers/tts/openai.py +104 -0
- openspeech/providers/tts/piper.py +104 -0
- openspeech/providers/tts/tencent.py +189 -0
- openspeech/providers/tts/volcengine.py +117 -0
- openspeech/providers/tts/windows_sapi.py +234 -0
- openspeech/server/__init__.py +1 -0
- openspeech/server/app.py +72 -0
- openspeech/server/auth.py +42 -0
- openspeech/server/middleware.py +75 -0
- openspeech/server/routes/__init__.py +1 -0
- openspeech/server/routes/management.py +848 -0
- openspeech/server/routes/stt.py +121 -0
- openspeech/server/routes/tts.py +159 -0
- openspeech/server/routes/webui.py +29 -0
- openspeech/server/webui/app.js +2649 -0
- openspeech/server/webui/index.html +216 -0
- openspeech/server/webui/styles.css +617 -0
- openspeech/server/ws/__init__.py +1 -0
- openspeech/server/ws/stt_stream.py +263 -0
- openspeech/server/ws/tts_stream.py +207 -0
- openspeech/telemetry/__init__.py +21 -0
- openspeech/telemetry/perf.py +307 -0
- openspeech/utils/__init__.py +5 -0
- openspeech/utils/audio_converter.py +406 -0
- openspeech/utils/audio_playback.py +156 -0
- openspeech/vendor_registry.yaml +74 -0
- openspeechapi-0.1.0.dist-info/METADATA +101 -0
- openspeechapi-0.1.0.dist-info/RECORD +118 -0
- openspeechapi-0.1.0.dist-info/WHEEL +4 -0
- openspeechapi-0.1.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
"""macOS native TTS provider using the ``say`` command."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import asyncio
|
|
5
|
+
import io
|
|
6
|
+
from openspeech.logging_config import logger
|
|
7
|
+
import shutil
|
|
8
|
+
import struct
|
|
9
|
+
import tempfile
|
|
10
|
+
import time
|
|
11
|
+
import wave
|
|
12
|
+
from collections.abc import AsyncIterator
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
from openspeech.core.base import TTSProvider
|
|
17
|
+
|
|
18
|
+
from openspeech.core.enums import AudioFormat, Capability, ExecMode, ProviderType
|
|
19
|
+
from openspeech.core.models import AudioChunk, AudioData, TTSOptions
|
|
20
|
+
from openspeech.core.settings import BaseSettings
|
|
21
|
+
from openspeech.utils.audio_converter import AudioConverter
|
|
22
|
+
|
|
23
|
+
# Language prefix → human-readable group name
# Maps the two-letter language prefix of a macOS voice locale (e.g. "en"
# from "en_US") to a display label used to group voices; unmapped prefixes
# fall back to the upper-cased prefix (see _parse_voices_output).
_LANG_GROUP_MAP: dict[str, str] = {
    "zh": "中文",
    "en": "English",
    "ja": "日本語",
    "ko": "한국어",
    "fr": "Français",
    "de": "Deutsch",
    "es": "Español",
    "it": "Italiano",
    "pt": "Português",
    "ru": "Русский",
    "ar": "العربية",
    "nl": "Nederlands",
    "sv": "Svenska",
    "da": "Dansk",
    "fi": "Suomi",
    "nb": "Norsk",
    "pl": "Polski",
    "tr": "Türkçe",
    "th": "ไทย",
    "hi": "हिन्दी",
    "he": "עברית",
    "ro": "Română",
    "hu": "Magyar",
    "cs": "Čeština",
    "el": "Ελληνικά",
    "id": "Bahasa Indonesia",
    "vi": "Tiếng Việt",
}
|
|
53
|
+
|
|
54
|
+
@dataclass
class MacOSSaySettings(BaseSettings):
    """Settings for the macOS ``say`` TTS provider."""

    # Voice name passed to ``say -v`` when TTSOptions.voice is unset.
    default_voice: str = "Samantha"
    # Base speaking rate passed to ``say -r``; scaled by TTSOptions.speed.
    default_rate: int = 200
|
|
58
|
+
|
|
59
|
+
class MacOSSayTTS(TTSProvider):
    """macOS native TTS via the ``say`` command-line tool."""

    name = "macos-say"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = MacOSSaySettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}

    def __init__(self, settings: MacOSSaySettings | None = None) -> None:
        if settings is None:
            settings = MacOSSaySettings()
        self.settings = settings
        self._available: bool = False
        self._voices_cache: list[dict] | None = None

    # -- lifecycle ------------------------------------------------------------

    async def start(self) -> None:
        """Verify the ``say`` binary is on PATH and mark the provider ready."""
        if shutil.which("say") is None:
            raise RuntimeError(
                "macOS 'say' command not found — this provider requires macOS"
            )
        self._available = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Mark the provider unavailable and drop the cached voice list."""
        self._available = False
        self._voices_cache = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Report readiness; before start(), fall back to probing for ``say``."""
        return self._available or shutil.which("say") is not None

    # -- synthesis ------------------------------------------------------------

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Render *text* to WAV by shelling out to ``say`` and converting AIFF.

        Raises:
            RuntimeError: if the provider has not been started.
        """
        if not self._available:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()

        options = opts or TTSOptions()
        chosen_voice = options.voice or self.settings.default_voice
        # ``say -r`` takes words per minute; scale by the speed multiplier.
        wpm = int(self.settings.default_rate * options.speed)

        aiff_path: str | None = None
        try:
            import os

            # ``say`` writes AIFF to a file; create one and close our handle
            # so the subprocess owns the file exclusively.
            handle, aiff_path = tempfile.mkstemp(suffix=".aiff")
            os.close(handle)

            await self._run_say(chosen_voice, wpm, aiff_path, text)

            raw_aiff = Path(aiff_path).read_bytes()

            # Convert AIFF → WAV; the sample_rate passed here is nominal
            # because to_wav parses the real one from the AIFF header.
            wav_audio = AudioConverter.to_wav(
                AudioData(
                    data=raw_aiff,
                    sample_rate=22050,  # placeholder; to_wav parses AIFF header
                    channels=1,
                    format=AudioFormat.AIFF,
                )
            )

            # Read the WAV header back to compute the clip duration.
            with wave.open(io.BytesIO(wav_audio.data), "rb") as wav_file:
                total_frames = wav_file.getnframes()
                rate_hz = wav_file.getframerate()
                duration_ms = int(total_frames / rate_hz * 1000) if rate_hz else 0

            result = AudioData(
                data=wav_audio.data,
                sample_rate=wav_audio.sample_rate,
                channels=wav_audio.channels,
                format=AudioFormat.WAV,
                duration_ms=duration_ms,
            )
            logger.info("{}: completed in {:.0f}ms, output={} bytes", self.name, (time.perf_counter() - _t0) * 1000, len(result.data))
            return result
        finally:
            if aiff_path is not None:
                try:
                    Path(aiff_path).unlink(missing_ok=True)
                except OSError:
                    pass

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        """Batch-then-chunk fallback: synthesize full audio, then yield chunks."""
        logger.info("{}: stream request, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()
        audio = await self.synthesize(text, opts)

        block = 4096
        seq = 0
        emitted = 0
        offset = 0
        total = len(audio.data)
        while offset < total:
            piece = audio.data[offset : offset + block]
            logger.debug("{}: chunk #{}, {} bytes", self.name, seq, len(piece))
            yield AudioChunk(data=piece, sequence=seq)
            seq += 1
            emitted += 1
            offset += block
        yield AudioChunk(data=b"", sequence=seq, is_final=True)
        logger.info("{}: stream complete, {} chunks in {:.0f}ms", self.name, emitted, (time.perf_counter() - _t0) * 1000)

    # -- voices ---------------------------------------------------------------

    async def list_voices(self) -> list[dict]:
        """Return the installed ``say`` voices, caching the parsed result."""
        if self._voices_cache is None:
            raw = await self._run_command(["say", "-v", "?"])
            self._voices_cache = self._parse_voices_output(raw)
        return self._voices_cache

    # -- internal helpers -----------------------------------------------------

    async def _run_say(
        self, voice: str, rate: int, output_path: str, text: str
    ) -> None:
        """Execute ``say`` to produce an AIFF file."""
        # "--" stops option parsing so text beginning with "-" is safe.
        argv = ["say", "-v", voice, "-r", str(rate), "-o", output_path, "--", text]
        process = await asyncio.create_subprocess_exec(
            *argv,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        _, err = await process.communicate()
        if process.returncode != 0:
            raise RuntimeError(
                f"say command failed (exit {process.returncode}): "
                f"{err.decode(errors='replace')}"
            )

    async def _run_command(self, cmd: list[str]) -> str:
        """Run an arbitrary command and return its stdout."""
        process = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        out, _ = await process.communicate()
        return out.decode(errors="replace")

    @staticmethod
    def _parse_voices_output(output: str) -> list[dict]:
        """Parse the output of ``say -v ?`` into a list of voice dicts.

        Expected line format::

            Samantha            en_US    # Hello, my name is Samantha.
        """
        voices: list[dict] = []
        for raw_line in output.splitlines():
            stripped = raw_line.strip()
            if not stripped:
                continue

            # Everything after the first '#' is the sample/description text.
            head, sep, tail = stripped.partition("#")
            description = tail.strip() if sep else ""

            # The locale is the last whitespace-separated token before '#';
            # the voice name (which may contain spaces) precedes it.
            pieces = head.rsplit(None, 1)
            if len(pieces) < 2:
                continue
            voice_name = pieces[0].strip()
            locale = pieces[1].strip()

            prefix = locale.split("_")[0].lower()
            voices.append({
                "name": voice_name,
                "language": locale,
                "description": description,
                "group": _LANG_GROUP_MAP.get(prefix, prefix.upper()),
            })

        return voices
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""MiniMax TTS provider adapter (REST API, in-process)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import base64
|
|
5
|
+
from openspeech.logging_config import logger
|
|
6
|
+
import time
|
|
7
|
+
from collections.abc import AsyncIterator
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from openspeech.core.base import TTSProvider
|
|
12
|
+
|
|
13
|
+
from openspeech.core.enums import AudioFormat, Capability, ExecMode, ProviderType
|
|
14
|
+
from openspeech.core.models import AudioChunk, AudioData, TTSOptions
|
|
15
|
+
from openspeech.core.settings import BaseSettings
|
|
16
|
+
|
|
17
|
+
@dataclass
class MinimaxTTSSettings(BaseSettings):
    """Settings for the MiniMax TTS REST provider."""

    # MiniMax API credentials; both are required for real requests.
    api_key: str = ""
    group_id: str = ""
    # Model and default voice used when TTSOptions does not override them.
    model: str = "speech-01"
    voice_id: str = "male-qn-qingse"
    # Default prosody controls sent in voice_setting.
    speed: float = 1.0
    vol: float = 1.0
    pitch: int = 0
|
|
26
|
+
|
|
27
|
+
class MinimaxTTS(TTSProvider):
    """MiniMax text-to-speech provider backed by the t2a_v2 REST endpoint."""

    name = "minimax"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = MinimaxTTSSettings
    capabilities = {Capability.STREAMING}
    field_options = {
        "model": ["speech-01", "speech-01-turbo", "speech-02"],
        "voice_id": [
            "male-qn-qingse", "male-qn-jingying", "male-qn-badao", "male-qn-daxuesheng",
            "female-shaonv", "female-yujie", "female-chengshu", "female-tianmei",
            "presenter_male", "presenter_female",
        ],
    }

    def __init__(self, settings: MinimaxTTSSettings | None = None) -> None:
        self.settings = settings if settings is not None else MinimaxTTSSettings()
        self._client: Any = None
        # True when we created the HTTP client ourselves and must close it.
        self._owns_client: bool = True

    def set_http_client(self, client) -> None:
        """Inject a shared httpx client; ownership stays with the caller."""
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        """Create an httpx client unless one was injected via set_http_client."""
        if self._client is None:
            try:
                import httpx
            except ImportError:
                raise ImportError(
                    "Install httpx: pip install openspeech[minimax]"
                )
            self._client = httpx.AsyncClient(timeout=30.0)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the HTTP client if this provider owns it, then forget it."""
        if self._owns_client and self._client is not None:
            await self._client.aclose()
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Healthy as soon as an API key is configured."""
        return bool(self.settings.api_key)

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Synthesize *text* to MP3 with a single t2a_v2 request.

        Raises:
            RuntimeError: if the provider is not started or the API response
                lacks audio data.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()
        opts = opts or TTSOptions()

        # A non-default opts.speed overrides the configured default speed.
        effective_speed = opts.speed if opts.speed != 1.0 else self.settings.speed
        body = {
            "model": self.settings.model,
            "text": text,
            "stream": False,
            "voice_setting": {
                "voice_id": opts.voice or self.settings.voice_id,
                "speed": effective_speed,
                "vol": self.settings.vol,
                "pitch": self.settings.pitch,
            },
            "audio_setting": {
                "sample_rate": 32000,
                "bitrate": 128000,
                "format": "mp3",
            },
        }
        response = await self._client.post(
            f"https://api.minimax.chat/v1/t2a_v2?GroupId={self.settings.group_id}",
            json=body,
            headers={
                "Authorization": f"Bearer {self.settings.api_key}",
                "Content-Type": "application/json",
            },
        )
        response.raise_for_status()
        data = response.json()
        # Guard clause: anything without data.audio is an API-level failure.
        if "data" not in data or "audio" not in data["data"]:
            raise RuntimeError(f"Minimax API error: {data}")
        result = AudioData(
            data=base64.b64decode(data["data"]["audio"]),
            sample_rate=32000,
            channels=1,
            format=AudioFormat.MP3,
        )
        logger.info("{}: completed in {:.0f}ms, output={} bytes", self.name, (time.perf_counter() - _t0) * 1000, len(result.data))
        return result

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        """Streaming synthesis is declared but not yet implemented."""
        raise NotImplementedError(
            "MinimaxTTS.synthesize_stream() — streaming not yet implemented"
        )
        yield  # pragma: no cover
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""OpenAI TTS provider adapter (speech API)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from openspeech.logging_config import logger
|
|
5
|
+
import time
|
|
6
|
+
from collections.abc import AsyncIterator
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from openspeech.core.base import TTSProvider
|
|
11
|
+
|
|
12
|
+
from openspeech.core.enums import Capability, ExecMode, ProviderType
|
|
13
|
+
from openspeech.core.models import AudioChunk, AudioData, TTSOptions
|
|
14
|
+
from openspeech.core.settings import BaseSettings
|
|
15
|
+
|
|
16
|
+
@dataclass
class OpenAITTSSettings(BaseSettings):
    """Settings for the OpenAI speech-synthesis provider."""

    # OpenAI API key; health_check() requires this to be non-empty.
    api_key: str = ""
    # Optional custom endpoint (e.g. a proxy); default OpenAI URL when empty.
    base_url: str = ""
    # Model and voice used when TTSOptions does not override them.
    model: str = "tts-1"
    voice: str = "alloy"
    # Wire format requested from the API (passed as response_format).
    response_format: str = "pcm"
|
|
23
|
+
|
|
24
|
+
class OpenAITTS(TTSProvider):
    """OpenAI speech-synthesis provider (``/v1/audio/speech``)."""

    name = "openai-tts"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = OpenAITTSSettings
    capabilities = {Capability.STREAMING, Capability.MULTILINGUAL}
    field_options = {"model": ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"], "voice": ["alloy", "ash", "ballad", "coral", "echo", "fable", "nova", "onyx", "sage", "shimmer", "verse"]}

    def __init__(self, settings: OpenAITTSSettings | None = None) -> None:
        self.settings = settings if settings is not None else OpenAITTSSettings()
        self._client: Any = None

    def _resolve_voice_model(self, opts: TTSOptions) -> tuple[str, str]:
        """Pick voice and model, preferring per-request options over settings."""
        voice = opts.voice or self.settings.voice
        model = getattr(opts, "model", None) or self.settings.model
        return voice, model

    async def start(self) -> None:
        """Instantiate the AsyncOpenAI client from the configured settings."""
        try:
            from openai import AsyncOpenAI
            client_kwargs: dict[str, Any] = {"api_key": self.settings.api_key}
            if self.settings.base_url:
                client_kwargs["base_url"] = self.settings.base_url
            self._client = AsyncOpenAI(**client_kwargs)
        except ImportError:
            raise ImportError("Install openai: pip install openspeech[openai]")
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Drop the client reference."""
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Healthy as soon as an API key is configured."""
        return bool(self.settings.api_key)

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Synthesize *text* in one request and return the raw audio bytes.

        Raises:
            RuntimeError: if the provider has not been started.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()
        opts = opts or TTSOptions()
        voice, model = self._resolve_voice_model(opts)
        response = await self._client.audio.speech.create(
            model=model,
            voice=voice,
            input=text,
            response_format=self.settings.response_format,
        )
        # NOTE(review): the request asks the API for settings.response_format
        # while the returned metadata reports opts.output_format — these can
        # disagree; confirm callers keep the two in sync.
        result = AudioData(
            data=response.content,
            sample_rate=24000,
            channels=1,
            format=opts.output_format,
        )
        logger.info("{}: completed in {:.0f}ms, output={} bytes", self.name, (time.perf_counter() - _t0) * 1000, len(result.data))
        return result

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        """Yield audio chunks as they arrive from the streaming speech API.

        Raises:
            RuntimeError: if the provider has not been started.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: stream request, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()
        opts = opts or TTSOptions()
        voice, model = self._resolve_voice_model(opts)
        emitted = 0
        async with self._client.audio.speech.with_streaming_response.create(
            model=model,
            voice=voice,
            input=text,
            response_format=self.settings.response_format,
        ) as response:
            seq = 0
            async for piece in response.iter_bytes(chunk_size=4096):
                logger.debug("{}: chunk #{}, {} bytes", self.name, seq, len(piece))
                yield AudioChunk(data=piece, sequence=seq)
                seq += 1
                emitted += 1
            yield AudioChunk(data=b"", sequence=seq, is_final=True)
        logger.info("{}: stream complete, {} chunks in {:.0f}ms", self.name, emitted, (time.perf_counter() - _t0) * 1000)
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Piper TTS provider adapter (batch, in-process)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from openspeech.logging_config import logger
|
|
5
|
+
import time
|
|
6
|
+
from collections.abc import AsyncIterator
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from openspeech.core.base import TTSProvider
|
|
11
|
+
|
|
12
|
+
from openspeech.core.enums import AudioFormat, Capability, ExecMode, ProviderType
|
|
13
|
+
from openspeech.core.models import AudioChunk, AudioData, TTSOptions
|
|
14
|
+
from openspeech.core.settings import BaseSettings
|
|
15
|
+
|
|
16
|
+
@dataclass
class PiperTTSSettings(BaseSettings):
    """Settings for the local Piper TTS provider."""

    # Filesystem path to the Piper ONNX voice model (required to start).
    model_path: str = ""
    # Optional path to the voice's JSON config; empty means "next to model".
    config_path: str = ""
    # Run inference on CUDA when available.
    use_cuda: bool = False
    # Piper inference knobs: variability, speaking-rate scale, phoneme noise.
    noise_scale: float = 0.667
    length_scale: float = 1.0
    noise_w: float = 0.8
|
|
24
|
+
|
|
25
|
+
class PiperTTS(TTSProvider):
    """Piper neural TTS provider (local ONNX voice, batch synthesis)."""

    name = "piper"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = PiperTTSSettings
    capabilities = {Capability.BATCH}

    def __init__(self, settings: PiperTTSSettings | None = None) -> None:
        self.settings = settings or PiperTTSSettings()
        # _client aliases _voice; non-None signals "started".
        self._client: Any = None
        # The loaded piper.PiperVoice instance.
        self._voice: Any = None

    async def start(self) -> None:
        """Load the Piper voice model configured in the settings.

        Raises:
            ImportError: if the optional piper-tts dependency is missing.
        """
        try:
            import piper
        except ImportError:
            raise ImportError(
                "Install piper-tts: pip install openspeech[piper]"
            )
        self._voice = piper.PiperVoice.load(
            self.settings.model_path,
            config_path=self.settings.config_path or None,
            use_cuda=self.settings.use_cuda,
        )
        self._client = self._voice
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Release the loaded voice."""
        self._client = None
        self._voice = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Healthy once start() has loaded a voice."""
        return self._client is not None

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Synthesize *text* to 16-bit mono WAV with the loaded Piper voice.

        Raises:
            RuntimeError: if the provider has not been started.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()
        import io
        import wave

        # Collect raw PCM chunks and join once — repeated ``bytes +=`` is
        # quadratic in the number of chunks on long texts.
        pcm_chunks: list[bytes] = []
        for audio_bytes in self._voice.synthesize_stream_raw(
            text,
            length_scale=self.settings.length_scale,
            noise_scale=self.settings.noise_scale,
            noise_w=self.settings.noise_w,
        ):
            pcm_chunks.append(audio_bytes)
        audio_data = b"".join(pcm_chunks)

        # Prefer the model's configured sample rate; fall back to Piper's
        # common default of 22050 Hz when the config is absent.
        sample_rate = 22050
        if hasattr(self._voice, "config") and hasattr(self._voice.config, "sample_rate"):
            sample_rate = self._voice.config.sample_rate

        # Wrap the raw 16-bit mono PCM in a WAV container.
        buf = io.BytesIO()
        with wave.open(buf, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)  # 16-bit samples
            wf.setframerate(sample_rate)
            wf.writeframes(audio_data)
        result = AudioData(
            data=buf.getvalue(),
            sample_rate=sample_rate,
            channels=1,
            format=AudioFormat.WAV,
        )
        logger.info("{}: completed in {:.0f}ms, output={} bytes", self.name, (time.perf_counter() - _t0) * 1000, len(result.data))
        return result

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        """Streaming synthesis is not supported by this provider."""
        raise NotImplementedError(
            "PiperTTS does not support streaming synthesis"
        )
        yield  # pragma: no cover
|