openspeechapi 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openspeech/__init__.py +75 -0
- openspeech/__main__.py +5 -0
- openspeech/cli.py +413 -0
- openspeech/client/__init__.py +4 -0
- openspeech/client/client.py +145 -0
- openspeech/config.py +212 -0
- openspeech/core/__init__.py +0 -0
- openspeech/core/base.py +75 -0
- openspeech/core/enums.py +39 -0
- openspeech/core/models.py +61 -0
- openspeech/core/registry.py +37 -0
- openspeech/core/settings.py +8 -0
- openspeech/demo.py +675 -0
- openspeech/dispatch/__init__.py +0 -0
- openspeech/dispatch/context.py +34 -0
- openspeech/dispatch/dispatcher.py +661 -0
- openspeech/dispatch/executors/__init__.py +0 -0
- openspeech/dispatch/executors/base.py +34 -0
- openspeech/dispatch/executors/in_process.py +66 -0
- openspeech/dispatch/executors/remote.py +64 -0
- openspeech/dispatch/executors/subprocess_exec.py +446 -0
- openspeech/dispatch/fanout.py +95 -0
- openspeech/dispatch/filters.py +73 -0
- openspeech/dispatch/lifecycle.py +178 -0
- openspeech/dispatch/watcher.py +82 -0
- openspeech/engine_catalog.py +236 -0
- openspeech/engine_registry.yaml +347 -0
- openspeech/exceptions.py +51 -0
- openspeech/factory.py +325 -0
- openspeech/local_engines/__init__.py +12 -0
- openspeech/local_engines/aim_resolver.py +91 -0
- openspeech/local_engines/backends/__init__.py +1 -0
- openspeech/local_engines/backends/docker_backend.py +490 -0
- openspeech/local_engines/backends/native_backend.py +902 -0
- openspeech/local_engines/base.py +30 -0
- openspeech/local_engines/engines/__init__.py +1 -0
- openspeech/local_engines/engines/faster_whisper.py +36 -0
- openspeech/local_engines/engines/fish_speech.py +33 -0
- openspeech/local_engines/engines/sherpa_onnx.py +56 -0
- openspeech/local_engines/engines/whisper.py +41 -0
- openspeech/local_engines/engines/whisperlivekit.py +60 -0
- openspeech/local_engines/manager.py +208 -0
- openspeech/local_engines/models.py +50 -0
- openspeech/local_engines/progress.py +69 -0
- openspeech/local_engines/registry.py +19 -0
- openspeech/local_engines/task_store.py +52 -0
- openspeech/local_engines/tasks.py +71 -0
- openspeech/logging_config.py +607 -0
- openspeech/observe/__init__.py +0 -0
- openspeech/observe/base.py +79 -0
- openspeech/observe/debug.py +44 -0
- openspeech/observe/latency.py +19 -0
- openspeech/observe/metrics.py +47 -0
- openspeech/observe/tracing.py +44 -0
- openspeech/observe/usage.py +27 -0
- openspeech/providers/__init__.py +0 -0
- openspeech/providers/_template.py +101 -0
- openspeech/providers/stt/__init__.py +0 -0
- openspeech/providers/stt/alibaba.py +86 -0
- openspeech/providers/stt/assemblyai.py +135 -0
- openspeech/providers/stt/azure_speech.py +99 -0
- openspeech/providers/stt/baidu.py +135 -0
- openspeech/providers/stt/deepgram.py +311 -0
- openspeech/providers/stt/elevenlabs.py +385 -0
- openspeech/providers/stt/faster_whisper.py +211 -0
- openspeech/providers/stt/google_cloud.py +106 -0
- openspeech/providers/stt/iflytek.py +427 -0
- openspeech/providers/stt/macos_speech.py +226 -0
- openspeech/providers/stt/openai.py +84 -0
- openspeech/providers/stt/sherpa_onnx.py +353 -0
- openspeech/providers/stt/tencent.py +212 -0
- openspeech/providers/stt/volcengine.py +107 -0
- openspeech/providers/stt/whisper.py +153 -0
- openspeech/providers/stt/whisperlivekit.py +530 -0
- openspeech/providers/stt/windows_speech.py +249 -0
- openspeech/providers/tts/__init__.py +0 -0
- openspeech/providers/tts/alibaba.py +95 -0
- openspeech/providers/tts/azure_speech.py +123 -0
- openspeech/providers/tts/baidu.py +143 -0
- openspeech/providers/tts/coqui.py +64 -0
- openspeech/providers/tts/cosyvoice.py +90 -0
- openspeech/providers/tts/deepgram.py +174 -0
- openspeech/providers/tts/elevenlabs.py +311 -0
- openspeech/providers/tts/fish_speech.py +158 -0
- openspeech/providers/tts/google_cloud.py +107 -0
- openspeech/providers/tts/iflytek.py +209 -0
- openspeech/providers/tts/macos_say.py +251 -0
- openspeech/providers/tts/minimax.py +122 -0
- openspeech/providers/tts/openai.py +104 -0
- openspeech/providers/tts/piper.py +104 -0
- openspeech/providers/tts/tencent.py +189 -0
- openspeech/providers/tts/volcengine.py +117 -0
- openspeech/providers/tts/windows_sapi.py +234 -0
- openspeech/server/__init__.py +1 -0
- openspeech/server/app.py +72 -0
- openspeech/server/auth.py +42 -0
- openspeech/server/middleware.py +75 -0
- openspeech/server/routes/__init__.py +1 -0
- openspeech/server/routes/management.py +848 -0
- openspeech/server/routes/stt.py +121 -0
- openspeech/server/routes/tts.py +159 -0
- openspeech/server/routes/webui.py +29 -0
- openspeech/server/webui/app.js +2649 -0
- openspeech/server/webui/index.html +216 -0
- openspeech/server/webui/styles.css +617 -0
- openspeech/server/ws/__init__.py +1 -0
- openspeech/server/ws/stt_stream.py +263 -0
- openspeech/server/ws/tts_stream.py +207 -0
- openspeech/telemetry/__init__.py +21 -0
- openspeech/telemetry/perf.py +307 -0
- openspeech/utils/__init__.py +5 -0
- openspeech/utils/audio_converter.py +406 -0
- openspeech/utils/audio_playback.py +156 -0
- openspeech/vendor_registry.yaml +74 -0
- openspeechapi-0.1.0.dist-info/METADATA +101 -0
- openspeechapi-0.1.0.dist-info/RECORD +118 -0
- openspeechapi-0.1.0.dist-info/WHEEL +4 -0
- openspeechapi-0.1.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
"""Fish-Speech TTS provider adapter (HTTP API, in-process)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from collections.abc import AsyncIterator
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
import io
|
|
7
|
+
from openspeech.logging_config import logger
|
|
8
|
+
import time
|
|
9
|
+
from typing import Any
|
|
10
|
+
from urllib.parse import urljoin
|
|
11
|
+
import wave
|
|
12
|
+
|
|
13
|
+
from openspeech.core.base import TTSProvider
|
|
14
|
+
|
|
15
|
+
from openspeech.core.enums import AudioFormat, Capability, ExecMode, ProviderType
|
|
16
|
+
from openspeech.core.models import AudioChunk, AudioData, TTSOptions
|
|
17
|
+
from openspeech.core.settings import BaseSettings
|
|
18
|
+
|
|
19
|
+
@dataclass
class FishSpeechTTSSettings(BaseSettings):
    """Settings consumed by the Fish-Speech TTS provider.

    The connection fields control how the provider reaches the server; the
    generation fields are copied into the JSON body of each synthesis request.
    """

    # --- Connection -------------------------------------------------------
    # Base URL of the Fish-Speech HTTP server.
    api_url: str = "http://localhost:8080"

    # --- Voice cloning (sent only when set) --------------------------------
    # Forwarded as the "reference_audio" request field.
    reference_audio: str | None = None
    # Forwarded as the "reference_id" request field.
    reference_id: str | None = None

    # --- Transport behaviour -----------------------------------------------
    # Timeout handed to the HTTP client the provider creates for itself.
    timeout_s: float = 60.0
    # Extra attempts made after a failed request (0 means a single attempt).
    retries: int = 0
    # Path, relative to api_url, probed by the provider's health check.
    health_path: str = "/health"

    # --- Generation parameters (forwarded in every request body) -----------
    format: str = "wav"
    latency: str = "balanced"
    chunk_length: int = 200
    max_new_tokens: int = 1024
    top_p: float = 0.8
    repetition_penalty: float = 1.1
    temperature: float = 0.8
    use_memory_cache: str = "on"
    normalize: bool = True
|
|
36
|
+
|
|
37
|
+
class FishSpeechTTS(TTSProvider):
    """Text-to-speech provider backed by a Fish-Speech HTTP server.

    Requests are posted as JSON to ``<api_url>/v1/tts`` through an
    ``httpx.AsyncClient``. Call :meth:`start` before :meth:`synthesize`
    and :meth:`stop` when the provider is no longer needed.
    """

    name = "fish-speech"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = FishSpeechTTSSettings
    # NOTE(review): STREAMING is advertised although synthesize_stream() below
    # raises NotImplementedError — confirm whether it should stay listed.
    capabilities = {Capability.STREAMING, Capability.VOICE_CLONE}
    field_options = {
        "format": ["wav", "mp3", "flac", "opus"],
        "latency": ["normal", "balanced"],
    }

    # Values reported when the response body is not a parseable WAV container.
    _DEFAULT_SAMPLE_RATE = 44100
    _DEFAULT_CHANNELS = 1

    def __init__(self, settings: FishSpeechTTSSettings | None = None) -> None:
        """Store the settings; the HTTP client is created lazily in start()."""
        self.settings = settings or FishSpeechTTSSettings()
        self._client: Any = None
        self._owns_client: bool = True

    def set_http_client(self, client) -> None:
        """Inject an externally managed HTTP client.

        An injected client is never closed by :meth:`stop`.
        """
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        """Create the HTTP client unless one was injected.

        Raises:
            ImportError: if ``httpx`` is not installed.
        """
        if self._client is None:
            try:
                import httpx
            except ImportError as exc:
                # Chain the original error so the real import failure stays visible.
                raise ImportError(
                    "Install httpx: pip install openspeech[fish-speech]"
                ) from exc
            # Ignore process-level proxy env vars by default to avoid accidental SOCKS deps.
            self._client = httpx.AsyncClient(timeout=self.settings.timeout_s, trust_env=False)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the HTTP client if this provider created it."""
        if self._client is not None and self._owns_client:
            await self._client.aclose()
            self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Return True when the health endpoint answers with a status below 500.

        Any exception raised while probing is treated as "unhealthy".
        """
        if self._client is None:
            return False
        try:
            url = urljoin(self.settings.api_url.rstrip("/") + "/", self.settings.health_path.lstrip("/"))
            resp = await self._client.get(url)
            return resp.status_code < 500
        except Exception:
            return False

    def _build_payload(self, text: str, opts: TTSOptions) -> dict[str, Any]:
        """Assemble the JSON body for a ``/v1/tts`` request."""
        cfg = self.settings
        payload: dict[str, Any] = {
            "text": text,
            "format": cfg.format,
            "latency": cfg.latency,
            "chunk_length": cfg.chunk_length,
            "max_new_tokens": cfg.max_new_tokens,
            "top_p": cfg.top_p,
            "repetition_penalty": cfg.repetition_penalty,
            "temperature": cfg.temperature,
            "use_memory_cache": cfg.use_memory_cache,
            "normalize": cfg.normalize,
        }
        # Optional fields are only sent when they carry a non-default value.
        if opts.voice:
            payload["voice"] = opts.voice
        if opts.speed and opts.speed != 1.0:
            payload["speed"] = opts.speed
        if cfg.reference_audio:
            payload["reference_audio"] = cfg.reference_audio
        if cfg.reference_id:
            payload["reference_id"] = cfg.reference_id
        return payload

    def _configured_format(self):
        """Map ``settings.format`` onto ``AudioFormat``, defaulting to WAV.

        NOTE(review): assumes AudioFormat can be looked up by the same
        lowercase strings used in ``field_options`` — confirm against
        ``openspeech.core.enums``. Unknown values fall back to WAV.
        """
        try:
            return AudioFormat(self.settings.format)
        except ValueError:
            return AudioFormat.WAV

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Synthesize ``text`` with a single (optionally retried) HTTP request.

        Args:
            text: Text to speak.
            opts: Per-call options; ``voice`` and a non-default ``speed`` are
                forwarded to the server.

        Returns:
            ``AudioData`` wrapping the raw response body. Sample rate, channel
            count and duration are read from the WAV header when the body is
            a WAV file; otherwise the class defaults are reported.

        Raises:
            RuntimeError: if the provider was not started, or every attempt
                failed before a response was received.
            Exception: whatever ``response.raise_for_status()`` raises for an
                error status.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()
        opts = opts or TTSOptions()
        payload = self._build_payload(text, opts)
        # Strip a trailing slash so "http://host/" does not become "http://host//v1/tts".
        url = f"{self.settings.api_url.rstrip('/')}/v1/tts"

        last_exc: Exception | None = None
        response = None
        attempts = max(0, self.settings.retries) + 1
        for _ in range(attempts):
            try:
                response = await self._client.post(url, json=payload)
                break
            except Exception as exc:  # noqa: BLE001
                last_exc = exc
        if response is None:
            raise RuntimeError(f"Fish-Speech request failed: {last_exc}") from last_exc
        response.raise_for_status()

        audio = response.content
        wav_info = self._wav_info(audio)
        if wav_info is not None:
            # A parseable WAV header is authoritative for the audio metadata.
            sample_rate, channels, duration_ms = wav_info
            audio_format = AudioFormat.WAV
        else:
            sample_rate = self._DEFAULT_SAMPLE_RATE
            channels = self._DEFAULT_CHANNELS
            duration_ms = None
            audio_format = self._configured_format()

        result = AudioData(
            data=audio,
            sample_rate=sample_rate,
            channels=channels,
            format=audio_format,
            duration_ms=duration_ms,
        )
        logger.info("{}: completed in {:.0f}ms, output={} bytes", self.name, (time.perf_counter() - _t0) * 1000, len(result.data))
        return result

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        """Streaming synthesis placeholder; always raises NotImplementedError."""
        raise NotImplementedError(
            "FishSpeechTTS.synthesize_stream() is not yet implemented"
        )
        yield  # pragma: no cover

    @staticmethod
    def _wav_info(data: bytes) -> tuple[int, int, int] | None:
        """Return ``(sample_rate, channels, duration_ms)`` read from a WAV header.

        Returns None when ``data`` cannot be parsed as WAV or reports a
        non-positive sample rate.
        """
        try:
            with wave.open(io.BytesIO(data), "rb") as wf:
                sample_rate = wf.getframerate()
                if sample_rate <= 0:
                    return None
                duration_ms = int((wf.getnframes() / sample_rate) * 1000)
                return sample_rate, wf.getnchannels(), duration_ms
        except Exception:
            # Best effort: non-WAV or truncated bodies simply yield no metadata.
            return None

    @staticmethod
    def _wav_duration_ms(data: bytes) -> int | None:
        """Return the duration of WAV ``data`` in milliseconds, or None if unknown."""
        info = FishSpeechTTS._wav_info(data)
        return None if info is None else info[2]
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Google Cloud TTS provider adapter (batch, httpx)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import base64
|
|
5
|
+
from openspeech.logging_config import logger
|
|
6
|
+
import time
|
|
7
|
+
from collections.abc import AsyncIterator
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
import httpx
|
|
12
|
+
|
|
13
|
+
from openspeech.core.base import TTSProvider
|
|
14
|
+
|
|
15
|
+
from openspeech.core.enums import AudioFormat, Capability, ExecMode, ProviderType
|
|
16
|
+
from openspeech.core.models import AudioChunk, AudioData, TTSOptions
|
|
17
|
+
from openspeech.core.settings import BaseSettings
|
|
18
|
+
|
|
19
|
+
@dataclass
class GoogleCloudTTSSettings(BaseSettings):
    """Settings consumed by the Google Cloud TTS provider."""

    # API key used to authenticate requests; empty means "not configured".
    api_key: str = ""
    # Language code sent as the voice's "languageCode".
    language: str = "en-US"
    # Voice used when the caller does not override it per request.
    voice_name: str = "en-US-Standard-A"
    # Speaking rate used when the caller leaves speed at its default.
    speaking_rate: float = 1.0
|
|
25
|
+
|
|
26
|
+
class GoogleCloudTTS(TTSProvider):
    """Batch text-to-speech provider for the Google Cloud Text-to-Speech REST API.

    Each call to :meth:`synthesize` issues one ``text:synthesize`` request
    and returns the decoded LINEAR16 audio. Streaming is not supported.
    """

    name = "google-tts"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.REMOTE
    settings_cls = GoogleCloudTTSSettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}
    field_options = {"language": ["en-US", "zh-CN", "ja-JP", "ko-KR", "es-ES", "fr-FR", "de-DE", "pt-BR"], "voice_name": ["en-US-Standard-A", "en-US-Standard-B", "en-US-Standard-C", "en-US-Standard-D", "en-US-Wavenet-A", "en-US-Wavenet-B", "zh-CN-Standard-A", "zh-CN-Standard-B", "zh-CN-Standard-C", "zh-CN-Standard-D", "zh-CN-Wavenet-A", "ja-JP-Standard-A", "ja-JP-Standard-B"]}

    _ENDPOINT = "https://texttospeech.googleapis.com/v1/text:synthesize"
    # Requested explicitly so the sample rate reported in AudioData matches the audio.
    _SAMPLE_RATE_HZ = 24000

    def __init__(self, settings: GoogleCloudTTSSettings | None = None) -> None:
        """Store the settings; the HTTP client is created in start()."""
        self.settings = settings or GoogleCloudTTSSettings()
        self._client: httpx.AsyncClient | None = None
        self._owns_client: bool = True

    def set_http_client(self, client) -> None:
        """Inject an externally managed HTTP client (never closed by stop())."""
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        """Create the HTTP client unless one was injected."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=60.0)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the HTTP client if this provider created it."""
        if self._client is not None and self._owns_client:
            await self._client.aclose()
            self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Report whether an API key is configured (no network call is made)."""
        return bool(self.settings.api_key)

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Synthesize ``text`` and return the decoded audio.

        Args:
            text: Text to speak.
            opts: Per-call options; ``voice`` overrides the configured voice
                and a non-default ``speed`` overrides the configured rate.

        Returns:
            ``AudioData`` holding the base64-decoded ``audioContent``.

        Raises:
            RuntimeError: if the provider was not started, the API answers
                with a non-200 status, or the response carries no audio.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()
        opts = opts or TTSOptions()
        voice_name = opts.voice or self.settings.voice_name
        # NOTE(review): languageCode always comes from settings, even when
        # opts.voice names a voice of another locale — confirm intended.
        language = self.settings.language
        # A missing or zero speed falls back to the configured rate rather
        # than being sent to the API as null/0.
        if opts.speed and opts.speed != 1.0:
            speaking_rate = opts.speed
        else:
            speaking_rate = self.settings.speaking_rate

        body = {
            "input": {"text": text},
            "voice": {
                "languageCode": language,
                "name": voice_name,
            },
            "audioConfig": {
                "audioEncoding": "LINEAR16",
                "speakingRate": speaking_rate,
                "sampleRateHertz": self._SAMPLE_RATE_HZ,
            },
        }
        # The key travels in a header instead of the query string so it does
        # not show up wherever request URLs are logged.
        response = await self._client.post(
            self._ENDPOINT,
            json=body,
            headers={"X-Goog-Api-Key": self.settings.api_key},
        )
        if response.status_code != 200:
            raise RuntimeError(
                f"Google Cloud TTS API error {response.status_code}: {response.text}"
            )
        audio_b64 = response.json().get("audioContent")
        if not audio_b64:
            raise RuntimeError("Google Cloud TTS API response contained no audioContent")
        result = AudioData(
            data=base64.b64decode(audio_b64),
            sample_rate=self._SAMPLE_RATE_HZ,
            channels=1,
            format=AudioFormat.WAV,
        )
        logger.info("{}: completed in {:.0f}ms, output={} bytes", self.name, (time.perf_counter() - _t0) * 1000, len(result.data))
        return result

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        """Streaming is unsupported for this provider; always raises NotImplementedError."""
        raise NotImplementedError(
            "Google Cloud TTS batch provider does not support streaming output"
        )
        yield  # pragma: no cover
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
"""iFlytek (讯飞) TTS provider adapter — WebSocket-based."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import base64
|
|
5
|
+
import hashlib
|
|
6
|
+
import hmac
|
|
7
|
+
import json
|
|
8
|
+
from openspeech.logging_config import logger
|
|
9
|
+
import urllib.parse
|
|
10
|
+
from collections.abc import AsyncIterator
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from datetime import datetime, timezone
|
|
13
|
+
from email.utils import formatdate
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
import httpx
|
|
17
|
+
import websockets
|
|
18
|
+
|
|
19
|
+
from openspeech.core.base import TTSProvider
|
|
20
|
+
from openspeech.core.enums import Capability, ExecMode, ProviderType
|
|
21
|
+
from openspeech.core.models import AudioChunk, AudioData, TTSOptions
|
|
22
|
+
from openspeech.core.settings import BaseSettings
|
|
23
|
+
|
|
24
|
+
@dataclass
class IflytekTTSSettings(BaseSettings):
    """Settings consumed by the iFlytek TTS provider."""

    # --- Credentials (all three must be non-empty for a passing health check) ---
    app_id: str = ""
    api_key: str = ""
    api_secret: str = ""

    # --- Synthesis defaults ---
    # Voice identifier sent as "vcn" when the caller does not override it.
    voice: str = "xiaoyan"
    # Value sent as the "speed" business parameter when the caller keeps the default speed.
    speed: int = 50
|
|
31
|
+
|
|
32
|
+
class IflytekTTS(TTSProvider):
    """iFlytek text-to-speech provider using the signed WebSocket API.

    :meth:`synthesize_stream` opens one WebSocket per request and yields MP3
    chunks as they arrive; :meth:`synthesize` collects those chunks into a
    single ``AudioData``.
    """

    name = "iflytek-tts"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = IflytekTTSSettings
    capabilities = {Capability.BATCH, Capability.STREAMING, Capability.MULTILINGUAL}

    # iFlytek voice catalog — (vcn, display name, language, group)
    _VOICES = [
        # Chinese
        ("xiaoyan", "小燕 — 中文女声", "zh_cn", "中文"),
        ("aisjiuxu", "许久 — 中文男声", "zh_cn", "中文"),
        ("aisxping", "小萍 — 中文女声", "zh_cn", "中文"),
        ("aisjinger", "小婧 — 中文女声", "zh_cn", "中文"),
        ("aisbabyxu", "许小宝 — 童声", "zh_cn", "中文"),
        ("x4_lingxiaolu_em", "凌小路 — 情感男声", "zh_cn", "中文"),
        ("x4_lingfeizhe_em", "凌飞哲 — 情感男声", "zh_cn", "中文"),
        ("xiaoyu", "小宇 — 中文男声", "zh_cn", "中文"),
        ("xiaoqi", "小琪 — 中文女声", "zh_cn", "中文"),
        ("xiaofeng", "小峰 — 中文男声", "zh_cn", "中文"),
        ("xiaomei", "小梅 — 粤语女声", "zh_cn", "粤语"),
        ("xiaolin", "小林 — 台湾女声", "zh_cn", "中文"),
        ("xiaorong", "小蓉 — 四川女声", "zh_cn", "方言"),
        ("xiaoqian", "小芊 — 东北女声", "zh_cn", "方言"),
        # English
        ("catherine", "Catherine — English Female", "en_us", "English"),
        ("john", "John — English Male", "en_us", "English"),
        ("laura", "Laura — English Female", "en_us", "English"),
        # Japanese
        ("yuka", "Yuka — Japanese Female", "ja_jp", "日本語"),
        # Korean
        ("xiaoqiukor", "小秋 — Korean Female", "ko_kr", "한국어"),
    ]

    # Selectable voices are derived from the catalog so the two cannot drift apart.
    field_options = {"voice": [entry[0] for entry in _VOICES]}

    _WS_HOST = "tts-api.xfyun.cn"
    _WS_PATH = "/v2/tts"

    # Bounds applied to the "speed" business parameter (see _build_ws_message).
    _SPEED_MIN = 0
    _SPEED_MAX = 100

    def __init__(self, settings: IflytekTTSSettings | None = None) -> None:
        """Store the settings; the HTTP client is created in start()."""
        self.settings = settings or IflytekTTSSettings()
        self._client: httpx.AsyncClient | None = None
        self._owns_client: bool = True

    def set_http_client(self, client) -> None:
        """Inject an externally managed HTTP client (never closed by stop())."""
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        """Create the HTTP client unless one was injected."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=60.0)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the HTTP client if this provider created it."""
        if self._client is not None and self._owns_client:
            await self._client.aclose()
            self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Report whether all three credentials are configured (no network call)."""
        return bool(self.settings.app_id) and bool(self.settings.api_key) and bool(self.settings.api_secret)

    def _build_auth_url(self) -> str:
        """Build HMAC-SHA256 signed WebSocket URL."""
        now = datetime.now(tz=timezone.utc)
        date = formatdate(timeval=now.timestamp(), localtime=False, usegmt=True)

        # The signature covers the host, the date and the request line.
        signature_origin = (
            f"host: {self._WS_HOST}\n"
            f"date: {date}\n"
            f"GET {self._WS_PATH} HTTP/1.1"
        )
        signature_sha = hmac.new(
            self.settings.api_secret.encode("utf-8"),
            signature_origin.encode("utf-8"),
            hashlib.sha256,
        ).digest()
        signature = base64.b64encode(signature_sha).decode("utf-8")

        authorization_origin = (
            f'api_key="{self.settings.api_key}", '
            f'algorithm="hmac-sha256", '
            f'headers="host date request-line", '
            f'signature="{signature}"'
        )
        authorization = base64.b64encode(
            authorization_origin.encode("utf-8")
        ).decode("utf-8")

        params = urllib.parse.urlencode(
            {"authorization": authorization, "date": date, "host": self._WS_HOST}
        )
        return f"wss://{self._WS_HOST}{self._WS_PATH}?{params}"

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Batch synthesize by collecting all stream chunks.

        Raises:
            RuntimeError: propagated from :meth:`synthesize_stream`.
        """
        parts: list[bytes] = [
            chunk.data async for chunk in self.synthesize_stream(text, opts)
        ]
        audio_bytes = b"".join(parts)
        logger.info("iFlytek TTS: {} chunks, {} MP3 bytes total", len(parts), len(audio_bytes))
        return AudioData(
            data=audio_bytes,
            sample_rate=16000,
            channels=1,
            format="mp3",
        )

    def _build_ws_message(self, text: str, voice: str | None = None, speed: float | None = None) -> dict:
        """Build the WebSocket request message.

        Args:
            text: Text to speak; sent base64-encoded.
            voice: Voice identifier; defaults to ``settings.voice``.
            speed: Relative speed where 1.0 is normal. A missing or 1.0 value
                selects ``settings.speed``; anything else is scaled by 50 and
                clamped to ``[_SPEED_MIN, _SPEED_MAX]``.
        """
        vcn = voice or self.settings.voice
        if speed and speed != 1.0:
            # Clamp so large multipliers (e.g. 3.0 -> 150) stay within bounds.
            spd = max(self._SPEED_MIN, min(self._SPEED_MAX, int(speed * 50)))
        else:
            spd = self.settings.speed
        text_b64 = base64.b64encode(text.encode("utf-8")).decode("utf-8")
        return {
            "common": {"app_id": self.settings.app_id},
            "business": {
                "aue": "lame",
                "sfl": 1,
                "vcn": vcn,
                "speed": spd,
                "tte": "UTF8",
            },
            "data": {
                "status": 2,
                "text": text_b64,
            },
        }

    async def list_voices(self) -> list[dict]:
        """Return available iFlytek voices."""
        return [
            {"name": vcn, "description": desc, "language": lang, "group": group}
            for vcn, desc, lang, group in self._VOICES
        ]

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        """Stream MP3 audio chunks as they arrive from iFlytek WebSocket.

        Raises:
            RuntimeError: if the provider was not started, or the server
                returns a frame whose ``code`` is not 0.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        opts = opts or TTSOptions()

        voice = opts.voice or self.settings.voice
        speed = opts.speed if opts.speed != 1.0 else None
        logger.info("iFlytek stream: voice={}, speed={}, text={} chars", voice, speed, len(text))

        url = self._build_auth_url()
        msg = self._build_ws_message(text, voice=voice, speed=speed)
        seq = 0

        async with websockets.connect(url) as ws:
            await ws.send(json.dumps(msg))

            async for message in ws:
                resp = json.loads(message)
                code = resp.get("code", -1)
                if code != 0:
                    raise RuntimeError(
                        f"iFlytek TTS error [{code}]: {resp.get('message', 'unknown')}"
                    )

                # "data" may be absent or null; treat both as an empty frame.
                data = resp.get("data") or {}
                audio_b64 = data.get("audio") or ""
                is_final = data.get("status") == 2

                if audio_b64:
                    chunk_bytes = base64.b64decode(audio_b64)
                    yield AudioChunk(data=chunk_bytes, sequence=seq, is_final=is_final)
                    seq += 1

                if is_final:
                    break
|