openspeechapi 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openspeech/__init__.py +75 -0
- openspeech/__main__.py +5 -0
- openspeech/cli.py +413 -0
- openspeech/client/__init__.py +4 -0
- openspeech/client/client.py +145 -0
- openspeech/config.py +212 -0
- openspeech/core/__init__.py +0 -0
- openspeech/core/base.py +75 -0
- openspeech/core/enums.py +39 -0
- openspeech/core/models.py +61 -0
- openspeech/core/registry.py +37 -0
- openspeech/core/settings.py +8 -0
- openspeech/demo.py +675 -0
- openspeech/dispatch/__init__.py +0 -0
- openspeech/dispatch/context.py +34 -0
- openspeech/dispatch/dispatcher.py +661 -0
- openspeech/dispatch/executors/__init__.py +0 -0
- openspeech/dispatch/executors/base.py +34 -0
- openspeech/dispatch/executors/in_process.py +66 -0
- openspeech/dispatch/executors/remote.py +64 -0
- openspeech/dispatch/executors/subprocess_exec.py +446 -0
- openspeech/dispatch/fanout.py +95 -0
- openspeech/dispatch/filters.py +73 -0
- openspeech/dispatch/lifecycle.py +178 -0
- openspeech/dispatch/watcher.py +82 -0
- openspeech/engine_catalog.py +236 -0
- openspeech/engine_registry.yaml +347 -0
- openspeech/exceptions.py +51 -0
- openspeech/factory.py +325 -0
- openspeech/local_engines/__init__.py +12 -0
- openspeech/local_engines/aim_resolver.py +91 -0
- openspeech/local_engines/backends/__init__.py +1 -0
- openspeech/local_engines/backends/docker_backend.py +490 -0
- openspeech/local_engines/backends/native_backend.py +902 -0
- openspeech/local_engines/base.py +30 -0
- openspeech/local_engines/engines/__init__.py +1 -0
- openspeech/local_engines/engines/faster_whisper.py +36 -0
- openspeech/local_engines/engines/fish_speech.py +33 -0
- openspeech/local_engines/engines/sherpa_onnx.py +56 -0
- openspeech/local_engines/engines/whisper.py +41 -0
- openspeech/local_engines/engines/whisperlivekit.py +60 -0
- openspeech/local_engines/manager.py +208 -0
- openspeech/local_engines/models.py +50 -0
- openspeech/local_engines/progress.py +69 -0
- openspeech/local_engines/registry.py +19 -0
- openspeech/local_engines/task_store.py +52 -0
- openspeech/local_engines/tasks.py +71 -0
- openspeech/logging_config.py +607 -0
- openspeech/observe/__init__.py +0 -0
- openspeech/observe/base.py +79 -0
- openspeech/observe/debug.py +44 -0
- openspeech/observe/latency.py +19 -0
- openspeech/observe/metrics.py +47 -0
- openspeech/observe/tracing.py +44 -0
- openspeech/observe/usage.py +27 -0
- openspeech/providers/__init__.py +0 -0
- openspeech/providers/_template.py +101 -0
- openspeech/providers/stt/__init__.py +0 -0
- openspeech/providers/stt/alibaba.py +86 -0
- openspeech/providers/stt/assemblyai.py +135 -0
- openspeech/providers/stt/azure_speech.py +99 -0
- openspeech/providers/stt/baidu.py +135 -0
- openspeech/providers/stt/deepgram.py +311 -0
- openspeech/providers/stt/elevenlabs.py +385 -0
- openspeech/providers/stt/faster_whisper.py +211 -0
- openspeech/providers/stt/google_cloud.py +106 -0
- openspeech/providers/stt/iflytek.py +427 -0
- openspeech/providers/stt/macos_speech.py +226 -0
- openspeech/providers/stt/openai.py +84 -0
- openspeech/providers/stt/sherpa_onnx.py +353 -0
- openspeech/providers/stt/tencent.py +212 -0
- openspeech/providers/stt/volcengine.py +107 -0
- openspeech/providers/stt/whisper.py +153 -0
- openspeech/providers/stt/whisperlivekit.py +530 -0
- openspeech/providers/stt/windows_speech.py +249 -0
- openspeech/providers/tts/__init__.py +0 -0
- openspeech/providers/tts/alibaba.py +95 -0
- openspeech/providers/tts/azure_speech.py +123 -0
- openspeech/providers/tts/baidu.py +143 -0
- openspeech/providers/tts/coqui.py +64 -0
- openspeech/providers/tts/cosyvoice.py +90 -0
- openspeech/providers/tts/deepgram.py +174 -0
- openspeech/providers/tts/elevenlabs.py +311 -0
- openspeech/providers/tts/fish_speech.py +158 -0
- openspeech/providers/tts/google_cloud.py +107 -0
- openspeech/providers/tts/iflytek.py +209 -0
- openspeech/providers/tts/macos_say.py +251 -0
- openspeech/providers/tts/minimax.py +122 -0
- openspeech/providers/tts/openai.py +104 -0
- openspeech/providers/tts/piper.py +104 -0
- openspeech/providers/tts/tencent.py +189 -0
- openspeech/providers/tts/volcengine.py +117 -0
- openspeech/providers/tts/windows_sapi.py +234 -0
- openspeech/server/__init__.py +1 -0
- openspeech/server/app.py +72 -0
- openspeech/server/auth.py +42 -0
- openspeech/server/middleware.py +75 -0
- openspeech/server/routes/__init__.py +1 -0
- openspeech/server/routes/management.py +848 -0
- openspeech/server/routes/stt.py +121 -0
- openspeech/server/routes/tts.py +159 -0
- openspeech/server/routes/webui.py +29 -0
- openspeech/server/webui/app.js +2649 -0
- openspeech/server/webui/index.html +216 -0
- openspeech/server/webui/styles.css +617 -0
- openspeech/server/ws/__init__.py +1 -0
- openspeech/server/ws/stt_stream.py +263 -0
- openspeech/server/ws/tts_stream.py +207 -0
- openspeech/telemetry/__init__.py +21 -0
- openspeech/telemetry/perf.py +307 -0
- openspeech/utils/__init__.py +5 -0
- openspeech/utils/audio_converter.py +406 -0
- openspeech/utils/audio_playback.py +156 -0
- openspeech/vendor_registry.yaml +74 -0
- openspeechapi-0.1.0.dist-info/METADATA +101 -0
- openspeechapi-0.1.0.dist-info/RECORD +118 -0
- openspeechapi-0.1.0.dist-info/WHEEL +4 -0
- openspeechapi-0.1.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"""Tencent Cloud TTS provider adapter (TC3-HMAC-SHA256 signed)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import base64
|
|
5
|
+
import hashlib
|
|
6
|
+
import hmac
|
|
7
|
+
import json
|
|
8
|
+
from openspeech.logging_config import logger
|
|
9
|
+
import time
|
|
10
|
+
import uuid
|
|
11
|
+
from collections.abc import AsyncIterator
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from datetime import datetime, timezone
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
import httpx
|
|
17
|
+
|
|
18
|
+
from openspeech.core.base import TTSProvider
|
|
19
|
+
|
|
20
|
+
from openspeech.core.enums import Capability, ExecMode, ProviderType
|
|
21
|
+
from openspeech.core.models import AudioChunk, AudioData, TTSOptions
|
|
22
|
+
from openspeech.core.settings import BaseSettings
|
|
23
|
+
|
|
24
|
+
@dataclass
class TencentTTSSettings(BaseSettings):
    """Configuration values consumed by the Tencent Cloud TTS provider."""

    # Credential pair used to sign requests; both default to empty strings.
    secret_id: str = ""
    secret_key: str = ""
    # Numeric voice identifier sent as ``VoiceType`` in the request payload.
    voice_type: int = 1001
    # Region name sent in the ``X-TC-Region`` request header.
    region: str = "ap-guangzhou"
|
|
30
|
+
|
|
31
|
+
class TencentTTS(TTSProvider):
    """Tencent Cloud TTS adapter built on the ``TextToVoice`` API action.

    Requests are signed with TC3-HMAC-SHA256 and posted through an
    ``httpx.AsyncClient`` that is either created by :meth:`start` or injected
    with :meth:`set_http_client`. Only batch synthesis is implemented;
    :meth:`synthesize_stream` raises ``NotImplementedError``.
    """

    name = "tencent-tts"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = TencentTTSSettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}
    field_options = {
        "voice_type": [
            1001, 1002, 1003, 1004, 1005, 1007, 1008, 1009, 1010,
            1017, 1018, 101001, 101002,
        ],
        "region": ["ap-guangzhou", "ap-shanghai", "ap-beijing"],
    }

    _SERVICE = "tts"
    _HOST = "tts.tencentcloudapi.com"

    def __init__(self, settings: TencentTTSSettings | None = None) -> None:
        """Store *settings* (or defaults); no HTTP client exists until start."""
        self.settings = settings or TencentTTSSettings()
        self._client: httpx.AsyncClient | None = None
        self._owns_client: bool = True

    def set_http_client(self, client: httpx.AsyncClient) -> None:
        """Use an externally managed HTTP client; it is never closed here."""
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        """Create a private HTTP client unless one was injected beforehand."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=60.0)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the client if this provider created it, then forget it.

        An injected client is left open for its owner; in both cases the
        reference is dropped so that :meth:`synthesize` reports the provider
        as not started afterwards.
        """
        if self._client is not None and self._owns_client:
            await self._client.aclose()
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Return ``True`` when both credentials are configured (no network)."""
        return bool(self.settings.secret_id) and bool(self.settings.secret_key)

    # ---- TC3-HMAC-SHA256 signing ------------------------------------------------

    @staticmethod
    def _hmac_sha256(key: bytes, message: str) -> hmac.HMAC:
        """Return an HMAC-SHA256 object for the UTF-8 encoding of *message*."""
        return hmac.new(key, message.encode("utf-8"), hashlib.sha256)

    def _sign_request(
        self, action: str, version: str, payload_json: str
    ) -> dict[str, str]:
        """Build Tencent Cloud API v3 signed headers.

        Args:
            action: API action name, sent as ``X-TC-Action``.
            version: API version string, sent as ``X-TC-Version``.
            payload_json: Exact request body; its SHA-256 digest is signed, so
                the same string must be sent as the body.

        Returns:
            The complete header mapping for the request, including
            ``Authorization``.
        """
        timestamp = int(time.time())
        # The credential scope uses the UTC calendar date of the timestamp.
        date = datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime(
            "%Y-%m-%d"
        )

        # 1. Canonical request
        http_method = "POST"
        canonical_uri = "/"
        canonical_querystring = ""
        ct = "application/json; charset=utf-8"
        canonical_headers = (
            f"content-type:{ct}\nhost:{self._HOST}\nx-tc-action:{action.lower()}\n"
        )
        signed_headers = "content-type;host;x-tc-action"
        hashed_payload = hashlib.sha256(payload_json.encode("utf-8")).hexdigest()
        canonical_request = (
            f"{http_method}\n{canonical_uri}\n{canonical_querystring}\n"
            f"{canonical_headers}\n{signed_headers}\n{hashed_payload}"
        )

        # 2. String to sign
        algorithm = "TC3-HMAC-SHA256"
        credential_scope = f"{date}/{self._SERVICE}/tc3_request"
        hashed_canonical = hashlib.sha256(
            canonical_request.encode("utf-8")
        ).hexdigest()
        string_to_sign = (
            f"{algorithm}\n{timestamp}\n{credential_scope}\n{hashed_canonical}"
        )

        # 3. Signing key: chained HMACs over date, service and a fixed suffix
        secret_date = self._hmac_sha256(
            ("TC3" + self.settings.secret_key).encode("utf-8"), date
        ).digest()
        secret_service = self._hmac_sha256(secret_date, self._SERVICE).digest()
        secret_signing = self._hmac_sha256(secret_service, "tc3_request").digest()

        # 4. Signature
        signature = self._hmac_sha256(secret_signing, string_to_sign).hexdigest()

        authorization = (
            f"{algorithm} Credential={self.settings.secret_id}/{credential_scope}, "
            f"SignedHeaders={signed_headers}, Signature={signature}"
        )

        return {
            "Authorization": authorization,
            "Content-Type": ct,
            "Host": self._HOST,
            "X-TC-Action": action,
            "X-TC-Version": version,
            "X-TC-Timestamp": str(timestamp),
            "X-TC-Region": self.settings.region,
        }

    # ---- API call ----------------------------------------------------------------

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Synthesize *text* with a single ``TextToVoice`` request.

        Args:
            text: Text to speak.
            opts: Optional synthesis options; only ``output_format`` is read.

        Returns:
            The decoded audio, labelled 16 kHz mono.

        Raises:
            RuntimeError: If the provider is not started, the API returns an
                ``Error`` object, or the response carries no audio.
            httpx.HTTPStatusError: On a non-2xx HTTP status.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()
        opts = opts or TTSOptions()

        payload = {
            "Text": text,
            "SessionId": uuid.uuid4().hex,
            "VoiceType": self.settings.voice_type,
            "Codec": "wav",
            "Volume": 0,
            "Speed": 0,
        }
        # Serialize once: the signature covers exactly these bytes.
        payload_json = json.dumps(payload)
        headers = self._sign_request("TextToVoice", "2019-08-23", payload_json)

        resp = await self._client.post(
            f"https://{self._HOST}",
            content=payload_json,
            headers=headers,
        )
        resp.raise_for_status()

        response = resp.json().get("Response", {})
        if "Error" in response:
            err = response["Error"]
            raise RuntimeError(
                f"Tencent TTS error [{err.get('Code')}]: {err.get('Message')}"
            )

        audio_b64 = response.get("Audio", "")
        if not audio_b64:
            # Without this guard a malformed reply became a silent 0-byte clip.
            raise RuntimeError("Tencent TTS error: response contained no audio data")
        audio_bytes = base64.b64decode(audio_b64)

        # NOTE(review): the request always asks for Codec "wav" while the
        # result is labelled with opts.output_format — confirm they agree.
        audio = AudioData(
            data=audio_bytes,
            sample_rate=16000,
            channels=1,
            format=opts.output_format,
        )
        logger.info(
            "{}: completed in {:.0f}ms, output={} bytes",
            self.name,
            (time.perf_counter() - _t0) * 1000,
            len(audio.data),
        )
        return audio

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        """Streaming is unsupported; iterating raises ``NotImplementedError``."""
        raise NotImplementedError("Tencent TTS streaming not implemented")
        yield  # noqa: unreachable — makes this an async generator
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""Volcengine (ByteDance) TTS provider adapter."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import base64
|
|
5
|
+
from openspeech.logging_config import logger
|
|
6
|
+
import time
|
|
7
|
+
from collections.abc import AsyncIterator
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
import httpx
|
|
12
|
+
|
|
13
|
+
from openspeech.core.base import TTSProvider
|
|
14
|
+
|
|
15
|
+
from openspeech.core.enums import Capability, ExecMode, ProviderType
|
|
16
|
+
from openspeech.core.models import AudioChunk, AudioData, TTSOptions
|
|
17
|
+
from openspeech.core.settings import BaseSettings
|
|
18
|
+
|
|
19
|
+
@dataclass
class VolcengineTTSSettings(BaseSettings):
    """Configuration values consumed by the Volcengine TTS provider."""

    # Token placed in the ``Authorization`` request header.
    access_token: str = ""
    # Application id sent as ``app.appid`` in the request body.
    app_id: str = ""
    # Cluster name sent as ``app.cluster`` in the request body.
    cluster: str = "volcano_tts"
    # Voice identifier sent as ``audio.voice_type`` in the request body.
    voice_type: str = "BV001_streaming"
|
|
25
|
+
|
|
26
|
+
class VolcengineTTS(TTSProvider):
    """Volcengine (ByteDance) TTS adapter using the HTTP ``/api/v1/tts`` endpoint.

    Requests go through an ``httpx.AsyncClient`` that is either created by
    :meth:`start` or injected with :meth:`set_http_client`. Only batch
    synthesis is implemented; :meth:`synthesize_stream` raises
    ``NotImplementedError``.
    """

    name = "volcengine-tts"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = VolcengineTTSSettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}
    field_options = {
        "voice_type": [
            "BV001_streaming",
            "BV002_streaming",
            "BV700_streaming",
            "BV406_streaming",
            "BV407_streaming",
        ]
    }

    # Response codes treated as success. 3000 is the service's documented
    # "request OK" code; 0 is kept for backward compatibility with the
    # previous check. NOTE(review): confirm against the current API docs.
    _SUCCESS_CODES = (0, 3000)

    def __init__(self, settings: VolcengineTTSSettings | None = None) -> None:
        """Store *settings* (or defaults); no HTTP client exists until start."""
        self.settings = settings or VolcengineTTSSettings()
        self._client: httpx.AsyncClient | None = None
        self._owns_client: bool = True

    def set_http_client(self, client: httpx.AsyncClient) -> None:
        """Use an externally managed HTTP client; it is never closed here."""
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        """Create a private HTTP client unless one was injected beforehand."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=60.0)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the client if this provider created it, then forget it.

        An injected client is left open for its owner; in both cases the
        reference is dropped so that :meth:`synthesize` reports the provider
        as not started afterwards.
        """
        if self._client is not None and self._owns_client:
            await self._client.aclose()
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Return ``True`` when token and app id are configured (no network)."""
        return bool(self.settings.access_token) and bool(self.settings.app_id)

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Synthesize *text* with a single ``query`` request.

        Args:
            text: Text to speak.
            opts: Optional synthesis options; ``speed`` and ``output_format``
                are read.

        Returns:
            The decoded audio, labelled 24 kHz mono.

        Raises:
            RuntimeError: If the provider is not started, the service reports
                a non-success code, or the response carries no audio.
            httpx.HTTPStatusError: On a non-2xx HTTP status.
        """
        # Local import: this module does not import uuid at file level.
        import uuid

        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()
        opts = opts or TTSOptions()

        payload = {
            "app": {
                "appid": self.settings.app_id,
                # The API requires a non-empty ``token`` field here; real
                # authentication is done by the Authorization header, so a
                # placeholder is sent. NOTE(review): confirm with API docs.
                "token": "access_token",
                "cluster": self.settings.cluster,
            },
            "user": {"uid": "openspeech"},
            "audio": {
                "voice_type": self.settings.voice_type,
                "encoding": "wav",
                "speed_ratio": opts.speed,
            },
            "request": {
                # Each call must carry its own unique request id.
                "reqid": uuid.uuid4().hex,
                "text": text,
                "operation": "query",
            },
        }
        headers = {
            # The semicolon (not a space) after "Bearer" is intentional.
            "Authorization": f"Bearer;{self.settings.access_token}",
            "Content-Type": "application/json",
        }

        resp = await self._client.post(
            "https://openspeech.bytedance.com/api/v1/tts",
            json=payload,
            headers=headers,
        )
        resp.raise_for_status()
        body = resp.json()

        code = body.get("code")
        if code is not None and code not in self._SUCCESS_CODES:
            raise RuntimeError(
                f"Volcengine TTS error: {body.get('message', 'unknown error')}"
            )

        audio_b64 = body.get("data", "")
        if not audio_b64:
            # Without this guard a malformed reply became a silent 0-byte clip.
            raise RuntimeError(
                "Volcengine TTS error: response contained no audio data"
            )
        audio_bytes = base64.b64decode(audio_b64)

        # NOTE(review): the request always asks for encoding "wav" while the
        # result is labelled with opts.output_format — confirm they agree.
        audio = AudioData(
            data=audio_bytes,
            sample_rate=24000,
            channels=1,
            format=opts.output_format,
        )
        logger.info(
            "{}: completed in {:.0f}ms, output={} bytes",
            self.name,
            (time.perf_counter() - _t0) * 1000,
            len(audio.data),
        )
        return audio

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        """Streaming is unsupported; iterating raises ``NotImplementedError``."""
        raise NotImplementedError("Volcengine TTS streaming not implemented")
        yield  # noqa: unreachable — makes this an async generator
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
"""Windows native TTS provider using SAPI5 via ``pyttsx3``."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import asyncio
|
|
5
|
+
import io
|
|
6
|
+
from openspeech.logging_config import logger
|
|
7
|
+
import os
|
|
8
|
+
import sys
|
|
9
|
+
import tempfile
|
|
10
|
+
import time
|
|
11
|
+
import wave
|
|
12
|
+
from collections.abc import AsyncIterator
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
from openspeech.core.base import TTSProvider
|
|
17
|
+
|
|
18
|
+
from openspeech.core.enums import AudioFormat, Capability, ExecMode, ProviderType
|
|
19
|
+
from openspeech.core.models import AudioChunk, AudioData, TTSOptions
|
|
20
|
+
from openspeech.core.settings import BaseSettings
|
|
21
|
+
|
|
22
|
+
# Maps a lowercase language prefix (e.g. "en") to the human-readable group
# label shown for voices of that language (same table as macos_say).
_LANG_GROUP_MAP: dict[str, str] = dict(
    [
        ("zh", "中文"),
        ("en", "English"),
        ("ja", "日本語"),
        ("ko", "한국어"),
        ("fr", "Français"),
        ("de", "Deutsch"),
        ("es", "Español"),
        ("it", "Italiano"),
        ("pt", "Português"),
        ("ru", "Русский"),
        ("ar", "العربية"),
    ]
)
|
|
36
|
+
|
|
37
|
+
@dataclass
class WindowsSapiSettings(BaseSettings):
    """Configuration values consumed by the Windows SAPI5 TTS provider."""

    # Voice name substring to select; an empty string keeps the system default.
    default_voice: str = ""
    # Base pyttsx3 speaking rate in words per minute.
    default_rate: int = 200
|
|
41
|
+
|
|
42
|
+
class WindowsSapiTTS(TTSProvider):
    """Windows native TTS via SAPI5 (pyttsx3).

    pyttsx3 is synchronous, so each engine interaction runs in a worker
    thread through ``asyncio.to_thread`` with a freshly created engine.
    """

    name = "windows-tts"
    provider_type = ProviderType.TTS
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = WindowsSapiSettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}

    def __init__(self, settings: WindowsSapiSettings | None = None) -> None:
        """Store *settings* (or defaults); the provider starts unavailable."""
        self.settings = settings or WindowsSapiSettings()
        self._available: bool = False
        self._voices_cache: list[dict] | None = None

    # -- lifecycle ------------------------------------------------------------

    async def start(self) -> None:
        """Verify the platform, pyttsx3 and SAPI5, then mark the provider ready.

        Raises:
            RuntimeError: When not running on Windows or pyttsx3 is missing.
        """
        if sys.platform != "win32":
            raise RuntimeError(
                "Windows TTS (SAPI5) is only available on Windows"
            )
        try:
            import pyttsx3  # noqa: F401
        except ImportError as exc:
            # Chain explicitly so the original import failure stays visible.
            raise RuntimeError(
                "pyttsx3 is required for Windows TTS. "
                "Install with: pip install pyttsx3"
            ) from exc
        # Quick validation: ensure engine can be created
        await asyncio.to_thread(self._validate_engine)
        self._available = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Mark the provider unavailable and drop the cached voice list."""
        self._available = False
        self._voices_cache = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Return ``True`` if started, or if pyttsx3 is importable on Windows."""
        if self._available:
            return True
        if sys.platform != "win32":
            return False
        try:
            import pyttsx3  # noqa: F401
        except ImportError:
            return False
        return True

    # -- synthesis ------------------------------------------------------------

    async def synthesize(
        self, text: str, opts: TTSOptions | None = None
    ) -> AudioData:
        """Synthesize *text* to WAV audio.

        Args:
            text: Text to speak.
            opts: Optional synthesis options; ``voice`` overrides the default
                voice and ``speed`` scales the configured rate.

        Returns:
            WAV audio with sample rate, channel count and duration taken from
            the generated WAV header.

        Raises:
            RuntimeError: If the provider has not been started.
        """
        if not self._available:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()

        opts = opts or TTSOptions()
        voice_hint = opts.voice or self.settings.default_voice
        rate = int(self.settings.default_rate * opts.speed)

        wav_bytes = await asyncio.to_thread(
            self._synthesize_sync, text, voice_hint, rate,
        )

        # Parse WAV to extract real sample_rate/channels/duration
        with wave.open(io.BytesIO(wav_bytes), "rb") as wf:
            sample_rate = wf.getframerate()
            channels = wf.getnchannels()
            n_frames = wf.getnframes()
        duration_ms = int(n_frames / sample_rate * 1000) if sample_rate else 0

        result = AudioData(
            data=wav_bytes,
            sample_rate=sample_rate,
            channels=channels,
            format=AudioFormat.WAV,
            duration_ms=duration_ms,
        )
        logger.info(
            "{}: completed in {:.0f}ms, output={} bytes",
            self.name,
            (time.perf_counter() - _t0) * 1000,
            len(result.data),
        )
        return result

    async def synthesize_stream(
        self, text: str, opts: TTSOptions | None = None
    ) -> AsyncIterator[AudioChunk]:
        """Batch-then-chunk fallback: synthesize full audio, then yield chunks.

        Yields 4096-byte slices of the complete WAV data, followed by an empty
        chunk flagged ``is_final`` whose sequence number is the chunk count.
        """
        logger.info("{}: stream request, text={} chars", self.name, len(text))
        _t0 = time.perf_counter()
        result = await self.synthesize(text, opts)
        chunk_size = 4096
        chunk_count = 0
        for sequence, offset in enumerate(range(0, len(result.data), chunk_size)):
            chunk_data = result.data[offset : offset + chunk_size]
            logger.debug("{}: chunk #{}, {} bytes", self.name, sequence, len(chunk_data))
            yield AudioChunk(data=chunk_data, sequence=sequence)
            chunk_count = sequence + 1
        yield AudioChunk(data=b"", sequence=chunk_count, is_final=True)
        logger.info(
            "{}: stream complete, {} chunks in {:.0f}ms",
            self.name,
            chunk_count,
            (time.perf_counter() - _t0) * 1000,
        )

    # -- voices ---------------------------------------------------------------

    async def list_voices(self) -> list[dict]:
        """Return the installed SAPI5 voices, cached until :meth:`stop`."""
        if self._voices_cache is None:
            self._voices_cache = await asyncio.to_thread(self._list_voices_sync)
        return self._voices_cache

    # -- internal helpers (sync, run in thread) --------------------------------

    @staticmethod
    def _validate_engine() -> None:
        """Create and immediately dispose a pyttsx3 engine to verify SAPI5."""
        import pyttsx3

        engine = pyttsx3.init("sapi5")
        engine.stop()

    @staticmethod
    def _synthesize_sync(text: str, voice_hint: str, rate: int) -> bytes:
        """Synchronous TTS: create engine, synthesize to WAV, return bytes.

        Args:
            text: Text to speak.
            voice_hint: Case-insensitive substring matched against each voice
                name and id; the first match wins. Empty keeps the default.
            rate: pyttsx3 ``rate`` property value.
        """
        import pyttsx3

        engine = pyttsx3.init("sapi5")
        try:
            engine.setProperty("rate", rate)

            # Resolve and set voice
            if voice_hint:
                hint_lower = voice_hint.lower()
                for v in engine.getProperty("voices"):
                    if hint_lower in v.name.lower() or hint_lower in v.id.lower():
                        engine.setProperty("voice", v.id)
                        break
                else:
                    # Previously silent: tell the operator the request fell
                    # back to the default voice.
                    logger.warning(
                        "{}: no installed voice matches {!r}; using system default",
                        WindowsSapiTTS.name,
                        voice_hint,
                    )

            # pyttsx3 writes to a path, so reserve a temp file and close our
            # handle before handing the path over.
            fd, tmp_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)
            try:
                engine.save_to_file(text, tmp_path)
                engine.runAndWait()
                wav_bytes = Path(tmp_path).read_bytes()
            finally:
                # Best-effort cleanup; a leftover temp file must not mask the
                # synthesis result or error.
                try:
                    Path(tmp_path).unlink(missing_ok=True)
                except OSError:
                    pass

            return wav_bytes
        finally:
            engine.stop()

    @staticmethod
    def _list_voices_sync() -> list[dict]:
        """List SAPI5 voices synchronously.

        Returns:
            One dict per voice with ``name``, ``language``, ``description``,
            ``group`` and ``id`` keys.
        """
        import pyttsx3

        engine = pyttsx3.init("sapi5")
        try:
            result: list[dict] = []
            for v in engine.getProperty("voices"):
                # The primary language is the first entry of v.languages and
                # may arrive as bytes or str.
                lang_str = ""
                if v.languages:
                    raw = v.languages[0]
                    if isinstance(raw, bytes):
                        lang_str = raw.decode("utf-8", errors="replace").strip("\x00")
                    elif isinstance(raw, str):
                        lang_str = raw

                # Derive group from the language prefix ("en-US"/"en_US" -> "en")
                lang_prefix = lang_str.split("-")[0].split("_")[0].lower() if lang_str else ""
                if lang_prefix:
                    group = _LANG_GROUP_MAP.get(lang_prefix, lang_prefix.upper())
                else:
                    group = "Other"

                result.append({
                    "name": v.name,
                    "language": lang_str,
                    "description": getattr(v, "description", "") or "",
                    "group": group,
                    "id": v.id,
                })
            return result
        finally:
            engine.stop()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""OpenSpeech FastAPI server."""
|
openspeech/server/app.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""FastAPI application wrapping ServiceDispatcher."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
from contextlib import asynccontextmanager
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from fastapi import FastAPI, Request
|
|
8
|
+
|
|
9
|
+
from openspeech.config import load_config
|
|
10
|
+
from openspeech.core.registry import ProviderRegistry
|
|
11
|
+
from openspeech.dispatch.dispatcher import ServiceDispatcher
|
|
12
|
+
from openspeech.dispatch.watcher import ConfigWatcher
|
|
13
|
+
from openspeech.local_engines import EngineManager
|
|
14
|
+
from openspeech.logging_config import ensure_configured
|
|
15
|
+
from openspeech.server.middleware import RequestContextMiddleware
|
|
16
|
+
from openspeech.server.routes import stt, tts, management, webui
|
|
17
|
+
from openspeech.server.ws import stt_stream, tts_stream
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def create_app(config_path: Path, registry: ProviderRegistry) -> FastAPI:
    """Create FastAPI app with dispatcher lifecycle.

    Args:
        config_path: Configuration file used to build the dispatcher, to read
            server settings, and watched for reloads while the app runs.
        registry: Provider registry handed to the dispatcher.

    Returns:
        The configured application with routers, middleware and shared state
        (``dispatcher``, ``config_path``, ``registry``, ``server_config``,
        ``engine_manager``) attached to ``app.state``.
    """
    ensure_configured()
    dispatcher = ServiceDispatcher.from_config(config_path, registry)
    config = load_config(config_path)

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        await dispatcher.start()
        watcher = None
        try:
            async def _on_reload() -> dict:
                return await dispatcher.reload_config(config_path, registry)

            watcher = ConfigWatcher(config_path, on_reload=_on_reload)
            watcher.start()

            yield
        finally:
            # Always release resources, even if watcher start-up or the
            # application itself raised.
            try:
                if watcher is not None:
                    await watcher.stop()
            finally:
                await dispatcher.stop()

    app = FastAPI(title="OpenSpeech API", version="0.1.0", lifespan=lifespan)
    app.state.dispatcher = dispatcher
    app.state.config_path = config_path
    app.state.registry = registry
    app.state.server_config = config.server
    app.state.engine_manager = EngineManager()

    # Middleware registered later wraps middleware registered earlier, so the
    # LAST add_middleware() call is the outermost layer. Auth therefore goes
    # first (inner) and the request context goes last (outer).

    # Conditionally add authentication middleware
    if config.server.auth_enabled and config.server.api_keys:
        from openspeech.server.auth import AuthMiddleware
        app.add_middleware(AuthMiddleware, api_keys=config.server.api_keys)

    # Request-scoped logging context (request_id + route timing) — added last
    # so it is outermost and auth-middleware rejections still get logged with
    # a request_id.
    app.add_middleware(RequestContextMiddleware)

    # Register routes
    app.include_router(stt.router, prefix="/v1/stt", tags=["STT"])
    app.include_router(tts.router, prefix="/v1/tts", tags=["TTS"])
    app.include_router(management.router, prefix="/v1", tags=["Management"])
    app.include_router(webui.router, tags=["WebUI"])

    # Register WebSocket endpoints
    app.include_router(stt_stream.router, prefix="/v1/stt", tags=["STT Streaming"])
    app.include_router(tts_stream.router, prefix="/v1/tts", tags=["TTS Streaming"])

    return app
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def get_dispatcher(request: Request) -> ServiceDispatcher:
    """Return the dispatcher stored on the application state of *request*."""
    app_state = request.app.state
    return app_state.dispatcher
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""API Key authentication middleware."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import json
|
|
4
|
+
from fastapi import Request
|
|
5
|
+
from starlette.middleware.base import BaseHTTPMiddleware
|
|
6
|
+
from starlette.responses import JSONResponse
|
|
7
|
+
|
|
8
|
+
# Requests to these exact paths are let through without an API key.
EXEMPT_PATHS = {
    "/v1/health",
    "/docs",
    "/openapi.json",
    "/redoc",
}
# Requests whose path starts with any of these prefixes are also let through.
EXEMPT_PREFIXES = ("/ui",)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _unauthorized(detail: str) -> JSONResponse:
    """Build a 401 response whose JSON body is ``{"detail": detail}``.

    The ``WWW-Authenticate`` header tells clients which scheme to use; HTTP
    requires a challenge on every 401 response.
    """
    return JSONResponse(
        status_code=401,
        content={"detail": detail},
        headers={"WWW-Authenticate": "Bearer"},
    )
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class AuthMiddleware(BaseHTTPMiddleware):
    """Reject HTTP requests that lack a configured API key as a Bearer token.

    Paths in ``EXEMPT_PATHS`` and paths starting with an entry of
    ``EXEMPT_PREFIXES`` are passed through unauthenticated.
    """

    def __init__(self, app, api_keys: list[str]) -> None:
        """Wrap *app* and remember the accepted *api_keys*."""
        super().__init__(app)
        self._api_keys = set(api_keys)
        # Pre-encoded copies: constant-time comparison works on bytes.
        self._api_key_bytes = tuple(key.encode("utf-8") for key in self._api_keys)

    def _is_valid_key(self, token: str) -> bool:
        """Return whether *token* equals a configured key, in constant time."""
        # Local import: this module does not import hmac at file level.
        import hmac

        candidate = token.encode("utf-8")
        matched = False
        # Compare against every key without short-circuiting so the response
        # time does not reveal which key (or how much of it) matched.
        for key in self._api_key_bytes:
            matched |= hmac.compare_digest(candidate, key)
        return matched

    async def dispatch(self, request: Request, call_next):
        """Authenticate *request* or answer with a 401 JSON response."""
        path = request.url.path

        # Skip auth for exempt paths
        if path in EXEMPT_PATHS or path.startswith(EXEMPT_PREFIXES):
            return await call_next(request)

        # Skip auth for WebSocket (handled separately via query param)
        if request.scope.get("type") == "websocket":
            return await call_next(request)

        # Check Bearer token; the scheme name is case-insensitive in HTTP.
        auth_header = request.headers.get("Authorization", "")
        scheme, sep, token = auth_header.partition(" ")
        if not sep or scheme.lower() != "bearer":
            return _unauthorized("Missing or invalid Authorization header")

        # An empty token can never be a legitimate key.
        if not token or not self._is_valid_key(token):
            return _unauthorized("Invalid API key")

        return await call_next(request)
|