openspeechapi 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openspeech/__init__.py +75 -0
- openspeech/__main__.py +5 -0
- openspeech/cli.py +413 -0
- openspeech/client/__init__.py +4 -0
- openspeech/client/client.py +145 -0
- openspeech/config.py +212 -0
- openspeech/core/__init__.py +0 -0
- openspeech/core/base.py +75 -0
- openspeech/core/enums.py +39 -0
- openspeech/core/models.py +61 -0
- openspeech/core/registry.py +37 -0
- openspeech/core/settings.py +8 -0
- openspeech/demo.py +675 -0
- openspeech/dispatch/__init__.py +0 -0
- openspeech/dispatch/context.py +34 -0
- openspeech/dispatch/dispatcher.py +661 -0
- openspeech/dispatch/executors/__init__.py +0 -0
- openspeech/dispatch/executors/base.py +34 -0
- openspeech/dispatch/executors/in_process.py +66 -0
- openspeech/dispatch/executors/remote.py +64 -0
- openspeech/dispatch/executors/subprocess_exec.py +446 -0
- openspeech/dispatch/fanout.py +95 -0
- openspeech/dispatch/filters.py +73 -0
- openspeech/dispatch/lifecycle.py +178 -0
- openspeech/dispatch/watcher.py +82 -0
- openspeech/engine_catalog.py +236 -0
- openspeech/engine_registry.yaml +347 -0
- openspeech/exceptions.py +51 -0
- openspeech/factory.py +325 -0
- openspeech/local_engines/__init__.py +12 -0
- openspeech/local_engines/aim_resolver.py +91 -0
- openspeech/local_engines/backends/__init__.py +1 -0
- openspeech/local_engines/backends/docker_backend.py +490 -0
- openspeech/local_engines/backends/native_backend.py +902 -0
- openspeech/local_engines/base.py +30 -0
- openspeech/local_engines/engines/__init__.py +1 -0
- openspeech/local_engines/engines/faster_whisper.py +36 -0
- openspeech/local_engines/engines/fish_speech.py +33 -0
- openspeech/local_engines/engines/sherpa_onnx.py +56 -0
- openspeech/local_engines/engines/whisper.py +41 -0
- openspeech/local_engines/engines/whisperlivekit.py +60 -0
- openspeech/local_engines/manager.py +208 -0
- openspeech/local_engines/models.py +50 -0
- openspeech/local_engines/progress.py +69 -0
- openspeech/local_engines/registry.py +19 -0
- openspeech/local_engines/task_store.py +52 -0
- openspeech/local_engines/tasks.py +71 -0
- openspeech/logging_config.py +607 -0
- openspeech/observe/__init__.py +0 -0
- openspeech/observe/base.py +79 -0
- openspeech/observe/debug.py +44 -0
- openspeech/observe/latency.py +19 -0
- openspeech/observe/metrics.py +47 -0
- openspeech/observe/tracing.py +44 -0
- openspeech/observe/usage.py +27 -0
- openspeech/providers/__init__.py +0 -0
- openspeech/providers/_template.py +101 -0
- openspeech/providers/stt/__init__.py +0 -0
- openspeech/providers/stt/alibaba.py +86 -0
- openspeech/providers/stt/assemblyai.py +135 -0
- openspeech/providers/stt/azure_speech.py +99 -0
- openspeech/providers/stt/baidu.py +135 -0
- openspeech/providers/stt/deepgram.py +311 -0
- openspeech/providers/stt/elevenlabs.py +385 -0
- openspeech/providers/stt/faster_whisper.py +211 -0
- openspeech/providers/stt/google_cloud.py +106 -0
- openspeech/providers/stt/iflytek.py +427 -0
- openspeech/providers/stt/macos_speech.py +226 -0
- openspeech/providers/stt/openai.py +84 -0
- openspeech/providers/stt/sherpa_onnx.py +353 -0
- openspeech/providers/stt/tencent.py +212 -0
- openspeech/providers/stt/volcengine.py +107 -0
- openspeech/providers/stt/whisper.py +153 -0
- openspeech/providers/stt/whisperlivekit.py +530 -0
- openspeech/providers/stt/windows_speech.py +249 -0
- openspeech/providers/tts/__init__.py +0 -0
- openspeech/providers/tts/alibaba.py +95 -0
- openspeech/providers/tts/azure_speech.py +123 -0
- openspeech/providers/tts/baidu.py +143 -0
- openspeech/providers/tts/coqui.py +64 -0
- openspeech/providers/tts/cosyvoice.py +90 -0
- openspeech/providers/tts/deepgram.py +174 -0
- openspeech/providers/tts/elevenlabs.py +311 -0
- openspeech/providers/tts/fish_speech.py +158 -0
- openspeech/providers/tts/google_cloud.py +107 -0
- openspeech/providers/tts/iflytek.py +209 -0
- openspeech/providers/tts/macos_say.py +251 -0
- openspeech/providers/tts/minimax.py +122 -0
- openspeech/providers/tts/openai.py +104 -0
- openspeech/providers/tts/piper.py +104 -0
- openspeech/providers/tts/tencent.py +189 -0
- openspeech/providers/tts/volcengine.py +117 -0
- openspeech/providers/tts/windows_sapi.py +234 -0
- openspeech/server/__init__.py +1 -0
- openspeech/server/app.py +72 -0
- openspeech/server/auth.py +42 -0
- openspeech/server/middleware.py +75 -0
- openspeech/server/routes/__init__.py +1 -0
- openspeech/server/routes/management.py +848 -0
- openspeech/server/routes/stt.py +121 -0
- openspeech/server/routes/tts.py +159 -0
- openspeech/server/routes/webui.py +29 -0
- openspeech/server/webui/app.js +2649 -0
- openspeech/server/webui/index.html +216 -0
- openspeech/server/webui/styles.css +617 -0
- openspeech/server/ws/__init__.py +1 -0
- openspeech/server/ws/stt_stream.py +263 -0
- openspeech/server/ws/tts_stream.py +207 -0
- openspeech/telemetry/__init__.py +21 -0
- openspeech/telemetry/perf.py +307 -0
- openspeech/utils/__init__.py +5 -0
- openspeech/utils/audio_converter.py +406 -0
- openspeech/utils/audio_playback.py +156 -0
- openspeech/vendor_registry.yaml +74 -0
- openspeechapi-0.1.0.dist-info/METADATA +101 -0
- openspeechapi-0.1.0.dist-info/RECORD +118 -0
- openspeechapi-0.1.0.dist-info/WHEEL +4 -0
- openspeechapi-0.1.0.dist-info/entry_points.txt +3 -0
openspeech/demo.py
ADDED
|
@@ -0,0 +1,675 @@
|
|
|
1
|
+
"""OpenSpeech interactive demo CLI."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import asyncio
|
|
6
|
+
import io
|
|
7
|
+
import os
|
|
8
|
+
import shlex
|
|
9
|
+
import struct
|
|
10
|
+
import sys
|
|
11
|
+
import time
|
|
12
|
+
import wave
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
# ANSI escape sequences for terminal styling — plain strings, so no extra dependency.
# Text attributes
_RESET = "\033[0m"
_BOLD = "\033[1m"
_DIM = "\033[2m"
# Foreground colours
_RED = "\033[31m"
_GREEN = "\033[32m"
_YELLOW = "\033[33m"
_CYAN = "\033[36m"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _c(text: str, *codes: str) -> str:
    """Return *text* wrapped in the given ANSI codes, but only on a terminal.

    When stdout is not a TTY the text comes back unchanged. Otherwise the codes are
    concatenated in front of it and the reset sequence is appended.
    """
    if sys.stdout.isatty():
        prefix = "".join(codes)
        return prefix + text + _RESET
    return text
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _load_dotenv() -> None:
    """Load a ``.env`` file through the optional ``dotenv`` package, if importable."""
    try:
        from dotenv import load_dotenv as read_env_file  # type: ignore[import]

        read_env_file()
    except ImportError:
        # The dependency is optional; without it the process environment is used as-is.
        return
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _build_registry() -> Any:
    """Create a ``ProviderRegistry`` holding the three providers the demo knows about.

    The provider modules are imported here, at call time, rather than at module import.
    """
    from openspeech.core.registry import ProviderRegistry
    from openspeech.providers.stt.faster_whisper import FasterWhisperSTT
    from openspeech.providers.stt.openai import OpenAISTT
    from openspeech.providers.tts.openai import OpenAITTS

    registry = ProviderRegistry()
    # Registration order matches the original: OpenAI STT, faster-whisper, OpenAI TTS.
    for provider_name, provider_cls in (
        ("openai", OpenAISTT),
        ("faster-whisper", FasterWhisperSTT),
        ("openai-tts", OpenAITTS),
    ):
        registry.register(provider_name, provider_cls)
    return registry
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _write_audio_file(path: str, audio: Any) -> None:
    """Write ``audio.data`` to *path*, wrapping headerless PCM in a WAV container.

    A WAV header is synthesised only when *path* ends in ``.wav`` and the payload is
    considered raw PCM:

    * ``audio.format`` is ``PCM_16K`` or ``PCM_44K``; or
    * the format is anything other than a compressed one and the bytes do not start
      with ``RIFF`` (the payload is then assumed to be headerless PCM).

    Payloads explicitly tagged MP3/OGG/FLAC/OPUS are always written verbatim. They
    never start with ``RIFF``, so the bare header sniff used to wrap compressed
    bytes in a WAV header and produce an unplayable file.

    Args:
        path: Destination file path; its suffix decides whether wrapping is considered.
        audio: Object exposing ``data`` (bytes), ``format``, ``channels`` and
            ``sample_rate``.
    """
    from openspeech.core.enums import AudioFormat

    target = Path(path)
    ext = target.suffix.lower()

    pcm_formats = (AudioFormat.PCM_16K, AudioFormat.PCM_44K)
    compressed_formats = (
        AudioFormat.MP3,
        AudioFormat.OGG,
        AudioFormat.FLAC,
        AudioFormat.OPUS,
    )

    if audio.format in pcm_formats:
        is_raw_pcm = True
    elif audio.format in compressed_formats:
        # Encoded audio must not be framed as PCM samples.
        is_raw_pcm = False
    else:
        # Same heuristic as before for WAV-tagged/other data: no RIFF magic => raw PCM.
        is_raw_pcm = len(audio.data) > 4 and audio.data[:4] != b"RIFF"

    if ext == ".wav" and is_raw_pcm:
        # Build the container in memory so a failure never leaves a partial file.
        buf = io.BytesIO()
        with wave.open(buf, "wb") as wf:
            wf.setnchannels(audio.channels)
            wf.setsampwidth(2)  # 16-bit
            wf.setframerate(audio.sample_rate)
            wf.writeframes(audio.data)
        target.write_bytes(buf.getvalue())
    else:
        target.write_bytes(audio.data)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _read_audio(path: str) -> Any:
    """Read an audio file from disk into an ``AudioData`` instance.

    The format is chosen from the file suffix (unknown suffixes are treated as WAV).
    For WAV input the sample rate and channel count are taken from the file's own
    header; if the header cannot be parsed, or for any other format, the values
    fall back to 16000 Hz mono.

    Args:
        path: Path of the audio file to load.

    Returns:
        ``AudioData`` carrying the raw file bytes and the detected metadata.

    Raises:
        SystemExit: With code 1, after printing an error to stderr, when *path* does
            not exist.
    """
    from openspeech.core.enums import AudioFormat
    from openspeech.core.models import AudioData

    p = Path(path)
    if not p.exists():
        print(_c(f"Error: File not found: {path}", _RED), file=sys.stderr)
        sys.exit(1)

    data = p.read_bytes()
    fmt_map = {
        ".wav": AudioFormat.WAV,
        ".mp3": AudioFormat.MP3,
        ".ogg": AudioFormat.OGG,
        ".flac": AudioFormat.FLAC,
        ".opus": AudioFormat.OPUS,
    }
    fmt = fmt_map.get(p.suffix.lower(), AudioFormat.WAV)

    sample_rate, channels = 16000, 1
    if fmt == AudioFormat.WAV:
        # Prefer the real header values over the defaults so 44.1 kHz or stereo files
        # are not mislabelled.
        try:
            with wave.open(io.BytesIO(data), "rb") as wf:
                sample_rate = wf.getframerate()
                channels = wf.getnchannels()
        except (wave.Error, EOFError):
            # Not a parseable PCM WAV (or truncated); keep the previous defaults.
            pass

    return AudioData(data=data, sample_rate=sample_rate, channels=channels, format=fmt)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _check_openai_key() -> str:
    """Fetch ``OPENAI_API_KEY`` from the environment.

    Returns the key when it is set and non-empty. Otherwise prints an error plus a
    hint to stderr and terminates the process with exit code 1.
    """
    api_key = os.environ.get("OPENAI_API_KEY", "")
    if api_key:
        return api_key

    message = _c("Error: OPENAI_API_KEY environment variable is not set.", _RED)
    print(message, file=sys.stderr)
    print(" Set it with: export OPENAI_API_KEY=sk-...", file=sys.stderr)
    sys.exit(1)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _print_stt_result(result: Any, elapsed: float, show_words: bool = False) -> None:
    """Print a transcription summary, optionally followed by per-word timings.

    Shows text, language, confidence and elapsed seconds; missing language or
    confidence is rendered as ``N/A``. Word rows are printed only when *show_words*
    is true and ``result.words`` is non-empty.
    """

    def field(label: str) -> str:
        return _c(label, _BOLD)

    print()
    print(_c("STT Result", _BOLD + _CYAN))
    print(_c("─" * 50, _DIM))
    print(f" {field('Text:')} {result.text}")
    print(f" {field('Language:')} {result.language or 'N/A'}")
    if result.confidence is None:
        confidence = "N/A"
    else:
        confidence = f"{result.confidence:.3f}"
    print(f" {field('Confidence:')} {confidence}")
    print(f" {field('Elapsed:')} {elapsed:.2f}s")

    if show_words and result.words:
        print()
        print(_c(" Word timestamps:", _BOLD))
        for word in result.words:
            score = "N/A" if word.confidence is None else f"{word.confidence:.2f}"
            span = f"[{word.start_ms:>6}ms - {word.end_ms:>6}ms]"
            print(f" {span} {word.text:<20} conf={score}")
    print()
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _print_tts_result(out_path: str, audio: Any, elapsed: float) -> None:
    """Print where the synthesized audio went, its byte size, sample rate and timing."""
    byte_count = len(audio.data)

    def row(label: str, value: str) -> None:
        print(f" {_c(label, _BOLD)} {value}")

    print()
    print(_c("TTS Result", _BOLD + _CYAN))
    print(_c("─" * 50, _DIM))
    row("Output:", f"{out_path}")
    row("File size:", f"{byte_count:,} bytes")
    row("Sample rate:", f"{audio.sample_rate} Hz")
    row("Elapsed:", f"{elapsed:.2f}s")
    print()
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
async def _make_stt_provider(provider_name: str, args: Any) -> Any:
    """Build and start the STT provider registered under *provider_name*.

    Settings are only constructed for ``faster-whisper`` (model size and device taken
    from *args*, defaulting to ``base``/``auto``) and for the OpenAI names (API key
    from the environment). Any other name is instantiated with ``None`` settings.
    """
    provider_cls = _build_registry().get(provider_name)

    settings: Any = None
    if provider_name == "faster-whisper":
        from openspeech.providers.stt.faster_whisper import FasterWhisperSTTSettings

        settings = FasterWhisperSTTSettings(
            model_size=getattr(args, "model_size", None) or "base",
            device=getattr(args, "device", None) or "auto",
        )
    elif provider_name in ("openai", "openai-stt"):
        from openspeech.providers.stt.openai import OpenAISTTSettings

        settings = OpenAISTTSettings(api_key=_check_openai_key())

    instance = provider_cls(settings)
    await instance.start()
    return instance
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
async def _make_tts_provider(provider_name: str, args: Any) -> Any:
    """Build and start the TTS provider registered under *provider_name*.

    Only ``openai-tts`` gets explicit settings: the API key from the environment plus
    ``voice``/``model`` from *args* (defaults ``alloy``/``tts-1``). Any other name is
    instantiated with ``None`` settings.
    """
    provider_cls = _build_registry().get(provider_name)

    settings: Any = None
    if provider_name == "openai-tts":
        from openspeech.providers.tts.openai import OpenAITTSSettings

        # The key check comes first so a missing key aborts before anything else.
        key = _check_openai_key()
        settings = OpenAITTSSettings(
            api_key=key,
            voice=getattr(args, "voice", None) or "alloy",
            model=getattr(args, "model", None) or "tts-1",
        )

    instance = provider_cls(settings)
    await instance.start()
    return instance
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
async def _cmd_stt(args: Any) -> None:
    """Transcribe ``args.input`` with the selected STT provider and print the result.

    The provider defaults to ``faster-whisper``. Only the ``transcribe`` call is
    timed; the provider is stopped even when transcription fails.
    """
    name = getattr(args, "provider", None) or "faster-whisper"
    clip = _read_audio(args.input)

    print(_c(f"Transcribing with provider: {name} ...", _DIM))
    stt = await _make_stt_provider(name, args)
    try:
        started = time.perf_counter()
        transcription = await stt.transcribe(clip)
        took = time.perf_counter() - started
    finally:
        await stt.stop()

    _print_stt_result(transcription, took, show_words=getattr(args, "words", False))
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
async def _cmd_tts(args: Any) -> None:
    """Synthesize ``args.text``, save it to a file, report it and optionally play it.

    Defaults: provider ``openai-tts``, output ``output.wav``. Only the ``synthesize``
    call is timed; the provider is stopped even when synthesis fails. Playback runs
    only when ``args.play`` is truthy.
    """
    name = getattr(args, "provider", None) or "openai-tts"
    destination = getattr(args, "output", None) or "output.wav"
    phrase = args.text

    print(_c(f"Synthesizing with provider: {name} ...", _DIM))
    tts = await _make_tts_provider(name, args)
    try:
        started = time.perf_counter()
        audio = await tts.synthesize(phrase)
        took = time.perf_counter() - started
    finally:
        await tts.stop()

    _write_audio_file(destination, audio)
    _print_tts_result(destination, audio, took)

    if not getattr(args, "play", False):
        return

    # Imported only when playback is requested.
    from openspeech.utils.audio_playback import play_audio

    play_audio(
        audio,
        device=getattr(args, "play_device", None),
        volume=float(getattr(args, "play_volume", 1.0)),
        blocking=not getattr(args, "play_non_blocking", False),
        backend=getattr(args, "play_backend", "auto"),
    )
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
async def _cmd_roundtrip(args: Any) -> None:
    """Synthesize ``args.text`` and feed the audio straight back into an STT provider.

    Defaults: TTS ``openai-tts``, STT ``faster-whisper``. The synthesized audio is
    optionally played before transcription. Each provider is stopped even if its step
    fails, and the two timings cover only the ``synthesize``/``transcribe`` calls.
    """
    tts_name = getattr(args, "tts", None) or "openai-tts"
    stt_name = getattr(args, "stt", None) or "faster-whisper"
    phrase = args.text

    def bold(label: str) -> str:
        return _c(label, _BOLD)

    print()
    print(_c("Roundtrip Test", _BOLD + _CYAN))
    print(_c("─" * 50, _DIM))
    print(f" {bold('Original text:')} {phrase}")
    print(f" {bold('TTS provider:')} {tts_name}")
    print(f" {bold('STT provider:')} {stt_name}")
    print()

    # Step 1: text -> audio
    print(_c(f"Step 1: Synthesizing with {tts_name} ...", _DIM))
    synthesizer = await _make_tts_provider(tts_name, args)
    try:
        started = time.perf_counter()
        audio = await synthesizer.synthesize(phrase)
        tts_elapsed = time.perf_counter() - started
    finally:
        await synthesizer.stop()

    if getattr(args, "play", False):
        from openspeech.utils.audio_playback import play_audio

        play_audio(
            audio,
            device=getattr(args, "play_device", None),
            volume=float(getattr(args, "play_volume", 1.0)),
            blocking=not getattr(args, "play_non_blocking", False),
            backend=getattr(args, "play_backend", "auto"),
        )

    print(
        f" {bold('Audio:')} {len(audio.data):,} bytes "
        f"@ {audio.sample_rate} Hz ({tts_elapsed:.2f}s)"
    )

    # Step 2: audio -> text
    print(_c(f"Step 2: Transcribing with {stt_name} ...", _DIM))
    recognizer = await _make_stt_provider(stt_name, args)
    try:
        started = time.perf_counter()
        transcription = await recognizer.transcribe(audio)
        stt_elapsed = time.perf_counter() - started
    finally:
        await recognizer.stop()

    print(f" {bold('Transcribed:')} {transcription.text}")
    if transcription.confidence is None:
        confidence = "N/A"
    else:
        confidence = f"{transcription.confidence:.3f}"
    print(f" {bold('Confidence:')} {confidence} ({stt_elapsed:.2f}s)")
    print()

    # Summary
    combined = tts_elapsed + stt_elapsed
    print(_c("Summary", _BOLD))
    print(f" Original: {phrase}")
    print(f" Transcribed: {transcription.text}")
    print(f" Total time: {combined:.2f}s")
    print()
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
async def _cmd_compare(args: Any) -> None:
    """Transcribe one audio file with several STT providers and print a table.

    ``args.provider`` is a comma-separated list of provider names (default
    ``faster-whisper``); blank entries are ignored. All providers run concurrently.
    A provider that raises is reported in its row as ``ERROR: ...`` instead of
    aborting the comparison.

    The reported time covers only the ``transcribe`` call, matching ``_cmd_stt``
    (provider shutdown is excluded).
    """
    providers_str: str = getattr(args, "provider", None) or "faster-whisper"
    # Skip blanks so "a,,b" or a trailing comma does not produce a "" provider row.
    provider_names = [
        name for name in (part.strip() for part in providers_str.split(",")) if name
    ]
    audio = _read_audio(args.input)

    print(_c(f"Comparing {len(provider_names)} STT provider(s) ...", _DIM))

    async def _run_one(name: str) -> tuple[str, Any, float]:
        """Run a single provider; return (name, result-or-exception, seconds)."""
        try:
            provider = await _make_stt_provider(name, args)
            try:
                t0 = time.perf_counter()
                result = await provider.transcribe(audio)
                elapsed = time.perf_counter() - t0
            finally:
                await provider.stop()
            return (name, result, elapsed)
        except Exception as exc:  # noqa: BLE001
            return (name, exc, 0.0)

    results = await asyncio.gather(*[_run_one(n) for n in provider_names])

    # Print comparison table
    col_prov = 18
    col_text = 35
    col_conf = 12
    col_time = 8

    def _fit(text: str, width: int) -> str:
        """Flatten whitespace, truncate with '..' and pad to exactly *width* columns.

        At most ``width - 1`` visible characters are kept so the next column is
        always separated by at least one space.
        """
        flat = " ".join(text.split())
        if len(flat) > width - 1:
            flat = flat[: width - 3] + ".."
        return f"{flat:<{width}}"

    header = (
        f"{'Provider':<{col_prov}}"
        f"{'Text':<{col_text}}"
        f"{'Confidence':<{col_conf}}"
        f"{'Time':>{col_time}}"
    )
    sep = "─" * (col_prov + col_text + col_conf + col_time)

    print()
    print(_c(header, _BOLD))
    print(_c(sep, _DIM))
    for name, result, elapsed in results:
        if isinstance(result, Exception):
            # Pad first, colour second: ANSI codes must not count towards the width.
            text_cell = _c(_fit(f"ERROR: {result}", col_text), _RED)
            conf_cell = "N/A"
            time_cell = "N/A"
        else:
            text_cell = _fit(result.text or "", col_text)
            conf_val = result.confidence
            conf_cell = f"{conf_val:.3f}" if conf_val is not None else "N/A"
            time_cell = f"{elapsed:.2f}s"
        print(
            f"{name:<{col_prov}}"
            f"{text_cell}"
            f"{conf_cell:<{col_conf}}"
            f"{time_cell:>{col_time}}"
        )
    print()
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
def _parse_repl_line(line: str) -> list[str]:
    """Tokenise one REPL line with shell-style quoting rules.

    If the line cannot be parsed that way (e.g. an unclosed quote), it is split on
    whitespace instead.
    """
    stripped = line.strip()
    try:
        tokens = shlex.split(stripped)
    except ValueError:
        # Unbalanced quotes: degrade to a plain whitespace split.
        tokens = stripped.split()
    return tokens
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
def _print_repl_help() -> None:
    """Print the list of commands understood by the interactive REPL."""
    command_lines = (
        " stt <audio_file> [-p <provider>] Transcribe audio file",
        " tts <text> [-o <output.wav>] [-p <prov>] [--play] Synthesize speech",
        " roundtrip <text> [--tts <prov>] [--stt <prov>] Roundtrip test",
        " compare <audio_file> [-p p1,p2] Compare providers",
        " providers List available providers",
        " help Show this help",
        " quit / exit / q Exit",
    )

    print()
    print(_c("Available REPL commands:", _BOLD + _CYAN))
    for entry in command_lines:
        print(entry)
    print()
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
def _print_providers() -> None:
    """Print a fixed table of the demo's providers: name, type, mode, requirements."""

    def as_row(name: str, ptype: str, mode: str, req: str) -> str:
        # One shared template for the header and every data row.
        return f" {name:<20}{ptype:<8}{mode:<14}{req}"

    providers = (
        ("faster-whisper", "STT", "subprocess", "pip install faster-whisper"),
        ("openai", "STT", "in_process", "OPENAI_API_KEY"),
        ("openai-tts", "TTS", "in_process", "OPENAI_API_KEY"),
    )

    print()
    print(_c("Available providers:", _BOLD + _CYAN))
    print(as_row("Name", "Type", "Mode", "Requirements"))
    print(_c(" " + "─" * 56, _DIM))
    for entry in providers:
        print(as_row(*entry))
    print()
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
async def _repl_dispatch(tokens: list[str]) -> None:
    """Execute one already-tokenised REPL command.

    ``quit``/``exit``/``q`` terminate the process with exit code 0. ``help`` and
    ``providers`` print static text. ``stt``, ``tts``, ``roundtrip`` and ``compare``
    parse the remaining tokens with a per-command argparse parser and await the
    matching ``_cmd_*`` coroutine. If the command has no arguments a usage line is
    printed, and if argparse rejects the arguments the command is skipped. Anything
    else prints an "unknown command" notice.
    """
    if not tokens:
        return

    cmd = tokens[0].lower()

    if cmd in ("quit", "exit", "q"):
        print(_c("Goodbye!", _GREEN))
        sys.exit(0)
    if cmd == "help":
        _print_repl_help()
        return
    if cmd == "providers":
        _print_providers()
        return

    def whisper_opts(p: argparse.ArgumentParser) -> None:
        p.add_argument("--model-size", default="base")
        p.add_argument("--device", default="auto")

    def playback_opts(p: argparse.ArgumentParser) -> None:
        p.add_argument("--play", action="store_true")
        p.add_argument("--play-device", default=None)
        p.add_argument("--play-volume", type=float, default=1.0)
        p.add_argument("--play-backend", default="auto")
        p.add_argument("--play-non-blocking", action="store_true")

    def stt_parser() -> argparse.ArgumentParser:
        p = argparse.ArgumentParser(prog="stt", add_help=False)
        p.add_argument("input")
        p.add_argument("-p", "--provider", default="faster-whisper")
        whisper_opts(p)
        p.add_argument("--words", action="store_true")
        return p

    def tts_parser() -> argparse.ArgumentParser:
        p = argparse.ArgumentParser(prog="tts", add_help=False)
        p.add_argument("text")
        p.add_argument("-o", "--output", default="output.wav")
        p.add_argument("-p", "--provider", default="openai-tts")
        p.add_argument("--voice", default="alloy")
        p.add_argument("--model", default="tts-1")
        playback_opts(p)
        return p

    def roundtrip_parser() -> argparse.ArgumentParser:
        p = argparse.ArgumentParser(prog="roundtrip", add_help=False)
        p.add_argument("text")
        p.add_argument("--tts", default="openai-tts")
        p.add_argument("--stt", default="faster-whisper")
        p.add_argument("--voice", default="alloy")
        whisper_opts(p)
        playback_opts(p)
        return p

    def compare_parser() -> argparse.ArgumentParser:
        p = argparse.ArgumentParser(prog="compare", add_help=False)
        p.add_argument("input")
        p.add_argument("-p", "--provider", default="faster-whisper")
        whisper_opts(p)
        return p

    # command -> (usage line, parser factory, coroutine to await)
    commands = {
        "stt": ("Usage: stt <audio_file> [-p <provider>]", stt_parser, _cmd_stt),
        "tts": ("Usage: tts <text> [-o output.wav] [-p <provider>]", tts_parser, _cmd_tts),
        "roundtrip": (
            "Usage: roundtrip <text> [--tts <provider>] [--stt <provider>]",
            roundtrip_parser,
            _cmd_roundtrip,
        ),
        "compare": (
            "Usage: compare <audio_file> [-p provider1,provider2]",
            compare_parser,
            _cmd_compare,
        ),
    }

    if cmd not in commands:
        print(_c(f"Unknown command: {cmd}. Type 'help' for available commands.", _YELLOW))
        return

    usage, make_parser, handler = commands[cmd]
    if len(tokens) < 2:
        print(_c(usage, _YELLOW))
        return

    try:
        parsed = make_parser().parse_args(tokens[1:])
    except SystemExit:
        # argparse has already printed its own error; stay in the REPL.
        return
    await handler(parsed)
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
async def _repl_loop() -> None:
    """Run the interactive prompt until the user quits.

    Prints a banner and the help text, then reads lines, tokenises them and
    dispatches each command. EOF or Ctrl-C at the prompt ends the loop.

    Errors raised by a command are reported and the prompt continues. That includes
    ``SystemExit`` with a non-zero code: helpers such as ``_read_audio`` and
    ``_check_openai_key`` call ``sys.exit(1)`` after printing their own message,
    which previously terminated the whole session on, say, a mistyped file name.
    ``SystemExit`` with code 0/None (the ``quit`` command) still propagates.
    """
    print()
    print(_c("OpenSpeech Interactive Demo", _BOLD + _GREEN))
    print(_c("═" * 40, _DIM))
    _print_repl_help()
    print(_c("Type 'help' for commands, 'quit' to exit.", _DIM))
    print()

    while True:
        try:
            line = input(_c("> ", _BOLD + _GREEN))
        except (EOFError, KeyboardInterrupt):
            print()
            print(_c("Goodbye!", _GREEN))
            break

        tokens = _parse_repl_line(line)
        if not tokens:
            continue

        try:
            await _repl_dispatch(tokens)
        except KeyboardInterrupt:
            print(_c("\nInterrupted.", _YELLOW))
        except SystemExit as exc:
            if exc.code in (0, None):
                # Deliberate quit: let it end the process as before.
                raise
            # Failure exit from a helper; it has already printed the reason.
        except Exception as exc:  # noqa: BLE001
            print(_c(f"Error: {exc}", _RED))
|
|
531
|
+
|
|
532
|
+
|
|
533
|
+
def _cmd_repl(_args: Any) -> None:
    """Run the interactive REPL on a fresh event loop; the parsed args are unused."""
    session = _repl_loop()
    asyncio.run(session)
|
|
536
|
+
|
|
537
|
+
|
|
538
|
+
def main() -> None:
    """Command-line entry point: parse ``sys.argv`` and run the chosen subcommand.

    Subcommands are ``stt``, ``tts``, ``roundtrip``, ``compare`` and ``repl``; one is
    required. The async commands each run on their own ``asyncio.run`` call.
    """
    _load_dotenv()

    def run_async(handler: Any) -> Any:
        """Adapt an async ``_cmd_*`` coroutine function to a synchronous callable."""
        return lambda ns: asyncio.run(handler(ns))

    def add_whisper_options(sub: argparse.ArgumentParser) -> None:
        sub.add_argument(
            "--model-size",
            default="base",
            metavar="SIZE",
            help="Model size for faster-whisper (default: base)",
        )
        sub.add_argument(
            "--device",
            default="auto",
            metavar="DEVICE",
            help="Device for faster-whisper (default: auto)",
        )

    def add_voice_option(sub: argparse.ArgumentParser) -> None:
        sub.add_argument(
            "--voice",
            default="alloy",
            metavar="VOICE",
            help="Voice for openai-tts (default: alloy)",
        )

    def add_playback_options(sub: argparse.ArgumentParser, play_help: str) -> None:
        sub.add_argument("--play", action="store_true", help=play_help)
        sub.add_argument(
            "--play-device", default=None, metavar="DEVICE", help="Playback device"
        )
        sub.add_argument(
            "--play-volume",
            type=float,
            default=1.0,
            metavar="VOLUME",
            help="Playback volume",
        )
        sub.add_argument(
            "--play-backend",
            default="auto",
            choices=["auto", "sounddevice", "external"],
            metavar="BACKEND",
            help="Playback backend",
        )
        sub.add_argument(
            "--play-non-blocking",
            action="store_true",
            help="Do not block while playing audio",
        )

    parser = argparse.ArgumentParser(
        prog="openspeech-demo",
        description="OpenSpeech interactive demo — STT, TTS, roundtrip, and compare.",
    )
    subparsers = parser.add_subparsers(
        dest="command", metavar="<command>", required=True
    )

    # ── stt ──────────────────────────────────────────────────────────────────
    stt_p = subparsers.add_parser("stt", help="Transcribe audio file to text")
    stt_p.add_argument(
        "-i", "--input", required=True, metavar="FILE", help="Input audio file"
    )
    stt_p.add_argument(
        "-p",
        "--provider",
        default="faster-whisper",
        metavar="PROVIDER",
        help="STT provider (default: faster-whisper)",
    )
    add_whisper_options(stt_p)
    stt_p.add_argument("--words", action="store_true", help="Show word-level timestamps")
    stt_p.set_defaults(func=run_async(_cmd_stt))

    # ── tts ──────────────────────────────────────────────────────────────────
    tts_p = subparsers.add_parser("tts", help="Synthesize text to audio file")
    tts_p.add_argument("-t", "--text", required=True, metavar="TEXT", help="Input text")
    tts_p.add_argument(
        "-o",
        "--output",
        default="output.wav",
        metavar="FILE",
        help="Output audio file (default: output.wav)",
    )
    tts_p.add_argument(
        "-p",
        "--provider",
        default="openai-tts",
        metavar="PROVIDER",
        help="TTS provider (default: openai-tts)",
    )
    add_voice_option(tts_p)
    tts_p.add_argument(
        "--model",
        default="tts-1",
        metavar="MODEL",
        help="Model for openai-tts (default: tts-1)",
    )
    add_playback_options(tts_p, "Play audio after synthesis")
    tts_p.set_defaults(func=run_async(_cmd_tts))

    # ── roundtrip ─────────────────────────────────────────────────────────────
    rt_p = subparsers.add_parser("roundtrip", help="TTS -> STT roundtrip test")
    rt_p.add_argument("-t", "--text", required=True, metavar="TEXT", help="Input text")
    rt_p.add_argument(
        "--tts",
        default="openai-tts",
        metavar="PROVIDER",
        help="TTS provider (default: openai-tts)",
    )
    rt_p.add_argument(
        "--stt",
        default="faster-whisper",
        metavar="PROVIDER",
        help="STT provider (default: faster-whisper)",
    )
    add_voice_option(rt_p)
    add_whisper_options(rt_p)
    add_playback_options(rt_p, "Play synthesized audio before STT")
    rt_p.set_defaults(func=run_async(_cmd_roundtrip))

    # ── compare ───────────────────────────────────────────────────────────────
    cmp_p = subparsers.add_parser("compare", help="Compare multiple STT providers")
    cmp_p.add_argument(
        "-i", "--input", required=True, metavar="FILE", help="Input audio file"
    )
    cmp_p.add_argument(
        "-p",
        "--provider",
        default="faster-whisper",
        metavar="PROVIDERS",
        help="Comma-separated provider list (default: faster-whisper)",
    )
    add_whisper_options(cmp_p)
    cmp_p.set_defaults(func=run_async(_cmd_compare))

    # ── repl ──────────────────────────────────────────────────────────────────
    repl_p = subparsers.add_parser("repl", help="Interactive REPL mode")
    repl_p.set_defaults(func=_cmd_repl)

    namespace = parser.parse_args()
    namespace.func(namespace)
|
|
672
|
+
|
|
673
|
+
|
|
674
|
+
# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|
|
File without changes
|