openspeechapi 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. openspeech/__init__.py +75 -0
  2. openspeech/__main__.py +5 -0
  3. openspeech/cli.py +413 -0
  4. openspeech/client/__init__.py +4 -0
  5. openspeech/client/client.py +145 -0
  6. openspeech/config.py +212 -0
  7. openspeech/core/__init__.py +0 -0
  8. openspeech/core/base.py +75 -0
  9. openspeech/core/enums.py +39 -0
  10. openspeech/core/models.py +61 -0
  11. openspeech/core/registry.py +37 -0
  12. openspeech/core/settings.py +8 -0
  13. openspeech/demo.py +675 -0
  14. openspeech/dispatch/__init__.py +0 -0
  15. openspeech/dispatch/context.py +34 -0
  16. openspeech/dispatch/dispatcher.py +661 -0
  17. openspeech/dispatch/executors/__init__.py +0 -0
  18. openspeech/dispatch/executors/base.py +34 -0
  19. openspeech/dispatch/executors/in_process.py +66 -0
  20. openspeech/dispatch/executors/remote.py +64 -0
  21. openspeech/dispatch/executors/subprocess_exec.py +446 -0
  22. openspeech/dispatch/fanout.py +95 -0
  23. openspeech/dispatch/filters.py +73 -0
  24. openspeech/dispatch/lifecycle.py +178 -0
  25. openspeech/dispatch/watcher.py +82 -0
  26. openspeech/engine_catalog.py +236 -0
  27. openspeech/engine_registry.yaml +347 -0
  28. openspeech/exceptions.py +51 -0
  29. openspeech/factory.py +325 -0
  30. openspeech/local_engines/__init__.py +12 -0
  31. openspeech/local_engines/aim_resolver.py +91 -0
  32. openspeech/local_engines/backends/__init__.py +1 -0
  33. openspeech/local_engines/backends/docker_backend.py +490 -0
  34. openspeech/local_engines/backends/native_backend.py +902 -0
  35. openspeech/local_engines/base.py +30 -0
  36. openspeech/local_engines/engines/__init__.py +1 -0
  37. openspeech/local_engines/engines/faster_whisper.py +36 -0
  38. openspeech/local_engines/engines/fish_speech.py +33 -0
  39. openspeech/local_engines/engines/sherpa_onnx.py +56 -0
  40. openspeech/local_engines/engines/whisper.py +41 -0
  41. openspeech/local_engines/engines/whisperlivekit.py +60 -0
  42. openspeech/local_engines/manager.py +208 -0
  43. openspeech/local_engines/models.py +50 -0
  44. openspeech/local_engines/progress.py +69 -0
  45. openspeech/local_engines/registry.py +19 -0
  46. openspeech/local_engines/task_store.py +52 -0
  47. openspeech/local_engines/tasks.py +71 -0
  48. openspeech/logging_config.py +607 -0
  49. openspeech/observe/__init__.py +0 -0
  50. openspeech/observe/base.py +79 -0
  51. openspeech/observe/debug.py +44 -0
  52. openspeech/observe/latency.py +19 -0
  53. openspeech/observe/metrics.py +47 -0
  54. openspeech/observe/tracing.py +44 -0
  55. openspeech/observe/usage.py +27 -0
  56. openspeech/providers/__init__.py +0 -0
  57. openspeech/providers/_template.py +101 -0
  58. openspeech/providers/stt/__init__.py +0 -0
  59. openspeech/providers/stt/alibaba.py +86 -0
  60. openspeech/providers/stt/assemblyai.py +135 -0
  61. openspeech/providers/stt/azure_speech.py +99 -0
  62. openspeech/providers/stt/baidu.py +135 -0
  63. openspeech/providers/stt/deepgram.py +311 -0
  64. openspeech/providers/stt/elevenlabs.py +385 -0
  65. openspeech/providers/stt/faster_whisper.py +211 -0
  66. openspeech/providers/stt/google_cloud.py +106 -0
  67. openspeech/providers/stt/iflytek.py +427 -0
  68. openspeech/providers/stt/macos_speech.py +226 -0
  69. openspeech/providers/stt/openai.py +84 -0
  70. openspeech/providers/stt/sherpa_onnx.py +353 -0
  71. openspeech/providers/stt/tencent.py +212 -0
  72. openspeech/providers/stt/volcengine.py +107 -0
  73. openspeech/providers/stt/whisper.py +153 -0
  74. openspeech/providers/stt/whisperlivekit.py +530 -0
  75. openspeech/providers/stt/windows_speech.py +249 -0
  76. openspeech/providers/tts/__init__.py +0 -0
  77. openspeech/providers/tts/alibaba.py +95 -0
  78. openspeech/providers/tts/azure_speech.py +123 -0
  79. openspeech/providers/tts/baidu.py +143 -0
  80. openspeech/providers/tts/coqui.py +64 -0
  81. openspeech/providers/tts/cosyvoice.py +90 -0
  82. openspeech/providers/tts/deepgram.py +174 -0
  83. openspeech/providers/tts/elevenlabs.py +311 -0
  84. openspeech/providers/tts/fish_speech.py +158 -0
  85. openspeech/providers/tts/google_cloud.py +107 -0
  86. openspeech/providers/tts/iflytek.py +209 -0
  87. openspeech/providers/tts/macos_say.py +251 -0
  88. openspeech/providers/tts/minimax.py +122 -0
  89. openspeech/providers/tts/openai.py +104 -0
  90. openspeech/providers/tts/piper.py +104 -0
  91. openspeech/providers/tts/tencent.py +189 -0
  92. openspeech/providers/tts/volcengine.py +117 -0
  93. openspeech/providers/tts/windows_sapi.py +234 -0
  94. openspeech/server/__init__.py +1 -0
  95. openspeech/server/app.py +72 -0
  96. openspeech/server/auth.py +42 -0
  97. openspeech/server/middleware.py +75 -0
  98. openspeech/server/routes/__init__.py +1 -0
  99. openspeech/server/routes/management.py +848 -0
  100. openspeech/server/routes/stt.py +121 -0
  101. openspeech/server/routes/tts.py +159 -0
  102. openspeech/server/routes/webui.py +29 -0
  103. openspeech/server/webui/app.js +2649 -0
  104. openspeech/server/webui/index.html +216 -0
  105. openspeech/server/webui/styles.css +617 -0
  106. openspeech/server/ws/__init__.py +1 -0
  107. openspeech/server/ws/stt_stream.py +263 -0
  108. openspeech/server/ws/tts_stream.py +207 -0
  109. openspeech/telemetry/__init__.py +21 -0
  110. openspeech/telemetry/perf.py +307 -0
  111. openspeech/utils/__init__.py +5 -0
  112. openspeech/utils/audio_converter.py +406 -0
  113. openspeech/utils/audio_playback.py +156 -0
  114. openspeech/vendor_registry.yaml +74 -0
  115. openspeechapi-0.1.0.dist-info/METADATA +101 -0
  116. openspeechapi-0.1.0.dist-info/RECORD +118 -0
  117. openspeechapi-0.1.0.dist-info/WHEEL +4 -0
  118. openspeechapi-0.1.0.dist-info/entry_points.txt +3 -0
openspeech/demo.py ADDED
@@ -0,0 +1,675 @@
1
+ """OpenSpeech interactive demo CLI."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import asyncio
6
+ import io
7
+ import os
8
+ import shlex
9
+ import struct
10
+ import sys
11
+ import time
12
+ import wave
13
+ from pathlib import Path
14
+ from typing import Any
15
+
16
# ANSI color codes — no extra deps needed
# Each constant is a raw escape sequence; _c() concatenates them in front of
# the text and appends _RESET afterwards.
_RESET = "\033[0m"  # clears every attribute set by the codes below
_BOLD = "\033[1m"
_GREEN = "\033[32m"
_CYAN = "\033[36m"
_YELLOW = "\033[33m"
_RED = "\033[31m"
_DIM = "\033[2m"
24
+
25
+
26
+ def _c(text: str, *codes: str) -> str:
27
+ """Apply ANSI color codes if stdout is a TTY."""
28
+ if not sys.stdout.isatty():
29
+ return text
30
+ return "".join(codes) + text + _RESET
31
+
32
+
33
+ def _load_dotenv() -> None:
34
+ """Try to load .env file."""
35
+ try:
36
+ from dotenv import load_dotenv # type: ignore[import]
37
+
38
+ load_dotenv()
39
+ except ImportError:
40
+ pass
41
+
42
+
43
def _build_registry() -> Any:
    """Create a ProviderRegistry holding the providers this demo can use.

    Imports are done lazily inside the function so that merely importing
    this module does not import the provider modules.
    """
    from openspeech.core.registry import ProviderRegistry
    from openspeech.providers.stt.faster_whisper import FasterWhisperSTT
    from openspeech.providers.stt.openai import OpenAISTT
    from openspeech.providers.tts.openai import OpenAITTS

    # (registry key, provider class) — registered in this order.
    entries = (
        ("openai", OpenAISTT),
        ("faster-whisper", FasterWhisperSTT),
        ("openai-tts", OpenAITTS),
    )

    registry = ProviderRegistry()
    for key, provider_cls in entries:
        registry.register(key, provider_cls)
    return registry
55
+
56
+
57
def _write_audio_file(path: str, audio: Any) -> None:
    """Write ``audio.data`` to *path*, adding a WAV header when required.

    A WAV header (16-bit samples, ``audio.channels``, ``audio.sample_rate``)
    is added only when the target has a ``.wav`` suffix and the payload is
    considered raw PCM.  The payload counts as raw PCM when its format is
    ``PCM_16K``/``PCM_44K`` or when it is longer than four bytes and does
    not start with ``b"RIFF"``.  In every other case the bytes are written
    unchanged.

    NOTE(review): the RIFF check means any non-RIFF payload (not only PCM)
    written to a ``.wav`` path gets wrapped — confirm this is intended.
    """
    from openspeech.core.enums import AudioFormat

    target = Path(path)
    suffix = target.suffix.lower()
    payload = audio.data

    tagged_as_pcm = audio.format in (AudioFormat.PCM_16K, AudioFormat.PCM_44K)
    missing_riff_magic = len(payload) > 4 and payload[:4] != b"RIFF"
    needs_header = suffix == ".wav" and (tagged_as_pcm or missing_riff_magic)

    if not needs_header:
        target.write_bytes(payload)
        return

    wav_buffer = io.BytesIO()
    with wave.open(wav_buffer, "wb") as wav_out:
        wav_out.setnchannels(audio.channels)
        wav_out.setsampwidth(2)  # 16-bit
        wav_out.setframerate(audio.sample_rate)
        wav_out.writeframes(payload)
    target.write_bytes(wav_buffer.getvalue())
79
+
80
+
81
def _read_audio(path: str) -> Any:
    """Read an audio file from disk into an ``AudioData`` object.

    The format is chosen from the file extension (``.wav``, ``.mp3``,
    ``.ogg``, ``.flac``, ``.opus``); unknown extensions are treated as WAV.

    For WAV content the sample rate and channel count are taken from the
    file header.  If the header cannot be parsed, or the file is not WAV,
    the previous defaults of 16000 Hz mono are used.

    Prints an error to stderr and exits with status 1 when *path* does not
    exist.
    """
    from openspeech.core.enums import AudioFormat
    from openspeech.core.models import AudioData

    p = Path(path)
    if not p.exists():
        print(_c(f"Error: File not found: {path}", _RED), file=sys.stderr)
        sys.exit(1)

    data = p.read_bytes()
    fmt_map = {
        ".wav": AudioFormat.WAV,
        ".mp3": AudioFormat.MP3,
        ".ogg": AudioFormat.OGG,
        ".flac": AudioFormat.FLAC,
        ".opus": AudioFormat.OPUS,
    }
    fmt = fmt_map.get(p.suffix.lower(), AudioFormat.WAV)

    # Defaults used whenever the real parameters cannot be determined.
    sample_rate, channels = 16000, 1
    if fmt == AudioFormat.WAV:
        # Describe the audio with the values stored in the WAV header
        # instead of assuming every file is 16 kHz mono.
        try:
            with wave.open(io.BytesIO(data), "rb") as wf:
                sample_rate = wf.getframerate()
                channels = wf.getnchannels()
        except (wave.Error, EOFError):
            # Not a PCM WAV the stdlib can parse (or truncated): keep defaults.
            pass

    return AudioData(data=data, sample_rate=sample_rate, channels=channels, format=fmt)
102
+
103
+
104
+ def _check_openai_key() -> str:
105
+ """Return OPENAI_API_KEY or print error and exit."""
106
+ key = os.environ.get("OPENAI_API_KEY", "")
107
+ if not key:
108
+ print(
109
+ _c("Error: OPENAI_API_KEY environment variable is not set.", _RED),
110
+ file=sys.stderr,
111
+ )
112
+ print(
113
+ " Set it with: export OPENAI_API_KEY=sk-...",
114
+ file=sys.stderr,
115
+ )
116
+ sys.exit(1)
117
+ return key
118
+
119
+
120
def _print_stt_result(result: Any, elapsed: float, show_words: bool = False) -> None:
    """Print a transcription result as a small report on stdout.

    Shows text, language, confidence and elapsed time.  Missing language or
    confidence is shown as ``N/A``.  When *show_words* is true and the
    result carries word entries, one line per word with its start/end
    milliseconds and confidence is printed as well.
    """
    print()
    print(_c("STT Result", _BOLD + _CYAN))
    print(_c("─" * 50, _DIM))
    print(f" {_c('Text:', _BOLD)} {result.text}")

    language = result.language or "N/A"
    print(f" {_c('Language:', _BOLD)} {language}")

    if result.confidence is None:
        confidence = "N/A"
    else:
        confidence = f"{result.confidence:.3f}"
    print(f" {_c('Confidence:', _BOLD)} {confidence}")
    print(f" {_c('Elapsed:', _BOLD)} {elapsed:.2f}s")

    if show_words and result.words:
        print()
        print(_c(" Word timestamps:", _BOLD))
        for word in result.words:
            if word.confidence is None:
                word_conf = "N/A"
            else:
                word_conf = f"{word.confidence:.2f}"
            timing = f" [{word.start_ms:>6}ms - {word.end_ms:>6}ms] "
            print(timing + f"{word.text:<20} conf={word_conf}")
    print()
142
+
143
+
144
def _print_tts_result(out_path: str, audio: Any, elapsed: float) -> None:
    """Print a short report about a finished synthesis.

    Lists the output path, the size of ``audio.data`` in bytes, the sample
    rate and the elapsed time.
    """
    rows = (
        ("Output:", out_path),
        ("File size:", f"{len(audio.data):,} bytes"),
        ("Sample rate:", f"{audio.sample_rate} Hz"),
        ("Elapsed:", f"{elapsed:.2f}s"),
    )

    print()
    print(_c("TTS Result", _BOLD + _CYAN))
    print(_c("─" * 50, _DIM))
    for label, value in rows:
        print(f" {_c(label, _BOLD)} {value}")
    print()
155
+
156
+
157
async def _make_stt_provider(provider_name: str, args: Any) -> Any:
    """Look up, configure and start the STT provider called *provider_name*.

    ``faster-whisper`` is configured from ``args.model_size`` / ``args.device``
    (defaults ``"base"`` / ``"auto"``); ``openai`` / ``openai-stt`` is
    configured with the API key from the environment.  Any other name is
    constructed with ``None`` as settings.  The started provider is
    returned; the caller is responsible for stopping it.
    """
    provider_cls = _build_registry().get(provider_name)

    if provider_name == "faster-whisper":
        from openspeech.providers.stt.faster_whisper import FasterWhisperSTTSettings

        settings: Any = FasterWhisperSTTSettings(
            model_size=getattr(args, "model_size", None) or "base",
            device=getattr(args, "device", None) or "auto",
        )
    elif provider_name in ("openai", "openai-stt"):
        from openspeech.providers.stt.openai import OpenAISTTSettings

        settings = OpenAISTTSettings(api_key=_check_openai_key())
    else:
        settings = None

    instance = provider_cls(settings)
    await instance.start()
    return instance
178
+
179
+
180
async def _make_tts_provider(provider_name: str, args: Any) -> Any:
    """Look up, configure and start the TTS provider called *provider_name*.

    ``openai-tts`` is configured with the API key from the environment plus
    ``args.voice`` / ``args.model`` (defaults ``"alloy"`` / ``"tts-1"``).
    Any other name is constructed with ``None`` as settings.  The started
    provider is returned; the caller is responsible for stopping it.
    """
    provider_cls = _build_registry().get(provider_name)

    if provider_name == "openai-tts":
        from openspeech.providers.tts.openai import OpenAITTSSettings

        settings: Any = OpenAITTSSettings(
            api_key=_check_openai_key(),
            voice=getattr(args, "voice", None) or "alloy",
            model=getattr(args, "model", None) or "tts-1",
        )
    else:
        settings = None

    instance = provider_cls(settings)
    await instance.start()
    return instance
197
+
198
+
199
async def _cmd_stt(args: Any) -> None:
    """Transcribe ``args.input`` with the selected STT provider and print it.

    The provider defaults to ``faster-whisper``.  Only the ``transcribe``
    call is timed; the provider is always stopped afterwards, even when
    transcription fails.
    """
    chosen = getattr(args, "provider", None) or "faster-whisper"
    audio = _read_audio(args.input)

    print(_c(f"Transcribing with provider: {chosen} ...", _DIM))
    stt = await _make_stt_provider(chosen, args)
    try:
        started = time.perf_counter()
        transcription = await stt.transcribe(audio)
        elapsed = time.perf_counter() - started
    finally:
        await stt.stop()

    _print_stt_result(
        transcription,
        elapsed,
        show_words=getattr(args, "words", False),
    )
215
+
216
+
217
async def _cmd_tts(args: Any) -> None:
    """Synthesize ``args.text``, write it to a file and optionally play it.

    Provider defaults to ``openai-tts`` and the output path to
    ``output.wav``.  Only the ``synthesize`` call is timed; the provider is
    always stopped afterwards.  Playback happens only when ``args.play`` is
    set and uses the ``play_*`` options from *args*.
    """
    chosen = getattr(args, "provider", None) or "openai-tts"
    destination = getattr(args, "output", None) or "output.wav"
    text = args.text

    print(_c(f"Synthesizing with provider: {chosen} ...", _DIM))
    tts = await _make_tts_provider(chosen, args)
    try:
        started = time.perf_counter()
        audio = await tts.synthesize(text)
        elapsed = time.perf_counter() - started
    finally:
        await tts.stop()

    _write_audio_file(destination, audio)
    _print_tts_result(destination, audio, elapsed)

    if not getattr(args, "play", False):
        return

    # Imported lazily: playback is only needed when --play was requested.
    from openspeech.utils.audio_playback import play_audio

    play_audio(
        audio,
        device=getattr(args, "play_device", None),
        volume=float(getattr(args, "play_volume", 1.0)),
        blocking=not getattr(args, "play_non_blocking", False),
        backend=getattr(args, "play_backend", "auto"),
    )
244
+
245
+
246
async def _cmd_roundtrip(args: Any) -> None:
    """Synthesize ``args.text`` and feed the audio straight back into STT.

    Defaults: TTS provider ``openai-tts``, STT provider ``faster-whisper``.
    The synthesized audio is optionally played (``args.play``) before it is
    transcribed.  Each provider is stopped as soon as its step finishes.
    Finally the original and transcribed text are printed side by side
    together with the summed synthesis + transcription time.
    """
    tts_name = getattr(args, "tts", None) or "openai-tts"
    stt_name = getattr(args, "stt", None) or "faster-whisper"
    text = args.text

    print()
    print(_c("Roundtrip Test", _BOLD + _CYAN))
    print(_c("─" * 50, _DIM))
    intro_rows = (
        ("Original text:", text),
        ("TTS provider:", tts_name),
        ("STT provider:", stt_name),
    )
    for label, value in intro_rows:
        print(f" {_c(label, _BOLD)} {value}")
    print()

    # Step 1: TTS
    print(_c(f"Step 1: Synthesizing with {tts_name} ...", _DIM))
    tts = await _make_tts_provider(tts_name, args)
    try:
        started = time.perf_counter()
        audio = await tts.synthesize(text)
        tts_elapsed = time.perf_counter() - started
    finally:
        await tts.stop()

    if getattr(args, "play", False):
        from openspeech.utils.audio_playback import play_audio

        play_audio(
            audio,
            device=getattr(args, "play_device", None),
            volume=float(getattr(args, "play_volume", 1.0)),
            blocking=not getattr(args, "play_non_blocking", False),
            backend=getattr(args, "play_backend", "auto"),
        )

    audio_summary = (
        f" {_c('Audio:', _BOLD)} {len(audio.data):,} bytes "
        f"@ {audio.sample_rate} Hz ({tts_elapsed:.2f}s)"
    )
    print(audio_summary)

    # Step 2: STT
    print(_c(f"Step 2: Transcribing with {stt_name} ...", _DIM))
    stt = await _make_stt_provider(stt_name, args)
    try:
        started = time.perf_counter()
        result = await stt.transcribe(audio)
        stt_elapsed = time.perf_counter() - started
    finally:
        await stt.stop()

    print(f" {_c('Transcribed:', _BOLD)} {result.text}")
    if result.confidence is None:
        conf = "N/A"
    else:
        conf = f"{result.confidence:.3f}"
    print(f" {_c('Confidence:', _BOLD)} {conf} ({stt_elapsed:.2f}s)")
    print()

    # Summary
    total = tts_elapsed + stt_elapsed
    print(_c("Summary", _BOLD))
    print(f" Original: {text}")
    print(f" Transcribed: {result.text}")
    print(f" Total time: {total:.2f}s")
    print()
308
+
309
+
310
def _fit_cell(text: str, width: int) -> str:
    """Return *text* on one line, cut/padded to exactly *width* characters.

    Whitespace runs (including newlines) are collapsed to single spaces.
    Text that would not leave at least one trailing space is cut and marked
    with ``..`` so neighbouring columns never touch.
    """
    flat = " ".join(text.split())
    limit = max(width - 1, 2)
    if len(flat) > limit:
        flat = flat[: limit - 2] + ".."
    return f"{flat:<{width}}"


async def _cmd_compare(args: Any) -> None:
    """Transcribe one audio file with several STT providers and tabulate them.

    ``args.provider`` is a comma-separated list of provider names (default
    ``faster-whisper``); empty entries are ignored.  All providers run
    concurrently on the audio read from ``args.input``.  A provider that
    fails — including one whose setup calls ``sys.exit`` — gets an
    ``ERROR`` row instead of aborting the whole comparison.

    The reported time covers only the ``transcribe`` call, matching what
    the ``stt`` command reports.
    """
    providers_str: str = getattr(args, "provider", None) or "faster-whisper"
    # Skip blanks produced by stray commas such as "a,,b" or "a,".
    provider_names = [p.strip() for p in providers_str.split(",") if p.strip()]
    audio = _read_audio(args.input)

    if not provider_names:
        print(_c("No STT providers given to compare.", _YELLOW))
        return

    print(_c(f"Comparing {len(provider_names)} STT provider(s) ...", _DIM))

    async def _run_one(name: str) -> tuple[str, Any, float]:
        try:
            provider = await _make_stt_provider(name, args)
            try:
                # Time only the transcription, not provider shutdown.
                t0 = time.perf_counter()
                result = await provider.transcribe(audio)
                elapsed = time.perf_counter() - t0
            finally:
                await provider.stop()
            return (name, result, elapsed)
        except SystemExit as exc:
            # Setup helpers report problems via sys.exit() after printing
            # their own message; record it as a per-provider failure.
            failure = RuntimeError(f"provider setup exited with status {exc.code}")
            return (name, failure, 0.0)
        except Exception as exc:  # noqa: BLE001
            return (name, exc, 0.0)

    results = await asyncio.gather(*[_run_one(n) for n in provider_names])

    # Print comparison table
    col_prov = 18
    col_text = 35
    col_conf = 12
    col_time = 8

    header = (
        f"{'Provider':<{col_prov}}"
        f"{'Text':<{col_text}}"
        f"{'Confidence':<{col_conf}}"
        f"{'Time':>{col_time}}"
    )
    sep = "─" * (col_prov + col_text + col_conf + col_time)

    print()
    print(_c(header, _BOLD))
    print(_c(sep, _DIM))
    for name, result, elapsed in results:
        if isinstance(result, Exception):
            # Pad/truncate first, colour afterwards: escape sequences must
            # not count towards the column width.
            text_cell = _c(_fit_cell(f"ERROR: {result}", col_text), _RED)
            conf_cell = "N/A"
            time_cell = "N/A"
        else:
            text_cell = _fit_cell(result.text or "", col_text)
            conf_val = result.confidence
            conf_cell = f"{conf_val:.3f}" if conf_val is not None else "N/A"
            time_cell = f"{elapsed:.2f}s"
        print(
            f"{name:<{col_prov}}"
            f"{text_cell}"
            f"{conf_cell:<{col_conf}}"
            f"{time_cell:>{col_time}}"
        )
    print()
368
+
369
+
370
+ def _parse_repl_line(line: str) -> list[str]:
371
+ """Parse a REPL input line using shell-like splitting."""
372
+ try:
373
+ return shlex.split(line.strip())
374
+ except ValueError:
375
+ # Fallback for unbalanced quotes
376
+ return line.strip().split()
377
+
378
+
379
def _print_repl_help() -> None:
    """Print the list of commands understood by the REPL."""
    command_lines = (
        " stt <audio_file> [-p <provider>] Transcribe audio file",
        " tts <text> [-o <output.wav>] [-p <prov>] [--play] Synthesize speech",
        " roundtrip <text> [--tts <prov>] [--stt <prov>] Roundtrip test",
        " compare <audio_file> [-p p1,p2] Compare providers",
        " providers List available providers",
        " help Show this help",
        " quit / exit / q Exit",
    )

    print()
    print(_c("Available REPL commands:", _BOLD + _CYAN))
    for entry in command_lines:
        print(entry)
    print()
390
+
391
+
392
def _print_providers() -> None:
    """Print a fixed table of the providers known to this demo.

    Columns: name, type (STT/TTS), execution mode and what is needed to
    use the provider.
    """
    catalog = (
        ("faster-whisper", "STT", "subprocess", "pip install faster-whisper"),
        ("openai", "STT", "in_process", "OPENAI_API_KEY"),
        ("openai-tts", "TTS", "in_process", "OPENAI_API_KEY"),
    )

    print()
    print(_c("Available providers:", _BOLD + _CYAN))
    heading = f" {'Name':<20}{'Type':<8}{'Mode':<14}{'Requirements'}"
    print(heading)
    print(_c(" " + "─" * 56, _DIM))
    for provider, kind, exec_mode, needs in catalog:
        print(f" {provider:<20}{kind:<8}{exec_mode:<14}{needs}")
    print()
407
+
408
+
409
async def _repl_dispatch(tokens: list[str]) -> None:
    """Run the REPL command described by *tokens*.

    ``tokens[0]`` (case-insensitive) selects the command; the remaining
    tokens are parsed with a per-command ``argparse`` parser.  When
    argument parsing fails the command is skipped.  ``quit`` / ``exit`` /
    ``q`` end the process with status 0.  Unknown commands print a hint.
    """
    if not tokens:
        return

    command = tokens[0].lower()
    rest = tokens[1:]

    if command in ("quit", "exit", "q"):
        print(_c("Goodbye!", _GREEN))
        sys.exit(0)

    if command == "help":
        _print_repl_help()
        return

    if command == "providers":
        _print_providers()
        return

    if command == "stt":
        if not rest:
            print(_c("Usage: stt <audio_file> [-p <provider>]", _YELLOW))
            return
        stt_parser = argparse.ArgumentParser(prog="stt", add_help=False)
        stt_parser.add_argument("input")
        stt_parser.add_argument("-p", "--provider", default="faster-whisper")
        stt_parser.add_argument("--model-size", default="base")
        stt_parser.add_argument("--device", default="auto")
        stt_parser.add_argument("--words", action="store_true")
        try:
            namespace = stt_parser.parse_args(rest)
        except SystemExit:
            # argparse exits on bad arguments; stay in the REPL instead.
            return
        await _cmd_stt(namespace)
        return

    if command == "tts":
        if not rest:
            print(_c("Usage: tts <text> [-o output.wav] [-p <provider>]", _YELLOW))
            return
        tts_parser = argparse.ArgumentParser(prog="tts", add_help=False)
        tts_parser.add_argument("text")
        tts_parser.add_argument("-o", "--output", default="output.wav")
        tts_parser.add_argument("-p", "--provider", default="openai-tts")
        tts_parser.add_argument("--voice", default="alloy")
        tts_parser.add_argument("--model", default="tts-1")
        tts_parser.add_argument("--play", action="store_true")
        tts_parser.add_argument("--play-device", default=None)
        tts_parser.add_argument("--play-volume", type=float, default=1.0)
        tts_parser.add_argument("--play-backend", default="auto")
        tts_parser.add_argument("--play-non-blocking", action="store_true")
        try:
            namespace = tts_parser.parse_args(rest)
        except SystemExit:
            return
        await _cmd_tts(namespace)
        return

    if command == "roundtrip":
        if not rest:
            print(_c("Usage: roundtrip <text> [--tts <provider>] [--stt <provider>]", _YELLOW))
            return
        rt_parser = argparse.ArgumentParser(prog="roundtrip", add_help=False)
        rt_parser.add_argument("text")
        rt_parser.add_argument("--tts", default="openai-tts")
        rt_parser.add_argument("--stt", default="faster-whisper")
        rt_parser.add_argument("--voice", default="alloy")
        rt_parser.add_argument("--model-size", default="base")
        rt_parser.add_argument("--device", default="auto")
        rt_parser.add_argument("--play", action="store_true")
        rt_parser.add_argument("--play-device", default=None)
        rt_parser.add_argument("--play-volume", type=float, default=1.0)
        rt_parser.add_argument("--play-backend", default="auto")
        rt_parser.add_argument("--play-non-blocking", action="store_true")
        try:
            namespace = rt_parser.parse_args(rest)
        except SystemExit:
            return
        await _cmd_roundtrip(namespace)
        return

    if command == "compare":
        if not rest:
            print(_c("Usage: compare <audio_file> [-p provider1,provider2]", _YELLOW))
            return
        cmp_parser = argparse.ArgumentParser(prog="compare", add_help=False)
        cmp_parser.add_argument("input")
        cmp_parser.add_argument("-p", "--provider", default="faster-whisper")
        cmp_parser.add_argument("--model-size", default="base")
        cmp_parser.add_argument("--device", default="auto")
        try:
            namespace = cmp_parser.parse_args(rest)
        except SystemExit:
            return
        await _cmd_compare(namespace)
        return

    print(_c(f"Unknown command: {command}. Type 'help' for available commands.", _YELLOW))
502
+
503
+
504
async def _repl_loop() -> None:
    """Run the interactive prompt until the user quits.

    Prints a banner and the command help, then repeatedly reads a line,
    tokenizes it and dispatches it.  EOF or Ctrl-C at the prompt ends the
    loop.  Errors raised by a command are printed and the loop continues.

    Helper functions used by the commands report fatal problems (missing
    input file, missing API key) by printing a message and calling
    ``sys.exit(1)``.  Inside the REPL such a non-zero exit only aborts the
    current command; the status-0 exit raised by ``quit`` still terminates
    the process.
    """
    print()
    print(_c("OpenSpeech Interactive Demo", _BOLD + _GREEN))
    print(_c("═" * 40, _DIM))
    _print_repl_help()
    print(_c("Type 'help' for commands, 'quit' to exit.", _DIM))
    print()

    while True:
        try:
            line = input(_c("> ", _BOLD + _GREEN))
        except (EOFError, KeyboardInterrupt):
            print()
            print(_c("Goodbye!", _GREEN))
            break

        tokens = _parse_repl_line(line)
        if not tokens:
            continue

        try:
            await _repl_dispatch(tokens)
        except KeyboardInterrupt:
            print(_c("\nInterrupted.", _YELLOW))
        except SystemExit as exc:
            # A clean exit (quit/exit/q) must end the session.  Anything
            # else is a command failure whose message has already been
            # printed by the helper that raised it, so keep the REPL alive.
            if exc.code in (None, 0):
                raise
        except Exception as exc:  # noqa: BLE001
            print(_c(f"Error: {exc}", _RED))
531
+
532
+
533
def _cmd_repl(_args: Any) -> None:
    """Start the interactive REPL; the parsed CLI arguments are not used."""
    session = _repl_loop()
    asyncio.run(session)
536
+
537
+
538
def main() -> None:
    """Command-line entry point for the demo.

    Loads an optional ``.env`` file, builds the ``openspeech-demo`` argument
    parser with the subcommands ``stt``, ``tts``, ``roundtrip``, ``compare``
    and ``repl``, then runs the handler attached to the chosen subcommand.
    A subcommand is mandatory.
    """
    _load_dotenv()

    parser = argparse.ArgumentParser(
        prog="openspeech-demo",
        description="OpenSpeech interactive demo — STT, TTS, roundtrip, and compare.",
    )
    subparsers = parser.add_subparsers(dest="command", metavar="<command>")
    subparsers.required = True

    # ── stt ──────────────────────────────────────────────────────────────────
    stt_p = subparsers.add_parser("stt", help="Transcribe audio file to text")
    stt_p.add_argument("-i", "--input", required=True, metavar="FILE", help="Input audio file")
    stt_p.add_argument(
        "-p", "--provider", default="faster-whisper",
        metavar="PROVIDER", help="STT provider (default: faster-whisper)"
    )
    stt_p.add_argument(
        "--model-size", default="base",
        metavar="SIZE", help="Model size for faster-whisper (default: base)"
    )
    stt_p.add_argument(
        "--device", default="auto",
        metavar="DEVICE", help="Device for faster-whisper (default: auto)"
    )
    stt_p.add_argument(
        "--words", action="store_true",
        help="Show word-level timestamps"
    )
    # Each async command gets its own event loop via asyncio.run().
    stt_p.set_defaults(func=lambda a: asyncio.run(_cmd_stt(a)))

    # ── tts ──────────────────────────────────────────────────────────────────
    tts_p = subparsers.add_parser("tts", help="Synthesize text to audio file")
    tts_p.add_argument("-t", "--text", required=True, metavar="TEXT", help="Input text")
    tts_p.add_argument(
        "-o", "--output", default="output.wav",
        metavar="FILE", help="Output audio file (default: output.wav)"
    )
    tts_p.add_argument(
        "-p", "--provider", default="openai-tts",
        metavar="PROVIDER", help="TTS provider (default: openai-tts)"
    )
    tts_p.add_argument(
        "--voice", default="alloy",
        metavar="VOICE", help="Voice for openai-tts (default: alloy)"
    )
    tts_p.add_argument(
        "--model", default="tts-1",
        metavar="MODEL", help="Model for openai-tts (default: tts-1)"
    )
    tts_p.add_argument("--play", action="store_true", help="Play audio after synthesis")
    tts_p.add_argument("--play-device", default=None, metavar="DEVICE", help="Playback device")
    tts_p.add_argument(
        "--play-volume", type=float, default=1.0, metavar="VOLUME", help="Playback volume"
    )
    tts_p.add_argument(
        "--play-backend",
        default="auto",
        choices=["auto", "sounddevice", "external"],
        metavar="BACKEND",
        help="Playback backend",
    )
    tts_p.add_argument(
        "--play-non-blocking",
        action="store_true",
        help="Do not block while playing audio",
    )
    tts_p.set_defaults(func=lambda a: asyncio.run(_cmd_tts(a)))

    # ── roundtrip ─────────────────────────────────────────────────────────────
    rt_p = subparsers.add_parser("roundtrip", help="TTS -> STT roundtrip test")
    rt_p.add_argument("-t", "--text", required=True, metavar="TEXT", help="Input text")
    rt_p.add_argument(
        "--tts", default="openai-tts",
        metavar="PROVIDER", help="TTS provider (default: openai-tts)"
    )
    rt_p.add_argument(
        "--stt", default="faster-whisper",
        metavar="PROVIDER", help="STT provider (default: faster-whisper)"
    )
    rt_p.add_argument(
        "--voice", default="alloy",
        metavar="VOICE", help="Voice for openai-tts (default: alloy)"
    )
    rt_p.add_argument(
        "--model-size", default="base",
        metavar="SIZE", help="Model size for faster-whisper (default: base)"
    )
    rt_p.add_argument(
        "--device", default="auto",
        metavar="DEVICE", help="Device for faster-whisper (default: auto)"
    )
    rt_p.add_argument("--play", action="store_true", help="Play synthesized audio before STT")
    rt_p.add_argument("--play-device", default=None, metavar="DEVICE", help="Playback device")
    rt_p.add_argument(
        "--play-volume", type=float, default=1.0, metavar="VOLUME", help="Playback volume"
    )
    rt_p.add_argument(
        "--play-backend",
        default="auto",
        choices=["auto", "sounddevice", "external"],
        metavar="BACKEND",
        help="Playback backend",
    )
    rt_p.add_argument(
        "--play-non-blocking",
        action="store_true",
        help="Do not block while playing audio",
    )
    rt_p.set_defaults(func=lambda a: asyncio.run(_cmd_roundtrip(a)))

    # ── compare ───────────────────────────────────────────────────────────────
    cmp_p = subparsers.add_parser("compare", help="Compare multiple STT providers")
    cmp_p.add_argument("-i", "--input", required=True, metavar="FILE", help="Input audio file")
    cmp_p.add_argument(
        "-p", "--provider", default="faster-whisper",
        metavar="PROVIDERS", help="Comma-separated provider list (default: faster-whisper)"
    )
    cmp_p.add_argument(
        "--model-size", default="base",
        metavar="SIZE", help="Model size for faster-whisper (default: base)"
    )
    cmp_p.add_argument(
        "--device", default="auto",
        metavar="DEVICE", help="Device for faster-whisper (default: auto)"
    )
    cmp_p.set_defaults(func=lambda a: asyncio.run(_cmd_compare(a)))

    # ── repl ──────────────────────────────────────────────────────────────────
    repl_p = subparsers.add_parser("repl", help="Interactive REPL mode")
    # _cmd_repl is synchronous and starts its own event loop.
    repl_p.set_defaults(func=_cmd_repl)

    args = parser.parse_args()
    # Every subparser stored its handler under "func" via set_defaults().
    args.func(args)
672
+
673
+
674
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
File without changes