openspeechapi 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118):
  1. openspeech/__init__.py +75 -0
  2. openspeech/__main__.py +5 -0
  3. openspeech/cli.py +413 -0
  4. openspeech/client/__init__.py +4 -0
  5. openspeech/client/client.py +145 -0
  6. openspeech/config.py +212 -0
  7. openspeech/core/__init__.py +0 -0
  8. openspeech/core/base.py +75 -0
  9. openspeech/core/enums.py +39 -0
  10. openspeech/core/models.py +61 -0
  11. openspeech/core/registry.py +37 -0
  12. openspeech/core/settings.py +8 -0
  13. openspeech/demo.py +675 -0
  14. openspeech/dispatch/__init__.py +0 -0
  15. openspeech/dispatch/context.py +34 -0
  16. openspeech/dispatch/dispatcher.py +661 -0
  17. openspeech/dispatch/executors/__init__.py +0 -0
  18. openspeech/dispatch/executors/base.py +34 -0
  19. openspeech/dispatch/executors/in_process.py +66 -0
  20. openspeech/dispatch/executors/remote.py +64 -0
  21. openspeech/dispatch/executors/subprocess_exec.py +446 -0
  22. openspeech/dispatch/fanout.py +95 -0
  23. openspeech/dispatch/filters.py +73 -0
  24. openspeech/dispatch/lifecycle.py +178 -0
  25. openspeech/dispatch/watcher.py +82 -0
  26. openspeech/engine_catalog.py +236 -0
  27. openspeech/engine_registry.yaml +347 -0
  28. openspeech/exceptions.py +51 -0
  29. openspeech/factory.py +325 -0
  30. openspeech/local_engines/__init__.py +12 -0
  31. openspeech/local_engines/aim_resolver.py +91 -0
  32. openspeech/local_engines/backends/__init__.py +1 -0
  33. openspeech/local_engines/backends/docker_backend.py +490 -0
  34. openspeech/local_engines/backends/native_backend.py +902 -0
  35. openspeech/local_engines/base.py +30 -0
  36. openspeech/local_engines/engines/__init__.py +1 -0
  37. openspeech/local_engines/engines/faster_whisper.py +36 -0
  38. openspeech/local_engines/engines/fish_speech.py +33 -0
  39. openspeech/local_engines/engines/sherpa_onnx.py +56 -0
  40. openspeech/local_engines/engines/whisper.py +41 -0
  41. openspeech/local_engines/engines/whisperlivekit.py +60 -0
  42. openspeech/local_engines/manager.py +208 -0
  43. openspeech/local_engines/models.py +50 -0
  44. openspeech/local_engines/progress.py +69 -0
  45. openspeech/local_engines/registry.py +19 -0
  46. openspeech/local_engines/task_store.py +52 -0
  47. openspeech/local_engines/tasks.py +71 -0
  48. openspeech/logging_config.py +607 -0
  49. openspeech/observe/__init__.py +0 -0
  50. openspeech/observe/base.py +79 -0
  51. openspeech/observe/debug.py +44 -0
  52. openspeech/observe/latency.py +19 -0
  53. openspeech/observe/metrics.py +47 -0
  54. openspeech/observe/tracing.py +44 -0
  55. openspeech/observe/usage.py +27 -0
  56. openspeech/providers/__init__.py +0 -0
  57. openspeech/providers/_template.py +101 -0
  58. openspeech/providers/stt/__init__.py +0 -0
  59. openspeech/providers/stt/alibaba.py +86 -0
  60. openspeech/providers/stt/assemblyai.py +135 -0
  61. openspeech/providers/stt/azure_speech.py +99 -0
  62. openspeech/providers/stt/baidu.py +135 -0
  63. openspeech/providers/stt/deepgram.py +311 -0
  64. openspeech/providers/stt/elevenlabs.py +385 -0
  65. openspeech/providers/stt/faster_whisper.py +211 -0
  66. openspeech/providers/stt/google_cloud.py +106 -0
  67. openspeech/providers/stt/iflytek.py +427 -0
  68. openspeech/providers/stt/macos_speech.py +226 -0
  69. openspeech/providers/stt/openai.py +84 -0
  70. openspeech/providers/stt/sherpa_onnx.py +353 -0
  71. openspeech/providers/stt/tencent.py +212 -0
  72. openspeech/providers/stt/volcengine.py +107 -0
  73. openspeech/providers/stt/whisper.py +153 -0
  74. openspeech/providers/stt/whisperlivekit.py +530 -0
  75. openspeech/providers/stt/windows_speech.py +249 -0
  76. openspeech/providers/tts/__init__.py +0 -0
  77. openspeech/providers/tts/alibaba.py +95 -0
  78. openspeech/providers/tts/azure_speech.py +123 -0
  79. openspeech/providers/tts/baidu.py +143 -0
  80. openspeech/providers/tts/coqui.py +64 -0
  81. openspeech/providers/tts/cosyvoice.py +90 -0
  82. openspeech/providers/tts/deepgram.py +174 -0
  83. openspeech/providers/tts/elevenlabs.py +311 -0
  84. openspeech/providers/tts/fish_speech.py +158 -0
  85. openspeech/providers/tts/google_cloud.py +107 -0
  86. openspeech/providers/tts/iflytek.py +209 -0
  87. openspeech/providers/tts/macos_say.py +251 -0
  88. openspeech/providers/tts/minimax.py +122 -0
  89. openspeech/providers/tts/openai.py +104 -0
  90. openspeech/providers/tts/piper.py +104 -0
  91. openspeech/providers/tts/tencent.py +189 -0
  92. openspeech/providers/tts/volcengine.py +117 -0
  93. openspeech/providers/tts/windows_sapi.py +234 -0
  94. openspeech/server/__init__.py +1 -0
  95. openspeech/server/app.py +72 -0
  96. openspeech/server/auth.py +42 -0
  97. openspeech/server/middleware.py +75 -0
  98. openspeech/server/routes/__init__.py +1 -0
  99. openspeech/server/routes/management.py +848 -0
  100. openspeech/server/routes/stt.py +121 -0
  101. openspeech/server/routes/tts.py +159 -0
  102. openspeech/server/routes/webui.py +29 -0
  103. openspeech/server/webui/app.js +2649 -0
  104. openspeech/server/webui/index.html +216 -0
  105. openspeech/server/webui/styles.css +617 -0
  106. openspeech/server/ws/__init__.py +1 -0
  107. openspeech/server/ws/stt_stream.py +263 -0
  108. openspeech/server/ws/tts_stream.py +207 -0
  109. openspeech/telemetry/__init__.py +21 -0
  110. openspeech/telemetry/perf.py +307 -0
  111. openspeech/utils/__init__.py +5 -0
  112. openspeech/utils/audio_converter.py +406 -0
  113. openspeech/utils/audio_playback.py +156 -0
  114. openspeech/vendor_registry.yaml +74 -0
  115. openspeechapi-0.1.0.dist-info/METADATA +101 -0
  116. openspeechapi-0.1.0.dist-info/RECORD +118 -0
  117. openspeechapi-0.1.0.dist-info/WHEEL +4 -0
  118. openspeechapi-0.1.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,406 @@
1
+ """Unified audio format conversion utilities for OpenSpeech.
2
+
3
+ Centralises format detection, PCM/WAV/AIFF conversions (stdlib only) and
4
+ MP3/OGG/FLAC/OPUS conversions (via ffmpeg subprocess).
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import io
9
+ import shutil
10
+ import struct
11
+ import subprocess
12
+ import wave
13
+
14
+ from openspeech.core.enums import AudioFormat
15
+ from openspeech.core.models import AudioData
16
+
17
+
18
class AudioConverter:
    """Stateless audio conversion helper -- all public methods are static.

    Raw PCM handled by this class is 16-bit signed little-endian.  WAV and
    AIFF are decoded with the standard library only; MP3/OGG/FLAC/OPUS are
    delegated to an ``ffmpeg`` subprocess.
    """

    # -- format detection -----------------------------------------------------

    @staticmethod
    def detect_format(data: bytes) -> AudioFormat:
        """Detect audio format from file header magic bytes.

        Falls back to ``AudioFormat.PCM_16K`` when the header is not
        recognised (raw PCM has no header).
        """
        if len(data) >= 12 and data[:4] == b"RIFF" and data[8:12] == b"WAVE":
            return AudioFormat.WAV
        if len(data) >= 12 and data[:4] == b"FORM" and data[8:12] in (b"AIFF", b"AIFC"):
            return AudioFormat.AIFF
        if len(data) >= 4 and data[:4] == b"fLaC":
            return AudioFormat.FLAC
        if len(data) >= 4 and data[:4] == b"OggS":
            return AudioFormat.OGG
        # MP3: ID3 tag or MPEG frame sync (11 set bits)
        if len(data) >= 3 and data[:3] == b"ID3":
            return AudioFormat.MP3
        if len(data) >= 2 and data[0] == 0xFF and (data[1] & 0xE0) == 0xE0:
            return AudioFormat.MP3
        return AudioFormat.PCM_16K

    # -- ffmpeg helpers -------------------------------------------------------

    @staticmethod
    def ffmpeg_available() -> bool:
        """Return *True* if ffmpeg is found on ``$PATH``."""
        return shutil.which("ffmpeg") is not None

    @staticmethod
    def _require_ffmpeg() -> None:
        """Raise ``RuntimeError`` when ffmpeg cannot be located."""
        if not AudioConverter.ffmpeg_available():
            raise RuntimeError(
                "ffmpeg is required for this conversion but was not found on $PATH"
            )

    @staticmethod
    def _ffmpeg_convert(
        input_data: bytes,
        input_format: str,
        output_format: str,
        sample_rate: int | None = None,
        channels: int | None = None,
    ) -> bytes:
        """Run ffmpeg with stdin/stdout pipes (no temp files).

        ``input_format`` / ``output_format`` are passed to ffmpeg's ``-f``
        option; ``sample_rate`` and ``channels`` map to ``-ar`` / ``-ac`` and
        are omitted when ``None``.

        Raises:
            RuntimeError: ffmpeg is missing or exits with a non-zero status
                (its stderr is included in the message).
        """
        AudioConverter._require_ffmpeg()

        cmd: list[str] = [
            "ffmpeg",
            "-y",
            "-f", input_format,
            "-i", "pipe:0",
        ]
        if sample_rate is not None:
            cmd += ["-ar", str(sample_rate)]
        if channels is not None:
            cmd += ["-ac", str(channels)]
        cmd += ["-f", output_format, "pipe:1"]

        proc = subprocess.run(
            cmd,
            input=input_data,
            capture_output=True,
            check=False,
        )
        if proc.returncode != 0:
            raise RuntimeError(f"ffmpeg failed: {proc.stderr.decode(errors='replace')}")
        return proc.stdout

    # -- PCM helpers ----------------------------------------------------------

    @staticmethod
    def _widen_to_16bit(pcm: bytes, sample_width: int) -> bytes:
        """Convert little-endian WAV frames of *sample_width* bytes to 16-bit.

        8-bit WAV samples are unsigned (bias 128); 24- and 32-bit samples are
        reduced by keeping their two most significant bytes.

        Raises:
            RuntimeError: for any other sample width.
        """
        if sample_width == 2:
            return pcm
        if sample_width == 1:
            return struct.pack(f"<{len(pcm)}h", *((b - 128) * 256 for b in pcm))
        if sample_width in (3, 4):
            usable = len(pcm) - len(pcm) % sample_width
            # Little-endian: the most significant bytes are the last two.
            return b"".join(
                pcm[i + sample_width - 2 : i + sample_width]
                for i in range(0, usable, sample_width)
            )
        raise RuntimeError(f"Unsupported WAV sample width: {sample_width * 8}-bit")

    @staticmethod
    def _pcm_to_wav_bytes(pcm: bytes, sample_rate: int, channels: int) -> bytes:
        """Wrap 16-bit PCM in a WAV container and return the file bytes."""
        buf = io.BytesIO()
        with wave.open(buf, "wb") as wf:
            wf.setnchannels(channels)
            wf.setsampwidth(2)
            wf.setframerate(sample_rate)
            wf.writeframes(pcm)
        return buf.getvalue()

    # -- PCM resampling -------------------------------------------------------

    @staticmethod
    def resample(pcm_data: bytes, src_rate: int, dst_rate: int) -> bytes:
        """Resample mono 16-bit PCM using linear interpolation.

        Returns the data unchanged when *src_rate* == *dst_rate*.  A trailing
        odd byte (incomplete sample) is ignored.

        Raises:
            ValueError: if either rate is not positive.
        """
        if src_rate == dst_rate:
            return pcm_data
        if src_rate <= 0 or dst_rate <= 0:
            raise ValueError("sample rates must be positive")

        # Unpack 16-bit signed little-endian samples, dropping a partial one.
        n_samples = len(pcm_data) // 2
        samples = struct.unpack(f"<{n_samples}h", pcm_data[: n_samples * 2])

        ratio = src_rate / dst_rate
        out_len = int(n_samples / ratio)
        out: list[int] = []
        for i in range(out_len):
            src_pos = i * ratio
            idx = int(src_pos)
            frac = src_pos - idx
            if idx + 1 < n_samples:
                val = samples[idx] * (1 - frac) + samples[idx + 1] * frac
            else:
                # Past the last interpolation pair: hold the final sample.
                val = samples[idx] if idx < n_samples else 0
            # Clamp to int16 range
            out.append(max(-32768, min(32767, int(round(val)))))

        return struct.pack(f"<{len(out)}h", *out)

    # -- channel mixing -------------------------------------------------------

    @staticmethod
    def mix_to_mono(pcm_data: bytes, channels: int) -> bytes:
        """Mix multi-channel 16-bit PCM down to mono by averaging channels.

        Data is returned unchanged when ``channels <= 1``.  Trailing bytes
        that do not form a complete frame are ignored.
        """
        if channels <= 1:
            return pcm_data

        n_samples = len(pcm_data) // 2
        samples = struct.unpack(f"<{n_samples}h", pcm_data[: n_samples * 2])

        frames = n_samples // channels
        out: list[int] = []
        for f in range(frames):
            start = f * channels
            # Floor division keeps the result an int within the int16 range.
            avg = sum(samples[start : start + channels]) // channels
            out.append(max(-32768, min(32767, avg)))

        return struct.pack(f"<{len(out)}h", *out)

    # -- AIFF support ---------------------------------------------------------

    @staticmethod
    def _parse_aiff_extended(data: bytes) -> float:
        """Parse an 80-bit IEEE 754 extended precision float (big-endian).

        Used by AIFF to encode the sample rate in the COMM chunk.  Values too
        large for a Python float, and encoded infinities, are returned as
        ``inf``; encoded NaNs as ``nan``.

        Raises:
            ValueError: if fewer than 10 bytes are supplied.
        """
        import math

        if len(data) < 10:
            raise ValueError("AIFF extended float requires 10 bytes")

        # 80-bit extended: 1 sign, 15 exponent, 64 mantissa
        biased = ((data[0] & 0x7F) << 8) | data[1]
        mantissa = int.from_bytes(data[2:10], "big")
        sign = -1.0 if data[0] & 0x80 else 1.0

        if biased == 0x7FFF:
            # All-ones exponent: infinity when the fraction bits are zero.
            if mantissa & ((1 << 63) - 1) == 0:
                return sign * float("inf")
            return float("nan")
        if mantissa == 0:
            return 0.0

        # The mantissa carries an explicit integer bit, hence the extra -63.
        try:
            return sign * math.ldexp(mantissa, biased - 16383 - 63)
        except OverflowError:
            return sign * float("inf")

    @staticmethod
    def _aiff_to_pcm(data: bytes) -> tuple[bytes, int, int]:
        """Read AIFF data and return (pcm_le_bytes, sample_rate, channels).

        AIFF stores samples as big-endian two's-complement integers; this
        converts them to little-endian 16-bit PCM.  Parses AIFF manually to
        avoid the removed ``aifc`` module (dropped in Python 3.13).

        Raises:
            RuntimeError: the data is not AIFF/AIFF-C, a required chunk is
                missing or truncated, the sample rate is invalid, or the
                compression type / sample width is unsupported.
        """
        if len(data) < 12 or data[:4] != b"FORM" or data[8:12] not in (b"AIFF", b"AIFC"):
            raise RuntimeError("Not a valid AIFF file")

        is_aifc = data[8:12] == b"AIFC"
        n_channels = 0
        samp_width = 0
        frame_rate = 0
        n_frames = 0
        raw = b""
        compression_type = b"NONE"

        pos = 12
        while pos + 8 <= len(data):
            chunk_id = data[pos : pos + 4]
            chunk_size = struct.unpack_from(">I", data, pos + 4)[0]
            chunk_data = data[pos + 8 : pos + 8 + chunk_size]

            if chunk_id == b"COMM":
                # COMM: channels(2), numFrames(4), sampleSize(2), sampleRate(10 extended)
                if len(chunk_data) < 18:
                    raise RuntimeError("AIFF COMM chunk is truncated")
                n_channels, n_frames, sample_bits = struct.unpack_from(">hIh", chunk_data, 0)
                # Samples occupy whole bytes and are left-justified, so round up.
                samp_width = (sample_bits + 7) // 8
                rate_value = AudioConverter._parse_aiff_extended(chunk_data[8:18])
                # The chained comparison is also False for NaN.
                if not 0 < rate_value < float("inf"):
                    raise RuntimeError("AIFF COMM chunk has an invalid sample rate")
                frame_rate = int(rate_value)
                # AIFF-C has compression type after the 18-byte standard COMM fields
                if is_aifc and len(chunk_data) >= 22:
                    compression_type = chunk_data[18:22]
            elif chunk_id == b"SSND":
                # SSND: offset(4), blockSize(4), then raw sample data
                if len(chunk_data) < 8:
                    raise RuntimeError("AIFF SSND chunk is truncated")
                offset = struct.unpack_from(">I", chunk_data, 0)[0]
                raw = chunk_data[8 + offset :]

            # Other chunks (FVER, markers, ...) are skipped.
            # Chunks are padded to even size
            pos += 8 + chunk_size + (chunk_size % 2)

        if n_channels == 0:
            raise RuntimeError("AIFF file missing COMM chunk")
        if n_channels < 0:
            raise RuntimeError("AIFF COMM chunk has an invalid channel count")

        # AIFF-C compression: NONE and twos are big-endian PCM (standard),
        # sowt is little-endian PCM (no byte swap needed)
        if is_aifc and compression_type not in (b"NONE", b"twos", b"sowt"):
            raise RuntimeError(
                f"Unsupported AIFF-C compression: {compression_type!r}. "
                "Only uncompressed AIFF/AIFF-C is supported."
            )

        is_little_endian = is_aifc and compression_type == b"sowt"

        # Honour the frame count declared in COMM; SSND may carry padding.
        declared = n_frames * n_channels * samp_width
        if 0 < declared < len(raw):
            raw = raw[:declared]

        # Convert samples to little-endian 16-bit PCM
        if samp_width == 2:
            n = len(raw) // 2
            whole = raw[: n * 2]  # ignore a trailing partial sample
            if is_little_endian:
                pcm = whole
            else:
                pcm = struct.pack(f"<{n}h", *struct.unpack(f">{n}h", whole))
        elif samp_width == 1:
            # AIFF 8-bit samples are signed (unlike WAV, which is unsigned).
            signed = struct.unpack(f"{len(raw)}b", raw)
            pcm = struct.pack(f"<{len(signed)}h", *(s * 256 for s in signed))
        else:
            raise RuntimeError(f"Unsupported AIFF sample width: {samp_width * 8}-bit")

        return pcm, frame_rate, n_channels

    # -- core conversions -----------------------------------------------------

    @staticmethod
    def to_wav(
        audio: AudioData,
        target_rate: int | None = None,
        target_channels: int | None = None,
    ) -> AudioData:
        """Convert *audio* to 16-bit WAV format.

        Optionally resample and/or remix channels.  WAV and AIFF input is
        decoded and re-wrapped, so the returned ``sample_rate``/``channels``
        come from the container header rather than from *audio*'s metadata.

        Raises:
            RuntimeError: unsupported source format or sample width, a remix
                other than down-mix to mono, resampling of multi-channel
                audio, or an ffmpeg failure.
        """
        fmt = audio.format

        # Obtain raw PCM + metadata
        if fmt == AudioFormat.WAV:
            # Decode existing WAV to get raw PCM
            with wave.open(io.BytesIO(audio.data), "rb") as wf:
                pcm = wf.readframes(wf.getnframes())
                rate = wf.getframerate()
                ch = wf.getnchannels()
                width = wf.getsampwidth()
            # The output is always written as 16-bit, so normalise first.
            pcm = AudioConverter._widen_to_16bit(pcm, width)
        elif fmt == AudioFormat.AIFF:
            pcm, rate, ch = AudioConverter._aiff_to_pcm(audio.data)
        elif fmt in (AudioFormat.PCM_16K, AudioFormat.PCM_44K):
            pcm = audio.data
            rate = audio.sample_rate
            ch = audio.channels
        elif fmt in (AudioFormat.MP3, AudioFormat.OGG, AudioFormat.FLAC, AudioFormat.OPUS):
            # Use ffmpeg to decode to raw PCM; it also resamples/remixes.
            fmt_map = {
                AudioFormat.MP3: "mp3",
                AudioFormat.OGG: "ogg",
                AudioFormat.FLAC: "flac",
                AudioFormat.OPUS: "ogg",
            }
            out_rate = target_rate or audio.sample_rate
            out_ch = target_channels or audio.channels
            raw = AudioConverter._ffmpeg_convert(
                audio.data, fmt_map[fmt], "s16le",
                sample_rate=out_rate,
                channels=out_ch,
            )
            return AudioData(
                data=AudioConverter._pcm_to_wav_bytes(raw, out_rate, out_ch),
                sample_rate=out_rate,
                channels=out_ch,
                format=AudioFormat.WAV,
            )
        else:
            raise RuntimeError(f"Unsupported format for to_wav: {fmt}")

        # Optional channel remix
        out_ch = target_channels or ch
        if out_ch != ch:
            if out_ch == 1:
                pcm = AudioConverter.mix_to_mono(pcm, ch)
            else:
                raise RuntimeError("Only mono down-mix is supported")

        # Optional resample
        out_rate = target_rate or rate
        if out_rate != rate:
            # The linear resampler does not understand interleaved channels.
            if out_ch > 1:
                raise RuntimeError("Resample only supported for mono audio")
            pcm = AudioConverter.resample(pcm, rate, out_rate)

        return AudioData(
            data=AudioConverter._pcm_to_wav_bytes(pcm, out_rate, out_ch),
            sample_rate=out_rate,
            channels=out_ch,
            format=AudioFormat.WAV,
        )

    @staticmethod
    def to_pcm16k(audio: AudioData) -> AudioData:
        """Convert *audio* to 16 kHz mono 16-bit PCM.

        Input that is already tagged as 16 kHz mono ``PCM_16K`` is returned
        as-is.
        """
        if (
            audio.format == AudioFormat.PCM_16K
            and audio.sample_rate == 16000
            and audio.channels == 1
        ):
            return audio

        # Go through WAV first, then extract raw PCM
        wav = AudioConverter.to_wav(audio, target_rate=16000, target_channels=1)
        with wave.open(io.BytesIO(wav.data), "rb") as wf:
            pcm = wf.readframes(wf.getnframes())

        return AudioData(
            data=pcm,
            sample_rate=16000,
            channels=1,
            format=AudioFormat.PCM_16K,
        )

    @staticmethod
    def convert(audio: AudioData, target: AudioFormat) -> AudioData:
        """Convert *audio* to *target* format.

        Returns *audio* itself when it is already in *target* format.

        Raises:
            RuntimeError: unsupported target, or any failure raised by
                :meth:`to_wav` / ffmpeg.
        """
        if audio.format == target:
            return audio

        # Targets achievable with stdlib
        if target == AudioFormat.WAV:
            return AudioConverter.to_wav(audio)
        if target == AudioFormat.PCM_16K:
            return AudioConverter.to_pcm16k(audio)
        if target == AudioFormat.PCM_44K:
            wav = AudioConverter.to_wav(audio, target_rate=44100)
            with wave.open(io.BytesIO(wav.data), "rb") as wf:
                pcm = wf.readframes(wf.getnframes())
            return AudioData(
                data=pcm,
                sample_rate=44100,
                channels=wav.channels,
                format=AudioFormat.PCM_44K,
            )

        # Targets requiring ffmpeg
        fmt_map = {
            AudioFormat.MP3: "mp3",
            AudioFormat.OGG: "ogg",
            AudioFormat.FLAC: "flac",
            AudioFormat.OPUS: "opus",
        }
        if target not in fmt_map:
            raise RuntimeError(f"Unsupported target format: {target}")

        # First ensure we have WAV (ffmpeg reads WAV easily)
        wav = AudioConverter.to_wav(audio)
        out = AudioConverter._ffmpeg_convert(
            wav.data, "wav", fmt_map[target],
            sample_rate=audio.sample_rate,
            channels=audio.channels,
        )
        return AudioData(
            data=out,
            sample_rate=audio.sample_rate,
            channels=audio.channels,
            format=target,
        )

    # -- streaming (reserved) -------------------------------------------------

    @staticmethod
    def convert_stream(
        audio: AudioData,
        target: AudioFormat,
    ) -> AudioData:
        """Streaming conversion -- not yet implemented."""
        raise NotImplementedError("Streaming conversion is not yet supported")
@@ -0,0 +1,156 @@
1
+ """Audio playback utilities for demo and external applications."""
2
+ from __future__ import annotations
3
+
4
+ import io
5
+ import shutil
6
+ import subprocess
7
+ import tempfile
8
+ import wave
9
+ from array import array
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ from openspeech.core.enums import AudioFormat
14
+ from openspeech.core.models import AudioData
15
+
16
+
17
def _wav_bytes_from_audio(audio: AudioData) -> bytes:
    """Return *audio* as WAV file bytes.

    Data already tagged as WAV that starts with a ``RIFF`` header is passed
    through untouched.  Anything else is treated as 16-bit PCM frames and
    wrapped in a WAV container; the ``PCM_16K`` / ``PCM_44K`` formats force
    the sample rate to 16000 / 44100 regardless of ``audio.sample_rate``.
    """
    payload = audio.data
    fmt = audio.format
    if fmt == AudioFormat.WAV and payload[:4] == b"RIFF":
        return payload

    # Pick the frame rate: fixed for the rate-tagged PCM formats.
    if fmt == AudioFormat.PCM_44K:
        rate = 44100
    elif fmt == AudioFormat.PCM_16K:
        rate = 16000
    else:
        rate = audio.sample_rate

    container = io.BytesIO()
    with wave.open(container, "wb") as writer:
        writer.setnchannels(audio.channels)
        writer.setsampwidth(2)  # 16-bit PCM
        writer.setframerate(rate)
        writer.writeframes(payload)
    return container.getvalue()
36
+
37
+
38
+ def _scale_pcm16(pcm_bytes: bytes, volume: float) -> bytes:
39
+ if volume == 1.0:
40
+ return pcm_bytes
41
+ samples = array("h")
42
+ samples.frombytes(pcm_bytes)
43
+ for i, s in enumerate(samples):
44
+ v = int(s * volume)
45
+ if v > 32767:
46
+ v = 32767
47
+ elif v < -32768:
48
+ v = -32768
49
+ samples[i] = v
50
+ return samples.tobytes()
51
+
52
+
53
def _play_with_sounddevice(
    audio: AudioData,
    device: str | int | None,
    volume: float,
    blocking: bool,
) -> None:
    """Play *audio* through the optional ``sounddevice`` backend.

    Args:
        audio: Audio to play; converted to WAV and decoded to 16-bit frames.
        device: Output device passed straight to ``sounddevice.play``.
        volume: Linear gain applied to every sample before playback.
        blocking: When true, return only after playback has finished.

    Raises:
        RuntimeError: the backend cannot be imported, or the decoded WAV is
            not 16-bit PCM.
    """
    try:
        import numpy as np
        import sounddevice as sd
    except (ImportError, OSError) as exc:
        # sounddevice raises OSError at import time when the PortAudio shared
        # library is missing; treat that like a missing package so callers
        # can fall back to another backend.
        raise RuntimeError(
            "sounddevice backend unavailable. Install extras: pip install openspeech[audio]"
        ) from exc

    wav_bytes = _wav_bytes_from_audio(audio)
    with wave.open(io.BytesIO(wav_bytes), "rb") as wf:
        frames = wf.readframes(wf.getnframes())
        channels = wf.getnchannels()
        sample_rate = wf.getframerate()
        sample_width = wf.getsampwidth()

    if sample_width != 2:
        raise RuntimeError(f"Only 16-bit PCM playback is supported, got sample_width={sample_width}")

    scaled = _scale_pcm16(frames, volume)
    pcm = np.frombuffer(scaled, dtype=np.int16)
    if channels > 1:
        # Interleaved frames -> (n_frames, channels), as sounddevice expects.
        pcm = pcm.reshape(-1, channels)

    # blocking=True already waits for completion, so no extra sd.wait() call.
    sd.play(pcm, samplerate=sample_rate, device=device, blocking=blocking)
85
+
86
+
87
def _play_with_external_player(
    audio: AudioData,
    device: str | int | None,
    blocking: bool,
) -> None:
    """Play *audio* with the first available of ffplay / afplay / aplay.

    The audio is written to a temporary WAV file that is removed once the
    player exits: immediately in blocking mode, from a background reaper
    thread otherwise.

    Raises:
        RuntimeError: *device* is given (unsupported here) or no player
            executable is found on ``$PATH``.
    """
    if device is not None:
        raise RuntimeError("device selection is only supported with backend='sounddevice'")

    # Resolve the player first so no temp file is created when none exists.
    if shutil.which("ffplay"):
        player = ["ffplay", "-nodisp", "-autoexit", "-loglevel", "error"]
    elif shutil.which("afplay"):
        player = ["afplay"]
    elif shutil.which("aplay"):
        player = ["aplay"]
    else:
        raise RuntimeError(
            "No playback backend found. Install sounddevice or ffplay/afplay/aplay."
        )

    wav_bytes = _wav_bytes_from_audio(audio)
    with tempfile.NamedTemporaryFile(prefix="openspeech_", suffix=".wav", delete=False) as f:
        temp_path = Path(f.name)
        f.write(wav_bytes)

    cmd = [*player, str(temp_path)]

    if blocking:
        try:
            # Best effort: a failing player is not treated as an error.
            subprocess.run(cmd, check=False)
        finally:
            temp_path.unlink(missing_ok=True)
        return

    import threading

    try:
        proc = subprocess.Popen(cmd)  # noqa: S603
    except OSError:
        temp_path.unlink(missing_ok=True)
        raise

    def _reap() -> None:
        # Wait for the player so the child is reaped, then drop the file.
        try:
            proc.wait()
        finally:
            temp_path.unlink(missing_ok=True)

    threading.Thread(target=_reap, name="openspeech-playback-cleanup", daemon=True).start()
117
+
118
+
119
def play_audio(
    audio: AudioData,
    *,
    device: str | int | None = None,
    volume: float = 1.0,
    blocking: bool = True,
    backend: str = "auto",
) -> None:
    """Play *audio*, choosing between the sounddevice and external backends.

    ``backend`` is matched case-insensitively after trimming whitespace:
    ``"sounddevice"`` and ``"external"`` force that backend, while ``"auto"``
    tries sounddevice and falls back to an external player when it raises
    ``RuntimeError``.

    Raises:
        ValueError: *volume* is not positive or *backend* is unknown.
        RuntimeError: propagated from the selected backend.
    """
    if volume <= 0:
        raise ValueError("volume must be > 0")

    mode = backend.lower().strip()
    if mode not in ("auto", "sounddevice", "external"):
        raise ValueError("backend must be one of: auto, sounddevice, external")

    if mode != "external":
        try:
            _play_with_sounddevice(audio, device=device, volume=volume, blocking=blocking)
        except RuntimeError:
            # Only "auto" may fall through to the external player.
            if mode == "sounddevice":
                raise
        else:
            return

    _play_with_external_player(audio, device=device, blocking=blocking)
143
+
144
+
145
def list_output_devices() -> list[dict[str, Any]]:
    """List playback devices for sounddevice backend.

    Returns one ``{"id": index, "name": name}`` entry per device reporting at
    least one output channel, or an empty list when the sounddevice backend
    cannot be imported.
    """
    try:
        import sounddevice as sd
    except (ImportError, OSError):
        # OSError: the package is installed but PortAudio itself is missing.
        return []

    return [
        {"id": idx, "name": dev.get("name", f"device-{idx}")}
        for idx, dev in enumerate(sd.query_devices())  # type: ignore[arg-type]
        if dev.get("max_output_channels", 0) > 0
    ]
@@ -0,0 +1,74 @@
1
+ # Vendor Registry — cloud provider service templates.
2
+ # Defines shared credential fields for each vendor.
3
+ # Engines reference vendors via the "vendor" field in engine_registry.yaml.
4
+ version: 1
5
+
6
+ vendors:
7
+ openai:
8
+ display_name: "OpenAI"
9
+ shared_fields:
10
+ api_key: { required: true, description: "API Key", secret: true }
11
+ base_url: { required: false, description: "API Base URL (leave empty for default)", default: "" }
12
+
13
+ iflytek:
14
+ display_name: "iFlytek (科大讯飞)"
15
+ shared_fields:
16
+ app_id: { required: true, description: "App ID" }
17
+ api_key: { required: true, description: "API Key", secret: true }
18
+ api_secret: { required: true, description: "API Secret", secret: true }
19
+
20
+ google:
21
+ display_name: "Google Cloud"
22
+ shared_fields:
23
+ api_key: { required: true, description: "API Key", secret: true }
24
+
25
+ azure:
26
+ display_name: "Azure"
27
+ shared_fields:
28
+ subscription_key: { required: true, description: "Subscription Key", secret: true }
29
+ region: { required: true, description: "Region", default: "eastus" }
30
+
31
+ alibaba:
32
+ display_name: "Alibaba (阿里云)"
33
+ shared_fields:
34
+ api_key: { required: true, description: "API Key", secret: true }
35
+ base_url: { required: false, description: "API Base URL", default: "https://dashscope.aliyuncs.com/compatible-mode/v1" }
36
+
37
+ tencent:
38
+ display_name: "Tencent (腾讯云)"
39
+ shared_fields:
40
+ secret_id: { required: true, description: "Secret ID", secret: true }
41
+ secret_key: { required: true, description: "Secret Key", secret: true }
42
+
43
+ baidu:
44
+ display_name: "Baidu (百度)"
45
+ shared_fields:
46
+ api_key: { required: true, description: "API Key", secret: true }
47
+ secret_key: { required: true, description: "Secret Key", secret: true }
48
+
49
+ volcengine:
50
+ display_name: "Volcengine (火山引擎)"
51
+ shared_fields:
52
+ access_token: { required: true, description: "Access Token", secret: true }
53
+ app_id: { required: true, description: "App ID" }
54
+
55
+ deepgram:
56
+ display_name: "Deepgram"
57
+ shared_fields:
58
+ api_key: { required: true, description: "API Key", secret: true }
59
+
60
+ assemblyai:
61
+ display_name: "AssemblyAI"
62
+ shared_fields:
63
+ api_key: { required: true, description: "API Key", secret: true }
64
+
65
+ elevenlabs:
66
+ display_name: "ElevenLabs"
67
+ shared_fields:
68
+ api_key: { required: true, description: "API Key", secret: true }
69
+
70
+ minimax:
71
+ display_name: "MiniMax"
72
+ shared_fields:
73
+ api_key: { required: true, description: "API Key", secret: true }
74
+ group_id: { required: true, description: "Group ID" }