openspeechapi 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openspeech/__init__.py +75 -0
- openspeech/__main__.py +5 -0
- openspeech/cli.py +413 -0
- openspeech/client/__init__.py +4 -0
- openspeech/client/client.py +145 -0
- openspeech/config.py +212 -0
- openspeech/core/__init__.py +0 -0
- openspeech/core/base.py +75 -0
- openspeech/core/enums.py +39 -0
- openspeech/core/models.py +61 -0
- openspeech/core/registry.py +37 -0
- openspeech/core/settings.py +8 -0
- openspeech/demo.py +675 -0
- openspeech/dispatch/__init__.py +0 -0
- openspeech/dispatch/context.py +34 -0
- openspeech/dispatch/dispatcher.py +661 -0
- openspeech/dispatch/executors/__init__.py +0 -0
- openspeech/dispatch/executors/base.py +34 -0
- openspeech/dispatch/executors/in_process.py +66 -0
- openspeech/dispatch/executors/remote.py +64 -0
- openspeech/dispatch/executors/subprocess_exec.py +446 -0
- openspeech/dispatch/fanout.py +95 -0
- openspeech/dispatch/filters.py +73 -0
- openspeech/dispatch/lifecycle.py +178 -0
- openspeech/dispatch/watcher.py +82 -0
- openspeech/engine_catalog.py +236 -0
- openspeech/engine_registry.yaml +347 -0
- openspeech/exceptions.py +51 -0
- openspeech/factory.py +325 -0
- openspeech/local_engines/__init__.py +12 -0
- openspeech/local_engines/aim_resolver.py +91 -0
- openspeech/local_engines/backends/__init__.py +1 -0
- openspeech/local_engines/backends/docker_backend.py +490 -0
- openspeech/local_engines/backends/native_backend.py +902 -0
- openspeech/local_engines/base.py +30 -0
- openspeech/local_engines/engines/__init__.py +1 -0
- openspeech/local_engines/engines/faster_whisper.py +36 -0
- openspeech/local_engines/engines/fish_speech.py +33 -0
- openspeech/local_engines/engines/sherpa_onnx.py +56 -0
- openspeech/local_engines/engines/whisper.py +41 -0
- openspeech/local_engines/engines/whisperlivekit.py +60 -0
- openspeech/local_engines/manager.py +208 -0
- openspeech/local_engines/models.py +50 -0
- openspeech/local_engines/progress.py +69 -0
- openspeech/local_engines/registry.py +19 -0
- openspeech/local_engines/task_store.py +52 -0
- openspeech/local_engines/tasks.py +71 -0
- openspeech/logging_config.py +607 -0
- openspeech/observe/__init__.py +0 -0
- openspeech/observe/base.py +79 -0
- openspeech/observe/debug.py +44 -0
- openspeech/observe/latency.py +19 -0
- openspeech/observe/metrics.py +47 -0
- openspeech/observe/tracing.py +44 -0
- openspeech/observe/usage.py +27 -0
- openspeech/providers/__init__.py +0 -0
- openspeech/providers/_template.py +101 -0
- openspeech/providers/stt/__init__.py +0 -0
- openspeech/providers/stt/alibaba.py +86 -0
- openspeech/providers/stt/assemblyai.py +135 -0
- openspeech/providers/stt/azure_speech.py +99 -0
- openspeech/providers/stt/baidu.py +135 -0
- openspeech/providers/stt/deepgram.py +311 -0
- openspeech/providers/stt/elevenlabs.py +385 -0
- openspeech/providers/stt/faster_whisper.py +211 -0
- openspeech/providers/stt/google_cloud.py +106 -0
- openspeech/providers/stt/iflytek.py +427 -0
- openspeech/providers/stt/macos_speech.py +226 -0
- openspeech/providers/stt/openai.py +84 -0
- openspeech/providers/stt/sherpa_onnx.py +353 -0
- openspeech/providers/stt/tencent.py +212 -0
- openspeech/providers/stt/volcengine.py +107 -0
- openspeech/providers/stt/whisper.py +153 -0
- openspeech/providers/stt/whisperlivekit.py +530 -0
- openspeech/providers/stt/windows_speech.py +249 -0
- openspeech/providers/tts/__init__.py +0 -0
- openspeech/providers/tts/alibaba.py +95 -0
- openspeech/providers/tts/azure_speech.py +123 -0
- openspeech/providers/tts/baidu.py +143 -0
- openspeech/providers/tts/coqui.py +64 -0
- openspeech/providers/tts/cosyvoice.py +90 -0
- openspeech/providers/tts/deepgram.py +174 -0
- openspeech/providers/tts/elevenlabs.py +311 -0
- openspeech/providers/tts/fish_speech.py +158 -0
- openspeech/providers/tts/google_cloud.py +107 -0
- openspeech/providers/tts/iflytek.py +209 -0
- openspeech/providers/tts/macos_say.py +251 -0
- openspeech/providers/tts/minimax.py +122 -0
- openspeech/providers/tts/openai.py +104 -0
- openspeech/providers/tts/piper.py +104 -0
- openspeech/providers/tts/tencent.py +189 -0
- openspeech/providers/tts/volcengine.py +117 -0
- openspeech/providers/tts/windows_sapi.py +234 -0
- openspeech/server/__init__.py +1 -0
- openspeech/server/app.py +72 -0
- openspeech/server/auth.py +42 -0
- openspeech/server/middleware.py +75 -0
- openspeech/server/routes/__init__.py +1 -0
- openspeech/server/routes/management.py +848 -0
- openspeech/server/routes/stt.py +121 -0
- openspeech/server/routes/tts.py +159 -0
- openspeech/server/routes/webui.py +29 -0
- openspeech/server/webui/app.js +2649 -0
- openspeech/server/webui/index.html +216 -0
- openspeech/server/webui/styles.css +617 -0
- openspeech/server/ws/__init__.py +1 -0
- openspeech/server/ws/stt_stream.py +263 -0
- openspeech/server/ws/tts_stream.py +207 -0
- openspeech/telemetry/__init__.py +21 -0
- openspeech/telemetry/perf.py +307 -0
- openspeech/utils/__init__.py +5 -0
- openspeech/utils/audio_converter.py +406 -0
- openspeech/utils/audio_playback.py +156 -0
- openspeech/vendor_registry.yaml +74 -0
- openspeechapi-0.1.0.dist-info/METADATA +101 -0
- openspeechapi-0.1.0.dist-info/RECORD +118 -0
- openspeechapi-0.1.0.dist-info/WHEEL +4 -0
- openspeechapi-0.1.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,406 @@
|
|
|
1
|
+
"""Unified audio format conversion utilities for OpenSpeech.
|
|
2
|
+
|
|
3
|
+
Centralises format detection, PCM/WAV/AIFF conversions (stdlib only) and
|
|
4
|
+
MP3/OGG/FLAC/OPUS conversions (via ffmpeg subprocess).
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import io
|
|
9
|
+
import shutil
|
|
10
|
+
import struct
|
|
11
|
+
import subprocess
|
|
12
|
+
import wave
|
|
13
|
+
|
|
14
|
+
from openspeech.core.enums import AudioFormat
|
|
15
|
+
from openspeech.core.models import AudioData
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class AudioConverter:
    """Stateless audio conversion helper -- all public methods are static.

    Pure-stdlib paths handle WAV/AIFF/raw PCM; compressed formats
    (MP3/OGG/FLAC/OPUS) are delegated to an ``ffmpeg`` subprocess.
    """

    # -- format detection -----------------------------------------------------

    @staticmethod
    def detect_format(data: bytes) -> AudioFormat:
        """Detect audio format from file header magic bytes.

        Falls back to ``AudioFormat.PCM_16K`` when the header is not
        recognised (raw PCM has no header).
        """
        if len(data) >= 12 and data[:4] == b"RIFF" and data[8:12] == b"WAVE":
            return AudioFormat.WAV
        if len(data) >= 12 and data[:4] == b"FORM" and data[8:12] in (b"AIFF", b"AIFC"):
            return AudioFormat.AIFF
        if len(data) >= 4 and data[:4] == b"fLaC":
            return AudioFormat.FLAC
        if len(data) >= 4 and data[:4] == b"OggS":
            return AudioFormat.OGG
        # MP3: ID3 tag or MPEG frame sync (11 set bits at the frame start)
        if len(data) >= 3 and data[:3] == b"ID3":
            return AudioFormat.MP3
        if len(data) >= 2 and data[0] == 0xFF and (data[1] & 0xE0) == 0xE0:
            return AudioFormat.MP3
        return AudioFormat.PCM_16K

    # -- ffmpeg helpers -------------------------------------------------------

    @staticmethod
    def ffmpeg_available() -> bool:
        """Return *True* if ffmpeg is found on ``$PATH``."""
        return shutil.which("ffmpeg") is not None

    @staticmethod
    def _require_ffmpeg() -> None:
        """Raise ``RuntimeError`` unless ffmpeg is available on ``$PATH``."""
        if not AudioConverter.ffmpeg_available():
            raise RuntimeError(
                "ffmpeg is required for this conversion but was not found on $PATH"
            )

    @staticmethod
    def _ffmpeg_convert(
        input_data: bytes,
        input_format: str,
        output_format: str,
        sample_rate: int | None = None,
        channels: int | None = None,
    ) -> bytes:
        """Run ffmpeg with stdin/stdout pipes (no temp files).

        *input_format*/*output_format* are ffmpeg (de)muxer names. Optional
        *sample_rate*/*channels* are applied to the output stream. Raises
        ``RuntimeError`` carrying ffmpeg's stderr on a non-zero exit code.
        """
        AudioConverter._require_ffmpeg()

        cmd: list[str] = [
            "ffmpeg",
            "-y",
            "-f", input_format,
            "-i", "pipe:0",
        ]
        if sample_rate is not None:
            cmd += ["-ar", str(sample_rate)]
        if channels is not None:
            cmd += ["-ac", str(channels)]
        cmd += ["-f", output_format, "pipe:1"]

        proc = subprocess.run(
            cmd,
            input=input_data,
            capture_output=True,
            check=False,
        )
        if proc.returncode != 0:
            raise RuntimeError(f"ffmpeg failed: {proc.stderr.decode(errors='replace')}")
        return proc.stdout

    # -- PCM resampling -------------------------------------------------------

    @staticmethod
    def resample(pcm_data: bytes, src_rate: int, dst_rate: int) -> bytes:
        """Resample 16-bit mono PCM using linear interpolation.

        Returns the data unchanged when *src_rate* == *dst_rate*.
        A stray trailing byte (odd-length input) is ignored rather than
        raising ``struct.error``.
        """
        if src_rate == dst_rate:
            return pcm_data

        # Unpack 16-bit signed little-endian samples; slice to an even byte
        # count so an odd-length buffer cannot make struct.unpack raise.
        n_samples = len(pcm_data) // 2
        samples = struct.unpack(f"<{n_samples}h", pcm_data[: n_samples * 2])

        ratio = src_rate / dst_rate
        out_len = int(n_samples / ratio)
        out: list[int] = []
        for i in range(out_len):
            src_pos = i * ratio
            idx = int(src_pos)
            frac = src_pos - idx
            if idx + 1 < n_samples:
                # Linear interpolation between the two neighbouring samples
                val = samples[idx] * (1 - frac) + samples[idx + 1] * frac
            else:
                val = samples[idx] if idx < n_samples else 0
            # Clamp to int16 range
            val = max(-32768, min(32767, int(round(val))))
            out.append(val)

        return struct.pack(f"<{len(out)}h", *out)

    # -- channel mixing -------------------------------------------------------

    @staticmethod
    def mix_to_mono(pcm_data: bytes, channels: int) -> bytes:
        """Mix multi-channel 16-bit PCM down to mono by averaging channels."""
        if channels <= 1:
            return pcm_data

        # Slice to an even byte count to tolerate a stray trailing byte.
        n_samples = len(pcm_data) // 2
        samples = struct.unpack(f"<{n_samples}h", pcm_data[: n_samples * 2])

        frames = n_samples // channels
        out: list[int] = []
        for f in range(frames):
            total = 0
            for c in range(channels):
                total += samples[f * channels + c]
            avg = total // channels
            avg = max(-32768, min(32767, avg))
            out.append(avg)

        return struct.pack(f"<{len(out)}h", *out)

    # -- AIFF support ---------------------------------------------------------

    @staticmethod
    def _parse_aiff_extended(data: bytes) -> float:
        """Parse an 80-bit IEEE 754 extended precision float (big-endian).

        Used by AIFF to encode the sample rate in the COMM chunk.
        """
        # 80-bit extended: 1 sign, 15 exponent, 64 mantissa
        exponent = ((data[0] & 0x7F) << 8) | data[1]
        mantissa = 0
        for i in range(2, 10):
            mantissa = (mantissa << 8) | data[i]
        sign = -1 if data[0] & 0x80 else 1
        if exponent == 0 and mantissa == 0:
            return 0.0
        exponent -= 16383  # bias
        # mantissa has explicit integer bit
        value = sign * (mantissa / (1 << 63)) * (2 ** exponent)
        return value

    @staticmethod
    def _aiff_to_pcm(data: bytes) -> tuple[bytes, int, int]:
        """Read AIFF data and return (pcm_le_bytes, sample_rate, channels).

        AIFF stores samples as big-endian; this converts to little-endian
        16-bit PCM. Parses AIFF manually to avoid the removed ``aifc``
        module (dropped in Python 3.13).
        """
        if len(data) < 12 or data[:4] != b"FORM" or data[8:12] not in (b"AIFF", b"AIFC"):
            raise RuntimeError("Not a valid AIFF file")

        is_aifc = data[8:12] == b"AIFC"
        n_channels = 0
        samp_width = 0
        frame_rate = 0
        n_frames = 0
        raw = b""
        compression_type = b"NONE"

        pos = 12
        while pos < len(data) - 8:
            chunk_id = data[pos : pos + 4]
            chunk_size = struct.unpack(">I", data[pos + 4 : pos + 8])[0]
            chunk_data = data[pos + 8 : pos + 8 + chunk_size]

            if chunk_id == b"COMM":
                # COMM: channels(2), numFrames(4), sampleSize(2), sampleRate(10 extended)
                n_channels = struct.unpack(">h", chunk_data[0:2])[0]
                n_frames = struct.unpack(">I", chunk_data[2:6])[0]
                samp_width = struct.unpack(">h", chunk_data[6:8])[0] // 8
                frame_rate = int(AudioConverter._parse_aiff_extended(chunk_data[8:18]))
                # AIFF-C has compression type after the 18-byte standard COMM fields
                if is_aifc and len(chunk_data) >= 22:
                    compression_type = chunk_data[18:22]
            elif chunk_id == b"SSND":
                # SSND: offset(4), blockSize(4), then raw sample data
                offset = struct.unpack(">I", chunk_data[0:4])[0]
                raw = chunk_data[8 + offset :]

            # Skip FVER and other AIFC-specific chunks
            # Chunks are padded to even size
            pos += 8 + chunk_size + (chunk_size % 2)

        if n_channels == 0:
            raise RuntimeError("AIFF file missing COMM chunk")

        # AIFF-C compression: NONE and twos are big-endian PCM (standard),
        # sowt is little-endian PCM (no byte swap needed)
        if is_aifc and compression_type not in (b"NONE", b"twos", b"sowt"):
            raise RuntimeError(
                f"Unsupported AIFF-C compression: {compression_type!r}. "
                "Only uncompressed AIFF/AIFF-C is supported."
            )

        is_little_endian = is_aifc and compression_type == b"sowt"

        # Convert samples to little-endian 16-bit PCM
        if samp_width == 2:
            n = len(raw) // 2
            if is_little_endian:
                pcm = raw[: n * 2]
            else:
                # Slice to an even byte count so a truncated/padded SSND chunk
                # cannot make struct.unpack raise on an odd-length buffer.
                be_samples = struct.unpack(f">{n}h", raw[: n * 2])
                pcm = struct.pack(f"<{n}h", *be_samples)
        elif samp_width == 1:
            # AIFF 8-bit sample points are signed two's complement (unlike
            # WAV's unsigned 8-bit), so sign-extend rather than offset by 128.
            pcm = b"".join(
                struct.pack("<h", (b - 256 if b >= 128 else b) * 256) for b in raw
            )
        else:
            raise RuntimeError(f"Unsupported AIFF sample width: {samp_width * 8}-bit")

        return pcm, frame_rate, n_channels

    # -- core conversions -----------------------------------------------------

    @staticmethod
    def to_wav(
        audio: AudioData,
        target_rate: int | None = None,
        target_channels: int | None = None,
    ) -> AudioData:
        """Convert *audio* to WAV format.

        Optionally resample (mono only) and/or down-mix to mono. WAV input
        is decoded and re-wrapped; compressed input is decoded via ffmpeg.
        Raises ``RuntimeError`` for unsupported inputs or conversions.
        """
        fmt = audio.format

        # Obtain raw PCM + metadata
        if fmt == AudioFormat.WAV:
            # Decode existing WAV to get raw PCM
            with wave.open(io.BytesIO(audio.data), "rb") as wf:
                if wf.getsampwidth() != 2:
                    # mix_to_mono/resample and the 16-bit re-wrap below assume
                    # 16-bit samples; silently re-wrapping other widths as
                    # 16-bit would corrupt the audio.
                    raise RuntimeError(
                        f"Only 16-bit WAV input is supported, got {wf.getsampwidth() * 8}-bit"
                    )
                pcm = wf.readframes(wf.getnframes())
                rate = wf.getframerate()
                ch = wf.getnchannels()
        elif fmt == AudioFormat.AIFF:
            pcm, rate, ch = AudioConverter._aiff_to_pcm(audio.data)
        elif fmt in (AudioFormat.PCM_16K, AudioFormat.PCM_44K):
            pcm = audio.data
            rate = audio.sample_rate
            ch = audio.channels
        elif fmt in (AudioFormat.MP3, AudioFormat.OGG, AudioFormat.FLAC, AudioFormat.OPUS):
            # Use ffmpeg to decode to raw PCM first
            fmt_map = {
                AudioFormat.MP3: "mp3",
                AudioFormat.OGG: "ogg",
                AudioFormat.FLAC: "flac",
                AudioFormat.OPUS: "ogg",  # OPUS payloads arrive in an Ogg container
            }
            raw = AudioConverter._ffmpeg_convert(
                audio.data, fmt_map[fmt], "s16le",
                sample_rate=target_rate or audio.sample_rate,
                channels=target_channels or audio.channels,
            )
            out_rate = target_rate or audio.sample_rate
            out_ch = target_channels or audio.channels
            buf = io.BytesIO()
            with wave.open(buf, "wb") as wf:
                wf.setnchannels(out_ch)
                wf.setsampwidth(2)
                wf.setframerate(out_rate)
                wf.writeframes(raw)
            return AudioData(
                data=buf.getvalue(),
                sample_rate=out_rate,
                channels=out_ch,
                format=AudioFormat.WAV,
            )
        else:
            raise RuntimeError(f"Unsupported format for to_wav: {fmt}")

        # Optional channel remix
        out_ch = target_channels or ch
        if out_ch != ch:
            if out_ch == 1:
                pcm = AudioConverter.mix_to_mono(pcm, ch)
            else:
                raise RuntimeError("Only mono down-mix is supported")

        # Optional resample
        out_rate = target_rate or rate
        if out_rate != rate:
            # If multi-channel, must be mono at this point for simple resample
            if out_ch > 1:
                raise RuntimeError("Resample only supported for mono audio")
            pcm = AudioConverter.resample(pcm, rate, out_rate)

        # Wrap as WAV
        buf = io.BytesIO()
        with wave.open(buf, "wb") as wf:
            wf.setnchannels(out_ch)
            wf.setsampwidth(2)
            wf.setframerate(out_rate)
            wf.writeframes(pcm)

        return AudioData(
            data=buf.getvalue(),
            sample_rate=out_rate,
            channels=out_ch,
            format=AudioFormat.WAV,
        )

    @staticmethod
    def to_pcm16k(audio: AudioData) -> AudioData:
        """Convert *audio* to 16 kHz mono 16-bit PCM."""
        if (
            audio.format == AudioFormat.PCM_16K
            and audio.sample_rate == 16000
            and audio.channels == 1
        ):
            return audio

        # Go through WAV first, then extract raw PCM
        wav = AudioConverter.to_wav(audio, target_rate=16000, target_channels=1)
        with wave.open(io.BytesIO(wav.data), "rb") as wf:
            pcm = wf.readframes(wf.getnframes())

        return AudioData(
            data=pcm,
            sample_rate=16000,
            channels=1,
            format=AudioFormat.PCM_16K,
        )

    @staticmethod
    def convert(audio: AudioData, target: AudioFormat) -> AudioData:
        """Convert *audio* to *target* format.

        WAV/PCM targets use only the stdlib; compressed targets require
        ffmpeg. Raises ``RuntimeError`` for unsupported target formats.
        """
        if audio.format == target:
            return audio

        # Targets achievable with stdlib
        if target == AudioFormat.WAV:
            return AudioConverter.to_wav(audio)
        if target == AudioFormat.PCM_16K:
            return AudioConverter.to_pcm16k(audio)
        if target == AudioFormat.PCM_44K:
            wav = AudioConverter.to_wav(audio, target_rate=44100)
            with wave.open(io.BytesIO(wav.data), "rb") as wf:
                pcm = wf.readframes(wf.getnframes())
            return AudioData(
                data=pcm,
                sample_rate=44100,
                channels=wav.channels,
                format=AudioFormat.PCM_44K,
            )

        # Targets requiring ffmpeg
        fmt_map = {
            AudioFormat.MP3: "mp3",
            AudioFormat.OGG: "ogg",
            AudioFormat.FLAC: "flac",
            AudioFormat.OPUS: "opus",
        }
        if target not in fmt_map:
            raise RuntimeError(f"Unsupported target format: {target}")

        # First ensure we have WAV (ffmpeg reads WAV easily)
        wav = AudioConverter.to_wav(audio)
        out = AudioConverter._ffmpeg_convert(
            wav.data, "wav", fmt_map[target],
            sample_rate=audio.sample_rate,
            channels=audio.channels,
        )
        return AudioData(
            data=out,
            sample_rate=audio.sample_rate,
            channels=audio.channels,
            format=target,
        )

    # -- streaming (reserved) -------------------------------------------------

    @staticmethod
    def convert_stream(
        audio: AudioData,
        target: AudioFormat,
    ) -> AudioData:
        """Streaming conversion -- not yet implemented."""
        raise NotImplementedError("Streaming conversion is not yet supported")
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""Audio playback utilities for demo and external applications."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import io
|
|
5
|
+
import shutil
|
|
6
|
+
import subprocess
|
|
7
|
+
import tempfile
|
|
8
|
+
import wave
|
|
9
|
+
from array import array
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from openspeech.core.enums import AudioFormat
|
|
14
|
+
from openspeech.core.models import AudioData
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _wav_bytes_from_audio(audio: AudioData) -> bytes:
    """Return *audio* as WAV bytes, wrapping raw PCM in a RIFF header if needed."""
    payload = audio.data

    # Already a well-formed WAV container: pass it through untouched.
    if audio.format == AudioFormat.WAV and payload[:4] == b"RIFF":
        return payload

    # Fixed-rate PCM formats carry an implied sample rate; anything else
    # trusts the metadata on the AudioData object.
    if audio.format == AudioFormat.PCM_44K:
        rate = 44100
    elif audio.format == AudioFormat.PCM_16K:
        rate = 16000
    else:
        rate = audio.sample_rate

    container = io.BytesIO()
    with wave.open(container, "wb") as writer:
        writer.setnchannels(audio.channels)
        writer.setsampwidth(2)  # 16-bit PCM
        writer.setframerate(rate)
        writer.writeframes(payload)
    return container.getvalue()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _scale_pcm16(pcm_bytes: bytes, volume: float) -> bytes:
|
|
39
|
+
if volume == 1.0:
|
|
40
|
+
return pcm_bytes
|
|
41
|
+
samples = array("h")
|
|
42
|
+
samples.frombytes(pcm_bytes)
|
|
43
|
+
for i, s in enumerate(samples):
|
|
44
|
+
v = int(s * volume)
|
|
45
|
+
if v > 32767:
|
|
46
|
+
v = 32767
|
|
47
|
+
elif v < -32768:
|
|
48
|
+
v = -32768
|
|
49
|
+
samples[i] = v
|
|
50
|
+
return samples.tobytes()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _play_with_sounddevice(
    audio: AudioData,
    device: str | int | None,
    volume: float,
    blocking: bool,
) -> None:
    """Play *audio* in-process via sounddevice (requires numpy + sounddevice)."""
    try:
        import numpy as np
        import sounddevice as sd
    except ImportError as exc:
        raise RuntimeError(
            "sounddevice backend unavailable. Install extras: pip install openspeech[audio]"
        ) from exc

    # Normalise to a WAV container first so wave can describe the stream.
    wav_payload = _wav_bytes_from_audio(audio)
    with wave.open(io.BytesIO(wav_payload), "rb") as reader:
        sample_width = reader.getsampwidth()
        channels = reader.getnchannels()
        rate = reader.getframerate()
        raw_frames = reader.readframes(reader.getnframes())

    if sample_width != 2:
        raise RuntimeError(f"Only 16-bit PCM playback is supported, got sample_width={sample_width}")

    buffer = np.frombuffer(_scale_pcm16(raw_frames, volume), dtype=np.int16)
    if channels > 1:
        # Interleaved samples -> (frames, channels) matrix for sd.play.
        buffer = buffer.reshape(-1, channels)

    sd.play(buffer, samplerate=rate, device=device, blocking=blocking)
    if blocking:
        sd.wait()
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _play_with_external_player(
    audio: AudioData,
    device: str | int | None,
    blocking: bool,
) -> None:
    """Play *audio* via an external CLI player (ffplay/afplay/aplay).

    Writes a temporary WAV file because the CLI players read from disk.
    Raises ``RuntimeError`` when *device* is given (only the sounddevice
    backend supports device selection) or when no player binary is found.
    """
    if device is not None:
        raise RuntimeError("device selection is only supported with backend='sounddevice'")

    wav_bytes = _wav_bytes_from_audio(audio)
    with tempfile.NamedTemporaryFile(prefix="openspeech_", suffix=".wav", delete=False) as f:
        temp_path = Path(f.name)
        f.write(wav_bytes)

    if shutil.which("ffplay"):
        cmd = ["ffplay", "-nodisp", "-autoexit", "-loglevel", "error", str(temp_path)]
    elif shutil.which("afplay"):
        cmd = ["afplay", str(temp_path)]
    elif shutil.which("aplay"):
        cmd = ["aplay", str(temp_path)]
    else:
        temp_path.unlink(missing_ok=True)
        raise RuntimeError(
            "No playback backend found. Install sounddevice or ffplay/afplay/aplay."
        )

    if blocking:
        try:
            subprocess.run(cmd, check=False)
        finally:
            # Always remove the temp file, even if launching the player raised.
            temp_path.unlink(missing_ok=True)
    else:
        # Fire-and-forget: the temp file must outlive this call so the player
        # can read it; it is left for the OS temp-dir cleaner to reclaim.
        subprocess.Popen(cmd)  # noqa: S603
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def play_audio(
    audio: AudioData,
    *,
    device: str | int | None = None,
    volume: float = 1.0,
    blocking: bool = True,
    backend: str = "auto",
) -> None:
    """Play AudioData with optional backend/device selection."""
    # Validate arguments up front with guard clauses.
    if volume <= 0:
        raise ValueError("volume must be > 0")
    choice = backend.lower().strip()
    if choice not in {"auto", "sounddevice", "external"}:
        raise ValueError("backend must be one of: auto, sounddevice, external")

    if choice != "external":
        # Prefer the in-process sounddevice backend whenever it is allowed.
        try:
            _play_with_sounddevice(audio, device=device, volume=volume, blocking=blocking)
        except RuntimeError:
            if choice == "sounddevice":
                # Explicitly requested: surface the failure instead of falling back.
                raise
        else:
            return

    _play_with_external_player(audio, device=device, blocking=blocking)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def list_output_devices() -> list[dict[str, Any]]:
    """List playback devices for sounddevice backend."""
    try:
        import sounddevice as sd
    except ImportError:
        # No sounddevice installed: nothing to enumerate.
        return []

    found: list[dict[str, Any]] = []
    for idx, info in enumerate(sd.query_devices()):  # type: ignore[arg-type]
        # Keep only devices that can actually emit audio.
        if info.get("max_output_channels", 0) > 0:
            found.append({"id": idx, "name": info.get("name", f"device-{idx}")})
    return found
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# Vendor Registry — cloud provider service templates.
|
|
2
|
+
# Defines shared credential fields for each vendor.
|
|
3
|
+
# Engines reference vendors via the "vendor" field in engine_registry.yaml.
|
|
4
|
+
version: 1
|
|
5
|
+
|
|
6
|
+
vendors:
|
|
7
|
+
openai:
|
|
8
|
+
display_name: "OpenAI"
|
|
9
|
+
shared_fields:
|
|
10
|
+
api_key: { required: true, description: "API Key", secret: true }
|
|
11
|
+
base_url: { required: false, description: "API Base URL (leave empty for default)", default: "" }
|
|
12
|
+
|
|
13
|
+
iflytek:
|
|
14
|
+
display_name: "iFlytek (科大讯飞)"
|
|
15
|
+
shared_fields:
|
|
16
|
+
app_id: { required: true, description: "App ID" }
|
|
17
|
+
api_key: { required: true, description: "API Key", secret: true }
|
|
18
|
+
api_secret: { required: true, description: "API Secret", secret: true }
|
|
19
|
+
|
|
20
|
+
google:
|
|
21
|
+
display_name: "Google Cloud"
|
|
22
|
+
shared_fields:
|
|
23
|
+
api_key: { required: true, description: "API Key", secret: true }
|
|
24
|
+
|
|
25
|
+
azure:
|
|
26
|
+
display_name: "Azure"
|
|
27
|
+
shared_fields:
|
|
28
|
+
subscription_key: { required: true, description: "Subscription Key", secret: true }
|
|
29
|
+
region: { required: true, description: "Region", default: "eastus" }
|
|
30
|
+
|
|
31
|
+
alibaba:
|
|
32
|
+
display_name: "Alibaba (阿里云)"
|
|
33
|
+
shared_fields:
|
|
34
|
+
api_key: { required: true, description: "API Key", secret: true }
|
|
35
|
+
base_url: { required: false, description: "API Base URL", default: "https://dashscope.aliyuncs.com/compatible-mode/v1" }
|
|
36
|
+
|
|
37
|
+
tencent:
|
|
38
|
+
display_name: "Tencent (腾讯云)"
|
|
39
|
+
shared_fields:
|
|
40
|
+
secret_id: { required: true, description: "Secret ID", secret: true }
|
|
41
|
+
secret_key: { required: true, description: "Secret Key", secret: true }
|
|
42
|
+
|
|
43
|
+
baidu:
|
|
44
|
+
display_name: "Baidu (百度)"
|
|
45
|
+
shared_fields:
|
|
46
|
+
api_key: { required: true, description: "API Key", secret: true }
|
|
47
|
+
secret_key: { required: true, description: "Secret Key", secret: true }
|
|
48
|
+
|
|
49
|
+
volcengine:
|
|
50
|
+
display_name: "Volcengine (火山引擎)"
|
|
51
|
+
shared_fields:
|
|
52
|
+
access_token: { required: true, description: "Access Token", secret: true }
|
|
53
|
+
app_id: { required: true, description: "App ID" }
|
|
54
|
+
|
|
55
|
+
deepgram:
|
|
56
|
+
display_name: "Deepgram"
|
|
57
|
+
shared_fields:
|
|
58
|
+
api_key: { required: true, description: "API Key", secret: true }
|
|
59
|
+
|
|
60
|
+
assemblyai:
|
|
61
|
+
display_name: "AssemblyAI"
|
|
62
|
+
shared_fields:
|
|
63
|
+
api_key: { required: true, description: "API Key", secret: true }
|
|
64
|
+
|
|
65
|
+
elevenlabs:
|
|
66
|
+
display_name: "ElevenLabs"
|
|
67
|
+
shared_fields:
|
|
68
|
+
api_key: { required: true, description: "API Key", secret: true }
|
|
69
|
+
|
|
70
|
+
minimax:
|
|
71
|
+
display_name: "MiniMax"
|
|
72
|
+
shared_fields:
|
|
73
|
+
api_key: { required: true, description: "API Key", secret: true }
|
|
74
|
+
group_id: { required: true, description: "Group ID" }
|