myagent-ai 1.15.65 → 1.15.67
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/deps_checker.py +3 -1
- package/package.json +1 -1
- package/web/api_server.py +84 -3
package/core/deps_checker.py
CHANGED
|
@@ -76,7 +76,9 @@ DEPENDENCIES: List[DepInfo] = [
|
|
|
76
76
|
|
|
77
77
|
# ── 语音识别 (STT) ──
|
|
78
78
|
DepInfo("faster_whisper", "faster-whisper", "1.0.0", "stt", "all",
|
|
79
|
-
note="本地语音识别引擎 (
|
|
79
|
+
note="本地语音识别引擎 (需要 C++ 编译)"),
|
|
80
|
+
DepInfo("speech_recognition", "SpeechRecognition", "3.10.0", "stt", "all",
|
|
81
|
+
note="在线语音识别 (Google API,纯 Python 无需编译,Termux 兼容)"),
|
|
80
82
|
|
|
81
83
|
# ── 浏览器自动化 (ChromeDev MCP) ──
|
|
82
84
|
# Playwright 已移除,浏览器自动化统一使用 ChromeDevTools Protocol (MCP)
|
package/package.json
CHANGED
package/web/api_server.py
CHANGED
|
@@ -1219,11 +1219,92 @@ class ApiServer:
|
|
|
1219
1219
|
except Exception as e:
|
|
1220
1220
|
logger.warning(f"vosk 转录失败: {e}")
|
|
1221
1221
|
|
|
1222
|
+
# ── 尝试用户已配置的 LLM API (Whisper 兼容端点) ──
|
|
1223
|
+
# 大多数 OpenAI 兼容 API 都支持 /v1/audio/transcriptions
|
|
1224
|
+
# 无需额外依赖、无需编译、国内可用
|
|
1225
|
+
try:
|
|
1226
|
+
import aiohttp
|
|
1227
|
+
cfg = self.core.config_mgr.config.llm
|
|
1228
|
+
api_key = cfg.api_key or ""
|
|
1229
|
+
base_url = (cfg.base_url or "").rstrip("/")
|
|
1230
|
+
if api_key and base_url:
|
|
1231
|
+
# 构造 Whisper API URL
|
|
1232
|
+
if base_url.endswith("/v1"):
|
|
1233
|
+
whisper_url = base_url + "/audio/transcriptions"
|
|
1234
|
+
else:
|
|
1235
|
+
whisper_url = base_url.rstrip("/v1") + "/v1/audio/transcriptions"
|
|
1236
|
+
# 准备音频数据
|
|
1237
|
+
audio_bytes = audio_data
|
|
1238
|
+
if audio_format and audio_format not in ("wav", "WAV"):
|
|
1239
|
+
# 非 WAV 格式,尝试用内置 wave 模块(如果已经是 WAV 则直接用)
|
|
1240
|
+
try:
|
|
1241
|
+
import wave, io
|
|
1242
|
+
buf = io.BytesIO(audio_data)
|
|
1243
|
+
with wave.open(buf, 'rb') as rf:
|
|
1244
|
+
audio_bytes = audio_data # 已经是 WAV
|
|
1245
|
+
except Exception:
|
|
1246
|
+
audio_bytes = audio_data # 不是 WAV 也尝试发送
|
|
1247
|
+
|
|
1248
|
+
import mimetypes
|
|
1249
|
+
fmt = audio_format or "wav"
|
|
1250
|
+
mime = mimetypes.guess_type(f"audio.{fmt}")[0] or "audio/wav"
|
|
1251
|
+
|
|
1252
|
+
data = aiohttp.FormData()
|
|
1253
|
+
data.add_field('file', audio_bytes, filename=f'audio.{fmt}', content_type=mime)
|
|
1254
|
+
data.add_field('model', 'whisper-1')
|
|
1255
|
+
data.add_field('language', 'zh')
|
|
1256
|
+
|
|
1257
|
+
headers = {"Authorization": f"Bearer {api_key}"}
|
|
1258
|
+
async with aiohttp.ClientSession() as session:
|
|
1259
|
+
async with session.post(whisper_url, data=data, headers=headers, timeout=aiohttp.ClientTimeout(total=30)) as resp:
|
|
1260
|
+
if resp.status == 200:
|
|
1261
|
+
result = await resp.json()
|
|
1262
|
+
text = result.get("text", "").strip()
|
|
1263
|
+
if text:
|
|
1264
|
+
logger.info(f"LLM API (Whisper) 转录成功: {base_url}")
|
|
1265
|
+
return web.json_response({"text": text, "engine": "llm_api"})
|
|
1266
|
+
else:
|
|
1267
|
+
err_text = await resp.text()
|
|
1268
|
+
logger.debug(f"LLM API Whisper 端点不可用 ({resp.status}): {err_text[:200]}")
|
|
1269
|
+
except Exception as e:
|
|
1270
|
+
logger.debug(f"LLM API Whisper 转录失败: {e}")
|
|
1271
|
+
|
|
1272
|
+
# ── 尝试 SpeechRecognition (Google Web Speech API, 纯 Python,需外网) ──
|
|
1273
|
+
try:
|
|
1274
|
+
import speech_recognition as sr
|
|
1275
|
+
wav_buf = io.BytesIO(audio_data)
|
|
1276
|
+
try:
|
|
1277
|
+
audio_buf = io.BytesIO(audio_data)
|
|
1278
|
+
from pydub import AudioSegment
|
|
1279
|
+
seg = AudioSegment.from_file(audio_buf, format=audio_format or "webm")
|
|
1280
|
+
seg = seg.set_channels(1).set_frame_rate(16000).set_sample_width(2)
|
|
1281
|
+
seg.export(wav_buf, format="wav")
|
|
1282
|
+
except Exception:
|
|
1283
|
+
wav_buf = io.BytesIO(audio_data)
|
|
1284
|
+
wav_buf.seek(0)
|
|
1285
|
+
recognizer = sr.Recognizer()
|
|
1286
|
+
with sr.AudioFile(wav_buf) as source:
|
|
1287
|
+
audio = recognizer.record(source)
|
|
1288
|
+
text = recognizer.recognize_google(audio, language="zh-CN")
|
|
1289
|
+
if text:
|
|
1290
|
+
logger.info("SpeechRecognition (Google API) 转录成功")
|
|
1291
|
+
return web.json_response({"text": text, "engine": "speech_recognition"})
|
|
1292
|
+
except ImportError:
|
|
1293
|
+
logger.debug("SpeechRecognition 未安装,跳过")
|
|
1294
|
+
except sr.UnknownValueError:
|
|
1295
|
+
logger.debug("SpeechRecognition 无法识别音频内容")
|
|
1296
|
+
except sr.RequestError as e:
|
|
1297
|
+
logger.warning(f"SpeechRecognition API 请求失败: {e}")
|
|
1298
|
+
except Exception as e:
|
|
1299
|
+
logger.warning(f"SpeechRecognition 转录失败: {e}")
|
|
1300
|
+
|
|
1222
1301
|
# ── 没有可用的 STT 引擎 ──
|
|
1223
1302
|
return web.json_response({
|
|
1224
|
-
"error": "
|
|
1225
|
-
"
|
|
1226
|
-
"
|
|
1303
|
+
"error": "未检测到可用的 STT 引擎。请尝试以下方案:\n"
|
|
1304
|
+
" 1. 配置支持 Whisper 的 LLM API(自动使用,推荐)\n"
|
|
1305
|
+
" 2. pip install faster-whisper (离线本地,需 C++ 编译环境)\n"
|
|
1306
|
+
" 3. pip install vosk (离线本地,需下载模型)\n"
|
|
1307
|
+
" 4. pip install SpeechRecognition (需外网,国内不可用)",
|
|
1227
1308
|
"available": False,
|
|
1228
1309
|
}, status=503)
|
|
1229
1310
|
|