opc-agent 4.2.0 → 4.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.opc/memory.db +0 -0
- package/COMPETITIVE-GAP.md +92 -92
- package/CONTRIBUTING.md +36 -36
- package/README.md +290 -290
- package/README.zh-CN.md +269 -269
- package/STUDIO-REWRITE-TASK.md +76 -0
- package/dist/channels/telegram.d.ts +5 -0
- package/dist/channels/telegram.d.ts.map +1 -1
- package/dist/channels/telegram.js +108 -0
- package/dist/channels/telegram.js.map +1 -1
- package/dist/channels/voice.d.ts +71 -97
- package/dist/channels/voice.d.ts.map +1 -1
- package/dist/channels/voice.js +369 -347
- package/dist/channels/voice.js.map +1 -1
- package/dist/channels/web.d.ts.map +1 -1
- package/dist/channels/web.js +8 -2
- package/dist/channels/web.js.map +1 -1
- package/dist/channels/wechat.js +6 -6
- package/dist/cli/chat.d.ts +4 -1
- package/dist/cli/chat.d.ts.map +1 -1
- package/dist/cli/chat.js +680 -73
- package/dist/cli/chat.js.map +1 -1
- package/dist/cli/setup.js +1 -1
- package/dist/cli/setup.js.map +1 -1
- package/dist/cli.js +373 -280
- package/dist/cli.js.map +1 -1
- package/dist/core/a2a-http.d.ts +75 -0
- package/dist/core/a2a-http.d.ts.map +1 -0
- package/dist/core/a2a-http.js +217 -0
- package/dist/core/a2a-http.js.map +1 -0
- package/dist/core/a2a.d.ts +2 -0
- package/dist/core/a2a.d.ts.map +1 -1
- package/dist/core/a2a.js +6 -1
- package/dist/core/a2a.js.map +1 -1
- package/dist/core/agent.d.ts +1 -0
- package/dist/core/agent.d.ts.map +1 -1
- package/dist/core/agent.js +3 -0
- package/dist/core/agent.js.map +1 -1
- package/dist/core/gateway-registry.d.ts +116 -0
- package/dist/core/gateway-registry.d.ts.map +1 -0
- package/dist/core/gateway-registry.js +280 -0
- package/dist/core/gateway-registry.js.map +1 -0
- package/dist/core/model-recommender.d.ts +40 -0
- package/dist/core/model-recommender.d.ts.map +1 -0
- package/dist/core/model-recommender.js +186 -0
- package/dist/core/model-recommender.js.map +1 -0
- package/dist/core/priority-queue.d.ts +100 -0
- package/dist/core/priority-queue.d.ts.map +1 -0
- package/dist/core/priority-queue.js +181 -0
- package/dist/core/priority-queue.js.map +1 -0
- package/dist/core/runtime.d.ts.map +1 -1
- package/dist/core/runtime.js +192 -22
- package/dist/core/runtime.js.map +1 -1
- package/dist/deploy/index.js +56 -56
- package/dist/doctor.d.ts +1 -0
- package/dist/doctor.d.ts.map +1 -1
- package/dist/doctor.js +155 -10
- package/dist/doctor.js.map +1 -1
- package/dist/index.d.ts +10 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +24 -13
- package/dist/index.js.map +1 -1
- package/dist/memory/deepbrain.d.ts +1 -1
- package/dist/memory/deepbrain.d.ts.map +1 -1
- package/dist/memory/deepbrain.js +95 -4
- package/dist/memory/deepbrain.js.map +1 -1
- package/dist/memory/evolve-engine.d.ts +113 -0
- package/dist/memory/evolve-engine.d.ts.map +1 -0
- package/dist/memory/evolve-engine.js +549 -0
- package/dist/memory/evolve-engine.js.map +1 -0
- package/dist/memory/index.d.ts +2 -0
- package/dist/memory/index.d.ts.map +1 -1
- package/dist/memory/index.js +3 -1
- package/dist/memory/index.js.map +1 -1
- package/dist/memory/sqlite-store.d.ts +40 -0
- package/dist/memory/sqlite-store.d.ts.map +1 -0
- package/dist/memory/sqlite-store.js +269 -0
- package/dist/memory/sqlite-store.js.map +1 -0
- package/dist/memory/user-profiler.d.ts +8 -0
- package/dist/memory/user-profiler.d.ts.map +1 -1
- package/dist/memory/user-profiler.js +89 -0
- package/dist/memory/user-profiler.js.map +1 -1
- package/dist/scheduler/cron-engine.d.ts.map +1 -1
- package/dist/scheduler/cron-engine.js +3 -36
- package/dist/scheduler/cron-engine.js.map +1 -1
- package/dist/scheduler/proactive.d.ts +62 -0
- package/dist/scheduler/proactive.d.ts.map +1 -0
- package/dist/scheduler/proactive.js +185 -0
- package/dist/scheduler/proactive.js.map +1 -0
- package/dist/skills/auto-learn.d.ts.map +1 -1
- package/dist/skills/auto-learn.js +65 -11
- package/dist/skills/auto-learn.js.map +1 -1
- package/dist/skills/builtin/index.d.ts.map +1 -1
- package/dist/skills/builtin/index.js +163 -30
- package/dist/skills/builtin/index.js.map +1 -1
- package/dist/skills/types.d.ts +1 -1
- package/dist/skills/types.d.ts.map +1 -1
- package/dist/skills/types.js +1 -0
- package/dist/skills/types.js.map +1 -1
- package/dist/studio/server.d.ts +1 -0
- package/dist/studio/server.d.ts.map +1 -1
- package/dist/studio/server.js +148 -17
- package/dist/studio/server.js.map +1 -1
- package/dist/studio-ui/index.html +867 -2630
- package/dist/ui/components.js +105 -105
- package/examples/README.md +22 -22
- package/examples/basic-agent.ts +90 -90
- package/examples/brain-integration.ts +71 -71
- package/examples/multi-channel.ts +74 -74
- package/install.ps1 +127 -127
- package/install.sh +154 -154
- package/models.json +164 -164
- package/package.json +5 -2
- package/scripts/install.ps1 +31 -31
- package/scripts/install.sh +40 -40
- package/templates/ecommerce-assistant/README.md +45 -45
- package/templates/ecommerce-assistant/oad.yaml +47 -47
- package/templates/tech-support/README.md +43 -43
- package/templates/tech-support/oad.yaml +45 -45
package/dist/channels/voice.js
CHANGED
|
@@ -1,4 +1,20 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Voice processing module — STT (Speech-to-Text) and TTS (Text-to-Speech)
|
|
4
|
+
*
|
|
5
|
+
* STT providers:
|
|
6
|
+
* 1. Edge STT (free, via edge-stt or Whisper local)
|
|
7
|
+
* 2. Volcano Engine / Doubao STT (best Chinese, ~¥0.01/req)
|
|
8
|
+
* 3. OpenAI Whisper API ($0.006/min)
|
|
9
|
+
* 4. Local Whisper (free, needs model download)
|
|
10
|
+
*
|
|
11
|
+
* TTS providers:
|
|
12
|
+
* 1. edge-tts (free, Microsoft voices, excellent Chinese)
|
|
13
|
+
* 2. Volcano Engine TTS (Doubao voices)
|
|
14
|
+
* 3. OpenAI TTS ($0.015/1K chars)
|
|
15
|
+
*
|
|
16
|
+
* Default: Whisper API for STT + edge-tts for TTS (best quality/cost ratio)
|
|
17
|
+
*/
|
|
2
18
|
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
19
|
if (k2 === undefined) k2 = k;
|
|
4
20
|
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
@@ -33,385 +49,391 @@ var __importStar = (this && this.__importStar) || (function () {
|
|
|
33
49
|
};
|
|
34
50
|
})();
|
|
35
51
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
-
exports.
|
|
37
|
-
exports.
|
|
38
|
-
const
|
|
39
|
-
const
|
|
52
|
+
exports.VoiceProcessor = void 0;
|
|
53
|
+
exports.createVoiceProcessor = createVoiceProcessor;
|
|
54
|
+
const child_process_1 = require("child_process");
|
|
55
|
+
const fs = __importStar(require("fs"));
|
|
56
|
+
const path = __importStar(require("path"));
|
|
40
57
|
const https = __importStar(require("https"));
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
58
|
+
const http = __importStar(require("http"));
|
|
59
|
+
/** Auto-detect best available STT provider */
|
|
60
|
+
function detectSTTProvider(config) {
|
|
61
|
+
if (config.sttProvider && config.sttProvider !== 'none')
|
|
62
|
+
return config.sttProvider;
|
|
63
|
+
// Priority: volcano (best Chinese) → azure (free tier) → whisper-api → none
|
|
64
|
+
if (config.volcanoAppId || process.env.VOLC_APP_ID)
|
|
65
|
+
return 'volcano';
|
|
66
|
+
if (config.azureSpeechKey || process.env.AZURE_SPEECH_KEY)
|
|
67
|
+
return 'azure';
|
|
68
|
+
if (config.openaiApiKey || process.env.OPENAI_API_KEY)
|
|
69
|
+
return 'whisper-api';
|
|
70
|
+
return 'none';
|
|
71
|
+
}
|
|
72
|
+
/** Auto-detect best available TTS provider */
|
|
73
|
+
function detectTTSProvider(config) {
|
|
74
|
+
if (config.ttsProvider && config.ttsProvider !== 'none')
|
|
75
|
+
return config.ttsProvider;
|
|
76
|
+
// Priority: edge-tts (free) → volcano → azure → openai-tts → none
|
|
77
|
+
return 'edge-tts'; // always try edge-tts first (free, best quality)
|
|
78
|
+
}
|
|
79
|
+
const DEFAULT_CONFIG = {
|
|
80
|
+
sttProvider: 'none', // will be auto-detected
|
|
81
|
+
ttsProvider: 'edge-tts',
|
|
82
|
+
ttsVoice: 'zh-CN-XiaoxiaoNeural',
|
|
83
|
+
ttsLang: 'zh-CN',
|
|
84
|
+
tempDir: '.opc/voice-tmp',
|
|
85
|
+
};
|
|
86
|
+
class VoiceProcessor {
|
|
87
|
+
config;
|
|
88
|
+
constructor(config) {
|
|
89
|
+
this.config = {
|
|
90
|
+
...DEFAULT_CONFIG,
|
|
91
|
+
...config,
|
|
92
|
+
sttProvider: detectSTTProvider(config || {}),
|
|
93
|
+
ttsProvider: detectTTSProvider(config || {}),
|
|
94
|
+
};
|
|
95
|
+
const dir = this.config.tempDir || '.opc/voice-tmp';
|
|
96
|
+
if (!fs.existsSync(dir))
|
|
97
|
+
fs.mkdirSync(dir, { recursive: true });
|
|
47
98
|
}
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
99
|
+
/**
|
|
100
|
+
* Convert speech audio to text (STT)
|
|
101
|
+
* @param audioPath Path to audio file (.ogg, .mp3, .wav, .m4a)
|
|
102
|
+
* @returns Transcribed text
|
|
103
|
+
*/
|
|
104
|
+
async speechToText(audioPath) {
|
|
105
|
+
switch (this.config.sttProvider) {
|
|
106
|
+
case 'whisper-api':
|
|
107
|
+
return this.whisperApiSTT(audioPath);
|
|
108
|
+
case 'whisper-local':
|
|
109
|
+
return this.whisperLocalSTT(audioPath);
|
|
110
|
+
case 'volcano':
|
|
111
|
+
return this.volcanoSTT(audioPath);
|
|
112
|
+
case 'azure':
|
|
113
|
+
return this.azureSTT(audioPath);
|
|
114
|
+
default:
|
|
115
|
+
throw new Error(`STT not configured. Set voice.sttProvider in config.`);
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
/**
|
|
119
|
+
* Convert text to speech audio (TTS)
|
|
120
|
+
* @param text Text to convert
|
|
121
|
+
* @returns Path to generated audio file (.mp3)
|
|
122
|
+
*/
|
|
123
|
+
async textToSpeech(text) {
|
|
124
|
+
switch (this.config.ttsProvider) {
|
|
125
|
+
case 'edge-tts':
|
|
126
|
+
return this.edgeTTS(text);
|
|
127
|
+
case 'openai-tts':
|
|
128
|
+
return this.openaiTTS(text);
|
|
129
|
+
case 'volcano':
|
|
130
|
+
return this.volcanoTTS(text);
|
|
131
|
+
case 'azure':
|
|
132
|
+
return this.azureTTS(text);
|
|
133
|
+
default:
|
|
134
|
+
throw new Error(`TTS not configured. Set voice.ttsProvider in config.`);
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
/** Check if voice processing is available */
|
|
138
|
+
isSTTAvailable() {
|
|
139
|
+
if (this.config.sttProvider === 'none')
|
|
140
|
+
return false;
|
|
141
|
+
if (this.config.sttProvider === 'whisper-api') {
|
|
142
|
+
return !!(this.config.openaiApiKey || process.env.OPENAI_API_KEY);
|
|
143
|
+
}
|
|
144
|
+
if (this.config.sttProvider === 'azure') {
|
|
145
|
+
return !!(this.config.azureSpeechKey || process.env.AZURE_SPEECH_KEY);
|
|
146
|
+
}
|
|
147
|
+
if (this.config.sttProvider === 'volcano') {
|
|
148
|
+
return !!(this.config.volcanoAppId || process.env.VOLC_APP_ID);
|
|
149
|
+
}
|
|
150
|
+
if (this.config.sttProvider === 'whisper-local') {
|
|
151
|
+
return this.checkOllamaWhisper();
|
|
152
|
+
}
|
|
153
|
+
return true;
|
|
154
|
+
}
|
|
155
|
+
isTTSAvailable() {
|
|
156
|
+
if (this.config.ttsProvider === 'none')
|
|
157
|
+
return false;
|
|
158
|
+
if (this.config.ttsProvider === 'edge-tts')
|
|
159
|
+
return this.checkEdgeTTS();
|
|
160
|
+
return true;
|
|
161
|
+
}
|
|
162
|
+
// ─── STT Providers ───
|
|
163
|
+
async whisperApiSTT(audioPath) {
|
|
164
|
+
const apiKey = this.config.openaiApiKey || process.env.OPENAI_API_KEY;
|
|
165
|
+
const baseUrl = this.config.openaiBaseUrl || process.env.OPENAI_BASE_URL || 'https://api.openai.com/v1';
|
|
166
|
+
if (!apiKey)
|
|
167
|
+
throw new Error('OpenAI API key required for Whisper STT');
|
|
168
|
+
// Use multipart/form-data with fetch
|
|
169
|
+
const fileBuffer = fs.readFileSync(audioPath);
|
|
170
|
+
const fileName = path.basename(audioPath);
|
|
171
|
+
// Build multipart body manually
|
|
172
|
+
const boundary = '----OPCVoice' + Date.now();
|
|
52
173
|
const parts = [];
|
|
53
|
-
// file
|
|
54
|
-
parts.push(Buffer.from(`--${boundary}\r\nContent-Disposition: form-data; name="file"; filename="
|
|
55
|
-
parts.push(
|
|
174
|
+
// file part
|
|
175
|
+
parts.push(Buffer.from(`--${boundary}\r\nContent-Disposition: form-data; name="file"; filename="${fileName}"\r\nContent-Type: audio/ogg\r\n\r\n`));
|
|
176
|
+
parts.push(fileBuffer);
|
|
56
177
|
parts.push(Buffer.from('\r\n'));
|
|
57
|
-
// model
|
|
178
|
+
// model part
|
|
58
179
|
parts.push(Buffer.from(`--${boundary}\r\nContent-Disposition: form-data; name="model"\r\n\r\nwhisper-1\r\n`));
|
|
59
|
-
// language
|
|
60
|
-
|
|
61
|
-
parts.push(Buffer.from(`--${boundary}\r\nContent-Disposition: form-data; name="language"\r\n\r\n${options.language}\r\n`));
|
|
62
|
-
}
|
|
180
|
+
// language part (optimize for Chinese)
|
|
181
|
+
parts.push(Buffer.from(`--${boundary}\r\nContent-Disposition: form-data; name="language"\r\n\r\nzh\r\n`));
|
|
63
182
|
parts.push(Buffer.from(`--${boundary}--\r\n`));
|
|
64
183
|
const body = Buffer.concat(parts);
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
'Content-Length': body.length,
|
|
74
|
-
},
|
|
75
|
-
}, (res) => {
|
|
76
|
-
const chunks = [];
|
|
77
|
-
res.on('data', (c) => chunks.push(c));
|
|
78
|
-
res.on('end', () => {
|
|
79
|
-
try {
|
|
80
|
-
const data = JSON.parse(Buffer.concat(chunks).toString());
|
|
81
|
-
resolve(data.text ?? '');
|
|
82
|
-
}
|
|
83
|
-
catch (e) {
|
|
84
|
-
reject(new Error('Failed to parse Whisper response'));
|
|
85
|
-
}
|
|
86
|
-
});
|
|
87
|
-
});
|
|
88
|
-
req.on('error', reject);
|
|
89
|
-
req.write(body);
|
|
90
|
-
req.end();
|
|
184
|
+
const url = `${baseUrl}/audio/transcriptions`;
|
|
185
|
+
const response = await fetch(url, {
|
|
186
|
+
method: 'POST',
|
|
187
|
+
headers: {
|
|
188
|
+
'Authorization': `Bearer ${apiKey}`,
|
|
189
|
+
'Content-Type': `multipart/form-data; boundary=${boundary}`,
|
|
190
|
+
},
|
|
191
|
+
body,
|
|
91
192
|
});
|
|
193
|
+
if (!response.ok) {
|
|
194
|
+
const err = await response.text();
|
|
195
|
+
throw new Error(`Whisper API error (${response.status}): ${err}`);
|
|
196
|
+
}
|
|
197
|
+
const result = await response.json();
|
|
198
|
+
return result.text?.trim() || '';
|
|
92
199
|
}
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
async transcribe(audio, options) {
|
|
103
|
-
const lang = options?.language ?? 'en';
|
|
104
|
-
return new Promise((resolve, reject) => {
|
|
105
|
-
const req = https.request({
|
|
106
|
-
hostname: 'api.deepgram.com',
|
|
107
|
-
path: `/v1/listen?language=${lang}&model=nova-2`,
|
|
108
|
-
method: 'POST',
|
|
109
|
-
headers: {
|
|
110
|
-
'Authorization': `Token ${this.apiKey}`,
|
|
111
|
-
'Content-Type': 'audio/wav',
|
|
112
|
-
'Content-Length': audio.length,
|
|
113
|
-
},
|
|
114
|
-
}, (res) => {
|
|
115
|
-
const chunks = [];
|
|
116
|
-
res.on('data', (c) => chunks.push(c));
|
|
117
|
-
res.on('end', () => {
|
|
118
|
-
try {
|
|
119
|
-
const data = JSON.parse(Buffer.concat(chunks).toString());
|
|
120
|
-
const transcript = data?.results?.channels?.[0]?.alternatives?.[0]?.transcript ?? '';
|
|
121
|
-
resolve(transcript);
|
|
122
|
-
}
|
|
123
|
-
catch {
|
|
124
|
-
reject(new Error('Failed to parse Deepgram response'));
|
|
125
|
-
}
|
|
126
|
-
});
|
|
200
|
+
async whisperLocalSTT(audioPath) {
|
|
201
|
+
// Use Ollama's audio models or local whisper.cpp
|
|
202
|
+
const ollamaUrl = this.config.ollamaUrl || 'http://localhost:11434';
|
|
203
|
+
try {
|
|
204
|
+
// Try whisper via Ollama (if audio model available)
|
|
205
|
+
// Fallback: use whisper.cpp CLI
|
|
206
|
+
const result = (0, child_process_1.execSync)(`whisper "${audioPath}" --language zh --output_format txt`, {
|
|
207
|
+
encoding: 'utf-8',
|
|
208
|
+
timeout: 30000,
|
|
127
209
|
});
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
210
|
+
return result.trim();
|
|
211
|
+
}
|
|
212
|
+
catch {
|
|
213
|
+
throw new Error('Local Whisper not available. Install whisper.cpp or use whisper-api provider.');
|
|
214
|
+
}
|
|
132
215
|
}
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
216
|
+
async volcanoSTT(audioPath) {
|
|
217
|
+
// 火山引擎一句话识别 HTTP API
|
|
218
|
+
const appId = this.config.volcanoAppId || process.env.VOLC_APP_ID || '';
|
|
219
|
+
const token = this.config.volcanoToken || process.env.VOLC_ACCESS_TOKEN || '';
|
|
220
|
+
const cluster = this.config.volcanoCluster || process.env.VOLC_CLUSTER || 'volcengine_input_common';
|
|
221
|
+
if (!appId || !token)
|
|
222
|
+
throw new Error('Volcano Engine credentials required (VOLC_APP_ID + VOLC_ACCESS_TOKEN)');
|
|
223
|
+
const audioData = fs.readFileSync(audioPath);
|
|
224
|
+
const base64Audio = audioData.toString('base64');
|
|
225
|
+
const payload = {
|
|
226
|
+
app: { appid: appId, cluster },
|
|
227
|
+
user: { uid: 'opc-agent' },
|
|
228
|
+
audio: {
|
|
229
|
+
format: 'ogg',
|
|
230
|
+
codec: 'opus',
|
|
231
|
+
rate: 16000,
|
|
232
|
+
bits: 16,
|
|
233
|
+
channel: 1,
|
|
234
|
+
},
|
|
235
|
+
request: {
|
|
236
|
+
reqid: `opc-${Date.now()}`,
|
|
237
|
+
sequence: -1,
|
|
238
|
+
nbest: 1,
|
|
239
|
+
text: '',
|
|
240
|
+
},
|
|
241
|
+
data: base64Audio,
|
|
242
|
+
};
|
|
243
|
+
const response = await fetch('https://openspeech.bytedance.com/api/v1/asr', {
|
|
244
|
+
method: 'POST',
|
|
245
|
+
headers: {
|
|
246
|
+
'Content-Type': 'application/json',
|
|
247
|
+
'Authorization': `Bearer; ${token}`,
|
|
248
|
+
},
|
|
249
|
+
body: JSON.stringify(payload),
|
|
250
|
+
});
|
|
251
|
+
if (!response.ok) {
|
|
252
|
+
throw new Error(`Volcano STT error (${response.status}): ${await response.text()}`);
|
|
253
|
+
}
|
|
254
|
+
const result = await response.json();
|
|
255
|
+
return result?.result?.[0]?.text?.trim() || result?.result || '';
|
|
141
256
|
}
|
|
142
|
-
async
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
257
|
+
async azureSTT(audioPath) {
|
|
258
|
+
// Azure Cognitive Services Speech-to-Text REST API
|
|
259
|
+
const key = this.config.azureSpeechKey || process.env.AZURE_SPEECH_KEY || '';
|
|
260
|
+
const region = this.config.azureSpeechRegion || process.env.AZURE_SPEECH_REGION || 'eastasia';
|
|
261
|
+
if (!key)
|
|
262
|
+
throw new Error('Azure Speech key required (AZURE_SPEECH_KEY)');
|
|
263
|
+
const audioData = fs.readFileSync(audioPath);
|
|
264
|
+
const url = `https://${region}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1?language=zh-CN&format=detailed`;
|
|
265
|
+
const response = await fetch(url, {
|
|
266
|
+
method: 'POST',
|
|
267
|
+
headers: {
|
|
268
|
+
'Ocp-Apim-Subscription-Key': key,
|
|
269
|
+
'Content-Type': 'audio/ogg; codecs=opus',
|
|
270
|
+
'Accept': 'application/json',
|
|
271
|
+
},
|
|
272
|
+
body: audioData,
|
|
273
|
+
});
|
|
274
|
+
if (!response.ok) {
|
|
275
|
+
throw new Error(`Azure STT error (${response.status}): ${await response.text()}`);
|
|
146
276
|
}
|
|
147
|
-
const
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
277
|
+
const result = await response.json();
|
|
278
|
+
return result?.DisplayText?.trim() || result?.NBest?.[0]?.Display?.trim() || '';
|
|
279
|
+
}
|
|
280
|
+
// ─── TTS Providers ───
|
|
281
|
+
async edgeTTS(text) {
|
|
282
|
+
const voice = this.config.ttsVoice || 'zh-CN-XiaoxiaoNeural';
|
|
283
|
+
const outPath = path.join(this.config.tempDir || '.opc/voice-tmp', `tts-${Date.now()}.mp3`);
|
|
284
|
+
// edge-tts is a Python package: pip install edge-tts
|
|
151
285
|
return new Promise((resolve, reject) => {
|
|
152
|
-
const
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
const audioChunks = [];
|
|
159
|
-
let headerSent = false;
|
|
160
|
-
ws.on('open', () => {
|
|
161
|
-
// Send config
|
|
162
|
-
ws.send(`Content-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n{"context":{"synthesis":{"audio":{"metadataoptions":{"sentenceBoundaryEnabled":"false","wordBoundaryEnabled":"false"},"outputFormat":"audio-24khz-48kbitrate-mono-mp3"}}}}`);
|
|
163
|
-
// Send SSML
|
|
164
|
-
const ssml = `<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'><voice name='${voice}'>${escapeXml(text)}</voice></speak>`;
|
|
165
|
-
ws.send(`X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nX-Timestamp:${timestamp}\r\nPath:ssml\r\n\r\n${ssml}`);
|
|
166
|
-
});
|
|
167
|
-
ws.on('message', (data) => {
|
|
168
|
-
if (typeof data === 'string' || (Buffer.isBuffer(data) && data.toString().includes('Path:turn.end'))) {
|
|
169
|
-
if (typeof data === 'string' && data.includes('Path:turn.end')) {
|
|
170
|
-
ws.close();
|
|
171
|
-
resolve(Buffer.concat(audioChunks));
|
|
172
|
-
}
|
|
173
|
-
}
|
|
174
|
-
else if (Buffer.isBuffer(data)) {
|
|
175
|
-
// Binary message — extract audio after header
|
|
176
|
-
const headerEnd = data.indexOf(Buffer.from('\r\n\r\n'));
|
|
177
|
-
if (headerEnd !== -1) {
|
|
178
|
-
audioChunks.push(data.slice(headerEnd + 4));
|
|
179
|
-
}
|
|
180
|
-
}
|
|
181
|
-
});
|
|
182
|
-
ws.on('error', (err) => {
|
|
183
|
-
reject(new Error(`Edge TTS WebSocket error: ${err.message}`));
|
|
184
|
-
});
|
|
185
|
-
ws.on('close', () => {
|
|
186
|
-
if (audioChunks.length > 0) {
|
|
187
|
-
resolve(Buffer.concat(audioChunks));
|
|
188
|
-
}
|
|
189
|
-
});
|
|
190
|
-
// Timeout
|
|
191
|
-
setTimeout(() => {
|
|
192
|
-
ws.close();
|
|
193
|
-
if (audioChunks.length > 0) {
|
|
194
|
-
resolve(Buffer.concat(audioChunks));
|
|
286
|
+
const escaped = text.replace(/"/g, '\\"').replace(/\n/g, ' ');
|
|
287
|
+
(0, child_process_1.exec)(`edge-tts --voice "${voice}" --text "${escaped}" --write-media "${outPath}"`, {
|
|
288
|
+
timeout: 30000,
|
|
289
|
+
}, (err) => {
|
|
290
|
+
if (err) {
|
|
291
|
+
reject(new Error(`edge-tts failed: ${err.message}. Install with: pip install edge-tts`));
|
|
195
292
|
}
|
|
196
293
|
else {
|
|
197
|
-
|
|
294
|
+
resolve(outPath);
|
|
198
295
|
}
|
|
199
|
-
}, 30000);
|
|
200
|
-
});
|
|
201
|
-
}
|
|
202
|
-
}
|
|
203
|
-
exports.EdgeTTSProvider = EdgeTTSProvider;
|
|
204
|
-
function escapeXml(text) {
|
|
205
|
-
return text.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;').replace(/"/g, '&quot;').replace(/'/g, '&apos;');
|
|
206
|
-
}
|
|
207
|
-
// ── OpenAI TTS Provider ─────────────────────────────────────
|
|
208
|
-
class OpenAITTSProvider {
|
|
209
|
-
name = 'openai-tts';
|
|
210
|
-
apiKey;
|
|
211
|
-
defaultVoice;
|
|
212
|
-
constructor(apiKey, voice) {
|
|
213
|
-
this.apiKey = apiKey;
|
|
214
|
-
this.defaultVoice = voice ?? 'alloy';
|
|
215
|
-
}
|
|
216
|
-
async synthesize(text, options) {
|
|
217
|
-
const voice = options?.voice ?? this.defaultVoice;
|
|
218
|
-
const body = JSON.stringify({
|
|
219
|
-
model: 'tts-1',
|
|
220
|
-
input: text,
|
|
221
|
-
voice,
|
|
222
|
-
speed: options?.speed ?? 1.0,
|
|
223
|
-
});
|
|
224
|
-
return new Promise((resolve, reject) => {
|
|
225
|
-
const req = https.request({
|
|
226
|
-
hostname: 'api.openai.com',
|
|
227
|
-
path: '/v1/audio/speech',
|
|
228
|
-
method: 'POST',
|
|
229
|
-
headers: {
|
|
230
|
-
'Authorization': `Bearer ${this.apiKey}`,
|
|
231
|
-
'Content-Type': 'application/json',
|
|
232
|
-
'Content-Length': Buffer.byteLength(body),
|
|
233
|
-
},
|
|
234
|
-
}, (res) => {
|
|
235
|
-
const chunks = [];
|
|
236
|
-
res.on('data', (c) => chunks.push(c));
|
|
237
|
-
res.on('end', () => resolve(Buffer.concat(chunks)));
|
|
238
296
|
});
|
|
239
|
-
req.on('error', reject);
|
|
240
|
-
req.write(body);
|
|
241
|
-
req.end();
|
|
242
297
|
});
|
|
243
298
|
}
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
299
|
+
async openaiTTS(text) {
|
|
300
|
+
const apiKey = this.config.openaiApiKey || process.env.OPENAI_API_KEY;
|
|
301
|
+
const baseUrl = this.config.openaiBaseUrl || process.env.OPENAI_BASE_URL || 'https://api.openai.com/v1';
|
|
302
|
+
if (!apiKey)
|
|
303
|
+
throw new Error('OpenAI API key required for TTS');
|
|
304
|
+
const outPath = path.join(this.config.tempDir || '.opc/voice-tmp', `tts-${Date.now()}.mp3`);
|
|
305
|
+
const response = await fetch(`${baseUrl}/audio/speech`, {
|
|
306
|
+
method: 'POST',
|
|
307
|
+
headers: {
|
|
308
|
+
'Authorization': `Bearer ${apiKey}`,
|
|
309
|
+
'Content-Type': 'application/json',
|
|
310
|
+
},
|
|
311
|
+
body: JSON.stringify({
|
|
312
|
+
model: 'tts-1',
|
|
313
|
+
voice: 'nova',
|
|
314
|
+
input: text,
|
|
315
|
+
}),
|
|
260
316
|
});
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
headers: {
|
|
267
|
-
'xi-api-key': this.apiKey,
|
|
268
|
-
'Content-Type': 'application/json',
|
|
269
|
-
'Content-Length': Buffer.byteLength(body),
|
|
270
|
-
},
|
|
271
|
-
}, (res) => {
|
|
272
|
-
const chunks = [];
|
|
273
|
-
res.on('data', (c) => chunks.push(c));
|
|
274
|
-
res.on('end', () => resolve(Buffer.concat(chunks)));
|
|
275
|
-
});
|
|
276
|
-
req.on('error', reject);
|
|
277
|
-
req.write(body);
|
|
278
|
-
req.end();
|
|
279
|
-
});
|
|
280
|
-
}
|
|
281
|
-
}
|
|
282
|
-
exports.ElevenLabsTTSProvider = ElevenLabsTTSProvider;
|
|
283
|
-
// ── Voice Config Factory ────────────────────────────────────
|
|
284
|
-
function createVoiceProviders(config) {
|
|
285
|
-
let stt;
|
|
286
|
-
let tts;
|
|
287
|
-
switch (config.sttProvider) {
|
|
288
|
-
case 'whisper':
|
|
289
|
-
if (config.sttApiKey)
|
|
290
|
-
stt = new WhisperSTTProvider(config.sttApiKey);
|
|
291
|
-
break;
|
|
292
|
-
case 'deepgram':
|
|
293
|
-
if (config.sttApiKey)
|
|
294
|
-
stt = new DeepgramSTTProvider(config.sttApiKey);
|
|
295
|
-
break;
|
|
296
|
-
case 'web-speech':
|
|
297
|
-
// Browser only — not available in Node.js
|
|
298
|
-
break;
|
|
299
|
-
}
|
|
300
|
-
switch (config.ttsProvider) {
|
|
301
|
-
case 'edge-tts':
|
|
302
|
-
tts = new EdgeTTSProvider(config.voice);
|
|
303
|
-
break;
|
|
304
|
-
case 'openai-tts':
|
|
305
|
-
if (config.ttsApiKey)
|
|
306
|
-
tts = new OpenAITTSProvider(config.ttsApiKey, config.voice);
|
|
307
|
-
break;
|
|
308
|
-
case 'elevenlabs':
|
|
309
|
-
if (config.ttsApiKey)
|
|
310
|
-
tts = new ElevenLabsTTSProvider(config.ttsApiKey, config.voice);
|
|
311
|
-
break;
|
|
312
|
-
}
|
|
313
|
-
return { stt, tts };
|
|
314
|
-
}
|
|
315
|
-
// ── Voice Channel ───────────────────────────────────────────
|
|
316
|
-
class VoiceChannel extends index_1.BaseChannel {
|
|
317
|
-
type = 'voice';
|
|
318
|
-
config;
|
|
319
|
-
logger = new logger_1.Logger('voice-channel');
|
|
320
|
-
running = false;
|
|
321
|
-
conversationActive = false;
|
|
322
|
-
constructor(config) {
|
|
323
|
-
super();
|
|
324
|
-
this.config = config ?? {};
|
|
317
|
+
if (!response.ok)
|
|
318
|
+
throw new Error(`OpenAI TTS error: ${response.status}`);
|
|
319
|
+
const buffer = Buffer.from(await response.arrayBuffer());
|
|
320
|
+
fs.writeFileSync(outPath, buffer);
|
|
321
|
+
return outPath;
|
|
325
322
|
}
|
|
326
|
-
async
|
|
327
|
-
|
|
328
|
-
this.
|
|
329
|
-
|
|
330
|
-
|
|
323
|
+
async volcanoTTS(text) {
|
|
324
|
+
// 火山引擎语音合成 HTTP API
|
|
325
|
+
const appId = this.config.volcanoAppId || process.env.VOLC_APP_ID || '';
|
|
326
|
+
const token = this.config.volcanoToken || process.env.VOLC_ACCESS_TOKEN || '';
|
|
327
|
+
const cluster = this.config.volcanoCluster || process.env.VOLC_CLUSTER || 'volcengine_tts';
|
|
328
|
+
if (!appId || !token)
|
|
329
|
+
throw new Error('Volcano Engine credentials required');
|
|
330
|
+
const outPath = path.join(this.config.tempDir || '.opc/voice-tmp', `tts-${Date.now()}.mp3`);
|
|
331
|
+
const voice = this.config.ttsVoice || 'zh_female_shuangkuaisisi_moon_bigtts';
|
|
332
|
+
const payload = {
|
|
333
|
+
app: { appid: appId, cluster },
|
|
334
|
+
user: { uid: 'opc-agent' },
|
|
335
|
+
audio: { voice_type: voice, encoding: 'mp3', speed_ratio: 1.0 },
|
|
336
|
+
request: { reqid: `opc-${Date.now()}`, operation: 'query', text },
|
|
337
|
+
};
|
|
338
|
+
const response = await fetch('https://openspeech.bytedance.com/api/v1/tts', {
|
|
339
|
+
method: 'POST',
|
|
340
|
+
headers: {
|
|
341
|
+
'Content-Type': 'application/json',
|
|
342
|
+
'Authorization': `Bearer; ${token}`,
|
|
343
|
+
},
|
|
344
|
+
body: JSON.stringify(payload),
|
|
331
345
|
});
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
return this.running;
|
|
340
|
-
}
|
|
341
|
-
/** Transcribe audio to text */
|
|
342
|
-
async transcribe(audio, format) {
|
|
343
|
-
if (!this.config.sttProvider) {
|
|
344
|
-
throw new Error('No STT provider configured');
|
|
345
|
-
}
|
|
346
|
-
return this.config.sttProvider.transcribe(audio, { language: this.config.language });
|
|
347
|
-
}
|
|
348
|
-
/** Synthesize text to audio */
|
|
349
|
-
async synthesize(text, voice) {
|
|
350
|
-
if (!this.config.ttsProvider) {
|
|
351
|
-
throw new Error('No TTS provider configured');
|
|
346
|
+
if (!response.ok)
|
|
347
|
+
throw new Error(`Volcano TTS error: ${response.status}`);
|
|
348
|
+
const result = await response.json();
|
|
349
|
+
if (result?.data) {
|
|
350
|
+
const audioBuffer = Buffer.from(result.data, 'base64');
|
|
351
|
+
fs.writeFileSync(outPath, audioBuffer);
|
|
352
|
+
return outPath;
|
|
352
353
|
}
|
|
353
|
-
|
|
354
|
+
throw new Error('Volcano TTS returned no audio data');
|
|
354
355
|
}
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
this.
|
|
361
|
-
|
|
362
|
-
|
|
356
|
+
async azureTTS(text) {
|
|
357
|
+
const key = this.config.azureSpeechKey || process.env.AZURE_SPEECH_KEY || '';
|
|
358
|
+
const region = this.config.azureSpeechRegion || process.env.AZURE_SPEECH_REGION || 'eastasia';
|
|
359
|
+
if (!key)
|
|
360
|
+
throw new Error('Azure Speech key required');
|
|
361
|
+
const outPath = path.join(this.config.tempDir || '.opc/voice-tmp', `tts-${Date.now()}.mp3`);
|
|
362
|
+
const voice = this.config.ttsVoice || 'zh-CN-XiaoxiaoNeural';
|
|
363
|
+
const ssml = `<speak version='1.0' xml:lang='zh-CN'><voice name='${voice}'>${text.replace(/&/g, '&amp;').replace(/</g, '&lt;')}</voice></speak>`;
|
|
364
|
+
const response = await fetch(`https://${region}.tts.speech.microsoft.com/cognitiveservices/v1`, {
|
|
365
|
+
method: 'POST',
|
|
366
|
+
headers: {
|
|
367
|
+
'Ocp-Apim-Subscription-Key': key,
|
|
368
|
+
'Content-Type': 'application/ssml+xml',
|
|
369
|
+
'X-Microsoft-OutputFormat': 'audio-16khz-128kbitrate-mono-mp3',
|
|
370
|
+
},
|
|
371
|
+
body: ssml,
|
|
372
|
+
});
|
|
373
|
+
if (!response.ok)
|
|
374
|
+
throw new Error(`Azure TTS error: ${response.status}`);
|
|
375
|
+
const buffer = Buffer.from(await response.arrayBuffer());
|
|
376
|
+
fs.writeFileSync(outPath, buffer);
|
|
377
|
+
return outPath;
|
|
363
378
|
}
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
if (this.config.ttsProvider) {
|
|
370
|
-
audioResponse = await this.synthesize(response);
|
|
379
|
+
// ─── Helpers ───
|
|
380
|
+
checkEdgeTTS() {
|
|
381
|
+
try {
|
|
382
|
+
(0, child_process_1.execSync)('edge-tts --version', { stdio: 'pipe', timeout: 5000 });
|
|
383
|
+
return true;
|
|
371
384
|
}
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
stopConversation() {
|
|
375
|
-
this.conversationActive = false;
|
|
376
|
-
}
|
|
377
|
-
isConversationActive() {
|
|
378
|
-
return this.conversationActive;
|
|
379
|
-
}
|
|
380
|
-
/** Process audio input: STT → Agent → TTS */
|
|
381
|
-
async processAudio(audio) {
|
|
382
|
-
if (!this.handler)
|
|
383
|
-
throw new Error('No message handler set');
|
|
384
|
-
// STT
|
|
385
|
-
let text;
|
|
386
|
-
if (this.config.sttProvider) {
|
|
387
|
-
text = await this.config.sttProvider.transcribe(audio, { language: this.config.language });
|
|
385
|
+
catch {
|
|
386
|
+
return false;
|
|
388
387
|
}
|
|
389
|
-
|
|
390
|
-
|
|
388
|
+
}
|
|
389
|
+
checkOllamaWhisper() {
|
|
390
|
+
try {
|
|
391
|
+
(0, child_process_1.execSync)('whisper --help', { stdio: 'pipe', timeout: 5000 });
|
|
392
|
+
return true;
|
|
391
393
|
}
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
const message = {
|
|
395
|
-
id: `voice_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`,
|
|
396
|
-
role: 'user',
|
|
397
|
-
content: text,
|
|
398
|
-
timestamp: Date.now(),
|
|
399
|
-
metadata: { channel: 'voice' },
|
|
400
|
-
};
|
|
401
|
-
const response = await this.handler(message);
|
|
402
|
-
// TTS
|
|
403
|
-
let audioResponse;
|
|
404
|
-
if (this.config.ttsProvider) {
|
|
405
|
-
audioResponse = await this.config.ttsProvider.synthesize(response.content, { language: this.config.language });
|
|
394
|
+
catch {
|
|
395
|
+
return false;
|
|
406
396
|
}
|
|
407
|
-
return { text, response: response.content, audioResponse };
|
|
408
397
|
}
|
|
409
|
-
|
|
410
|
-
|
|
398
|
+
/** Download a file from URL to local path */
|
|
399
|
+
async downloadFile(url, destPath) {
|
|
400
|
+
return new Promise((resolve, reject) => {
|
|
401
|
+
const client = url.startsWith('https') ? https : http;
|
|
402
|
+
const req = client.get(url, (res) => {
|
|
403
|
+
if (res.statusCode === 301 || res.statusCode === 302) {
|
|
404
|
+
// Follow redirect
|
|
405
|
+
this.downloadFile(res.headers.location, destPath).then(resolve).catch(reject);
|
|
406
|
+
return;
|
|
407
|
+
}
|
|
408
|
+
const ws = fs.createWriteStream(destPath);
|
|
409
|
+
res.pipe(ws);
|
|
410
|
+
ws.on('finish', () => { ws.close(); resolve(); });
|
|
411
|
+
ws.on('error', reject);
|
|
412
|
+
});
|
|
413
|
+
req.on('error', reject);
|
|
414
|
+
req.setTimeout(30000, () => { req.destroy(); reject(new Error('Download timeout')); });
|
|
415
|
+
});
|
|
411
416
|
}
|
|
412
|
-
|
|
413
|
-
|
|
417
|
+
/** Cleanup temp files */
|
|
418
|
+
cleanup() {
|
|
419
|
+
const dir = this.config.tempDir || '.opc/voice-tmp';
|
|
420
|
+
try {
|
|
421
|
+
const files = fs.readdirSync(dir);
|
|
422
|
+
const now = Date.now();
|
|
423
|
+
for (const f of files) {
|
|
424
|
+
const fp = path.join(dir, f);
|
|
425
|
+
const stat = fs.statSync(fp);
|
|
426
|
+
// Remove files older than 1 hour
|
|
427
|
+
if (now - stat.mtimeMs > 3600000) {
|
|
428
|
+
fs.unlinkSync(fp);
|
|
429
|
+
}
|
|
430
|
+
}
|
|
431
|
+
}
|
|
432
|
+
catch { /* ignore */ }
|
|
414
433
|
}
|
|
415
434
|
}
|
|
416
|
-
exports.
|
|
435
|
+
exports.VoiceProcessor = VoiceProcessor;
|
|
436
|
+
function createVoiceProcessor(config) {
|
|
437
|
+
return new VoiceProcessor(config);
|
|
438
|
+
}
|
|
417
439
|
//# sourceMappingURL=voice.js.map
|