opc-agent 4.1.23 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +59 -119
- package/COMPETITIVE-GAP.md +92 -92
- package/CONTRIBUTING.md +36 -36
- package/README.md +290 -290
- package/README.zh-CN.md +269 -269
- package/dist/channels/telegram.d.ts +0 -5
- package/dist/channels/telegram.d.ts.map +1 -1
- package/dist/channels/telegram.js +0 -108
- package/dist/channels/telegram.js.map +1 -1
- package/dist/channels/voice.d.ts +97 -71
- package/dist/channels/voice.d.ts.map +1 -1
- package/dist/channels/voice.js +347 -369
- package/dist/channels/voice.js.map +1 -1
- package/dist/channels/web.d.ts.map +1 -1
- package/dist/channels/web.js +2 -8
- package/dist/channels/web.js.map +1 -1
- package/dist/channels/wechat.js +6 -6
- package/dist/cli/chat.d.ts +1 -4
- package/dist/cli/chat.d.ts.map +1 -1
- package/dist/cli/chat.js +73 -680
- package/dist/cli/chat.js.map +1 -1
- package/dist/cli/setup.js +1 -1
- package/dist/cli/setup.js.map +1 -1
- package/dist/cli.js +280 -373
- package/dist/cli.js.map +1 -1
- package/dist/core/agent.d.ts +0 -1
- package/dist/core/agent.d.ts.map +1 -1
- package/dist/core/agent.js +0 -3
- package/dist/core/agent.js.map +1 -1
- package/dist/core/runtime.d.ts.map +1 -1
- package/dist/core/runtime.js +22 -192
- package/dist/core/runtime.js.map +1 -1
- package/dist/deploy/index.js +56 -56
- package/dist/doctor.d.ts +0 -1
- package/dist/doctor.d.ts.map +1 -1
- package/dist/doctor.js +10 -155
- package/dist/doctor.js.map +1 -1
- package/dist/index.d.ts +3 -4
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +9 -9
- package/dist/index.js.map +1 -1
- package/dist/memory/deepbrain.d.ts +1 -1
- package/dist/memory/deepbrain.d.ts.map +1 -1
- package/dist/memory/deepbrain.js +4 -95
- package/dist/memory/deepbrain.js.map +1 -1
- package/dist/memory/index.d.ts +0 -2
- package/dist/memory/index.d.ts.map +1 -1
- package/dist/memory/index.js +1 -3
- package/dist/memory/index.js.map +1 -1
- package/dist/memory/user-profiler.d.ts +0 -8
- package/dist/memory/user-profiler.d.ts.map +1 -1
- package/dist/memory/user-profiler.js +0 -89
- package/dist/memory/user-profiler.js.map +1 -1
- package/dist/scheduler/cron-engine.d.ts.map +1 -1
- package/dist/scheduler/cron-engine.js +36 -3
- package/dist/scheduler/cron-engine.js.map +1 -1
- package/dist/skills/auto-learn.d.ts.map +1 -1
- package/dist/skills/auto-learn.js +11 -65
- package/dist/skills/auto-learn.js.map +1 -1
- package/dist/skills/builtin/index.d.ts.map +1 -1
- package/dist/skills/builtin/index.js +30 -163
- package/dist/skills/builtin/index.js.map +1 -1
- package/dist/skills/types.d.ts +1 -1
- package/dist/skills/types.d.ts.map +1 -1
- package/dist/skills/types.js +0 -1
- package/dist/skills/types.js.map +1 -1
- package/dist/studio/server.d.ts +0 -1
- package/dist/studio/server.d.ts.map +1 -1
- package/dist/studio/server.js +12 -142
- package/dist/studio/server.js.map +1 -1
- package/dist/studio-ui/index.html +26 -365
- package/dist/ui/components.js +105 -105
- package/examples/README.md +22 -22
- package/examples/basic-agent.ts +90 -90
- package/examples/brain-integration.ts +71 -71
- package/examples/multi-channel.ts +74 -74
- package/install.ps1 +127 -127
- package/install.sh +154 -154
- package/models.json +164 -164
- package/package.json +63 -66
- package/scripts/install.ps1 +31 -31
- package/scripts/install.sh +40 -40
- package/templates/ecommerce-assistant/README.md +45 -45
- package/templates/ecommerce-assistant/oad.yaml +47 -47
- package/templates/tech-support/README.md +43 -43
- package/templates/tech-support/oad.yaml +45 -45
- package/.opc/memory.db +0 -0
- package/dist/core/model-recommender.d.ts +0 -40
- package/dist/core/model-recommender.d.ts.map +0 -1
- package/dist/core/model-recommender.js +0 -186
- package/dist/core/model-recommender.js.map +0 -1
- package/dist/memory/evolve-engine.d.ts +0 -113
- package/dist/memory/evolve-engine.d.ts.map +0 -1
- package/dist/memory/evolve-engine.js +0 -549
- package/dist/memory/evolve-engine.js.map +0 -1
- package/dist/memory/sqlite-store.d.ts +0 -40
- package/dist/memory/sqlite-store.d.ts.map +0 -1
- package/dist/memory/sqlite-store.js +0 -269
- package/dist/memory/sqlite-store.js.map +0 -1
- package/dist/scheduler/proactive.d.ts +0 -62
- package/dist/scheduler/proactive.d.ts.map +0 -1
- package/dist/scheduler/proactive.js +0 -185
- package/dist/scheduler/proactive.js.map +0 -1
package/dist/channels/voice.js
CHANGED
|
@@ -1,20 +1,4 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
-
/**
|
|
3
|
-
* Voice processing module — STT (Speech-to-Text) and TTS (Text-to-Speech)
|
|
4
|
-
*
|
|
5
|
-
* STT providers:
|
|
6
|
-
* 1. Edge STT (free, via edge-stt or Whisper local)
|
|
7
|
-
* 2. Volcano Engine / Doubao STT (best Chinese, ~¥0.01/req)
|
|
8
|
-
* 3. OpenAI Whisper API ($0.006/min)
|
|
9
|
-
* 4. Local Whisper (free, needs model download)
|
|
10
|
-
*
|
|
11
|
-
* TTS providers:
|
|
12
|
-
* 1. edge-tts (free, Microsoft voices, excellent Chinese)
|
|
13
|
-
* 2. Volcano Engine TTS (Doubao voices)
|
|
14
|
-
* 3. OpenAI TTS ($0.015/1K chars)
|
|
15
|
-
*
|
|
16
|
-
* Default: Whisper API for STT + edge-tts for TTS (best quality/cost ratio)
|
|
17
|
-
*/
|
|
18
2
|
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
19
3
|
if (k2 === undefined) k2 = k;
|
|
20
4
|
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
@@ -49,391 +33,385 @@ var __importStar = (this && this.__importStar) || (function () {
|
|
|
49
33
|
};
|
|
50
34
|
})();
|
|
51
35
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
52
|
-
exports.
|
|
53
|
-
exports.
|
|
54
|
-
const
|
|
55
|
-
const
|
|
56
|
-
const path = __importStar(require("path"));
|
|
36
|
+
exports.VoiceChannel = exports.ElevenLabsTTSProvider = exports.OpenAITTSProvider = exports.EdgeTTSProvider = exports.DeepgramSTTProvider = exports.WhisperSTTProvider = void 0;
|
|
37
|
+
exports.createVoiceProviders = createVoiceProviders;
|
|
38
|
+
const index_1 = require("./index");
|
|
39
|
+
const logger_1 = require("../core/logger");
|
|
57
40
|
const https = __importStar(require("https"));
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
if (config.volcanoAppId || process.env.VOLC_APP_ID)
|
|
65
|
-
return 'volcano';
|
|
66
|
-
if (config.azureSpeechKey || process.env.AZURE_SPEECH_KEY)
|
|
67
|
-
return 'azure';
|
|
68
|
-
if (config.openaiApiKey || process.env.OPENAI_API_KEY)
|
|
69
|
-
return 'whisper-api';
|
|
70
|
-
return 'none';
|
|
71
|
-
}
|
|
72
|
-
/** Auto-detect best available TTS provider */
|
|
73
|
-
function detectTTSProvider(config) {
|
|
74
|
-
if (config.ttsProvider && config.ttsProvider !== 'none')
|
|
75
|
-
return config.ttsProvider;
|
|
76
|
-
// Priority: edge-tts (free) → volcano → azure → openai-tts → none
|
|
77
|
-
return 'edge-tts'; // always try edge-tts first (free, best quality)
|
|
78
|
-
}
|
|
79
|
-
const DEFAULT_CONFIG = {
|
|
80
|
-
sttProvider: 'none', // will be auto-detected
|
|
81
|
-
ttsProvider: 'edge-tts',
|
|
82
|
-
ttsVoice: 'zh-CN-XiaoxiaoNeural',
|
|
83
|
-
ttsLang: 'zh-CN',
|
|
84
|
-
tempDir: '.opc/voice-tmp',
|
|
85
|
-
};
|
|
86
|
-
class VoiceProcessor {
|
|
87
|
-
config;
|
|
88
|
-
constructor(config) {
|
|
89
|
-
this.config = {
|
|
90
|
-
...DEFAULT_CONFIG,
|
|
91
|
-
...config,
|
|
92
|
-
sttProvider: detectSTTProvider(config || {}),
|
|
93
|
-
ttsProvider: detectTTSProvider(config || {}),
|
|
94
|
-
};
|
|
95
|
-
const dir = this.config.tempDir || '.opc/voice-tmp';
|
|
96
|
-
if (!fs.existsSync(dir))
|
|
97
|
-
fs.mkdirSync(dir, { recursive: true });
|
|
41
|
+
// ── Whisper STT Provider ────────────────────────────────────
|
|
42
|
+
class WhisperSTTProvider {
|
|
43
|
+
name = 'whisper';
|
|
44
|
+
apiKey;
|
|
45
|
+
constructor(apiKey) {
|
|
46
|
+
this.apiKey = apiKey;
|
|
98
47
|
}
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
*/
|
|
104
|
-
async speechToText(audioPath) {
|
|
105
|
-
switch (this.config.sttProvider) {
|
|
106
|
-
case 'whisper-api':
|
|
107
|
-
return this.whisperApiSTT(audioPath);
|
|
108
|
-
case 'whisper-local':
|
|
109
|
-
return this.whisperLocalSTT(audioPath);
|
|
110
|
-
case 'volcano':
|
|
111
|
-
return this.volcanoSTT(audioPath);
|
|
112
|
-
case 'azure':
|
|
113
|
-
return this.azureSTT(audioPath);
|
|
114
|
-
default:
|
|
115
|
-
throw new Error(`STT not configured. Set voice.sttProvider in config.`);
|
|
116
|
-
}
|
|
117
|
-
}
|
|
118
|
-
/**
|
|
119
|
-
* Convert text to speech audio (TTS)
|
|
120
|
-
* @param text Text to convert
|
|
121
|
-
* @returns Path to generated audio file (.mp3)
|
|
122
|
-
*/
|
|
123
|
-
async textToSpeech(text) {
|
|
124
|
-
switch (this.config.ttsProvider) {
|
|
125
|
-
case 'edge-tts':
|
|
126
|
-
return this.edgeTTS(text);
|
|
127
|
-
case 'openai-tts':
|
|
128
|
-
return this.openaiTTS(text);
|
|
129
|
-
case 'volcano':
|
|
130
|
-
return this.volcanoTTS(text);
|
|
131
|
-
case 'azure':
|
|
132
|
-
return this.azureTTS(text);
|
|
133
|
-
default:
|
|
134
|
-
throw new Error(`TTS not configured. Set voice.ttsProvider in config.`);
|
|
135
|
-
}
|
|
136
|
-
}
|
|
137
|
-
/** Check if voice processing is available */
|
|
138
|
-
isSTTAvailable() {
|
|
139
|
-
if (this.config.sttProvider === 'none')
|
|
140
|
-
return false;
|
|
141
|
-
if (this.config.sttProvider === 'whisper-api') {
|
|
142
|
-
return !!(this.config.openaiApiKey || process.env.OPENAI_API_KEY);
|
|
143
|
-
}
|
|
144
|
-
if (this.config.sttProvider === 'azure') {
|
|
145
|
-
return !!(this.config.azureSpeechKey || process.env.AZURE_SPEECH_KEY);
|
|
146
|
-
}
|
|
147
|
-
if (this.config.sttProvider === 'volcano') {
|
|
148
|
-
return !!(this.config.volcanoAppId || process.env.VOLC_APP_ID);
|
|
149
|
-
}
|
|
150
|
-
if (this.config.sttProvider === 'whisper-local') {
|
|
151
|
-
return this.checkOllamaWhisper();
|
|
152
|
-
}
|
|
153
|
-
return true;
|
|
154
|
-
}
|
|
155
|
-
isTTSAvailable() {
|
|
156
|
-
if (this.config.ttsProvider === 'none')
|
|
157
|
-
return false;
|
|
158
|
-
if (this.config.ttsProvider === 'edge-tts')
|
|
159
|
-
return this.checkEdgeTTS();
|
|
160
|
-
return true;
|
|
161
|
-
}
|
|
162
|
-
// ─── STT Providers ───
|
|
163
|
-
async whisperApiSTT(audioPath) {
|
|
164
|
-
const apiKey = this.config.openaiApiKey || process.env.OPENAI_API_KEY;
|
|
165
|
-
const baseUrl = this.config.openaiBaseUrl || process.env.OPENAI_BASE_URL || 'https://api.openai.com/v1';
|
|
166
|
-
if (!apiKey)
|
|
167
|
-
throw new Error('OpenAI API key required for Whisper STT');
|
|
168
|
-
// Use multipart/form-data with fetch
|
|
169
|
-
const fileBuffer = fs.readFileSync(audioPath);
|
|
170
|
-
const fileName = path.basename(audioPath);
|
|
171
|
-
// Build multipart body manually
|
|
172
|
-
const boundary = '----OPCVoice' + Date.now();
|
|
48
|
+
async transcribe(audio, options) {
|
|
49
|
+
const FormData = (await Promise.resolve(`${'form-data'}`).then(s => __importStar(require(s))).catch(() => null));
|
|
50
|
+
// Build multipart form data manually
|
|
51
|
+
const boundary = '----OPCBoundary' + Date.now().toString(36);
|
|
173
52
|
const parts = [];
|
|
174
|
-
// file
|
|
175
|
-
parts.push(Buffer.from(`--${boundary}\r\nContent-Disposition: form-data; name="file"; filename="
|
|
176
|
-
parts.push(
|
|
53
|
+
// file field
|
|
54
|
+
parts.push(Buffer.from(`--${boundary}\r\nContent-Disposition: form-data; name="file"; filename="audio.wav"\r\nContent-Type: audio/wav\r\n\r\n`));
|
|
55
|
+
parts.push(audio);
|
|
177
56
|
parts.push(Buffer.from('\r\n'));
|
|
178
|
-
// model
|
|
57
|
+
// model field
|
|
179
58
|
parts.push(Buffer.from(`--${boundary}\r\nContent-Disposition: form-data; name="model"\r\n\r\nwhisper-1\r\n`));
|
|
180
|
-
// language
|
|
181
|
-
|
|
59
|
+
// language field
|
|
60
|
+
if (options?.language) {
|
|
61
|
+
parts.push(Buffer.from(`--${boundary}\r\nContent-Disposition: form-data; name="language"\r\n\r\n${options.language}\r\n`));
|
|
62
|
+
}
|
|
182
63
|
parts.push(Buffer.from(`--${boundary}--\r\n`));
|
|
183
64
|
const body = Buffer.concat(parts);
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
'
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
65
|
+
return new Promise((resolve, reject) => {
|
|
66
|
+
const req = https.request({
|
|
67
|
+
hostname: 'api.openai.com',
|
|
68
|
+
path: '/v1/audio/transcriptions',
|
|
69
|
+
method: 'POST',
|
|
70
|
+
headers: {
|
|
71
|
+
'Authorization': `Bearer ${this.apiKey}`,
|
|
72
|
+
'Content-Type': `multipart/form-data; boundary=${boundary}`,
|
|
73
|
+
'Content-Length': body.length,
|
|
74
|
+
},
|
|
75
|
+
}, (res) => {
|
|
76
|
+
const chunks = [];
|
|
77
|
+
res.on('data', (c) => chunks.push(c));
|
|
78
|
+
res.on('end', () => {
|
|
79
|
+
try {
|
|
80
|
+
const data = JSON.parse(Buffer.concat(chunks).toString());
|
|
81
|
+
resolve(data.text ?? '');
|
|
82
|
+
}
|
|
83
|
+
catch (e) {
|
|
84
|
+
reject(new Error('Failed to parse Whisper response'));
|
|
85
|
+
}
|
|
86
|
+
});
|
|
87
|
+
});
|
|
88
|
+
req.on('error', reject);
|
|
89
|
+
req.write(body);
|
|
90
|
+
req.end();
|
|
192
91
|
});
|
|
193
|
-
if (!response.ok) {
|
|
194
|
-
const err = await response.text();
|
|
195
|
-
throw new Error(`Whisper API error (${response.status}): ${err}`);
|
|
196
|
-
}
|
|
197
|
-
const result = await response.json();
|
|
198
|
-
return result.text?.trim() || '';
|
|
199
92
|
}
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
timeout: 30000,
|
|
209
|
-
});
|
|
210
|
-
return result.trim();
|
|
211
|
-
}
|
|
212
|
-
catch {
|
|
213
|
-
throw new Error('Local Whisper not available. Install whisper.cpp or use whisper-api provider.');
|
|
214
|
-
}
|
|
93
|
+
}
|
|
94
|
+
exports.WhisperSTTProvider = WhisperSTTProvider;
|
|
95
|
+
// ── Deepgram STT Provider ───────────────────────────────────
|
|
96
|
+
class DeepgramSTTProvider {
|
|
97
|
+
name = 'deepgram';
|
|
98
|
+
apiKey;
|
|
99
|
+
constructor(apiKey) {
|
|
100
|
+
this.apiKey = apiKey;
|
|
215
101
|
}
|
|
216
|
-
async
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
headers: {
|
|
246
|
-
'Content-Type': 'application/json',
|
|
247
|
-
'Authorization': `Bearer; ${token}`,
|
|
248
|
-
},
|
|
249
|
-
body: JSON.stringify(payload),
|
|
102
|
+
async transcribe(audio, options) {
|
|
103
|
+
const lang = options?.language ?? 'en';
|
|
104
|
+
return new Promise((resolve, reject) => {
|
|
105
|
+
const req = https.request({
|
|
106
|
+
hostname: 'api.deepgram.com',
|
|
107
|
+
path: `/v1/listen?language=${lang}&model=nova-2`,
|
|
108
|
+
method: 'POST',
|
|
109
|
+
headers: {
|
|
110
|
+
'Authorization': `Token ${this.apiKey}`,
|
|
111
|
+
'Content-Type': 'audio/wav',
|
|
112
|
+
'Content-Length': audio.length,
|
|
113
|
+
},
|
|
114
|
+
}, (res) => {
|
|
115
|
+
const chunks = [];
|
|
116
|
+
res.on('data', (c) => chunks.push(c));
|
|
117
|
+
res.on('end', () => {
|
|
118
|
+
try {
|
|
119
|
+
const data = JSON.parse(Buffer.concat(chunks).toString());
|
|
120
|
+
const transcript = data?.results?.channels?.[0]?.alternatives?.[0]?.transcript ?? '';
|
|
121
|
+
resolve(transcript);
|
|
122
|
+
}
|
|
123
|
+
catch {
|
|
124
|
+
reject(new Error('Failed to parse Deepgram response'));
|
|
125
|
+
}
|
|
126
|
+
});
|
|
127
|
+
});
|
|
128
|
+
req.on('error', reject);
|
|
129
|
+
req.write(audio);
|
|
130
|
+
req.end();
|
|
250
131
|
});
|
|
251
|
-
if (!response.ok) {
|
|
252
|
-
throw new Error(`Volcano STT error (${response.status}): ${await response.text()}`);
|
|
253
|
-
}
|
|
254
|
-
const result = await response.json();
|
|
255
|
-
return result?.result?.[0]?.text?.trim() || result?.result || '';
|
|
256
132
|
}
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
const response = await fetch(url, {
|
|
266
|
-
method: 'POST',
|
|
267
|
-
headers: {
|
|
268
|
-
'Ocp-Apim-Subscription-Key': key,
|
|
269
|
-
'Content-Type': 'audio/ogg; codecs=opus',
|
|
270
|
-
'Accept': 'application/json',
|
|
271
|
-
},
|
|
272
|
-
body: audioData,
|
|
273
|
-
});
|
|
274
|
-
if (!response.ok) {
|
|
275
|
-
throw new Error(`Azure STT error (${response.status}): ${await response.text()}`);
|
|
276
|
-
}
|
|
277
|
-
const result = await response.json();
|
|
278
|
-
return result?.DisplayText?.trim() || result?.NBest?.[0]?.Display?.trim() || '';
|
|
133
|
+
}
|
|
134
|
+
exports.DeepgramSTTProvider = DeepgramSTTProvider;
|
|
135
|
+
// ── Edge TTS Provider (free, no API key) ────────────────────
|
|
136
|
+
class EdgeTTSProvider {
|
|
137
|
+
name = 'edge-tts';
|
|
138
|
+
defaultVoice;
|
|
139
|
+
constructor(voice) {
|
|
140
|
+
this.defaultVoice = voice ?? 'en-US-AriaNeural';
|
|
279
141
|
}
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
142
|
+
async synthesize(text, options) {
|
|
143
|
+
const WebSocket = (await Promise.resolve(`${'ws'}`).then(s => __importStar(require(s))).catch(() => null))?.default;
|
|
144
|
+
if (!WebSocket) {
|
|
145
|
+
throw new Error('ws package required for Edge TTS. Install with: npm i ws');
|
|
146
|
+
}
|
|
147
|
+
const voice = options?.voice ?? this.defaultVoice;
|
|
148
|
+
const requestId = [...Array(32)].map(() => Math.random().toString(16)[2]).join('');
|
|
149
|
+
const timestamp = new Date().toISOString();
|
|
150
|
+
const endpoint = `wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=6A5AA1D4EAFF4E9FB37E23D68491D6F4&ConnectionId=${requestId}`;
|
|
285
151
|
return new Promise((resolve, reject) => {
|
|
286
|
-
const
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
152
|
+
const ws = new WebSocket(endpoint, {
|
|
153
|
+
headers: {
|
|
154
|
+
'Origin': 'chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold',
|
|
155
|
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
|
156
|
+
},
|
|
157
|
+
});
|
|
158
|
+
const audioChunks = [];
|
|
159
|
+
let headerSent = false;
|
|
160
|
+
ws.on('open', () => {
|
|
161
|
+
// Send config
|
|
162
|
+
ws.send(`Content-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n{"context":{"synthesis":{"audio":{"metadataoptions":{"sentenceBoundaryEnabled":"false","wordBoundaryEnabled":"false"},"outputFormat":"audio-24khz-48kbitrate-mono-mp3"}}}}`);
|
|
163
|
+
// Send SSML
|
|
164
|
+
const ssml = `<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'><voice name='${voice}'>${escapeXml(text)}</voice></speak>`;
|
|
165
|
+
ws.send(`X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nX-Timestamp:${timestamp}\r\nPath:ssml\r\n\r\n${ssml}`);
|
|
166
|
+
});
|
|
167
|
+
ws.on('message', (data) => {
|
|
168
|
+
if (typeof data === 'string' || (Buffer.isBuffer(data) && data.toString().includes('Path:turn.end'))) {
|
|
169
|
+
if (typeof data === 'string' && data.includes('Path:turn.end')) {
|
|
170
|
+
ws.close();
|
|
171
|
+
resolve(Buffer.concat(audioChunks));
|
|
172
|
+
}
|
|
292
173
|
}
|
|
293
|
-
else {
|
|
294
|
-
|
|
174
|
+
else if (Buffer.isBuffer(data)) {
|
|
175
|
+
// Binary message — extract audio after header
|
|
176
|
+
const headerEnd = data.indexOf(Buffer.from('\r\n\r\n'));
|
|
177
|
+
if (headerEnd !== -1) {
|
|
178
|
+
audioChunks.push(data.slice(headerEnd + 4));
|
|
179
|
+
}
|
|
295
180
|
}
|
|
296
181
|
});
|
|
182
|
+
ws.on('error', (err) => {
|
|
183
|
+
reject(new Error(`Edge TTS WebSocket error: ${err.message}`));
|
|
184
|
+
});
|
|
185
|
+
ws.on('close', () => {
|
|
186
|
+
if (audioChunks.length > 0) {
|
|
187
|
+
resolve(Buffer.concat(audioChunks));
|
|
188
|
+
}
|
|
189
|
+
});
|
|
190
|
+
// Timeout
|
|
191
|
+
setTimeout(() => {
|
|
192
|
+
ws.close();
|
|
193
|
+
if (audioChunks.length > 0) {
|
|
194
|
+
resolve(Buffer.concat(audioChunks));
|
|
195
|
+
}
|
|
196
|
+
else {
|
|
197
|
+
reject(new Error('Edge TTS timeout'));
|
|
198
|
+
}
|
|
199
|
+
}, 30000);
|
|
297
200
|
});
|
|
298
201
|
}
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
202
|
+
}
|
|
203
|
+
exports.EdgeTTSProvider = EdgeTTSProvider;
|
|
204
|
+
function escapeXml(text) {
|
|
205
|
+
return text.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;').replace(/"/g, '&quot;').replace(/'/g, '&apos;');
|
|
206
|
+
}
|
|
207
|
+
// ── OpenAI TTS Provider ─────────────────────────────────────
|
|
208
|
+
class OpenAITTSProvider {
|
|
209
|
+
name = 'openai-tts';
|
|
210
|
+
apiKey;
|
|
211
|
+
defaultVoice;
|
|
212
|
+
constructor(apiKey, voice) {
|
|
213
|
+
this.apiKey = apiKey;
|
|
214
|
+
this.defaultVoice = voice ?? 'alloy';
|
|
215
|
+
}
|
|
216
|
+
async synthesize(text, options) {
|
|
217
|
+
const voice = options?.voice ?? this.defaultVoice;
|
|
218
|
+
const body = JSON.stringify({
|
|
219
|
+
model: 'tts-1',
|
|
220
|
+
input: text,
|
|
221
|
+
voice,
|
|
222
|
+
speed: options?.speed ?? 1.0,
|
|
223
|
+
});
|
|
224
|
+
return new Promise((resolve, reject) => {
|
|
225
|
+
const req = https.request({
|
|
226
|
+
hostname: 'api.openai.com',
|
|
227
|
+
path: '/v1/audio/speech',
|
|
228
|
+
method: 'POST',
|
|
229
|
+
headers: {
|
|
230
|
+
'Authorization': `Bearer ${this.apiKey}`,
|
|
231
|
+
'Content-Type': 'application/json',
|
|
232
|
+
'Content-Length': Buffer.byteLength(body),
|
|
233
|
+
},
|
|
234
|
+
}, (res) => {
|
|
235
|
+
const chunks = [];
|
|
236
|
+
res.on('data', (c) => chunks.push(c));
|
|
237
|
+
res.on('end', () => resolve(Buffer.concat(chunks)));
|
|
238
|
+
});
|
|
239
|
+
req.on('error', reject);
|
|
240
|
+
req.write(body);
|
|
241
|
+
req.end();
|
|
316
242
|
});
|
|
317
|
-
if (!response.ok)
|
|
318
|
-
throw new Error(`OpenAI TTS error: ${response.status}`);
|
|
319
|
-
const buffer = Buffer.from(await response.arrayBuffer());
|
|
320
|
-
fs.writeFileSync(outPath, buffer);
|
|
321
|
-
return outPath;
|
|
322
243
|
}
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
'
|
|
343
|
-
|
|
344
|
-
|
|
244
|
+
}
|
|
245
|
+
exports.OpenAITTSProvider = OpenAITTSProvider;
|
|
246
|
+
// ── ElevenLabs TTS Provider ─────────────────────────────────
|
|
247
|
+
class ElevenLabsTTSProvider {
|
|
248
|
+
name = 'elevenlabs';
|
|
249
|
+
apiKey;
|
|
250
|
+
defaultVoice;
|
|
251
|
+
constructor(apiKey, voice) {
|
|
252
|
+
this.apiKey = apiKey;
|
|
253
|
+
this.defaultVoice = voice ?? '21m00Tcm4TlvDq8ikWAM'; // Rachel
|
|
254
|
+
}
|
|
255
|
+
async synthesize(text, options) {
|
|
256
|
+
const voiceId = options?.voice ?? this.defaultVoice;
|
|
257
|
+
const body = JSON.stringify({
|
|
258
|
+
text,
|
|
259
|
+
model_id: 'eleven_monolingual_v1',
|
|
260
|
+
});
|
|
261
|
+
return new Promise((resolve, reject) => {
|
|
262
|
+
const req = https.request({
|
|
263
|
+
hostname: 'api.elevenlabs.io',
|
|
264
|
+
path: `/v1/text-to-speech/${voiceId}`,
|
|
265
|
+
method: 'POST',
|
|
266
|
+
headers: {
|
|
267
|
+
'xi-api-key': this.apiKey,
|
|
268
|
+
'Content-Type': 'application/json',
|
|
269
|
+
'Content-Length': Buffer.byteLength(body),
|
|
270
|
+
},
|
|
271
|
+
}, (res) => {
|
|
272
|
+
const chunks = [];
|
|
273
|
+
res.on('data', (c) => chunks.push(c));
|
|
274
|
+
res.on('end', () => resolve(Buffer.concat(chunks)));
|
|
275
|
+
});
|
|
276
|
+
req.on('error', reject);
|
|
277
|
+
req.write(body);
|
|
278
|
+
req.end();
|
|
345
279
|
});
|
|
346
|
-
if (!response.ok)
|
|
347
|
-
throw new Error(`Volcano TTS error: ${response.status}`);
|
|
348
|
-
const result = await response.json();
|
|
349
|
-
if (result?.data) {
|
|
350
|
-
const audioBuffer = Buffer.from(result.data, 'base64');
|
|
351
|
-
fs.writeFileSync(outPath, audioBuffer);
|
|
352
|
-
return outPath;
|
|
353
|
-
}
|
|
354
|
-
throw new Error('Volcano TTS returned no audio data');
|
|
355
280
|
}
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
281
|
+
}
|
|
282
|
+
exports.ElevenLabsTTSProvider = ElevenLabsTTSProvider;
|
|
283
|
+
// ── Voice Config Factory ────────────────────────────────────
|
|
284
|
+
function createVoiceProviders(config) {
|
|
285
|
+
let stt;
|
|
286
|
+
let tts;
|
|
287
|
+
switch (config.sttProvider) {
|
|
288
|
+
case 'whisper':
|
|
289
|
+
if (config.sttApiKey)
|
|
290
|
+
stt = new WhisperSTTProvider(config.sttApiKey);
|
|
291
|
+
break;
|
|
292
|
+
case 'deepgram':
|
|
293
|
+
if (config.sttApiKey)
|
|
294
|
+
stt = new DeepgramSTTProvider(config.sttApiKey);
|
|
295
|
+
break;
|
|
296
|
+
case 'web-speech':
|
|
297
|
+
// Browser only — not available in Node.js
|
|
298
|
+
break;
|
|
299
|
+
}
|
|
300
|
+
switch (config.ttsProvider) {
|
|
301
|
+
case 'edge-tts':
|
|
302
|
+
tts = new EdgeTTSProvider(config.voice);
|
|
303
|
+
break;
|
|
304
|
+
case 'openai-tts':
|
|
305
|
+
if (config.ttsApiKey)
|
|
306
|
+
tts = new OpenAITTSProvider(config.ttsApiKey, config.voice);
|
|
307
|
+
break;
|
|
308
|
+
case 'elevenlabs':
|
|
309
|
+
if (config.ttsApiKey)
|
|
310
|
+
tts = new ElevenLabsTTSProvider(config.ttsApiKey, config.voice);
|
|
311
|
+
break;
|
|
312
|
+
}
|
|
313
|
+
return { stt, tts };
|
|
314
|
+
}
|
|
315
|
+
// ── Voice Channel ───────────────────────────────────────────
|
|
316
|
+
class VoiceChannel extends index_1.BaseChannel {
|
|
317
|
+
type = 'voice';
|
|
318
|
+
config;
|
|
319
|
+
logger = new logger_1.Logger('voice-channel');
|
|
320
|
+
running = false;
|
|
321
|
+
conversationActive = false;
|
|
322
|
+
constructor(config) {
|
|
323
|
+
super();
|
|
324
|
+
this.config = config ?? {};
|
|
325
|
+
}
|
|
326
|
+
async start() {
|
|
327
|
+
this.running = true;
|
|
328
|
+
this.logger.info('Voice channel started', {
|
|
329
|
+
stt: this.config.sttProvider?.name ?? 'none',
|
|
330
|
+
tts: this.config.ttsProvider?.name ?? 'none',
|
|
372
331
|
});
|
|
373
|
-
if (!response.ok)
|
|
374
|
-
throw new Error(`Azure TTS error: ${response.status}`);
|
|
375
|
-
const buffer = Buffer.from(await response.arrayBuffer());
|
|
376
|
-
fs.writeFileSync(outPath, buffer);
|
|
377
|
-
return outPath;
|
|
378
332
|
}
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
333
|
+
async stop() {
|
|
334
|
+
this.running = false;
|
|
335
|
+
this.conversationActive = false;
|
|
336
|
+
this.logger.info('Voice channel stopped');
|
|
337
|
+
}
|
|
338
|
+
isRunning() {
|
|
339
|
+
return this.running;
|
|
340
|
+
}
|
|
341
|
+
/** Transcribe audio to text */
|
|
342
|
+
async transcribe(audio, format) {
|
|
343
|
+
if (!this.config.sttProvider) {
|
|
344
|
+
throw new Error('No STT provider configured');
|
|
387
345
|
}
|
|
346
|
+
return this.config.sttProvider.transcribe(audio, { language: this.config.language });
|
|
388
347
|
}
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
348
|
+
/** Synthesize text to audio */
|
|
349
|
+
async synthesize(text, voice) {
|
|
350
|
+
if (!this.config.ttsProvider) {
|
|
351
|
+
throw new Error('No TTS provider configured');
|
|
393
352
|
}
|
|
394
|
-
|
|
395
|
-
|
|
353
|
+
return this.config.ttsProvider.synthesize(text, { voice });
|
|
354
|
+
}
|
|
355
|
+
/** Start real-time conversation mode */
|
|
356
|
+
async startConversation(onMessage) {
|
|
357
|
+
if (!this.running)
|
|
358
|
+
await this.start();
|
|
359
|
+
this.conversationActive = true;
|
|
360
|
+
this.logger.info('Conversation mode started');
|
|
361
|
+
// In a real implementation, this would set up a microphone stream.
|
|
362
|
+
// For now, expose the conversation loop for programmatic use.
|
|
363
|
+
}
|
|
364
|
+
/** Process a single turn in conversation mode */
|
|
365
|
+
async processConversationTurn(audio, onMessage) {
|
|
366
|
+
const text = await this.transcribe(audio);
|
|
367
|
+
const response = await onMessage(text);
|
|
368
|
+
let audioResponse;
|
|
369
|
+
if (this.config.ttsProvider) {
|
|
370
|
+
audioResponse = await this.synthesize(response);
|
|
396
371
|
}
|
|
372
|
+
return { text, response, audioResponse };
|
|
397
373
|
}
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
return new Promise((resolve, reject) => {
|
|
401
|
-
const client = url.startsWith('https') ? https : http;
|
|
402
|
-
const req = client.get(url, (res) => {
|
|
403
|
-
if (res.statusCode === 301 || res.statusCode === 302) {
|
|
404
|
-
// Follow redirect
|
|
405
|
-
this.downloadFile(res.headers.location, destPath).then(resolve).catch(reject);
|
|
406
|
-
return;
|
|
407
|
-
}
|
|
408
|
-
const ws = fs.createWriteStream(destPath);
|
|
409
|
-
res.pipe(ws);
|
|
410
|
-
ws.on('finish', () => { ws.close(); resolve(); });
|
|
411
|
-
ws.on('error', reject);
|
|
412
|
-
});
|
|
413
|
-
req.on('error', reject);
|
|
414
|
-
req.setTimeout(30000, () => { req.destroy(); reject(new Error('Download timeout')); });
|
|
415
|
-
});
|
|
374
|
+
stopConversation() {
|
|
375
|
+
this.conversationActive = false;
|
|
416
376
|
}
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
377
|
+
isConversationActive() {
|
|
378
|
+
return this.conversationActive;
|
|
379
|
+
}
|
|
380
|
+
/** Process audio input: STT → Agent → TTS */
|
|
381
|
+
async processAudio(audio) {
|
|
382
|
+
if (!this.handler)
|
|
383
|
+
throw new Error('No message handler set');
|
|
384
|
+
// STT
|
|
385
|
+
let text;
|
|
386
|
+
if (this.config.sttProvider) {
|
|
387
|
+
text = await this.config.sttProvider.transcribe(audio, { language: this.config.language });
|
|
388
|
+
}
|
|
389
|
+
else {
|
|
390
|
+
text = audio.toString('utf-8'); // Fallback: treat as text
|
|
431
391
|
}
|
|
432
|
-
|
|
392
|
+
this.logger.debug('STT result', { text });
|
|
393
|
+
// Create message and send to agent
|
|
394
|
+
const message = {
|
|
395
|
+
id: `voice_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`,
|
|
396
|
+
role: 'user',
|
|
397
|
+
content: text,
|
|
398
|
+
timestamp: Date.now(),
|
|
399
|
+
metadata: { channel: 'voice' },
|
|
400
|
+
};
|
|
401
|
+
const response = await this.handler(message);
|
|
402
|
+
// TTS
|
|
403
|
+
let audioResponse;
|
|
404
|
+
if (this.config.ttsProvider) {
|
|
405
|
+
audioResponse = await this.config.ttsProvider.synthesize(response.content, { language: this.config.language });
|
|
406
|
+
}
|
|
407
|
+
return { text, response: response.content, audioResponse };
|
|
408
|
+
}
|
|
409
|
+
setSTTProvider(provider) {
|
|
410
|
+
this.config.sttProvider = provider;
|
|
411
|
+
}
|
|
412
|
+
setTTSProvider(provider) {
|
|
413
|
+
this.config.ttsProvider = provider;
|
|
433
414
|
}
|
|
434
415
|
}
|
|
435
|
-
exports.
|
|
436
|
-
function createVoiceProcessor(config) {
|
|
437
|
-
return new VoiceProcessor(config);
|
|
438
|
-
}
|
|
416
|
+
exports.VoiceChannel = VoiceChannel;
|
|
439
417
|
//# sourceMappingURL=voice.js.map
|