neoagent 2.2.0 → 2.2.1-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/server/db/database.js +35 -0
- package/server/http/routes.js +1 -0
- package/server/public/assets/fonts/MaterialIcons-Regular.otf +0 -0
- package/server/public/flutter_bootstrap.js +1 -1
- package/server/public/main.dart.js +71727 -70915
- package/server/routes/widgets.js +101 -0
- package/server/services/ai/engine.js +7 -2
- package/server/services/ai/toolResult.js +25 -0
- package/server/services/ai/tools.js +182 -0
- package/server/services/manager.js +31 -0
- package/server/services/scheduler/cron.js +85 -32
- package/server/services/scheduler/cron_utils.js +216 -0
- package/server/services/voice/bufferedLiveRelayAdapter.js +85 -17
- package/server/services/voice/liveSession.js +109 -9
- package/server/services/voice/providers.js +44 -18
- package/server/services/voice/runtimeManager.js +75 -25
- package/server/services/voice/turnRunner.js +53 -25
- package/server/services/websocket.js +26 -1
- package/server/services/widgets/service.js +550 -0
|
@@ -116,6 +116,9 @@ class VoiceRuntimeManager {
|
|
|
116
116
|
async closeSession(sessionId, reason = 'closed') {
|
|
117
117
|
const session = this.getSession(sessionId);
|
|
118
118
|
if (!session) return;
|
|
119
|
+
if (reason === 'socket_disconnected') {
|
|
120
|
+
await this.abortActiveRun(session.id, 'voice_disconnect');
|
|
121
|
+
}
|
|
119
122
|
this.sessions.delete(session.id);
|
|
120
123
|
await session.adapter?.close?.(session.id);
|
|
121
124
|
await session.close(reason);
|
|
@@ -128,13 +131,14 @@ class VoiceRuntimeManager {
|
|
|
128
131
|
session.resetTurnState();
|
|
129
132
|
await session.adapter.onInputStart(session, {
|
|
130
133
|
mimeType: options.mimeType,
|
|
134
|
+
turnId: options.turnId,
|
|
131
135
|
});
|
|
132
136
|
await session.setState('listening');
|
|
133
137
|
}
|
|
134
138
|
|
|
135
139
|
async appendInputAudio(sessionId, audioBytes, options = {}) {
|
|
136
140
|
const session = this.#requireSession(sessionId);
|
|
137
|
-
|
|
141
|
+
return session.adapter.appendAudioChunk(session, audioBytes, options);
|
|
138
142
|
}
|
|
139
143
|
|
|
140
144
|
async commitInput(sessionId, options = {}) {
|
|
@@ -143,7 +147,10 @@ class VoiceRuntimeManager {
|
|
|
143
147
|
return { transcript: '' };
|
|
144
148
|
}
|
|
145
149
|
await session.setState('transcribing');
|
|
146
|
-
const transcript = await session.adapter.commitInput(session
|
|
150
|
+
const transcript = await session.adapter.commitInput(session, {
|
|
151
|
+
turnId: options.turnId,
|
|
152
|
+
finalSequence: options.finalSequence,
|
|
153
|
+
});
|
|
147
154
|
if (!transcript) {
|
|
148
155
|
await session.setState('idle');
|
|
149
156
|
return { transcript: '' };
|
|
@@ -280,7 +287,9 @@ class VoiceRuntimeManager {
|
|
|
280
287
|
kind,
|
|
281
288
|
});
|
|
282
289
|
|
|
283
|
-
|
|
290
|
+
if (kind === 'final') {
|
|
291
|
+
await session.setState('speaking', { kind });
|
|
292
|
+
}
|
|
284
293
|
|
|
285
294
|
const voiceOptions = normalizeVoiceSynthesisOptions({
|
|
286
295
|
provider: session.voiceSettings?.liveProvider,
|
|
@@ -290,39 +299,80 @@ class VoiceRuntimeManager {
|
|
|
290
299
|
|
|
291
300
|
let index = 0;
|
|
292
301
|
let streamError = null;
|
|
302
|
+
const ttsAttempts = this.#buildTtsAttemptOrder(session, voiceOptions);
|
|
293
303
|
try {
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
304
|
+
for (const attempt of ttsAttempts) {
|
|
305
|
+
index = 0;
|
|
306
|
+
streamError = null;
|
|
307
|
+
try {
|
|
308
|
+
await synthesizeVoiceReplyStream(
|
|
309
|
+
content,
|
|
310
|
+
attempt,
|
|
311
|
+
async ({ audioBytes, mimeType }) => {
|
|
312
|
+
if (session.closed || session.interrupted) return;
|
|
313
|
+
socket.emit('voice:audio_chunk', {
|
|
314
|
+
sessionId,
|
|
315
|
+
kind,
|
|
316
|
+
index,
|
|
317
|
+
audioBase64: audioBytes.toString('base64'),
|
|
318
|
+
mimeType,
|
|
319
|
+
});
|
|
320
|
+
index += 1;
|
|
321
|
+
},
|
|
322
|
+
);
|
|
323
|
+
streamError = null;
|
|
324
|
+
break;
|
|
325
|
+
} catch (error) {
|
|
326
|
+
streamError = String(error?.message || error || 'Voice playback failed.');
|
|
327
|
+
}
|
|
328
|
+
}
|
|
313
329
|
} catch (error) {
|
|
314
330
|
streamError = String(error?.message || error || 'Voice playback failed.');
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
if (!streamError && !session.closed && !session.interrupted) {
|
|
334
|
+
socket.emit('voice:audio_done', { sessionId, kind, totalChunks: index });
|
|
335
|
+
} else if (kind === 'final' && !session.closed && !session.interrupted) {
|
|
315
336
|
socket.emit('voice:error', {
|
|
316
337
|
sessionId,
|
|
317
338
|
error: streamError,
|
|
339
|
+
recoverable: true,
|
|
340
|
+
phase: 'tts',
|
|
318
341
|
});
|
|
342
|
+
await session.setState('degraded', { kind, phase: 'tts' });
|
|
319
343
|
}
|
|
320
344
|
|
|
321
|
-
if (
|
|
322
|
-
|
|
345
|
+
if (kind === 'final' && !streamError) {
|
|
346
|
+
await session.setState('idle');
|
|
323
347
|
}
|
|
348
|
+
}
|
|
324
349
|
|
|
325
|
-
|
|
350
|
+
#buildTtsAttemptOrder(session, voiceOptions) {
|
|
351
|
+
const attempts = [];
|
|
352
|
+
const providers = [
|
|
353
|
+
voiceOptions.provider,
|
|
354
|
+
...['openai', 'deepgram', 'gemini'].filter((provider) => provider !== voiceOptions.provider),
|
|
355
|
+
];
|
|
356
|
+
for (const provider of providers) {
|
|
357
|
+
const normalized = normalizeVoiceSynthesisOptions({
|
|
358
|
+
provider,
|
|
359
|
+
model: provider === voiceOptions.provider ? voiceOptions.model : null,
|
|
360
|
+
voice: provider === voiceOptions.provider ? voiceOptions.voice : null,
|
|
361
|
+
});
|
|
362
|
+
const runtime = provider === voiceOptions.provider
|
|
363
|
+
? {
|
|
364
|
+
apiKey: session.voiceSettings?.liveApiKey,
|
|
365
|
+
baseUrl: session.voiceSettings?.liveBaseUrl,
|
|
366
|
+
}
|
|
367
|
+
: this.#getProviderRuntime(session.userId, provider, session.agentId);
|
|
368
|
+
attempts.push({
|
|
369
|
+
...normalized,
|
|
370
|
+
apiKey: runtime.apiKey,
|
|
371
|
+
baseUrl: runtime.baseUrl,
|
|
372
|
+
timeoutMs: 12000,
|
|
373
|
+
});
|
|
374
|
+
}
|
|
375
|
+
return attempts;
|
|
326
376
|
}
|
|
327
377
|
}
|
|
328
378
|
|
|
@@ -42,21 +42,6 @@ async function runVoiceTranscriptTurn({
|
|
|
42
42
|
model: ttsModel,
|
|
43
43
|
voice: ttsVoice,
|
|
44
44
|
});
|
|
45
|
-
const ttsProviderId = voiceOptions.provider === 'gemini'
|
|
46
|
-
? 'google'
|
|
47
|
-
: voiceOptions.provider;
|
|
48
|
-
let ttsRuntime = { apiKey: '', baseUrl: '' };
|
|
49
|
-
if (ttsProviderId !== 'deepgram') {
|
|
50
|
-
try {
|
|
51
|
-
const runtime = getProviderRuntimeConfig(userId, ttsProviderId, agentId);
|
|
52
|
-
ttsRuntime = {
|
|
53
|
-
apiKey: typeof runtime.apiKey === 'string' ? runtime.apiKey.trim() : '',
|
|
54
|
-
baseUrl: typeof runtime.baseUrl === 'string' ? runtime.baseUrl.trim() : '',
|
|
55
|
-
};
|
|
56
|
-
} catch {
|
|
57
|
-
ttsRuntime = { apiKey: '', baseUrl: '' };
|
|
58
|
-
}
|
|
59
|
-
}
|
|
60
45
|
|
|
61
46
|
const storedUserContent = transcriptText;
|
|
62
47
|
const normalizedMetadata = metadata && typeof metadata === 'object' ? metadata : {};
|
|
@@ -143,15 +128,40 @@ async function runVoiceTranscriptTurn({
|
|
|
143
128
|
|
|
144
129
|
let synthesized;
|
|
145
130
|
let ttsError = null;
|
|
131
|
+
let providerUsed = voiceOptions.provider;
|
|
132
|
+
let modelUsed = voiceOptions.model;
|
|
133
|
+
let voiceUsed = voiceOptions.voice;
|
|
146
134
|
if (synthesize !== false) {
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
135
|
+
const attemptProviders = [
|
|
136
|
+
voiceOptions.provider,
|
|
137
|
+
...['openai', 'deepgram', 'gemini'].filter((provider) => provider !== voiceOptions.provider),
|
|
138
|
+
];
|
|
139
|
+
let lastTtsError = null;
|
|
140
|
+
for (const provider of attemptProviders) {
|
|
141
|
+
const normalized = normalizeVoiceSynthesisOptions({
|
|
142
|
+
provider,
|
|
143
|
+
model: provider === voiceOptions.provider ? voiceOptions.model : null,
|
|
144
|
+
voice: provider === voiceOptions.provider ? voiceOptions.voice : null,
|
|
152
145
|
});
|
|
153
|
-
|
|
154
|
-
|
|
146
|
+
const runtime = resolveProviderRuntime(userId, agentId, provider);
|
|
147
|
+
try {
|
|
148
|
+
synthesized = await synthesizeVoiceReply(replyText, {
|
|
149
|
+
...normalized,
|
|
150
|
+
apiKey: runtime.apiKey,
|
|
151
|
+
baseUrl: runtime.baseUrl,
|
|
152
|
+
timeoutMs: 12000,
|
|
153
|
+
});
|
|
154
|
+
providerUsed = normalized.provider;
|
|
155
|
+
modelUsed = normalized.model;
|
|
156
|
+
voiceUsed = normalized.voice;
|
|
157
|
+
ttsError = null;
|
|
158
|
+
break;
|
|
159
|
+
} catch (error) {
|
|
160
|
+
lastTtsError = error;
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
if (!synthesized) {
|
|
164
|
+
ttsError = String(lastTtsError?.message || lastTtsError || 'Speech synthesis failed.');
|
|
155
165
|
synthesized = {
|
|
156
166
|
mimeType: 'audio/mpeg',
|
|
157
167
|
audioBytes: Buffer.alloc(0),
|
|
@@ -168,15 +178,33 @@ async function runVoiceTranscriptTurn({
|
|
|
168
178
|
runId: runResult?.runId || null,
|
|
169
179
|
transcript: transcriptText,
|
|
170
180
|
replyText,
|
|
171
|
-
ttsProvider:
|
|
172
|
-
ttsModel:
|
|
173
|
-
ttsVoice:
|
|
181
|
+
ttsProvider: providerUsed,
|
|
182
|
+
ttsModel: modelUsed,
|
|
183
|
+
ttsVoice: voiceUsed,
|
|
174
184
|
audioMimeType: synthesized.mimeType,
|
|
175
185
|
audioBase64: synthesized.audioBytes.toString('base64'),
|
|
176
186
|
ttsError,
|
|
177
187
|
};
|
|
178
188
|
}
|
|
179
189
|
|
|
190
|
+
/**
 * Resolve the API key and base URL to use for a TTS provider, best-effort.
 *
 * The provider name is trimmed and lower-cased, and `gemini` is looked up
 * under the id `google`. An empty provider and `deepgram` never trigger a
 * lookup. Any error raised by the lookup is swallowed and empty credentials
 * are returned instead.
 *
 * @param {*} userId - Passed through to `getProviderRuntimeConfig`.
 * @param {*} agentId - Passed through to `getProviderRuntimeConfig`.
 * @param {*} provider - Provider name; non-string values are coerced with `String()`.
 * @returns {{apiKey: string, baseUrl: string}} Trimmed credentials, or empty strings.
 */
function resolveProviderRuntime(userId, agentId, provider) {
  const emptyRuntime = () => ({ apiKey: '', baseUrl: '' });
  const trimmedOrEmpty = (value) => (typeof value === 'string' ? value.trim() : '');

  const requested = String(provider || '').trim().toLowerCase();
  const providerId = requested === 'gemini' ? 'google' : requested;

  if (providerId === '' || providerId === 'deepgram') {
    return emptyRuntime();
  }

  try {
    const config = getProviderRuntimeConfig(userId, providerId, agentId);
    return {
      apiKey: trimmedOrEmpty(config.apiKey),
      baseUrl: trimmedOrEmpty(config.baseUrl),
    };
  } catch {
    // Deliberate best-effort: a failed lookup yields empty credentials.
    return emptyRuntime();
  }
}
|
|
207
|
+
|
|
180
208
|
module.exports = {
|
|
181
209
|
runVoiceTranscriptTurn,
|
|
182
210
|
};
|
|
@@ -516,6 +516,7 @@ function setupWebSocket(io, services) {
|
|
|
516
516
|
}
|
|
517
517
|
await voiceRuntimeManager.beginInput(sessionId, {
|
|
518
518
|
mimeType: toOptionalString(data?.mimeType, 128),
|
|
519
|
+
turnId: toOptionalString(data?.turnId, 128),
|
|
519
520
|
});
|
|
520
521
|
} catch (err) {
|
|
521
522
|
console.error(`[WS] voice:input_start failed for user ${userId}:`, err);
|
|
@@ -554,8 +555,30 @@ function setupWebSocket(io, services) {
|
|
|
554
555
|
error: `audio chunk is too large (max ${MAX_VOICE_AUDIO_CHUNK_BYTES} bytes)`,
|
|
555
556
|
});
|
|
556
557
|
}
|
|
557
|
-
|
|
558
|
+
const turnId = toOptionalString(data?.turnId, 128);
|
|
559
|
+
const sequence = toBoundedInt(data?.sequence, -1, -1, 1_000_000);
|
|
560
|
+
if (!turnId) {
|
|
561
|
+
return socket.emit('voice:error', {
|
|
562
|
+
sessionId,
|
|
563
|
+
error: 'turnId is required',
|
|
564
|
+
});
|
|
565
|
+
}
|
|
566
|
+
if (sequence < 0) {
|
|
567
|
+
return socket.emit('voice:error', {
|
|
568
|
+
sessionId,
|
|
569
|
+
error: 'sequence is required',
|
|
570
|
+
});
|
|
571
|
+
}
|
|
572
|
+
const appendResult = await voiceRuntimeManager.appendInputAudio(sessionId, audioBytes, {
|
|
558
573
|
mimeType: toOptionalString(data?.mimeType, 128),
|
|
574
|
+
turnId,
|
|
575
|
+
sequence,
|
|
576
|
+
});
|
|
577
|
+
socket.emit('voice:chunk_ack', {
|
|
578
|
+
sessionId,
|
|
579
|
+
turnId,
|
|
580
|
+
sequence,
|
|
581
|
+
receivedThrough: appendResult?.receivedThrough ?? sequence,
|
|
559
582
|
});
|
|
560
583
|
} catch (err) {
|
|
561
584
|
console.error(`[WS] voice:audio_chunk failed for user ${userId}:`, err);
|
|
@@ -618,6 +641,8 @@ function setupWebSocket(io, services) {
|
|
|
618
641
|
}
|
|
619
642
|
|
|
620
643
|
await voiceRuntimeManager.commitInput(sessionId, {
|
|
644
|
+
turnId: toOptionalString(data?.turnId, 128),
|
|
645
|
+
finalSequence: toBoundedInt(data?.finalSequence, -1, -1, 1_000_000),
|
|
621
646
|
promptHint: toOptionalString(data?.promptHint, 2000),
|
|
622
647
|
metadata,
|
|
623
648
|
});
|