neoagent 2.2.0 → 2.2.1-beta.0

This diff shows the differences between publicly available versions of a package that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -116,6 +116,9 @@ class VoiceRuntimeManager {
116
116
  async closeSession(sessionId, reason = 'closed') {
117
117
  const session = this.getSession(sessionId);
118
118
  if (!session) return;
119
+ if (reason === 'socket_disconnected') {
120
+ await this.abortActiveRun(session.id, 'voice_disconnect');
121
+ }
119
122
  this.sessions.delete(session.id);
120
123
  await session.adapter?.close?.(session.id);
121
124
  await session.close(reason);
@@ -128,13 +131,14 @@ class VoiceRuntimeManager {
128
131
  session.resetTurnState();
129
132
  await session.adapter.onInputStart(session, {
130
133
  mimeType: options.mimeType,
134
+ turnId: options.turnId,
131
135
  });
132
136
  await session.setState('listening');
133
137
  }
134
138
 
135
139
  async appendInputAudio(sessionId, audioBytes, options = {}) {
136
140
  const session = this.#requireSession(sessionId);
137
- await session.adapter.appendAudioChunk(session, audioBytes, options);
141
+ return session.adapter.appendAudioChunk(session, audioBytes, options);
138
142
  }
139
143
 
140
144
  async commitInput(sessionId, options = {}) {
@@ -143,7 +147,10 @@ class VoiceRuntimeManager {
143
147
  return { transcript: '' };
144
148
  }
145
149
  await session.setState('transcribing');
146
- const transcript = await session.adapter.commitInput(session);
150
+ const transcript = await session.adapter.commitInput(session, {
151
+ turnId: options.turnId,
152
+ finalSequence: options.finalSequence,
153
+ });
147
154
  if (!transcript) {
148
155
  await session.setState('idle');
149
156
  return { transcript: '' };
@@ -280,7 +287,9 @@ class VoiceRuntimeManager {
280
287
  kind,
281
288
  });
282
289
 
283
- await session.setState('speaking', { kind });
290
+ if (kind === 'final') {
291
+ await session.setState('speaking', { kind });
292
+ }
284
293
 
285
294
  const voiceOptions = normalizeVoiceSynthesisOptions({
286
295
  provider: session.voiceSettings?.liveProvider,
@@ -290,39 +299,80 @@ class VoiceRuntimeManager {
290
299
 
291
300
  let index = 0;
292
301
  let streamError = null;
302
+ const ttsAttempts = this.#buildTtsAttemptOrder(session, voiceOptions);
293
303
  try {
294
- await synthesizeVoiceReplyStream(
295
- content,
296
- {
297
- ...voiceOptions,
298
- apiKey: session.voiceSettings?.liveApiKey,
299
- baseUrl: session.voiceSettings?.liveBaseUrl,
300
- },
301
- async ({ audioBytes, mimeType }) => {
302
- if (session.closed || session.interrupted) return;
303
- socket.emit('voice:audio_chunk', {
304
- sessionId,
305
- kind,
306
- index,
307
- audioBase64: audioBytes.toString('base64'),
308
- mimeType,
309
- });
310
- index += 1;
311
- },
312
- );
304
+ for (const attempt of ttsAttempts) {
305
+ index = 0;
306
+ streamError = null;
307
+ try {
308
+ await synthesizeVoiceReplyStream(
309
+ content,
310
+ attempt,
311
+ async ({ audioBytes, mimeType }) => {
312
+ if (session.closed || session.interrupted) return;
313
+ socket.emit('voice:audio_chunk', {
314
+ sessionId,
315
+ kind,
316
+ index,
317
+ audioBase64: audioBytes.toString('base64'),
318
+ mimeType,
319
+ });
320
+ index += 1;
321
+ },
322
+ );
323
+ streamError = null;
324
+ break;
325
+ } catch (error) {
326
+ streamError = String(error?.message || error || 'Voice playback failed.');
327
+ }
328
+ }
313
329
  } catch (error) {
314
330
  streamError = String(error?.message || error || 'Voice playback failed.');
331
+ }
332
+
333
+ if (!streamError && !session.closed && !session.interrupted) {
334
+ socket.emit('voice:audio_done', { sessionId, kind, totalChunks: index });
335
+ } else if (kind === 'final' && !session.closed && !session.interrupted) {
315
336
  socket.emit('voice:error', {
316
337
  sessionId,
317
338
  error: streamError,
339
+ recoverable: true,
340
+ phase: 'tts',
318
341
  });
342
+ await session.setState('degraded', { kind, phase: 'tts' });
319
343
  }
320
344
 
321
- if (!streamError && !session.closed && !session.interrupted) {
322
- socket.emit('voice:audio_done', { sessionId, kind, totalChunks: index });
345
+ if (kind === 'final' && !streamError) {
346
+ await session.setState('idle');
323
347
  }
348
+ }
324
349
 
325
- await session.setState('idle');
350
+ #buildTtsAttemptOrder(session, voiceOptions) {
351
+ const attempts = [];
352
+ const providers = [
353
+ voiceOptions.provider,
354
+ ...['openai', 'deepgram', 'gemini'].filter((provider) => provider !== voiceOptions.provider),
355
+ ];
356
+ for (const provider of providers) {
357
+ const normalized = normalizeVoiceSynthesisOptions({
358
+ provider,
359
+ model: provider === voiceOptions.provider ? voiceOptions.model : null,
360
+ voice: provider === voiceOptions.provider ? voiceOptions.voice : null,
361
+ });
362
+ const runtime = provider === voiceOptions.provider
363
+ ? {
364
+ apiKey: session.voiceSettings?.liveApiKey,
365
+ baseUrl: session.voiceSettings?.liveBaseUrl,
366
+ }
367
+ : this.#getProviderRuntime(session.userId, provider, session.agentId);
368
+ attempts.push({
369
+ ...normalized,
370
+ apiKey: runtime.apiKey,
371
+ baseUrl: runtime.baseUrl,
372
+ timeoutMs: 12000,
373
+ });
374
+ }
375
+ return attempts;
326
376
  }
327
377
  }
328
378
 
@@ -42,21 +42,6 @@ async function runVoiceTranscriptTurn({
42
42
  model: ttsModel,
43
43
  voice: ttsVoice,
44
44
  });
45
- const ttsProviderId = voiceOptions.provider === 'gemini'
46
- ? 'google'
47
- : voiceOptions.provider;
48
- let ttsRuntime = { apiKey: '', baseUrl: '' };
49
- if (ttsProviderId !== 'deepgram') {
50
- try {
51
- const runtime = getProviderRuntimeConfig(userId, ttsProviderId, agentId);
52
- ttsRuntime = {
53
- apiKey: typeof runtime.apiKey === 'string' ? runtime.apiKey.trim() : '',
54
- baseUrl: typeof runtime.baseUrl === 'string' ? runtime.baseUrl.trim() : '',
55
- };
56
- } catch {
57
- ttsRuntime = { apiKey: '', baseUrl: '' };
58
- }
59
- }
60
45
 
61
46
  const storedUserContent = transcriptText;
62
47
  const normalizedMetadata = metadata && typeof metadata === 'object' ? metadata : {};
@@ -143,15 +128,40 @@ async function runVoiceTranscriptTurn({
143
128
 
144
129
  let synthesized;
145
130
  let ttsError = null;
131
+ let providerUsed = voiceOptions.provider;
132
+ let modelUsed = voiceOptions.model;
133
+ let voiceUsed = voiceOptions.voice;
146
134
  if (synthesize !== false) {
147
- try {
148
- synthesized = await synthesizeVoiceReply(replyText, {
149
- ...voiceOptions,
150
- apiKey: ttsRuntime.apiKey,
151
- baseUrl: ttsRuntime.baseUrl,
135
+ const attemptProviders = [
136
+ voiceOptions.provider,
137
+ ...['openai', 'deepgram', 'gemini'].filter((provider) => provider !== voiceOptions.provider),
138
+ ];
139
+ let lastTtsError = null;
140
+ for (const provider of attemptProviders) {
141
+ const normalized = normalizeVoiceSynthesisOptions({
142
+ provider,
143
+ model: provider === voiceOptions.provider ? voiceOptions.model : null,
144
+ voice: provider === voiceOptions.provider ? voiceOptions.voice : null,
152
145
  });
153
- } catch (error) {
154
- ttsError = String(error?.message || error || 'Speech synthesis failed.');
146
+ const runtime = resolveProviderRuntime(userId, agentId, provider);
147
+ try {
148
+ synthesized = await synthesizeVoiceReply(replyText, {
149
+ ...normalized,
150
+ apiKey: runtime.apiKey,
151
+ baseUrl: runtime.baseUrl,
152
+ timeoutMs: 12000,
153
+ });
154
+ providerUsed = normalized.provider;
155
+ modelUsed = normalized.model;
156
+ voiceUsed = normalized.voice;
157
+ ttsError = null;
158
+ break;
159
+ } catch (error) {
160
+ lastTtsError = error;
161
+ }
162
+ }
163
+ if (!synthesized) {
164
+ ttsError = String(lastTtsError?.message || lastTtsError || 'Speech synthesis failed.');
155
165
  synthesized = {
156
166
  mimeType: 'audio/mpeg',
157
167
  audioBytes: Buffer.alloc(0),
@@ -168,15 +178,33 @@ async function runVoiceTranscriptTurn({
168
178
  runId: runResult?.runId || null,
169
179
  transcript: transcriptText,
170
180
  replyText,
171
- ttsProvider: voiceOptions.provider,
172
- ttsModel: voiceOptions.model,
173
- ttsVoice: voiceOptions.voice,
181
+ ttsProvider: providerUsed,
182
+ ttsModel: modelUsed,
183
+ ttsVoice: voiceUsed,
174
184
  audioMimeType: synthesized.mimeType,
175
185
  audioBase64: synthesized.audioBytes.toString('base64'),
176
186
  ttsError,
177
187
  };
178
188
  }
179
189
 
190
+ function resolveProviderRuntime(userId, agentId, provider) {
191
+ const providerId = String(provider || '').trim().toLowerCase() === 'gemini'
192
+ ? 'google'
193
+ : String(provider || '').trim().toLowerCase();
194
+ if (!providerId || providerId === 'deepgram') {
195
+ return { apiKey: '', baseUrl: '' };
196
+ }
197
+ try {
198
+ const runtime = getProviderRuntimeConfig(userId, providerId, agentId);
199
+ return {
200
+ apiKey: typeof runtime.apiKey === 'string' ? runtime.apiKey.trim() : '',
201
+ baseUrl: typeof runtime.baseUrl === 'string' ? runtime.baseUrl.trim() : '',
202
+ };
203
+ } catch {
204
+ return { apiKey: '', baseUrl: '' };
205
+ }
206
+ }
207
+
180
208
  module.exports = {
181
209
  runVoiceTranscriptTurn,
182
210
  };
@@ -516,6 +516,7 @@ function setupWebSocket(io, services) {
516
516
  }
517
517
  await voiceRuntimeManager.beginInput(sessionId, {
518
518
  mimeType: toOptionalString(data?.mimeType, 128),
519
+ turnId: toOptionalString(data?.turnId, 128),
519
520
  });
520
521
  } catch (err) {
521
522
  console.error(`[WS] voice:input_start failed for user ${userId}:`, err);
@@ -554,8 +555,30 @@ function setupWebSocket(io, services) {
554
555
  error: `audio chunk is too large (max ${MAX_VOICE_AUDIO_CHUNK_BYTES} bytes)`,
555
556
  });
556
557
  }
557
- await voiceRuntimeManager.appendInputAudio(sessionId, audioBytes, {
558
+ const turnId = toOptionalString(data?.turnId, 128);
559
+ const sequence = toBoundedInt(data?.sequence, -1, -1, 1_000_000);
560
+ if (!turnId) {
561
+ return socket.emit('voice:error', {
562
+ sessionId,
563
+ error: 'turnId is required',
564
+ });
565
+ }
566
+ if (sequence < 0) {
567
+ return socket.emit('voice:error', {
568
+ sessionId,
569
+ error: 'sequence is required',
570
+ });
571
+ }
572
+ const appendResult = await voiceRuntimeManager.appendInputAudio(sessionId, audioBytes, {
558
573
  mimeType: toOptionalString(data?.mimeType, 128),
574
+ turnId,
575
+ sequence,
576
+ });
577
+ socket.emit('voice:chunk_ack', {
578
+ sessionId,
579
+ turnId,
580
+ sequence,
581
+ receivedThrough: appendResult?.receivedThrough ?? sequence,
559
582
  });
560
583
  } catch (err) {
561
584
  console.error(`[WS] voice:audio_chunk failed for user ${userId}:`, err);
@@ -618,6 +641,8 @@ function setupWebSocket(io, services) {
618
641
  }
619
642
 
620
643
  await voiceRuntimeManager.commitInput(sessionId, {
644
+ turnId: toOptionalString(data?.turnId, 128),
645
+ finalSequence: toBoundedInt(data?.finalSequence, -1, -1, 1_000_000),
621
646
  promptHint: toOptionalString(data?.promptHint, 2000),
622
647
  metadata,
623
648
  });