@getpaseo/server 0.1.27 → 0.1.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/server/client/daemon-client.d.ts +1 -1
- package/dist/server/client/daemon-client.d.ts.map +1 -1
- package/dist/server/client/daemon-client.js +1 -1
- package/dist/server/client/daemon-client.js.map +1 -1
- package/dist/server/server/agent/agent-response-loop.js +1 -1
- package/dist/server/server/agent/agent-response-loop.js.map +1 -1
- package/dist/server/server/agent/provider-launch-config.d.ts +13 -2
- package/dist/server/server/agent/provider-launch-config.d.ts.map +1 -1
- package/dist/server/server/agent/provider-launch-config.js +17 -9
- package/dist/server/server/agent/provider-launch-config.js.map +1 -1
- package/dist/server/server/agent/provider-manifest.d.ts.map +1 -1
- package/dist/server/server/agent/provider-manifest.js +10 -5
- package/dist/server/server/agent/provider-manifest.js.map +1 -1
- package/dist/server/server/agent/providers/claude-agent.d.ts +3 -1
- package/dist/server/server/agent/providers/claude-agent.d.ts.map +1 -1
- package/dist/server/server/agent/providers/claude-agent.js +5 -1
- package/dist/server/server/agent/providers/claude-agent.js.map +1 -1
- package/dist/server/server/agent/providers/codex-app-server-agent.d.ts.map +1 -1
- package/dist/server/server/agent/providers/codex-app-server-agent.js +30 -1
- package/dist/server/server/agent/providers/codex-app-server-agent.js.map +1 -1
- package/dist/server/server/agent/providers/opencode-agent.d.ts +1 -0
- package/dist/server/server/agent/providers/opencode-agent.d.ts.map +1 -1
- package/dist/server/server/agent/providers/opencode-agent.js +110 -9
- package/dist/server/server/agent/providers/opencode-agent.js.map +1 -1
- package/dist/server/server/agent/tts-manager.d.ts +8 -1
- package/dist/server/server/agent/tts-manager.d.ts.map +1 -1
- package/dist/server/server/agent/tts-manager.js +215 -108
- package/dist/server/server/agent/tts-manager.js.map +1 -1
- package/dist/server/server/bootstrap.d.ts +2 -2
- package/dist/server/server/bootstrap.d.ts.map +1 -1
- package/dist/server/server/bootstrap.js +26 -5
- package/dist/server/server/bootstrap.js.map +1 -1
- package/dist/server/server/exports.d.ts +1 -0
- package/dist/server/server/exports.d.ts.map +1 -1
- package/dist/server/server/exports.js +2 -0
- package/dist/server/server/exports.js.map +1 -1
- package/dist/server/server/persisted-config.d.ts +25 -0
- package/dist/server/server/persisted-config.d.ts.map +1 -1
- package/dist/server/server/persisted-config.js +6 -0
- package/dist/server/server/persisted-config.js.map +1 -1
- package/dist/server/server/session.d.ts +16 -19
- package/dist/server/server/session.d.ts.map +1 -1
- package/dist/server/server/session.js +171 -237
- package/dist/server/server/session.js.map +1 -1
- package/dist/server/server/speech/providers/local/runtime.d.ts +2 -0
- package/dist/server/server/speech/providers/local/runtime.d.ts.map +1 -1
- package/dist/server/server/speech/providers/local/runtime.js +7 -0
- package/dist/server/server/speech/providers/local/runtime.js.map +1 -1
- package/dist/server/server/speech/providers/local/sherpa/assets/silero_vad.onnx +0 -0
- package/dist/server/server/speech/providers/local/sherpa/sherpa-onnx-node-loader.d.ts +2 -0
- package/dist/server/server/speech/providers/local/sherpa/sherpa-onnx-node-loader.d.ts.map +1 -1
- package/dist/server/server/speech/providers/local/sherpa/sherpa-onnx-node-loader.js.map +1 -1
- package/dist/server/server/speech/providers/local/sherpa/silero-vad-provider.d.ts +13 -0
- package/dist/server/server/speech/providers/local/sherpa/silero-vad-provider.d.ts.map +1 -0
- package/dist/server/server/speech/providers/local/sherpa/silero-vad-provider.js +23 -0
- package/dist/server/server/speech/providers/local/sherpa/silero-vad-provider.js.map +1 -0
- package/dist/server/server/speech/providers/local/sherpa/silero-vad-session.d.ts +32 -0
- package/dist/server/server/speech/providers/local/sherpa/silero-vad-session.d.ts.map +1 -0
- package/dist/server/server/speech/providers/local/sherpa/silero-vad-session.js +107 -0
- package/dist/server/server/speech/providers/local/sherpa/silero-vad-session.js.map +1 -0
- package/dist/server/server/speech/providers/openai/runtime.d.ts +2 -0
- package/dist/server/server/speech/providers/openai/runtime.d.ts.map +1 -1
- package/dist/server/server/speech/providers/openai/runtime.js +2 -0
- package/dist/server/server/speech/providers/openai/runtime.js.map +1 -1
- package/dist/server/server/speech/speech-config-resolver.d.ts.map +1 -1
- package/dist/server/server/speech/speech-config-resolver.js +35 -14
- package/dist/server/server/speech/speech-config-resolver.js.map +1 -1
- package/dist/server/server/speech/speech-runtime.d.ts +3 -1
- package/dist/server/server/speech/speech-runtime.d.ts.map +1 -1
- package/dist/server/server/speech/speech-runtime.js +39 -6
- package/dist/server/server/speech/speech-runtime.js.map +1 -1
- package/dist/server/server/speech/speech-types.d.ts +1 -0
- package/dist/server/server/speech/speech-types.d.ts.map +1 -1
- package/dist/server/server/speech/turn-detection-provider.d.ts +22 -0
- package/dist/server/server/speech/turn-detection-provider.d.ts.map +1 -0
- package/dist/server/server/speech/turn-detection-provider.js +2 -0
- package/dist/server/server/speech/turn-detection-provider.js.map +1 -0
- package/dist/server/server/voice/fixed-duration-pcm-ring-buffer.d.ts +16 -0
- package/dist/server/server/voice/fixed-duration-pcm-ring-buffer.d.ts.map +1 -0
- package/dist/server/server/voice/fixed-duration-pcm-ring-buffer.js +35 -0
- package/dist/server/server/voice/fixed-duration-pcm-ring-buffer.js.map +1 -0
- package/dist/server/server/voice/voice-turn-controller.d.ts +34 -0
- package/dist/server/server/voice/voice-turn-controller.d.ts.map +1 -0
- package/dist/server/server/voice/voice-turn-controller.js +161 -0
- package/dist/server/server/voice/voice-turn-controller.js.map +1 -0
- package/dist/server/server/websocket-server.d.ts +3 -0
- package/dist/server/server/websocket-server.d.ts.map +1 -1
- package/dist/server/server/websocket-server.js +5 -1
- package/dist/server/server/websocket-server.js.map +1 -1
- package/dist/server/server/workspace-registry.d.ts +2 -0
- package/dist/server/server/workspace-registry.d.ts.map +1 -1
- package/dist/server/server/workspace-registry.js +11 -4
- package/dist/server/server/workspace-registry.js.map +1 -1
- package/dist/server/shared/messages.d.ts +97 -0
- package/dist/server/shared/messages.d.ts.map +1 -1
- package/dist/server/shared/messages.js +7 -0
- package/dist/server/shared/messages.js.map +1 -1
- package/dist/server/shared/tool-call-display.d.ts.map +1 -1
- package/dist/server/shared/tool-call-display.js +58 -39
- package/dist/server/shared/tool-call-display.js.map +1 -1
- package/dist/src/server/agent/agent-response-loop.js +1 -1
- package/dist/src/server/agent/agent-response-loop.js.map +1 -1
- package/dist/src/server/agent/provider-launch-config.js +17 -9
- package/dist/src/server/agent/provider-launch-config.js.map +1 -1
- package/dist/src/server/agent/provider-manifest.js +10 -5
- package/dist/src/server/agent/provider-manifest.js.map +1 -1
- package/dist/src/server/agent/providers/claude-agent.js +5 -1
- package/dist/src/server/agent/providers/claude-agent.js.map +1 -1
- package/dist/src/server/agent/providers/codex-app-server-agent.js +30 -1
- package/dist/src/server/agent/providers/codex-app-server-agent.js.map +1 -1
- package/dist/src/server/agent/providers/opencode-agent.js +110 -9
- package/dist/src/server/agent/providers/opencode-agent.js.map +1 -1
- package/dist/src/server/agent/tts-manager.js +215 -108
- package/dist/src/server/agent/tts-manager.js.map +1 -1
- package/dist/src/server/bootstrap.js +26 -5
- package/dist/src/server/bootstrap.js.map +1 -1
- package/dist/src/server/persisted-config.js +6 -0
- package/dist/src/server/persisted-config.js.map +1 -1
- package/dist/src/server/session.js +171 -237
- package/dist/src/server/session.js.map +1 -1
- package/dist/src/server/speech/providers/local/runtime.js +7 -0
- package/dist/src/server/speech/providers/local/runtime.js.map +1 -1
- package/dist/src/server/speech/providers/local/sherpa/sherpa-onnx-node-loader.js.map +1 -1
- package/dist/src/server/speech/providers/local/sherpa/silero-vad-provider.js +23 -0
- package/dist/src/server/speech/providers/local/sherpa/silero-vad-provider.js.map +1 -0
- package/dist/src/server/speech/providers/local/sherpa/silero-vad-session.js +107 -0
- package/dist/src/server/speech/providers/local/sherpa/silero-vad-session.js.map +1 -0
- package/dist/src/server/speech/providers/openai/runtime.js +2 -0
- package/dist/src/server/speech/providers/openai/runtime.js.map +1 -1
- package/dist/src/server/speech/speech-config-resolver.js +35 -14
- package/dist/src/server/speech/speech-config-resolver.js.map +1 -1
- package/dist/src/server/speech/speech-runtime.js +39 -6
- package/dist/src/server/speech/speech-runtime.js.map +1 -1
- package/dist/src/server/speech/turn-detection-provider.js +2 -0
- package/dist/src/server/speech/turn-detection-provider.js.map +1 -0
- package/dist/src/server/voice/fixed-duration-pcm-ring-buffer.js +35 -0
- package/dist/src/server/voice/fixed-duration-pcm-ring-buffer.js.map +1 -0
- package/dist/src/server/voice/voice-turn-controller.js +161 -0
- package/dist/src/server/voice/voice-turn-controller.js.map +1 -0
- package/dist/src/server/websocket-server.js +5 -1
- package/dist/src/server/websocket-server.js.map +1 -1
- package/dist/src/server/workspace-registry.js +11 -4
- package/dist/src/server/workspace-registry.js.map +1 -1
- package/dist/src/shared/messages.js +7 -0
- package/dist/src/shared/messages.js.map +1 -1
- package/dist/src/shared/tool-call-display.js +58 -39
- package/dist/src/shared/tool-call-display.js.map +1 -1
- package/package.json +4 -3
- package/src/server/speech/providers/local/sherpa/assets/silero_vad.onnx +0 -0
|
@@ -13,6 +13,7 @@ import { STTManager } from './agent/stt-manager.js';
|
|
|
13
13
|
import { maybePersistTtsDebugAudio } from './agent/tts-debug.js';
|
|
14
14
|
import { isPaseoDictationDebugEnabled } from './agent/recordings-debug.js';
|
|
15
15
|
import { DictationStreamManager, } from './dictation/dictation-stream-manager.js';
|
|
16
|
+
import { createVoiceTurnController, } from './voice/voice-turn-controller.js';
|
|
16
17
|
import { buildConfigOverrides, buildSessionConfig, extractTimestamps } from './persistence-hooks.js';
|
|
17
18
|
import { experimental_createMCPClient } from 'ai';
|
|
18
19
|
import { buildProviderRegistry } from './agent/provider-registry.js';
|
|
@@ -35,6 +36,7 @@ import { getProjectIcon } from '../utils/project-icon.js';
|
|
|
35
36
|
import { expandTilde } from '../utils/path.js';
|
|
36
37
|
import { searchHomeDirectories, searchWorkspaceEntries } from '../utils/directory-suggestions.js';
|
|
37
38
|
import { ensureLocalSpeechModels, getLocalSpeechModelDir, listLocalSpeechModels, } from './speech/providers/local/models.js';
|
|
39
|
+
import { toResolver } from './speech/provider-resolver.js';
|
|
38
40
|
import { resolveClientMessageId } from './client-message-id.js';
|
|
39
41
|
const execAsync = promisify(exec);
|
|
40
42
|
const MAX_INITIAL_AGENT_TITLE_CHARS = Math.min(60, MAX_EXPLICIT_AGENT_TITLE_CHARS);
|
|
@@ -88,11 +90,10 @@ const PCM_BITS_PER_SAMPLE = 16;
|
|
|
88
90
|
const PCM_BYTES_PER_MS = (PCM_SAMPLE_RATE * PCM_CHANNELS * (PCM_BITS_PER_SAMPLE / 8)) / 1000;
|
|
89
91
|
const MIN_STREAMING_SEGMENT_DURATION_MS = 1000;
|
|
90
92
|
const MIN_STREAMING_SEGMENT_BYTES = Math.round(PCM_BYTES_PER_MS * MIN_STREAMING_SEGMENT_DURATION_MS);
|
|
91
|
-
const VOICE_MODE_INACTIVITY_FLUSH_MS = 4500;
|
|
92
|
-
const VOICE_INTERNAL_DICTATION_ID_PREFIX = '__voice_turn__:';
|
|
93
93
|
const SAFE_GIT_REF_PATTERN = /^[A-Za-z0-9._\/-]+$/;
|
|
94
94
|
const AgentIdSchema = z.string().uuid();
|
|
95
95
|
const VOICE_MCP_SERVER_NAME = 'paseo_voice';
|
|
96
|
+
const VOICE_INTERRUPT_CONFIRMATION_MS = 500;
|
|
96
97
|
class VoiceFeatureUnavailableError extends Error {
|
|
97
98
|
constructor(context) {
|
|
98
99
|
super(context.message);
|
|
@@ -161,19 +162,16 @@ export class Session {
|
|
|
161
162
|
// Voice mode state
|
|
162
163
|
this.isVoiceMode = false;
|
|
163
164
|
this.speechInProgress = false;
|
|
165
|
+
this.pendingVoiceSpeechStartAt = null;
|
|
166
|
+
this.pendingVoiceSpeechTimer = null;
|
|
167
|
+
this.voiceTurnController = null;
|
|
168
|
+
this.voiceInputChunkCount = 0;
|
|
169
|
+
this.voiceInputBytes = 0;
|
|
170
|
+
this.voiceInputWindowStartedAt = Date.now();
|
|
164
171
|
// Audio buffering for interruption handling
|
|
165
172
|
this.pendingAudioSegments = [];
|
|
166
173
|
this.bufferTimeout = null;
|
|
167
|
-
this.voiceModeInactivityTimeout = null;
|
|
168
174
|
this.audioBuffer = null;
|
|
169
|
-
this.activeVoiceDictationId = null;
|
|
170
|
-
this.activeVoiceDictationFormat = null;
|
|
171
|
-
this.activeVoiceDictationNextSeq = 0;
|
|
172
|
-
this.activeVoiceDictationStartPromise = null;
|
|
173
|
-
this.activeVoiceDictationFinalizePromise = null;
|
|
174
|
-
this.activeVoiceDictationResultPromise = null;
|
|
175
|
-
this.activeVoiceDictationResolve = null;
|
|
176
|
-
this.activeVoiceDictationReject = null;
|
|
177
175
|
// Optional TTS debug capture (persisted per utterance)
|
|
178
176
|
this.ttsDebugStreams = new Map();
|
|
179
177
|
// Per-session MCP client and tools
|
|
@@ -221,6 +219,7 @@ export class Session {
|
|
|
221
219
|
this.unsubscribeTerminalsChanged = this.terminalManager.subscribeTerminalsChanged((event) => this.handleTerminalsChanged(event));
|
|
222
220
|
}
|
|
223
221
|
this.voiceAgentMcpStdio = voice?.voiceAgentMcpStdio ?? null;
|
|
222
|
+
this.resolveVoiceTurnDetection = toResolver(voice?.turnDetection ?? null);
|
|
224
223
|
const configuredModelsDir = dictation?.localModels?.modelsDir?.trim();
|
|
225
224
|
this.localSpeechModelsDir =
|
|
226
225
|
configuredModelsDir && configuredModelsDir.length > 0
|
|
@@ -257,13 +256,6 @@ export class Session {
|
|
|
257
256
|
stt: dictation?.stt ?? null,
|
|
258
257
|
finalTimeoutMs: dictation?.finalTimeoutMs,
|
|
259
258
|
});
|
|
260
|
-
this.voiceStreamManager = new DictationStreamManager({
|
|
261
|
-
logger: this.sessionLogger.child({ stream: 'voice-internal' }),
|
|
262
|
-
sessionId: this.sessionId,
|
|
263
|
-
emit: (msg) => this.handleDictationManagerMessage(msg),
|
|
264
|
-
stt: stt,
|
|
265
|
-
finalTimeoutMs: dictation?.finalTimeoutMs,
|
|
266
|
-
});
|
|
267
259
|
// Initialize agent MCP client asynchronously
|
|
268
260
|
void this.initializeAgentMcp();
|
|
269
261
|
this.subscribeToAgentEvents();
|
|
@@ -1409,7 +1401,9 @@ export class Session {
|
|
|
1409
1401
|
* Handle voice mode toggle
|
|
1410
1402
|
*/
|
|
1411
1403
|
async handleSetVoiceMode(enabled, agentId, requestId) {
|
|
1404
|
+
const startedAt = Date.now();
|
|
1412
1405
|
try {
|
|
1406
|
+
this.sessionLogger.info({ enabled, requestedAgentId: agentId ?? null, requestId: requestId ?? null }, 'set_voice_mode started');
|
|
1413
1407
|
if (enabled) {
|
|
1414
1408
|
const unavailable = this.resolveVoiceFeatureUnavailableContext('voice_mode');
|
|
1415
1409
|
if (unavailable) {
|
|
@@ -1419,15 +1413,26 @@ export class Session {
|
|
|
1419
1413
|
if (this.isVoiceMode &&
|
|
1420
1414
|
this.voiceModeAgentId &&
|
|
1421
1415
|
this.voiceModeAgentId !== normalizedAgentId) {
|
|
1416
|
+
this.sessionLogger.info({
|
|
1417
|
+
previousAgentId: this.voiceModeAgentId,
|
|
1418
|
+
nextAgentId: normalizedAgentId,
|
|
1419
|
+
elapsedMs: Date.now() - startedAt,
|
|
1420
|
+
}, 'set_voice_mode disabling previous active voice agent');
|
|
1422
1421
|
await this.disableVoiceModeForActiveAgent(true);
|
|
1423
1422
|
}
|
|
1424
1423
|
if (!this.isVoiceMode || this.voiceModeAgentId !== normalizedAgentId) {
|
|
1424
|
+
this.sessionLogger.info({ agentId: normalizedAgentId, elapsedMs: Date.now() - startedAt }, 'set_voice_mode enabling voice for agent');
|
|
1425
1425
|
const refreshedAgentId = await this.enableVoiceModeForAgent(normalizedAgentId);
|
|
1426
1426
|
this.voiceModeAgentId = refreshedAgentId;
|
|
1427
|
+
this.sessionLogger.info({ agentId: refreshedAgentId, elapsedMs: Date.now() - startedAt }, 'set_voice_mode agent enable complete');
|
|
1427
1428
|
}
|
|
1429
|
+
this.sessionLogger.info({ agentId: this.voiceModeAgentId, elapsedMs: Date.now() - startedAt }, 'set_voice_mode starting voice turn controller');
|
|
1430
|
+
await this.startVoiceTurnController();
|
|
1431
|
+
this.sessionLogger.info({ agentId: this.voiceModeAgentId, elapsedMs: Date.now() - startedAt }, 'set_voice_mode voice turn controller started');
|
|
1428
1432
|
this.isVoiceMode = true;
|
|
1429
1433
|
this.sessionLogger.info({
|
|
1430
1434
|
agentId: this.voiceModeAgentId,
|
|
1435
|
+
elapsedMs: Date.now() - startedAt,
|
|
1431
1436
|
}, 'Voice mode enabled for existing agent');
|
|
1432
1437
|
if (requestId) {
|
|
1433
1438
|
this.emit({
|
|
@@ -1443,9 +1448,10 @@ export class Session {
|
|
|
1443
1448
|
}
|
|
1444
1449
|
return;
|
|
1445
1450
|
}
|
|
1451
|
+
this.sessionLogger.info({ agentId: this.voiceModeAgentId, elapsedMs: Date.now() - startedAt }, 'set_voice_mode disabling active voice mode');
|
|
1446
1452
|
await this.disableVoiceModeForActiveAgent(true);
|
|
1447
1453
|
this.isVoiceMode = false;
|
|
1448
|
-
this.sessionLogger.info('Voice mode disabled');
|
|
1454
|
+
this.sessionLogger.info({ elapsedMs: Date.now() - startedAt }, 'Voice mode disabled');
|
|
1449
1455
|
if (requestId) {
|
|
1450
1456
|
this.emit({
|
|
1451
1457
|
type: 'set_voice_mode_response',
|
|
@@ -1466,6 +1472,7 @@ export class Session {
|
|
|
1466
1472
|
err: error,
|
|
1467
1473
|
enabled,
|
|
1468
1474
|
requestedAgentId: agentId ?? null,
|
|
1475
|
+
elapsedMs: Date.now() - startedAt,
|
|
1469
1476
|
}, 'set_voice_mode failed');
|
|
1470
1477
|
if (requestId) {
|
|
1471
1478
|
this.emit({
|
|
@@ -1513,12 +1520,17 @@ export class Session {
|
|
|
1513
1520
|
};
|
|
1514
1521
|
}
|
|
1515
1522
|
async enableVoiceModeForAgent(agentId) {
|
|
1523
|
+
const startedAt = Date.now();
|
|
1516
1524
|
const ensureVoiceSocket = this.ensureVoiceMcpSocketForAgent;
|
|
1517
1525
|
if (!ensureVoiceSocket) {
|
|
1518
1526
|
throw new Error('Voice MCP socket bridge is not configured');
|
|
1519
1527
|
}
|
|
1528
|
+
this.sessionLogger.info({ agentId }, 'enableVoiceModeForAgent.ensureAgentLoaded.start');
|
|
1520
1529
|
const existing = await this.ensureAgentLoaded(agentId);
|
|
1530
|
+
this.sessionLogger.info({ agentId, elapsedMs: Date.now() - startedAt }, 'enableVoiceModeForAgent.ensureAgentLoaded.done');
|
|
1531
|
+
this.sessionLogger.info({ agentId }, 'enableVoiceModeForAgent.ensureVoiceSocket.start');
|
|
1521
1532
|
const socketPath = await ensureVoiceSocket(agentId);
|
|
1533
|
+
this.sessionLogger.info({ agentId, socketPath, elapsedMs: Date.now() - startedAt }, 'enableVoiceModeForAgent.ensureVoiceSocket.done');
|
|
1522
1534
|
this.registerVoiceBridgeForAgent(agentId);
|
|
1523
1535
|
const baseConfig = {
|
|
1524
1536
|
systemPrompt: stripVoiceModeSystemPrompt(existing.config.systemPrompt),
|
|
@@ -1530,7 +1542,9 @@ export class Session {
|
|
|
1530
1542
|
mcpServers: this.buildVoiceModeMcpServers(baseConfig.mcpServers, socketPath),
|
|
1531
1543
|
};
|
|
1532
1544
|
try {
|
|
1545
|
+
this.sessionLogger.info({ agentId, elapsedMs: Date.now() - startedAt }, 'enableVoiceModeForAgent.reloadAgentSession.start');
|
|
1533
1546
|
const refreshed = await this.agentManager.reloadAgentSession(agentId, refreshOverrides);
|
|
1547
|
+
this.sessionLogger.info({ agentId, refreshedAgentId: refreshed.id, elapsedMs: Date.now() - startedAt }, 'enableVoiceModeForAgent.reloadAgentSession.done');
|
|
1534
1548
|
return refreshed.id;
|
|
1535
1549
|
}
|
|
1536
1550
|
catch (error) {
|
|
@@ -1542,8 +1556,7 @@ export class Session {
|
|
|
1542
1556
|
}
|
|
1543
1557
|
}
|
|
1544
1558
|
async disableVoiceModeForActiveAgent(restoreAgentConfig) {
|
|
1545
|
-
this.
|
|
1546
|
-
this.cancelActiveVoiceDictationStream('voice mode disabled');
|
|
1559
|
+
await this.stopVoiceTurnController();
|
|
1547
1560
|
const agentId = this.voiceModeAgentId;
|
|
1548
1561
|
if (!agentId) {
|
|
1549
1562
|
this.voiceModeBaseConfig = null;
|
|
@@ -1569,197 +1582,107 @@ export class Session {
|
|
|
1569
1582
|
this.voiceModeBaseConfig = null;
|
|
1570
1583
|
this.voiceModeAgentId = null;
|
|
1571
1584
|
}
|
|
1572
|
-
isInternalVoiceDictationId(dictationId) {
|
|
1573
|
-
return dictationId.startsWith(VOICE_INTERNAL_DICTATION_ID_PREFIX);
|
|
1574
|
-
}
|
|
1575
1585
|
handleDictationManagerMessage(msg) {
|
|
1576
|
-
|
|
1577
|
-
const metadata = msg.payload.metadata;
|
|
1578
|
-
const dictationId = metadata && typeof metadata.dictationId === 'string' ? metadata.dictationId : null;
|
|
1579
|
-
if (dictationId && this.isInternalVoiceDictationId(dictationId)) {
|
|
1580
|
-
return;
|
|
1581
|
-
}
|
|
1582
|
-
this.emit(msg);
|
|
1583
|
-
return;
|
|
1584
|
-
}
|
|
1585
|
-
const payloadWithDictationId = msg.payload;
|
|
1586
|
-
const dictationId = payloadWithDictationId && typeof payloadWithDictationId.dictationId === 'string'
|
|
1587
|
-
? payloadWithDictationId.dictationId
|
|
1588
|
-
: null;
|
|
1589
|
-
if (!dictationId || !this.isInternalVoiceDictationId(dictationId)) {
|
|
1590
|
-
this.emit(msg);
|
|
1591
|
-
return;
|
|
1592
|
-
}
|
|
1593
|
-
if (msg.type === 'dictation_stream_final') {
|
|
1594
|
-
if (dictationId !== this.activeVoiceDictationId || !this.activeVoiceDictationResolve) {
|
|
1595
|
-
return;
|
|
1596
|
-
}
|
|
1597
|
-
this.activeVoiceDictationResolve({
|
|
1598
|
-
text: msg.payload.text,
|
|
1599
|
-
...(msg.payload.debugRecordingPath
|
|
1600
|
-
? { debugRecordingPath: msg.payload.debugRecordingPath }
|
|
1601
|
-
: {}),
|
|
1602
|
-
});
|
|
1603
|
-
return;
|
|
1604
|
-
}
|
|
1605
|
-
if (msg.type === 'dictation_stream_error') {
|
|
1606
|
-
if (dictationId !== this.activeVoiceDictationId || !this.activeVoiceDictationReject) {
|
|
1607
|
-
return;
|
|
1608
|
-
}
|
|
1609
|
-
this.activeVoiceDictationReject(new Error(msg.payload.error));
|
|
1610
|
-
return;
|
|
1611
|
-
}
|
|
1612
|
-
// Ack/partial messages for internal voice dictation are consumed server-side.
|
|
1613
|
-
}
|
|
1614
|
-
resetActiveVoiceDictationState() {
|
|
1615
|
-
this.activeVoiceDictationId = null;
|
|
1616
|
-
this.activeVoiceDictationFormat = null;
|
|
1617
|
-
this.activeVoiceDictationNextSeq = 0;
|
|
1618
|
-
this.activeVoiceDictationStartPromise = null;
|
|
1619
|
-
this.activeVoiceDictationFinalizePromise = null;
|
|
1620
|
-
this.activeVoiceDictationResultPromise = null;
|
|
1621
|
-
this.activeVoiceDictationResolve = null;
|
|
1622
|
-
this.activeVoiceDictationReject = null;
|
|
1623
|
-
}
|
|
1624
|
-
cancelActiveVoiceDictationStream(reason) {
|
|
1625
|
-
const dictationId = this.activeVoiceDictationId;
|
|
1626
|
-
if (!dictationId) {
|
|
1627
|
-
return;
|
|
1628
|
-
}
|
|
1629
|
-
this.sessionLogger.debug({ dictationId, reason }, 'Cancelling active internal voice dictation stream');
|
|
1630
|
-
if (this.activeVoiceDictationReject) {
|
|
1631
|
-
this.activeVoiceDictationReject(new Error(`Voice dictation cancelled: ${reason}`));
|
|
1632
|
-
}
|
|
1633
|
-
this.voiceStreamManager.handleCancel(dictationId);
|
|
1634
|
-
this.resetActiveVoiceDictationState();
|
|
1586
|
+
this.emit(msg);
|
|
1635
1587
|
}
|
|
1636
|
-
async
|
|
1637
|
-
if (this.
|
|
1638
|
-
|
|
1639
|
-
await this.activeVoiceDictationStartPromise;
|
|
1640
|
-
}
|
|
1588
|
+
async startVoiceTurnController() {
|
|
1589
|
+
if (this.voiceTurnController) {
|
|
1590
|
+
this.sessionLogger.info('startVoiceTurnController skipped: already running');
|
|
1641
1591
|
return;
|
|
1642
1592
|
}
|
|
1643
|
-
|
|
1644
|
-
|
|
1645
|
-
|
|
1646
|
-
|
|
1647
|
-
|
|
1648
|
-
|
|
1649
|
-
|
|
1650
|
-
|
|
1651
|
-
|
|
1652
|
-
|
|
1653
|
-
|
|
1654
|
-
|
|
1655
|
-
|
|
1656
|
-
|
|
1657
|
-
|
|
1658
|
-
|
|
1659
|
-
|
|
1660
|
-
|
|
1661
|
-
|
|
1662
|
-
|
|
1663
|
-
|
|
1664
|
-
|
|
1665
|
-
|
|
1666
|
-
|
|
1667
|
-
|
|
1668
|
-
|
|
1669
|
-
|
|
1593
|
+
const turnDetection = this.resolveVoiceTurnDetection();
|
|
1594
|
+
if (!turnDetection) {
|
|
1595
|
+
throw new Error('Voice turn detection is not configured');
|
|
1596
|
+
}
|
|
1597
|
+
this.sessionLogger.info({ providerId: turnDetection.id }, 'startVoiceTurnController creating controller');
|
|
1598
|
+
const controller = createVoiceTurnController({
|
|
1599
|
+
logger: this.sessionLogger.child({ component: 'voice-turn-controller' }),
|
|
1600
|
+
turnDetection,
|
|
1601
|
+
utteranceSink: {
|
|
1602
|
+
submitUtterance: async ({ pcm16, format, sampleRate, startedAt, endedAt }) => {
|
|
1603
|
+
this.sessionLogger.debug({
|
|
1604
|
+
audioBytes: pcm16.length,
|
|
1605
|
+
sampleRate,
|
|
1606
|
+
startedAt,
|
|
1607
|
+
endedAt,
|
|
1608
|
+
durationMs: Math.max(0, endedAt - startedAt),
|
|
1609
|
+
}, 'Submitting detected voice utterance');
|
|
1610
|
+
await this.processCompletedAudio(pcm16, format);
|
|
1611
|
+
},
|
|
1612
|
+
},
|
|
1613
|
+
callbacks: {
|
|
1614
|
+
onSpeechStarted: async () => {
|
|
1615
|
+
this.handleProvisionalVoiceSpeechStarted();
|
|
1616
|
+
},
|
|
1617
|
+
onSpeechStopped: async () => {
|
|
1618
|
+
this.handleVoiceSpeechStopped();
|
|
1619
|
+
},
|
|
1620
|
+
onError: (error) => {
|
|
1621
|
+
this.sessionLogger.error({ err: error }, 'Voice turn controller failed');
|
|
1622
|
+
},
|
|
1670
1623
|
},
|
|
1671
1624
|
});
|
|
1672
|
-
|
|
1673
|
-
|
|
1674
|
-
|
|
1675
|
-
|
|
1676
|
-
}
|
|
1677
|
-
catch (error) {
|
|
1678
|
-
this.resetActiveVoiceDictationState();
|
|
1679
|
-
throw error;
|
|
1680
|
-
}
|
|
1681
|
-
finally {
|
|
1682
|
-
if (this.activeVoiceDictationId === dictationId) {
|
|
1683
|
-
this.activeVoiceDictationStartPromise = null;
|
|
1684
|
-
}
|
|
1685
|
-
}
|
|
1686
|
-
}
|
|
1687
|
-
async appendToActiveVoiceDictationStream(audioBase64, format) {
|
|
1688
|
-
if (this.activeVoiceDictationFinalizePromise) {
|
|
1689
|
-
await this.activeVoiceDictationFinalizePromise.catch(() => undefined);
|
|
1690
|
-
}
|
|
1691
|
-
await this.ensureActiveVoiceDictationStream(format);
|
|
1692
|
-
const dictationId = this.activeVoiceDictationId;
|
|
1693
|
-
if (!dictationId) {
|
|
1694
|
-
throw new Error('Voice dictation stream did not initialize');
|
|
1695
|
-
}
|
|
1696
|
-
const seq = this.activeVoiceDictationNextSeq;
|
|
1697
|
-
this.activeVoiceDictationNextSeq += 1;
|
|
1698
|
-
await this.voiceStreamManager.handleChunk({
|
|
1699
|
-
dictationId,
|
|
1700
|
-
seq,
|
|
1701
|
-
audioBase64,
|
|
1702
|
-
format,
|
|
1703
|
-
});
|
|
1625
|
+
this.sessionLogger.info('startVoiceTurnController connecting controller');
|
|
1626
|
+
await controller.start();
|
|
1627
|
+
this.voiceTurnController = controller;
|
|
1628
|
+
this.sessionLogger.info('startVoiceTurnController connected');
|
|
1704
1629
|
}
|
|
1705
|
-
async
|
|
1706
|
-
|
|
1707
|
-
if (!dictationId) {
|
|
1630
|
+
async stopVoiceTurnController() {
|
|
1631
|
+
if (!this.voiceTurnController) {
|
|
1708
1632
|
return;
|
|
1709
1633
|
}
|
|
1710
|
-
this.
|
|
1711
|
-
|
|
1712
|
-
|
|
1634
|
+
this.clearPendingVoiceSpeechStart('turn-controller-stop');
|
|
1635
|
+
const controller = this.voiceTurnController;
|
|
1636
|
+
this.voiceTurnController = null;
|
|
1637
|
+
await controller.stop();
|
|
1638
|
+
}
|
|
1639
|
+
clearPendingVoiceSpeechStart(reason) {
|
|
1640
|
+
if (this.pendingVoiceSpeechTimer) {
|
|
1641
|
+
clearTimeout(this.pendingVoiceSpeechTimer);
|
|
1642
|
+
this.pendingVoiceSpeechTimer = null;
|
|
1713
1643
|
}
|
|
1714
|
-
if (this.
|
|
1715
|
-
|
|
1716
|
-
|
|
1644
|
+
if (this.pendingVoiceSpeechStartAt !== null) {
|
|
1645
|
+
this.sessionLogger.debug({ reason }, 'Clearing provisional voice speech start');
|
|
1646
|
+
this.pendingVoiceSpeechStartAt = null;
|
|
1717
1647
|
}
|
|
1718
|
-
|
|
1719
|
-
|
|
1720
|
-
if (
|
|
1721
|
-
this.resetActiveVoiceDictationState();
|
|
1648
|
+
}
|
|
1649
|
+
handleProvisionalVoiceSpeechStarted() {
|
|
1650
|
+
if (this.speechInProgress || this.pendingVoiceSpeechTimer) {
|
|
1722
1651
|
return;
|
|
1723
1652
|
}
|
|
1724
|
-
|
|
1725
|
-
|
|
1726
|
-
|
|
1727
|
-
|
|
1728
|
-
this.
|
|
1729
|
-
|
|
1730
|
-
|
|
1731
|
-
|
|
1732
|
-
|
|
1733
|
-
|
|
1734
|
-
transcriptLength: transcriptText.length,
|
|
1735
|
-
transcript: transcriptText,
|
|
1736
|
-
}, 'Transcription result');
|
|
1737
|
-
await this.handleTranscriptionResultPayload({
|
|
1738
|
-
text: result.text,
|
|
1739
|
-
requestId,
|
|
1740
|
-
...(result.debugRecordingPath
|
|
1741
|
-
? { debugRecordingPath: result.debugRecordingPath, format: 'audio/wav' }
|
|
1742
|
-
: {}),
|
|
1743
|
-
});
|
|
1744
|
-
})();
|
|
1745
|
-
try {
|
|
1746
|
-
await this.activeVoiceDictationFinalizePromise;
|
|
1747
|
-
}
|
|
1748
|
-
catch (error) {
|
|
1749
|
-
this.resetActiveVoiceDictationState();
|
|
1750
|
-
this.setPhase('idle');
|
|
1751
|
-
this.clearSpeechInProgress('transcription error');
|
|
1653
|
+
const startedAt = Date.now();
|
|
1654
|
+
this.pendingVoiceSpeechStartAt = startedAt;
|
|
1655
|
+
this.sessionLogger.info({ confirmationMs: VOICE_INTERRUPT_CONFIRMATION_MS }, 'Silero VAD provisional speech_started');
|
|
1656
|
+
this.pendingVoiceSpeechTimer = setTimeout(() => {
|
|
1657
|
+
this.pendingVoiceSpeechTimer = null;
|
|
1658
|
+
if (this.pendingVoiceSpeechStartAt !== startedAt || this.speechInProgress) {
|
|
1659
|
+
return;
|
|
1660
|
+
}
|
|
1661
|
+
this.pendingVoiceSpeechStartAt = null;
|
|
1662
|
+
this.sessionLogger.info('voice_input_state emitting isSpeaking=true');
|
|
1752
1663
|
this.emit({
|
|
1753
|
-
type: '
|
|
1664
|
+
type: 'voice_input_state',
|
|
1754
1665
|
payload: {
|
|
1755
|
-
|
|
1756
|
-
timestamp: new Date(),
|
|
1757
|
-
type: 'error',
|
|
1758
|
-
content: `Transcription error: ${error instanceof Error ? error.message : String(error)}`,
|
|
1666
|
+
isSpeaking: true,
|
|
1759
1667
|
},
|
|
1760
1668
|
});
|
|
1761
|
-
|
|
1669
|
+
void this.handleVoiceSpeechStart();
|
|
1670
|
+
}, VOICE_INTERRUPT_CONFIRMATION_MS);
|
|
1671
|
+
}
|
|
1672
|
+
handleVoiceSpeechStopped() {
|
|
1673
|
+
if (this.pendingVoiceSpeechStartAt !== null) {
|
|
1674
|
+
const durationMs = Date.now() - this.pendingVoiceSpeechStartAt;
|
|
1675
|
+
this.clearPendingVoiceSpeechStart('speech-stopped-before-confirmation');
|
|
1676
|
+
this.sessionLogger.info({ durationMs, confirmationMs: VOICE_INTERRUPT_CONFIRMATION_MS }, 'Ignoring provisional voice speech start that ended before confirmation');
|
|
1677
|
+
return;
|
|
1762
1678
|
}
|
|
1679
|
+
this.sessionLogger.info('voice_input_state emitting isSpeaking=false');
|
|
1680
|
+
this.emit({
|
|
1681
|
+
type: 'voice_input_state',
|
|
1682
|
+
payload: {
|
|
1683
|
+
isSpeaking: false,
|
|
1684
|
+
},
|
|
1685
|
+
});
|
|
1763
1686
|
}
|
|
1764
1687
|
/**
|
|
1765
1688
|
* Handle text message to agent (with optional image attachments)
|
|
@@ -4959,18 +4882,37 @@ export class Session {
|
|
|
4959
4882
|
if (!this.isVoiceMode) {
|
|
4960
4883
|
this.sessionLogger.warn('Received voice_audio_chunk while voice mode is disabled; transcript will be emitted but voice assistant turn is skipped');
|
|
4961
4884
|
}
|
|
4962
|
-
await this.handleVoiceSpeechStart();
|
|
4963
4885
|
const chunkFormat = msg.format || 'audio/wav';
|
|
4964
4886
|
if (this.isVoiceMode) {
|
|
4965
|
-
|
|
4966
|
-
|
|
4967
|
-
|
|
4968
|
-
|
|
4969
|
-
|
|
4887
|
+
if (!this.voiceTurnController) {
|
|
4888
|
+
throw new Error('Voice mode is enabled but the voice turn controller is not running');
|
|
4889
|
+
}
|
|
4890
|
+
const chunkBytes = Buffer.byteLength(msg.audio, 'base64');
|
|
4891
|
+
this.voiceInputChunkCount += 1;
|
|
4892
|
+
this.voiceInputBytes += chunkBytes;
|
|
4893
|
+
if (this.voiceInputChunkCount === 1) {
|
|
4894
|
+
this.sessionLogger.info({
|
|
4895
|
+
format: chunkFormat,
|
|
4896
|
+
audioBytes: chunkBytes,
|
|
4897
|
+
}, 'Received first voice_audio_chunk for active voice mode');
|
|
4970
4898
|
}
|
|
4971
|
-
|
|
4972
|
-
this.
|
|
4973
|
-
|
|
4899
|
+
const now = Date.now();
|
|
4900
|
+
if (this.voiceInputChunkCount % 50 === 0 ||
|
|
4901
|
+
now - this.voiceInputWindowStartedAt >= 1000) {
|
|
4902
|
+
this.sessionLogger.info({
|
|
4903
|
+
chunkCount: this.voiceInputChunkCount,
|
|
4904
|
+
audioBytes: this.voiceInputBytes,
|
|
4905
|
+
windowMs: now - this.voiceInputWindowStartedAt,
|
|
4906
|
+
format: chunkFormat,
|
|
4907
|
+
}, 'Voice input chunk summary');
|
|
4908
|
+
this.voiceInputWindowStartedAt = now;
|
|
4909
|
+
this.voiceInputChunkCount = 0;
|
|
4910
|
+
this.voiceInputBytes = 0;
|
|
4911
|
+
}
|
|
4912
|
+
await this.voiceTurnController.appendClientChunk({
|
|
4913
|
+
audioBase64: msg.audio,
|
|
4914
|
+
format: chunkFormat,
|
|
4915
|
+
});
|
|
4974
4916
|
return;
|
|
4975
4917
|
}
|
|
4976
4918
|
const chunkBuffer = Buffer.from(msg.audio, 'base64');
|
|
@@ -5051,9 +4993,8 @@ export class Session {
|
|
|
5051
4993
|
};
|
|
5052
4994
|
}
|
|
5053
4995
|
async processCompletedAudio(audio, format) {
|
|
5054
|
-
|
|
5055
|
-
|
|
5056
|
-
this.sessionLogger.debug({ phase: this.processingPhase }, `Buffering audio segment (phase: ${this.processingPhase})`);
|
|
4996
|
+
if (this.processingPhase === 'transcribing') {
|
|
4997
|
+
this.sessionLogger.debug({ phase: this.processingPhase, segmentCount: this.pendingAudioSegments.length + 1 }, `Buffering audio segment (phase: ${this.processingPhase})`);
|
|
5057
4998
|
this.pendingAudioSegments.push({
|
|
5058
4999
|
audio,
|
|
5059
5000
|
format,
|
|
@@ -5077,6 +5018,18 @@ export class Session {
|
|
|
5077
5018
|
}
|
|
5078
5019
|
await this.processAudio(audio, format);
|
|
5079
5020
|
}
|
|
5021
|
+
async flushPendingAudioSegments(reason) {
|
|
5022
|
+
if (this.processingPhase === 'transcribing' || this.pendingAudioSegments.length === 0) {
|
|
5023
|
+
return;
|
|
5024
|
+
}
|
|
5025
|
+
const pendingSegments = [...this.pendingAudioSegments];
|
|
5026
|
+
this.pendingAudioSegments = [];
|
|
5027
|
+
this.clearBufferTimeout();
|
|
5028
|
+
this.sessionLogger.debug({ reason, segmentCount: pendingSegments.length }, `Flushing ${pendingSegments.length} buffered audio segment(s)`);
|
|
5029
|
+
const combinedAudio = Buffer.concat(pendingSegments.map((segment) => segment.audio));
|
|
5030
|
+
const combinedFormat = pendingSegments[pendingSegments.length - 1].format;
|
|
5031
|
+
await this.processAudio(combinedAudio, combinedFormat);
|
|
5032
|
+
}
|
|
5080
5033
|
/**
|
|
5081
5034
|
* Process audio through STT and then LLM
|
|
5082
5035
|
*/
|
|
@@ -5119,6 +5072,7 @@ export class Session {
|
|
|
5119
5072
|
catch (error) {
|
|
5120
5073
|
this.setPhase('idle');
|
|
5121
5074
|
this.clearSpeechInProgress('transcription error');
|
|
5075
|
+
await this.flushPendingAudioSegments('transcription error');
|
|
5122
5076
|
this.emit({
|
|
5123
5077
|
type: 'activity_log',
|
|
5124
5078
|
payload: {
|
|
@@ -5153,6 +5107,7 @@ export class Session {
|
|
|
5153
5107
|
this.sessionLogger.debug('Empty transcription (false positive), not aborting');
|
|
5154
5108
|
this.setPhase('idle');
|
|
5155
5109
|
this.clearSpeechInProgress('empty transcription');
|
|
5110
|
+
await this.flushPendingAudioSegments('empty transcription');
|
|
5156
5111
|
return;
|
|
5157
5112
|
}
|
|
5158
5113
|
// Has content - abort any in-progress stream now
|
|
@@ -5190,16 +5145,19 @@ export class Session {
|
|
|
5190
5145
|
this.setPhase('idle');
|
|
5191
5146
|
if (!this.isVoiceMode) {
|
|
5192
5147
|
this.sessionLogger.debug({ requestId: result.requestId }, 'Skipping voice agent processing because voice mode is disabled');
|
|
5148
|
+
await this.flushPendingAudioSegments('voice mode disabled');
|
|
5193
5149
|
return;
|
|
5194
5150
|
}
|
|
5195
5151
|
const agentId = this.voiceModeAgentId;
|
|
5196
5152
|
if (!agentId) {
|
|
5197
5153
|
this.sessionLogger.warn({ requestId: result.requestId }, 'Skipping voice agent processing because no agent is currently voice-enabled');
|
|
5154
|
+
await this.flushPendingAudioSegments('no active voice agent');
|
|
5198
5155
|
return;
|
|
5199
5156
|
}
|
|
5200
5157
|
// Route voice utterances through the same send path as regular text input:
|
|
5201
5158
|
// interrupt-if-running, record message, then start a new stream.
|
|
5202
5159
|
await this.handleSendAgentMessage(agentId, result.text);
|
|
5160
|
+
await this.flushPendingAudioSegments('transcription complete');
|
|
5203
5161
|
}
|
|
5204
5162
|
registerVoiceBridgeForAgent(agentId) {
|
|
5205
5163
|
this.registerVoiceSpeakHandler?.(agentId, async ({ text, signal }) => {
|
|
@@ -5280,8 +5238,6 @@ export class Session {
|
|
|
5280
5238
|
this.sessionLogger.debug({ chunks: this.audioBuffer.chunks.length, pcmBytes: this.audioBuffer.totalPCMBytes }, `Clearing partial audio buffer (${this.audioBuffer.chunks.length} chunk(s)${this.audioBuffer.isPCM ? `, ${this.audioBuffer.totalPCMBytes} PCM bytes` : ''})`);
|
|
5281
5239
|
this.audioBuffer = null;
|
|
5282
5240
|
}
|
|
5283
|
-
this.cancelActiveVoiceDictationStream('new speech turn started');
|
|
5284
|
-
this.clearVoiceModeInactivityTimeout();
|
|
5285
5241
|
this.clearBufferTimeout();
|
|
5286
5242
|
this.abortController.abort();
|
|
5287
5243
|
await this.handleAbort();
|
|
@@ -5292,6 +5248,7 @@ export class Session {
|
|
|
5292
5248
|
* Clear speech-in-progress flag once the user turn has completed
|
|
5293
5249
|
*/
|
|
5294
5250
|
clearSpeechInProgress(reason) {
|
|
5251
|
+
this.clearPendingVoiceSpeechStart(`clear-speech-in-progress:${reason}`);
|
|
5295
5252
|
if (!this.speechInProgress) {
|
|
5296
5253
|
return;
|
|
5297
5254
|
}
|
|
@@ -5321,6 +5278,11 @@ export class Session {
|
|
|
5321
5278
|
this.clearBufferTimeout();
|
|
5322
5279
|
this.bufferTimeout = setTimeout(async () => {
|
|
5323
5280
|
this.sessionLogger.debug('Buffer timeout reached, processing pending segments');
|
|
5281
|
+
if (this.processingPhase === 'transcribing') {
|
|
5282
|
+
this.sessionLogger.debug({ segmentCount: this.pendingAudioSegments.length }, 'Buffer timeout deferred because transcription is still in progress');
|
|
5283
|
+
this.setBufferTimeout();
|
|
5284
|
+
return;
|
|
5285
|
+
}
|
|
5324
5286
|
if (this.pendingAudioSegments.length > 0) {
|
|
5325
5287
|
const segments = [...this.pendingAudioSegments];
|
|
5326
5288
|
this.pendingAudioSegments = [];
|
|
@@ -5330,32 +5292,6 @@ export class Session {
|
|
|
5330
5292
|
}
|
|
5331
5293
|
}, 10000); // 10 second timeout
|
|
5332
5294
|
}
|
|
5333
|
-
setVoiceModeInactivityTimeout() {
|
|
5334
|
-
if (!this.isVoiceMode) {
|
|
5335
|
-
return;
|
|
5336
|
-
}
|
|
5337
|
-
this.clearVoiceModeInactivityTimeout();
|
|
5338
|
-
this.voiceModeInactivityTimeout = setTimeout(() => {
|
|
5339
|
-
this.voiceModeInactivityTimeout = null;
|
|
5340
|
-
if (!this.isVoiceMode || !this.activeVoiceDictationId) {
|
|
5341
|
-
return;
|
|
5342
|
-
}
|
|
5343
|
-
this.sessionLogger.warn({
|
|
5344
|
-
timeoutMs: VOICE_MODE_INACTIVITY_FLUSH_MS,
|
|
5345
|
-
dictationId: this.activeVoiceDictationId,
|
|
5346
|
-
nextSeq: this.activeVoiceDictationNextSeq,
|
|
5347
|
-
}, 'Voice mode inactivity timeout reached without isLast; finalizing active voice dictation stream');
|
|
5348
|
-
void this.finalizeActiveVoiceDictationStream('inactivity timeout').catch((error) => {
|
|
5349
|
-
this.sessionLogger.error({ err: error }, 'Failed to finalize voice dictation stream after inactivity timeout');
|
|
5350
|
-
});
|
|
5351
|
-
}, VOICE_MODE_INACTIVITY_FLUSH_MS);
|
|
5352
|
-
}
|
|
5353
|
-
clearVoiceModeInactivityTimeout() {
|
|
5354
|
-
if (this.voiceModeInactivityTimeout) {
|
|
5355
|
-
clearTimeout(this.voiceModeInactivityTimeout);
|
|
5356
|
-
this.voiceModeInactivityTimeout = null;
|
|
5357
|
-
}
|
|
5358
|
-
}
|
|
5359
5295
|
/**
|
|
5360
5296
|
* Clear buffer timeout
|
|
5361
5297
|
*/
|
|
@@ -5431,16 +5367,14 @@ export class Session {
|
|
|
5431
5367
|
// Abort any ongoing operations
|
|
5432
5368
|
this.abortController.abort();
|
|
5433
5369
|
// Clear timeouts
|
|
5434
|
-
this.clearVoiceModeInactivityTimeout();
|
|
5435
5370
|
this.clearBufferTimeout();
|
|
5436
5371
|
// Clear buffers
|
|
5437
|
-
this.cancelActiveVoiceDictationStream('session cleanup');
|
|
5438
5372
|
this.pendingAudioSegments = [];
|
|
5439
5373
|
this.audioBuffer = null;
|
|
5374
|
+
await this.stopVoiceTurnController();
|
|
5440
5375
|
// Cleanup managers
|
|
5441
5376
|
this.ttsManager.cleanup();
|
|
5442
5377
|
this.sttManager.cleanup();
|
|
5443
|
-
this.voiceStreamManager.cleanupAll();
|
|
5444
5378
|
this.dictationStreamManager.cleanupAll();
|
|
5445
5379
|
// Close MCP clients
|
|
5446
5380
|
if (this.agentMcpClient) {
|