@getpaseo/server 0.1.27 → 0.1.29

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (149):
  1. package/dist/server/client/daemon-client.d.ts +1 -1
  2. package/dist/server/client/daemon-client.d.ts.map +1 -1
  3. package/dist/server/client/daemon-client.js +1 -1
  4. package/dist/server/client/daemon-client.js.map +1 -1
  5. package/dist/server/server/agent/agent-response-loop.js +1 -1
  6. package/dist/server/server/agent/agent-response-loop.js.map +1 -1
  7. package/dist/server/server/agent/provider-launch-config.d.ts +13 -2
  8. package/dist/server/server/agent/provider-launch-config.d.ts.map +1 -1
  9. package/dist/server/server/agent/provider-launch-config.js +17 -9
  10. package/dist/server/server/agent/provider-launch-config.js.map +1 -1
  11. package/dist/server/server/agent/provider-manifest.d.ts.map +1 -1
  12. package/dist/server/server/agent/provider-manifest.js +10 -5
  13. package/dist/server/server/agent/provider-manifest.js.map +1 -1
  14. package/dist/server/server/agent/providers/claude-agent.d.ts +3 -1
  15. package/dist/server/server/agent/providers/claude-agent.d.ts.map +1 -1
  16. package/dist/server/server/agent/providers/claude-agent.js +5 -1
  17. package/dist/server/server/agent/providers/claude-agent.js.map +1 -1
  18. package/dist/server/server/agent/providers/codex-app-server-agent.d.ts.map +1 -1
  19. package/dist/server/server/agent/providers/codex-app-server-agent.js +30 -1
  20. package/dist/server/server/agent/providers/codex-app-server-agent.js.map +1 -1
  21. package/dist/server/server/agent/providers/opencode-agent.d.ts +1 -0
  22. package/dist/server/server/agent/providers/opencode-agent.d.ts.map +1 -1
  23. package/dist/server/server/agent/providers/opencode-agent.js +110 -9
  24. package/dist/server/server/agent/providers/opencode-agent.js.map +1 -1
  25. package/dist/server/server/agent/tts-manager.d.ts +8 -1
  26. package/dist/server/server/agent/tts-manager.d.ts.map +1 -1
  27. package/dist/server/server/agent/tts-manager.js +215 -108
  28. package/dist/server/server/agent/tts-manager.js.map +1 -1
  29. package/dist/server/server/bootstrap.d.ts +2 -2
  30. package/dist/server/server/bootstrap.d.ts.map +1 -1
  31. package/dist/server/server/bootstrap.js +26 -5
  32. package/dist/server/server/bootstrap.js.map +1 -1
  33. package/dist/server/server/exports.d.ts +1 -0
  34. package/dist/server/server/exports.d.ts.map +1 -1
  35. package/dist/server/server/exports.js +2 -0
  36. package/dist/server/server/exports.js.map +1 -1
  37. package/dist/server/server/persisted-config.d.ts +25 -0
  38. package/dist/server/server/persisted-config.d.ts.map +1 -1
  39. package/dist/server/server/persisted-config.js +6 -0
  40. package/dist/server/server/persisted-config.js.map +1 -1
  41. package/dist/server/server/session.d.ts +16 -19
  42. package/dist/server/server/session.d.ts.map +1 -1
  43. package/dist/server/server/session.js +171 -237
  44. package/dist/server/server/session.js.map +1 -1
  45. package/dist/server/server/speech/providers/local/runtime.d.ts +2 -0
  46. package/dist/server/server/speech/providers/local/runtime.d.ts.map +1 -1
  47. package/dist/server/server/speech/providers/local/runtime.js +7 -0
  48. package/dist/server/server/speech/providers/local/runtime.js.map +1 -1
  49. package/dist/server/server/speech/providers/local/sherpa/assets/silero_vad.onnx +0 -0
  50. package/dist/server/server/speech/providers/local/sherpa/sherpa-onnx-node-loader.d.ts +2 -0
  51. package/dist/server/server/speech/providers/local/sherpa/sherpa-onnx-node-loader.d.ts.map +1 -1
  52. package/dist/server/server/speech/providers/local/sherpa/sherpa-onnx-node-loader.js.map +1 -1
  53. package/dist/server/server/speech/providers/local/sherpa/silero-vad-provider.d.ts +13 -0
  54. package/dist/server/server/speech/providers/local/sherpa/silero-vad-provider.d.ts.map +1 -0
  55. package/dist/server/server/speech/providers/local/sherpa/silero-vad-provider.js +23 -0
  56. package/dist/server/server/speech/providers/local/sherpa/silero-vad-provider.js.map +1 -0
  57. package/dist/server/server/speech/providers/local/sherpa/silero-vad-session.d.ts +32 -0
  58. package/dist/server/server/speech/providers/local/sherpa/silero-vad-session.d.ts.map +1 -0
  59. package/dist/server/server/speech/providers/local/sherpa/silero-vad-session.js +107 -0
  60. package/dist/server/server/speech/providers/local/sherpa/silero-vad-session.js.map +1 -0
  61. package/dist/server/server/speech/providers/openai/runtime.d.ts +2 -0
  62. package/dist/server/server/speech/providers/openai/runtime.d.ts.map +1 -1
  63. package/dist/server/server/speech/providers/openai/runtime.js +2 -0
  64. package/dist/server/server/speech/providers/openai/runtime.js.map +1 -1
  65. package/dist/server/server/speech/speech-config-resolver.d.ts.map +1 -1
  66. package/dist/server/server/speech/speech-config-resolver.js +35 -14
  67. package/dist/server/server/speech/speech-config-resolver.js.map +1 -1
  68. package/dist/server/server/speech/speech-runtime.d.ts +3 -1
  69. package/dist/server/server/speech/speech-runtime.d.ts.map +1 -1
  70. package/dist/server/server/speech/speech-runtime.js +39 -6
  71. package/dist/server/server/speech/speech-runtime.js.map +1 -1
  72. package/dist/server/server/speech/speech-types.d.ts +1 -0
  73. package/dist/server/server/speech/speech-types.d.ts.map +1 -1
  74. package/dist/server/server/speech/turn-detection-provider.d.ts +22 -0
  75. package/dist/server/server/speech/turn-detection-provider.d.ts.map +1 -0
  76. package/dist/server/server/speech/turn-detection-provider.js +2 -0
  77. package/dist/server/server/speech/turn-detection-provider.js.map +1 -0
  78. package/dist/server/server/voice/fixed-duration-pcm-ring-buffer.d.ts +16 -0
  79. package/dist/server/server/voice/fixed-duration-pcm-ring-buffer.d.ts.map +1 -0
  80. package/dist/server/server/voice/fixed-duration-pcm-ring-buffer.js +35 -0
  81. package/dist/server/server/voice/fixed-duration-pcm-ring-buffer.js.map +1 -0
  82. package/dist/server/server/voice/voice-turn-controller.d.ts +34 -0
  83. package/dist/server/server/voice/voice-turn-controller.d.ts.map +1 -0
  84. package/dist/server/server/voice/voice-turn-controller.js +161 -0
  85. package/dist/server/server/voice/voice-turn-controller.js.map +1 -0
  86. package/dist/server/server/websocket-server.d.ts +3 -0
  87. package/dist/server/server/websocket-server.d.ts.map +1 -1
  88. package/dist/server/server/websocket-server.js +5 -1
  89. package/dist/server/server/websocket-server.js.map +1 -1
  90. package/dist/server/server/workspace-registry.d.ts +2 -0
  91. package/dist/server/server/workspace-registry.d.ts.map +1 -1
  92. package/dist/server/server/workspace-registry.js +11 -4
  93. package/dist/server/server/workspace-registry.js.map +1 -1
  94. package/dist/server/shared/messages.d.ts +97 -0
  95. package/dist/server/shared/messages.d.ts.map +1 -1
  96. package/dist/server/shared/messages.js +7 -0
  97. package/dist/server/shared/messages.js.map +1 -1
  98. package/dist/server/shared/tool-call-display.d.ts.map +1 -1
  99. package/dist/server/shared/tool-call-display.js +58 -39
  100. package/dist/server/shared/tool-call-display.js.map +1 -1
  101. package/dist/src/server/agent/agent-response-loop.js +1 -1
  102. package/dist/src/server/agent/agent-response-loop.js.map +1 -1
  103. package/dist/src/server/agent/provider-launch-config.js +17 -9
  104. package/dist/src/server/agent/provider-launch-config.js.map +1 -1
  105. package/dist/src/server/agent/provider-manifest.js +10 -5
  106. package/dist/src/server/agent/provider-manifest.js.map +1 -1
  107. package/dist/src/server/agent/providers/claude-agent.js +5 -1
  108. package/dist/src/server/agent/providers/claude-agent.js.map +1 -1
  109. package/dist/src/server/agent/providers/codex-app-server-agent.js +30 -1
  110. package/dist/src/server/agent/providers/codex-app-server-agent.js.map +1 -1
  111. package/dist/src/server/agent/providers/opencode-agent.js +110 -9
  112. package/dist/src/server/agent/providers/opencode-agent.js.map +1 -1
  113. package/dist/src/server/agent/tts-manager.js +215 -108
  114. package/dist/src/server/agent/tts-manager.js.map +1 -1
  115. package/dist/src/server/bootstrap.js +26 -5
  116. package/dist/src/server/bootstrap.js.map +1 -1
  117. package/dist/src/server/persisted-config.js +6 -0
  118. package/dist/src/server/persisted-config.js.map +1 -1
  119. package/dist/src/server/session.js +171 -237
  120. package/dist/src/server/session.js.map +1 -1
  121. package/dist/src/server/speech/providers/local/runtime.js +7 -0
  122. package/dist/src/server/speech/providers/local/runtime.js.map +1 -1
  123. package/dist/src/server/speech/providers/local/sherpa/sherpa-onnx-node-loader.js.map +1 -1
  124. package/dist/src/server/speech/providers/local/sherpa/silero-vad-provider.js +23 -0
  125. package/dist/src/server/speech/providers/local/sherpa/silero-vad-provider.js.map +1 -0
  126. package/dist/src/server/speech/providers/local/sherpa/silero-vad-session.js +107 -0
  127. package/dist/src/server/speech/providers/local/sherpa/silero-vad-session.js.map +1 -0
  128. package/dist/src/server/speech/providers/openai/runtime.js +2 -0
  129. package/dist/src/server/speech/providers/openai/runtime.js.map +1 -1
  130. package/dist/src/server/speech/speech-config-resolver.js +35 -14
  131. package/dist/src/server/speech/speech-config-resolver.js.map +1 -1
  132. package/dist/src/server/speech/speech-runtime.js +39 -6
  133. package/dist/src/server/speech/speech-runtime.js.map +1 -1
  134. package/dist/src/server/speech/turn-detection-provider.js +2 -0
  135. package/dist/src/server/speech/turn-detection-provider.js.map +1 -0
  136. package/dist/src/server/voice/fixed-duration-pcm-ring-buffer.js +35 -0
  137. package/dist/src/server/voice/fixed-duration-pcm-ring-buffer.js.map +1 -0
  138. package/dist/src/server/voice/voice-turn-controller.js +161 -0
  139. package/dist/src/server/voice/voice-turn-controller.js.map +1 -0
  140. package/dist/src/server/websocket-server.js +5 -1
  141. package/dist/src/server/websocket-server.js.map +1 -1
  142. package/dist/src/server/workspace-registry.js +11 -4
  143. package/dist/src/server/workspace-registry.js.map +1 -1
  144. package/dist/src/shared/messages.js +7 -0
  145. package/dist/src/shared/messages.js.map +1 -1
  146. package/dist/src/shared/tool-call-display.js +58 -39
  147. package/dist/src/shared/tool-call-display.js.map +1 -1
  148. package/package.json +4 -3
  149. package/src/server/speech/providers/local/sherpa/assets/silero_vad.onnx +0 -0
@@ -13,6 +13,7 @@ import { STTManager } from './agent/stt-manager.js';
13
13
  import { maybePersistTtsDebugAudio } from './agent/tts-debug.js';
14
14
  import { isPaseoDictationDebugEnabled } from './agent/recordings-debug.js';
15
15
  import { DictationStreamManager, } from './dictation/dictation-stream-manager.js';
16
+ import { createVoiceTurnController, } from './voice/voice-turn-controller.js';
16
17
  import { buildConfigOverrides, buildSessionConfig, extractTimestamps } from './persistence-hooks.js';
17
18
  import { experimental_createMCPClient } from 'ai';
18
19
  import { buildProviderRegistry } from './agent/provider-registry.js';
@@ -35,6 +36,7 @@ import { getProjectIcon } from '../utils/project-icon.js';
35
36
  import { expandTilde } from '../utils/path.js';
36
37
  import { searchHomeDirectories, searchWorkspaceEntries } from '../utils/directory-suggestions.js';
37
38
  import { ensureLocalSpeechModels, getLocalSpeechModelDir, listLocalSpeechModels, } from './speech/providers/local/models.js';
39
+ import { toResolver } from './speech/provider-resolver.js';
38
40
  import { resolveClientMessageId } from './client-message-id.js';
39
41
  const execAsync = promisify(exec);
40
42
  const MAX_INITIAL_AGENT_TITLE_CHARS = Math.min(60, MAX_EXPLICIT_AGENT_TITLE_CHARS);
@@ -88,11 +90,10 @@ const PCM_BITS_PER_SAMPLE = 16;
88
90
  const PCM_BYTES_PER_MS = (PCM_SAMPLE_RATE * PCM_CHANNELS * (PCM_BITS_PER_SAMPLE / 8)) / 1000;
89
91
  const MIN_STREAMING_SEGMENT_DURATION_MS = 1000;
90
92
  const MIN_STREAMING_SEGMENT_BYTES = Math.round(PCM_BYTES_PER_MS * MIN_STREAMING_SEGMENT_DURATION_MS);
91
- const VOICE_MODE_INACTIVITY_FLUSH_MS = 4500;
92
- const VOICE_INTERNAL_DICTATION_ID_PREFIX = '__voice_turn__:';
93
93
  const SAFE_GIT_REF_PATTERN = /^[A-Za-z0-9._\/-]+$/;
94
94
  const AgentIdSchema = z.string().uuid();
95
95
  const VOICE_MCP_SERVER_NAME = 'paseo_voice';
96
+ const VOICE_INTERRUPT_CONFIRMATION_MS = 500;
96
97
  class VoiceFeatureUnavailableError extends Error {
97
98
  constructor(context) {
98
99
  super(context.message);
@@ -161,19 +162,16 @@ export class Session {
161
162
  // Voice mode state
162
163
  this.isVoiceMode = false;
163
164
  this.speechInProgress = false;
165
+ this.pendingVoiceSpeechStartAt = null;
166
+ this.pendingVoiceSpeechTimer = null;
167
+ this.voiceTurnController = null;
168
+ this.voiceInputChunkCount = 0;
169
+ this.voiceInputBytes = 0;
170
+ this.voiceInputWindowStartedAt = Date.now();
164
171
  // Audio buffering for interruption handling
165
172
  this.pendingAudioSegments = [];
166
173
  this.bufferTimeout = null;
167
- this.voiceModeInactivityTimeout = null;
168
174
  this.audioBuffer = null;
169
- this.activeVoiceDictationId = null;
170
- this.activeVoiceDictationFormat = null;
171
- this.activeVoiceDictationNextSeq = 0;
172
- this.activeVoiceDictationStartPromise = null;
173
- this.activeVoiceDictationFinalizePromise = null;
174
- this.activeVoiceDictationResultPromise = null;
175
- this.activeVoiceDictationResolve = null;
176
- this.activeVoiceDictationReject = null;
177
175
  // Optional TTS debug capture (persisted per utterance)
178
176
  this.ttsDebugStreams = new Map();
179
177
  // Per-session MCP client and tools
@@ -221,6 +219,7 @@ export class Session {
221
219
  this.unsubscribeTerminalsChanged = this.terminalManager.subscribeTerminalsChanged((event) => this.handleTerminalsChanged(event));
222
220
  }
223
221
  this.voiceAgentMcpStdio = voice?.voiceAgentMcpStdio ?? null;
222
+ this.resolveVoiceTurnDetection = toResolver(voice?.turnDetection ?? null);
224
223
  const configuredModelsDir = dictation?.localModels?.modelsDir?.trim();
225
224
  this.localSpeechModelsDir =
226
225
  configuredModelsDir && configuredModelsDir.length > 0
@@ -257,13 +256,6 @@ export class Session {
257
256
  stt: dictation?.stt ?? null,
258
257
  finalTimeoutMs: dictation?.finalTimeoutMs,
259
258
  });
260
- this.voiceStreamManager = new DictationStreamManager({
261
- logger: this.sessionLogger.child({ stream: 'voice-internal' }),
262
- sessionId: this.sessionId,
263
- emit: (msg) => this.handleDictationManagerMessage(msg),
264
- stt: stt,
265
- finalTimeoutMs: dictation?.finalTimeoutMs,
266
- });
267
259
  // Initialize agent MCP client asynchronously
268
260
  void this.initializeAgentMcp();
269
261
  this.subscribeToAgentEvents();
@@ -1409,7 +1401,9 @@ export class Session {
1409
1401
  * Handle voice mode toggle
1410
1402
  */
1411
1403
  async handleSetVoiceMode(enabled, agentId, requestId) {
1404
+ const startedAt = Date.now();
1412
1405
  try {
1406
+ this.sessionLogger.info({ enabled, requestedAgentId: agentId ?? null, requestId: requestId ?? null }, 'set_voice_mode started');
1413
1407
  if (enabled) {
1414
1408
  const unavailable = this.resolveVoiceFeatureUnavailableContext('voice_mode');
1415
1409
  if (unavailable) {
@@ -1419,15 +1413,26 @@ export class Session {
1419
1413
  if (this.isVoiceMode &&
1420
1414
  this.voiceModeAgentId &&
1421
1415
  this.voiceModeAgentId !== normalizedAgentId) {
1416
+ this.sessionLogger.info({
1417
+ previousAgentId: this.voiceModeAgentId,
1418
+ nextAgentId: normalizedAgentId,
1419
+ elapsedMs: Date.now() - startedAt,
1420
+ }, 'set_voice_mode disabling previous active voice agent');
1422
1421
  await this.disableVoiceModeForActiveAgent(true);
1423
1422
  }
1424
1423
  if (!this.isVoiceMode || this.voiceModeAgentId !== normalizedAgentId) {
1424
+ this.sessionLogger.info({ agentId: normalizedAgentId, elapsedMs: Date.now() - startedAt }, 'set_voice_mode enabling voice for agent');
1425
1425
  const refreshedAgentId = await this.enableVoiceModeForAgent(normalizedAgentId);
1426
1426
  this.voiceModeAgentId = refreshedAgentId;
1427
+ this.sessionLogger.info({ agentId: refreshedAgentId, elapsedMs: Date.now() - startedAt }, 'set_voice_mode agent enable complete');
1427
1428
  }
1429
+ this.sessionLogger.info({ agentId: this.voiceModeAgentId, elapsedMs: Date.now() - startedAt }, 'set_voice_mode starting voice turn controller');
1430
+ await this.startVoiceTurnController();
1431
+ this.sessionLogger.info({ agentId: this.voiceModeAgentId, elapsedMs: Date.now() - startedAt }, 'set_voice_mode voice turn controller started');
1428
1432
  this.isVoiceMode = true;
1429
1433
  this.sessionLogger.info({
1430
1434
  agentId: this.voiceModeAgentId,
1435
+ elapsedMs: Date.now() - startedAt,
1431
1436
  }, 'Voice mode enabled for existing agent');
1432
1437
  if (requestId) {
1433
1438
  this.emit({
@@ -1443,9 +1448,10 @@ export class Session {
1443
1448
  }
1444
1449
  return;
1445
1450
  }
1451
+ this.sessionLogger.info({ agentId: this.voiceModeAgentId, elapsedMs: Date.now() - startedAt }, 'set_voice_mode disabling active voice mode');
1446
1452
  await this.disableVoiceModeForActiveAgent(true);
1447
1453
  this.isVoiceMode = false;
1448
- this.sessionLogger.info('Voice mode disabled');
1454
+ this.sessionLogger.info({ elapsedMs: Date.now() - startedAt }, 'Voice mode disabled');
1449
1455
  if (requestId) {
1450
1456
  this.emit({
1451
1457
  type: 'set_voice_mode_response',
@@ -1466,6 +1472,7 @@ export class Session {
1466
1472
  err: error,
1467
1473
  enabled,
1468
1474
  requestedAgentId: agentId ?? null,
1475
+ elapsedMs: Date.now() - startedAt,
1469
1476
  }, 'set_voice_mode failed');
1470
1477
  if (requestId) {
1471
1478
  this.emit({
@@ -1513,12 +1520,17 @@ export class Session {
1513
1520
  };
1514
1521
  }
1515
1522
  async enableVoiceModeForAgent(agentId) {
1523
+ const startedAt = Date.now();
1516
1524
  const ensureVoiceSocket = this.ensureVoiceMcpSocketForAgent;
1517
1525
  if (!ensureVoiceSocket) {
1518
1526
  throw new Error('Voice MCP socket bridge is not configured');
1519
1527
  }
1528
+ this.sessionLogger.info({ agentId }, 'enableVoiceModeForAgent.ensureAgentLoaded.start');
1520
1529
  const existing = await this.ensureAgentLoaded(agentId);
1530
+ this.sessionLogger.info({ agentId, elapsedMs: Date.now() - startedAt }, 'enableVoiceModeForAgent.ensureAgentLoaded.done');
1531
+ this.sessionLogger.info({ agentId }, 'enableVoiceModeForAgent.ensureVoiceSocket.start');
1521
1532
  const socketPath = await ensureVoiceSocket(agentId);
1533
+ this.sessionLogger.info({ agentId, socketPath, elapsedMs: Date.now() - startedAt }, 'enableVoiceModeForAgent.ensureVoiceSocket.done');
1522
1534
  this.registerVoiceBridgeForAgent(agentId);
1523
1535
  const baseConfig = {
1524
1536
  systemPrompt: stripVoiceModeSystemPrompt(existing.config.systemPrompt),
@@ -1530,7 +1542,9 @@ export class Session {
1530
1542
  mcpServers: this.buildVoiceModeMcpServers(baseConfig.mcpServers, socketPath),
1531
1543
  };
1532
1544
  try {
1545
+ this.sessionLogger.info({ agentId, elapsedMs: Date.now() - startedAt }, 'enableVoiceModeForAgent.reloadAgentSession.start');
1533
1546
  const refreshed = await this.agentManager.reloadAgentSession(agentId, refreshOverrides);
1547
+ this.sessionLogger.info({ agentId, refreshedAgentId: refreshed.id, elapsedMs: Date.now() - startedAt }, 'enableVoiceModeForAgent.reloadAgentSession.done');
1534
1548
  return refreshed.id;
1535
1549
  }
1536
1550
  catch (error) {
@@ -1542,8 +1556,7 @@ export class Session {
1542
1556
  }
1543
1557
  }
1544
1558
  async disableVoiceModeForActiveAgent(restoreAgentConfig) {
1545
- this.clearVoiceModeInactivityTimeout();
1546
- this.cancelActiveVoiceDictationStream('voice mode disabled');
1559
+ await this.stopVoiceTurnController();
1547
1560
  const agentId = this.voiceModeAgentId;
1548
1561
  if (!agentId) {
1549
1562
  this.voiceModeBaseConfig = null;
@@ -1569,197 +1582,107 @@ export class Session {
1569
1582
  this.voiceModeBaseConfig = null;
1570
1583
  this.voiceModeAgentId = null;
1571
1584
  }
1572
- isInternalVoiceDictationId(dictationId) {
1573
- return dictationId.startsWith(VOICE_INTERNAL_DICTATION_ID_PREFIX);
1574
- }
1575
1585
  handleDictationManagerMessage(msg) {
1576
- if (msg.type === 'activity_log') {
1577
- const metadata = msg.payload.metadata;
1578
- const dictationId = metadata && typeof metadata.dictationId === 'string' ? metadata.dictationId : null;
1579
- if (dictationId && this.isInternalVoiceDictationId(dictationId)) {
1580
- return;
1581
- }
1582
- this.emit(msg);
1583
- return;
1584
- }
1585
- const payloadWithDictationId = msg.payload;
1586
- const dictationId = payloadWithDictationId && typeof payloadWithDictationId.dictationId === 'string'
1587
- ? payloadWithDictationId.dictationId
1588
- : null;
1589
- if (!dictationId || !this.isInternalVoiceDictationId(dictationId)) {
1590
- this.emit(msg);
1591
- return;
1592
- }
1593
- if (msg.type === 'dictation_stream_final') {
1594
- if (dictationId !== this.activeVoiceDictationId || !this.activeVoiceDictationResolve) {
1595
- return;
1596
- }
1597
- this.activeVoiceDictationResolve({
1598
- text: msg.payload.text,
1599
- ...(msg.payload.debugRecordingPath
1600
- ? { debugRecordingPath: msg.payload.debugRecordingPath }
1601
- : {}),
1602
- });
1603
- return;
1604
- }
1605
- if (msg.type === 'dictation_stream_error') {
1606
- if (dictationId !== this.activeVoiceDictationId || !this.activeVoiceDictationReject) {
1607
- return;
1608
- }
1609
- this.activeVoiceDictationReject(new Error(msg.payload.error));
1610
- return;
1611
- }
1612
- // Ack/partial messages for internal voice dictation are consumed server-side.
1613
- }
1614
- resetActiveVoiceDictationState() {
1615
- this.activeVoiceDictationId = null;
1616
- this.activeVoiceDictationFormat = null;
1617
- this.activeVoiceDictationNextSeq = 0;
1618
- this.activeVoiceDictationStartPromise = null;
1619
- this.activeVoiceDictationFinalizePromise = null;
1620
- this.activeVoiceDictationResultPromise = null;
1621
- this.activeVoiceDictationResolve = null;
1622
- this.activeVoiceDictationReject = null;
1623
- }
1624
- cancelActiveVoiceDictationStream(reason) {
1625
- const dictationId = this.activeVoiceDictationId;
1626
- if (!dictationId) {
1627
- return;
1628
- }
1629
- this.sessionLogger.debug({ dictationId, reason }, 'Cancelling active internal voice dictation stream');
1630
- if (this.activeVoiceDictationReject) {
1631
- this.activeVoiceDictationReject(new Error(`Voice dictation cancelled: ${reason}`));
1632
- }
1633
- this.voiceStreamManager.handleCancel(dictationId);
1634
- this.resetActiveVoiceDictationState();
1586
+ this.emit(msg);
1635
1587
  }
1636
- async ensureActiveVoiceDictationStream(format) {
1637
- if (this.activeVoiceDictationId && this.activeVoiceDictationFormat === format) {
1638
- if (this.activeVoiceDictationStartPromise) {
1639
- await this.activeVoiceDictationStartPromise;
1640
- }
1588
+ async startVoiceTurnController() {
1589
+ if (this.voiceTurnController) {
1590
+ this.sessionLogger.info('startVoiceTurnController skipped: already running');
1641
1591
  return;
1642
1592
  }
1643
- if (this.activeVoiceDictationId) {
1644
- await this.finalizeActiveVoiceDictationStream('voice format changed');
1645
- }
1646
- const dictationId = `${VOICE_INTERNAL_DICTATION_ID_PREFIX}${uuidv4()}`;
1647
- let resolve = null;
1648
- let reject = null;
1649
- const resultPromise = new Promise((resolveFn, rejectFn) => {
1650
- resolve = resolveFn;
1651
- reject = rejectFn;
1652
- });
1653
- // Prevent process-level unhandled rejection warnings when cancellation races are resolved later.
1654
- void resultPromise.catch(() => undefined);
1655
- this.activeVoiceDictationId = dictationId;
1656
- this.activeVoiceDictationFormat = format;
1657
- this.activeVoiceDictationNextSeq = 0;
1658
- this.activeVoiceDictationFinalizePromise = null;
1659
- this.activeVoiceDictationResultPromise = resultPromise;
1660
- this.activeVoiceDictationResolve = resolve;
1661
- this.activeVoiceDictationReject = reject;
1662
- this.setPhase('transcribing');
1663
- this.emit({
1664
- type: 'activity_log',
1665
- payload: {
1666
- id: uuidv4(),
1667
- timestamp: new Date(),
1668
- type: 'system',
1669
- content: 'Transcribing audio...',
1593
+ const turnDetection = this.resolveVoiceTurnDetection();
1594
+ if (!turnDetection) {
1595
+ throw new Error('Voice turn detection is not configured');
1596
+ }
1597
+ this.sessionLogger.info({ providerId: turnDetection.id }, 'startVoiceTurnController creating controller');
1598
+ const controller = createVoiceTurnController({
1599
+ logger: this.sessionLogger.child({ component: 'voice-turn-controller' }),
1600
+ turnDetection,
1601
+ utteranceSink: {
1602
+ submitUtterance: async ({ pcm16, format, sampleRate, startedAt, endedAt }) => {
1603
+ this.sessionLogger.debug({
1604
+ audioBytes: pcm16.length,
1605
+ sampleRate,
1606
+ startedAt,
1607
+ endedAt,
1608
+ durationMs: Math.max(0, endedAt - startedAt),
1609
+ }, 'Submitting detected voice utterance');
1610
+ await this.processCompletedAudio(pcm16, format);
1611
+ },
1612
+ },
1613
+ callbacks: {
1614
+ onSpeechStarted: async () => {
1615
+ this.handleProvisionalVoiceSpeechStarted();
1616
+ },
1617
+ onSpeechStopped: async () => {
1618
+ this.handleVoiceSpeechStopped();
1619
+ },
1620
+ onError: (error) => {
1621
+ this.sessionLogger.error({ err: error }, 'Voice turn controller failed');
1622
+ },
1670
1623
  },
1671
1624
  });
1672
- const startPromise = this.voiceStreamManager.handleStart(dictationId, format);
1673
- this.activeVoiceDictationStartPromise = startPromise;
1674
- try {
1675
- await startPromise;
1676
- }
1677
- catch (error) {
1678
- this.resetActiveVoiceDictationState();
1679
- throw error;
1680
- }
1681
- finally {
1682
- if (this.activeVoiceDictationId === dictationId) {
1683
- this.activeVoiceDictationStartPromise = null;
1684
- }
1685
- }
1686
- }
1687
- async appendToActiveVoiceDictationStream(audioBase64, format) {
1688
- if (this.activeVoiceDictationFinalizePromise) {
1689
- await this.activeVoiceDictationFinalizePromise.catch(() => undefined);
1690
- }
1691
- await this.ensureActiveVoiceDictationStream(format);
1692
- const dictationId = this.activeVoiceDictationId;
1693
- if (!dictationId) {
1694
- throw new Error('Voice dictation stream did not initialize');
1695
- }
1696
- const seq = this.activeVoiceDictationNextSeq;
1697
- this.activeVoiceDictationNextSeq += 1;
1698
- await this.voiceStreamManager.handleChunk({
1699
- dictationId,
1700
- seq,
1701
- audioBase64,
1702
- format,
1703
- });
1625
+ this.sessionLogger.info('startVoiceTurnController connecting controller');
1626
+ await controller.start();
1627
+ this.voiceTurnController = controller;
1628
+ this.sessionLogger.info('startVoiceTurnController connected');
1704
1629
  }
1705
- async finalizeActiveVoiceDictationStream(reason) {
1706
- const dictationId = this.activeVoiceDictationId;
1707
- if (!dictationId) {
1630
+ async stopVoiceTurnController() {
1631
+ if (!this.voiceTurnController) {
1708
1632
  return;
1709
1633
  }
1710
- this.clearVoiceModeInactivityTimeout();
1711
- if (this.activeVoiceDictationStartPromise) {
1712
- await this.activeVoiceDictationStartPromise;
1634
+ this.clearPendingVoiceSpeechStart('turn-controller-stop');
1635
+ const controller = this.voiceTurnController;
1636
+ this.voiceTurnController = null;
1637
+ await controller.stop();
1638
+ }
1639
+ clearPendingVoiceSpeechStart(reason) {
1640
+ if (this.pendingVoiceSpeechTimer) {
1641
+ clearTimeout(this.pendingVoiceSpeechTimer);
1642
+ this.pendingVoiceSpeechTimer = null;
1713
1643
  }
1714
- if (this.activeVoiceDictationFinalizePromise) {
1715
- await this.activeVoiceDictationFinalizePromise;
1716
- return;
1644
+ if (this.pendingVoiceSpeechStartAt !== null) {
1645
+ this.sessionLogger.debug({ reason }, 'Clearing provisional voice speech start');
1646
+ this.pendingVoiceSpeechStartAt = null;
1717
1647
  }
1718
- const finalSeq = this.activeVoiceDictationNextSeq - 1;
1719
- const resultPromise = this.activeVoiceDictationResultPromise;
1720
- if (!resultPromise) {
1721
- this.resetActiveVoiceDictationState();
1648
+ }
1649
+ handleProvisionalVoiceSpeechStarted() {
1650
+ if (this.speechInProgress || this.pendingVoiceSpeechTimer) {
1722
1651
  return;
1723
1652
  }
1724
- this.activeVoiceDictationFinalizePromise = (async () => {
1725
- this.sessionLogger.debug({ dictationId, finalSeq, reason }, 'Finalizing internal voice dictation stream');
1726
- await this.voiceStreamManager.handleFinish(dictationId, finalSeq);
1727
- const result = await resultPromise;
1728
- this.resetActiveVoiceDictationState();
1729
- const requestId = uuidv4();
1730
- const transcriptText = result.text.trim();
1731
- this.sessionLogger.info({
1732
- requestId,
1733
- isVoiceMode: this.isVoiceMode,
1734
- transcriptLength: transcriptText.length,
1735
- transcript: transcriptText,
1736
- }, 'Transcription result');
1737
- await this.handleTranscriptionResultPayload({
1738
- text: result.text,
1739
- requestId,
1740
- ...(result.debugRecordingPath
1741
- ? { debugRecordingPath: result.debugRecordingPath, format: 'audio/wav' }
1742
- : {}),
1743
- });
1744
- })();
1745
- try {
1746
- await this.activeVoiceDictationFinalizePromise;
1747
- }
1748
- catch (error) {
1749
- this.resetActiveVoiceDictationState();
1750
- this.setPhase('idle');
1751
- this.clearSpeechInProgress('transcription error');
1653
+ const startedAt = Date.now();
1654
+ this.pendingVoiceSpeechStartAt = startedAt;
1655
+ this.sessionLogger.info({ confirmationMs: VOICE_INTERRUPT_CONFIRMATION_MS }, 'Silero VAD provisional speech_started');
1656
+ this.pendingVoiceSpeechTimer = setTimeout(() => {
1657
+ this.pendingVoiceSpeechTimer = null;
1658
+ if (this.pendingVoiceSpeechStartAt !== startedAt || this.speechInProgress) {
1659
+ return;
1660
+ }
1661
+ this.pendingVoiceSpeechStartAt = null;
1662
+ this.sessionLogger.info('voice_input_state emitting isSpeaking=true');
1752
1663
  this.emit({
1753
- type: 'activity_log',
1664
+ type: 'voice_input_state',
1754
1665
  payload: {
1755
- id: uuidv4(),
1756
- timestamp: new Date(),
1757
- type: 'error',
1758
- content: `Transcription error: ${error instanceof Error ? error.message : String(error)}`,
1666
+ isSpeaking: true,
1759
1667
  },
1760
1668
  });
1761
- throw error;
1669
+ void this.handleVoiceSpeechStart();
1670
+ }, VOICE_INTERRUPT_CONFIRMATION_MS);
1671
+ }
1672
+ handleVoiceSpeechStopped() {
1673
+ if (this.pendingVoiceSpeechStartAt !== null) {
1674
+ const durationMs = Date.now() - this.pendingVoiceSpeechStartAt;
1675
+ this.clearPendingVoiceSpeechStart('speech-stopped-before-confirmation');
1676
+ this.sessionLogger.info({ durationMs, confirmationMs: VOICE_INTERRUPT_CONFIRMATION_MS }, 'Ignoring provisional voice speech start that ended before confirmation');
1677
+ return;
1762
1678
  }
1679
+ this.sessionLogger.info('voice_input_state emitting isSpeaking=false');
1680
+ this.emit({
1681
+ type: 'voice_input_state',
1682
+ payload: {
1683
+ isSpeaking: false,
1684
+ },
1685
+ });
1763
1686
  }
1764
1687
  /**
1765
1688
  * Handle text message to agent (with optional image attachments)
@@ -4959,18 +4882,37 @@ export class Session {
4959
4882
  if (!this.isVoiceMode) {
4960
4883
  this.sessionLogger.warn('Received voice_audio_chunk while voice mode is disabled; transcript will be emitted but voice assistant turn is skipped');
4961
4884
  }
4962
- await this.handleVoiceSpeechStart();
4963
4885
  const chunkFormat = msg.format || 'audio/wav';
4964
4886
  if (this.isVoiceMode) {
4965
- await this.appendToActiveVoiceDictationStream(msg.audio, chunkFormat);
4966
- if (!msg.isLast) {
4967
- this.setVoiceModeInactivityTimeout();
4968
- this.sessionLogger.debug('Voice mode: streaming chunk, waiting for speech end');
4969
- return;
4887
+ if (!this.voiceTurnController) {
4888
+ throw new Error('Voice mode is enabled but the voice turn controller is not running');
4889
+ }
4890
+ const chunkBytes = Buffer.byteLength(msg.audio, 'base64');
4891
+ this.voiceInputChunkCount += 1;
4892
+ this.voiceInputBytes += chunkBytes;
4893
+ if (this.voiceInputChunkCount === 1) {
4894
+ this.sessionLogger.info({
4895
+ format: chunkFormat,
4896
+ audioBytes: chunkBytes,
4897
+ }, 'Received first voice_audio_chunk for active voice mode');
4970
4898
  }
4971
- this.clearVoiceModeInactivityTimeout();
4972
- this.sessionLogger.debug('Voice mode: speech ended, finalizing streaming transcription');
4973
- await this.finalizeActiveVoiceDictationStream('speech ended');
4899
+ const now = Date.now();
4900
+ if (this.voiceInputChunkCount % 50 === 0 ||
4901
+ now - this.voiceInputWindowStartedAt >= 1000) {
4902
+ this.sessionLogger.info({
4903
+ chunkCount: this.voiceInputChunkCount,
4904
+ audioBytes: this.voiceInputBytes,
4905
+ windowMs: now - this.voiceInputWindowStartedAt,
4906
+ format: chunkFormat,
4907
+ }, 'Voice input chunk summary');
4908
+ this.voiceInputWindowStartedAt = now;
4909
+ this.voiceInputChunkCount = 0;
4910
+ this.voiceInputBytes = 0;
4911
+ }
4912
+ await this.voiceTurnController.appendClientChunk({
4913
+ audioBase64: msg.audio,
4914
+ format: chunkFormat,
4915
+ });
4974
4916
  return;
4975
4917
  }
4976
4918
  const chunkBuffer = Buffer.from(msg.audio, 'base64');
@@ -5051,9 +4993,8 @@ export class Session {
5051
4993
  };
5052
4994
  }
5053
4995
  async processCompletedAudio(audio, format) {
5054
- const shouldBuffer = this.processingPhase === 'transcribing' && this.pendingAudioSegments.length === 0;
5055
- if (shouldBuffer) {
5056
- this.sessionLogger.debug({ phase: this.processingPhase }, `Buffering audio segment (phase: ${this.processingPhase})`);
4996
+ if (this.processingPhase === 'transcribing') {
4997
+ this.sessionLogger.debug({ phase: this.processingPhase, segmentCount: this.pendingAudioSegments.length + 1 }, `Buffering audio segment (phase: ${this.processingPhase})`);
5057
4998
  this.pendingAudioSegments.push({
5058
4999
  audio,
5059
5000
  format,
@@ -5077,6 +5018,18 @@ export class Session {
5077
5018
  }
5078
5019
  await this.processAudio(audio, format);
5079
5020
  }
5021
+ async flushPendingAudioSegments(reason) {
5022
+ if (this.processingPhase === 'transcribing' || this.pendingAudioSegments.length === 0) {
5023
+ return;
5024
+ }
5025
+ const pendingSegments = [...this.pendingAudioSegments];
5026
+ this.pendingAudioSegments = [];
5027
+ this.clearBufferTimeout();
5028
+ this.sessionLogger.debug({ reason, segmentCount: pendingSegments.length }, `Flushing ${pendingSegments.length} buffered audio segment(s)`);
5029
+ const combinedAudio = Buffer.concat(pendingSegments.map((segment) => segment.audio));
5030
+ const combinedFormat = pendingSegments[pendingSegments.length - 1].format;
5031
+ await this.processAudio(combinedAudio, combinedFormat);
5032
+ }
5080
5033
  /**
5081
5034
  * Process audio through STT and then LLM
5082
5035
  */
@@ -5119,6 +5072,7 @@ export class Session {
5119
5072
  catch (error) {
5120
5073
  this.setPhase('idle');
5121
5074
  this.clearSpeechInProgress('transcription error');
5075
+ await this.flushPendingAudioSegments('transcription error');
5122
5076
  this.emit({
5123
5077
  type: 'activity_log',
5124
5078
  payload: {
@@ -5153,6 +5107,7 @@ export class Session {
5153
5107
  this.sessionLogger.debug('Empty transcription (false positive), not aborting');
5154
5108
  this.setPhase('idle');
5155
5109
  this.clearSpeechInProgress('empty transcription');
5110
+ await this.flushPendingAudioSegments('empty transcription');
5156
5111
  return;
5157
5112
  }
5158
5113
  // Has content - abort any in-progress stream now
@@ -5190,16 +5145,19 @@ export class Session {
5190
5145
  this.setPhase('idle');
5191
5146
  if (!this.isVoiceMode) {
5192
5147
  this.sessionLogger.debug({ requestId: result.requestId }, 'Skipping voice agent processing because voice mode is disabled');
5148
+ await this.flushPendingAudioSegments('voice mode disabled');
5193
5149
  return;
5194
5150
  }
5195
5151
  const agentId = this.voiceModeAgentId;
5196
5152
  if (!agentId) {
5197
5153
  this.sessionLogger.warn({ requestId: result.requestId }, 'Skipping voice agent processing because no agent is currently voice-enabled');
5154
+ await this.flushPendingAudioSegments('no active voice agent');
5198
5155
  return;
5199
5156
  }
5200
5157
  // Route voice utterances through the same send path as regular text input:
5201
5158
  // interrupt-if-running, record message, then start a new stream.
5202
5159
  await this.handleSendAgentMessage(agentId, result.text);
5160
+ await this.flushPendingAudioSegments('transcription complete');
5203
5161
  }
5204
5162
  registerVoiceBridgeForAgent(agentId) {
5205
5163
  this.registerVoiceSpeakHandler?.(agentId, async ({ text, signal }) => {
@@ -5280,8 +5238,6 @@ export class Session {
5280
5238
  this.sessionLogger.debug({ chunks: this.audioBuffer.chunks.length, pcmBytes: this.audioBuffer.totalPCMBytes }, `Clearing partial audio buffer (${this.audioBuffer.chunks.length} chunk(s)${this.audioBuffer.isPCM ? `, ${this.audioBuffer.totalPCMBytes} PCM bytes` : ''})`);
5281
5239
  this.audioBuffer = null;
5282
5240
  }
5283
- this.cancelActiveVoiceDictationStream('new speech turn started');
5284
- this.clearVoiceModeInactivityTimeout();
5285
5241
  this.clearBufferTimeout();
5286
5242
  this.abortController.abort();
5287
5243
  await this.handleAbort();
@@ -5292,6 +5248,7 @@ export class Session {
5292
5248
  * Clear speech-in-progress flag once the user turn has completed
5293
5249
  */
5294
5250
  clearSpeechInProgress(reason) {
5251
+ this.clearPendingVoiceSpeechStart(`clear-speech-in-progress:${reason}`);
5295
5252
  if (!this.speechInProgress) {
5296
5253
  return;
5297
5254
  }
@@ -5321,6 +5278,11 @@ export class Session {
5321
5278
  this.clearBufferTimeout();
5322
5279
  this.bufferTimeout = setTimeout(async () => {
5323
5280
  this.sessionLogger.debug('Buffer timeout reached, processing pending segments');
5281
+ if (this.processingPhase === 'transcribing') {
5282
+ this.sessionLogger.debug({ segmentCount: this.pendingAudioSegments.length }, 'Buffer timeout deferred because transcription is still in progress');
5283
+ this.setBufferTimeout();
5284
+ return;
5285
+ }
5324
5286
  if (this.pendingAudioSegments.length > 0) {
5325
5287
  const segments = [...this.pendingAudioSegments];
5326
5288
  this.pendingAudioSegments = [];
@@ -5330,32 +5292,6 @@ export class Session {
5330
5292
  }
5331
5293
  }, 10000); // 10 second timeout
5332
5294
  }
5333
- setVoiceModeInactivityTimeout() {
5334
- if (!this.isVoiceMode) {
5335
- return;
5336
- }
5337
- this.clearVoiceModeInactivityTimeout();
5338
- this.voiceModeInactivityTimeout = setTimeout(() => {
5339
- this.voiceModeInactivityTimeout = null;
5340
- if (!this.isVoiceMode || !this.activeVoiceDictationId) {
5341
- return;
5342
- }
5343
- this.sessionLogger.warn({
5344
- timeoutMs: VOICE_MODE_INACTIVITY_FLUSH_MS,
5345
- dictationId: this.activeVoiceDictationId,
5346
- nextSeq: this.activeVoiceDictationNextSeq,
5347
- }, 'Voice mode inactivity timeout reached without isLast; finalizing active voice dictation stream');
5348
- void this.finalizeActiveVoiceDictationStream('inactivity timeout').catch((error) => {
5349
- this.sessionLogger.error({ err: error }, 'Failed to finalize voice dictation stream after inactivity timeout');
5350
- });
5351
- }, VOICE_MODE_INACTIVITY_FLUSH_MS);
5352
- }
5353
- clearVoiceModeInactivityTimeout() {
5354
- if (this.voiceModeInactivityTimeout) {
5355
- clearTimeout(this.voiceModeInactivityTimeout);
5356
- this.voiceModeInactivityTimeout = null;
5357
- }
5358
- }
5359
5295
  /**
5360
5296
  * Clear buffer timeout
5361
5297
  */
@@ -5431,16 +5367,14 @@ export class Session {
5431
5367
  // Abort any ongoing operations
5432
5368
  this.abortController.abort();
5433
5369
  // Clear timeouts
5434
- this.clearVoiceModeInactivityTimeout();
5435
5370
  this.clearBufferTimeout();
5436
5371
  // Clear buffers
5437
- this.cancelActiveVoiceDictationStream('session cleanup');
5438
5372
  this.pendingAudioSegments = [];
5439
5373
  this.audioBuffer = null;
5374
+ await this.stopVoiceTurnController();
5440
5375
  // Cleanup managers
5441
5376
  this.ttsManager.cleanup();
5442
5377
  this.sttManager.cleanup();
5443
- this.voiceStreamManager.cleanupAll();
5444
5378
  this.dictationStreamManager.cleanupAll();
5445
5379
  // Close MCP clients
5446
5380
  if (this.agentMcpClient) {