@getpaseo/server 0.1.26 → 0.1.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. package/dist/server/client/daemon-client.d.ts +1 -1
  2. package/dist/server/client/daemon-client.d.ts.map +1 -1
  3. package/dist/server/client/daemon-client.js +1 -1
  4. package/dist/server/client/daemon-client.js.map +1 -1
  5. package/dist/server/server/agent/agent-response-loop.js +1 -1
  6. package/dist/server/server/agent/agent-response-loop.js.map +1 -1
  7. package/dist/server/server/agent/provider-launch-config.d.ts +14 -2
  8. package/dist/server/server/agent/provider-launch-config.d.ts.map +1 -1
  9. package/dist/server/server/agent/provider-launch-config.js +30 -8
  10. package/dist/server/server/agent/provider-launch-config.js.map +1 -1
  11. package/dist/server/server/agent/provider-manifest.d.ts.map +1 -1
  12. package/dist/server/server/agent/provider-manifest.js +10 -5
  13. package/dist/server/server/agent/provider-manifest.js.map +1 -1
  14. package/dist/server/server/agent/providers/claude/tool-call-detail-parser.d.ts.map +1 -1
  15. package/dist/server/server/agent/providers/claude/tool-call-detail-parser.js +2 -0
  16. package/dist/server/server/agent/providers/claude/tool-call-detail-parser.js.map +1 -1
  17. package/dist/server/server/agent/providers/claude/tool-call-mapper.d.ts.map +1 -1
  18. package/dist/server/server/agent/providers/claude/tool-call-mapper.js +2 -0
  19. package/dist/server/server/agent/providers/claude/tool-call-mapper.js.map +1 -1
  20. package/dist/server/server/agent/providers/claude-agent.d.ts +3 -1
  21. package/dist/server/server/agent/providers/claude-agent.d.ts.map +1 -1
  22. package/dist/server/server/agent/providers/claude-agent.js +5 -1
  23. package/dist/server/server/agent/providers/claude-agent.js.map +1 -1
  24. package/dist/server/server/agent/providers/codex-app-server-agent.d.ts.map +1 -1
  25. package/dist/server/server/agent/providers/codex-app-server-agent.js +146 -46
  26. package/dist/server/server/agent/providers/codex-app-server-agent.js.map +1 -1
  27. package/dist/server/server/agent/providers/codex-rollout-timeline.d.ts.map +1 -1
  28. package/dist/server/server/agent/providers/codex-rollout-timeline.js +77 -9
  29. package/dist/server/server/agent/providers/codex-rollout-timeline.js.map +1 -1
  30. package/dist/server/server/agent/providers/opencode-agent.d.ts +1 -0
  31. package/dist/server/server/agent/providers/opencode-agent.d.ts.map +1 -1
  32. package/dist/server/server/agent/providers/opencode-agent.js +115 -43
  33. package/dist/server/server/agent/providers/opencode-agent.js.map +1 -1
  34. package/dist/server/server/agent/providers/tool-call-mapper-utils.d.ts +1 -0
  35. package/dist/server/server/agent/providers/tool-call-mapper-utils.d.ts.map +1 -1
  36. package/dist/server/server/agent/providers/tool-call-mapper-utils.js +8 -0
  37. package/dist/server/server/agent/providers/tool-call-mapper-utils.js.map +1 -1
  38. package/dist/server/server/agent/tts-manager.d.ts +8 -1
  39. package/dist/server/server/agent/tts-manager.d.ts.map +1 -1
  40. package/dist/server/server/agent/tts-manager.js +215 -108
  41. package/dist/server/server/agent/tts-manager.js.map +1 -1
  42. package/dist/server/server/bootstrap.d.ts +2 -2
  43. package/dist/server/server/bootstrap.d.ts.map +1 -1
  44. package/dist/server/server/bootstrap.js +26 -5
  45. package/dist/server/server/bootstrap.js.map +1 -1
  46. package/dist/server/server/persisted-config.d.ts +25 -0
  47. package/dist/server/server/persisted-config.d.ts.map +1 -1
  48. package/dist/server/server/persisted-config.js +6 -0
  49. package/dist/server/server/persisted-config.js.map +1 -1
  50. package/dist/server/server/session.d.ts +22 -19
  51. package/dist/server/server/session.d.ts.map +1 -1
  52. package/dist/server/server/session.js +305 -294
  53. package/dist/server/server/session.js.map +1 -1
  54. package/dist/server/server/speech/providers/local/runtime.d.ts +2 -0
  55. package/dist/server/server/speech/providers/local/runtime.d.ts.map +1 -1
  56. package/dist/server/server/speech/providers/local/runtime.js +7 -0
  57. package/dist/server/server/speech/providers/local/runtime.js.map +1 -1
  58. package/dist/server/server/speech/providers/local/sherpa/assets/silero_vad.onnx +0 -0
  59. package/dist/server/server/speech/providers/local/sherpa/sherpa-onnx-node-loader.d.ts +2 -0
  60. package/dist/server/server/speech/providers/local/sherpa/sherpa-onnx-node-loader.d.ts.map +1 -1
  61. package/dist/server/server/speech/providers/local/sherpa/sherpa-onnx-node-loader.js.map +1 -1
  62. package/dist/server/server/speech/providers/local/sherpa/silero-vad-provider.d.ts +13 -0
  63. package/dist/server/server/speech/providers/local/sherpa/silero-vad-provider.d.ts.map +1 -0
  64. package/dist/server/server/speech/providers/local/sherpa/silero-vad-provider.js +23 -0
  65. package/dist/server/server/speech/providers/local/sherpa/silero-vad-provider.js.map +1 -0
  66. package/dist/server/server/speech/providers/local/sherpa/silero-vad-session.d.ts +32 -0
  67. package/dist/server/server/speech/providers/local/sherpa/silero-vad-session.d.ts.map +1 -0
  68. package/dist/server/server/speech/providers/local/sherpa/silero-vad-session.js +107 -0
  69. package/dist/server/server/speech/providers/local/sherpa/silero-vad-session.js.map +1 -0
  70. package/dist/server/server/speech/providers/openai/runtime.d.ts +2 -0
  71. package/dist/server/server/speech/providers/openai/runtime.d.ts.map +1 -1
  72. package/dist/server/server/speech/providers/openai/runtime.js +2 -0
  73. package/dist/server/server/speech/providers/openai/runtime.js.map +1 -1
  74. package/dist/server/server/speech/speech-config-resolver.d.ts.map +1 -1
  75. package/dist/server/server/speech/speech-config-resolver.js +35 -14
  76. package/dist/server/server/speech/speech-config-resolver.js.map +1 -1
  77. package/dist/server/server/speech/speech-runtime.d.ts +3 -1
  78. package/dist/server/server/speech/speech-runtime.d.ts.map +1 -1
  79. package/dist/server/server/speech/speech-runtime.js +39 -6
  80. package/dist/server/server/speech/speech-runtime.js.map +1 -1
  81. package/dist/server/server/speech/speech-types.d.ts +1 -0
  82. package/dist/server/server/speech/speech-types.d.ts.map +1 -1
  83. package/dist/server/server/speech/turn-detection-provider.d.ts +22 -0
  84. package/dist/server/server/speech/turn-detection-provider.d.ts.map +1 -0
  85. package/dist/server/server/speech/turn-detection-provider.js +2 -0
  86. package/dist/server/server/speech/turn-detection-provider.js.map +1 -0
  87. package/dist/server/server/voice/fixed-duration-pcm-ring-buffer.d.ts +16 -0
  88. package/dist/server/server/voice/fixed-duration-pcm-ring-buffer.d.ts.map +1 -0
  89. package/dist/server/server/voice/fixed-duration-pcm-ring-buffer.js +35 -0
  90. package/dist/server/server/voice/fixed-duration-pcm-ring-buffer.js.map +1 -0
  91. package/dist/server/server/voice/voice-turn-controller.d.ts +34 -0
  92. package/dist/server/server/voice/voice-turn-controller.d.ts.map +1 -0
  93. package/dist/server/server/voice/voice-turn-controller.js +161 -0
  94. package/dist/server/server/voice/voice-turn-controller.js.map +1 -0
  95. package/dist/server/server/websocket-server.d.ts +3 -0
  96. package/dist/server/server/websocket-server.d.ts.map +1 -1
  97. package/dist/server/server/websocket-server.js +5 -1
  98. package/dist/server/server/websocket-server.js.map +1 -1
  99. package/dist/server/server/workspace-registry.d.ts +2 -0
  100. package/dist/server/server/workspace-registry.d.ts.map +1 -1
  101. package/dist/server/server/workspace-registry.js +11 -4
  102. package/dist/server/server/workspace-registry.js.map +1 -1
  103. package/dist/server/shared/messages.d.ts +97 -0
  104. package/dist/server/shared/messages.d.ts.map +1 -1
  105. package/dist/server/shared/messages.js +7 -0
  106. package/dist/server/shared/messages.js.map +1 -1
  107. package/dist/server/shared/tool-call-display.d.ts.map +1 -1
  108. package/dist/server/shared/tool-call-display.js +59 -33
  109. package/dist/server/shared/tool-call-display.js.map +1 -1
  110. package/dist/src/server/agent/agent-response-loop.js +1 -1
  111. package/dist/src/server/agent/agent-response-loop.js.map +1 -1
  112. package/dist/src/server/agent/provider-launch-config.js +30 -8
  113. package/dist/src/server/agent/provider-launch-config.js.map +1 -1
  114. package/dist/src/server/agent/provider-manifest.js +10 -5
  115. package/dist/src/server/agent/provider-manifest.js.map +1 -1
  116. package/dist/src/server/agent/providers/claude/tool-call-detail-parser.js +2 -0
  117. package/dist/src/server/agent/providers/claude/tool-call-detail-parser.js.map +1 -1
  118. package/dist/src/server/agent/providers/claude/tool-call-mapper.js +2 -0
  119. package/dist/src/server/agent/providers/claude/tool-call-mapper.js.map +1 -1
  120. package/dist/src/server/agent/providers/claude-agent.js +5 -1
  121. package/dist/src/server/agent/providers/claude-agent.js.map +1 -1
  122. package/dist/src/server/agent/providers/codex-app-server-agent.js +146 -46
  123. package/dist/src/server/agent/providers/codex-app-server-agent.js.map +1 -1
  124. package/dist/src/server/agent/providers/codex-rollout-timeline.js +77 -9
  125. package/dist/src/server/agent/providers/codex-rollout-timeline.js.map +1 -1
  126. package/dist/src/server/agent/providers/opencode-agent.js +115 -43
  127. package/dist/src/server/agent/providers/opencode-agent.js.map +1 -1
  128. package/dist/src/server/agent/providers/tool-call-mapper-utils.js +8 -0
  129. package/dist/src/server/agent/providers/tool-call-mapper-utils.js.map +1 -1
  130. package/dist/src/server/agent/tts-manager.js +215 -108
  131. package/dist/src/server/agent/tts-manager.js.map +1 -1
  132. package/dist/src/server/bootstrap.js +26 -5
  133. package/dist/src/server/bootstrap.js.map +1 -1
  134. package/dist/src/server/persisted-config.js +6 -0
  135. package/dist/src/server/persisted-config.js.map +1 -1
  136. package/dist/src/server/session.js +305 -294
  137. package/dist/src/server/session.js.map +1 -1
  138. package/dist/src/server/speech/providers/local/runtime.js +7 -0
  139. package/dist/src/server/speech/providers/local/runtime.js.map +1 -1
  140. package/dist/src/server/speech/providers/local/sherpa/sherpa-onnx-node-loader.js.map +1 -1
  141. package/dist/src/server/speech/providers/local/sherpa/silero-vad-provider.js +23 -0
  142. package/dist/src/server/speech/providers/local/sherpa/silero-vad-provider.js.map +1 -0
  143. package/dist/src/server/speech/providers/local/sherpa/silero-vad-session.js +107 -0
  144. package/dist/src/server/speech/providers/local/sherpa/silero-vad-session.js.map +1 -0
  145. package/dist/src/server/speech/providers/openai/runtime.js +2 -0
  146. package/dist/src/server/speech/providers/openai/runtime.js.map +1 -1
  147. package/dist/src/server/speech/speech-config-resolver.js +35 -14
  148. package/dist/src/server/speech/speech-config-resolver.js.map +1 -1
  149. package/dist/src/server/speech/speech-runtime.js +39 -6
  150. package/dist/src/server/speech/speech-runtime.js.map +1 -1
  151. package/dist/src/server/speech/turn-detection-provider.js +2 -0
  152. package/dist/src/server/speech/turn-detection-provider.js.map +1 -0
  153. package/dist/src/server/voice/fixed-duration-pcm-ring-buffer.js +35 -0
  154. package/dist/src/server/voice/fixed-duration-pcm-ring-buffer.js.map +1 -0
  155. package/dist/src/server/voice/voice-turn-controller.js +161 -0
  156. package/dist/src/server/voice/voice-turn-controller.js.map +1 -0
  157. package/dist/src/server/websocket-server.js +5 -1
  158. package/dist/src/server/websocket-server.js.map +1 -1
  159. package/dist/src/server/workspace-registry.js +11 -4
  160. package/dist/src/server/workspace-registry.js.map +1 -1
  161. package/dist/src/shared/messages.js +7 -0
  162. package/dist/src/shared/messages.js.map +1 -1
  163. package/dist/src/shared/tool-call-display.js +59 -33
  164. package/dist/src/shared/tool-call-display.js.map +1 -1
  165. package/package.json +7 -7
  166. package/src/server/speech/providers/local/sherpa/assets/silero_vad.onnx +0 -0
@@ -13,6 +13,7 @@ import { STTManager } from './agent/stt-manager.js';
13
13
  import { maybePersistTtsDebugAudio } from './agent/tts-debug.js';
14
14
  import { isPaseoDictationDebugEnabled } from './agent/recordings-debug.js';
15
15
  import { DictationStreamManager, } from './dictation/dictation-stream-manager.js';
16
+ import { createVoiceTurnController, } from './voice/voice-turn-controller.js';
16
17
  import { buildConfigOverrides, buildSessionConfig, extractTimestamps } from './persistence-hooks.js';
17
18
  import { experimental_createMCPClient } from 'ai';
18
19
  import { buildProviderRegistry } from './agent/provider-registry.js';
@@ -35,6 +36,7 @@ import { getProjectIcon } from '../utils/project-icon.js';
35
36
  import { expandTilde } from '../utils/path.js';
36
37
  import { searchHomeDirectories, searchWorkspaceEntries } from '../utils/directory-suggestions.js';
37
38
  import { ensureLocalSpeechModels, getLocalSpeechModelDir, listLocalSpeechModels, } from './speech/providers/local/models.js';
39
+ import { toResolver } from './speech/provider-resolver.js';
38
40
  import { resolveClientMessageId } from './client-message-id.js';
39
41
  const execAsync = promisify(exec);
40
42
  const MAX_INITIAL_AGENT_TITLE_CHARS = Math.min(60, MAX_EXPLICIT_AGENT_TITLE_CHARS);
@@ -88,11 +90,10 @@ const PCM_BITS_PER_SAMPLE = 16;
88
90
  const PCM_BYTES_PER_MS = (PCM_SAMPLE_RATE * PCM_CHANNELS * (PCM_BITS_PER_SAMPLE / 8)) / 1000;
89
91
  const MIN_STREAMING_SEGMENT_DURATION_MS = 1000;
90
92
  const MIN_STREAMING_SEGMENT_BYTES = Math.round(PCM_BYTES_PER_MS * MIN_STREAMING_SEGMENT_DURATION_MS);
91
- const VOICE_MODE_INACTIVITY_FLUSH_MS = 4500;
92
- const VOICE_INTERNAL_DICTATION_ID_PREFIX = '__voice_turn__:';
93
93
  const SAFE_GIT_REF_PATTERN = /^[A-Za-z0-9._\/-]+$/;
94
94
  const AgentIdSchema = z.string().uuid();
95
95
  const VOICE_MCP_SERVER_NAME = 'paseo_voice';
96
+ const VOICE_INTERRUPT_CONFIRMATION_MS = 500;
96
97
  class VoiceFeatureUnavailableError extends Error {
97
98
  constructor(context) {
98
99
  super(context.message);
@@ -161,19 +162,16 @@ export class Session {
161
162
  // Voice mode state
162
163
  this.isVoiceMode = false;
163
164
  this.speechInProgress = false;
165
+ this.pendingVoiceSpeechStartAt = null;
166
+ this.pendingVoiceSpeechTimer = null;
167
+ this.voiceTurnController = null;
168
+ this.voiceInputChunkCount = 0;
169
+ this.voiceInputBytes = 0;
170
+ this.voiceInputWindowStartedAt = Date.now();
164
171
  // Audio buffering for interruption handling
165
172
  this.pendingAudioSegments = [];
166
173
  this.bufferTimeout = null;
167
- this.voiceModeInactivityTimeout = null;
168
174
  this.audioBuffer = null;
169
- this.activeVoiceDictationId = null;
170
- this.activeVoiceDictationFormat = null;
171
- this.activeVoiceDictationNextSeq = 0;
172
- this.activeVoiceDictationStartPromise = null;
173
- this.activeVoiceDictationFinalizePromise = null;
174
- this.activeVoiceDictationResultPromise = null;
175
- this.activeVoiceDictationResolve = null;
176
- this.activeVoiceDictationReject = null;
177
175
  // Optional TTS debug capture (persisted per utterance)
178
176
  this.ttsDebugStreams = new Map();
179
177
  // Per-session MCP client and tools
@@ -221,6 +219,7 @@ export class Session {
221
219
  this.unsubscribeTerminalsChanged = this.terminalManager.subscribeTerminalsChanged((event) => this.handleTerminalsChanged(event));
222
220
  }
223
221
  this.voiceAgentMcpStdio = voice?.voiceAgentMcpStdio ?? null;
222
+ this.resolveVoiceTurnDetection = toResolver(voice?.turnDetection ?? null);
224
223
  const configuredModelsDir = dictation?.localModels?.modelsDir?.trim();
225
224
  this.localSpeechModelsDir =
226
225
  configuredModelsDir && configuredModelsDir.length > 0
@@ -257,13 +256,6 @@ export class Session {
257
256
  stt: dictation?.stt ?? null,
258
257
  finalTimeoutMs: dictation?.finalTimeoutMs,
259
258
  });
260
- this.voiceStreamManager = new DictationStreamManager({
261
- logger: this.sessionLogger.child({ stream: 'voice-internal' }),
262
- sessionId: this.sessionId,
263
- emit: (msg) => this.handleDictationManagerMessage(msg),
264
- stt: stt,
265
- finalTimeoutMs: dictation?.finalTimeoutMs,
266
- });
267
259
  // Initialize agent MCP client asynchronously
268
260
  void this.initializeAgentMcp();
269
261
  this.subscribeToAgentEvents();
@@ -699,6 +691,100 @@ export class Session {
699
691
  paseoHome: this.paseoHome,
700
692
  });
701
693
  }
694
+ buildPersistedProjectRecord(input) {
695
+ return createPersistedProjectRecord({
696
+ projectId: input.placement.projectKey,
697
+ rootPath: deriveProjectRootPath({
698
+ cwd: input.workspaceId,
699
+ checkout: input.placement.checkout,
700
+ }),
701
+ kind: deriveProjectKind(input.placement.checkout),
702
+ displayName: input.placement.projectName,
703
+ createdAt: input.createdAt,
704
+ updatedAt: input.updatedAt,
705
+ archivedAt: null,
706
+ });
707
+ }
708
+ buildPersistedWorkspaceRecord(input) {
709
+ return createPersistedWorkspaceRecord({
710
+ workspaceId: input.workspaceId,
711
+ projectId: input.placement.projectKey,
712
+ cwd: input.workspaceId,
713
+ kind: deriveWorkspaceKind(input.placement.checkout),
714
+ displayName: deriveWorkspaceDisplayName({
715
+ cwd: input.workspaceId,
716
+ checkout: input.placement.checkout,
717
+ }),
718
+ createdAt: input.createdAt,
719
+ updatedAt: input.updatedAt,
720
+ archivedAt: null,
721
+ });
722
+ }
723
+ async archiveProjectRecordIfEmpty(projectId, archivedAt) {
724
+ const siblingWorkspaces = (await this.workspaceRegistry.list()).filter((workspace) => workspace.projectId === projectId && !workspace.archivedAt);
725
+ if (siblingWorkspaces.length === 0) {
726
+ await this.projectRegistry.archive(projectId, archivedAt);
727
+ }
728
+ }
729
+ async reconcileWorkspaceRecord(workspaceId) {
730
+ const normalizedWorkspaceId = normalizePersistedWorkspaceId(workspaceId);
731
+ const existing = await this.workspaceRegistry.get(normalizedWorkspaceId);
732
+ const placement = await this.buildProjectPlacement(normalizedWorkspaceId);
733
+ const now = new Date().toISOString();
734
+ const nextProjectCreatedAt = existing?.createdAt ?? now;
735
+ const nextWorkspaceCreatedAt = existing?.createdAt ?? now;
736
+ const currentProjectRecord = await this.projectRegistry.get(placement.projectKey);
737
+ const nextProjectRecord = this.buildPersistedProjectRecord({
738
+ workspaceId: normalizedWorkspaceId,
739
+ placement,
740
+ createdAt: currentProjectRecord?.createdAt ?? nextProjectCreatedAt,
741
+ updatedAt: now,
742
+ });
743
+ const nextWorkspaceRecord = this.buildPersistedWorkspaceRecord({
744
+ workspaceId: normalizedWorkspaceId,
745
+ placement,
746
+ createdAt: nextWorkspaceCreatedAt,
747
+ updatedAt: now,
748
+ });
749
+ const needsWorkspaceUpdate = !existing ||
750
+ existing.archivedAt ||
751
+ existing.projectId !== nextWorkspaceRecord.projectId ||
752
+ existing.kind !== nextWorkspaceRecord.kind ||
753
+ existing.displayName !== nextWorkspaceRecord.displayName;
754
+ const needsProjectUpdate = !currentProjectRecord ||
755
+ currentProjectRecord.archivedAt ||
756
+ currentProjectRecord.rootPath !== nextProjectRecord.rootPath ||
757
+ currentProjectRecord.kind !== nextProjectRecord.kind ||
758
+ currentProjectRecord.displayName !== nextProjectRecord.displayName;
759
+ if (!needsWorkspaceUpdate && !needsProjectUpdate) {
760
+ return {
761
+ workspace: existing,
762
+ changed: false,
763
+ };
764
+ }
765
+ await this.projectRegistry.upsert(nextProjectRecord);
766
+ await this.workspaceRegistry.upsert(nextWorkspaceRecord);
767
+ if (existing &&
768
+ !existing.archivedAt &&
769
+ existing.projectId !== nextWorkspaceRecord.projectId) {
770
+ await this.archiveProjectRecordIfEmpty(existing.projectId, now);
771
+ }
772
+ return {
773
+ workspace: nextWorkspaceRecord,
774
+ changed: true,
775
+ };
776
+ }
777
+ async reconcileActiveWorkspaceRecords() {
778
+ const changedWorkspaceIds = new Set();
779
+ const activeWorkspaces = (await this.workspaceRegistry.list()).filter((workspace) => !workspace.archivedAt);
780
+ for (const workspace of activeWorkspaces) {
781
+ const result = await this.reconcileWorkspaceRecord(workspace.workspaceId);
782
+ if (result.changed) {
783
+ changedWorkspaceIds.add(result.workspace.workspaceId);
784
+ }
785
+ }
786
+ return changedWorkspaceIds;
787
+ }
702
788
  async forwardAgentUpdate(agent) {
703
789
  try {
704
790
  await this.ensureWorkspaceRegistered(agent.cwd);
@@ -1315,7 +1401,9 @@ export class Session {
1315
1401
  * Handle voice mode toggle
1316
1402
  */
1317
1403
  async handleSetVoiceMode(enabled, agentId, requestId) {
1404
+ const startedAt = Date.now();
1318
1405
  try {
1406
+ this.sessionLogger.info({ enabled, requestedAgentId: agentId ?? null, requestId: requestId ?? null }, 'set_voice_mode started');
1319
1407
  if (enabled) {
1320
1408
  const unavailable = this.resolveVoiceFeatureUnavailableContext('voice_mode');
1321
1409
  if (unavailable) {
@@ -1325,15 +1413,26 @@ export class Session {
1325
1413
  if (this.isVoiceMode &&
1326
1414
  this.voiceModeAgentId &&
1327
1415
  this.voiceModeAgentId !== normalizedAgentId) {
1416
+ this.sessionLogger.info({
1417
+ previousAgentId: this.voiceModeAgentId,
1418
+ nextAgentId: normalizedAgentId,
1419
+ elapsedMs: Date.now() - startedAt,
1420
+ }, 'set_voice_mode disabling previous active voice agent');
1328
1421
  await this.disableVoiceModeForActiveAgent(true);
1329
1422
  }
1330
1423
  if (!this.isVoiceMode || this.voiceModeAgentId !== normalizedAgentId) {
1424
+ this.sessionLogger.info({ agentId: normalizedAgentId, elapsedMs: Date.now() - startedAt }, 'set_voice_mode enabling voice for agent');
1331
1425
  const refreshedAgentId = await this.enableVoiceModeForAgent(normalizedAgentId);
1332
1426
  this.voiceModeAgentId = refreshedAgentId;
1427
+ this.sessionLogger.info({ agentId: refreshedAgentId, elapsedMs: Date.now() - startedAt }, 'set_voice_mode agent enable complete');
1333
1428
  }
1429
+ this.sessionLogger.info({ agentId: this.voiceModeAgentId, elapsedMs: Date.now() - startedAt }, 'set_voice_mode starting voice turn controller');
1430
+ await this.startVoiceTurnController();
1431
+ this.sessionLogger.info({ agentId: this.voiceModeAgentId, elapsedMs: Date.now() - startedAt }, 'set_voice_mode voice turn controller started');
1334
1432
  this.isVoiceMode = true;
1335
1433
  this.sessionLogger.info({
1336
1434
  agentId: this.voiceModeAgentId,
1435
+ elapsedMs: Date.now() - startedAt,
1337
1436
  }, 'Voice mode enabled for existing agent');
1338
1437
  if (requestId) {
1339
1438
  this.emit({
@@ -1349,9 +1448,10 @@ export class Session {
1349
1448
  }
1350
1449
  return;
1351
1450
  }
1451
+ this.sessionLogger.info({ agentId: this.voiceModeAgentId, elapsedMs: Date.now() - startedAt }, 'set_voice_mode disabling active voice mode');
1352
1452
  await this.disableVoiceModeForActiveAgent(true);
1353
1453
  this.isVoiceMode = false;
1354
- this.sessionLogger.info('Voice mode disabled');
1454
+ this.sessionLogger.info({ elapsedMs: Date.now() - startedAt }, 'Voice mode disabled');
1355
1455
  if (requestId) {
1356
1456
  this.emit({
1357
1457
  type: 'set_voice_mode_response',
@@ -1372,6 +1472,7 @@ export class Session {
1372
1472
  err: error,
1373
1473
  enabled,
1374
1474
  requestedAgentId: agentId ?? null,
1475
+ elapsedMs: Date.now() - startedAt,
1375
1476
  }, 'set_voice_mode failed');
1376
1477
  if (requestId) {
1377
1478
  this.emit({
@@ -1419,12 +1520,17 @@ export class Session {
1419
1520
  };
1420
1521
  }
1421
1522
  async enableVoiceModeForAgent(agentId) {
1523
+ const startedAt = Date.now();
1422
1524
  const ensureVoiceSocket = this.ensureVoiceMcpSocketForAgent;
1423
1525
  if (!ensureVoiceSocket) {
1424
1526
  throw new Error('Voice MCP socket bridge is not configured');
1425
1527
  }
1528
+ this.sessionLogger.info({ agentId }, 'enableVoiceModeForAgent.ensureAgentLoaded.start');
1426
1529
  const existing = await this.ensureAgentLoaded(agentId);
1530
+ this.sessionLogger.info({ agentId, elapsedMs: Date.now() - startedAt }, 'enableVoiceModeForAgent.ensureAgentLoaded.done');
1531
+ this.sessionLogger.info({ agentId }, 'enableVoiceModeForAgent.ensureVoiceSocket.start');
1427
1532
  const socketPath = await ensureVoiceSocket(agentId);
1533
+ this.sessionLogger.info({ agentId, socketPath, elapsedMs: Date.now() - startedAt }, 'enableVoiceModeForAgent.ensureVoiceSocket.done');
1428
1534
  this.registerVoiceBridgeForAgent(agentId);
1429
1535
  const baseConfig = {
1430
1536
  systemPrompt: stripVoiceModeSystemPrompt(existing.config.systemPrompt),
@@ -1436,7 +1542,9 @@ export class Session {
1436
1542
  mcpServers: this.buildVoiceModeMcpServers(baseConfig.mcpServers, socketPath),
1437
1543
  };
1438
1544
  try {
1545
+ this.sessionLogger.info({ agentId, elapsedMs: Date.now() - startedAt }, 'enableVoiceModeForAgent.reloadAgentSession.start');
1439
1546
  const refreshed = await this.agentManager.reloadAgentSession(agentId, refreshOverrides);
1547
+ this.sessionLogger.info({ agentId, refreshedAgentId: refreshed.id, elapsedMs: Date.now() - startedAt }, 'enableVoiceModeForAgent.reloadAgentSession.done');
1440
1548
  return refreshed.id;
1441
1549
  }
1442
1550
  catch (error) {
@@ -1448,8 +1556,7 @@ export class Session {
1448
1556
  }
1449
1557
  }
1450
1558
  async disableVoiceModeForActiveAgent(restoreAgentConfig) {
1451
- this.clearVoiceModeInactivityTimeout();
1452
- this.cancelActiveVoiceDictationStream('voice mode disabled');
1559
+ await this.stopVoiceTurnController();
1453
1560
  const agentId = this.voiceModeAgentId;
1454
1561
  if (!agentId) {
1455
1562
  this.voiceModeBaseConfig = null;
@@ -1475,197 +1582,107 @@ export class Session {
1475
1582
  this.voiceModeBaseConfig = null;
1476
1583
  this.voiceModeAgentId = null;
1477
1584
  }
1478
- isInternalVoiceDictationId(dictationId) {
1479
- return dictationId.startsWith(VOICE_INTERNAL_DICTATION_ID_PREFIX);
1480
- }
1481
1585
  handleDictationManagerMessage(msg) {
1482
- if (msg.type === 'activity_log') {
1483
- const metadata = msg.payload.metadata;
1484
- const dictationId = metadata && typeof metadata.dictationId === 'string' ? metadata.dictationId : null;
1485
- if (dictationId && this.isInternalVoiceDictationId(dictationId)) {
1486
- return;
1487
- }
1488
- this.emit(msg);
1489
- return;
1490
- }
1491
- const payloadWithDictationId = msg.payload;
1492
- const dictationId = payloadWithDictationId && typeof payloadWithDictationId.dictationId === 'string'
1493
- ? payloadWithDictationId.dictationId
1494
- : null;
1495
- if (!dictationId || !this.isInternalVoiceDictationId(dictationId)) {
1496
- this.emit(msg);
1497
- return;
1498
- }
1499
- if (msg.type === 'dictation_stream_final') {
1500
- if (dictationId !== this.activeVoiceDictationId || !this.activeVoiceDictationResolve) {
1501
- return;
1502
- }
1503
- this.activeVoiceDictationResolve({
1504
- text: msg.payload.text,
1505
- ...(msg.payload.debugRecordingPath
1506
- ? { debugRecordingPath: msg.payload.debugRecordingPath }
1507
- : {}),
1508
- });
1509
- return;
1510
- }
1511
- if (msg.type === 'dictation_stream_error') {
1512
- if (dictationId !== this.activeVoiceDictationId || !this.activeVoiceDictationReject) {
1513
- return;
1514
- }
1515
- this.activeVoiceDictationReject(new Error(msg.payload.error));
1516
- return;
1517
- }
1518
- // Ack/partial messages for internal voice dictation are consumed server-side.
1519
- }
1520
- resetActiveVoiceDictationState() {
1521
- this.activeVoiceDictationId = null;
1522
- this.activeVoiceDictationFormat = null;
1523
- this.activeVoiceDictationNextSeq = 0;
1524
- this.activeVoiceDictationStartPromise = null;
1525
- this.activeVoiceDictationFinalizePromise = null;
1526
- this.activeVoiceDictationResultPromise = null;
1527
- this.activeVoiceDictationResolve = null;
1528
- this.activeVoiceDictationReject = null;
1529
- }
1530
- cancelActiveVoiceDictationStream(reason) {
1531
- const dictationId = this.activeVoiceDictationId;
1532
- if (!dictationId) {
1533
- return;
1534
- }
1535
- this.sessionLogger.debug({ dictationId, reason }, 'Cancelling active internal voice dictation stream');
1536
- if (this.activeVoiceDictationReject) {
1537
- this.activeVoiceDictationReject(new Error(`Voice dictation cancelled: ${reason}`));
1538
- }
1539
- this.voiceStreamManager.handleCancel(dictationId);
1540
- this.resetActiveVoiceDictationState();
1586
+ this.emit(msg);
1541
1587
  }
1542
- async ensureActiveVoiceDictationStream(format) {
1543
- if (this.activeVoiceDictationId && this.activeVoiceDictationFormat === format) {
1544
- if (this.activeVoiceDictationStartPromise) {
1545
- await this.activeVoiceDictationStartPromise;
1546
- }
1588
+ async startVoiceTurnController() {
1589
+ if (this.voiceTurnController) {
1590
+ this.sessionLogger.info('startVoiceTurnController skipped: already running');
1547
1591
  return;
1548
1592
  }
1549
- if (this.activeVoiceDictationId) {
1550
- await this.finalizeActiveVoiceDictationStream('voice format changed');
1551
- }
1552
- const dictationId = `${VOICE_INTERNAL_DICTATION_ID_PREFIX}${uuidv4()}`;
1553
- let resolve = null;
1554
- let reject = null;
1555
- const resultPromise = new Promise((resolveFn, rejectFn) => {
1556
- resolve = resolveFn;
1557
- reject = rejectFn;
1558
- });
1559
- // Prevent process-level unhandled rejection warnings when cancellation races are resolved later.
1560
- void resultPromise.catch(() => undefined);
1561
- this.activeVoiceDictationId = dictationId;
1562
- this.activeVoiceDictationFormat = format;
1563
- this.activeVoiceDictationNextSeq = 0;
1564
- this.activeVoiceDictationFinalizePromise = null;
1565
- this.activeVoiceDictationResultPromise = resultPromise;
1566
- this.activeVoiceDictationResolve = resolve;
1567
- this.activeVoiceDictationReject = reject;
1568
- this.setPhase('transcribing');
1569
- this.emit({
1570
- type: 'activity_log',
1571
- payload: {
1572
- id: uuidv4(),
1573
- timestamp: new Date(),
1574
- type: 'system',
1575
- content: 'Transcribing audio...',
1593
+ const turnDetection = this.resolveVoiceTurnDetection();
1594
+ if (!turnDetection) {
1595
+ throw new Error('Voice turn detection is not configured');
1596
+ }
1597
+ this.sessionLogger.info({ providerId: turnDetection.id }, 'startVoiceTurnController creating controller');
1598
+ const controller = createVoiceTurnController({
1599
+ logger: this.sessionLogger.child({ component: 'voice-turn-controller' }),
1600
+ turnDetection,
1601
+ utteranceSink: {
1602
+ submitUtterance: async ({ pcm16, format, sampleRate, startedAt, endedAt }) => {
1603
+ this.sessionLogger.debug({
1604
+ audioBytes: pcm16.length,
1605
+ sampleRate,
1606
+ startedAt,
1607
+ endedAt,
1608
+ durationMs: Math.max(0, endedAt - startedAt),
1609
+ }, 'Submitting detected voice utterance');
1610
+ await this.processCompletedAudio(pcm16, format);
1611
+ },
1612
+ },
1613
+ callbacks: {
1614
+ onSpeechStarted: async () => {
1615
+ this.handleProvisionalVoiceSpeechStarted();
1616
+ },
1617
+ onSpeechStopped: async () => {
1618
+ this.handleVoiceSpeechStopped();
1619
+ },
1620
+ onError: (error) => {
1621
+ this.sessionLogger.error({ err: error }, 'Voice turn controller failed');
1622
+ },
1576
1623
  },
1577
1624
  });
1578
- const startPromise = this.voiceStreamManager.handleStart(dictationId, format);
1579
- this.activeVoiceDictationStartPromise = startPromise;
1580
- try {
1581
- await startPromise;
1582
- }
1583
- catch (error) {
1584
- this.resetActiveVoiceDictationState();
1585
- throw error;
1586
- }
1587
- finally {
1588
- if (this.activeVoiceDictationId === dictationId) {
1589
- this.activeVoiceDictationStartPromise = null;
1590
- }
1591
- }
1592
- }
1593
- async appendToActiveVoiceDictationStream(audioBase64, format) {
1594
- if (this.activeVoiceDictationFinalizePromise) {
1595
- await this.activeVoiceDictationFinalizePromise.catch(() => undefined);
1596
- }
1597
- await this.ensureActiveVoiceDictationStream(format);
1598
- const dictationId = this.activeVoiceDictationId;
1599
- if (!dictationId) {
1600
- throw new Error('Voice dictation stream did not initialize');
1601
- }
1602
- const seq = this.activeVoiceDictationNextSeq;
1603
- this.activeVoiceDictationNextSeq += 1;
1604
- await this.voiceStreamManager.handleChunk({
1605
- dictationId,
1606
- seq,
1607
- audioBase64,
1608
- format,
1609
- });
1625
+ this.sessionLogger.info('startVoiceTurnController connecting controller');
1626
+ await controller.start();
1627
+ this.voiceTurnController = controller;
1628
+ this.sessionLogger.info('startVoiceTurnController connected');
1610
1629
  }
1611
- async finalizeActiveVoiceDictationStream(reason) {
1612
- const dictationId = this.activeVoiceDictationId;
1613
- if (!dictationId) {
1630
+ async stopVoiceTurnController() {
1631
+ if (!this.voiceTurnController) {
1614
1632
  return;
1615
1633
  }
1616
- this.clearVoiceModeInactivityTimeout();
1617
- if (this.activeVoiceDictationStartPromise) {
1618
- await this.activeVoiceDictationStartPromise;
1634
+ this.clearPendingVoiceSpeechStart('turn-controller-stop');
1635
+ const controller = this.voiceTurnController;
1636
+ this.voiceTurnController = null;
1637
+ await controller.stop();
1638
+ }
1639
+ clearPendingVoiceSpeechStart(reason) {
1640
+ if (this.pendingVoiceSpeechTimer) {
1641
+ clearTimeout(this.pendingVoiceSpeechTimer);
1642
+ this.pendingVoiceSpeechTimer = null;
1619
1643
  }
1620
- if (this.activeVoiceDictationFinalizePromise) {
1621
- await this.activeVoiceDictationFinalizePromise;
1622
- return;
1644
+ if (this.pendingVoiceSpeechStartAt !== null) {
1645
+ this.sessionLogger.debug({ reason }, 'Clearing provisional voice speech start');
1646
+ this.pendingVoiceSpeechStartAt = null;
1623
1647
  }
1624
- const finalSeq = this.activeVoiceDictationNextSeq - 1;
1625
- const resultPromise = this.activeVoiceDictationResultPromise;
1626
- if (!resultPromise) {
1627
- this.resetActiveVoiceDictationState();
1648
+ }
1649
+ handleProvisionalVoiceSpeechStarted() {
1650
+ if (this.speechInProgress || this.pendingVoiceSpeechTimer) {
1628
1651
  return;
1629
1652
  }
1630
- this.activeVoiceDictationFinalizePromise = (async () => {
1631
- this.sessionLogger.debug({ dictationId, finalSeq, reason }, 'Finalizing internal voice dictation stream');
1632
- await this.voiceStreamManager.handleFinish(dictationId, finalSeq);
1633
- const result = await resultPromise;
1634
- this.resetActiveVoiceDictationState();
1635
- const requestId = uuidv4();
1636
- const transcriptText = result.text.trim();
1637
- this.sessionLogger.info({
1638
- requestId,
1639
- isVoiceMode: this.isVoiceMode,
1640
- transcriptLength: transcriptText.length,
1641
- transcript: transcriptText,
1642
- }, 'Transcription result');
1643
- await this.handleTranscriptionResultPayload({
1644
- text: result.text,
1645
- requestId,
1646
- ...(result.debugRecordingPath
1647
- ? { debugRecordingPath: result.debugRecordingPath, format: 'audio/wav' }
1648
- : {}),
1649
- });
1650
- })();
1651
- try {
1652
- await this.activeVoiceDictationFinalizePromise;
1653
- }
1654
- catch (error) {
1655
- this.resetActiveVoiceDictationState();
1656
- this.setPhase('idle');
1657
- this.clearSpeechInProgress('transcription error');
1653
+ const startedAt = Date.now();
1654
+ this.pendingVoiceSpeechStartAt = startedAt;
1655
+ this.sessionLogger.info({ confirmationMs: VOICE_INTERRUPT_CONFIRMATION_MS }, 'Silero VAD provisional speech_started');
1656
+ this.pendingVoiceSpeechTimer = setTimeout(() => {
1657
+ this.pendingVoiceSpeechTimer = null;
1658
+ if (this.pendingVoiceSpeechStartAt !== startedAt || this.speechInProgress) {
1659
+ return;
1660
+ }
1661
+ this.pendingVoiceSpeechStartAt = null;
1662
+ this.sessionLogger.info('voice_input_state emitting isSpeaking=true');
1658
1663
  this.emit({
1659
- type: 'activity_log',
1664
+ type: 'voice_input_state',
1660
1665
  payload: {
1661
- id: uuidv4(),
1662
- timestamp: new Date(),
1663
- type: 'error',
1664
- content: `Transcription error: ${error instanceof Error ? error.message : String(error)}`,
1666
+ isSpeaking: true,
1665
1667
  },
1666
1668
  });
1667
- throw error;
1669
+ void this.handleVoiceSpeechStart();
1670
+ }, VOICE_INTERRUPT_CONFIRMATION_MS);
1671
+ }
1672
+ handleVoiceSpeechStopped() {
1673
+ if (this.pendingVoiceSpeechStartAt !== null) {
1674
+ const durationMs = Date.now() - this.pendingVoiceSpeechStartAt;
1675
+ this.clearPendingVoiceSpeechStart('speech-stopped-before-confirmation');
1676
+ this.sessionLogger.info({ durationMs, confirmationMs: VOICE_INTERRUPT_CONFIRMATION_MS }, 'Ignoring provisional voice speech start that ended before confirmation');
1677
+ return;
1668
1678
  }
1679
+ this.sessionLogger.info('voice_input_state emitting isSpeaking=false');
1680
+ this.emit({
1681
+ type: 'voice_input_state',
1682
+ payload: {
1683
+ isSpeaking: false,
1684
+ },
1685
+ });
1669
1686
  }
1670
1687
  /**
1671
1688
  * Handle text message to agent (with optional image attachments)
@@ -4027,7 +4044,7 @@ export class Session {
4027
4044
  diffStat,
4028
4045
  };
4029
4046
  }
4030
- async listWorkspaceDescriptors() {
4047
+ async listWorkspaceDescriptorsSnapshot() {
4031
4048
  const [agents, persistedWorkspaces, persistedProjects] = await Promise.all([
4032
4049
  this.listAgentPayloads(),
4033
4050
  this.workspaceRegistry.list(),
@@ -4058,6 +4075,10 @@ export class Session {
4058
4075
  }
4059
4076
  return Array.from(descriptorsByWorkspaceId.values());
4060
4077
  }
4078
+ async listWorkspaceDescriptors() {
4079
+ await this.reconcileActiveWorkspaceRecords();
4080
+ return this.listWorkspaceDescriptorsSnapshot();
4081
+ }
4061
4082
  normalizeFetchWorkspacesSort(sort) {
4062
4083
  const fallback = [{ key: 'activity_at', direction: 'desc' }];
4063
4084
  if (!sort || sort.length === 0) {
@@ -4257,41 +4278,7 @@ export class Session {
4257
4278
  }
4258
4279
  async ensureWorkspaceRegistered(cwd) {
4259
4280
  const workspaceId = normalizePersistedWorkspaceId(cwd);
4260
- const existing = await this.workspaceRegistry.get(workspaceId);
4261
- if (existing && !existing.archivedAt) {
4262
- return existing;
4263
- }
4264
- const placement = await this.buildProjectPlacement(workspaceId);
4265
- const now = new Date().toISOString();
4266
- const projectExisting = await this.projectRegistry.get(placement.projectKey);
4267
- const projectRecord = createPersistedProjectRecord({
4268
- projectId: placement.projectKey,
4269
- rootPath: deriveProjectRootPath({
4270
- cwd: workspaceId,
4271
- checkout: placement.checkout,
4272
- }),
4273
- kind: deriveProjectKind(placement.checkout),
4274
- displayName: placement.projectName,
4275
- createdAt: projectExisting?.createdAt ?? now,
4276
- updatedAt: now,
4277
- archivedAt: null,
4278
- });
4279
- await this.projectRegistry.upsert(projectRecord);
4280
- const workspaceRecord = createPersistedWorkspaceRecord({
4281
- workspaceId,
4282
- projectId: placement.projectKey,
4283
- cwd: workspaceId,
4284
- kind: deriveWorkspaceKind(placement.checkout),
4285
- displayName: deriveWorkspaceDisplayName({
4286
- cwd: workspaceId,
4287
- checkout: placement.checkout,
4288
- }),
4289
- createdAt: existing?.createdAt ?? now,
4290
- updatedAt: now,
4291
- archivedAt: null,
4292
- });
4293
- await this.workspaceRegistry.upsert(workspaceRecord);
4294
- return workspaceRecord;
4281
+ return (await this.reconcileWorkspaceRecord(workspaceId)).workspace;
4295
4282
  }
4296
4283
  async archiveWorkspaceRecord(workspaceId, archivedAt) {
4297
4284
  const existing = await this.workspaceRegistry.get(workspaceId);
@@ -4311,32 +4298,31 @@ export class Session {
4311
4298
  return;
4312
4299
  }
4313
4300
  const workspaceId = normalizePersistedWorkspaceId(cwd);
4314
- const all = await this.listWorkspaceDescriptors();
4315
- const workspace = all.find((entry) => entry.id === workspaceId);
4316
- if (!workspace) {
4317
- this.bufferOrEmitWorkspaceUpdate(subscription, {
4318
- kind: 'remove',
4319
- id: workspaceId,
4320
- });
4321
- return;
4322
- }
4323
- if (!this.matchesWorkspaceFilter({ workspace, filter: subscription.filter })) {
4301
+ const changedWorkspaceIds = await this.reconcileActiveWorkspaceRecords();
4302
+ const all = await this.listWorkspaceDescriptorsSnapshot();
4303
+ const descriptorsByWorkspaceId = new Map(all.map((entry) => [entry.id, entry]));
4304
+ const workspaceIdsToEmit = new Set([workspaceId, ...changedWorkspaceIds]);
4305
+ for (const nextWorkspaceId of workspaceIdsToEmit) {
4306
+ const workspace = descriptorsByWorkspaceId.get(nextWorkspaceId);
4307
+ if (!workspace || !this.matchesWorkspaceFilter({ workspace, filter: subscription.filter })) {
4308
+ this.bufferOrEmitWorkspaceUpdate(subscription, {
4309
+ kind: 'remove',
4310
+ id: nextWorkspaceId,
4311
+ });
4312
+ continue;
4313
+ }
4324
4314
  this.bufferOrEmitWorkspaceUpdate(subscription, {
4325
- kind: 'remove',
4326
- id: workspaceId,
4315
+ kind: 'upsert',
4316
+ workspace,
4327
4317
  });
4328
- return;
4329
4318
  }
4330
- this.bufferOrEmitWorkspaceUpdate(subscription, {
4331
- kind: 'upsert',
4332
- workspace,
4333
- });
4334
4319
  }
4335
4320
  async emitWorkspaceUpdatesForCwds(cwds) {
4336
4321
  if (!this.workspaceUpdatesSubscription) {
4337
4322
  return;
4338
4323
  }
4339
- const uniqueWorkspaceCwds = new Set();
4324
+ const changedWorkspaceIds = await this.reconcileActiveWorkspaceRecords();
4325
+ const uniqueWorkspaceCwds = new Set(changedWorkspaceIds);
4340
4326
  for (const cwd of cwds) {
4341
4327
  const normalized = normalizePersistedWorkspaceId(cwd);
4342
4328
  if (!normalized) {
@@ -4344,8 +4330,22 @@ export class Session {
4344
4330
  }
4345
4331
  uniqueWorkspaceCwds.add(normalized);
4346
4332
  }
4347
- for (const workspaceCwd of uniqueWorkspaceCwds) {
4348
- await this.emitWorkspaceUpdateForCwd(workspaceCwd);
4333
+ const subscription = this.workspaceUpdatesSubscription;
4334
+ const all = await this.listWorkspaceDescriptorsSnapshot();
4335
+ const descriptorsByWorkspaceId = new Map(all.map((entry) => [entry.id, entry]));
4336
+ for (const workspaceId of uniqueWorkspaceCwds) {
4337
+ const workspace = descriptorsByWorkspaceId.get(workspaceId);
4338
+ if (!workspace || !this.matchesWorkspaceFilter({ workspace, filter: subscription.filter })) {
4339
+ this.bufferOrEmitWorkspaceUpdate(subscription, {
4340
+ kind: 'remove',
4341
+ id: workspaceId,
4342
+ });
4343
+ continue;
4344
+ }
4345
+ this.bufferOrEmitWorkspaceUpdate(subscription, {
4346
+ kind: 'upsert',
4347
+ workspace,
4348
+ });
4349
4349
  }
4350
4350
  }
4351
4351
  async handleFetchAgents(request) {
@@ -4882,18 +4882,37 @@ export class Session {
4882
4882
  if (!this.isVoiceMode) {
4883
4883
  this.sessionLogger.warn('Received voice_audio_chunk while voice mode is disabled; transcript will be emitted but voice assistant turn is skipped');
4884
4884
  }
4885
- await this.handleVoiceSpeechStart();
4886
4885
  const chunkFormat = msg.format || 'audio/wav';
4887
4886
  if (this.isVoiceMode) {
4888
- await this.appendToActiveVoiceDictationStream(msg.audio, chunkFormat);
4889
- if (!msg.isLast) {
4890
- this.setVoiceModeInactivityTimeout();
4891
- this.sessionLogger.debug('Voice mode: streaming chunk, waiting for speech end');
4892
- return;
4887
+ if (!this.voiceTurnController) {
4888
+ throw new Error('Voice mode is enabled but the voice turn controller is not running');
4889
+ }
4890
+ const chunkBytes = Buffer.byteLength(msg.audio, 'base64');
4891
+ this.voiceInputChunkCount += 1;
4892
+ this.voiceInputBytes += chunkBytes;
4893
+ if (this.voiceInputChunkCount === 1) {
4894
+ this.sessionLogger.info({
4895
+ format: chunkFormat,
4896
+ audioBytes: chunkBytes,
4897
+ }, 'Received first voice_audio_chunk for active voice mode');
4893
4898
  }
4894
- this.clearVoiceModeInactivityTimeout();
4895
- this.sessionLogger.debug('Voice mode: speech ended, finalizing streaming transcription');
4896
- await this.finalizeActiveVoiceDictationStream('speech ended');
4899
+ const now = Date.now();
4900
+ if (this.voiceInputChunkCount % 50 === 0 ||
4901
+ now - this.voiceInputWindowStartedAt >= 1000) {
4902
+ this.sessionLogger.info({
4903
+ chunkCount: this.voiceInputChunkCount,
4904
+ audioBytes: this.voiceInputBytes,
4905
+ windowMs: now - this.voiceInputWindowStartedAt,
4906
+ format: chunkFormat,
4907
+ }, 'Voice input chunk summary');
4908
+ this.voiceInputWindowStartedAt = now;
4909
+ this.voiceInputChunkCount = 0;
4910
+ this.voiceInputBytes = 0;
4911
+ }
4912
+ await this.voiceTurnController.appendClientChunk({
4913
+ audioBase64: msg.audio,
4914
+ format: chunkFormat,
4915
+ });
4897
4916
  return;
4898
4917
  }
4899
4918
  const chunkBuffer = Buffer.from(msg.audio, 'base64');
@@ -4974,9 +4993,8 @@ export class Session {
4974
4993
  };
4975
4994
  }
4976
4995
  async processCompletedAudio(audio, format) {
4977
- const shouldBuffer = this.processingPhase === 'transcribing' && this.pendingAudioSegments.length === 0;
4978
- if (shouldBuffer) {
4979
- this.sessionLogger.debug({ phase: this.processingPhase }, `Buffering audio segment (phase: ${this.processingPhase})`);
4996
+ if (this.processingPhase === 'transcribing') {
4997
+ this.sessionLogger.debug({ phase: this.processingPhase, segmentCount: this.pendingAudioSegments.length + 1 }, `Buffering audio segment (phase: ${this.processingPhase})`);
4980
4998
  this.pendingAudioSegments.push({
4981
4999
  audio,
4982
5000
  format,
@@ -5000,6 +5018,18 @@ export class Session {
5000
5018
  }
5001
5019
  await this.processAudio(audio, format);
5002
5020
  }
5021
+ async flushPendingAudioSegments(reason) {
5022
+ if (this.processingPhase === 'transcribing' || this.pendingAudioSegments.length === 0) {
5023
+ return;
5024
+ }
5025
+ const pendingSegments = [...this.pendingAudioSegments];
5026
+ this.pendingAudioSegments = [];
5027
+ this.clearBufferTimeout();
5028
+ this.sessionLogger.debug({ reason, segmentCount: pendingSegments.length }, `Flushing ${pendingSegments.length} buffered audio segment(s)`);
5029
+ const combinedAudio = Buffer.concat(pendingSegments.map((segment) => segment.audio));
5030
+ const combinedFormat = pendingSegments[pendingSegments.length - 1].format;
5031
+ await this.processAudio(combinedAudio, combinedFormat);
5032
+ }
5003
5033
  /**
5004
5034
  * Process audio through STT and then LLM
5005
5035
  */
@@ -5042,6 +5072,7 @@ export class Session {
5042
5072
  catch (error) {
5043
5073
  this.setPhase('idle');
5044
5074
  this.clearSpeechInProgress('transcription error');
5075
+ await this.flushPendingAudioSegments('transcription error');
5045
5076
  this.emit({
5046
5077
  type: 'activity_log',
5047
5078
  payload: {
@@ -5076,6 +5107,7 @@ export class Session {
5076
5107
  this.sessionLogger.debug('Empty transcription (false positive), not aborting');
5077
5108
  this.setPhase('idle');
5078
5109
  this.clearSpeechInProgress('empty transcription');
5110
+ await this.flushPendingAudioSegments('empty transcription');
5079
5111
  return;
5080
5112
  }
5081
5113
  // Has content - abort any in-progress stream now
@@ -5113,16 +5145,19 @@ export class Session {
5113
5145
  this.setPhase('idle');
5114
5146
  if (!this.isVoiceMode) {
5115
5147
  this.sessionLogger.debug({ requestId: result.requestId }, 'Skipping voice agent processing because voice mode is disabled');
5148
+ await this.flushPendingAudioSegments('voice mode disabled');
5116
5149
  return;
5117
5150
  }
5118
5151
  const agentId = this.voiceModeAgentId;
5119
5152
  if (!agentId) {
5120
5153
  this.sessionLogger.warn({ requestId: result.requestId }, 'Skipping voice agent processing because no agent is currently voice-enabled');
5154
+ await this.flushPendingAudioSegments('no active voice agent');
5121
5155
  return;
5122
5156
  }
5123
5157
  // Route voice utterances through the same send path as regular text input:
5124
5158
  // interrupt-if-running, record message, then start a new stream.
5125
5159
  await this.handleSendAgentMessage(agentId, result.text);
5160
+ await this.flushPendingAudioSegments('transcription complete');
5126
5161
  }
5127
5162
  registerVoiceBridgeForAgent(agentId) {
5128
5163
  this.registerVoiceSpeakHandler?.(agentId, async ({ text, signal }) => {
@@ -5203,8 +5238,6 @@ export class Session {
5203
5238
  this.sessionLogger.debug({ chunks: this.audioBuffer.chunks.length, pcmBytes: this.audioBuffer.totalPCMBytes }, `Clearing partial audio buffer (${this.audioBuffer.chunks.length} chunk(s)${this.audioBuffer.isPCM ? `, ${this.audioBuffer.totalPCMBytes} PCM bytes` : ''})`);
5204
5239
  this.audioBuffer = null;
5205
5240
  }
5206
- this.cancelActiveVoiceDictationStream('new speech turn started');
5207
- this.clearVoiceModeInactivityTimeout();
5208
5241
  this.clearBufferTimeout();
5209
5242
  this.abortController.abort();
5210
5243
  await this.handleAbort();
@@ -5215,6 +5248,7 @@ export class Session {
5215
5248
  * Clear speech-in-progress flag once the user turn has completed
5216
5249
  */
5217
5250
  clearSpeechInProgress(reason) {
5251
+ this.clearPendingVoiceSpeechStart(`clear-speech-in-progress:${reason}`);
5218
5252
  if (!this.speechInProgress) {
5219
5253
  return;
5220
5254
  }
@@ -5244,6 +5278,11 @@ export class Session {
5244
5278
  this.clearBufferTimeout();
5245
5279
  this.bufferTimeout = setTimeout(async () => {
5246
5280
  this.sessionLogger.debug('Buffer timeout reached, processing pending segments');
5281
+ if (this.processingPhase === 'transcribing') {
5282
+ this.sessionLogger.debug({ segmentCount: this.pendingAudioSegments.length }, 'Buffer timeout deferred because transcription is still in progress');
5283
+ this.setBufferTimeout();
5284
+ return;
5285
+ }
5247
5286
  if (this.pendingAudioSegments.length > 0) {
5248
5287
  const segments = [...this.pendingAudioSegments];
5249
5288
  this.pendingAudioSegments = [];
@@ -5253,32 +5292,6 @@ export class Session {
5253
5292
  }
5254
5293
  }, 10000); // 10 second timeout
5255
5294
  }
5256
- setVoiceModeInactivityTimeout() {
5257
- if (!this.isVoiceMode) {
5258
- return;
5259
- }
5260
- this.clearVoiceModeInactivityTimeout();
5261
- this.voiceModeInactivityTimeout = setTimeout(() => {
5262
- this.voiceModeInactivityTimeout = null;
5263
- if (!this.isVoiceMode || !this.activeVoiceDictationId) {
5264
- return;
5265
- }
5266
- this.sessionLogger.warn({
5267
- timeoutMs: VOICE_MODE_INACTIVITY_FLUSH_MS,
5268
- dictationId: this.activeVoiceDictationId,
5269
- nextSeq: this.activeVoiceDictationNextSeq,
5270
- }, 'Voice mode inactivity timeout reached without isLast; finalizing active voice dictation stream');
5271
- void this.finalizeActiveVoiceDictationStream('inactivity timeout').catch((error) => {
5272
- this.sessionLogger.error({ err: error }, 'Failed to finalize voice dictation stream after inactivity timeout');
5273
- });
5274
- }, VOICE_MODE_INACTIVITY_FLUSH_MS);
5275
- }
5276
- clearVoiceModeInactivityTimeout() {
5277
- if (this.voiceModeInactivityTimeout) {
5278
- clearTimeout(this.voiceModeInactivityTimeout);
5279
- this.voiceModeInactivityTimeout = null;
5280
- }
5281
- }
5282
5295
  /**
5283
5296
  * Clear buffer timeout
5284
5297
  */
@@ -5354,16 +5367,14 @@ export class Session {
5354
5367
  // Abort any ongoing operations
5355
5368
  this.abortController.abort();
5356
5369
  // Clear timeouts
5357
- this.clearVoiceModeInactivityTimeout();
5358
5370
  this.clearBufferTimeout();
5359
5371
  // Clear buffers
5360
- this.cancelActiveVoiceDictationStream('session cleanup');
5361
5372
  this.pendingAudioSegments = [];
5362
5373
  this.audioBuffer = null;
5374
+ await this.stopVoiceTurnController();
5363
5375
  // Cleanup managers
5364
5376
  this.ttsManager.cleanup();
5365
5377
  this.sttManager.cleanup();
5366
- this.voiceStreamManager.cleanupAll();
5367
5378
  this.dictationStreamManager.cleanupAll();
5368
5379
  // Close MCP clients
5369
5380
  if (this.agentMcpClient) {
@@ -5749,7 +5760,7 @@ export class Session {
5749
5760
  const streamId = this.allocateTerminalStreamId();
5750
5761
  const requestedResumeOffset = typeof msg.resumeOffset === 'number'
5751
5762
  ? msg.resumeOffset
5752
- : session.getOutputOffset();
5763
+ : 0;
5753
5764
  const initialOffset = Math.max(0, Math.floor(requestedResumeOffset));
5754
5765
  const binding = {
5755
5766
  terminalId: msg.terminalId,