@getpaseo/server 0.1.97 → 0.1.99

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. package/dist/server/server/agent/agent-manager.d.ts +11 -3
  2. package/dist/server/server/agent/agent-manager.js +96 -24
  3. package/dist/server/server/agent/agent-prompt.d.ts +1 -1
  4. package/dist/server/server/agent/agent-prompt.js +3 -10
  5. package/dist/server/server/agent/agent-sdk-types.d.ts +20 -9
  6. package/dist/server/server/agent/create-agent/create.d.ts +2 -0
  7. package/dist/server/server/agent/create-agent/create.js +8 -7
  8. package/dist/server/server/agent/lifecycle-command.d.ts +15 -1
  9. package/dist/server/server/agent/lifecycle-command.js +9 -2
  10. package/dist/server/server/agent/mcp-server.js +254 -115
  11. package/dist/server/server/agent/provider-notices.d.ts +3 -0
  12. package/dist/server/server/agent/provider-notices.js +5 -0
  13. package/dist/server/server/agent/provider-registry.d.ts +8 -3
  14. package/dist/server/server/agent/provider-registry.js +58 -25
  15. package/dist/server/server/agent/provider-snapshot-manager.d.ts +3 -0
  16. package/dist/server/server/agent/provider-snapshot-manager.js +37 -16
  17. package/dist/server/server/agent/providers/acp-agent.d.ts +5 -3
  18. package/dist/server/server/agent/providers/acp-agent.js +32 -19
  19. package/dist/server/server/agent/providers/claude/agent.d.ts +2 -2
  20. package/dist/server/server/agent/providers/claude/agent.js +261 -167
  21. package/dist/server/server/agent/providers/claude/models.js +7 -3
  22. package/dist/server/server/agent/providers/codex-app-server-agent.d.ts +6 -4
  23. package/dist/server/server/agent/providers/codex-app-server-agent.js +48 -25
  24. package/dist/server/server/agent/providers/copilot-acp-agent.js +4 -31
  25. package/dist/server/server/agent/providers/diagnostic-utils.d.ts +9 -0
  26. package/dist/server/server/agent/providers/diagnostic-utils.js +188 -0
  27. package/dist/server/server/agent/providers/generic-acp-agent.d.ts +0 -1
  28. package/dist/server/server/agent/providers/generic-acp-agent.js +2 -108
  29. package/dist/server/server/agent/providers/mock-load-test-agent.d.ts +2 -3
  30. package/dist/server/server/agent/providers/mock-load-test-agent.js +5 -5
  31. package/dist/server/server/agent/providers/mock-slow-provider.d.ts +2 -3
  32. package/dist/server/server/agent/providers/mock-slow-provider.js +3 -6
  33. package/dist/server/server/agent/providers/opencode/server-manager.d.ts +29 -2
  34. package/dist/server/server/agent/providers/opencode/server-manager.js +83 -17
  35. package/dist/server/server/agent/providers/opencode-agent.d.ts +6 -3
  36. package/dist/server/server/agent/providers/opencode-agent.js +61 -107
  37. package/dist/server/server/agent/providers/pi/agent.d.ts +2 -3
  38. package/dist/server/server/agent/providers/pi/agent.js +11 -63
  39. package/dist/server/server/agent/providers/pi/cli-runtime.js +2 -2
  40. package/dist/server/server/agent/providers/pi/runtime.d.ts +1 -1
  41. package/dist/server/server/agent/providers/pi/test-utils/fake-pi.d.ts +1 -1
  42. package/dist/server/server/agent/providers/pi/test-utils/fake-pi.js +1 -1
  43. package/dist/server/server/bootstrap.d.ts +2 -0
  44. package/dist/server/server/bootstrap.js +32 -2
  45. package/dist/server/server/managed-processes/managed-processes.d.ts +76 -0
  46. package/dist/server/server/managed-processes/managed-processes.js +326 -0
  47. package/dist/server/server/resolve-worktree-creation-intent.d.ts +3 -0
  48. package/dist/server/server/resolve-worktree-creation-intent.js +3 -3
  49. package/dist/server/server/session/agent-config/agent-config-session.d.ts +50 -0
  50. package/dist/server/server/session/agent-config/agent-config-session.js +98 -0
  51. package/dist/server/server/session/chat/chat-schedule-loop-session.d.ts +120 -0
  52. package/dist/server/server/session/chat/chat-schedule-loop-session.js +489 -0
  53. package/dist/server/server/session/checkout/checkout-session.d.ts +142 -0
  54. package/dist/server/server/session/checkout/checkout-session.js +925 -0
  55. package/dist/server/server/session/daemon/daemon-session.d.ts +50 -0
  56. package/dist/server/server/session/daemon/daemon-session.js +98 -0
  57. package/dist/server/server/session/files/workspace-files-session.d.ts +43 -0
  58. package/dist/server/server/session/files/workspace-files-session.js +218 -0
  59. package/dist/server/server/session/project-config/project-config-session.d.ts +34 -0
  60. package/dist/server/server/session/project-config/project-config-session.js +125 -0
  61. package/dist/server/server/session/provider/provider-catalog-session.d.ts +74 -0
  62. package/dist/server/server/session/provider/provider-catalog-session.js +339 -0
  63. package/dist/server/server/session/voice/voice-session.d.ts +166 -0
  64. package/dist/server/server/session/voice/voice-session.js +893 -0
  65. package/dist/server/server/{voice → session/voice}/voice-turn-controller.d.ts +2 -2
  66. package/dist/server/server/{voice → session/voice}/voice-turn-controller.js +2 -2
  67. package/dist/server/server/session.d.ts +23 -207
  68. package/dist/server/server/session.js +2319 -5102
  69. package/dist/server/server/speech/providers/openai/runtime.js +3 -4
  70. package/dist/server/server/websocket-server.d.ts +1 -0
  71. package/dist/server/server/websocket-server.js +11 -0
  72. package/dist/server/server/workspace-archive-service.js +2 -3
  73. package/dist/server/server/workspace-directory.js +5 -5
  74. package/dist/server/server/workspace-reconciliation-service.js +2 -2
  75. package/dist/server/server/worktree-core.d.ts +1 -0
  76. package/dist/server/server/worktree-core.js +5 -1
  77. package/dist/server/services/quota-fetcher/manifest.d.ts +4 -0
  78. package/dist/server/services/quota-fetcher/manifest.js +47 -0
  79. package/dist/server/services/quota-fetcher/provider.d.ts +17 -0
  80. package/dist/server/services/quota-fetcher/provider.js +2 -0
  81. package/dist/server/services/quota-fetcher/providers/claude.d.ts +26 -0
  82. package/dist/server/services/quota-fetcher/providers/claude.js +217 -0
  83. package/dist/server/services/quota-fetcher/providers/codex.d.ts +23 -0
  84. package/dist/server/services/quota-fetcher/providers/codex.js +211 -0
  85. package/dist/server/services/quota-fetcher/providers/copilot.d.ts +17 -0
  86. package/dist/server/services/quota-fetcher/providers/copilot.js +75 -0
  87. package/dist/server/services/quota-fetcher/providers/cursor.d.ts +17 -0
  88. package/dist/server/services/quota-fetcher/providers/cursor.js +123 -0
  89. package/dist/server/services/quota-fetcher/providers/grok.d.ts +18 -0
  90. package/dist/server/services/quota-fetcher/providers/grok.js +89 -0
  91. package/dist/server/services/quota-fetcher/providers/kimi.d.ts +20 -0
  92. package/dist/server/services/quota-fetcher/providers/kimi.js +89 -0
  93. package/dist/server/services/quota-fetcher/providers/zai.d.ts +17 -0
  94. package/dist/server/services/quota-fetcher/providers/zai.js +58 -0
  95. package/dist/server/services/quota-fetcher/service.d.ts +28 -0
  96. package/dist/server/services/quota-fetcher/service.js +58 -0
  97. package/dist/server/services/quota-fetcher/usage.d.ts +22 -0
  98. package/dist/server/services/quota-fetcher/usage.js +49 -0
  99. package/dist/server/utils/checkout-git.d.ts +6 -0
  100. package/dist/server/utils/directory-suggestions.js +98 -2
  101. package/package.json +5 -5
@@ -0,0 +1,893 @@
1
+ import { v4 as uuidv4 } from "uuid";
2
+ import { z } from "zod";
3
+ import { getErrorMessage } from "@getpaseo/protocol/error-utils";
4
+ import { TTSManager } from "../../agent/tts-manager.js";
5
+ import { STTManager } from "../../agent/stt-manager.js";
6
+ import { maybePersistTtsDebugAudio } from "../../agent/tts-debug.js";
7
+ import { isPaseoDictationDebugEnabled } from "../../agent/recordings-debug.js";
8
+ import { DictationStreamManager, } from "../../dictation/dictation-stream-manager.js";
9
+ import { createVoiceTurnController } from "./voice-turn-controller.js";
10
+ import { buildVoiceModeSystemPrompt, stripVoiceModeSystemPrompt } from "../../voice-config.js";
11
+ import { toResolver } from "../../speech/provider-resolver.js";
12
// PCM layout used when wrapping buffered raw PCM chunks in a WAV container.
const PCM_SAMPLE_RATE = 16000;
const PCM_CHANNELS = 1;
const PCM_BITS_PER_SAMPLE = 16;
// Bytes of PCM audio per millisecond for the layout above.
const PCM_BYTES_PER_MS = (PCM_SAMPLE_RATE * PCM_CHANNELS * (PCM_BITS_PER_SAMPLE / 8)) / 1000;
// Size threshold (expressed as a duration, then converted to bytes) used by
// the buffered, non-voice-mode audio path when deciding whether a PCM segment
// is long enough to transcribe.
const MIN_STREAMING_SEGMENT_DURATION_MS = 1000;
const MIN_STREAMING_SEGMENT_BYTES = Math.round(MIN_STREAMING_SEGMENT_DURATION_MS * PCM_BYTES_PER_MS);
// Validator for agent ids supplied by clients when toggling voice mode.
const AgentIdSchema = z.guid();
19
/**
 * Error thrown when a voice feature cannot be used. Carries the machine
 * readable details that are copied into the failure response sent to clients.
 */
class VoiceFeatureUnavailableError extends Error {
    /**
     * @param context - Object with `message`, `reasonCode`, `retryable` and an
     *   iterable `missingModelIds`; the id list is copied, not shared.
     */
    constructor(context) {
        const { message, reasonCode, retryable, missingModelIds } = context;
        super(message);
        this.name = "VoiceFeatureUnavailableError";
        this.reasonCode = reasonCode;
        this.retryable = retryable;
        this.missingModelIds = [...missingModelIds];
    }
}
28
/**
 * Wrap raw PCM samples in a 44-byte RIFF/WAVE header (format tag 1 = PCM).
 *
 * @param {Buffer} pcmBuffer - Raw PCM sample bytes.
 * @param {number} sampleRate - Samples per second.
 * @param {number} channels - Channel count.
 * @param {number} bitsPerSample - Bits per sample.
 * @returns {Buffer} A new buffer: the header followed by a copy of the PCM data.
 */
function convertPCMToWavBuffer(pcmBuffer, sampleRate, channels, bitsPerSample) {
    const HEADER_BYTES = 44;
    const dataBytes = pcmBuffer.length;
    const bytesPerSecond = (sampleRate * channels * bitsPerSample) / 8;
    const bytesPerFrame = (channels * bitsPerSample) / 8;
    const wav = Buffer.alloc(HEADER_BYTES + dataBytes);
    // Sequential writer: each helper writes at the cursor and advances it.
    let cursor = 0;
    const putTag = (tag) => {
        wav.write(tag, cursor);
        cursor += 4;
    };
    const putUInt32 = (value) => {
        cursor = wav.writeUInt32LE(value, cursor);
    };
    const putUInt16 = (value) => {
        cursor = wav.writeUInt16LE(value, cursor);
    };
    // RIFF chunk descriptor
    putTag("RIFF");
    putUInt32(36 + dataBytes);
    putTag("WAVE");
    // "fmt " sub-chunk
    putTag("fmt ");
    putUInt32(16);
    putUInt16(1);
    putUInt16(channels);
    putUInt32(sampleRate);
    putUInt32(bytesPerSecond);
    putUInt16(bytesPerFrame);
    putUInt16(bitsPerSample);
    // "data" sub-chunk
    putTag("data");
    putUInt32(dataBytes);
    pcmBuffer.copy(wav, cursor);
    return wav;
}
49
+ /**
50
+ * Owns the voice half of a client session: speech-to-text/text-to-speech
51
+ * managers, dictation streaming, the barge-in audio-buffering state machine,
52
+ * voice-turn detection, and the MCP voice bridge. The session delegates the
53
+ * voice/dictation/abort message types here and otherwise knows nothing about
54
+ * audio buffering or processing phases.
55
+ */
56
+ export class VoiceSession {
57
+ constructor(options) {
58
+ this.processingPhase = "idle";
59
+ this.isVoiceMode = false;
60
+ this.speechInProgress = false;
61
+ this.voiceTurnController = null;
62
+ this.voiceInputChunkCount = 0;
63
+ this.voiceInputBytes = 0;
64
+ this.voiceInputWindowStartedAt = Date.now();
65
+ // Audio buffering for interruption handling
66
+ this.pendingAudioSegments = [];
67
+ this.bufferTimeout = null;
68
+ this.audioBuffer = null;
69
+ // Optional TTS debug capture (persisted per utterance)
70
+ this.ttsDebugStreams = new Map();
71
+ this.voiceModeAgentId = null;
72
+ this.voiceModeBaseConfig = null;
73
+ const { host, logger, sessionId, sttLanguage, tts, stt, voice, voiceBridge, dictation } = options;
74
+ this.host = host;
75
+ this.sessionLogger = logger;
76
+ this.sessionId = sessionId;
77
+ this.sttLanguage = sttLanguage ?? "en";
78
+ this.abortController = new AbortController();
79
+ this.resolveVoiceTurnDetection = toResolver(voice?.turnDetection ?? null);
80
+ this.registerVoiceSpeakHandler = voiceBridge?.registerVoiceSpeakHandler;
81
+ this.unregisterVoiceSpeakHandler = voiceBridge?.unregisterVoiceSpeakHandler;
82
+ this.registerVoiceCallerContext = voiceBridge?.registerVoiceCallerContext;
83
+ this.unregisterVoiceCallerContext = voiceBridge?.unregisterVoiceCallerContext;
84
+ this.getSpeechReadiness = dictation?.getSpeechReadiness;
85
+ this.ttsManager = new TTSManager(this.sessionId, this.sessionLogger, tts);
86
+ this.sttManager = new STTManager(this.sessionId, this.sessionLogger, stt, {
87
+ language: sttLanguage,
88
+ });
89
+ this.dictationStreamManager = new DictationStreamManager({
90
+ logger: this.sessionLogger,
91
+ sessionId: this.sessionId,
92
+ emit: (msg) => this.handleDictationManagerMessage(msg),
93
+ stt: dictation?.stt ?? null,
94
+ language: dictation?.sttLanguage,
95
+ finalTimeoutMs: dictation?.finalTimeoutMs,
96
+ });
97
+ }
98
+ isActiveForAgent(agentId) {
99
+ return this.isVoiceMode && this.voiceModeAgentId === agentId;
100
+ }
101
+ handleDictationChunk(params) {
102
+ return this.dictationStreamManager.handleChunk(params);
103
+ }
104
+ handleDictationFinish(dictationId, finalSeq) {
105
+ return this.dictationStreamManager.handleFinish(dictationId, finalSeq);
106
+ }
107
+ handleDictationCancel(dictationId) {
108
+ this.dictationStreamManager.handleCancel(dictationId);
109
+ }
110
+ async handleDictationStreamStart(msg) {
111
+ const unavailable = this.resolveVoiceFeatureUnavailableContext("dictation");
112
+ if (unavailable) {
113
+ this.emit({
114
+ type: "dictation_stream_error",
115
+ payload: {
116
+ dictationId: msg.dictationId,
117
+ error: unavailable.message,
118
+ retryable: unavailable.retryable,
119
+ reasonCode: unavailable.reasonCode,
120
+ missingModelIds: unavailable.missingModelIds,
121
+ },
122
+ });
123
+ return;
124
+ }
125
+ await this.dictationStreamManager.handleStart(msg.dictationId, msg.format);
126
+ }
127
+ toVoiceFeatureUnavailableContext(state) {
128
+ return {
129
+ reasonCode: state.reasonCode,
130
+ message: state.message,
131
+ retryable: state.retryable,
132
+ missingModelIds: [...state.missingModelIds],
133
+ };
134
+ }
135
+ resolveModeReadinessState(readiness, mode) {
136
+ if (mode === "voice_mode") {
137
+ return readiness.realtimeVoice;
138
+ }
139
+ return readiness.dictation;
140
+ }
141
+ getVoiceFeatureUnavailableResponseMetadata(error) {
142
+ if (!(error instanceof VoiceFeatureUnavailableError)) {
143
+ return {};
144
+ }
145
+ return {
146
+ reasonCode: error.reasonCode,
147
+ retryable: error.retryable,
148
+ missingModelIds: error.missingModelIds,
149
+ };
150
+ }
151
+ resolveVoiceFeatureUnavailableContext(mode) {
152
+ const readiness = this.getSpeechReadiness?.();
153
+ if (!readiness) {
154
+ return null;
155
+ }
156
+ const modeReadiness = this.resolveModeReadinessState(readiness, mode);
157
+ if (!modeReadiness.enabled) {
158
+ return this.toVoiceFeatureUnavailableContext(modeReadiness);
159
+ }
160
+ if (!readiness.voiceFeature.available) {
161
+ return this.toVoiceFeatureUnavailableContext(readiness.voiceFeature);
162
+ }
163
+ if (!modeReadiness.available) {
164
+ return this.toVoiceFeatureUnavailableContext(modeReadiness);
165
+ }
166
+ return null;
167
+ }
168
+ /**
169
+ * Handle voice mode toggle
170
+ */
171
+ async handleSetVoiceMode(enabled, agentId, requestId) {
172
+ const startedAt = Date.now();
173
+ try {
174
+ this.sessionLogger.info({ enabled, requestedAgentId: agentId ?? null, requestId: requestId ?? null }, "set_voice_mode started");
175
+ if (enabled) {
176
+ const unavailable = this.resolveVoiceFeatureUnavailableContext("voice_mode");
177
+ if (unavailable) {
178
+ throw new VoiceFeatureUnavailableError(unavailable);
179
+ }
180
+ const normalizedAgentId = this.parseVoiceTargetAgentId(agentId ?? "", "set_voice_mode");
181
+ if (this.isVoiceMode &&
182
+ this.voiceModeAgentId &&
183
+ this.voiceModeAgentId !== normalizedAgentId) {
184
+ this.sessionLogger.info({
185
+ previousAgentId: this.voiceModeAgentId,
186
+ nextAgentId: normalizedAgentId,
187
+ elapsedMs: Date.now() - startedAt,
188
+ }, "set_voice_mode disabling previous active voice agent");
189
+ await this.disableVoiceModeForActiveAgent(true);
190
+ }
191
+ if (!this.isVoiceMode || this.voiceModeAgentId !== normalizedAgentId) {
192
+ this.sessionLogger.info({ agentId: normalizedAgentId, elapsedMs: Date.now() - startedAt }, "set_voice_mode enabling voice for agent");
193
+ const refreshedAgentId = await this.enableVoiceModeForAgent(normalizedAgentId);
194
+ this.voiceModeAgentId = refreshedAgentId;
195
+ this.sessionLogger.info({ agentId: refreshedAgentId, elapsedMs: Date.now() - startedAt }, "set_voice_mode agent enable complete");
196
+ }
197
+ this.sessionLogger.info({ agentId: this.voiceModeAgentId, elapsedMs: Date.now() - startedAt }, "set_voice_mode starting voice turn controller");
198
+ await this.startVoiceTurnController();
199
+ this.sessionLogger.info({ agentId: this.voiceModeAgentId, elapsedMs: Date.now() - startedAt }, "set_voice_mode voice turn controller started");
200
+ this.isVoiceMode = true;
201
+ this.sessionLogger.info({
202
+ agentId: this.voiceModeAgentId,
203
+ elapsedMs: Date.now() - startedAt,
204
+ }, "Voice mode enabled for existing agent");
205
+ if (requestId) {
206
+ this.emit({
207
+ type: "set_voice_mode_response",
208
+ payload: {
209
+ requestId,
210
+ enabled: true,
211
+ agentId: this.voiceModeAgentId,
212
+ accepted: true,
213
+ error: null,
214
+ },
215
+ });
216
+ }
217
+ return;
218
+ }
219
+ this.sessionLogger.info({ agentId: this.voiceModeAgentId, elapsedMs: Date.now() - startedAt }, "set_voice_mode disabling active voice mode");
220
+ await this.disableVoiceModeForActiveAgent(true);
221
+ this.isVoiceMode = false;
222
+ this.sessionLogger.info({ elapsedMs: Date.now() - startedAt }, "Voice mode disabled");
223
+ if (requestId) {
224
+ this.emit({
225
+ type: "set_voice_mode_response",
226
+ payload: {
227
+ requestId,
228
+ enabled: false,
229
+ agentId: null,
230
+ accepted: true,
231
+ error: null,
232
+ },
233
+ });
234
+ }
235
+ }
236
+ catch (error) {
237
+ const errorMessage = error instanceof Error ? error.message : "Failed to set voice mode";
238
+ const unavailable = this.getVoiceFeatureUnavailableResponseMetadata(error);
239
+ this.sessionLogger.error({
240
+ err: error,
241
+ enabled,
242
+ requestedAgentId: agentId ?? null,
243
+ elapsedMs: Date.now() - startedAt,
244
+ }, "set_voice_mode failed");
245
+ if (requestId) {
246
+ this.emit({
247
+ type: "set_voice_mode_response",
248
+ payload: {
249
+ requestId,
250
+ enabled: this.isVoiceMode,
251
+ agentId: this.voiceModeAgentId,
252
+ accepted: false,
253
+ error: errorMessage,
254
+ ...unavailable,
255
+ },
256
+ });
257
+ return;
258
+ }
259
+ throw error;
260
+ }
261
+ }
262
+ parseVoiceTargetAgentId(rawId, source) {
263
+ const parsed = AgentIdSchema.safeParse(rawId.trim());
264
+ if (!parsed.success) {
265
+ throw new Error(`${source}: agentId must be a UUID`);
266
+ }
267
+ return parsed.data;
268
+ }
269
+ async enableVoiceModeForAgent(agentId) {
270
+ const startedAt = Date.now();
271
+ this.sessionLogger.info({ agentId }, "enableVoiceModeForAgent.ensureAgentLoaded.start");
272
+ const existing = await this.host.loadAgent(agentId);
273
+ this.sessionLogger.info({ agentId, elapsedMs: Date.now() - startedAt }, "enableVoiceModeForAgent.ensureAgentLoaded.done");
274
+ this.registerVoiceBridgeForAgent(agentId);
275
+ const baseConfig = {
276
+ systemPrompt: stripVoiceModeSystemPrompt(existing.config.systemPrompt),
277
+ };
278
+ this.voiceModeBaseConfig = baseConfig;
279
+ const refreshOverrides = {
280
+ systemPrompt: buildVoiceModeSystemPrompt(baseConfig.systemPrompt, true),
281
+ };
282
+ try {
283
+ this.sessionLogger.info({ agentId, elapsedMs: Date.now() - startedAt }, "enableVoiceModeForAgent.reloadAgentSession.start");
284
+ const refreshed = await this.host.reloadAgentSession(agentId, refreshOverrides);
285
+ this.sessionLogger.info({ agentId, refreshedAgentId: refreshed.id, elapsedMs: Date.now() - startedAt }, "enableVoiceModeForAgent.reloadAgentSession.done");
286
+ return refreshed.id;
287
+ }
288
+ catch (error) {
289
+ this.unregisterVoiceSpeakHandler?.(agentId);
290
+ this.unregisterVoiceCallerContext?.(agentId);
291
+ this.voiceModeBaseConfig = null;
292
+ throw error;
293
+ }
294
+ }
295
+ async disableVoiceModeForActiveAgent(restoreAgentConfig) {
296
+ await this.stopVoiceTurnController();
297
+ const agentId = this.voiceModeAgentId;
298
+ if (!agentId) {
299
+ this.voiceModeBaseConfig = null;
300
+ return;
301
+ }
302
+ this.unregisterVoiceSpeakHandler?.(agentId);
303
+ this.unregisterVoiceCallerContext?.(agentId);
304
+ if (restoreAgentConfig && this.voiceModeBaseConfig) {
305
+ const baseConfig = this.voiceModeBaseConfig;
306
+ try {
307
+ await this.host.reloadAgentSession(agentId, {
308
+ systemPrompt: buildVoiceModeSystemPrompt(baseConfig.systemPrompt, false),
309
+ });
310
+ }
311
+ catch (error) {
312
+ this.sessionLogger.warn({ err: error, agentId }, "Failed to restore agent config while disabling voice mode");
313
+ }
314
+ }
315
+ this.voiceModeBaseConfig = null;
316
+ this.voiceModeAgentId = null;
317
+ }
318
+ handleDictationManagerMessage(msg) {
319
+ this.emit(msg);
320
+ }
321
+ async startVoiceTurnController() {
322
+ if (this.voiceTurnController) {
323
+ this.sessionLogger.info("startVoiceTurnController skipped: already running");
324
+ return;
325
+ }
326
+ const turnDetection = this.resolveVoiceTurnDetection();
327
+ if (!turnDetection) {
328
+ throw new Error("Voice turn detection is not configured");
329
+ }
330
+ const stt = this.sttManager.getProvider();
331
+ if (!stt) {
332
+ throw new Error("Voice speech-to-text is not configured");
333
+ }
334
+ this.sessionLogger.info({ providerId: turnDetection.id }, "startVoiceTurnController creating controller");
335
+ const controller = createVoiceTurnController({
336
+ logger: this.sessionLogger.child({ component: "voice-turn-controller" }),
337
+ turnDetection,
338
+ stt,
339
+ sttLanguage: this.sttLanguage,
340
+ callbacks: {
341
+ onSpeechStarted: async () => {
342
+ this.sessionLogger.debug("Voice VAD speech_started");
343
+ },
344
+ onPartialTranscript: async ({ segmentId, transcript }) => {
345
+ this.sessionLogger.info({ segmentId, transcriptLength: transcript.trim().length }, "voice_input_state emitting isSpeaking=true");
346
+ this.emit({
347
+ type: "voice_input_state",
348
+ payload: {
349
+ isSpeaking: true,
350
+ },
351
+ });
352
+ await this.handleVoiceSpeechStart();
353
+ },
354
+ onSpeechStopped: async () => {
355
+ this.handleVoiceSpeechStopped();
356
+ this.setPhase("transcribing");
357
+ this.emit({
358
+ type: "activity_log",
359
+ payload: {
360
+ id: uuidv4(),
361
+ timestamp: new Date(),
362
+ type: "system",
363
+ content: "Transcribing audio...",
364
+ },
365
+ });
366
+ },
367
+ onFinalTranscript: async ({ transcript, language, durationMs, avgLogprob, isLowConfidence, }) => {
368
+ const requestId = uuidv4();
369
+ const transcriptText = isLowConfidence ? "" : transcript.trim();
370
+ if (isLowConfidence) {
371
+ this.sessionLogger.debug({ text: transcript, avgLogprob }, "Filtered low-confidence transcription (likely non-speech)");
372
+ }
373
+ this.sessionLogger.info({
374
+ requestId,
375
+ isVoiceMode: this.isVoiceMode,
376
+ transcriptLength: transcriptText.length,
377
+ transcript: transcriptText,
378
+ }, "Transcription result");
379
+ await this.handleTranscriptionResultPayload({
380
+ text: transcriptText,
381
+ requestId,
382
+ ...(language ? { language } : {}),
383
+ duration: durationMs,
384
+ ...(avgLogprob !== undefined ? { avgLogprob } : {}),
385
+ ...(isLowConfidence !== undefined ? { isLowConfidence } : {}),
386
+ });
387
+ },
388
+ onError: (error) => {
389
+ this.sessionLogger.error({ err: error }, "Voice turn controller failed");
390
+ },
391
+ },
392
+ });
393
+ this.sessionLogger.info("startVoiceTurnController connecting controller");
394
+ await controller.start();
395
+ this.voiceTurnController = controller;
396
+ this.sessionLogger.info("startVoiceTurnController connected");
397
+ }
398
+ async stopVoiceTurnController() {
399
+ if (!this.voiceTurnController) {
400
+ return;
401
+ }
402
+ const controller = this.voiceTurnController;
403
+ this.voiceTurnController = null;
404
+ await controller.stop();
405
+ }
406
+ handleVoiceSpeechStopped() {
407
+ this.sessionLogger.info("voice_input_state emitting isSpeaking=false");
408
+ this.emit({
409
+ type: "voice_input_state",
410
+ payload: {
411
+ isSpeaking: false,
412
+ },
413
+ });
414
+ }
415
+ async ensureAudioBufferForFormat(chunkFormat, isPCMChunk) {
416
+ if (!this.audioBuffer) {
417
+ this.audioBuffer = {
418
+ chunks: [],
419
+ format: chunkFormat,
420
+ isPCM: isPCMChunk,
421
+ totalPCMBytes: 0,
422
+ };
423
+ return this.audioBuffer;
424
+ }
425
+ if (this.audioBuffer.isPCM !== isPCMChunk) {
426
+ this.sessionLogger.debug({
427
+ oldFormat: this.audioBuffer.isPCM ? "pcm" : this.audioBuffer.format,
428
+ newFormat: chunkFormat,
429
+ }, `Audio format changed mid-stream, flushing current buffer`);
430
+ const finalized = this.finalizeBufferedAudio();
431
+ if (finalized) {
432
+ await this.processCompletedAudio(finalized.audio, finalized.format);
433
+ }
434
+ this.audioBuffer = {
435
+ chunks: [],
436
+ format: chunkFormat,
437
+ isPCM: isPCMChunk,
438
+ totalPCMBytes: 0,
439
+ };
440
+ return this.audioBuffer;
441
+ }
442
+ if (!this.audioBuffer.isPCM) {
443
+ this.audioBuffer.format = chunkFormat;
444
+ }
445
+ return this.audioBuffer;
446
+ }
447
+ async forwardAudioChunkToVoiceTurn(msg, chunkFormat) {
448
+ if (!this.voiceTurnController) {
449
+ throw new Error("Voice mode is enabled but the voice turn controller is not running");
450
+ }
451
+ const chunkBytes = Buffer.byteLength(msg.audio, "base64");
452
+ this.voiceInputChunkCount += 1;
453
+ this.voiceInputBytes += chunkBytes;
454
+ const now = Date.now();
455
+ if (this.voiceInputChunkCount % 50 === 0 || now - this.voiceInputWindowStartedAt >= 1000) {
456
+ this.sessionLogger.info({
457
+ chunkCount: this.voiceInputChunkCount,
458
+ audioBytes: this.voiceInputBytes,
459
+ windowMs: now - this.voiceInputWindowStartedAt,
460
+ format: chunkFormat,
461
+ }, "Voice input chunk summary");
462
+ this.voiceInputWindowStartedAt = now;
463
+ this.voiceInputChunkCount = 0;
464
+ this.voiceInputBytes = 0;
465
+ }
466
+ await this.voiceTurnController.appendClientChunk({
467
+ audioBase64: msg.audio,
468
+ format: chunkFormat,
469
+ });
470
+ }
471
+ async handleAudioChunk(msg) {
472
+ if (!this.isVoiceMode) {
473
+ this.sessionLogger.warn("Received voice_audio_chunk while voice mode is disabled; transcript will be emitted but voice assistant turn is skipped");
474
+ }
475
+ const chunkFormat = msg.format || "audio/wav";
476
+ if (this.isVoiceMode) {
477
+ await this.forwardAudioChunkToVoiceTurn(msg, chunkFormat);
478
+ return;
479
+ }
480
+ const chunkBuffer = Buffer.from(msg.audio, "base64");
481
+ const isPCMChunk = chunkFormat.toLowerCase().includes("pcm");
482
+ const buffer = await this.ensureAudioBufferForFormat(chunkFormat, isPCMChunk);
483
+ buffer.chunks.push(chunkBuffer);
484
+ if (buffer.isPCM) {
485
+ buffer.totalPCMBytes += chunkBuffer.length;
486
+ }
487
+ // In non-voice mode, use streaming threshold to process chunks
488
+ const reachedStreamingThreshold = !this.isVoiceMode && buffer.isPCM && buffer.totalPCMBytes >= MIN_STREAMING_SEGMENT_BYTES;
489
+ if (!msg.isLast && reachedStreamingThreshold) {
490
+ return;
491
+ }
492
+ const bufferedState = this.audioBuffer;
493
+ const finalized = this.finalizeBufferedAudio();
494
+ if (!finalized) {
495
+ return;
496
+ }
497
+ if (!msg.isLast && reachedStreamingThreshold) {
498
+ this.sessionLogger.debug({
499
+ minDuration: MIN_STREAMING_SEGMENT_DURATION_MS,
500
+ pcmBytes: bufferedState?.totalPCMBytes ?? 0,
501
+ }, `Minimum chunk duration reached (~${MIN_STREAMING_SEGMENT_DURATION_MS}ms, ${bufferedState?.totalPCMBytes ?? 0} PCM bytes) – triggering STT`);
502
+ }
503
+ else {
504
+ this.sessionLogger.debug({ audioBytes: finalized.audio.length, chunks: bufferedState?.chunks.length ?? 0 }, `Complete audio segment (${finalized.audio.length} bytes, ${bufferedState?.chunks.length ?? 0} chunk(s))`);
505
+ }
506
+ await this.processCompletedAudio(finalized.audio, finalized.format);
507
+ }
508
+ finalizeBufferedAudio() {
509
+ if (!this.audioBuffer) {
510
+ return null;
511
+ }
512
+ const bufferState = this.audioBuffer;
513
+ this.audioBuffer = null;
514
+ if (bufferState.isPCM) {
515
+ const pcmBuffer = Buffer.concat(bufferState.chunks);
516
+ const wavBuffer = convertPCMToWavBuffer(pcmBuffer, PCM_SAMPLE_RATE, PCM_CHANNELS, PCM_BITS_PER_SAMPLE);
517
+ return {
518
+ audio: wavBuffer,
519
+ format: "audio/wav",
520
+ };
521
+ }
522
+ return {
523
+ audio: Buffer.concat(bufferState.chunks),
524
+ format: bufferState.format,
525
+ };
526
+ }
527
+ async processCompletedAudio(audio, format) {
528
+ if (this.processingPhase === "transcribing") {
529
+ this.sessionLogger.debug({ phase: this.processingPhase, segmentCount: this.pendingAudioSegments.length + 1 }, `Buffering audio segment (phase: ${this.processingPhase})`);
530
+ this.pendingAudioSegments.push({
531
+ audio,
532
+ format,
533
+ });
534
+ this.setBufferTimeout();
535
+ return;
536
+ }
537
+ if (this.pendingAudioSegments.length > 0) {
538
+ this.pendingAudioSegments.push({
539
+ audio,
540
+ format,
541
+ });
542
+ this.sessionLogger.debug({ segmentCount: this.pendingAudioSegments.length }, `Processing ${this.pendingAudioSegments.length} buffered segments together`);
543
+ const pendingSegments = [...this.pendingAudioSegments];
544
+ this.pendingAudioSegments = [];
545
+ this.clearBufferTimeout();
546
+ const combinedAudio = Buffer.concat(pendingSegments.map((segment) => segment.audio));
547
+ const combinedFormat = pendingSegments[pendingSegments.length - 1].format;
548
+ await this.processAudio(combinedAudio, combinedFormat);
549
+ return;
550
+ }
551
+ await this.processAudio(audio, format);
552
+ }
553
+ async flushPendingAudioSegments(reason) {
554
+ if (this.processingPhase === "transcribing" || this.pendingAudioSegments.length === 0) {
555
+ return;
556
+ }
557
+ const pendingSegments = [...this.pendingAudioSegments];
558
+ this.pendingAudioSegments = [];
559
+ this.clearBufferTimeout();
560
+ this.sessionLogger.debug({ reason, segmentCount: pendingSegments.length }, `Flushing ${pendingSegments.length} buffered audio segment(s)`);
561
+ const combinedAudio = Buffer.concat(pendingSegments.map((segment) => segment.audio));
562
+ const combinedFormat = pendingSegments[pendingSegments.length - 1].format;
563
+ await this.processAudio(combinedAudio, combinedFormat);
564
+ }
565
+ /**
566
+ * Process audio through STT and then LLM
567
+ */
568
+ async processAudio(audio, format) {
569
+ this.setPhase("transcribing");
570
+ this.emit({
571
+ type: "activity_log",
572
+ payload: {
573
+ id: uuidv4(),
574
+ timestamp: new Date(),
575
+ type: "system",
576
+ content: "Transcribing audio...",
577
+ },
578
+ });
579
+ try {
580
+ const requestId = uuidv4();
581
+ const result = await this.sttManager.transcribe(audio, format, {
582
+ requestId,
583
+ label: this.isVoiceMode ? "voice" : "buffered",
584
+ });
585
+ const transcriptText = result.text.trim();
586
+ this.sessionLogger.info({
587
+ requestId,
588
+ isVoiceMode: this.isVoiceMode,
589
+ transcriptLength: transcriptText.length,
590
+ transcript: transcriptText,
591
+ }, "Transcription result");
592
+ await this.handleTranscriptionResultPayload({
593
+ text: result.text,
594
+ language: result.language,
595
+ duration: result.duration,
596
+ requestId,
597
+ avgLogprob: result.avgLogprob,
598
+ isLowConfidence: result.isLowConfidence,
599
+ byteLength: result.byteLength,
600
+ format: result.format,
601
+ debugRecordingPath: result.debugRecordingPath,
602
+ });
603
+ }
604
+ catch (error) {
605
+ this.setPhase("idle");
606
+ this.clearSpeechInProgress("transcription error");
607
+ await this.flushPendingAudioSegments("transcription error");
608
+ this.emit({
609
+ type: "activity_log",
610
+ payload: {
611
+ id: uuidv4(),
612
+ timestamp: new Date(),
613
+ type: "error",
614
+ content: `Transcription error: ${getErrorMessage(error)}`,
615
+ },
616
+ });
617
+ throw error;
618
+ }
619
+ }
620
+ async handleTranscriptionResultPayload(result) {
621
+ const transcriptText = result.text.trim();
622
+ this.emit({
623
+ type: "transcription_result",
624
+ payload: {
625
+ text: result.text,
626
+ ...(result.language ? { language: result.language } : {}),
627
+ ...(result.duration !== undefined ? { duration: result.duration } : {}),
628
+ requestId: result.requestId,
629
+ ...(result.avgLogprob !== undefined ? { avgLogprob: result.avgLogprob } : {}),
630
+ ...(result.isLowConfidence !== undefined
631
+ ? { isLowConfidence: result.isLowConfidence }
632
+ : {}),
633
+ ...(result.byteLength !== undefined ? { byteLength: result.byteLength } : {}),
634
+ ...(result.format ? { format: result.format } : {}),
635
+ ...(result.debugRecordingPath ? { debugRecordingPath: result.debugRecordingPath } : {}),
636
+ },
637
+ });
638
+ if (!transcriptText) {
639
+ this.sessionLogger.debug("Empty transcription (false positive), not aborting");
640
+ this.setPhase("idle");
641
+ this.clearSpeechInProgress("empty transcription");
642
+ await this.flushPendingAudioSegments("empty transcription");
643
+ return;
644
+ }
645
+ // Has content - abort any in-progress stream now
646
+ this.createAbortController();
647
+ if (result.debugRecordingPath) {
648
+ this.emit({
649
+ type: "activity_log",
650
+ payload: {
651
+ id: uuidv4(),
652
+ timestamp: new Date(),
653
+ type: "system",
654
+ content: `Saved input audio: ${result.debugRecordingPath}`,
655
+ metadata: {
656
+ recordingPath: result.debugRecordingPath,
657
+ ...(result.format ? { format: result.format } : {}),
658
+ requestId: result.requestId,
659
+ },
660
+ },
661
+ });
662
+ }
663
+ this.emit({
664
+ type: "activity_log",
665
+ payload: {
666
+ id: uuidv4(),
667
+ timestamp: new Date(),
668
+ type: "transcript",
669
+ content: result.text,
670
+ metadata: {
671
+ ...(result.language ? { language: result.language } : {}),
672
+ ...(result.duration !== undefined ? { duration: result.duration } : {}),
673
+ },
674
+ },
675
+ });
676
+ this.clearSpeechInProgress("transcription complete");
677
+ this.setPhase("idle");
678
+ if (!this.isVoiceMode) {
679
+ this.sessionLogger.debug({ requestId: result.requestId }, "Skipping voice agent processing because voice mode is disabled");
680
+ await this.flushPendingAudioSegments("voice mode disabled");
681
+ return;
682
+ }
683
+ const agentId = this.voiceModeAgentId;
684
+ if (!agentId) {
685
+ this.sessionLogger.warn({ requestId: result.requestId }, "Skipping voice agent processing because no agent is currently voice-enabled");
686
+ await this.flushPendingAudioSegments("no active voice agent");
687
+ return;
688
+ }
689
+ await this.host.sendSpokenInput(agentId, result.text);
690
+ await this.flushPendingAudioSegments("transcription complete");
691
+ }
692
+ registerVoiceBridgeForAgent(agentId) {
693
+ this.registerVoiceSpeakHandler?.(agentId, async ({ text, signal }) => {
694
+ this.sessionLogger.info({
695
+ agentId,
696
+ textLength: text.length,
697
+ preview: text.slice(0, 160),
698
+ }, "Voice speak tool call received by session handler");
699
+ const abortSignal = signal ?? this.abortController.signal;
700
+ await this.ttsManager.generateAndWaitForPlayback(text, (msg) => this.emit(msg), abortSignal, true);
701
+ this.sessionLogger.info({ agentId, textLength: text.length }, "Voice speak tool call finished playback");
702
+ this.emit({
703
+ type: "activity_log",
704
+ payload: {
705
+ id: uuidv4(),
706
+ timestamp: new Date(),
707
+ type: "assistant",
708
+ content: text,
709
+ },
710
+ });
711
+ });
712
+ this.registerVoiceCallerContext?.(agentId, {
713
+ childAgentDefaultLabels: {},
714
+ allowCustomCwd: false,
715
+ enableVoiceTools: true,
716
+ });
717
+ }
718
+ /**
719
+ * Handle abort request from client
720
+ */
721
+ async handleAbort() {
722
+ this.sessionLogger.info({ phase: this.processingPhase }, `Abort request, phase: ${this.processingPhase}`);
723
+ this.abortController.abort();
724
+ this.ttsManager.cancelPendingPlaybacks("abort request");
725
+ // Voice abort should always interrupt active agent output immediately.
726
+ if (this.isVoiceMode && this.voiceModeAgentId) {
727
+ try {
728
+ await this.host.interruptAgentIfRunning(this.voiceModeAgentId);
729
+ }
730
+ catch (error) {
731
+ this.sessionLogger.warn({ err: error, agentId: this.voiceModeAgentId }, "Failed to interrupt active voice-mode agent on abort");
732
+ }
733
+ }
734
+ if (this.processingPhase === "transcribing") {
735
+ // Still in STT phase - we'll buffer the next audio
736
+ this.sessionLogger.debug("Will buffer next audio (currently transcribing)");
737
+ // Phase stays as 'transcribing', handleAudioChunk will handle buffering
738
+ return;
739
+ }
740
+ // Reset phase to idle and clear pending non-voice buffers.
741
+ this.setPhase("idle");
742
+ this.pendingAudioSegments = [];
743
+ this.clearBufferTimeout();
744
+ }
745
+ /**
746
+ * Handle audio playback confirmation from client
747
+ */
748
+ handleAudioPlayed(id) {
749
+ this.ttsManager.confirmAudioPlayed(id);
750
+ }
751
+ /**
752
+ * Mark speech detection start and abort any active playback/agent run.
753
+ */
754
+ async handleVoiceSpeechStart() {
755
+ if (this.speechInProgress) {
756
+ return;
757
+ }
758
+ const chunkReceivedAt = Date.now();
759
+ const phaseBeforeAbort = this.processingPhase;
760
+ const hadActiveStream = this.host.hasActiveAgentRun(this.voiceModeAgentId);
761
+ this.speechInProgress = true;
762
+ this.sessionLogger.debug("Voice speech detected – aborting playback and active agent run");
763
+ if (this.pendingAudioSegments.length > 0) {
764
+ this.sessionLogger.debug({ segmentCount: this.pendingAudioSegments.length }, `Dropping ${this.pendingAudioSegments.length} buffered audio segment(s) due to voice speech`);
765
+ this.pendingAudioSegments = [];
766
+ }
767
+ if (this.audioBuffer) {
768
+ this.sessionLogger.debug({ chunks: this.audioBuffer.chunks.length, pcmBytes: this.audioBuffer.totalPCMBytes }, `Clearing partial audio buffer (${this.audioBuffer.chunks.length} chunk(s)${this.audioBuffer.isPCM ? `, ${this.audioBuffer.totalPCMBytes} PCM bytes` : ""})`);
769
+ this.audioBuffer = null;
770
+ }
771
+ this.clearBufferTimeout();
772
+ this.abortController.abort();
773
+ await this.handleAbort();
774
+ const latencyMs = Date.now() - chunkReceivedAt;
775
+ this.sessionLogger.debug({ latencyMs, phaseBeforeAbort, hadActiveStream }, "[Telemetry] barge_in.llm_abort_latency");
776
+ }
777
+ /**
778
+ * Clear speech-in-progress flag once the user turn has completed
779
+ */
780
+ clearSpeechInProgress(reason) {
781
+ if (!this.speechInProgress) {
782
+ return;
783
+ }
784
+ this.speechInProgress = false;
785
+ this.sessionLogger.debug({ reason }, `Speech turn complete (${reason}) – resuming TTS`);
786
+ }
787
+ /**
788
+ * Create new AbortController, aborting the previous one
789
+ */
790
+ createAbortController() {
791
+ this.abortController.abort();
792
+ this.abortController = new AbortController();
793
+ this.ttsDebugStreams.clear();
794
+ return this.abortController;
795
+ }
796
+ /**
797
+ * Set the processing phase
798
+ */
799
+ setPhase(phase) {
800
+ this.processingPhase = phase;
801
+ this.sessionLogger.debug({ phase }, `Phase: ${phase}`);
802
+ }
803
+ /**
804
+ * Set timeout to process buffered audio segments
805
+ */
806
+ setBufferTimeout() {
807
+ this.clearBufferTimeout();
808
+ this.bufferTimeout = setTimeout(async () => {
809
+ this.sessionLogger.debug("Buffer timeout reached, processing pending segments");
810
+ if (this.processingPhase === "transcribing") {
811
+ this.sessionLogger.debug({ segmentCount: this.pendingAudioSegments.length }, "Buffer timeout deferred because transcription is still in progress");
812
+ this.setBufferTimeout();
813
+ return;
814
+ }
815
+ if (this.pendingAudioSegments.length > 0) {
816
+ const segments = [...this.pendingAudioSegments];
817
+ this.pendingAudioSegments = [];
818
+ this.bufferTimeout = null;
819
+ const combined = Buffer.concat(segments.map((s) => s.audio));
820
+ await this.processAudio(combined, segments[0].format);
821
+ }
822
+ }, 10000); // 10 second timeout
823
+ }
824
+ /**
825
+ * Clear buffer timeout
826
+ */
827
+ clearBufferTimeout() {
828
+ if (this.bufferTimeout) {
829
+ clearTimeout(this.bufferTimeout);
830
+ this.bufferTimeout = null;
831
+ }
832
+ }
833
+ /**
834
+ * Emit a message to the client. Captures TTS audio_output frames for optional
835
+ * debug persistence before forwarding to the session emitter.
836
+ */
837
+ emit(msg) {
838
+ if (msg.type === "audio_output" &&
839
+ (process.env.TTS_DEBUG_AUDIO_DIR || isPaseoDictationDebugEnabled()) &&
840
+ msg.payload.groupId &&
841
+ typeof msg.payload.audio === "string") {
842
+ const groupId = msg.payload.groupId;
843
+ const existing = this.ttsDebugStreams.get(groupId) ??
844
+ { format: msg.payload.format, chunks: [] };
845
+ try {
846
+ existing.chunks.push(Buffer.from(msg.payload.audio, "base64"));
847
+ existing.format = msg.payload.format;
848
+ this.ttsDebugStreams.set(groupId, existing);
849
+ }
850
+ catch {
851
+ // ignore malformed base64
852
+ }
853
+ if (msg.payload.isLastChunk) {
854
+ const final = this.ttsDebugStreams.get(groupId);
855
+ this.ttsDebugStreams.delete(groupId);
856
+ if (final && final.chunks.length > 0) {
857
+ void (async () => {
858
+ const recordingPath = await maybePersistTtsDebugAudio(Buffer.concat(final.chunks), { sessionId: this.sessionId, groupId, format: final.format }, this.sessionLogger);
859
+ if (recordingPath) {
860
+ this.host.emit({
861
+ type: "activity_log",
862
+ payload: {
863
+ id: uuidv4(),
864
+ timestamp: new Date(),
865
+ type: "system",
866
+ content: `Saved TTS audio: ${recordingPath}`,
867
+ metadata: { recordingPath, format: final.format, groupId },
868
+ },
869
+ });
870
+ }
871
+ })();
872
+ }
873
+ }
874
+ }
875
+ this.host.emit(msg);
876
+ }
877
+ /**
878
+ * Tear down all voice resources.
879
+ */
880
+ async cleanup() {
881
+ this.abortController.abort();
882
+ this.clearBufferTimeout();
883
+ this.pendingAudioSegments = [];
884
+ this.audioBuffer = null;
885
+ await this.stopVoiceTurnController();
886
+ this.ttsManager.cleanup();
887
+ this.sttManager.cleanup();
888
+ this.dictationStreamManager.cleanupAll();
889
+ await this.disableVoiceModeForActiveAgent(true);
890
+ this.isVoiceMode = false;
891
+ }
892
+ }
893
+ //# sourceMappingURL=voice-session.js.map