discoclaw 1.3.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. package/.env.example +4 -6
  2. package/.env.example.full +13 -32
  3. package/README.md +1 -1
  4. package/dist/cli/dashboard.test.js +0 -4
  5. package/dist/cli/init-wizard.js +4 -8
  6. package/dist/cli/init-wizard.test.js +4 -10
  7. package/dist/config.js +2 -42
  8. package/dist/config.test.js +8 -72
  9. package/dist/dashboard/server.js +1 -5
  10. package/dist/dashboard/server.test.js +3 -6
  11. package/dist/discord/actions.js +112 -6
  12. package/dist/discord/actions.test.js +117 -1
  13. package/dist/discord/help-command.js +1 -1
  14. package/dist/discord/message-coordinator.js +3 -8
  15. package/dist/discord/models-command.js +1 -1
  16. package/dist/discord/reaction-handler.js +2 -2
  17. package/dist/discord/reaction-handler.test.js +55 -0
  18. package/dist/discord/verify-push.js +31 -36
  19. package/dist/discord/verify-push.test.js +34 -6
  20. package/dist/discord/voice-command.js +1 -31
  21. package/dist/discord/voice-command.test.js +21 -259
  22. package/dist/discord/voice-status-command.js +3 -22
  23. package/dist/discord/voice-status-command.test.js +16 -124
  24. package/dist/discord-followup.test.js +133 -0
  25. package/dist/health/config-doctor.js +5 -27
  26. package/dist/health/config-doctor.test.js +1 -4
  27. package/dist/index.js +1 -28
  28. package/dist/runtime-overrides.js +2 -3
  29. package/dist/runtime-overrides.test.js +27 -193
  30. package/dist/tasks/store.js +10 -6
  31. package/dist/tasks/store.test.js +44 -0
  32. package/dist/tasks/task-action-executor.test.js +162 -50
  33. package/dist/tasks/task-action-mutations.js +22 -2
  34. package/dist/tasks/task-action-read-ops.js +7 -1
  35. package/dist/tasks/task-action-runner-types.js +19 -1
  36. package/dist/voice/audio-pipeline.js +145 -298
  37. package/docs/configuration.md +4 -9
  38. package/docs/official-docs.md +6 -9
  39. package/docs/runtime-switching.md +1 -1
  40. package/package.json +1 -1
  41. package/dist/voice/audio-pipeline.test.js +0 -1100
  42. package/dist/voice/stt-deepgram.js +0 -154
  43. package/dist/voice/stt-deepgram.test.js +0 -275
  44. package/dist/voice/stt-factory.js +0 -42
  45. package/dist/voice/stt-factory.test.js +0 -45
  46. package/dist/voice/stt-openai.js +0 -156
  47. package/dist/voice/stt-openai.test.js +0 -281
  48. package/dist/voice/tts-cartesia.js +0 -169
  49. package/dist/voice/tts-cartesia.test.js +0 -228
  50. package/dist/voice/tts-deepgram.js +0 -84
  51. package/dist/voice/tts-deepgram.test.js +0 -220
  52. package/dist/voice/tts-factory.js +0 -52
  53. package/dist/voice/tts-factory.test.js +0 -53
  54. package/dist/voice/tts-openai.js +0 -70
  55. package/dist/voice/tts-openai.test.js +0 -138
  56. package/dist/voice/types.test.js +0 -90
@@ -8,9 +8,6 @@
8
8
  */
9
9
  import { VoiceConnectionStatus } from '@discordjs/voice';
10
10
  import { AudioReceiver } from './audio-receiver.js';
11
- import { createSttProvider } from './stt-factory.js';
12
- import { createTtsProvider } from './tts-factory.js';
13
- import { VoiceResponder } from './voice-responder.js';
14
11
  import { ConversationBuffer } from './conversation-buffer.js';
15
12
  import { GeminiLiveProvider } from './providers/gemini-live-provider.js';
16
13
  import { GeminiLiveResponder } from './providers/gemini-live-responder.js';
@@ -22,54 +19,44 @@ import { executeToolCall } from '../runtime/openai-tool-exec.js';
22
19
  // ---------------------------------------------------------------------------
23
20
  export class AudioPipelineManager {
24
21
  log;
25
- voiceConfig;
26
22
  allowedUserIds;
27
23
  createDecoder;
28
24
  onTranscription;
29
- createStt;
30
25
  invokeAi;
31
26
  runtime;
32
27
  runtimeModel;
33
28
  runtimeCwd;
34
29
  runtimeTimeoutMs;
35
- createTts;
36
30
  transcriptMirror;
37
31
  botDisplayName;
38
32
  backfill;
39
33
  buildGeminiSystemInstruction;
40
- voiceProvider;
41
34
  geminiApiKey;
42
35
  enabledTools;
43
36
  silentTools;
44
37
  sessionRotationMs;
45
- onFallbackTriggered;
46
38
  pipelines = new Map();
47
39
  /** Re-entrancy guard: VoiceConnection.subscribe() can synchronously fire stateChange→Ready. */
48
40
  starting = new Set();
49
41
  constructor(opts) {
50
42
  this.log = opts.log;
51
- this.voiceConfig = opts.voiceConfig;
52
43
  this.allowedUserIds = opts.allowedUserIds;
53
44
  this.createDecoder = opts.createDecoder;
54
45
  this.onTranscription = opts.onTranscription;
55
- this.createStt = opts.createStt ?? createSttProvider;
56
46
  this.invokeAi = opts.invokeAi;
57
47
  this.runtime = opts.runtime;
58
48
  this.runtimeModel = opts.runtimeModel;
59
49
  this.runtimeCwd = opts.runtimeCwd;
60
50
  this.runtimeTimeoutMs = opts.runtimeTimeoutMs;
61
- this.createTts = opts.createTts ?? createTtsProvider;
62
51
  this.transcriptMirror = opts.transcriptMirror;
63
52
  this.botDisplayName = opts.botDisplayName ?? 'Bot';
64
53
  this.backfill = opts.backfill;
65
54
  this.buildGeminiSystemInstruction = opts.buildGeminiSystemInstruction;
66
- this.voiceProvider = opts.voiceProvider ?? 'pipeline';
67
55
  this.geminiApiKey = opts.geminiApiKey;
68
56
  this.enabledTools = opts.enabledTools ?? [];
69
57
  this.silentTools = new Set(opts.silentTools ?? []);
70
58
  this.sessionRotationMs = opts.sessionRotationMs;
71
- this.onFallbackTriggered = opts.onFallbackTriggered;
72
- this.log.info({ voiceProvider: this.voiceProvider }, 'audio pipeline manager initialized');
59
+ this.log.info({ voiceProvider: 'gemini-live' }, 'audio pipeline manager initialized');
73
60
  }
74
61
  /**
75
62
  * Attach to a VoiceConnection and auto-manage the audio pipeline
@@ -87,8 +74,8 @@ export class AudioPipelineManager {
87
74
  }
88
75
  });
89
76
  }
90
- /** Start the audio receive pipeline for a guild. Pass `forceMode` to override the configured provider (used during fallback). */
91
- async startPipeline(guildId, connection, forceMode) {
77
+ /** Start the Gemini Live voice pipeline for a guild. */
78
+ async startPipeline(guildId, connection) {
92
79
  // Re-entrancy guard: VoiceConnection.subscribe() (called when wiring the
93
80
  // AudioPlayer) synchronously fires a stateChange→Ready event, which would
94
81
  // re-invoke startPipeline and recurse infinitely.
@@ -100,269 +87,171 @@ export class AudioPipelineManager {
100
87
  this.log.info({ guildId }, 'stopping existing pipeline before restart');
101
88
  await this.stopPipeline(guildId);
102
89
  }
103
- const effectiveMode = forceMode ?? this.voiceProvider;
104
90
  try {
105
- // ----- gemini-live mode: skip STT/TTS, use GeminiLiveProvider directly -----
106
- if (effectiveMode === 'gemini-live') {
107
- const apiKey = this.geminiApiKey;
108
- if (!apiKey)
109
- throw new Error('geminiApiKey is required for gemini-live voice provider');
110
- const buffer = new ConversationBuffer();
111
- if (this.backfill) {
112
- try {
113
- const turns = await this.backfill();
114
- buffer.backfill(turns);
115
- this.log.info({ guildId, turns: turns.length }, 'gemini-live conversation buffer backfilled');
116
- }
117
- catch (err) {
118
- this.log.warn({ guildId, err }, 'gemini-live conversation backfill failed — proceeding with empty history');
119
- }
120
- }
121
- const geminiLiveModel = normalizeGeminiLiveModel(this.runtimeModel) ?? DEFAULT_GEMINI_LIVE_MODEL;
122
- const supportsAsyncFunctionCalling = supportsGeminiLiveAsyncFunctionCalling(geminiLiveModel);
123
- const tools = buildGeminiToolDeclarations(this.enabledTools, { nonBlocking: supportsAsyncFunctionCalling });
124
- const systemInstruction = await this.buildGeminiSystemInstruction?.();
125
- const initialHistory = toGeminiLiveHistoryTurns(buffer.toTurns());
126
- const provider = new GeminiLiveProvider({
127
- apiKey,
128
- log: this.log,
129
- model: geminiLiveModel,
130
- systemInstruction,
131
- responseModalities: ['AUDIO'],
132
- tools,
133
- initialHistoryInClientContent: initialHistory.length > 0,
134
- sessionRotationMs: this.sessionRotationMs,
135
- });
136
- await provider.connect();
137
- if (initialHistory.length > 0) {
138
- provider.sendInitialHistory(initialHistory);
139
- this.log.info({ guildId, turns: initialHistory.length }, 'gemini-live conversation history seeded');
91
+ const apiKey = this.geminiApiKey;
92
+ if (!apiKey)
93
+ throw new Error('geminiApiKey is required for gemini-live voice provider');
94
+ const buffer = new ConversationBuffer();
95
+ if (this.backfill) {
96
+ try {
97
+ const turns = await this.backfill();
98
+ buffer.backfill(turns);
99
+ this.log.info({ guildId, turns: turns.length }, 'gemini-live conversation buffer backfilled');
140
100
  }
141
- if (!supportsAsyncFunctionCalling && this.silentTools.size > 0) {
142
- this.log.info({ guildId, model: geminiLiveModel, count: this.silentTools.size }, 'gemini-live: current model does not support scheduled tool responses; silent tool scheduling disabled');
101
+ catch (err) {
102
+ this.log.warn({ guildId, err }, 'gemini-live conversation backfill failed — proceeding with empty history');
143
103
  }
144
- const mirror = this.transcriptMirror;
145
- const botName = this.botDisplayName;
146
- let latestInputTranscript;
147
- const responder = new GeminiLiveResponder({
148
- log: this.log,
149
- connection,
150
- provider,
151
- onBotResponse: mirror
152
- ? (text) => {
153
- if (latestInputTranscript && text.trim()) {
154
- buffer.push(latestInputTranscript, text);
155
- latestInputTranscript = undefined;
156
- }
157
- mirror.postBotResponse(botName, text).catch((err) => {
158
- this.log.warn({ guildId, err }, 'transcript-mirror: failed to post bot response');
159
- });
104
+ }
105
+ const geminiLiveModel = normalizeGeminiLiveModel(this.runtimeModel) ?? DEFAULT_GEMINI_LIVE_MODEL;
106
+ const supportsAsyncFunctionCalling = supportsGeminiLiveAsyncFunctionCalling(geminiLiveModel);
107
+ const tools = buildGeminiToolDeclarations(this.enabledTools, { nonBlocking: supportsAsyncFunctionCalling });
108
+ const systemInstruction = await this.buildGeminiSystemInstruction?.();
109
+ const initialHistory = toGeminiLiveHistoryTurns(buffer.toTurns());
110
+ const provider = new GeminiLiveProvider({
111
+ apiKey,
112
+ log: this.log,
113
+ model: geminiLiveModel,
114
+ systemInstruction,
115
+ responseModalities: ['AUDIO'],
116
+ tools,
117
+ initialHistoryInClientContent: initialHistory.length > 0,
118
+ sessionRotationMs: this.sessionRotationMs,
119
+ });
120
+ await provider.connect();
121
+ if (initialHistory.length > 0) {
122
+ provider.sendInitialHistory(initialHistory);
123
+ this.log.info({ guildId, turns: initialHistory.length }, 'gemini-live conversation history seeded');
124
+ }
125
+ if (!supportsAsyncFunctionCalling && this.silentTools.size > 0) {
126
+ this.log.info({ guildId, model: geminiLiveModel, count: this.silentTools.size }, 'gemini-live: current model does not support scheduled tool responses; silent tool scheduling disabled');
127
+ }
128
+ const mirror = this.transcriptMirror;
129
+ const botName = this.botDisplayName;
130
+ let latestInputTranscript;
131
+ const responder = new GeminiLiveResponder({
132
+ log: this.log,
133
+ connection,
134
+ provider,
135
+ onBotResponse: mirror
136
+ ? (text) => {
137
+ if (latestInputTranscript && text.trim()) {
138
+ buffer.push(latestInputTranscript, text);
139
+ latestInputTranscript = undefined;
160
140
  }
161
- : (text) => {
162
- if (latestInputTranscript && text.trim()) {
163
- buffer.push(latestInputTranscript, text);
164
- latestInputTranscript = undefined;
165
- }
166
- },
167
- onInputTranscript: mirror
168
- ? (text) => {
169
- if (text.trim())
170
- latestInputTranscript = text.trim();
171
- mirror.postUserTranscription('User', text).catch((err) => {
172
- this.log.warn({ guildId, err }, 'transcript-mirror: failed to post user transcription');
173
- });
141
+ mirror.postBotResponse(botName, text).catch((err) => {
142
+ this.log.warn({ guildId, err }, 'transcript-mirror: failed to post bot response');
143
+ });
144
+ }
145
+ : (text) => {
146
+ if (latestInputTranscript && text.trim()) {
147
+ buffer.push(latestInputTranscript, text);
148
+ latestInputTranscript = undefined;
174
149
  }
175
- : (text) => {
176
- if (text.trim())
177
- latestInputTranscript = text.trim();
178
- },
179
- onSessionTerminated: () => {
180
- this.log.error({ guildId }, 'gemini-live session terminally failed — no fallback (fallback disabled)');
181
150
  },
182
- onFallbackRecommended: (reason) => {
183
- this.log.warn({ guildId, reason }, 'gemini-live: fallback recommended but fallback is disabled');
184
- },
185
- onTokenWarning: (estimatedTokens, threshold) => {
186
- this.log.warn({ guildId, estimatedTokens, threshold }, 'gemini-live: token usage approaching context window limit');
151
+ onInputTranscript: mirror
152
+ ? (text) => {
153
+ if (text.trim())
154
+ latestInputTranscript = text.trim();
155
+ mirror.postUserTranscription('User', text).catch((err) => {
156
+ this.log.warn({ guildId, err }, 'transcript-mirror: failed to post user transcription');
157
+ });
158
+ }
159
+ : (text) => {
160
+ if (text.trim())
161
+ latestInputTranscript = text.trim();
187
162
  },
188
- onToolCall: tools
189
- ? (calls) => {
190
- this.log.info({ guildId, count: calls.length, names: calls.map((c) => c.name).join(',') }, 'gemini-live: tool call received — dispatching');
191
- const allowedRoots = this.runtimeCwd ? [this.runtimeCwd] : [];
192
- const allowedToolNames = new Set(buildToolSchemas(this.enabledTools).map((t) => t.function.name));
193
- const logFn = (msg) => this.log.info({ guildId }, msg);
194
- const execOpts = { enableHybridPipeline: false, allowedToolNames };
195
- // Gemini 3.1 Live only supports synchronous function calling.
196
- // Gemini 2.5 Live can opt into NON_BLOCKING declarations and scheduled responses.
197
- void (async () => {
198
- const results = await Promise.all(calls.map(async (call) => {
199
- const scheduling = supportsAsyncFunctionCalling
200
- ? (this.isSilentTool(call.name) ? 'SILENT' : 'INTERRUPT')
201
- : undefined;
202
- try {
203
- const res = await executeToolCall(call.name, call.args, allowedRoots, logFn, execOpts);
204
- return { id: call.id, name: call.name, output: res.result, scheduling };
205
- }
206
- catch (err) {
207
- const msg = err instanceof Error ? err.message : String(err);
208
- return { id: call.id, name: call.name, output: `Error: ${msg}`, scheduling };
209
- }
210
- }));
211
- const silentCount = supportsAsyncFunctionCalling
212
- ? results.filter((r) => r.scheduling === 'SILENT').length
213
- : 0;
214
- if (silentCount > 0) {
215
- this.log.info({ guildId, count: silentCount }, 'gemini-live: SILENT tool execution complete — results scheduled silently');
216
- }
163
+ onSessionTerminated: () => {
164
+ this.log.error({ guildId }, 'gemini-live session terminally failed — no fallback');
165
+ },
166
+ onFallbackRecommended: (reason) => {
167
+ this.log.warn({ guildId, reason }, 'gemini-live: fallback recommended but the legacy pipeline has been removed');
168
+ },
169
+ onTokenWarning: (estimatedTokens, threshold) => {
170
+ this.log.warn({ guildId, estimatedTokens, threshold }, 'gemini-live: token usage approaching context window limit');
171
+ },
172
+ onToolCall: tools
173
+ ? (calls) => {
174
+ this.log.info({ guildId, count: calls.length, names: calls.map((c) => c.name).join(',') }, 'gemini-live: tool call received — dispatching');
175
+ const allowedRoots = this.runtimeCwd ? [this.runtimeCwd] : [];
176
+ const allowedToolNames = new Set(buildToolSchemas(this.enabledTools).map((t) => t.function.name));
177
+ const logFn = (msg) => this.log.info({ guildId }, msg);
178
+ const execOpts = { enableHybridPipeline: false, allowedToolNames };
179
+ void (async () => {
180
+ const results = await Promise.all(calls.map(async (call) => {
181
+ const scheduling = supportsAsyncFunctionCalling
182
+ ? (this.isSilentTool(call.name) ? 'SILENT' : 'INTERRUPT')
183
+ : undefined;
217
184
  try {
218
- provider.sendToolResponse(results);
185
+ const res = await executeToolCall(call.name, call.args, allowedRoots, logFn, execOpts);
186
+ return { id: call.id, name: call.name, output: res.result, scheduling };
219
187
  }
220
188
  catch (err) {
221
- this.log.warn({ guildId, err }, 'gemini-live: sendToolResponse failed (provider likely disconnected)');
189
+ const msg = err instanceof Error ? err.message : String(err);
190
+ return { id: call.id, name: call.name, output: `Error: ${msg}`, scheduling };
222
191
  }
223
- })();
224
- }
225
- : undefined,
226
- });
227
- responder.start();
228
- // SttProvider shim: bridges AudioReceiver frames to GeminiLiveProvider.sendAudio
229
- const sttShim = {
230
- start: async () => { },
231
- stop: async () => { },
232
- onTranscription: () => { },
233
- feedAudio: (frame) => {
234
- try {
235
- provider.sendAudio(frame.buffer);
236
- }
237
- catch (err) {
238
- this.log.warn({ guildId, err }, 'gemini-live: sendAudio error (non-fatal)');
239
- }
240
- },
241
- };
242
- const receiver = new AudioReceiver({
243
- connection,
244
- allowedUserIds: this.allowedUserIds,
245
- sttProvider: sttShim,
246
- log: this.log,
247
- createDecoder: this.createDecoder,
248
- onUserSpeaking: () => { },
249
- onUserSilence: () => {
250
- try {
251
- provider.sendAudioStreamEnd();
252
- }
253
- catch (err) {
254
- this.log.warn({ guildId, err }, 'gemini-live: sendAudioStreamEnd error (non-fatal)');
255
- }
256
- },
257
- });
258
- receiver.start();
259
- this.pipelines.set(guildId, {
260
- connection,
261
- sttProvider: sttShim,
262
- receiver,
263
- buffer,
264
- geminiProvider: provider,
265
- geminiResponder: responder,
266
- mode: 'gemini-live',
267
- });
268
- this.log.info({ guildId }, 'audio pipeline started (gemini-live)');
269
- return;
270
- }
271
- // ----- default pipeline mode: STT/TTS/VoiceResponder -----
272
- const sttProvider = this.createStt(this.voiceConfig, this.log);
273
- const mirror = this.transcriptMirror;
274
- // Create conversation buffer and backfill history if available
275
- let buffer;
276
- if (this.invokeAi) {
277
- buffer = new ConversationBuffer();
278
- if (this.backfill) {
279
- try {
280
- const turns = await this.backfill();
281
- buffer.backfill(turns);
282
- this.log.info({ guildId, turns: turns.length }, 'conversation buffer backfilled');
283
- }
284
- catch (err) {
285
- this.log.warn({ guildId, err }, 'conversation backfill failed — proceeding with empty buffer');
286
- }
287
- }
288
- }
289
- // Create VoiceResponder for the full conversation loop if invokeAi is configured
290
- let responder;
291
- if (this.invokeAi) {
292
- try {
293
- const tts = this.createTts(this.voiceConfig, this.log);
294
- const botName = this.botDisplayName;
295
- responder = new VoiceResponder({
296
- log: this.log,
297
- tts,
298
- connection,
299
- invokeAi: this.invokeAi,
300
- onBotResponse: mirror
301
- ? (text) => {
302
- mirror.postBotResponse(botName, text).catch((err) => {
303
- this.log.warn({ guildId, err }, 'transcript-mirror: failed to post bot response');
304
- });
192
+ }));
193
+ const silentCount = supportsAsyncFunctionCalling
194
+ ? results.filter((r) => r.scheduling === 'SILENT').length
195
+ : 0;
196
+ if (silentCount > 0) {
197
+ this.log.info({ guildId, count: silentCount }, 'gemini-live: SILENT tool execution complete — results scheduled silently');
305
198
  }
306
- : undefined,
307
- buffer,
308
- });
309
- this.log.info({ guildId }, 'voice responder created');
310
- }
311
- catch (err) {
312
- this.log.warn({ guildId, err }, 'failed to create voice responder — continuing with STT-only mode');
313
- }
314
- }
315
- // Wire transcription callback — fires the external callback, transcript mirror, and responder
316
- const onTranscriptionCb = this.onTranscription;
317
- if (onTranscriptionCb || responder || mirror) {
318
- sttProvider.onTranscription((result) => {
319
- if (onTranscriptionCb) {
320
- onTranscriptionCb(guildId, result);
199
+ try {
200
+ provider.sendToolResponse(results);
201
+ }
202
+ catch (err) {
203
+ this.log.warn({ guildId, err }, 'gemini-live: sendToolResponse failed (provider likely disconnected)');
204
+ }
205
+ })();
321
206
  }
322
- // STT-confirmed barge-in: any transcription (interim or final) with
323
- // non-empty text stops ongoing playback. Echo produces empty
324
- // transcriptions; real speech produces non-empty ones.
325
- if (result.text.trim() && responder?.isPlaying) {
326
- this.log.info({ guildId }, 'barge-in detected');
327
- responder.stop();
207
+ : undefined,
208
+ });
209
+ responder.start();
210
+ // SttProvider shim: bridges AudioReceiver frames to GeminiLiveProvider.sendAudio
211
+ const sttShim = {
212
+ start: async () => { },
213
+ stop: async () => { },
214
+ onTranscription: () => { },
215
+ feedAudio: (frame) => {
216
+ try {
217
+ provider.sendAudio(frame.buffer);
328
218
  }
329
- if (result.isFinal && result.text.trim()) {
330
- if (mirror) {
331
- mirror.postUserTranscription('User', result.text).catch((err) => {
332
- this.log.warn({ guildId, err }, 'transcript-mirror: failed to post user transcription');
333
- });
334
- }
335
- if (responder) {
336
- responder.handleTranscription(result.text).catch((err) => {
337
- this.log.error({ guildId, err }, 'voice-responder: handleTranscription failed');
338
- });
339
- }
219
+ catch (err) {
220
+ this.log.warn({ guildId, err }, 'gemini-live: sendAudio error (non-fatal)');
340
221
  }
341
- });
342
- }
343
- await sttProvider.start();
222
+ },
223
+ };
344
224
  const receiver = new AudioReceiver({
345
225
  connection,
346
226
  allowedUserIds: this.allowedUserIds,
347
- sttProvider,
227
+ sttProvider: sttShim,
348
228
  log: this.log,
349
229
  createDecoder: this.createDecoder,
350
- onUserSpeaking: (_userId) => {
351
- // Barge-in is now gated on STT transcription (see onTranscription
352
- // callback above). This callback is kept for AudioReceiver
353
- // subscription management.
230
+ onUserSpeaking: () => { },
231
+ onUserSilence: () => {
232
+ try {
233
+ provider.sendAudioStreamEnd();
234
+ }
235
+ catch (err) {
236
+ this.log.warn({ guildId, err }, 'gemini-live: sendAudioStreamEnd error (non-fatal)');
237
+ }
354
238
  },
355
239
  });
356
240
  receiver.start();
357
- this.pipelines.set(guildId, { connection, sttProvider, receiver, responder, buffer, mode: 'pipeline' });
358
- this.log.info({ guildId, mode: effectiveMode }, 'audio pipeline started');
241
+ this.pipelines.set(guildId, {
242
+ connection,
243
+ sttProvider: sttShim,
244
+ receiver,
245
+ buffer,
246
+ geminiProvider: provider,
247
+ geminiResponder: responder,
248
+ mode: 'gemini-live',
249
+ });
250
+ this.log.info({ guildId }, 'audio pipeline started (gemini-live)');
359
251
  }
360
252
  catch (err) {
361
253
  this.log.error({ guildId, err }, 'failed to start audio pipeline');
362
- // Fallback disabled — gemini-live must succeed or the pipeline stays down
363
- if (effectiveMode === 'gemini-live') {
364
- this.log.error({ guildId }, 'gemini-live: connection failed — no fallback (fallback disabled)');
365
- }
254
+ this.log.error({ guildId }, 'gemini-live: connection failed — no fallback available');
366
255
  }
367
256
  finally {
368
257
  this.starting.delete(guildId);
@@ -378,7 +267,6 @@ export class AudioPipelineManager {
378
267
  if (pipeline.geminiProvider) {
379
268
  await pipeline.geminiProvider.disconnect();
380
269
  }
381
- pipeline.responder?.destroy();
382
270
  pipeline.receiver.stop();
383
271
  try {
384
272
  await pipeline.sttProvider.stop();
@@ -401,58 +289,17 @@ export class AudioPipelineManager {
401
289
  get activePipelineCount() {
402
290
  return this.pipelines.size;
403
291
  }
404
- /** Configured voice provider mode ('pipeline' or 'gemini-live'). */
292
+ /** Configured voice provider mode. */
405
293
  get activeVoiceProvider() {
406
- return this.voiceProvider;
294
+ return 'gemini-live';
407
295
  }
408
- /** Active mode for a specific guild (may differ from configured mode during fallback). */
296
+ /** Active mode for a specific guild. */
409
297
  pipelineMode(guildId) {
410
298
  return this.pipelines.get(guildId)?.mode;
411
299
  }
412
- /** Current Deepgram TTS voice model name. */
413
- get ttsVoice() {
414
- return this.voiceConfig.deepgramTtsVoice;
415
- }
416
- /**
417
- * Fall back from gemini-live to the standard pipeline for a guild.
418
- * Stops the current gemini-live session and starts a standard STT/AI/TTS pipeline.
419
- * No-op if no pipeline exists or the guild is already in standard mode.
420
- */
421
- async fallbackToPipeline(guildId, connection) {
422
- const pipeline = this.pipelines.get(guildId);
423
- if (!pipeline || pipeline.mode !== 'gemini-live')
424
- return;
425
- this.log.warn({ guildId }, 'gemini-live: initiating fallback to standard pipeline');
426
- await this.stopPipeline(guildId);
427
- await this.startPipeline(guildId, connection, 'pipeline');
428
- if (this.hasPipeline(guildId)) {
429
- this.log.info({ guildId }, 'gemini-live: fallback to standard pipeline succeeded');
430
- this.onFallbackTriggered?.(guildId, 'pipeline');
431
- }
432
- else {
433
- this.log.error({ guildId }, 'gemini-live: fallback to standard pipeline also failed — guild has no active pipeline');
434
- }
435
- }
436
300
  isSilentTool(toolName) {
437
301
  return this.silentTools.has(toolName) || this.silentTools.has(OPENAI_TO_DISCO_NAME[toolName] ?? toolName);
438
302
  }
439
- /**
440
- * Update the Deepgram TTS voice and restart all active pipelines so the
441
- * new voice takes effect immediately. No-op in gemini-live mode (TTS is
442
- * handled server-side).
443
- * @returns The number of pipelines that were restarted (0 in gemini-live mode).
444
- */
445
- async setTtsVoice(voice) {
446
- if (this.voiceProvider === 'gemini-live') {
447
- this.log.info({ voice }, 'TTS voice change ignored — gemini-live mode uses server-side TTS');
448
- return 0;
449
- }
450
- this.voiceConfig = { ...this.voiceConfig, deepgramTtsVoice: voice };
451
- this.log.info({ voice }, 'TTS voice updated — restarting active pipelines');
452
- const entries = [...this.pipelines.entries()];
453
- await Promise.all(entries.map(([guildId, pipeline]) => this.startPipeline(guildId, pipeline.connection)));
454
- return entries.length;
455
- }
456
303
  }
457
304
  function toGeminiLiveHistoryTurns(turns) {
458
305
  const history = [];
@@ -37,7 +37,7 @@ For npm-managed daemon installs, readiness is currently constrained by service e
37
37
  Model/runtime state is intentionally split across three storage modes:
38
38
 
39
39
  - `models.json` stores persisted model strings per role (`chat`, `fast`, `plan-run`, `voice`, forge roles, cron roles, etc.).
40
- - `runtime-overrides.json` stores persisted runtime-only overlays such as `fastRuntime` and `voiceRuntime` (plus non-model keys such as `ttsVoice`).
40
+ - `runtime-overrides.json` stores persisted runtime-only overlays such as `fastRuntime` and `voiceRuntime`.
41
41
  - Live chat runtime swaps stay in memory only. `!models set chat <runtime>` changes the active chat runtime immediately, but there is no persisted `chatRuntime` overlay.
42
42
 
43
43
  On first run, `models.json` is scaffolded from the instance startup defaults. After that:
@@ -322,24 +322,19 @@ The same forum-boundary rule applies to tasks: `DISCOCLAW_TASKS_FORUM` is the di
322
322
 
323
323
  ## Voice
324
324
 
325
- See [docs/voice.md](voice.md) for the full setup guide and provider details.
325
+ See [docs/voice.md](voice.md) for the full Gemini Live setup guide.
326
326
 
327
327
  | Variable | Default | Description |
328
328
  |----------|---------|-------------|
329
329
  | `DISCOCLAW_VOICE_ENABLED` | `false` | Master switch for voice subsystem |
330
330
  | `DISCOCLAW_VOICE_AUTO_JOIN` | `false` | Auto-join voice channels when users enter |
331
331
  | `ANTHROPIC_API_KEY` | — | Anthropic API key (required for direct Messages API voice responses) |
332
+ | `GEMINI_API_KEY` | — | Gemini API key (required for Gemini Live voice) |
332
333
  | `DISCOCLAW_VOICE_MODEL` | follows startup chat model | Model override for voice responses |
333
334
  | `DISCOCLAW_VOICE_SYSTEM_PROMPT` | — | System prompt override for voice (max 4000 chars) |
334
- | `DISCOCLAW_STT_PROVIDER` | `deepgram` | Speech-to-text provider: `deepgram`, `whisper`, `openai` |
335
- | `DISCOCLAW_TTS_PROVIDER` | `cartesia` | Text-to-speech provider: `cartesia`, `deepgram`, `kokoro`, `openai` |
335
+ | `DISCOCLAW_GEMINI_SESSION_ROTATION_MS` | `780000` | Proactive Gemini Live session rotation interval in milliseconds |
336
336
  | `DISCOCLAW_VOICE_HOME_CHANNEL` | — | Voice channel name or ID for prompt context |
337
337
  | `DISCOCLAW_VOICE_LOG_CHANNEL` | `voice-log` | Text channel for transcript mirror |
338
- | `DEEPGRAM_API_KEY` | — | Deepgram API key (required for Deepgram STT/TTS) |
339
- | `DEEPGRAM_STT_MODEL` | `nova-3-general` | Deepgram STT model |
340
- | `DEEPGRAM_TTS_VOICE` | `aura-2-asteria-en` | Deepgram TTS voice |
341
- | `DEEPGRAM_TTS_SPEED` | `1.3` | Deepgram TTS playback speed multiplier (0.5–1.5) |
342
- | `CARTESIA_API_KEY` | — | Cartesia API key (required for Cartesia TTS) |
343
338
 
344
339
  ## Webhook
345
340
 
@@ -6,8 +6,8 @@ Completeness pass for this index was cross-checked against:
6
6
 
7
7
  - `package.json`
8
8
  - `.context/runtime.md`
9
- - `src/voice/tts-factory.ts`
10
- - `src/voice/stt-factory.ts`
9
+ - `src/voice/audio-pipeline.ts`
10
+ - `src/voice/providers/gemini-live-provider.ts`
11
11
  - `src/cold-storage/embeddings.ts`
12
12
  - `src/cold-storage/openai-compat.ts`
13
13
  - `src/discord/actions-imagegen.ts`
@@ -24,8 +24,8 @@ Completeness pass for this index was cross-checked against:
24
24
  | Provider | What DiscoClaw uses | Official docs |
25
25
  |----------|----------------------|---------------|
26
26
  | Anthropic | Claude model families via `src/runtime/anthropic-rest.ts` and Claude Code CLI runtime | Models overview: <https://docs.anthropic.com/en/docs/about-claude/models/overview><br>Messages API: <https://platform.claude.com/docs/en/api/messages><br>Claude Code docs: <https://code.claude.com/docs/en/overview> |
27
- | OpenAI | OpenAI-compatible runtime, Codex runtime docs, OpenAI voice, embeddings, and image generation | Model IDs: <https://developers.openai.com/api/model-ids/><br>API reference overview: <https://platform.openai.com/docs/api-reference><br>Codex docs: <https://developers.openai.com/codex/><br>Codex app-server API: <https://developers.openai.com/codex/app-server> |
28
- | Google | Gemini API runtime and Gemini/Imagen image generation | Gemini models: <https://ai.google.dev/models/gemini><br>Gemini API docs: <https://ai.google.dev/gemini-api/docs> |
27
+ | OpenAI | OpenAI-compatible runtime, Codex runtime docs, embeddings, and image generation | Model IDs: <https://developers.openai.com/api/model-ids/><br>API reference overview: <https://platform.openai.com/docs/api-reference><br>Codex docs: <https://developers.openai.com/codex/><br>Codex app-server API: <https://developers.openai.com/codex/app-server> |
28
+ | Google | Gemini API runtime, Gemini Live voice, and Gemini/Imagen image generation | Gemini models: <https://ai.google.dev/models/gemini><br>Gemini API docs: <https://ai.google.dev/gemini-api/docs><br>Gemini Live API: <https://ai.google.dev/gemini-api/docs/live> |
29
29
  | OpenRouter | OpenRouter runtime through `src/runtime/openai-compat.ts` | Model list: <https://openrouter.ai/models><br>API docs: <https://openrouter.ai/docs/api/reference/overview> |
30
30
 
31
31
  ## Discord
@@ -53,11 +53,8 @@ Completeness pass for this index was cross-checked against:
53
53
 
54
54
  | Provider | Used in DiscoClaw | Official docs |
55
55
  |----------|-------------------|---------------|
56
- | Deepgram STT | `src/voice/stt-deepgram.ts` with Nova-3 streaming (`nova-3-general`) | STT API overview: <https://developers.deepgram.com/docs/speech-to-text><br>Streaming API: <https://developers.deepgram.com/reference/speech-to-text/listen-streaming><br>Nova-3 models: <https://developers.deepgram.com/docs/models-languages-overview> |
57
- | Deepgram TTS | `src/voice/tts-deepgram.ts` with Aura (`aura-2-asteria-en`) | TTS API overview: <https://developers.deepgram.com/docs/text-to-speech><br>Speak endpoint: <https://developers.deepgram.com/reference/text-to-speech/speak-streaming><br>Aura voices/models: <https://developers.deepgram.com/docs/tts-models> |
58
- | Cartesia TTS | `src/voice/tts-cartesia.ts` with Sonic-3 over WebSocket | API docs: <https://docs.cartesia.ai/api-reference><br>TTS WebSocket: <https://docs.cartesia.ai/api-reference/tts/websocket> |
59
- | OpenAI TTS | `src/voice/tts-openai.ts` (`/v1/audio/speech`, default `tts-1`) | Audio speech API reference: <https://platform.openai.com/docs/api-reference/audio/createSpeech> |
60
- | OpenAI STT | `src/voice/stt-openai.ts` (`/v1/audio/transcriptions`, `whisper-1`) | Audio transcription API reference: <https://platform.openai.com/docs/api-reference/audio/createTranscription> |
56
+ | Gemini Live | `src/voice/audio-pipeline.ts` and the Gemini Live provider handle speech recognition, reasoning, and speech synthesis in one session | Live API overview: <https://ai.google.dev/gemini-api/docs/live><br>Realtime guide: <https://ai.google.dev/gemini-api/docs/live-guide> |
57
+ | Anthropic Messages API (optional voice runtime) | `!models set voice claude-api` can switch voice response generation to direct Anthropic API calls while Discord audio transport stays on Gemini Live | API overview: <https://docs.anthropic.com/en/api/messages> |
61
58
 
62
59
  ## Image Generation
63
60