discoclaw 1.3.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +4 -6
- package/.env.example.full +13 -32
- package/README.md +1 -1
- package/dist/cli/dashboard.test.js +0 -4
- package/dist/cli/init-wizard.js +4 -8
- package/dist/cli/init-wizard.test.js +4 -10
- package/dist/config.js +2 -42
- package/dist/config.test.js +8 -72
- package/dist/dashboard/server.js +1 -5
- package/dist/dashboard/server.test.js +3 -6
- package/dist/discord/actions.js +112 -6
- package/dist/discord/actions.test.js +117 -1
- package/dist/discord/help-command.js +1 -1
- package/dist/discord/message-coordinator.js +3 -8
- package/dist/discord/models-command.js +1 -1
- package/dist/discord/reaction-handler.js +2 -2
- package/dist/discord/reaction-handler.test.js +55 -0
- package/dist/discord/verify-push.js +31 -36
- package/dist/discord/verify-push.test.js +34 -6
- package/dist/discord/voice-command.js +1 -31
- package/dist/discord/voice-command.test.js +21 -259
- package/dist/discord/voice-status-command.js +3 -22
- package/dist/discord/voice-status-command.test.js +16 -124
- package/dist/discord-followup.test.js +133 -0
- package/dist/health/config-doctor.js +5 -27
- package/dist/health/config-doctor.test.js +1 -4
- package/dist/index.js +1 -28
- package/dist/runtime-overrides.js +2 -3
- package/dist/runtime-overrides.test.js +27 -193
- package/dist/tasks/store.js +10 -6
- package/dist/tasks/store.test.js +44 -0
- package/dist/tasks/task-action-executor.test.js +162 -50
- package/dist/tasks/task-action-mutations.js +22 -2
- package/dist/tasks/task-action-read-ops.js +7 -1
- package/dist/tasks/task-action-runner-types.js +19 -1
- package/dist/voice/audio-pipeline.js +145 -298
- package/docs/configuration.md +4 -9
- package/docs/official-docs.md +6 -9
- package/docs/runtime-switching.md +1 -1
- package/package.json +1 -1
- package/dist/voice/audio-pipeline.test.js +0 -1100
- package/dist/voice/stt-deepgram.js +0 -154
- package/dist/voice/stt-deepgram.test.js +0 -275
- package/dist/voice/stt-factory.js +0 -42
- package/dist/voice/stt-factory.test.js +0 -45
- package/dist/voice/stt-openai.js +0 -156
- package/dist/voice/stt-openai.test.js +0 -281
- package/dist/voice/tts-cartesia.js +0 -169
- package/dist/voice/tts-cartesia.test.js +0 -228
- package/dist/voice/tts-deepgram.js +0 -84
- package/dist/voice/tts-deepgram.test.js +0 -220
- package/dist/voice/tts-factory.js +0 -52
- package/dist/voice/tts-factory.test.js +0 -53
- package/dist/voice/tts-openai.js +0 -70
- package/dist/voice/tts-openai.test.js +0 -138
- package/dist/voice/types.test.js +0 -90
|
@@ -8,9 +8,6 @@
|
|
|
8
8
|
*/
|
|
9
9
|
import { VoiceConnectionStatus } from '@discordjs/voice';
|
|
10
10
|
import { AudioReceiver } from './audio-receiver.js';
|
|
11
|
-
import { createSttProvider } from './stt-factory.js';
|
|
12
|
-
import { createTtsProvider } from './tts-factory.js';
|
|
13
|
-
import { VoiceResponder } from './voice-responder.js';
|
|
14
11
|
import { ConversationBuffer } from './conversation-buffer.js';
|
|
15
12
|
import { GeminiLiveProvider } from './providers/gemini-live-provider.js';
|
|
16
13
|
import { GeminiLiveResponder } from './providers/gemini-live-responder.js';
|
|
@@ -22,54 +19,44 @@ import { executeToolCall } from '../runtime/openai-tool-exec.js';
|
|
|
22
19
|
// ---------------------------------------------------------------------------
|
|
23
20
|
export class AudioPipelineManager {
|
|
24
21
|
log;
|
|
25
|
-
voiceConfig;
|
|
26
22
|
allowedUserIds;
|
|
27
23
|
createDecoder;
|
|
28
24
|
onTranscription;
|
|
29
|
-
createStt;
|
|
30
25
|
invokeAi;
|
|
31
26
|
runtime;
|
|
32
27
|
runtimeModel;
|
|
33
28
|
runtimeCwd;
|
|
34
29
|
runtimeTimeoutMs;
|
|
35
|
-
createTts;
|
|
36
30
|
transcriptMirror;
|
|
37
31
|
botDisplayName;
|
|
38
32
|
backfill;
|
|
39
33
|
buildGeminiSystemInstruction;
|
|
40
|
-
voiceProvider;
|
|
41
34
|
geminiApiKey;
|
|
42
35
|
enabledTools;
|
|
43
36
|
silentTools;
|
|
44
37
|
sessionRotationMs;
|
|
45
|
-
onFallbackTriggered;
|
|
46
38
|
pipelines = new Map();
|
|
47
39
|
/** Re-entrancy guard: VoiceConnection.subscribe() can synchronously fire stateChange→Ready. */
|
|
48
40
|
starting = new Set();
|
|
49
41
|
constructor(opts) {
|
|
50
42
|
this.log = opts.log;
|
|
51
|
-
this.voiceConfig = opts.voiceConfig;
|
|
52
43
|
this.allowedUserIds = opts.allowedUserIds;
|
|
53
44
|
this.createDecoder = opts.createDecoder;
|
|
54
45
|
this.onTranscription = opts.onTranscription;
|
|
55
|
-
this.createStt = opts.createStt ?? createSttProvider;
|
|
56
46
|
this.invokeAi = opts.invokeAi;
|
|
57
47
|
this.runtime = opts.runtime;
|
|
58
48
|
this.runtimeModel = opts.runtimeModel;
|
|
59
49
|
this.runtimeCwd = opts.runtimeCwd;
|
|
60
50
|
this.runtimeTimeoutMs = opts.runtimeTimeoutMs;
|
|
61
|
-
this.createTts = opts.createTts ?? createTtsProvider;
|
|
62
51
|
this.transcriptMirror = opts.transcriptMirror;
|
|
63
52
|
this.botDisplayName = opts.botDisplayName ?? 'Bot';
|
|
64
53
|
this.backfill = opts.backfill;
|
|
65
54
|
this.buildGeminiSystemInstruction = opts.buildGeminiSystemInstruction;
|
|
66
|
-
this.voiceProvider = opts.voiceProvider ?? 'pipeline';
|
|
67
55
|
this.geminiApiKey = opts.geminiApiKey;
|
|
68
56
|
this.enabledTools = opts.enabledTools ?? [];
|
|
69
57
|
this.silentTools = new Set(opts.silentTools ?? []);
|
|
70
58
|
this.sessionRotationMs = opts.sessionRotationMs;
|
|
71
|
-
this.
|
|
72
|
-
this.log.info({ voiceProvider: this.voiceProvider }, 'audio pipeline manager initialized');
|
|
59
|
+
this.log.info({ voiceProvider: 'gemini-live' }, 'audio pipeline manager initialized');
|
|
73
60
|
}
|
|
74
61
|
/**
|
|
75
62
|
* Attach to a VoiceConnection and auto-manage the audio pipeline
|
|
@@ -87,8 +74,8 @@ export class AudioPipelineManager {
|
|
|
87
74
|
}
|
|
88
75
|
});
|
|
89
76
|
}
|
|
90
|
-
/** Start the
|
|
91
|
-
async startPipeline(guildId, connection
|
|
77
|
+
/** Start the Gemini Live voice pipeline for a guild. */
|
|
78
|
+
async startPipeline(guildId, connection) {
|
|
92
79
|
// Re-entrancy guard: VoiceConnection.subscribe() (called when wiring the
|
|
93
80
|
// AudioPlayer) synchronously fires a stateChange→Ready event, which would
|
|
94
81
|
// re-invoke startPipeline and recurse infinitely.
|
|
@@ -100,269 +87,171 @@ export class AudioPipelineManager {
|
|
|
100
87
|
this.log.info({ guildId }, 'stopping existing pipeline before restart');
|
|
101
88
|
await this.stopPipeline(guildId);
|
|
102
89
|
}
|
|
103
|
-
const effectiveMode = forceMode ?? this.voiceProvider;
|
|
104
90
|
try {
|
|
105
|
-
|
|
106
|
-
if (
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
buffer.backfill(turns);
|
|
115
|
-
this.log.info({ guildId, turns: turns.length }, 'gemini-live conversation buffer backfilled');
|
|
116
|
-
}
|
|
117
|
-
catch (err) {
|
|
118
|
-
this.log.warn({ guildId, err }, 'gemini-live conversation backfill failed — proceeding with empty history');
|
|
119
|
-
}
|
|
120
|
-
}
|
|
121
|
-
const geminiLiveModel = normalizeGeminiLiveModel(this.runtimeModel) ?? DEFAULT_GEMINI_LIVE_MODEL;
|
|
122
|
-
const supportsAsyncFunctionCalling = supportsGeminiLiveAsyncFunctionCalling(geminiLiveModel);
|
|
123
|
-
const tools = buildGeminiToolDeclarations(this.enabledTools, { nonBlocking: supportsAsyncFunctionCalling });
|
|
124
|
-
const systemInstruction = await this.buildGeminiSystemInstruction?.();
|
|
125
|
-
const initialHistory = toGeminiLiveHistoryTurns(buffer.toTurns());
|
|
126
|
-
const provider = new GeminiLiveProvider({
|
|
127
|
-
apiKey,
|
|
128
|
-
log: this.log,
|
|
129
|
-
model: geminiLiveModel,
|
|
130
|
-
systemInstruction,
|
|
131
|
-
responseModalities: ['AUDIO'],
|
|
132
|
-
tools,
|
|
133
|
-
initialHistoryInClientContent: initialHistory.length > 0,
|
|
134
|
-
sessionRotationMs: this.sessionRotationMs,
|
|
135
|
-
});
|
|
136
|
-
await provider.connect();
|
|
137
|
-
if (initialHistory.length > 0) {
|
|
138
|
-
provider.sendInitialHistory(initialHistory);
|
|
139
|
-
this.log.info({ guildId, turns: initialHistory.length }, 'gemini-live conversation history seeded');
|
|
91
|
+
const apiKey = this.geminiApiKey;
|
|
92
|
+
if (!apiKey)
|
|
93
|
+
throw new Error('geminiApiKey is required for gemini-live voice provider');
|
|
94
|
+
const buffer = new ConversationBuffer();
|
|
95
|
+
if (this.backfill) {
|
|
96
|
+
try {
|
|
97
|
+
const turns = await this.backfill();
|
|
98
|
+
buffer.backfill(turns);
|
|
99
|
+
this.log.info({ guildId, turns: turns.length }, 'gemini-live conversation buffer backfilled');
|
|
140
100
|
}
|
|
141
|
-
|
|
142
|
-
this.log.
|
|
101
|
+
catch (err) {
|
|
102
|
+
this.log.warn({ guildId, err }, 'gemini-live conversation backfill failed — proceeding with empty history');
|
|
143
103
|
}
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
104
|
+
}
|
|
105
|
+
const geminiLiveModel = normalizeGeminiLiveModel(this.runtimeModel) ?? DEFAULT_GEMINI_LIVE_MODEL;
|
|
106
|
+
const supportsAsyncFunctionCalling = supportsGeminiLiveAsyncFunctionCalling(geminiLiveModel);
|
|
107
|
+
const tools = buildGeminiToolDeclarations(this.enabledTools, { nonBlocking: supportsAsyncFunctionCalling });
|
|
108
|
+
const systemInstruction = await this.buildGeminiSystemInstruction?.();
|
|
109
|
+
const initialHistory = toGeminiLiveHistoryTurns(buffer.toTurns());
|
|
110
|
+
const provider = new GeminiLiveProvider({
|
|
111
|
+
apiKey,
|
|
112
|
+
log: this.log,
|
|
113
|
+
model: geminiLiveModel,
|
|
114
|
+
systemInstruction,
|
|
115
|
+
responseModalities: ['AUDIO'],
|
|
116
|
+
tools,
|
|
117
|
+
initialHistoryInClientContent: initialHistory.length > 0,
|
|
118
|
+
sessionRotationMs: this.sessionRotationMs,
|
|
119
|
+
});
|
|
120
|
+
await provider.connect();
|
|
121
|
+
if (initialHistory.length > 0) {
|
|
122
|
+
provider.sendInitialHistory(initialHistory);
|
|
123
|
+
this.log.info({ guildId, turns: initialHistory.length }, 'gemini-live conversation history seeded');
|
|
124
|
+
}
|
|
125
|
+
if (!supportsAsyncFunctionCalling && this.silentTools.size > 0) {
|
|
126
|
+
this.log.info({ guildId, model: geminiLiveModel, count: this.silentTools.size }, 'gemini-live: current model does not support scheduled tool responses; silent tool scheduling disabled');
|
|
127
|
+
}
|
|
128
|
+
const mirror = this.transcriptMirror;
|
|
129
|
+
const botName = this.botDisplayName;
|
|
130
|
+
let latestInputTranscript;
|
|
131
|
+
const responder = new GeminiLiveResponder({
|
|
132
|
+
log: this.log,
|
|
133
|
+
connection,
|
|
134
|
+
provider,
|
|
135
|
+
onBotResponse: mirror
|
|
136
|
+
? (text) => {
|
|
137
|
+
if (latestInputTranscript && text.trim()) {
|
|
138
|
+
buffer.push(latestInputTranscript, text);
|
|
139
|
+
latestInputTranscript = undefined;
|
|
160
140
|
}
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
if (text.trim())
|
|
170
|
-
latestInputTranscript = text.trim();
|
|
171
|
-
mirror.postUserTranscription('User', text).catch((err) => {
|
|
172
|
-
this.log.warn({ guildId, err }, 'transcript-mirror: failed to post user transcription');
|
|
173
|
-
});
|
|
141
|
+
mirror.postBotResponse(botName, text).catch((err) => {
|
|
142
|
+
this.log.warn({ guildId, err }, 'transcript-mirror: failed to post bot response');
|
|
143
|
+
});
|
|
144
|
+
}
|
|
145
|
+
: (text) => {
|
|
146
|
+
if (latestInputTranscript && text.trim()) {
|
|
147
|
+
buffer.push(latestInputTranscript, text);
|
|
148
|
+
latestInputTranscript = undefined;
|
|
174
149
|
}
|
|
175
|
-
: (text) => {
|
|
176
|
-
if (text.trim())
|
|
177
|
-
latestInputTranscript = text.trim();
|
|
178
|
-
},
|
|
179
|
-
onSessionTerminated: () => {
|
|
180
|
-
this.log.error({ guildId }, 'gemini-live session terminally failed — no fallback (fallback disabled)');
|
|
181
150
|
},
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
151
|
+
onInputTranscript: mirror
|
|
152
|
+
? (text) => {
|
|
153
|
+
if (text.trim())
|
|
154
|
+
latestInputTranscript = text.trim();
|
|
155
|
+
mirror.postUserTranscription('User', text).catch((err) => {
|
|
156
|
+
this.log.warn({ guildId, err }, 'transcript-mirror: failed to post user transcription');
|
|
157
|
+
});
|
|
158
|
+
}
|
|
159
|
+
: (text) => {
|
|
160
|
+
if (text.trim())
|
|
161
|
+
latestInputTranscript = text.trim();
|
|
187
162
|
},
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
}
|
|
210
|
-
}));
|
|
211
|
-
const silentCount = supportsAsyncFunctionCalling
|
|
212
|
-
? results.filter((r) => r.scheduling === 'SILENT').length
|
|
213
|
-
: 0;
|
|
214
|
-
if (silentCount > 0) {
|
|
215
|
-
this.log.info({ guildId, count: silentCount }, 'gemini-live: SILENT tool execution complete — results scheduled silently');
|
|
216
|
-
}
|
|
163
|
+
onSessionTerminated: () => {
|
|
164
|
+
this.log.error({ guildId }, 'gemini-live session terminally failed — no fallback');
|
|
165
|
+
},
|
|
166
|
+
onFallbackRecommended: (reason) => {
|
|
167
|
+
this.log.warn({ guildId, reason }, 'gemini-live: fallback recommended but the legacy pipeline has been removed');
|
|
168
|
+
},
|
|
169
|
+
onTokenWarning: (estimatedTokens, threshold) => {
|
|
170
|
+
this.log.warn({ guildId, estimatedTokens, threshold }, 'gemini-live: token usage approaching context window limit');
|
|
171
|
+
},
|
|
172
|
+
onToolCall: tools
|
|
173
|
+
? (calls) => {
|
|
174
|
+
this.log.info({ guildId, count: calls.length, names: calls.map((c) => c.name).join(',') }, 'gemini-live: tool call received — dispatching');
|
|
175
|
+
const allowedRoots = this.runtimeCwd ? [this.runtimeCwd] : [];
|
|
176
|
+
const allowedToolNames = new Set(buildToolSchemas(this.enabledTools).map((t) => t.function.name));
|
|
177
|
+
const logFn = (msg) => this.log.info({ guildId }, msg);
|
|
178
|
+
const execOpts = { enableHybridPipeline: false, allowedToolNames };
|
|
179
|
+
void (async () => {
|
|
180
|
+
const results = await Promise.all(calls.map(async (call) => {
|
|
181
|
+
const scheduling = supportsAsyncFunctionCalling
|
|
182
|
+
? (this.isSilentTool(call.name) ? 'SILENT' : 'INTERRUPT')
|
|
183
|
+
: undefined;
|
|
217
184
|
try {
|
|
218
|
-
|
|
185
|
+
const res = await executeToolCall(call.name, call.args, allowedRoots, logFn, execOpts);
|
|
186
|
+
return { id: call.id, name: call.name, output: res.result, scheduling };
|
|
219
187
|
}
|
|
220
188
|
catch (err) {
|
|
221
|
-
|
|
189
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
190
|
+
return { id: call.id, name: call.name, output: `Error: ${msg}`, scheduling };
|
|
222
191
|
}
|
|
223
|
-
})
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
const sttShim = {
|
|
230
|
-
start: async () => { },
|
|
231
|
-
stop: async () => { },
|
|
232
|
-
onTranscription: () => { },
|
|
233
|
-
feedAudio: (frame) => {
|
|
234
|
-
try {
|
|
235
|
-
provider.sendAudio(frame.buffer);
|
|
236
|
-
}
|
|
237
|
-
catch (err) {
|
|
238
|
-
this.log.warn({ guildId, err }, 'gemini-live: sendAudio error (non-fatal)');
|
|
239
|
-
}
|
|
240
|
-
},
|
|
241
|
-
};
|
|
242
|
-
const receiver = new AudioReceiver({
|
|
243
|
-
connection,
|
|
244
|
-
allowedUserIds: this.allowedUserIds,
|
|
245
|
-
sttProvider: sttShim,
|
|
246
|
-
log: this.log,
|
|
247
|
-
createDecoder: this.createDecoder,
|
|
248
|
-
onUserSpeaking: () => { },
|
|
249
|
-
onUserSilence: () => {
|
|
250
|
-
try {
|
|
251
|
-
provider.sendAudioStreamEnd();
|
|
252
|
-
}
|
|
253
|
-
catch (err) {
|
|
254
|
-
this.log.warn({ guildId, err }, 'gemini-live: sendAudioStreamEnd error (non-fatal)');
|
|
255
|
-
}
|
|
256
|
-
},
|
|
257
|
-
});
|
|
258
|
-
receiver.start();
|
|
259
|
-
this.pipelines.set(guildId, {
|
|
260
|
-
connection,
|
|
261
|
-
sttProvider: sttShim,
|
|
262
|
-
receiver,
|
|
263
|
-
buffer,
|
|
264
|
-
geminiProvider: provider,
|
|
265
|
-
geminiResponder: responder,
|
|
266
|
-
mode: 'gemini-live',
|
|
267
|
-
});
|
|
268
|
-
this.log.info({ guildId }, 'audio pipeline started (gemini-live)');
|
|
269
|
-
return;
|
|
270
|
-
}
|
|
271
|
-
// ----- default pipeline mode: STT/TTS/VoiceResponder -----
|
|
272
|
-
const sttProvider = this.createStt(this.voiceConfig, this.log);
|
|
273
|
-
const mirror = this.transcriptMirror;
|
|
274
|
-
// Create conversation buffer and backfill history if available
|
|
275
|
-
let buffer;
|
|
276
|
-
if (this.invokeAi) {
|
|
277
|
-
buffer = new ConversationBuffer();
|
|
278
|
-
if (this.backfill) {
|
|
279
|
-
try {
|
|
280
|
-
const turns = await this.backfill();
|
|
281
|
-
buffer.backfill(turns);
|
|
282
|
-
this.log.info({ guildId, turns: turns.length }, 'conversation buffer backfilled');
|
|
283
|
-
}
|
|
284
|
-
catch (err) {
|
|
285
|
-
this.log.warn({ guildId, err }, 'conversation backfill failed — proceeding with empty buffer');
|
|
286
|
-
}
|
|
287
|
-
}
|
|
288
|
-
}
|
|
289
|
-
// Create VoiceResponder for the full conversation loop if invokeAi is configured
|
|
290
|
-
let responder;
|
|
291
|
-
if (this.invokeAi) {
|
|
292
|
-
try {
|
|
293
|
-
const tts = this.createTts(this.voiceConfig, this.log);
|
|
294
|
-
const botName = this.botDisplayName;
|
|
295
|
-
responder = new VoiceResponder({
|
|
296
|
-
log: this.log,
|
|
297
|
-
tts,
|
|
298
|
-
connection,
|
|
299
|
-
invokeAi: this.invokeAi,
|
|
300
|
-
onBotResponse: mirror
|
|
301
|
-
? (text) => {
|
|
302
|
-
mirror.postBotResponse(botName, text).catch((err) => {
|
|
303
|
-
this.log.warn({ guildId, err }, 'transcript-mirror: failed to post bot response');
|
|
304
|
-
});
|
|
192
|
+
}));
|
|
193
|
+
const silentCount = supportsAsyncFunctionCalling
|
|
194
|
+
? results.filter((r) => r.scheduling === 'SILENT').length
|
|
195
|
+
: 0;
|
|
196
|
+
if (silentCount > 0) {
|
|
197
|
+
this.log.info({ guildId, count: silentCount }, 'gemini-live: SILENT tool execution complete — results scheduled silently');
|
|
305
198
|
}
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
}
|
|
314
|
-
}
|
|
315
|
-
// Wire transcription callback — fires the external callback, transcript mirror, and responder
|
|
316
|
-
const onTranscriptionCb = this.onTranscription;
|
|
317
|
-
if (onTranscriptionCb || responder || mirror) {
|
|
318
|
-
sttProvider.onTranscription((result) => {
|
|
319
|
-
if (onTranscriptionCb) {
|
|
320
|
-
onTranscriptionCb(guildId, result);
|
|
199
|
+
try {
|
|
200
|
+
provider.sendToolResponse(results);
|
|
201
|
+
}
|
|
202
|
+
catch (err) {
|
|
203
|
+
this.log.warn({ guildId, err }, 'gemini-live: sendToolResponse failed (provider likely disconnected)');
|
|
204
|
+
}
|
|
205
|
+
})();
|
|
321
206
|
}
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
207
|
+
: undefined,
|
|
208
|
+
});
|
|
209
|
+
responder.start();
|
|
210
|
+
// SttProvider shim: bridges AudioReceiver frames to GeminiLiveProvider.sendAudio
|
|
211
|
+
const sttShim = {
|
|
212
|
+
start: async () => { },
|
|
213
|
+
stop: async () => { },
|
|
214
|
+
onTranscription: () => { },
|
|
215
|
+
feedAudio: (frame) => {
|
|
216
|
+
try {
|
|
217
|
+
provider.sendAudio(frame.buffer);
|
|
328
218
|
}
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
mirror.postUserTranscription('User', result.text).catch((err) => {
|
|
332
|
-
this.log.warn({ guildId, err }, 'transcript-mirror: failed to post user transcription');
|
|
333
|
-
});
|
|
334
|
-
}
|
|
335
|
-
if (responder) {
|
|
336
|
-
responder.handleTranscription(result.text).catch((err) => {
|
|
337
|
-
this.log.error({ guildId, err }, 'voice-responder: handleTranscription failed');
|
|
338
|
-
});
|
|
339
|
-
}
|
|
219
|
+
catch (err) {
|
|
220
|
+
this.log.warn({ guildId, err }, 'gemini-live: sendAudio error (non-fatal)');
|
|
340
221
|
}
|
|
341
|
-
}
|
|
342
|
-
}
|
|
343
|
-
await sttProvider.start();
|
|
222
|
+
},
|
|
223
|
+
};
|
|
344
224
|
const receiver = new AudioReceiver({
|
|
345
225
|
connection,
|
|
346
226
|
allowedUserIds: this.allowedUserIds,
|
|
347
|
-
sttProvider,
|
|
227
|
+
sttProvider: sttShim,
|
|
348
228
|
log: this.log,
|
|
349
229
|
createDecoder: this.createDecoder,
|
|
350
|
-
onUserSpeaking: (
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
230
|
+
onUserSpeaking: () => { },
|
|
231
|
+
onUserSilence: () => {
|
|
232
|
+
try {
|
|
233
|
+
provider.sendAudioStreamEnd();
|
|
234
|
+
}
|
|
235
|
+
catch (err) {
|
|
236
|
+
this.log.warn({ guildId, err }, 'gemini-live: sendAudioStreamEnd error (non-fatal)');
|
|
237
|
+
}
|
|
354
238
|
},
|
|
355
239
|
});
|
|
356
240
|
receiver.start();
|
|
357
|
-
this.pipelines.set(guildId, {
|
|
358
|
-
|
|
241
|
+
this.pipelines.set(guildId, {
|
|
242
|
+
connection,
|
|
243
|
+
sttProvider: sttShim,
|
|
244
|
+
receiver,
|
|
245
|
+
buffer,
|
|
246
|
+
geminiProvider: provider,
|
|
247
|
+
geminiResponder: responder,
|
|
248
|
+
mode: 'gemini-live',
|
|
249
|
+
});
|
|
250
|
+
this.log.info({ guildId }, 'audio pipeline started (gemini-live)');
|
|
359
251
|
}
|
|
360
252
|
catch (err) {
|
|
361
253
|
this.log.error({ guildId, err }, 'failed to start audio pipeline');
|
|
362
|
-
|
|
363
|
-
if (effectiveMode === 'gemini-live') {
|
|
364
|
-
this.log.error({ guildId }, 'gemini-live: connection failed — no fallback (fallback disabled)');
|
|
365
|
-
}
|
|
254
|
+
this.log.error({ guildId }, 'gemini-live: connection failed — no fallback available');
|
|
366
255
|
}
|
|
367
256
|
finally {
|
|
368
257
|
this.starting.delete(guildId);
|
|
@@ -378,7 +267,6 @@ export class AudioPipelineManager {
|
|
|
378
267
|
if (pipeline.geminiProvider) {
|
|
379
268
|
await pipeline.geminiProvider.disconnect();
|
|
380
269
|
}
|
|
381
|
-
pipeline.responder?.destroy();
|
|
382
270
|
pipeline.receiver.stop();
|
|
383
271
|
try {
|
|
384
272
|
await pipeline.sttProvider.stop();
|
|
@@ -401,58 +289,17 @@ export class AudioPipelineManager {
|
|
|
401
289
|
get activePipelineCount() {
|
|
402
290
|
return this.pipelines.size;
|
|
403
291
|
}
|
|
404
|
-
/** Configured voice provider mode
|
|
292
|
+
/** Configured voice provider mode. */
|
|
405
293
|
get activeVoiceProvider() {
|
|
406
|
-
return
|
|
294
|
+
return 'gemini-live';
|
|
407
295
|
}
|
|
408
|
-
/** Active mode for a specific guild
|
|
296
|
+
/** Active mode for a specific guild. */
|
|
409
297
|
pipelineMode(guildId) {
|
|
410
298
|
return this.pipelines.get(guildId)?.mode;
|
|
411
299
|
}
|
|
412
|
-
/** Current Deepgram TTS voice model name. */
|
|
413
|
-
get ttsVoice() {
|
|
414
|
-
return this.voiceConfig.deepgramTtsVoice;
|
|
415
|
-
}
|
|
416
|
-
/**
|
|
417
|
-
* Fall back from gemini-live to the standard pipeline for a guild.
|
|
418
|
-
* Stops the current gemini-live session and starts a standard STT/AI/TTS pipeline.
|
|
419
|
-
* No-op if no pipeline exists or the guild is already in standard mode.
|
|
420
|
-
*/
|
|
421
|
-
async fallbackToPipeline(guildId, connection) {
|
|
422
|
-
const pipeline = this.pipelines.get(guildId);
|
|
423
|
-
if (!pipeline || pipeline.mode !== 'gemini-live')
|
|
424
|
-
return;
|
|
425
|
-
this.log.warn({ guildId }, 'gemini-live: initiating fallback to standard pipeline');
|
|
426
|
-
await this.stopPipeline(guildId);
|
|
427
|
-
await this.startPipeline(guildId, connection, 'pipeline');
|
|
428
|
-
if (this.hasPipeline(guildId)) {
|
|
429
|
-
this.log.info({ guildId }, 'gemini-live: fallback to standard pipeline succeeded');
|
|
430
|
-
this.onFallbackTriggered?.(guildId, 'pipeline');
|
|
431
|
-
}
|
|
432
|
-
else {
|
|
433
|
-
this.log.error({ guildId }, 'gemini-live: fallback to standard pipeline also failed — guild has no active pipeline');
|
|
434
|
-
}
|
|
435
|
-
}
|
|
436
300
|
isSilentTool(toolName) {
|
|
437
301
|
return this.silentTools.has(toolName) || this.silentTools.has(OPENAI_TO_DISCO_NAME[toolName] ?? toolName);
|
|
438
302
|
}
|
|
439
|
-
/**
|
|
440
|
-
* Update the Deepgram TTS voice and restart all active pipelines so the
|
|
441
|
-
* new voice takes effect immediately. No-op in gemini-live mode (TTS is
|
|
442
|
-
* handled server-side).
|
|
443
|
-
* @returns The number of pipelines that were restarted (0 in gemini-live mode).
|
|
444
|
-
*/
|
|
445
|
-
async setTtsVoice(voice) {
|
|
446
|
-
if (this.voiceProvider === 'gemini-live') {
|
|
447
|
-
this.log.info({ voice }, 'TTS voice change ignored — gemini-live mode uses server-side TTS');
|
|
448
|
-
return 0;
|
|
449
|
-
}
|
|
450
|
-
this.voiceConfig = { ...this.voiceConfig, deepgramTtsVoice: voice };
|
|
451
|
-
this.log.info({ voice }, 'TTS voice updated — restarting active pipelines');
|
|
452
|
-
const entries = [...this.pipelines.entries()];
|
|
453
|
-
await Promise.all(entries.map(([guildId, pipeline]) => this.startPipeline(guildId, pipeline.connection)));
|
|
454
|
-
return entries.length;
|
|
455
|
-
}
|
|
456
303
|
}
|
|
457
304
|
function toGeminiLiveHistoryTurns(turns) {
|
|
458
305
|
const history = [];
|
package/docs/configuration.md
CHANGED
|
@@ -37,7 +37,7 @@ For npm-managed daemon installs, readiness is currently constrained by service e
|
|
|
37
37
|
Model/runtime state is intentionally split across three storage modes:
|
|
38
38
|
|
|
39
39
|
- `models.json` stores persisted model strings per role (`chat`, `fast`, `plan-run`, `voice`, forge roles, cron roles, etc.).
|
|
40
|
-
- `runtime-overrides.json` stores persisted runtime-only overlays such as `fastRuntime` and `voiceRuntime
|
|
40
|
+
- `runtime-overrides.json` stores persisted runtime-only overlays such as `fastRuntime` and `voiceRuntime`.
|
|
41
41
|
- Live chat runtime swaps stay in memory only. `!models set chat <runtime>` changes the active chat runtime immediately, but there is no persisted `chatRuntime` overlay.
|
|
42
42
|
|
|
43
43
|
On first run, `models.json` is scaffolded from the instance startup defaults. After that:
|
|
@@ -322,24 +322,19 @@ The same forum-boundary rule applies to tasks: `DISCOCLAW_TASKS_FORUM` is the di
|
|
|
322
322
|
|
|
323
323
|
## Voice
|
|
324
324
|
|
|
325
|
-
See [docs/voice.md](voice.md) for the full setup guide
|
|
325
|
+
See [docs/voice.md](voice.md) for the full Gemini Live setup guide.
|
|
326
326
|
|
|
327
327
|
| Variable | Default | Description |
|
|
328
328
|
|----------|---------|-------------|
|
|
329
329
|
| `DISCOCLAW_VOICE_ENABLED` | `false` | Master switch for voice subsystem |
|
|
330
330
|
| `DISCOCLAW_VOICE_AUTO_JOIN` | `false` | Auto-join voice channels when users enter |
|
|
331
331
|
| `ANTHROPIC_API_KEY` | — | Anthropic API key (required for direct Messages API voice responses) |
|
|
332
|
+
| `GEMINI_API_KEY` | — | Gemini API key required for Gemini Live voice |
|
|
332
333
|
| `DISCOCLAW_VOICE_MODEL` | follows startup chat model | Model override for voice responses |
|
|
333
334
|
| `DISCOCLAW_VOICE_SYSTEM_PROMPT` | — | System prompt override for voice (max 4000 chars) |
|
|
334
|
-
| `
|
|
335
|
-
| `DISCOCLAW_TTS_PROVIDER` | `cartesia` | Text-to-speech provider: `cartesia`, `deepgram`, `kokoro`, `openai` |
|
|
335
|
+
| `DISCOCLAW_GEMINI_SESSION_ROTATION_MS` | `780000` | Proactive Gemini Live session rotation interval in milliseconds |
|
|
336
336
|
| `DISCOCLAW_VOICE_HOME_CHANNEL` | — | Voice channel name or ID for prompt context |
|
|
337
337
|
| `DISCOCLAW_VOICE_LOG_CHANNEL` | `voice-log` | Text channel for transcript mirror |
|
|
338
|
-
| `DEEPGRAM_API_KEY` | — | Deepgram API key (required for Deepgram STT/TTS) |
|
|
339
|
-
| `DEEPGRAM_STT_MODEL` | `nova-3-general` | Deepgram STT model |
|
|
340
|
-
| `DEEPGRAM_TTS_VOICE` | `aura-2-asteria-en` | Deepgram TTS voice |
|
|
341
|
-
| `DEEPGRAM_TTS_SPEED` | `1.3` | Deepgram TTS playback speed multiplier (0.5–1.5) |
|
|
342
|
-
| `CARTESIA_API_KEY` | — | Cartesia API key (required for Cartesia TTS) |
|
|
343
338
|
|
|
344
339
|
## Webhook
|
|
345
340
|
|
package/docs/official-docs.md
CHANGED
|
@@ -6,8 +6,8 @@ Completeness pass for this index was cross-checked against:
|
|
|
6
6
|
|
|
7
7
|
- `package.json`
|
|
8
8
|
- `.context/runtime.md`
|
|
9
|
-
- `src/voice/
|
|
10
|
-
- `src/voice/
|
|
9
|
+
- `src/voice/audio-pipeline.ts`
|
|
10
|
+
- `src/voice/providers/gemini-live-provider.ts`
|
|
11
11
|
- `src/cold-storage/embeddings.ts`
|
|
12
12
|
- `src/cold-storage/openai-compat.ts`
|
|
13
13
|
- `src/discord/actions-imagegen.ts`
|
|
@@ -24,8 +24,8 @@ Completeness pass for this index was cross-checked against:
|
|
|
24
24
|
| Provider | What DiscoClaw uses | Official docs |
|
|
25
25
|
|----------|----------------------|---------------|
|
|
26
26
|
| Anthropic | Claude model families via `src/runtime/anthropic-rest.ts` and Claude Code CLI runtime | Models overview: <https://docs.anthropic.com/en/docs/about-claude/models/overview><br>Messages API: <https://platform.claude.com/docs/en/api/messages><br>Claude Code docs: <https://code.claude.com/docs/en/overview> |
|
|
27
|
-
| OpenAI | OpenAI-compatible runtime, Codex runtime docs,
|
|
28
|
-
| Google | Gemini API runtime and Gemini/Imagen image generation | Gemini models: <https://ai.google.dev/models/gemini><br>Gemini API docs: <https://ai.google.dev/gemini-api/docs> |
|
|
27
|
+
| OpenAI | OpenAI-compatible runtime, Codex runtime docs, embeddings, and image generation | Model IDs: <https://developers.openai.com/api/model-ids/><br>API reference overview: <https://platform.openai.com/docs/api-reference><br>Codex docs: <https://developers.openai.com/codex/><br>Codex app-server API: <https://developers.openai.com/codex/app-server> |
|
|
28
|
+
| Google | Gemini API runtime, Gemini Live voice, and Gemini/Imagen image generation | Gemini models: <https://ai.google.dev/models/gemini><br>Gemini API docs: <https://ai.google.dev/gemini-api/docs><br>Gemini Live API: <https://ai.google.dev/gemini-api/docs/live> |
|
|
29
29
|
| OpenRouter | OpenRouter runtime through `src/runtime/openai-compat.ts` | Model list: <https://openrouter.ai/models><br>API docs: <https://openrouter.ai/docs/api/reference/overview> |
|
|
30
30
|
|
|
31
31
|
## Discord
|
|
@@ -53,11 +53,8 @@ Completeness pass for this index was cross-checked against:
|
|
|
53
53
|
|
|
54
54
|
| Provider | Used in DiscoClaw | Official docs |
|
|
55
55
|
|----------|-------------------|---------------|
|
|
56
|
-
|
|
|
57
|
-
|
|
|
58
|
-
| Cartesia TTS | `src/voice/tts-cartesia.ts` with Sonic-3 over WebSocket | API docs: <https://docs.cartesia.ai/api-reference><br>TTS WebSocket: <https://docs.cartesia.ai/api-reference/tts/websocket> |
|
|
59
|
-
| OpenAI TTS | `src/voice/tts-openai.ts` (`/v1/audio/speech`, default `tts-1`) | Audio speech API reference: <https://platform.openai.com/docs/api-reference/audio/createSpeech> |
|
|
60
|
-
| OpenAI STT | `src/voice/stt-openai.ts` (`/v1/audio/transcriptions`, `whisper-1`) | Audio transcription API reference: <https://platform.openai.com/docs/api-reference/audio/createTranscription> |
|
|
56
|
+
| Gemini Live | `src/voice/audio-pipeline.ts` and the Gemini Live provider handle speech recognition, reasoning, and speech synthesis in one session | Live API overview: <https://ai.google.dev/gemini-api/docs/live><br>Realtime guide: <https://ai.google.dev/gemini-api/docs/live-guide> |
|
|
57
|
+
| Anthropic Messages API (optional voice runtime) | `!models set voice claude-api` can switch voice response generation to direct Anthropic API calls while Discord audio transport stays on Gemini Live | API overview: <https://docs.anthropic.com/en/api/messages> |
|
|
61
58
|
|
|
62
59
|
## Image Generation
|
|
63
60
|
|