shuvmaki 0.4.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin.js +70 -0
- package/dist/ai-tool-to-genai.js +210 -0
- package/dist/ai-tool-to-genai.test.js +267 -0
- package/dist/channel-management.js +97 -0
- package/dist/cli.js +709 -0
- package/dist/commands/abort.js +78 -0
- package/dist/commands/add-project.js +98 -0
- package/dist/commands/agent.js +152 -0
- package/dist/commands/ask-question.js +183 -0
- package/dist/commands/create-new-project.js +78 -0
- package/dist/commands/fork.js +186 -0
- package/dist/commands/model.js +313 -0
- package/dist/commands/permissions.js +126 -0
- package/dist/commands/queue.js +129 -0
- package/dist/commands/resume.js +145 -0
- package/dist/commands/session.js +142 -0
- package/dist/commands/share.js +80 -0
- package/dist/commands/types.js +2 -0
- package/dist/commands/undo-redo.js +161 -0
- package/dist/commands/user-command.js +145 -0
- package/dist/database.js +184 -0
- package/dist/discord-bot.js +384 -0
- package/dist/discord-utils.js +217 -0
- package/dist/escape-backticks.test.js +410 -0
- package/dist/format-tables.js +96 -0
- package/dist/format-tables.test.js +418 -0
- package/dist/genai-worker-wrapper.js +109 -0
- package/dist/genai-worker.js +297 -0
- package/dist/genai.js +232 -0
- package/dist/interaction-handler.js +144 -0
- package/dist/logger.js +51 -0
- package/dist/markdown.js +310 -0
- package/dist/markdown.test.js +262 -0
- package/dist/message-formatting.js +273 -0
- package/dist/message-formatting.test.js +73 -0
- package/dist/openai-realtime.js +228 -0
- package/dist/opencode.js +216 -0
- package/dist/session-handler.js +580 -0
- package/dist/system-message.js +61 -0
- package/dist/tools.js +356 -0
- package/dist/utils.js +85 -0
- package/dist/voice-handler.js +541 -0
- package/dist/voice.js +314 -0
- package/dist/worker-types.js +4 -0
- package/dist/xml.js +92 -0
- package/dist/xml.test.js +32 -0
- package/package.json +60 -0
- package/src/__snapshots__/compact-session-context-no-system.md +35 -0
- package/src/__snapshots__/compact-session-context.md +47 -0
- package/src/ai-tool-to-genai.test.ts +296 -0
- package/src/ai-tool-to-genai.ts +255 -0
- package/src/channel-management.ts +161 -0
- package/src/cli.ts +1010 -0
- package/src/commands/abort.ts +94 -0
- package/src/commands/add-project.ts +139 -0
- package/src/commands/agent.ts +201 -0
- package/src/commands/ask-question.ts +276 -0
- package/src/commands/create-new-project.ts +111 -0
- package/src/commands/fork.ts +257 -0
- package/src/commands/model.ts +402 -0
- package/src/commands/permissions.ts +146 -0
- package/src/commands/queue.ts +181 -0
- package/src/commands/resume.ts +230 -0
- package/src/commands/session.ts +184 -0
- package/src/commands/share.ts +96 -0
- package/src/commands/types.ts +25 -0
- package/src/commands/undo-redo.ts +213 -0
- package/src/commands/user-command.ts +178 -0
- package/src/database.ts +220 -0
- package/src/discord-bot.ts +513 -0
- package/src/discord-utils.ts +282 -0
- package/src/escape-backticks.test.ts +447 -0
- package/src/format-tables.test.ts +440 -0
- package/src/format-tables.ts +110 -0
- package/src/genai-worker-wrapper.ts +160 -0
- package/src/genai-worker.ts +366 -0
- package/src/genai.ts +321 -0
- package/src/interaction-handler.ts +187 -0
- package/src/logger.ts +57 -0
- package/src/markdown.test.ts +358 -0
- package/src/markdown.ts +365 -0
- package/src/message-formatting.test.ts +81 -0
- package/src/message-formatting.ts +340 -0
- package/src/openai-realtime.ts +363 -0
- package/src/opencode.ts +277 -0
- package/src/session-handler.ts +758 -0
- package/src/system-message.ts +62 -0
- package/src/tools.ts +428 -0
- package/src/utils.ts +118 -0
- package/src/voice-handler.ts +760 -0
- package/src/voice.ts +432 -0
- package/src/worker-types.ts +66 -0
- package/src/xml.test.ts +37 -0
- package/src/xml.ts +121 -0
|
@@ -0,0 +1,366 @@
|
|
|
1
|
+
// Worker thread for GenAI voice processing.
|
|
2
|
+
// Runs in a separate thread to handle audio encoding/decoding without blocking.
|
|
3
|
+
// Resamples 24kHz GenAI output to 48kHz stereo Opus packets for Discord.
|
|
4
|
+
|
|
5
|
+
import { parentPort, threadId } from 'node:worker_threads'
|
|
6
|
+
import { createWriteStream, type WriteStream } from 'node:fs'
|
|
7
|
+
import { mkdir } from 'node:fs/promises'
|
|
8
|
+
import path from 'node:path'
|
|
9
|
+
import { Resampler } from '@purinton/resampler'
|
|
10
|
+
import * as prism from 'prism-media'
|
|
11
|
+
import { startGenAiSession } from './genai.js'
|
|
12
|
+
import type { Session } from '@google/genai'
|
|
13
|
+
import { getTools } from './tools.js'
|
|
14
|
+
import type { WorkerInMessage, WorkerOutMessage } from './worker-types.js'
|
|
15
|
+
import { createLogger } from './logger.js'
|
|
16
|
+
|
|
17
|
+
// Refuse to run on the main thread: parentPort is only non-null when this
// module is loaded inside a worker_threads Worker.
if (!parentPort) {
  throw new Error('This module must be run as a worker thread')
}

// Per-thread logger tag so interleaved output from several workers stays attributable.
const workerLogger = createLogger(`WORKER ${threadId}`)
workerLogger.log('GenAI worker started')
|
+
// Define sendError early so it can be used by global handlers
|
|
25
|
+
function sendError(error: string) {
|
|
26
|
+
if (parentPort) {
|
|
27
|
+
parentPort.postMessage({
|
|
28
|
+
type: 'error',
|
|
29
|
+
error,
|
|
30
|
+
} satisfies WorkerOutMessage)
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
// Add global error handlers for the worker thread.
// Last-resort reporting: forward fatal conditions to the main thread
// before the worker dies, so the parent can surface them.
process.on('uncaughtException', (error) => {
  workerLogger.error('Uncaught exception in worker:', error)
  sendError(`Worker crashed: ${error.message}`)
  // Exit immediately on uncaught exception
  process.exit(1)
})

process.on('unhandledRejection', (reason, promise) => {
  workerLogger.error(
    'Unhandled rejection in worker:',
    reason,
    'at promise:',
    promise,
  )
  // NOTE(review): unlike uncaughtException this path does not exit — the
  // worker keeps running after reporting the rejection.
  sendError(`Worker unhandled rejection: ${reason}`)
})
|
52
|
+
// Audio configuration
const AUDIO_CONFIG = {
  inputSampleRate: 24000, // GenAI output
  inputChannels: 1,
  outputSampleRate: 48000, // Discord expects
  outputChannels: 2,
  opusFrameSize: 960, // 20ms at 48kHz
}

// Initialize audio processing components.
// Mono 24kHz PCM in -> stereo 48kHz PCM out.
const resampler = new Resampler({
  inRate: AUDIO_CONFIG.inputSampleRate,
  outRate: AUDIO_CONFIG.outputSampleRate,
  inChannels: AUDIO_CONFIG.inputChannels,
  outChannels: AUDIO_CONFIG.outputChannels,
  volume: 1,
  filterWindow: 8,
})

// Encodes the resampled 48kHz stereo PCM into 20ms Opus frames.
const opusEncoder = new prism.opus.Encoder({
  rate: AUDIO_CONFIG.outputSampleRate,
  channels: AUDIO_CONFIG.outputChannels,
  frameSize: AUDIO_CONFIG.opusFrameSize,
})

// Pipe resampler to encoder with error handling.
// NOTE(review): .pipe() returns the destination, so this 'error' listener is
// attached to opusEncoder only; resampler errors are handled by the separate
// resampler.on('error') handler further down.
resampler.pipe(opusEncoder).on('error', (error) => {
  workerLogger.error('Pipe error between resampler and encoder:', error)
  sendError(`Audio pipeline error: ${error.message}`)
})

// Opus packet queue and interval for 20ms packet sending
const opusPacketQueue: Buffer[] = []
let packetInterval: NodeJS.Timeout | null = null
87
|
+
// Send packets every 20ms
|
|
88
|
+
function startPacketSending() {
|
|
89
|
+
if (packetInterval) return
|
|
90
|
+
|
|
91
|
+
packetInterval = setInterval(() => {
|
|
92
|
+
const packet = opusPacketQueue.shift()
|
|
93
|
+
if (!packet) return
|
|
94
|
+
|
|
95
|
+
// Transfer packet as ArrayBuffer
|
|
96
|
+
const arrayBuffer = packet.buffer.slice(
|
|
97
|
+
packet.byteOffset,
|
|
98
|
+
packet.byteOffset + packet.byteLength,
|
|
99
|
+
) as ArrayBuffer
|
|
100
|
+
|
|
101
|
+
parentPort!.postMessage(
|
|
102
|
+
{
|
|
103
|
+
type: 'assistantOpusPacket',
|
|
104
|
+
packet: arrayBuffer,
|
|
105
|
+
} satisfies WorkerOutMessage,
|
|
106
|
+
[arrayBuffer], // Transfer ownership
|
|
107
|
+
)
|
|
108
|
+
}, 20)
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
function stopPacketSending() {
|
|
112
|
+
if (packetInterval) {
|
|
113
|
+
clearInterval(packetInterval)
|
|
114
|
+
packetInterval = null
|
|
115
|
+
}
|
|
116
|
+
opusPacketQueue.length = 0
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
// Session state.
// Holds the live GenAI session plus its teardown callback; null until 'init'.
let session: { session: Session; stop: () => void } | null = null

// Audio log stream for assistant audio.
// Only created when DEBUG is set; receives raw 24kHz mono s16le PCM.
let audioLogStream: WriteStream | null = null
|
125
|
+
// Create assistant audio log stream for debugging
|
|
126
|
+
async function createAssistantAudioLogStream(
|
|
127
|
+
guildId: string,
|
|
128
|
+
channelId: string,
|
|
129
|
+
): Promise<WriteStream | null> {
|
|
130
|
+
if (!process.env.DEBUG) return null
|
|
131
|
+
|
|
132
|
+
const timestamp = new Date().toISOString().replace(/[:.]/g, '-')
|
|
133
|
+
const audioDir = path.join(
|
|
134
|
+
process.cwd(),
|
|
135
|
+
'discord-audio-logs',
|
|
136
|
+
guildId,
|
|
137
|
+
channelId,
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
try {
|
|
141
|
+
await mkdir(audioDir, { recursive: true })
|
|
142
|
+
|
|
143
|
+
// Create stream for assistant audio (24kHz mono s16le PCM)
|
|
144
|
+
const outputFileName = `assistant_${timestamp}.24.pcm`
|
|
145
|
+
const outputFilePath = path.join(audioDir, outputFileName)
|
|
146
|
+
const outputAudioStream = createWriteStream(outputFilePath)
|
|
147
|
+
|
|
148
|
+
// Add error handler to prevent crashes
|
|
149
|
+
outputAudioStream.on('error', (error) => {
|
|
150
|
+
workerLogger.error(`Assistant audio log stream error:`, error)
|
|
151
|
+
})
|
|
152
|
+
|
|
153
|
+
workerLogger.log(`Created assistant audio log: ${outputFilePath}`)
|
|
154
|
+
|
|
155
|
+
return outputAudioStream
|
|
156
|
+
} catch (error) {
|
|
157
|
+
workerLogger.error(`Failed to create audio log directory:`, error)
|
|
158
|
+
return null
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
// Handle encoded Opus packets.
// Encoded frames are queued here and drained by the 20ms interval sender.
opusEncoder.on('data', (packet: Buffer) => {
  opusPacketQueue.push(packet)
})

// Handle stream end events
opusEncoder.on('end', () => {
  workerLogger.log('Opus encoder stream ended')
})

resampler.on('end', () => {
  workerLogger.log('Resampler stream ended')
})

// Handle errors
resampler.on('error', (error: any) => {
  workerLogger.error(`Resampler error:`, error)
  sendError(`Resampler error: ${error.message}`)
})

opusEncoder.on('error', (error: any) => {
  workerLogger.error(`Encoder error:`, error)
  // Check for specific corrupted data errors: a corrupted frame is treated
  // as recoverable — log locally instead of surfacing to the main thread.
  if (error.message?.includes('The compressed data passed is corrupted')) {
    workerLogger.warn('Received corrupted audio data in opus encoder')
  } else {
    sendError(`Encoder error: ${error.message}`)
  }
})
|
+
|
|
192
|
+
async function cleanupAsync(): Promise<void> {
|
|
193
|
+
workerLogger.log(`Starting async cleanup`)
|
|
194
|
+
|
|
195
|
+
stopPacketSending()
|
|
196
|
+
|
|
197
|
+
if (session) {
|
|
198
|
+
workerLogger.log(`Stopping GenAI session`)
|
|
199
|
+
session.stop()
|
|
200
|
+
session = null
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
// Wait for audio log stream to finish writing
|
|
204
|
+
if (audioLogStream) {
|
|
205
|
+
workerLogger.log(`Closing assistant audio log stream`)
|
|
206
|
+
await new Promise<void>((resolve, reject) => {
|
|
207
|
+
audioLogStream!.end(() => {
|
|
208
|
+
workerLogger.log(`Assistant audio log stream closed`)
|
|
209
|
+
resolve()
|
|
210
|
+
})
|
|
211
|
+
audioLogStream!.on('error', reject)
|
|
212
|
+
// Add timeout to prevent hanging
|
|
213
|
+
setTimeout(() => {
|
|
214
|
+
workerLogger.log(`Audio stream close timeout, continuing`)
|
|
215
|
+
resolve()
|
|
216
|
+
}, 3000)
|
|
217
|
+
})
|
|
218
|
+
audioLogStream = null
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
// Unpipe and end the encoder first
|
|
222
|
+
resampler.unpipe(opusEncoder)
|
|
223
|
+
|
|
224
|
+
// End the encoder stream
|
|
225
|
+
await new Promise<void>((resolve) => {
|
|
226
|
+
opusEncoder.end(() => {
|
|
227
|
+
workerLogger.log(`Opus encoder ended`)
|
|
228
|
+
resolve()
|
|
229
|
+
})
|
|
230
|
+
// Add timeout
|
|
231
|
+
setTimeout(resolve, 1000)
|
|
232
|
+
})
|
|
233
|
+
|
|
234
|
+
// End the resampler stream
|
|
235
|
+
await new Promise<void>((resolve) => {
|
|
236
|
+
resampler.end(() => {
|
|
237
|
+
workerLogger.log(`Resampler ended`)
|
|
238
|
+
resolve()
|
|
239
|
+
})
|
|
240
|
+
// Add timeout
|
|
241
|
+
setTimeout(resolve, 1000)
|
|
242
|
+
})
|
|
243
|
+
|
|
244
|
+
workerLogger.log(`Async cleanup complete`)
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
// Handle messages from main thread.
// All message types are dispatched here; any synchronous or awaited failure
// is caught and forwarded to the main thread as an 'error' message.
parentPort.on('message', async (message: WorkerInMessage) => {
  try {
    switch (message.type) {
      // One-time setup: debug audio log, packet pump, tool discovery, and
      // the live GenAI session. Replies 'ready' when everything is up.
      case 'init': {
        workerLogger.log(`Initializing with directory:`, message.directory)

        // Create audio log stream for assistant audio
        audioLogStream = await createAssistantAudioLogStream(
          message.guildId,
          message.channelId,
        )

        // Start packet sending interval
        startPacketSending()

        // Get tools for the directory
        const { tools } = await getTools({
          directory: message.directory,
          // Relay tool-call completions to the main thread
          onMessageCompleted: (params) => {
            parentPort!.postMessage({
              type: 'toolCallCompleted',
              ...params,
            } satisfies WorkerOutMessage)
          },
        })

        // Start GenAI session
        session = await startGenAiSession({
          tools,
          systemMessage: message.systemMessage,
          geminiApiKey: message.geminiApiKey,
          // Each assistant PCM chunk is (optionally) logged to disk and fed
          // into the resampler -> opus encoder pipeline.
          onAssistantAudioChunk({ data }) {
            // Write to audio log if enabled
            if (audioLogStream && !audioLogStream.destroyed) {
              audioLogStream.write(data, (err) => {
                if (err) {
                  workerLogger.error('Error writing to audio log:', err)
                }
              })
            }

            // Write PCM data to resampler which will output Opus packets
            if (!resampler.destroyed) {
              resampler.write(data, (err) => {
                if (err) {
                  workerLogger.error('Error writing to resampler:', err)
                  sendError(`Failed to process audio: ${err.message}`)
                }
              })
            }
          },
          // Speaking-state transitions are relayed to the main thread verbatim
          onAssistantStartSpeaking() {
            parentPort!.postMessage({
              type: 'assistantStartSpeaking',
            } satisfies WorkerOutMessage)
          },
          onAssistantStopSpeaking() {
            parentPort!.postMessage({
              type: 'assistantStopSpeaking',
            } satisfies WorkerOutMessage)
          },
          onAssistantInterruptSpeaking() {
            parentPort!.postMessage({
              type: 'assistantInterruptSpeaking',
            } satisfies WorkerOutMessage)
          },
        })

        // Notify main thread we're ready
        parentPort!.postMessage({
          type: 'ready',
        } satisfies WorkerOutMessage)
        break
      }

      // Forward a chunk of user audio (or an end-of-stream marker) to GenAI.
      case 'sendRealtimeInput': {
        if (!session) {
          sendError('Session not initialized')
          return
        }
        session.session.sendRealtimeInput({
          audio: message.audio,
          audioStreamEnd: message.audioStreamEnd,
        })
        break
      }

      // Forward a text message into the realtime session.
      case 'sendTextInput': {
        if (!session) {
          sendError('Session not initialized')
          return
        }
        session.session.sendRealtimeInput({
          text: message.text,
        })
        break
      }

      // Drop queued assistant audio so playback stops immediately.
      case 'interrupt': {
        workerLogger.log(`Interrupting playback`)
        // Clear the opus packet queue
        opusPacketQueue.length = 0
        break
      }

      // Graceful shutdown; the main thread is expected to terminate the
      // worker after cleanup completes.
      case 'stop': {
        workerLogger.log(`Stopping worker`)
        await cleanupAsync()
        // process.exit(0)
        break
      }
    }
  } catch (error) {
    workerLogger.error(`Error handling message:`, error)
    sendError(
      error instanceof Error ? error.message : 'Unknown error in worker',
    )
  }
})
|
package/src/genai.ts
ADDED
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
// Google GenAI Live session manager for real-time voice interactions.
|
|
2
|
+
// Establishes bidirectional audio streaming with Gemini, handles tool calls,
|
|
3
|
+
// and manages the assistant's audio output for Discord voice channels.
|
|
4
|
+
|
|
5
|
+
import {
|
|
6
|
+
GoogleGenAI,
|
|
7
|
+
LiveServerMessage,
|
|
8
|
+
MediaResolution,
|
|
9
|
+
Modality,
|
|
10
|
+
Session,
|
|
11
|
+
} from '@google/genai'
|
|
12
|
+
import type { CallableTool } from '@google/genai'
|
|
13
|
+
import { writeFile } from 'fs'
|
|
14
|
+
import type { Tool as AITool } from 'ai'
|
|
15
|
+
|
|
16
|
+
import { createLogger } from './logger.js'
|
|
17
|
+
import { aiToolToCallableTool } from './ai-tool-to-genai.js'
|
|
18
|
+
|
|
19
|
+
const genaiLogger = createLogger('GENAI')
|
|
20
|
+
|
|
21
|
+
const audioParts: Buffer[] = []
|
|
22
|
+
|
|
23
|
+
function saveBinaryFile(fileName: string, content: Buffer) {
|
|
24
|
+
writeFile(fileName, content, 'utf8', (err) => {
|
|
25
|
+
if (err) {
|
|
26
|
+
genaiLogger.error(`Error writing file ${fileName}:`, err)
|
|
27
|
+
return
|
|
28
|
+
}
|
|
29
|
+
genaiLogger.log(`Appending stream content to file ${fileName}.`)
|
|
30
|
+
})
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
interface WavConversionOptions {
|
|
34
|
+
numChannels: number
|
|
35
|
+
sampleRate: number
|
|
36
|
+
bitsPerSample: number
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
function convertToWav(rawData: Buffer[], mimeType: string) {
|
|
40
|
+
const options = parseMimeType(mimeType)
|
|
41
|
+
const dataLength = rawData.reduce((a, b) => a + b.length, 0)
|
|
42
|
+
const wavHeader = createWavHeader(dataLength, options)
|
|
43
|
+
const buffer = Buffer.concat(rawData)
|
|
44
|
+
|
|
45
|
+
return Buffer.concat([wavHeader, buffer])
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
function parseMimeType(mimeType: string) {
|
|
49
|
+
const [fileType, ...params] = mimeType.split(';').map((s) => s.trim())
|
|
50
|
+
const [_, format] = fileType?.split('/') || []
|
|
51
|
+
|
|
52
|
+
const options: Partial<WavConversionOptions> = {
|
|
53
|
+
numChannels: 1,
|
|
54
|
+
bitsPerSample: 16,
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
if (format && format.startsWith('L')) {
|
|
58
|
+
const bits = parseInt(format.slice(1), 10)
|
|
59
|
+
if (!isNaN(bits)) {
|
|
60
|
+
options.bitsPerSample = bits
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
for (const param of params) {
|
|
65
|
+
const [key, value] = param.split('=').map((s) => s.trim())
|
|
66
|
+
if (key === 'rate') {
|
|
67
|
+
options.sampleRate = parseInt(value || '', 10)
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
return options as WavConversionOptions
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
function createWavHeader(dataLength: number, options: WavConversionOptions) {
|
|
75
|
+
const { numChannels, sampleRate, bitsPerSample } = options
|
|
76
|
+
|
|
77
|
+
// http://soundfile.sapp.org/doc/WaveFormat
|
|
78
|
+
|
|
79
|
+
const byteRate = (sampleRate * numChannels * bitsPerSample) / 8
|
|
80
|
+
const blockAlign = (numChannels * bitsPerSample) / 8
|
|
81
|
+
const buffer = Buffer.alloc(44)
|
|
82
|
+
|
|
83
|
+
buffer.write('RIFF', 0) // ChunkID
|
|
84
|
+
buffer.writeUInt32LE(36 + dataLength, 4) // ChunkSize
|
|
85
|
+
buffer.write('WAVE', 8) // Format
|
|
86
|
+
buffer.write('fmt ', 12) // Subchunk1ID
|
|
87
|
+
buffer.writeUInt32LE(16, 16) // Subchunk1Size (PCM)
|
|
88
|
+
buffer.writeUInt16LE(1, 20) // AudioFormat (1 = PCM)
|
|
89
|
+
buffer.writeUInt16LE(numChannels, 22) // NumChannels
|
|
90
|
+
buffer.writeUInt32LE(sampleRate, 24) // SampleRate
|
|
91
|
+
buffer.writeUInt32LE(byteRate, 28) // ByteRate
|
|
92
|
+
buffer.writeUInt16LE(blockAlign, 32) // BlockAlign
|
|
93
|
+
buffer.writeUInt16LE(bitsPerSample, 34) // BitsPerSample
|
|
94
|
+
buffer.write('data', 36) // Subchunk2ID
|
|
95
|
+
buffer.writeUInt32LE(dataLength, 40) // Subchunk2Size
|
|
96
|
+
|
|
97
|
+
return buffer
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
function defaultAudioChunkHandler({
|
|
101
|
+
data,
|
|
102
|
+
mimeType,
|
|
103
|
+
}: {
|
|
104
|
+
data: Buffer
|
|
105
|
+
mimeType: string
|
|
106
|
+
}) {
|
|
107
|
+
audioParts.push(data)
|
|
108
|
+
const fileName = 'audio.wav'
|
|
109
|
+
const buffer = convertToWav(audioParts, mimeType)
|
|
110
|
+
saveBinaryFile(fileName, buffer)
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
/**
 * Opens a Google GenAI Live session with bidirectional audio streaming.
 *
 * @param onAssistantAudioChunk - receives each base64-decoded assistant audio
 *   chunk; defaults to a debug handler that rewrites audio.wav.
 * @param onAssistantStartSpeaking - fired on the first audio chunk of a turn.
 * @param onAssistantStopSpeaking - fired when the model's turn completes while speaking.
 * @param onAssistantInterruptSpeaking - fired when the server reports the model was interrupted.
 * @param systemMessage - system instruction text for the session.
 * @param tools - AI SDK tools; converted to GenAI CallableTools before connect.
 * @param geminiApiKey - overrides the GEMINI_API_KEY environment variable.
 * @returns the live session plus a stop() that closes it at most once.
 * @throws Error when no API key is available from either source.
 */
export async function startGenAiSession({
  onAssistantAudioChunk,
  onAssistantStartSpeaking,
  onAssistantStopSpeaking,
  onAssistantInterruptSpeaking,
  systemMessage,
  tools,
  geminiApiKey,
}: {
  onAssistantAudioChunk?: (args: { data: Buffer; mimeType: string }) => void
  onAssistantStartSpeaking?: () => void
  onAssistantStopSpeaking?: () => void
  onAssistantInterruptSpeaking?: () => void
  systemMessage?: string
  tools?: Record<string, AITool<any, any>>
  geminiApiKey?: string | null
} = {}) {
  let session: Session | undefined = undefined
  const callableTools: Array<CallableTool & { name: string }> = []
  // Tracks whether we are mid-turn so start/stop callbacks fire exactly once per turn
  let isAssistantSpeaking = false

  const audioChunkHandler = onAssistantAudioChunk || defaultAudioChunkHandler

  // Convert AI SDK tools to GenAI CallableTools
  if (tools) {
    for (const [name, tool] of Object.entries(tools)) {
      callableTools.push(aiToolToCallableTool(tool, name))
    }
  }

  // Dispatch one server message: tool calls, audio/text parts,
  // transcriptions, interruption, and turn completion.
  function handleModelTurn(message: LiveServerMessage) {
    if (message.toolCall) {
      genaiLogger.log('Tool call:', message.toolCall)

      // Handle tool calls
      if (message.toolCall.functionCalls && callableTools.length > 0) {
        for (const tool of callableTools) {
          // Skip tools that are not named in this batch of function calls
          if (
            !message.toolCall.functionCalls.some((x) => x.name === tool.name)
          ) {
            continue
          }
          // Tool execution is fire-and-forget; responses are sent back to the
          // session asynchronously when the tool resolves.
          tool
            .callTool(message.toolCall.functionCalls)
            .then((parts) => {
              const functionResponses = parts
                .filter((part) => part.functionResponse)
                .map((part) => ({
                  response: part.functionResponse!.response as Record<
                    string,
                    unknown
                  >,
                  id: part.functionResponse!.id,
                  name: part.functionResponse!.name,
                }))

              // Session may have been stopped while the tool was running
              if (functionResponses.length > 0 && session) {
                session.sendToolResponse({ functionResponses })
                genaiLogger.log(
                  'client-toolResponse: ' +
                    JSON.stringify({ functionResponses }),
                )
              }
            })
            .catch((error) => {
              genaiLogger.error('Error handling tool calls:', error)
            })
        }
      }
    }
    if (message.serverContent?.modelTurn?.parts) {
      for (const part of message.serverContent.modelTurn.parts) {
        if (part?.fileData) {
          genaiLogger.log(`File: ${part?.fileData.fileUri}`)
        }

        if (part?.inlineData) {
          const inlineData = part.inlineData
          // Only audio payloads are forwarded to the audio chunk handler
          if (
            !inlineData.mimeType ||
            !inlineData.mimeType.startsWith('audio/')
          ) {
            genaiLogger.log(
              'Skipping non-audio inlineData:',
              inlineData.mimeType,
            )
            continue
          }

          // Trigger start speaking callback the first time audio is received
          if (!isAssistantSpeaking && onAssistantStartSpeaking) {
            isAssistantSpeaking = true
            onAssistantStartSpeaking()
          }

          // Inline audio arrives base64-encoded
          const buffer = Buffer.from(inlineData?.data ?? '', 'base64')
          audioChunkHandler({
            data: buffer,
            mimeType: inlineData.mimeType ?? '',
          })
        }

        if (part?.text) {
          genaiLogger.log('Text:', part.text)
        }
      }
    }
    // Handle input transcription (user's audio transcription)
    if (message.serverContent?.inputTranscription?.text) {
      genaiLogger.log(
        '[user transcription]',
        message.serverContent.inputTranscription.text,
      )
    }

    // Handle output transcription (model's audio transcription)
    if (message.serverContent?.outputTranscription?.text) {
      genaiLogger.log(
        '[assistant transcription]',
        message.serverContent.outputTranscription.text,
      )
    }
    if (message.serverContent?.interrupted) {
      genaiLogger.log('Assistant was interrupted')
      if (isAssistantSpeaking && onAssistantInterruptSpeaking) {
        isAssistantSpeaking = false
        onAssistantInterruptSpeaking()
      }
    }
    if (message.serverContent?.turnComplete) {
      genaiLogger.log('Assistant turn complete')
      if (isAssistantSpeaking && onAssistantStopSpeaking) {
        isAssistantSpeaking = false
        onAssistantStopSpeaking()
      }
    }
  }

  // Explicit key takes precedence over the environment
  const apiKey = geminiApiKey || process.env.GEMINI_API_KEY

  if (!apiKey) {
    genaiLogger.error('No Gemini API key provided')
    throw new Error('Gemini API key is required for voice interactions')
  }

  const ai = new GoogleGenAI({
    apiKey,
  })

  const model = 'gemini-2.5-flash-native-audio-preview-12-2025'

  session = await ai.live.connect({
    model,
    callbacks: {
      onopen: function () {
        genaiLogger.debug('Opened')
      },
      onmessage: function (message: LiveServerMessage) {
        // genaiLogger.log(message)
        // Never let a handler error kill the websocket callback chain
        try {
          handleModelTurn(message)
        } catch (error) {
          genaiLogger.error('Error handling turn:', error)
        }
      },
      onerror: function (e: ErrorEvent) {
        genaiLogger.debug('Error:', e.message)
      },
      onclose: function (e: CloseEvent) {
        genaiLogger.debug('Close:', e.reason)
      },
    },
    config: {
      tools: callableTools,
      responseModalities: [Modality.AUDIO],
      mediaResolution: MediaResolution.MEDIA_RESOLUTION_MEDIUM,
      inputAudioTranscription: {}, // transcribes your input speech
      outputAudioTranscription: {}, // transcribes the model's spoken audio
      systemInstruction: {
        parts: [
          {
            text: systemMessage || '',
          },
        ],
      },
      speechConfig: {
        voiceConfig: {
          prebuiltVoiceConfig: {
            voiceName: 'Charon', // Orus also not bad
          },
        },
      },
      // NOTE(review): token counts are passed as strings here — confirm this
      // matches the @google/genai config typing in use.
      contextWindowCompression: {
        triggerTokens: '25600',

        slidingWindow: { targetTokens: '12800' },
      },
    },
  })

  return {
    session,
    // Clears the local reference before closing so late tool responses
    // (which check `session`) are dropped once stop() has been called.
    stop: () => {
      const currentSession = session
      session = undefined
      currentSession?.close()
    },
  }
}
|