shuvmaki 0.4.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/bin.js +70 -0
  2. package/dist/ai-tool-to-genai.js +210 -0
  3. package/dist/ai-tool-to-genai.test.js +267 -0
  4. package/dist/channel-management.js +97 -0
  5. package/dist/cli.js +709 -0
  6. package/dist/commands/abort.js +78 -0
  7. package/dist/commands/add-project.js +98 -0
  8. package/dist/commands/agent.js +152 -0
  9. package/dist/commands/ask-question.js +183 -0
  10. package/dist/commands/create-new-project.js +78 -0
  11. package/dist/commands/fork.js +186 -0
  12. package/dist/commands/model.js +313 -0
  13. package/dist/commands/permissions.js +126 -0
  14. package/dist/commands/queue.js +129 -0
  15. package/dist/commands/resume.js +145 -0
  16. package/dist/commands/session.js +142 -0
  17. package/dist/commands/share.js +80 -0
  18. package/dist/commands/types.js +2 -0
  19. package/dist/commands/undo-redo.js +161 -0
  20. package/dist/commands/user-command.js +145 -0
  21. package/dist/database.js +184 -0
  22. package/dist/discord-bot.js +384 -0
  23. package/dist/discord-utils.js +217 -0
  24. package/dist/escape-backticks.test.js +410 -0
  25. package/dist/format-tables.js +96 -0
  26. package/dist/format-tables.test.js +418 -0
  27. package/dist/genai-worker-wrapper.js +109 -0
  28. package/dist/genai-worker.js +297 -0
  29. package/dist/genai.js +232 -0
  30. package/dist/interaction-handler.js +144 -0
  31. package/dist/logger.js +51 -0
  32. package/dist/markdown.js +310 -0
  33. package/dist/markdown.test.js +262 -0
  34. package/dist/message-formatting.js +273 -0
  35. package/dist/message-formatting.test.js +73 -0
  36. package/dist/openai-realtime.js +228 -0
  37. package/dist/opencode.js +216 -0
  38. package/dist/session-handler.js +580 -0
  39. package/dist/system-message.js +61 -0
  40. package/dist/tools.js +356 -0
  41. package/dist/utils.js +85 -0
  42. package/dist/voice-handler.js +541 -0
  43. package/dist/voice.js +314 -0
  44. package/dist/worker-types.js +4 -0
  45. package/dist/xml.js +92 -0
  46. package/dist/xml.test.js +32 -0
  47. package/package.json +60 -0
  48. package/src/__snapshots__/compact-session-context-no-system.md +35 -0
  49. package/src/__snapshots__/compact-session-context.md +47 -0
  50. package/src/ai-tool-to-genai.test.ts +296 -0
  51. package/src/ai-tool-to-genai.ts +255 -0
  52. package/src/channel-management.ts +161 -0
  53. package/src/cli.ts +1010 -0
  54. package/src/commands/abort.ts +94 -0
  55. package/src/commands/add-project.ts +139 -0
  56. package/src/commands/agent.ts +201 -0
  57. package/src/commands/ask-question.ts +276 -0
  58. package/src/commands/create-new-project.ts +111 -0
  59. package/src/commands/fork.ts +257 -0
  60. package/src/commands/model.ts +402 -0
  61. package/src/commands/permissions.ts +146 -0
  62. package/src/commands/queue.ts +181 -0
  63. package/src/commands/resume.ts +230 -0
  64. package/src/commands/session.ts +184 -0
  65. package/src/commands/share.ts +96 -0
  66. package/src/commands/types.ts +25 -0
  67. package/src/commands/undo-redo.ts +213 -0
  68. package/src/commands/user-command.ts +178 -0
  69. package/src/database.ts +220 -0
  70. package/src/discord-bot.ts +513 -0
  71. package/src/discord-utils.ts +282 -0
  72. package/src/escape-backticks.test.ts +447 -0
  73. package/src/format-tables.test.ts +440 -0
  74. package/src/format-tables.ts +110 -0
  75. package/src/genai-worker-wrapper.ts +160 -0
  76. package/src/genai-worker.ts +366 -0
  77. package/src/genai.ts +321 -0
  78. package/src/interaction-handler.ts +187 -0
  79. package/src/logger.ts +57 -0
  80. package/src/markdown.test.ts +358 -0
  81. package/src/markdown.ts +365 -0
  82. package/src/message-formatting.test.ts +81 -0
  83. package/src/message-formatting.ts +340 -0
  84. package/src/openai-realtime.ts +363 -0
  85. package/src/opencode.ts +277 -0
  86. package/src/session-handler.ts +758 -0
  87. package/src/system-message.ts +62 -0
  88. package/src/tools.ts +428 -0
  89. package/src/utils.ts +118 -0
  90. package/src/voice-handler.ts +760 -0
  91. package/src/voice.ts +432 -0
  92. package/src/worker-types.ts +66 -0
  93. package/src/xml.test.ts +37 -0
  94. package/src/xml.ts +121 -0
@@ -0,0 +1,366 @@
1
+ // Worker thread for GenAI voice processing.
2
+ // Runs in a separate thread to handle audio encoding/decoding without blocking.
3
+ // Resamples 24kHz GenAI output to 48kHz stereo Opus packets for Discord.
4
+
5
+ import { parentPort, threadId } from 'node:worker_threads'
6
+ import { createWriteStream, type WriteStream } from 'node:fs'
7
+ import { mkdir } from 'node:fs/promises'
8
+ import path from 'node:path'
9
+ import { Resampler } from '@purinton/resampler'
10
+ import * as prism from 'prism-media'
11
+ import { startGenAiSession } from './genai.js'
12
+ import type { Session } from '@google/genai'
13
+ import { getTools } from './tools.js'
14
+ import type { WorkerInMessage, WorkerOutMessage } from './worker-types.js'
15
+ import { createLogger } from './logger.js'
16
+
17
// This module talks to its parent through `parentPort`, which is only set when
// the code runs inside a worker thread. Refuse to load anywhere else.
if (parentPort === null) {
  throw new Error('This module must be run as a worker thread')
}

// Logger created with a label that contains this worker's thread id.
const workerLogger = createLogger(`WORKER ${threadId}`)
workerLogger.log('GenAI worker started')
23
+
24
/**
 * Reports an error string to the main thread as an `error` worker message.
 * Declared ahead of the global process handlers so they can call it.
 * Does nothing when there is no parent port.
 */
function sendError(error: string) {
  parentPort?.postMessage({ type: 'error', error } satisfies WorkerOutMessage)
}
33
+
34
// Process-level safety nets for the worker thread.

// Logs the exception, reports it to the main thread, then calls
// process.exit(1) right away.
function onUncaughtException(error: Error) {
  workerLogger.error('Uncaught exception in worker:', error)
  sendError(`Worker crashed: ${error.message}`)
  process.exit(1)
}

// Logs the rejection and reports it to the main thread; does not exit.
function onUnhandledRejection(reason: unknown, promise: Promise<unknown>) {
  workerLogger.error(
    'Unhandled rejection in worker:',
    reason,
    'at promise:',
    promise,
  )
  sendError(`Worker unhandled rejection: ${reason}`)
}

process.on('uncaughtException', onUncaughtException)
process.on('unhandledRejection', onUnhandledRejection)
51
+
52
// Sample rates, channel counts and Opus framing used by the audio pipeline.
const AUDIO_CONFIG = {
  // PCM format produced by GenAI
  inputSampleRate: 24000,
  inputChannels: 1,
  // PCM format Discord expects
  outputSampleRate: 48000,
  outputChannels: 2,
  // Samples per Opus frame: 20ms at 48kHz
  opusFrameSize: 960,
}

// Stage 1: convert from the input format to the output format.
const resampler = new Resampler({
  inRate: AUDIO_CONFIG.inputSampleRate,
  inChannels: AUDIO_CONFIG.inputChannels,
  outRate: AUDIO_CONFIG.outputSampleRate,
  outChannels: AUDIO_CONFIG.outputChannels,
  volume: 1,
  filterWindow: 8,
})

// Stage 2: encode the output-format PCM into Opus frames.
const opusEncoder = new prism.opus.Encoder({
  rate: AUDIO_CONFIG.outputSampleRate,
  channels: AUDIO_CONFIG.outputChannels,
  frameSize: AUDIO_CONFIG.opusFrameSize,
})
76
+
77
// Feed resampled PCM into the Opus encoder.
//
// `pipe()` returns its destination, so chaining `.on('error', ...)` here would
// register a second 'error' listener on `opusEncoder`, on top of the dedicated
// encoder listener registered further down in this file. Every encoder error
// would then reach the main thread twice, including the corrupted-data errors
// that the dedicated listener deliberately only logs. Errors from both streams
// are therefore left to the dedicated `resampler` / `opusEncoder` listeners.
resampler.pipe(opusEncoder)
82
+
83
// Encoded Opus packets waiting to be forwarded, and the timer that drains them.
const opusPacketQueue: Buffer[] = []
let packetInterval: NodeJS.Timeout | null = null

/**
 * Starts the 20ms timer that forwards queued Opus packets to the main thread,
 * one packet per tick. A tick with an empty queue does nothing. Calling this
 * while the timer is already running is a no-op.
 */
function startPacketSending() {
  if (packetInterval !== null) {
    return
  }

  const forwardNextPacket = () => {
    const next = opusPacketQueue.shift()
    if (next === undefined) {
      return
    }

    // Copy just this packet's bytes into a standalone ArrayBuffer so it can be
    // handed over through the transfer list.
    const start = next.byteOffset
    const end = start + next.byteLength
    const transferable = next.buffer.slice(start, end) as ArrayBuffer

    parentPort!.postMessage(
      {
        type: 'assistantOpusPacket',
        packet: transferable,
      } satisfies WorkerOutMessage,
      [transferable],
    )
  }

  packetInterval = setInterval(forwardNextPacket, 20)
}
110
+
111
/** Stops the forwarding timer, if one is running, and discards all queued packets. */
function stopPacketSending() {
  if (packetInterval !== null) {
    clearInterval(packetInterval)
    packetInterval = null
  }
  // Empty the queue in place so other references to the array see it cleared.
  opusPacketQueue.splice(0)
}
118
+
119
// Handle of the running GenAI session; null while no session is active.
let session: { session: Session; stop: () => void } | null = null

// File stream receiving the assistant's audio; null when logging is off.
let audioLogStream: WriteStream | null = null

/**
 * Opens a file stream used to log the assistant's audio for debugging.
 *
 * Logging only happens when the DEBUG environment variable is set. The file is
 * created as
 * `<cwd>/discord-audio-logs/<guildId>/<channelId>/assistant_<timestamp>.24.pcm`.
 *
 * @returns the open stream, or null when logging is disabled or setup failed
 */
async function createAssistantAudioLogStream(
  guildId: string,
  channelId: string,
): Promise<WriteStream | null> {
  if (!process.env.DEBUG) {
    return null
  }

  // ISO timestamp with ':' and '.' replaced by '-'
  const stamp = new Date().toISOString().replace(/[:.]/g, '-')
  const logDir = path.join(
    process.cwd(),
    'discord-audio-logs',
    guildId,
    channelId,
  )

  try {
    await mkdir(logDir, { recursive: true })

    // Assistant audio file (24kHz mono s16le PCM)
    const logPath = path.join(logDir, `assistant_${stamp}.24.pcm`)
    const logStream = createWriteStream(logPath)

    // Stream errors are logged here rather than left unhandled.
    logStream.on('error', (error) => {
      workerLogger.error(`Assistant audio log stream error:`, error)
    })

    workerLogger.log(`Created assistant audio log: ${logPath}`)
    return logStream
  } catch (error) {
    workerLogger.error(`Failed to create audio log directory:`, error)
    return null
  }
}
161
+
162
// Every encoded Opus packet goes onto the queue drained by the 20ms sender.
opusEncoder.on('data', (packet: Buffer) => {
  opusPacketQueue.push(packet)
})

// Log when either stage of the pipeline ends.
opusEncoder.on('end', () => {
  workerLogger.log('Opus encoder stream ended')
})

resampler.on('end', () => {
  workerLogger.log('Resampler stream ended')
})

// Resampler failures are logged and always reported to the main thread.
resampler.on('error', (error: any) => {
  workerLogger.error(`Resampler error:`, error)
  sendError(`Resampler error: ${error.message}`)
})

// Encoder failures are logged; corrupted-data errors only produce a warning,
// anything else is reported to the main thread.
opusEncoder.on('error', (error: any) => {
  workerLogger.error(`Encoder error:`, error)

  const isCorruptedData = error.message?.includes(
    'The compressed data passed is corrupted',
  )
  if (isCorruptedData) {
    workerLogger.warn('Received corrupted audio data in opus encoder')
    return
  }

  sendError(`Encoder error: ${error.message}`)
})
191
+
192
/**
 * Ends `stream` and resolves once its `end` callback fires or `timeoutMs`
 * elapses, whichever happens first. The timer is cleared as soon as the stream
 * ends, so no stale timeout callback runs after cleanup has moved on.
 *
 * @returns 'ended' when the stream finished in time, 'timeout' otherwise
 */
function endStreamWithTimeout(
  stream: { end(callback: () => void): unknown },
  timeoutMs: number,
): Promise<'ended' | 'timeout'> {
  return new Promise((resolve, reject) => {
    const timer = setTimeout(() => resolve('timeout'), timeoutMs)
    try {
      stream.end(() => {
        clearTimeout(timer)
        resolve('ended')
      })
    } catch (error) {
      // Do not leave the timer pending if end() throws synchronously.
      clearTimeout(timer)
      reject(error)
    }
  })
}

/**
 * Tears the worker's resources down in order: packet timer and queue, GenAI
 * session, audio log stream, then the encoder and resampler streams.
 *
 * Each stream gets a bounded amount of time to finish (3s for the audio log,
 * 1s for encoder and resampler). A stream that does not finish in time is
 * skipped so the rest of the teardown still runs.
 */
async function cleanupAsync(): Promise<void> {
  workerLogger.log(`Starting async cleanup`)

  stopPacketSending()

  if (session) {
    workerLogger.log(`Stopping GenAI session`)
    session.stop()
    session = null
  }

  // Wait for the audio log stream to finish writing
  if (audioLogStream) {
    workerLogger.log(`Closing assistant audio log stream`)
    // Detach the stream first so nothing writes to it while it is ending.
    const logStream = audioLogStream
    audioLogStream = null

    // No extra 'error' listener is attached here: the stream already has the
    // logging listener installed when it was created, and a failing log file
    // must not abort the remaining teardown steps.
    const outcome = await endStreamWithTimeout(logStream, 3000)
    if (outcome === 'ended') {
      workerLogger.log(`Assistant audio log stream closed`)
    } else {
      workerLogger.log(`Audio stream close timeout, continuing`)
    }
  }

  // Disconnect the stages before ending them individually
  resampler.unpipe(opusEncoder)

  if ((await endStreamWithTimeout(opusEncoder, 1000)) === 'ended') {
    workerLogger.log(`Opus encoder ended`)
  }

  if ((await endStreamWithTimeout(resampler, 1000)) === 'ended') {
    workerLogger.log(`Resampler ended`)
  }

  workerLogger.log(`Async cleanup complete`)
}
246
+
247
// Dispatches commands arriving from the main thread. Anything thrown while
// handling a command is logged and reported back through sendError.
parentPort.on('message', async (message: WorkerInMessage) => {
  try {
    switch (message.type) {
      case 'init': {
        workerLogger.log(`Initializing with directory:`, message.directory)

        // Debug log for the assistant's audio (null when disabled)
        audioLogStream = await createAssistantAudioLogStream(
          message.guildId,
          message.channelId,
        )

        // Begin draining the Opus queue
        startPacketSending()

        // Tools for the directory; completed tool calls are relayed to the
        // main thread.
        const { tools } = await getTools({
          directory: message.directory,
          onMessageCompleted: (params) => {
            parentPort!.postMessage({
              type: 'toolCallCompleted',
              ...params,
            } satisfies WorkerOutMessage)
          },
        })

        session = await startGenAiSession({
          tools,
          systemMessage: message.systemMessage,
          geminiApiKey: message.geminiApiKey,
          onAssistantAudioChunk: ({ data }) => {
            // Mirror the chunk into the audio log when one is open
            const logStream = audioLogStream
            if (logStream !== null && !logStream.destroyed) {
              logStream.write(data, (err) => {
                if (err) {
                  workerLogger.error('Error writing to audio log:', err)
                }
              })
            }

            // Push the PCM into the resampler, which feeds the Opus encoder
            if (resampler.destroyed) {
              return
            }
            resampler.write(data, (err) => {
              if (err) {
                workerLogger.error('Error writing to resampler:', err)
                sendError(`Failed to process audio: ${err.message}`)
              }
            })
          },
          onAssistantStartSpeaking: () => {
            parentPort!.postMessage({
              type: 'assistantStartSpeaking',
            } satisfies WorkerOutMessage)
          },
          onAssistantStopSpeaking: () => {
            parentPort!.postMessage({
              type: 'assistantStopSpeaking',
            } satisfies WorkerOutMessage)
          },
          onAssistantInterruptSpeaking: () => {
            parentPort!.postMessage({
              type: 'assistantInterruptSpeaking',
            } satisfies WorkerOutMessage)
          },
        })

        // Tell the main thread the worker can accept input
        parentPort!.postMessage({
          type: 'ready',
        } satisfies WorkerOutMessage)
        break
      }

      case 'sendRealtimeInput': {
        if (session) {
          session.session.sendRealtimeInput({
            audio: message.audio,
            audioStreamEnd: message.audioStreamEnd,
          })
        } else {
          sendError('Session not initialized')
        }
        break
      }

      case 'sendTextInput': {
        if (session) {
          session.session.sendRealtimeInput({
            text: message.text,
          })
        } else {
          sendError('Session not initialized')
        }
        break
      }

      case 'interrupt': {
        workerLogger.log(`Interrupting playback`)
        // Drop every packet that has not been forwarded yet
        opusPacketQueue.length = 0
        break
      }

      case 'stop': {
        workerLogger.log(`Stopping worker`)
        await cleanupAsync()
        // process.exit(0)
        break
      }
    }
  } catch (error) {
    workerLogger.error(`Error handling message:`, error)
    const description =
      error instanceof Error ? error.message : 'Unknown error in worker'
    sendError(description)
  }
})
package/src/genai.ts ADDED
@@ -0,0 +1,321 @@
1
+ // Google GenAI Live session manager for real-time voice interactions.
2
+ // Establishes bidirectional audio streaming with Gemini, handles tool calls,
3
+ // and manages the assistant's audio output for Discord voice channels.
4
+
5
+ import {
6
+ GoogleGenAI,
7
+ LiveServerMessage,
8
+ MediaResolution,
9
+ Modality,
10
+ Session,
11
+ } from '@google/genai'
12
+ import type { CallableTool } from '@google/genai'
13
+ import { writeFile } from 'fs'
14
+ import type { Tool as AITool } from 'ai'
15
+
16
+ import { createLogger } from './logger.js'
17
+ import { aiToolToCallableTool } from './ai-tool-to-genai.js'
18
+
19
const genaiLogger = createLogger('GENAI')

// Audio chunks collected by the default chunk handler.
const audioParts: Buffer[] = []

/**
 * Writes `content` to `fileName` asynchronously, replacing any existing file.
 * The outcome is only logged; nothing is returned or thrown to the caller.
 *
 * @param fileName - path of the file to write
 * @param content - raw bytes to store
 */
function saveBinaryFile(fileName: string, content: Buffer) {
  // No encoding argument: it has no effect when the data is a Buffer, and
  // passing 'utf8' wrongly suggested the bytes were treated as text.
  writeFile(fileName, content, (err) => {
    if (err) {
      genaiLogger.error(`Error writing file ${fileName}:`, err)
      return
    }
    // writeFile replaces the file, so this is a write, not an append.
    genaiLogger.log(`Wrote ${content.length} bytes to file ${fileName}.`)
  })
}
32
+
33
/** PCM layout needed to build a WAV header. */
interface WavConversionOptions {
  numChannels: number
  sampleRate: number
  bitsPerSample: number
}

// Sample rate used when the MIME type carries no usable `rate` parameter.
// 24kHz is the rate this package's voice worker assumes for GenAI audio.
const DEFAULT_WAV_SAMPLE_RATE = 24000

/**
 * Wraps raw PCM chunks in a WAV container.
 *
 * @param rawData - PCM chunks, concatenated in order
 * @param mimeType - audio MIME type describing the PCM, e.g. `audio/L16;rate=24000`
 * @returns a buffer holding the 44-byte WAV header followed by the PCM data
 */
function convertToWav(rawData: Buffer[], mimeType: string) {
  const options = parseMimeType(mimeType)
  const pcm = Buffer.concat(rawData)
  const wavHeader = createWavHeader(pcm.length, options)

  return Buffer.concat([wavHeader, pcm])
}

/**
 * Derives the PCM layout from an audio MIME type.
 *
 * A subtype of the form `L<bits>` sets the bit depth and a `rate=<n>` parameter
 * sets the sample rate. Anything missing or unparsable falls back to mono,
 * 16 bits per sample and DEFAULT_WAV_SAMPLE_RATE, so the result is always a
 * complete set of options.
 */
function parseMimeType(mimeType: string): WavConversionOptions {
  const [fileType = '', ...params] = mimeType.split(';').map((s) => s.trim())
  const format = fileType.split('/')[1]

  const options: WavConversionOptions = {
    numChannels: 1,
    sampleRate: DEFAULT_WAV_SAMPLE_RATE,
    bitsPerSample: 16,
  }

  if (format?.startsWith('L')) {
    const bits = parseInt(format.slice(1), 10)
    if (Number.isFinite(bits) && bits > 0) {
      options.bitsPerSample = bits
    }
  }

  for (const param of params) {
    const [key, value] = param.split('=').map((s) => s.trim())
    if (key !== 'rate') {
      continue
    }
    // Only accept a real positive number; a missing or malformed rate would
    // otherwise end up as NaN in the header.
    const rate = parseInt(value ?? '', 10)
    if (Number.isFinite(rate) && rate > 0) {
      options.sampleRate = rate
    }
  }

  return options
}

/**
 * Builds the 44-byte canonical PCM WAV header.
 *
 * @param dataLength - size of the PCM payload in bytes
 * @param options - channel count, sample rate and bit depth of the payload
 */
function createWavHeader(dataLength: number, options: WavConversionOptions) {
  const { numChannels, sampleRate, bitsPerSample } = options

  // http://soundfile.sapp.org/doc/WaveFormat

  const byteRate = (sampleRate * numChannels * bitsPerSample) / 8
  const blockAlign = (numChannels * bitsPerSample) / 8
  const buffer = Buffer.alloc(44)

  buffer.write('RIFF', 0) // ChunkID
  buffer.writeUInt32LE(36 + dataLength, 4) // ChunkSize
  buffer.write('WAVE', 8) // Format
  buffer.write('fmt ', 12) // Subchunk1ID
  buffer.writeUInt32LE(16, 16) // Subchunk1Size (PCM)
  buffer.writeUInt16LE(1, 20) // AudioFormat (1 = PCM)
  buffer.writeUInt16LE(numChannels, 22) // NumChannels
  buffer.writeUInt32LE(sampleRate, 24) // SampleRate
  buffer.writeUInt32LE(byteRate, 28) // ByteRate
  buffer.writeUInt16LE(blockAlign, 32) // BlockAlign
  buffer.writeUInt16LE(bitsPerSample, 34) // BitsPerSample
  buffer.write('data', 36) // Subchunk2ID
  buffer.writeUInt32LE(dataLength, 40) // Subchunk2Size

  return buffer
}
99
+
100
/**
 * Chunk handler used when the caller supplies none: remembers every chunk
 * received so far and, on each call, rewrites `audio.wav` with all of them
 * wrapped in a WAV container.
 */
function defaultAudioChunkHandler(chunk: { data: Buffer; mimeType: string }) {
  audioParts.push(chunk.data)
  const wav = convertToWav(audioParts, chunk.mimeType)
  saveBinaryFile('audio.wav', wav)
}
112
+
113
/**
 * Opens a Gemini Live session and routes its server messages to the callbacks.
 *
 * All options are optional:
 * - `onAssistantAudioChunk` receives every decoded audio chunk; when omitted,
 *   chunks are accumulated and written to `audio.wav`.
 * - `onAssistantStartSpeaking` fires on the first audio chunk after the
 *   assistant was silent.
 * - `onAssistantStopSpeaking` fires when the server reports `turnComplete`
 *   while the assistant was speaking.
 * - `onAssistantInterruptSpeaking` fires when the server reports `interrupted`
 *   while the assistant was speaking.
 * - `systemMessage` is sent as the system instruction (empty string if omitted).
 * - `tools` are AI SDK tools, converted to GenAI callable tools.
 * - `geminiApiKey` falls back to the GEMINI_API_KEY environment variable.
 *
 * The speaking state is tracked independently of which callbacks were
 * supplied, so leaving one of them out does not silence the others.
 *
 * @returns the live session and a `stop` function that closes it
 * @throws Error when no API key is available
 */
export async function startGenAiSession({
  onAssistantAudioChunk,
  onAssistantStartSpeaking,
  onAssistantStopSpeaking,
  onAssistantInterruptSpeaking,
  systemMessage,
  tools,
  geminiApiKey,
}: {
  onAssistantAudioChunk?: (args: { data: Buffer; mimeType: string }) => void
  onAssistantStartSpeaking?: () => void
  onAssistantStopSpeaking?: () => void
  onAssistantInterruptSpeaking?: () => void
  systemMessage?: string
  tools?: Record<string, AITool<any, any>>
  geminiApiKey?: string | null
} = {}) {
  let session: Session | undefined = undefined
  const callableTools: Array<CallableTool & { name: string }> = []
  let isAssistantSpeaking = false

  const audioChunkHandler = onAssistantAudioChunk || defaultAudioChunkHandler

  // Convert AI SDK tools to GenAI CallableTools
  if (tools) {
    for (const [name, tool] of Object.entries(tools)) {
      callableTools.push(aiToolToCallableTool(tool, name))
    }
  }

  // Runs every registered tool that is named in the message's function calls
  // and sends the collected function responses back over the session.
  function handleToolCall(message: LiveServerMessage) {
    if (!message.toolCall) {
      return
    }
    genaiLogger.log('Tool call:', message.toolCall)

    const functionCalls = message.toolCall.functionCalls
    if (!functionCalls || callableTools.length === 0) {
      return
    }

    for (const tool of callableTools) {
      if (!functionCalls.some((call) => call.name === tool.name)) {
        continue
      }
      tool
        .callTool(functionCalls)
        .then((parts) => {
          const functionResponses = parts
            .filter((part) => part.functionResponse)
            .map((part) => ({
              response: part.functionResponse!.response as Record<
                string,
                unknown
              >,
              id: part.functionResponse!.id,
              name: part.functionResponse!.name,
            }))

          // `session` is undefined before connect resolves and after stop()
          if (functionResponses.length > 0 && session) {
            session.sendToolResponse({ functionResponses })
            genaiLogger.log(
              'client-toolResponse: ' +
                JSON.stringify({ functionResponses }),
            )
          }
        })
        .catch((error) => {
          genaiLogger.error('Error handling tool calls:', error)
        })
    }
  }

  function handleModelTurn(message: LiveServerMessage) {
    handleToolCall(message)

    if (message.serverContent?.modelTurn?.parts) {
      for (const part of message.serverContent.modelTurn.parts) {
        if (part?.fileData) {
          genaiLogger.log(`File: ${part?.fileData.fileUri}`)
        }

        if (part?.inlineData) {
          const inlineData = part.inlineData
          const mimeType = inlineData.mimeType
          if (!mimeType || !mimeType.startsWith('audio/')) {
            genaiLogger.log('Skipping non-audio inlineData:', mimeType)
            continue
          }

          // First audio chunk since the assistant was last silent: update the
          // state even when no start callback was supplied, otherwise the
          // stop/interrupt callbacks below would never fire.
          if (!isAssistantSpeaking) {
            isAssistantSpeaking = true
            onAssistantStartSpeaking?.()
          }

          const buffer = Buffer.from(inlineData.data ?? '', 'base64')
          audioChunkHandler({
            data: buffer,
            mimeType,
          })
        }

        if (part?.text) {
          genaiLogger.log('Text:', part.text)
        }
      }
    }
    // Handle input transcription (user's audio transcription)
    if (message.serverContent?.inputTranscription?.text) {
      genaiLogger.log(
        '[user transcription]',
        message.serverContent.inputTranscription.text,
      )
    }

    // Handle output transcription (model's audio transcription)
    if (message.serverContent?.outputTranscription?.text) {
      genaiLogger.log(
        '[assistant transcription]',
        message.serverContent.outputTranscription.text,
      )
    }
    if (message.serverContent?.interrupted) {
      genaiLogger.log('Assistant was interrupted')
      // Reset the state regardless of whether a callback exists, so the next
      // audio chunk is recognised as the start of speech again.
      if (isAssistantSpeaking) {
        isAssistantSpeaking = false
        onAssistantInterruptSpeaking?.()
      }
    }
    if (message.serverContent?.turnComplete) {
      genaiLogger.log('Assistant turn complete')
      if (isAssistantSpeaking) {
        isAssistantSpeaking = false
        onAssistantStopSpeaking?.()
      }
    }
  }

  const apiKey = geminiApiKey || process.env.GEMINI_API_KEY

  if (!apiKey) {
    genaiLogger.error('No Gemini API key provided')
    throw new Error('Gemini API key is required for voice interactions')
  }

  const ai = new GoogleGenAI({
    apiKey,
  })

  const model = 'gemini-2.5-flash-native-audio-preview-12-2025'

  session = await ai.live.connect({
    model,
    callbacks: {
      onopen: function () {
        genaiLogger.debug('Opened')
      },
      onmessage: function (message: LiveServerMessage) {
        // genaiLogger.log(message)
        // A failure while handling one message is logged and does not
        // propagate out of the callback.
        try {
          handleModelTurn(message)
        } catch (error) {
          genaiLogger.error('Error handling turn:', error)
        }
      },
      onerror: function (e: ErrorEvent) {
        genaiLogger.debug('Error:', e.message)
      },
      onclose: function (e: CloseEvent) {
        genaiLogger.debug('Close:', e.reason)
      },
    },
    config: {
      tools: callableTools,
      responseModalities: [Modality.AUDIO],
      mediaResolution: MediaResolution.MEDIA_RESOLUTION_MEDIUM,
      inputAudioTranscription: {}, // transcribes your input speech
      outputAudioTranscription: {}, // transcribes the model's spoken audio
      systemInstruction: {
        parts: [
          {
            text: systemMessage || '',
          },
        ],
      },
      speechConfig: {
        voiceConfig: {
          prebuiltVoiceConfig: {
            voiceName: 'Charon', // Orus also not bad
          },
        },
      },
      contextWindowCompression: {
        triggerTokens: '25600',

        slidingWindow: { targetTokens: '12800' },
      },
    },
  })

  return {
    session,
    stop: () => {
      // Clear the local reference first so late tool responses are dropped
      // instead of being sent on a closing session.
      const currentSession = session
      session = undefined
      currentSession?.close()
    },
  }
}