kimaki 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -129,7 +129,7 @@ async function run({ restart, addChannels }) {
  '2. Click "Reset Token" to generate a new bot token (in case of errors try again)\n' +
  "3. Copy the token (you won't be able to see it again!)", 'Step 3: Get Bot Token');
  const tokenInput = await password({
- message: 'Enter your Discord Bot Token (will be hidden):',
+ message: 'Enter your Discord Bot Token (from "Bot" section - click "Reset Token" if needed):',
  validate(value) {
  if (!value)
  return 'Bot token is required';
@@ -142,6 +142,24 @@ async function run({ restart, addChannels }) {
  process.exit(0);
  }
  token = tokenInput;
+ note(`You can get a Gemini api Key at https://aistudio.google.com/apikey`, `Gemini API Key`);
+ const geminiApiKey = await password({
+ message: 'Enter your Gemini API Key for voice channels and audio transcription (optional, press Enter to skip):',
+ validate(value) {
+ if (value && value.length < 10)
+ return 'Invalid API key format';
+ return undefined;
+ },
+ });
+ if (isCancel(geminiApiKey)) {
+ cancel('Setup cancelled');
+ process.exit(0);
+ }
+ // Store API key in database
+ if (geminiApiKey) {
+ db.prepare('INSERT OR REPLACE INTO bot_api_keys (app_id, gemini_api_key) VALUES (?, ?)').run(appId, geminiApiKey || null);
+ note('API key saved successfully', 'API Key Stored');
+ }
  note(`Bot install URL:\n${generateBotInstallUrl({ clientId: appId })}\n\nYou MUST install the bot in your Discord server before continuing.`, 'Step 4: Install Bot to Server');
  const installed = await text({
  message: 'Press Enter AFTER you have installed the bot in your server:',
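The setup flow above writes at most one Gemini key per bot application into the new `bot_api_keys` table. As a reading aid, here is a minimal sketch of that read/write round-trip, assuming the `db` handle is a better-sqlite3 `Database` (consistent with the `prepare(...).run(...)` calls in this diff); the helper names are illustrative and not part of the package:

```ts
import Database from 'better-sqlite3'

// Illustrative helpers mirroring what the CLI and the bot do with bot_api_keys.
export function saveGeminiKey(db: Database.Database, appId: string, key: string) {
    // Same upsert the CLI runs after the new prompt; app_id is the primary key.
    db.prepare(
        'INSERT OR REPLACE INTO bot_api_keys (app_id, gemini_api_key) VALUES (?, ?)',
    ).run(appId, key)
}

export function loadGeminiKey(db: Database.Database, appId: string): string | undefined {
    const row = db
        .prepare('SELECT gemini_api_key FROM bot_api_keys WHERE app_id = ?')
        .get(appId) as { gemini_api_key: string | null } | undefined
    return row?.gemini_api_key ?? undefined
}
```

Keying the row on `app_id` is what lets the bot later look the key up using only the application ID it already has at runtime.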
@@ -8,6 +8,7 @@ import { spawn, exec } from 'node:child_process';
  import fs, { createWriteStream } from 'node:fs';
  import { mkdir } from 'node:fs/promises';
  import net from 'node:net';
+ import os from 'node:os';
  import path from 'node:path';
  import { promisify } from 'node:util';
  import { PassThrough, Transform } from 'node:stream';
@@ -78,7 +79,7 @@ async function createUserAudioLogStream(guildId, channelId) {
  }
  }
  // Set up voice handling for a connection (called once per connection)
- async function setupVoiceHandling({ connection, guildId, channelId, }) {
+ async function setupVoiceHandling({ connection, guildId, channelId, appId, }) {
  voiceLogger.log(`Setting up voice handling for guild ${guildId}, channel ${channelId}`);
  // Check if this voice channel has an associated directory
  const channelDirRow = getDatabase()
@@ -98,11 +99,17 @@ async function setupVoiceHandling({ connection, guildId, channelId, }) {
  }
  // Create user audio stream for debugging
  voiceData.userAudioStream = await createUserAudioLogStream(guildId, channelId);
+ // Get API keys from database
+ const apiKeys = getDatabase()
+ .prepare('SELECT gemini_api_key FROM bot_api_keys WHERE app_id = ?')
+ .get(appId);
  // Create GenAI worker
  const genAiWorker = await createGenAIWorker({
  directory,
  guildId,
  channelId,
+ appId,
+ geminiApiKey: apiKeys?.gemini_api_key,
  systemMessage: dedent `
  You are Kimaki, an AI similar to Jarvis: you help your user (an engineer) controlling his coding agent, just like Jarvis controls Ironman armor and machines. Speak fast.

@@ -316,7 +323,17 @@ export function frameMono16khz() {
  }
  export function getDatabase() {
  if (!db) {
- db = new Database('discord-sessions.db');
+ // Create ~/.kimaki directory if it doesn't exist
+ const kimakiDir = path.join(os.homedir(), '.kimaki');
+ try {
+ fs.mkdirSync(kimakiDir, { recursive: true });
+ }
+ catch (error) {
+ dbLogger.error('Failed to create ~/.kimaki directory:', error);
+ }
+ const dbPath = path.join(kimakiDir, 'discord-sessions.db');
+ dbLogger.log(`Opening database at: ${dbPath}`);
+ db = new Database(dbPath);
  // Initialize tables
  db.exec(`
  CREATE TABLE IF NOT EXISTS thread_sessions (
@@ -347,6 +364,13 @@ export function getDatabase() {
  channel_type TEXT NOT NULL,
  created_at DATETIME DEFAULT CURRENT_TIMESTAMP
  )
+ `);
+ db.exec(`
+ CREATE TABLE IF NOT EXISTS bot_api_keys (
+ app_id TEXT PRIMARY KEY,
+ gemini_api_key TEXT,
+ created_at DATETIME DEFAULT CURRENT_TIMESTAMP
+ )
  `);
  }
  return db;
@@ -458,7 +482,7 @@ async function waitForServer(port, maxAttempts = 30) {
  }
  throw new Error(`Server did not start on port ${port} after ${maxAttempts} seconds`);
  }
- async function processVoiceAttachment({ message, thread, projectDirectory, isNewThread = false, }) {
+ async function processVoiceAttachment({ message, thread, projectDirectory, isNewThread = false, appId, }) {
  const audioAttachment = Array.from(message.attachments.values()).find((attachment) => attachment.contentType?.startsWith('audio/'));
  if (!audioAttachment)
  return null;
@@ -488,9 +512,20 @@ async function processVoiceAttachment({ message, thread, projectDirectory, isNew
  voiceLogger.log(`Could not get project tree:`, e);
  }
  }
+ // Get Gemini API key from database if appId is provided
+ let geminiApiKey;
+ if (appId) {
+ const apiKeys = getDatabase()
+ .prepare('SELECT gemini_api_key FROM bot_api_keys WHERE app_id = ?')
+ .get(appId);
+ if (apiKeys?.gemini_api_key) {
+ geminiApiKey = apiKeys.gemini_api_key;
+ }
+ }
  const transcription = await transcribeAudio({
  audio: audioBuffer,
  prompt: transcriptionPrompt,
+ geminiApiKey,
  });
  voiceLogger.log(`Transcription successful: "${transcription.slice(0, 50)}${transcription.length > 50 ? '...' : ''}"`);
  // Update thread name with transcribed content only for new threads
@@ -1233,6 +1268,7 @@ export async function startDiscordBot({ token, appId, discordClient, }) {
  message,
  thread,
  projectDirectory,
+ appId: currentAppId,
  });
  if (transcription) {
  messageContent = transcription;
@@ -1291,6 +1327,7 @@ export async function startDiscordBot({ token, appId, discordClient, }) {
  thread,
  projectDirectory,
  isNewThread: true,
+ appId: currentAppId,
  });
  if (transcription) {
  messageContent = transcription;
@@ -1651,6 +1688,7 @@ export async function startDiscordBot({ token, appId, discordClient, }) {
  connection,
  guildId: newState.guild.id,
  channelId: voiceChannel.id,
+ appId: currentAppId,
  });
  // Handle connection state changes
  connection.on(VoiceConnectionStatus.Disconnected, async () => {
@@ -1721,7 +1759,10 @@ export async function startDiscordBot({ token, appId, discordClient, }) {
  }
  opencodeServers.clear();
  discordLogger.log('Closing database...');
- getDatabase().close();
+ if (db) {
+ db.close();
+ db = null;
+ }
  discordLogger.log('Destroying Discord client...');
  discordClient.destroy();
  discordLogger.log('Cleanup complete, exiting.');
@@ -98,6 +98,8 @@ export function createGenAIWorker(options) {
  systemMessage: options.systemMessage,
  guildId: options.guildId,
  channelId: options.channelId,
+ appId: options.appId,
+ geminiApiKey: options.geminiApiKey,
  };
  worker.postMessage(initMessage);
  });
@@ -210,6 +210,7 @@ parentPort.on('message', async (message) => {
  session = await startGenAiSession({
  tools,
  systemMessage: message.systemMessage,
+ geminiApiKey: message.geminiApiKey,
  onAssistantAudioChunk({ data }) {
  // Write to audio log if enabled
  if (audioLogStream && !audioLogStream.destroyed) {
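Note that `getDatabase()` now opens `~/.kimaki/discord-sessions.db` instead of a `discord-sessions.db` file in the working directory, and the diff contains no migration, so existing installs start with an empty database after upgrading. A hedged, one-off migration sketch (not part of the package; it assumes the old file sits in the directory the bot used to be started from):

```ts
import fs from 'node:fs'
import os from 'node:os'
import path from 'node:path'

// One-off sketch: copy the old cwd-relative database to the new location.
// Run from the directory where kimaki 0.1.3 was previously started.
const oldPath = path.resolve('discord-sessions.db')
const newDir = path.join(os.homedir(), '.kimaki')
const newPath = path.join(newDir, 'discord-sessions.db')

if (fs.existsSync(oldPath) && !fs.existsSync(newPath)) {
    fs.mkdirSync(newDir, { recursive: true })
    fs.copyFileSync(oldPath, newPath)
    console.log(`Copied ${oldPath} -> ${newPath}`)
} else {
    console.log('Nothing to migrate')
}
```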
package/dist/genai.js CHANGED
@@ -68,7 +68,7 @@ function defaultAudioChunkHandler({ data, mimeType, }) {
  const buffer = convertToWav(audioParts, mimeType);
  saveBinaryFile(fileName, buffer);
  }
- export async function startGenAiSession({ onAssistantAudioChunk, onAssistantStartSpeaking, onAssistantStopSpeaking, onAssistantInterruptSpeaking, systemMessage, tools, } = {}) {
+ export async function startGenAiSession({ onAssistantAudioChunk, onAssistantStartSpeaking, onAssistantStopSpeaking, onAssistantInterruptSpeaking, systemMessage, tools, geminiApiKey, } = {}) {
  let session = undefined;
  const callableTools = [];
  let isAssistantSpeaking = false;
@@ -161,8 +161,13 @@ export async function startGenAiSession({ onAssistantAudioChunk, onAssistantStar
  }
  }
  }
+ const apiKey = geminiApiKey || process.env.GEMINI_API_KEY;
+ if (!apiKey) {
+ genaiLogger.error('No Gemini API key provided');
+ throw new Error('Gemini API key is required for voice interactions');
+ }
  const ai = new GoogleGenAI({
- apiKey: process.env.GEMINI_API_KEY,
+ apiKey,
  });
  const model = 'models/gemini-2.5-flash-live-preview';
  session = await ai.live.connect({
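The change above establishes the key-resolution order used for voice sessions (and, below, for transcription): an explicitly passed per-bot key wins, `GEMINI_API_KEY` from the environment is the fallback, and the call fails fast when neither is set. A minimal sketch of that precedence as a standalone helper; the package inlines this logic rather than exporting such a function:

```ts
// Illustrative helper; mirrors the fallback logic added to
// startGenAiSession and transcribeAudio in this release.
export function resolveGeminiApiKey(perBotKey?: string | null): string {
    const apiKey = perBotKey || process.env.GEMINI_API_KEY
    if (!apiKey) {
        throw new Error('Gemini API key is required for voice interactions')
    }
    return apiKey
}

// Usage: a key stored via the CLI setup takes precedence over the environment.
// resolveGeminiApiKey(row?.gemini_api_key)  -> stored key, else GEMINI_API_KEY, else throw
```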
package/dist/voice.js CHANGED
@@ -1,25 +1,60 @@
- import { openai } from '@ai-sdk/openai';
- import { experimental_transcribe as transcribe } from 'ai';
+ import { GoogleGenAI } from '@google/genai';
  import { createLogger } from './logger.js';
  const voiceLogger = createLogger('VOICE');
- export async function transcribeAudio({ audio, prompt, language, temperature, }) {
+ export async function transcribeAudio({ audio, prompt, language, temperature, geminiApiKey, }) {
  try {
- const result = await transcribe({
- model: openai.transcription('whisper-1'),
- audio,
- ...(prompt || language || temperature !== undefined
- ? {
- providerOptions: {
- openai: {
- ...(prompt && { prompt }),
- ...(language && { language }),
- ...(temperature !== undefined && { temperature }),
+ // Use provided API key or fall back to environment variable
+ const apiKey = geminiApiKey || process.env.GEMINI_API_KEY;
+ if (!apiKey) {
+ throw new Error('Gemini API key is required for audio transcription');
+ }
+ // Initialize Google Generative AI
+ const genAI = new GoogleGenAI({ apiKey });
+ // Convert audio to base64 string if it's not already
+ let audioBase64;
+ if (typeof audio === 'string') {
+ audioBase64 = audio;
+ }
+ else if (audio instanceof Buffer) {
+ audioBase64 = audio.toString('base64');
+ }
+ else if (audio instanceof Uint8Array) {
+ audioBase64 = Buffer.from(audio).toString('base64');
+ }
+ else if (audio instanceof ArrayBuffer) {
+ audioBase64 = Buffer.from(audio).toString('base64');
+ }
+ else {
+ throw new Error('Invalid audio format');
+ }
+ // Build the transcription prompt
+ let transcriptionPrompt = `Please transcribe this audio file accurately. Here is some relevant information and filenames that may be present in the audio:\n<context>\n${prompt}\n</context>\n`;
+ if (language) {
+ transcriptionPrompt += `\nThe audio is in ${language}.`;
+ }
+ // Create the content with audio using the inline data format
+ const response = await genAI.models.generateContent({
+ model: 'gemini-2.5-flash',
+ contents: [
+ {
+ parts: [
+ { text: transcriptionPrompt },
+ {
+ inlineData: {
+ data: audioBase64,
+ mimeType: 'audio/mpeg',
+ },
  },
- },
+ ],
+ },
+ ],
+ config: temperature !== undefined
+ ? {
+ temperature,
  }
- : {}),
+ : undefined,
  });
- return result.text;
+ return response.text || '';
  }
  catch (error) {
  voiceLogger.error('Failed to transcribe audio:', error);
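With the rewrite above, `transcribeAudio` accepts a Buffer, Uint8Array, ArrayBuffer, or base64 string plus an optional per-bot key. A hypothetical usage sketch (the relative import path and the file name are assumptions; only the argument shape comes from the diff):

```ts
import { readFile } from 'node:fs/promises'
// Import path is an assumption; transcribeAudio is the function rewritten in this diff.
import { transcribeAudio } from './voice.js'

// Hypothetical usage: transcribe a downloaded Discord voice attachment.
// Note the new implementation always declares mimeType 'audio/mpeg' to Gemini,
// regardless of the actual attachment format.
const audioBuffer = await readFile('./voice-message.ogg')

const text = await transcribeAudio({
    audio: audioBuffer,
    prompt: 'project files: src/cli.ts, src/discordBot.ts',
    // Optional per-bot key (looked up from bot_api_keys by the bot);
    // when omitted, transcribeAudio falls back to process.env.GEMINI_API_KEY.
    // geminiApiKey: storedKey,
})
console.log(text)
```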
package/package.json CHANGED
@@ -2,7 +2,7 @@
  "name": "kimaki",
  "module": "index.ts",
  "type": "module",
- "version": "0.1.3",
+ "version": "0.1.5",
  "repository": "https://github.com/remorses/kimaki",
  "bin": "bin.js",
  "files": [
@@ -19,7 +19,7 @@
  "tsx": "^4.20.5"
  },
  "dependencies": {
- "@ai-sdk/openai": "^2.0.23",
+ "@ai-sdk/google": "^2.0.16",
  "@clack/prompts": "^0.11.0",
  "@discordjs/opus": "^0.10.0",
  "@discordjs/voice": "^0.19.0",
package/src/cli.ts CHANGED
@@ -223,9 +223,9 @@ async function run({ restart, addChannels }: CliOptions) {
  "3. Copy the token (you won't be able to see it again!)",
  'Step 3: Get Bot Token',
  )
-
  const tokenInput = await password({
- message: 'Enter your Discord Bot Token (will be hidden):',
+ message:
+ 'Enter your Discord Bot Token (from "Bot" section - click "Reset Token" if needed):',
  validate(value) {
  if (!value) return 'Bot token is required'
  if (value.length < 50) return 'Invalid token format (too short)'
@@ -238,6 +238,33 @@ async function run({ restart, addChannels }: CliOptions) {
  }
  token = tokenInput

+ note(
+ `You can get a Gemini api Key at https://aistudio.google.com/apikey`,
+ `Gemini API Key`,
+ )
+
+ const geminiApiKey = await password({
+ message:
+ 'Enter your Gemini API Key for voice channels and audio transcription (optional, press Enter to skip):',
+ validate(value) {
+ if (value && value.length < 10) return 'Invalid API key format'
+ return undefined
+ },
+ })
+
+ if (isCancel(geminiApiKey)) {
+ cancel('Setup cancelled')
+ process.exit(0)
+ }
+
+ // Store API key in database
+ if (geminiApiKey) {
+ db.prepare(
+ 'INSERT OR REPLACE INTO bot_api_keys (app_id, gemini_api_key) VALUES (?, ?)',
+ ).run(appId, geminiApiKey || null)
+ note('API key saved successfully', 'API Key Stored')
+ }
+
  note(
  `Bot install URL:\n${generateBotInstallUrl({ clientId: appId })}\n\nYou MUST install the bot in your Discord server before continuing.`,
  'Step 4: Install Bot to Server',
package/src/discordBot.ts CHANGED
@@ -34,6 +34,7 @@ import { spawn, exec, type ChildProcess } from 'node:child_process'
  import fs, { createWriteStream } from 'node:fs'
  import { mkdir } from 'node:fs/promises'
  import net from 'node:net'
+ import os from 'node:os'
  import path from 'node:path'
  import { promisify } from 'node:util'
  import { PassThrough, Transform, type TransformCallback } from 'node:stream'
@@ -152,10 +153,12 @@ async function setupVoiceHandling({
  connection,
  guildId,
  channelId,
+ appId,
  }: {
  connection: VoiceConnection
  guildId: string
  channelId: string
+ appId: string
  }) {
  voiceLogger.log(
  `Setting up voice handling for guild ${guildId}, channel ${channelId}`,
@@ -188,11 +191,18 @@ async function setupVoiceHandling({
  // Create user audio stream for debugging
  voiceData.userAudioStream = await createUserAudioLogStream(guildId, channelId)

+ // Get API keys from database
+ const apiKeys = getDatabase()
+ .prepare('SELECT gemini_api_key FROM bot_api_keys WHERE app_id = ?')
+ .get(appId) as { gemini_api_key: string | null } | undefined
+
  // Create GenAI worker
  const genAiWorker = await createGenAIWorker({
  directory,
  guildId,
  channelId,
+ appId,
+ geminiApiKey: apiKeys?.gemini_api_key,
  systemMessage: dedent`
  You are Kimaki, an AI similar to Jarvis: you help your user (an engineer) controlling his coding agent, just like Jarvis controls Ironman armor and machines. Speak fast.

@@ -444,7 +454,19 @@ export function frameMono16khz(): Transform {

  export function getDatabase(): Database.Database {
  if (!db) {
- db = new Database('discord-sessions.db')
+ // Create ~/.kimaki directory if it doesn't exist
+ const kimakiDir = path.join(os.homedir(), '.kimaki')
+
+ try {
+ fs.mkdirSync(kimakiDir, { recursive: true })
+ } catch (error) {
+ dbLogger.error('Failed to create ~/.kimaki directory:', error)
+ }
+
+ const dbPath = path.join(kimakiDir, 'discord-sessions.db')
+
+ dbLogger.log(`Opening database at: ${dbPath}`)
+ db = new Database(dbPath)

  // Initialize tables
  db.exec(`
@@ -480,6 +502,14 @@ export function getDatabase(): Database.Database {
  created_at DATETIME DEFAULT CURRENT_TIMESTAMP
  )
  `)
+
+ db.exec(`
+ CREATE TABLE IF NOT EXISTS bot_api_keys (
+ app_id TEXT PRIMARY KEY,
+ gemini_api_key TEXT,
+ created_at DATETIME DEFAULT CURRENT_TIMESTAMP
+ )
+ `)
  }

  return db
@@ -614,11 +644,13 @@ async function processVoiceAttachment({
  thread,
  projectDirectory,
  isNewThread = false,
+ appId,
  }: {
  message: Message
  thread: ThreadChannel
  projectDirectory?: string
  isNewThread?: boolean
+ appId?: string
  }): Promise<string | null> {
  const audioAttachment = Array.from(message.attachments.values()).find(
  (attachment) => attachment.contentType?.startsWith('audio/'),
@@ -660,9 +692,22 @@ async function processVoiceAttachment({
  }
  }

+ // Get Gemini API key from database if appId is provided
+ let geminiApiKey: string | undefined
+ if (appId) {
+ const apiKeys = getDatabase()
+ .prepare('SELECT gemini_api_key FROM bot_api_keys WHERE app_id = ?')
+ .get(appId) as { gemini_api_key: string | null } | undefined
+
+ if (apiKeys?.gemini_api_key) {
+ geminiApiKey = apiKeys.gemini_api_key
+ }
+ }
+
  const transcription = await transcribeAudio({
  audio: audioBuffer,
  prompt: transcriptionPrompt,
+ geminiApiKey,
  })

  voiceLogger.log(
@@ -1502,6 +1547,7 @@ export async function startDiscordBot({
  discordLogger.log(`Bot Application ID (provided): ${currentAppId}`)
  }

+
  // List all guilds and channels that belong to this bot
  for (const guild of c.guilds.cache.values()) {
  discordLogger.log(`${guild.name} (${guild.id})`)
@@ -1635,6 +1681,7 @@ export async function startDiscordBot({
  message,
  thread,
  projectDirectory,
+ appId: currentAppId,
  })
  if (transcription) {
  messageContent = transcription
@@ -1727,6 +1774,7 @@ export async function startDiscordBot({
  thread,
  projectDirectory,
  isNewThread: true,
+ appId: currentAppId,
  })
  if (transcription) {
  messageContent = transcription
@@ -2224,6 +2272,7 @@ export async function startDiscordBot({
  connection,
  guildId: newState.guild.id,
  channelId: voiceChannel.id,
+ appId: currentAppId!,
  })

  // Handle connection state changes
@@ -2313,7 +2362,10 @@ export async function startDiscordBot({
  opencodeServers.clear()

  discordLogger.log('Closing database...')
- getDatabase().close()
+ if (db) {
+ db.close()
+ db = null
+ }

  discordLogger.log('Destroying Discord client...')
  discordClient.destroy()
@@ -11,6 +11,8 @@ export interface GenAIWorkerOptions {
  systemMessage?: string
  guildId: string
  channelId: string
+ appId: string
+ geminiApiKey?: string | null
  onAssistantOpusPacket: (packet: ArrayBuffer) => void
  onAssistantStartSpeaking?: () => void
  onAssistantStopSpeaking?: () => void
@@ -146,6 +148,8 @@ export function createGenAIWorker(
  systemMessage: options.systemMessage,
  guildId: options.guildId,
  channelId: options.channelId,
+ appId: options.appId,
+ geminiApiKey: options.geminiApiKey,
  }
  worker.postMessage(initMessage)
  })
@@ -271,6 +271,7 @@ parentPort.on('message', async (message: WorkerInMessage) => {
  session = await startGenAiSession({
  tools,
  systemMessage: message.systemMessage,
+ geminiApiKey: message.geminiApiKey,
  onAssistantAudioChunk({ data }) {
  // Write to audio log if enabled
  if (audioLogStream && !audioLogStream.destroyed) {
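The key also has to cross the worker boundary: `createGenAIWorker` now copies `appId` and `geminiApiKey` into the init message, and the worker forwards `message.geminiApiKey` to `startGenAiSession`. A simplified sketch of that handoff with `node:worker_threads`; the message type is trimmed to the fields relevant here and the worker entry path is illustrative:

```ts
import { Worker, parentPort } from 'node:worker_threads'

// Simplified shape of the init message; field names match those added in this diff.
type InitMessage = {
    type: 'init'
    guildId: string
    channelId: string
    appId: string
    geminiApiKey?: string | null
    systemMessage?: string
}

// Parent side (sketch of what createGenAIWorker does with the new options).
export function startWorker(options: Omit<InitMessage, 'type'>) {
    // Worker entry path is illustrative, not the package's actual file name.
    const worker = new Worker(new URL('./genaiWorker.js', import.meta.url))
    const initMessage: InitMessage = { type: 'init', ...options }
    worker.postMessage(initMessage)
    return worker
}

// Worker side (sketch): forward the key to the session factory.
parentPort?.on('message', async (message: InitMessage) => {
    if (message.type === 'init') {
        // startGenAiSession({ geminiApiKey: message.geminiApiKey, ... })
    }
})
```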
package/src/genai.ts CHANGED
@@ -113,6 +113,7 @@ export async function startGenAiSession({
  onAssistantInterruptSpeaking,
  systemMessage,
  tools,
+ geminiApiKey,
  }: {
  onAssistantAudioChunk?: (args: { data: Buffer; mimeType: string }) => void
  onAssistantStartSpeaking?: () => void
@@ -120,6 +121,7 @@ export async function startGenAiSession({
  onAssistantInterruptSpeaking?: () => void
  systemMessage?: string
  tools?: Record<string, AITool<any, any>>
+ geminiApiKey?: string | null
  } = {}) {
  let session: Session | undefined = undefined
  const callableTools: Array<CallableTool & { name: string }> = []
@@ -242,8 +244,15 @@ export async function startGenAiSession({
  }
  }

+ const apiKey = geminiApiKey || process.env.GEMINI_API_KEY
+
+ if (!apiKey) {
+ genaiLogger.error('No Gemini API key provided')
+ throw new Error('Gemini API key is required for voice interactions')
+ }
+
  const ai = new GoogleGenAI({
- apiKey: process.env.GEMINI_API_KEY,
+ apiKey,
  })

  const model = 'models/gemini-2.5-flash-live-preview'
package/src/voice.ts CHANGED
@@ -1,5 +1,4 @@
- import { openai } from '@ai-sdk/openai'
- import { experimental_transcribe as transcribe } from 'ai'
+ import { GoogleGenAI } from '@google/genai'
  import { createLogger } from './logger.js'

  const voiceLogger = createLogger('VOICE')
@@ -9,30 +8,70 @@ export async function transcribeAudio({
  prompt,
  language,
  temperature,
+ geminiApiKey,
  }: {
  audio: Buffer | Uint8Array | ArrayBuffer | string
  prompt?: string
  language?: string
  temperature?: number
+ geminiApiKey?: string
  }): Promise<string> {
  try {
- const result = await transcribe({
- model: openai.transcription('whisper-1'),
- audio,
- ...(prompt || language || temperature !== undefined
- ? {
- providerOptions: {
- openai: {
- ...(prompt && { prompt }),
- ...(language && { language }),
- ...(temperature !== undefined && { temperature }),
+ // Use provided API key or fall back to environment variable
+ const apiKey = geminiApiKey || process.env.GEMINI_API_KEY
+
+ if (!apiKey) {
+ throw new Error('Gemini API key is required for audio transcription')
+ }
+
+ // Initialize Google Generative AI
+ const genAI = new GoogleGenAI({ apiKey })
+
+ // Convert audio to base64 string if it's not already
+ let audioBase64: string
+ if (typeof audio === 'string') {
+ audioBase64 = audio
+ } else if (audio instanceof Buffer) {
+ audioBase64 = audio.toString('base64')
+ } else if (audio instanceof Uint8Array) {
+ audioBase64 = Buffer.from(audio).toString('base64')
+ } else if (audio instanceof ArrayBuffer) {
+ audioBase64 = Buffer.from(audio).toString('base64')
+ } else {
+ throw new Error('Invalid audio format')
+ }
+
+ // Build the transcription prompt
+ let transcriptionPrompt = `Please transcribe this audio file accurately. Here is some relevant information and filenames that may be present in the audio:\n<context>\n${prompt}\n</context>\n`
+ if (language) {
+ transcriptionPrompt += `\nThe audio is in ${language}.`
+ }
+
+ // Create the content with audio using the inline data format
+ const response = await genAI.models.generateContent({
+ model: 'gemini-2.5-flash',
+ contents: [
+ {
+ parts: [
+ { text: transcriptionPrompt },
+ {
+ inlineData: {
+ data: audioBase64,
+ mimeType: 'audio/mpeg',
  },
  },
- }
- : {}),
+ ],
+ },
+ ],
+ config:
+ temperature !== undefined
+ ? {
+ temperature,
+ }
+ : undefined,
  })

- return result.text
+ return response.text || ''
  } catch (error) {
  voiceLogger.error('Failed to transcribe audio:', error)
  throw new Error(
@@ -8,6 +8,8 @@ export type WorkerInMessage =
  systemMessage?: string
  guildId: string
  channelId: string
+ appId: string
+ geminiApiKey?: string | null
  }
  | {
  type: 'sendRealtimeInput'