kimaki 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -129,7 +129,7 @@ async function run({ restart, addChannels }) {
  '2. Click "Reset Token" to generate a new bot token (in case of errors try again)\n' +
  "3. Copy the token (you won't be able to see it again!)", 'Step 3: Get Bot Token');
  const tokenInput = await password({
- message: 'Enter your Discord Bot Token (will be hidden):',
+ message: 'Enter your Discord Bot Token (from "Bot" section - click "Reset Token" if needed):',
  validate(value) {
  if (!value)
  return 'Bot token is required';
@@ -142,6 +142,24 @@ async function run({ restart, addChannels }) {
  process.exit(0);
  }
  token = tokenInput;
+ note(`You can get a Gemini api Key at https://aistudio.google.com/apikey`, `Gemini API Key`);
+ const geminiApiKey = await password({
+ message: 'Enter your Gemini API Key for voice channels and audio transcription (optional, press Enter to skip):',
+ validate(value) {
+ if (value && value.length < 10)
+ return 'Invalid API key format';
+ return undefined;
+ },
+ });
+ if (isCancel(geminiApiKey)) {
+ cancel('Setup cancelled');
+ process.exit(0);
+ }
+ // Store API key in database
+ if (geminiApiKey) {
+ db.prepare('INSERT OR REPLACE INTO bot_api_keys (app_id, gemini_api_key) VALUES (?, ?)').run(appId, geminiApiKey || null);
+ note('API key saved successfully', 'API Key Stored');
+ }
  note(`Bot install URL:\n${generateBotInstallUrl({ clientId: appId })}\n\nYou MUST install the bot in your Discord server before continuing.`, 'Step 4: Install Bot to Server');
  const installed = await text({
  message: 'Press Enter AFTER you have installed the bot in your server:',
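The setup flow above writes at most one Gemini key per bot application into the new `bot_api_keys` table. As a reading aid, here is a minimal sketch of that read/write round-trip, assuming the `db` handle is a better-sqlite3 `Database` (consistent with the `prepare(...).run(...)` calls in this diff); the helper names are illustrative and not part of the package:

```ts
import Database from 'better-sqlite3'

// Illustrative helpers mirroring what the CLI and the bot do with bot_api_keys.
export function saveGeminiKey(db: Database.Database, appId: string, key: string) {
    // Same upsert the CLI runs after the new prompt; app_id is the primary key.
    db.prepare(
        'INSERT OR REPLACE INTO bot_api_keys (app_id, gemini_api_key) VALUES (?, ?)',
    ).run(appId, key)
}

export function loadGeminiKey(db: Database.Database, appId: string): string | undefined {
    const row = db
        .prepare('SELECT gemini_api_key FROM bot_api_keys WHERE app_id = ?')
        .get(appId) as { gemini_api_key: string | null } | undefined
    return row?.gemini_api_key ?? undefined
}
```

Keying the row on `app_id` is what lets the bot later look the key up using only the application ID it already has at runtime.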
@@ -8,6 +8,7 @@ import { spawn, exec } from 'node:child_process';
  import fs, { createWriteStream } from 'node:fs';
  import { mkdir } from 'node:fs/promises';
  import net from 'node:net';
+ import os from 'node:os';
  import path from 'node:path';
  import { promisify } from 'node:util';
  import { PassThrough, Transform } from 'node:stream';
@@ -78,7 +79,7 @@ async function createUserAudioLogStream(guildId, channelId) {
  }
  }
  // Set up voice handling for a connection (called once per connection)
- async function setupVoiceHandling({ connection, guildId, channelId, }) {
+ async function setupVoiceHandling({ connection, guildId, channelId, appId, }) {
  voiceLogger.log(`Setting up voice handling for guild ${guildId}, channel ${channelId}`);
  // Check if this voice channel has an associated directory
  const channelDirRow = getDatabase()
@@ -98,11 +99,17 @@ async function setupVoiceHandling({ connection, guildId, channelId, }) {
  }
  // Create user audio stream for debugging
  voiceData.userAudioStream = await createUserAudioLogStream(guildId, channelId);
+ // Get API keys from database
+ const apiKeys = getDatabase()
+ .prepare('SELECT gemini_api_key FROM bot_api_keys WHERE app_id = ?')
+ .get(appId);
  // Create GenAI worker
  const genAiWorker = await createGenAIWorker({
  directory,
  guildId,
  channelId,
+ appId,
+ geminiApiKey: apiKeys?.gemini_api_key,
  systemMessage: dedent `
  You are Kimaki, an AI similar to Jarvis: you help your user (an engineer) controlling his coding agent, just like Jarvis controls Ironman armor and machines. Speak fast.

@@ -316,7 +323,17 @@ export function frameMono16khz() {
  }
  export function getDatabase() {
  if (!db) {
- db = new Database('discord-sessions.db');
+ // Create ~/.kimaki directory if it doesn't exist
+ const kimakiDir = path.join(os.homedir(), '.kimaki');
+ try {
+ fs.mkdirSync(kimakiDir, { recursive: true });
+ }
+ catch (error) {
+ dbLogger.error('Failed to create ~/.kimaki directory:', error);
+ }
+ const dbPath = path.join(kimakiDir, 'discord-sessions.db');
+ dbLogger.log(`Opening database at: ${dbPath}`);
+ db = new Database(dbPath);
  // Initialize tables
  db.exec(`
  CREATE TABLE IF NOT EXISTS thread_sessions (
@@ -347,6 +364,13 @@ export function getDatabase() {
  channel_type TEXT NOT NULL,
  created_at DATETIME DEFAULT CURRENT_TIMESTAMP
  )
+ `);
+ db.exec(`
+ CREATE TABLE IF NOT EXISTS bot_api_keys (
+ app_id TEXT PRIMARY KEY,
+ gemini_api_key TEXT,
+ created_at DATETIME DEFAULT CURRENT_TIMESTAMP
+ )
  `);
  }
  return db;
@@ -458,7 +482,7 @@ async function waitForServer(port, maxAttempts = 30) {
  }
  throw new Error(`Server did not start on port ${port} after ${maxAttempts} seconds`);
  }
- async function processVoiceAttachment({ message, thread, projectDirectory, isNewThread = false, }) {
+ async function processVoiceAttachment({ message, thread, projectDirectory, isNewThread = false, appId, }) {
  const audioAttachment = Array.from(message.attachments.values()).find((attachment) => attachment.contentType?.startsWith('audio/'));
  if (!audioAttachment)
  return null;
@@ -488,9 +512,20 @@ async function processVoiceAttachment({ message, thread, projectDirectory, isNew
  voiceLogger.log(`Could not get project tree:`, e);
  }
  }
+ // Get Gemini API key from database if appId is provided
+ let geminiApiKey;
+ if (appId) {
+ const apiKeys = getDatabase()
+ .prepare('SELECT gemini_api_key FROM bot_api_keys WHERE app_id = ?')
+ .get(appId);
+ if (apiKeys?.gemini_api_key) {
+ geminiApiKey = apiKeys.gemini_api_key;
+ }
+ }
  const transcription = await transcribeAudio({
  audio: audioBuffer,
  prompt: transcriptionPrompt,
+ geminiApiKey,
  });
  voiceLogger.log(`Transcription successful: "${transcription.slice(0, 50)}${transcription.length > 50 ? '...' : ''}"`);
  // Update thread name with transcribed content only for new threads
@@ -1233,6 +1268,7 @@ export async function startDiscordBot({ token, appId, discordClient, }) {
  message,
  thread,
  projectDirectory,
+ appId: currentAppId,
  });
  if (transcription) {
  messageContent = transcription;
@@ -1291,6 +1327,7 @@ export async function startDiscordBot({ token, appId, discordClient, }) {
  thread,
  projectDirectory,
  isNewThread: true,
+ appId: currentAppId,
  });
  if (transcription) {
  messageContent = transcription;
@@ -1651,6 +1688,7 @@ export async function startDiscordBot({ token, appId, discordClient, }) {
  connection,
  guildId: newState.guild.id,
  channelId: voiceChannel.id,
+ appId: currentAppId,
  });
  // Handle connection state changes
  connection.on(VoiceConnectionStatus.Disconnected, async () => {
@@ -1721,7 +1759,10 @@ export async function startDiscordBot({ token, appId, discordClient, }) {
  }
  opencodeServers.clear();
  discordLogger.log('Closing database...');
- getDatabase().close();
+ if (db) {
+ db.close();
+ db = null;
+ }
  discordLogger.log('Destroying Discord client...');
  discordClient.destroy();
  discordLogger.log('Cleanup complete, exiting.');
@@ -98,6 +98,8 @@ export function createGenAIWorker(options) {
  systemMessage: options.systemMessage,
  guildId: options.guildId,
  channelId: options.channelId,
+ appId: options.appId,
+ geminiApiKey: options.geminiApiKey,
  };
  worker.postMessage(initMessage);
  });
@@ -210,6 +210,7 @@ parentPort.on('message', async (message) => {
  session = await startGenAiSession({
  tools,
  systemMessage: message.systemMessage,
+ geminiApiKey: message.geminiApiKey,
  onAssistantAudioChunk({ data }) {
  // Write to audio log if enabled
  if (audioLogStream && !audioLogStream.destroyed) {
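Note that `getDatabase()` now opens `~/.kimaki/discord-sessions.db` instead of a `discord-sessions.db` file in the working directory, and the diff contains no migration, so existing installs start with an empty database after upgrading. A hedged, one-off migration sketch (not part of the package; it assumes the old file sits in the directory the bot used to be started from):

```ts
import fs from 'node:fs'
import os from 'node:os'
import path from 'node:path'

// One-off sketch: copy the old cwd-relative database to the new location.
// Run from the directory where kimaki 0.1.3 was previously started.
const oldPath = path.resolve('discord-sessions.db')
const newDir = path.join(os.homedir(), '.kimaki')
const newPath = path.join(newDir, 'discord-sessions.db')

if (fs.existsSync(oldPath) && !fs.existsSync(newPath)) {
    fs.mkdirSync(newDir, { recursive: true })
    fs.copyFileSync(oldPath, newPath)
    console.log(`Copied ${oldPath} -> ${newPath}`)
} else {
    console.log('Nothing to migrate')
}
```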
package/dist/genai.js CHANGED
@@ -68,7 +68,7 @@ function defaultAudioChunkHandler({ data, mimeType, }) {
  const buffer = convertToWav(audioParts, mimeType);
  saveBinaryFile(fileName, buffer);
  }
- export async function startGenAiSession({ onAssistantAudioChunk, onAssistantStartSpeaking, onAssistantStopSpeaking, onAssistantInterruptSpeaking, systemMessage, tools, } = {}) {
+ export async function startGenAiSession({ onAssistantAudioChunk, onAssistantStartSpeaking, onAssistantStopSpeaking, onAssistantInterruptSpeaking, systemMessage, tools, geminiApiKey, } = {}) {
  let session = undefined;
  const callableTools = [];
  let isAssistantSpeaking = false;
@@ -161,8 +161,13 @@ export async function startGenAiSession({ onAssistantAudioChunk, onAssistantStar
  }
  }
  }
+ const apiKey = geminiApiKey || process.env.GEMINI_API_KEY;
+ if (!apiKey) {
+ genaiLogger.error('No Gemini API key provided');
+ throw new Error('Gemini API key is required for voice interactions');
+ }
  const ai = new GoogleGenAI({
- apiKey: process.env.GEMINI_API_KEY,
+ apiKey,
  });
  const model = 'models/gemini-2.5-flash-live-preview';
  session = await ai.live.connect({
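The change above establishes the key-resolution order used for voice sessions (and, below, for transcription): an explicitly passed per-bot key wins, `GEMINI_API_KEY` from the environment is the fallback, and the call fails fast when neither is set. A minimal sketch of that precedence as a standalone helper; the package inlines this logic rather than exporting such a function:

```ts
// Illustrative helper; mirrors the fallback logic added to
// startGenAiSession and transcribeAudio in this release.
export function resolveGeminiApiKey(perBotKey?: string | null): string {
    const apiKey = perBotKey || process.env.GEMINI_API_KEY
    if (!apiKey) {
        throw new Error('Gemini API key is required for voice interactions')
    }
    return apiKey
}

// Usage: a key stored via the CLI setup takes precedence over the environment.
// resolveGeminiApiKey(row?.gemini_api_key)  -> stored key, else GEMINI_API_KEY, else throw
```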
package/dist/voice.js CHANGED
@@ -1,25 +1,60 @@
- import { openai } from '@ai-sdk/openai';
- import { experimental_transcribe as transcribe } from 'ai';
+ import { GoogleGenAI } from '@google/genai';
  import { createLogger } from './logger.js';
  const voiceLogger = createLogger('VOICE');
- export async function transcribeAudio({ audio, prompt, language, temperature, }) {
+ export async function transcribeAudio({ audio, prompt, language, temperature, geminiApiKey, }) {
  try {
- const result = await transcribe({
- model: openai.transcription('whisper-1'),
- audio,
- ...(prompt || language || temperature !== undefined
- ? {
- providerOptions: {
- openai: {
- ...(prompt && { prompt }),
- ...(language && { language }),
- ...(temperature !== undefined && { temperature }),
+ // Use provided API key or fall back to environment variable
+ const apiKey = geminiApiKey || process.env.GEMINI_API_KEY;
+ if (!apiKey) {
+ throw new Error('Gemini API key is required for audio transcription');
+ }
+ // Initialize Google Generative AI
+ const genAI = new GoogleGenAI({ apiKey });
+ // Convert audio to base64 string if it's not already
+ let audioBase64;
+ if (typeof audio === 'string') {
+ audioBase64 = audio;
+ }
+ else if (audio instanceof Buffer) {
+ audioBase64 = audio.toString('base64');
+ }
+ else if (audio instanceof Uint8Array) {
+ audioBase64 = Buffer.from(audio).toString('base64');
+ }
+ else if (audio instanceof ArrayBuffer) {
+ audioBase64 = Buffer.from(audio).toString('base64');
+ }
+ else {
+ throw new Error('Invalid audio format');
+ }
+ // Build the transcription prompt
+ let transcriptionPrompt = `Please transcribe this audio file accurately. Here is some relevant information and filenames that may be present in the audio:\n<context>\n${prompt}\n</context>\n`;
+ if (language) {
+ transcriptionPrompt += `\nThe audio is in ${language}.`;
+ }
+ // Create the content with audio using the inline data format
+ const response = await genAI.models.generateContent({
+ model: 'gemini-2.5-flash',
+ contents: [
+ {
+ parts: [
+ { text: transcriptionPrompt },
+ {
+ inlineData: {
+ data: audioBase64,
+ mimeType: 'audio/mpeg',
+ },
  },
- },
+ ],
+ },
+ ],
+ config: temperature !== undefined
+ ? {
+ temperature,
  }
- : {}),
+ : undefined,
  });
- return result.text;
+ return response.text || '';
  }
  catch (error) {
  voiceLogger.error('Failed to transcribe audio:', error);
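With the rewrite above, `transcribeAudio` accepts a Buffer, Uint8Array, ArrayBuffer, or base64 string plus an optional per-bot key. A hypothetical usage sketch (the relative import path and the file name are assumptions; only the argument shape comes from the diff):

```ts
import { readFile } from 'node:fs/promises'
// Import path is an assumption; transcribeAudio is the function rewritten in this diff.
import { transcribeAudio } from './voice.js'

// Hypothetical usage: transcribe a downloaded Discord voice attachment.
// Note the new implementation always declares mimeType 'audio/mpeg' to Gemini,
// regardless of the actual attachment format.
const audioBuffer = await readFile('./voice-message.ogg')

const text = await transcribeAudio({
    audio: audioBuffer,
    prompt: 'project files: src/cli.ts, src/discordBot.ts',
    // Optional per-bot key (looked up from bot_api_keys by the bot);
    // when omitted, transcribeAudio falls back to process.env.GEMINI_API_KEY.
    // geminiApiKey: storedKey,
})
console.log(text)
```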
package/package.json CHANGED
@@ -2,7 +2,7 @@
  "name": "kimaki",
  "module": "index.ts",
  "type": "module",
- "version": "0.1.3",
+ "version": "0.1.5",
  "repository": "https://github.com/remorses/kimaki",
  "bin": "bin.js",
  "files": [
@@ -19,7 +19,7 @@
  "tsx": "^4.20.5"
  },
  "dependencies": {
- "@ai-sdk/openai": "^2.0.23",
+ "@ai-sdk/google": "^2.0.16",
  "@clack/prompts": "^0.11.0",
  "@discordjs/opus": "^0.10.0",
  "@discordjs/voice": "^0.19.0",
package/src/cli.ts CHANGED
@@ -223,9 +223,9 @@ async function run({ restart, addChannels }: CliOptions) {
  "3. Copy the token (you won't be able to see it again!)",
  'Step 3: Get Bot Token',
  )
-
  const tokenInput = await password({
- message: 'Enter your Discord Bot Token (will be hidden):',
+ message:
+ 'Enter your Discord Bot Token (from "Bot" section - click "Reset Token" if needed):',
  validate(value) {
  if (!value) return 'Bot token is required'
  if (value.length < 50) return 'Invalid token format (too short)'
@@ -238,6 +238,33 @@ async function run({ restart, addChannels }: CliOptions) {
  }
  token = tokenInput

+ note(
+ `You can get a Gemini api Key at https://aistudio.google.com/apikey`,
+ `Gemini API Key`,
+ )
+
+ const geminiApiKey = await password({
+ message:
+ 'Enter your Gemini API Key for voice channels and audio transcription (optional, press Enter to skip):',
+ validate(value) {
+ if (value && value.length < 10) return 'Invalid API key format'
+ return undefined
+ },
+ })
+
+ if (isCancel(geminiApiKey)) {
+ cancel('Setup cancelled')
+ process.exit(0)
+ }
+
+ // Store API key in database
+ if (geminiApiKey) {
+ db.prepare(
+ 'INSERT OR REPLACE INTO bot_api_keys (app_id, gemini_api_key) VALUES (?, ?)',
+ ).run(appId, geminiApiKey || null)
+ note('API key saved successfully', 'API Key Stored')
+ }
+
  note(
  `Bot install URL:\n${generateBotInstallUrl({ clientId: appId })}\n\nYou MUST install the bot in your Discord server before continuing.`,
  'Step 4: Install Bot to Server',
package/src/discordBot.ts CHANGED
@@ -34,6 +34,7 @@ import { spawn, exec, type ChildProcess } from 'node:child_process'
  import fs, { createWriteStream } from 'node:fs'
  import { mkdir } from 'node:fs/promises'
  import net from 'node:net'
+ import os from 'node:os'
  import path from 'node:path'
  import { promisify } from 'node:util'
  import { PassThrough, Transform, type TransformCallback } from 'node:stream'
@@ -152,10 +153,12 @@ async function setupVoiceHandling({
  connection,
  guildId,
  channelId,
+ appId,
  }: {
  connection: VoiceConnection
  guildId: string
  channelId: string
+ appId: string
  }) {
  voiceLogger.log(
  `Setting up voice handling for guild ${guildId}, channel ${channelId}`,
@@ -188,11 +191,18 @@ async function setupVoiceHandling({
  // Create user audio stream for debugging
  voiceData.userAudioStream = await createUserAudioLogStream(guildId, channelId)

+ // Get API keys from database
+ const apiKeys = getDatabase()
+ .prepare('SELECT gemini_api_key FROM bot_api_keys WHERE app_id = ?')
+ .get(appId) as { gemini_api_key: string | null } | undefined
+
  // Create GenAI worker
  const genAiWorker = await createGenAIWorker({
  directory,
  guildId,
  channelId,
+ appId,
+ geminiApiKey: apiKeys?.gemini_api_key,
  systemMessage: dedent`
  You are Kimaki, an AI similar to Jarvis: you help your user (an engineer) controlling his coding agent, just like Jarvis controls Ironman armor and machines. Speak fast.

@@ -444,7 +454,19 @@ export function frameMono16khz(): Transform {

  export function getDatabase(): Database.Database {
  if (!db) {
- db = new Database('discord-sessions.db')
+ // Create ~/.kimaki directory if it doesn't exist
+ const kimakiDir = path.join(os.homedir(), '.kimaki')
+
+ try {
+ fs.mkdirSync(kimakiDir, { recursive: true })
+ } catch (error) {
+ dbLogger.error('Failed to create ~/.kimaki directory:', error)
+ }
+
+ const dbPath = path.join(kimakiDir, 'discord-sessions.db')
+
+ dbLogger.log(`Opening database at: ${dbPath}`)
+ db = new Database(dbPath)

  // Initialize tables
  db.exec(`
@@ -480,6 +502,14 @@ export function getDatabase(): Database.Database {
  created_at DATETIME DEFAULT CURRENT_TIMESTAMP
  )
  `)
+
+ db.exec(`
+ CREATE TABLE IF NOT EXISTS bot_api_keys (
+ app_id TEXT PRIMARY KEY,
+ gemini_api_key TEXT,
+ created_at DATETIME DEFAULT CURRENT_TIMESTAMP
+ )
+ `)
  }

  return db
@@ -614,11 +644,13 @@ async function processVoiceAttachment({
  thread,
  projectDirectory,
  isNewThread = false,
+ appId,
  }: {
  message: Message
  thread: ThreadChannel
  projectDirectory?: string
  isNewThread?: boolean
+ appId?: string
  }): Promise<string | null> {
  const audioAttachment = Array.from(message.attachments.values()).find(
  (attachment) => attachment.contentType?.startsWith('audio/'),
@@ -660,9 +692,22 @@ async function processVoiceAttachment({
  }
  }

+ // Get Gemini API key from database if appId is provided
+ let geminiApiKey: string | undefined
+ if (appId) {
+ const apiKeys = getDatabase()
+ .prepare('SELECT gemini_api_key FROM bot_api_keys WHERE app_id = ?')
+ .get(appId) as { gemini_api_key: string | null } | undefined
+
+ if (apiKeys?.gemini_api_key) {
+ geminiApiKey = apiKeys.gemini_api_key
+ }
+ }
+
  const transcription = await transcribeAudio({
  audio: audioBuffer,
  prompt: transcriptionPrompt,
+ geminiApiKey,
  })

  voiceLogger.log(
@@ -1502,6 +1547,7 @@ export async function startDiscordBot({
  discordLogger.log(`Bot Application ID (provided): ${currentAppId}`)
  }

+
  // List all guilds and channels that belong to this bot
  for (const guild of c.guilds.cache.values()) {
  discordLogger.log(`${guild.name} (${guild.id})`)
@@ -1635,6 +1681,7 @@ export async function startDiscordBot({
  message,
  thread,
  projectDirectory,
+ appId: currentAppId,
  })
  if (transcription) {
  messageContent = transcription
@@ -1727,6 +1774,7 @@ export async function startDiscordBot({
  thread,
  projectDirectory,
  isNewThread: true,
+ appId: currentAppId,
  })
  if (transcription) {
  messageContent = transcription
@@ -2224,6 +2272,7 @@ export async function startDiscordBot({
  connection,
  guildId: newState.guild.id,
  channelId: voiceChannel.id,
+ appId: currentAppId!,
  })

  // Handle connection state changes
@@ -2313,7 +2362,10 @@ export async function startDiscordBot({
  opencodeServers.clear()

  discordLogger.log('Closing database...')
- getDatabase().close()
+ if (db) {
+ db.close()
+ db = null
+ }

  discordLogger.log('Destroying Discord client...')
  discordClient.destroy()
@@ -11,6 +11,8 @@ export interface GenAIWorkerOptions {
  systemMessage?: string
  guildId: string
  channelId: string
+ appId: string
+ geminiApiKey?: string | null
  onAssistantOpusPacket: (packet: ArrayBuffer) => void
  onAssistantStartSpeaking?: () => void
  onAssistantStopSpeaking?: () => void
@@ -146,6 +148,8 @@ export function createGenAIWorker(
  systemMessage: options.systemMessage,
  guildId: options.guildId,
  channelId: options.channelId,
+ appId: options.appId,
+ geminiApiKey: options.geminiApiKey,
  }
  worker.postMessage(initMessage)
  })
@@ -271,6 +271,7 @@ parentPort.on('message', async (message: WorkerInMessage) => {
  session = await startGenAiSession({
  tools,
  systemMessage: message.systemMessage,
+ geminiApiKey: message.geminiApiKey,
  onAssistantAudioChunk({ data }) {
  // Write to audio log if enabled
  if (audioLogStream && !audioLogStream.destroyed) {
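The key also has to cross the worker boundary: `createGenAIWorker` now copies `appId` and `geminiApiKey` into the init message, and the worker forwards `message.geminiApiKey` to `startGenAiSession`. A simplified sketch of that handoff with `node:worker_threads`; the message type is trimmed to the fields relevant here and the worker entry path is illustrative:

```ts
import { Worker, parentPort } from 'node:worker_threads'

// Simplified shape of the init message; field names match those added in this diff.
type InitMessage = {
    type: 'init'
    guildId: string
    channelId: string
    appId: string
    geminiApiKey?: string | null
    systemMessage?: string
}

// Parent side (sketch of what createGenAIWorker does with the new options).
export function startWorker(options: Omit<InitMessage, 'type'>) {
    // Worker entry path is illustrative, not the package's actual file name.
    const worker = new Worker(new URL('./genaiWorker.js', import.meta.url))
    const initMessage: InitMessage = { type: 'init', ...options }
    worker.postMessage(initMessage)
    return worker
}

// Worker side (sketch): forward the key to the session factory.
parentPort?.on('message', async (message: InitMessage) => {
    if (message.type === 'init') {
        // startGenAiSession({ geminiApiKey: message.geminiApiKey, ... })
    }
})
```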
package/src/genai.ts CHANGED
@@ -113,6 +113,7 @@ export async function startGenAiSession({
  onAssistantInterruptSpeaking,
  systemMessage,
  tools,
+ geminiApiKey,
  }: {
  onAssistantAudioChunk?: (args: { data: Buffer; mimeType: string }) => void
  onAssistantStartSpeaking?: () => void
@@ -120,6 +121,7 @@ export async function startGenAiSession({
  onAssistantInterruptSpeaking?: () => void
  systemMessage?: string
  tools?: Record<string, AITool<any, any>>
+ geminiApiKey?: string | null
  } = {}) {
  let session: Session | undefined = undefined
  const callableTools: Array<CallableTool & { name: string }> = []
@@ -242,8 +244,15 @@ export async function startGenAiSession({
  }
  }

+ const apiKey = geminiApiKey || process.env.GEMINI_API_KEY
+
+ if (!apiKey) {
+ genaiLogger.error('No Gemini API key provided')
+ throw new Error('Gemini API key is required for voice interactions')
+ }
+
  const ai = new GoogleGenAI({
- apiKey: process.env.GEMINI_API_KEY,
+ apiKey,
  })

  const model = 'models/gemini-2.5-flash-live-preview'
package/src/voice.ts CHANGED
@@ -1,5 +1,4 @@
- import { openai } from '@ai-sdk/openai'
- import { experimental_transcribe as transcribe } from 'ai'
+ import { GoogleGenAI } from '@google/genai'
  import { createLogger } from './logger.js'

  const voiceLogger = createLogger('VOICE')
@@ -9,30 +8,70 @@ export async function transcribeAudio({
  prompt,
  language,
  temperature,
+ geminiApiKey,
  }: {
  audio: Buffer | Uint8Array | ArrayBuffer | string
  prompt?: string
  language?: string
  temperature?: number
+ geminiApiKey?: string
  }): Promise<string> {
  try {
- const result = await transcribe({
- model: openai.transcription('whisper-1'),
- audio,
- ...(prompt || language || temperature !== undefined
- ? {
- providerOptions: {
- openai: {
- ...(prompt && { prompt }),
- ...(language && { language }),
- ...(temperature !== undefined && { temperature }),
+ // Use provided API key or fall back to environment variable
+ const apiKey = geminiApiKey || process.env.GEMINI_API_KEY
+
+ if (!apiKey) {
+ throw new Error('Gemini API key is required for audio transcription')
+ }
+
+ // Initialize Google Generative AI
+ const genAI = new GoogleGenAI({ apiKey })
+
+ // Convert audio to base64 string if it's not already
+ let audioBase64: string
+ if (typeof audio === 'string') {
+ audioBase64 = audio
+ } else if (audio instanceof Buffer) {
+ audioBase64 = audio.toString('base64')
+ } else if (audio instanceof Uint8Array) {
+ audioBase64 = Buffer.from(audio).toString('base64')
+ } else if (audio instanceof ArrayBuffer) {
+ audioBase64 = Buffer.from(audio).toString('base64')
+ } else {
+ throw new Error('Invalid audio format')
+ }
+
+ // Build the transcription prompt
+ let transcriptionPrompt = `Please transcribe this audio file accurately. Here is some relevant information and filenames that may be present in the audio:\n<context>\n${prompt}\n</context>\n`
+ if (language) {
+ transcriptionPrompt += `\nThe audio is in ${language}.`
+ }
+
+ // Create the content with audio using the inline data format
+ const response = await genAI.models.generateContent({
+ model: 'gemini-2.5-flash',
+ contents: [
+ {
+ parts: [
+ { text: transcriptionPrompt },
+ {
+ inlineData: {
+ data: audioBase64,
+ mimeType: 'audio/mpeg',
  },
  },
- }
- : {}),
+ ],
+ },
+ ],
+ config:
+ temperature !== undefined
+ ? {
+ temperature,
+ }
+ : undefined,
  })

- return result.text
+ return response.text || ''
  } catch (error) {
  voiceLogger.error('Failed to transcribe audio:', error)
  throw new Error(
@@ -8,6 +8,8 @@ export type WorkerInMessage =
  systemMessage?: string
  guildId: string
  channelId: string
+ appId: string
+ geminiApiKey?: string | null
  }
  | {
  type: 'sendRealtimeInput'