kimaki 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +19 -1
- package/dist/discordBot.js +29 -2
- package/dist/genai-worker-wrapper.js +2 -0
- package/dist/genai-worker.js +1 -0
- package/dist/genai.js +7 -2
- package/dist/voice.js +51 -16
- package/package.json +2 -2
- package/src/cli.ts +29 -2
- package/src/discordBot.ts +35 -0
- package/src/genai-worker-wrapper.ts +4 -0
- package/src/genai-worker.ts +1 -0
- package/src/genai.ts +10 -1
- package/src/voice.ts +54 -15
- package/src/worker-types.ts +2 -0
package/dist/cli.js
CHANGED
|
@@ -129,7 +129,7 @@ async function run({ restart, addChannels }) {
|
|
|
129
129
|
'2. Click "Reset Token" to generate a new bot token (in case of errors try again)\n' +
|
|
130
130
|
"3. Copy the token (you won't be able to see it again!)", 'Step 3: Get Bot Token');
|
|
131
131
|
const tokenInput = await password({
|
|
132
|
-
message: 'Enter your Discord Bot Token (
|
|
132
|
+
message: 'Enter your Discord Bot Token (from "Bot" section - click "Reset Token" if needed):',
|
|
133
133
|
validate(value) {
|
|
134
134
|
if (!value)
|
|
135
135
|
return 'Bot token is required';
|
|
@@ -142,6 +142,24 @@ async function run({ restart, addChannels }) {
|
|
|
142
142
|
process.exit(0);
|
|
143
143
|
}
|
|
144
144
|
token = tokenInput;
|
|
145
|
+
note(`You can get a Gemini api Key at https://aistudio.google.com/apikey`, `Gemini API Key`);
|
|
146
|
+
const geminiApiKey = await password({
|
|
147
|
+
message: 'Enter your Gemini API Key for voice channels and audio transcription (optional, press Enter to skip):',
|
|
148
|
+
validate(value) {
|
|
149
|
+
if (value && value.length < 10)
|
|
150
|
+
return 'Invalid API key format';
|
|
151
|
+
return undefined;
|
|
152
|
+
},
|
|
153
|
+
});
|
|
154
|
+
if (isCancel(geminiApiKey)) {
|
|
155
|
+
cancel('Setup cancelled');
|
|
156
|
+
process.exit(0);
|
|
157
|
+
}
|
|
158
|
+
// Store API key in database
|
|
159
|
+
if (geminiApiKey) {
|
|
160
|
+
db.prepare('INSERT OR REPLACE INTO bot_api_keys (app_id, gemini_api_key) VALUES (?, ?)').run(appId, geminiApiKey || null);
|
|
161
|
+
note('API key saved successfully', 'API Key Stored');
|
|
162
|
+
}
|
|
145
163
|
note(`Bot install URL:\n${generateBotInstallUrl({ clientId: appId })}\n\nYou MUST install the bot in your Discord server before continuing.`, 'Step 4: Install Bot to Server');
|
|
146
164
|
const installed = await text({
|
|
147
165
|
message: 'Press Enter AFTER you have installed the bot in your server:',
|
package/dist/discordBot.js
CHANGED
|
@@ -78,7 +78,7 @@ async function createUserAudioLogStream(guildId, channelId) {
|
|
|
78
78
|
}
|
|
79
79
|
}
|
|
80
80
|
// Set up voice handling for a connection (called once per connection)
|
|
81
|
-
async function setupVoiceHandling({ connection, guildId, channelId, }) {
|
|
81
|
+
async function setupVoiceHandling({ connection, guildId, channelId, appId, }) {
|
|
82
82
|
voiceLogger.log(`Setting up voice handling for guild ${guildId}, channel ${channelId}`);
|
|
83
83
|
// Check if this voice channel has an associated directory
|
|
84
84
|
const channelDirRow = getDatabase()
|
|
@@ -98,11 +98,17 @@ async function setupVoiceHandling({ connection, guildId, channelId, }) {
|
|
|
98
98
|
}
|
|
99
99
|
// Create user audio stream for debugging
|
|
100
100
|
voiceData.userAudioStream = await createUserAudioLogStream(guildId, channelId);
|
|
101
|
+
// Get API keys from database
|
|
102
|
+
const apiKeys = getDatabase()
|
|
103
|
+
.prepare('SELECT gemini_api_key FROM bot_api_keys WHERE app_id = ?')
|
|
104
|
+
.get(appId);
|
|
101
105
|
// Create GenAI worker
|
|
102
106
|
const genAiWorker = await createGenAIWorker({
|
|
103
107
|
directory,
|
|
104
108
|
guildId,
|
|
105
109
|
channelId,
|
|
110
|
+
appId,
|
|
111
|
+
geminiApiKey: apiKeys?.gemini_api_key,
|
|
106
112
|
systemMessage: dedent `
|
|
107
113
|
You are Kimaki, an AI similar to Jarvis: you help your user (an engineer) controlling his coding agent, just like Jarvis controls Ironman armor and machines. Speak fast.
|
|
108
114
|
|
|
@@ -347,6 +353,13 @@ export function getDatabase() {
|
|
|
347
353
|
channel_type TEXT NOT NULL,
|
|
348
354
|
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
|
349
355
|
)
|
|
356
|
+
`);
|
|
357
|
+
db.exec(`
|
|
358
|
+
CREATE TABLE IF NOT EXISTS bot_api_keys (
|
|
359
|
+
app_id TEXT PRIMARY KEY,
|
|
360
|
+
gemini_api_key TEXT,
|
|
361
|
+
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
|
362
|
+
)
|
|
350
363
|
`);
|
|
351
364
|
}
|
|
352
365
|
return db;
|
|
@@ -458,7 +471,7 @@ async function waitForServer(port, maxAttempts = 30) {
|
|
|
458
471
|
}
|
|
459
472
|
throw new Error(`Server did not start on port ${port} after ${maxAttempts} seconds`);
|
|
460
473
|
}
|
|
461
|
-
async function processVoiceAttachment({ message, thread, projectDirectory, isNewThread = false, }) {
|
|
474
|
+
async function processVoiceAttachment({ message, thread, projectDirectory, isNewThread = false, appId, }) {
|
|
462
475
|
const audioAttachment = Array.from(message.attachments.values()).find((attachment) => attachment.contentType?.startsWith('audio/'));
|
|
463
476
|
if (!audioAttachment)
|
|
464
477
|
return null;
|
|
@@ -488,9 +501,20 @@ async function processVoiceAttachment({ message, thread, projectDirectory, isNew
|
|
|
488
501
|
voiceLogger.log(`Could not get project tree:`, e);
|
|
489
502
|
}
|
|
490
503
|
}
|
|
504
|
+
// Get Gemini API key from database if appId is provided
|
|
505
|
+
let geminiApiKey;
|
|
506
|
+
if (appId) {
|
|
507
|
+
const apiKeys = getDatabase()
|
|
508
|
+
.prepare('SELECT gemini_api_key FROM bot_api_keys WHERE app_id = ?')
|
|
509
|
+
.get(appId);
|
|
510
|
+
if (apiKeys?.gemini_api_key) {
|
|
511
|
+
geminiApiKey = apiKeys.gemini_api_key;
|
|
512
|
+
}
|
|
513
|
+
}
|
|
491
514
|
const transcription = await transcribeAudio({
|
|
492
515
|
audio: audioBuffer,
|
|
493
516
|
prompt: transcriptionPrompt,
|
|
517
|
+
geminiApiKey,
|
|
494
518
|
});
|
|
495
519
|
voiceLogger.log(`Transcription successful: "${transcription.slice(0, 50)}${transcription.length > 50 ? '...' : ''}"`);
|
|
496
520
|
// Update thread name with transcribed content only for new threads
|
|
@@ -1233,6 +1257,7 @@ export async function startDiscordBot({ token, appId, discordClient, }) {
|
|
|
1233
1257
|
message,
|
|
1234
1258
|
thread,
|
|
1235
1259
|
projectDirectory,
|
|
1260
|
+
appId: currentAppId,
|
|
1236
1261
|
});
|
|
1237
1262
|
if (transcription) {
|
|
1238
1263
|
messageContent = transcription;
|
|
@@ -1291,6 +1316,7 @@ export async function startDiscordBot({ token, appId, discordClient, }) {
|
|
|
1291
1316
|
thread,
|
|
1292
1317
|
projectDirectory,
|
|
1293
1318
|
isNewThread: true,
|
|
1319
|
+
appId: currentAppId,
|
|
1294
1320
|
});
|
|
1295
1321
|
if (transcription) {
|
|
1296
1322
|
messageContent = transcription;
|
|
@@ -1651,6 +1677,7 @@ export async function startDiscordBot({ token, appId, discordClient, }) {
|
|
|
1651
1677
|
connection,
|
|
1652
1678
|
guildId: newState.guild.id,
|
|
1653
1679
|
channelId: voiceChannel.id,
|
|
1680
|
+
appId: currentAppId,
|
|
1654
1681
|
});
|
|
1655
1682
|
// Handle connection state changes
|
|
1656
1683
|
connection.on(VoiceConnectionStatus.Disconnected, async () => {
|
|
@@ -98,6 +98,8 @@ export function createGenAIWorker(options) {
|
|
|
98
98
|
systemMessage: options.systemMessage,
|
|
99
99
|
guildId: options.guildId,
|
|
100
100
|
channelId: options.channelId,
|
|
101
|
+
appId: options.appId,
|
|
102
|
+
geminiApiKey: options.geminiApiKey,
|
|
101
103
|
};
|
|
102
104
|
worker.postMessage(initMessage);
|
|
103
105
|
});
|
package/dist/genai-worker.js
CHANGED
|
@@ -210,6 +210,7 @@ parentPort.on('message', async (message) => {
|
|
|
210
210
|
session = await startGenAiSession({
|
|
211
211
|
tools,
|
|
212
212
|
systemMessage: message.systemMessage,
|
|
213
|
+
geminiApiKey: message.geminiApiKey,
|
|
213
214
|
onAssistantAudioChunk({ data }) {
|
|
214
215
|
// Write to audio log if enabled
|
|
215
216
|
if (audioLogStream && !audioLogStream.destroyed) {
|
package/dist/genai.js
CHANGED
|
@@ -68,7 +68,7 @@ function defaultAudioChunkHandler({ data, mimeType, }) {
|
|
|
68
68
|
const buffer = convertToWav(audioParts, mimeType);
|
|
69
69
|
saveBinaryFile(fileName, buffer);
|
|
70
70
|
}
|
|
71
|
-
export async function startGenAiSession({ onAssistantAudioChunk, onAssistantStartSpeaking, onAssistantStopSpeaking, onAssistantInterruptSpeaking, systemMessage, tools, } = {}) {
|
|
71
|
+
export async function startGenAiSession({ onAssistantAudioChunk, onAssistantStartSpeaking, onAssistantStopSpeaking, onAssistantInterruptSpeaking, systemMessage, tools, geminiApiKey, } = {}) {
|
|
72
72
|
let session = undefined;
|
|
73
73
|
const callableTools = [];
|
|
74
74
|
let isAssistantSpeaking = false;
|
|
@@ -161,8 +161,13 @@ export async function startGenAiSession({ onAssistantAudioChunk, onAssistantStar
|
|
|
161
161
|
}
|
|
162
162
|
}
|
|
163
163
|
}
|
|
164
|
+
const apiKey = geminiApiKey || process.env.GEMINI_API_KEY;
|
|
165
|
+
if (!apiKey) {
|
|
166
|
+
genaiLogger.error('No Gemini API key provided');
|
|
167
|
+
throw new Error('Gemini API key is required for voice interactions');
|
|
168
|
+
}
|
|
164
169
|
const ai = new GoogleGenAI({
|
|
165
|
-
apiKey
|
|
170
|
+
apiKey,
|
|
166
171
|
});
|
|
167
172
|
const model = 'models/gemini-2.5-flash-live-preview';
|
|
168
173
|
session = await ai.live.connect({
|
package/dist/voice.js
CHANGED
|
@@ -1,25 +1,60 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { experimental_transcribe as transcribe } from 'ai';
|
|
1
|
+
import { GoogleGenAI } from '@google/genai';
|
|
3
2
|
import { createLogger } from './logger.js';
|
|
4
3
|
const voiceLogger = createLogger('VOICE');
|
|
5
|
-
export async function transcribeAudio({ audio, prompt, language, temperature, }) {
|
|
4
|
+
export async function transcribeAudio({ audio, prompt, language, temperature, geminiApiKey, }) {
|
|
6
5
|
try {
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
6
|
+
// Use provided API key or fall back to environment variable
|
|
7
|
+
const apiKey = geminiApiKey || process.env.GEMINI_API_KEY;
|
|
8
|
+
if (!apiKey) {
|
|
9
|
+
throw new Error('Gemini API key is required for audio transcription');
|
|
10
|
+
}
|
|
11
|
+
// Initialize Google Generative AI
|
|
12
|
+
const genAI = new GoogleGenAI({ apiKey });
|
|
13
|
+
// Convert audio to base64 string if it's not already
|
|
14
|
+
let audioBase64;
|
|
15
|
+
if (typeof audio === 'string') {
|
|
16
|
+
audioBase64 = audio;
|
|
17
|
+
}
|
|
18
|
+
else if (audio instanceof Buffer) {
|
|
19
|
+
audioBase64 = audio.toString('base64');
|
|
20
|
+
}
|
|
21
|
+
else if (audio instanceof Uint8Array) {
|
|
22
|
+
audioBase64 = Buffer.from(audio).toString('base64');
|
|
23
|
+
}
|
|
24
|
+
else if (audio instanceof ArrayBuffer) {
|
|
25
|
+
audioBase64 = Buffer.from(audio).toString('base64');
|
|
26
|
+
}
|
|
27
|
+
else {
|
|
28
|
+
throw new Error('Invalid audio format');
|
|
29
|
+
}
|
|
30
|
+
// Build the transcription prompt
|
|
31
|
+
let transcriptionPrompt = `Please transcribe this audio file accurately. Here is some relevant information and filenames that may be present in the audio:\n<context>\n${prompt}\n</context>\n`;
|
|
32
|
+
if (language) {
|
|
33
|
+
transcriptionPrompt += `\nThe audio is in ${language}.`;
|
|
34
|
+
}
|
|
35
|
+
// Create the content with audio using the inline data format
|
|
36
|
+
const response = await genAI.models.generateContent({
|
|
37
|
+
model: 'gemini-2.5-flash',
|
|
38
|
+
contents: [
|
|
39
|
+
{
|
|
40
|
+
parts: [
|
|
41
|
+
{ text: transcriptionPrompt },
|
|
42
|
+
{
|
|
43
|
+
inlineData: {
|
|
44
|
+
data: audioBase64,
|
|
45
|
+
mimeType: 'audio/mpeg',
|
|
46
|
+
},
|
|
17
47
|
},
|
|
18
|
-
|
|
48
|
+
],
|
|
49
|
+
},
|
|
50
|
+
],
|
|
51
|
+
config: temperature !== undefined
|
|
52
|
+
? {
|
|
53
|
+
temperature,
|
|
19
54
|
}
|
|
20
|
-
:
|
|
55
|
+
: undefined,
|
|
21
56
|
});
|
|
22
|
-
return
|
|
57
|
+
return response.text || '';
|
|
23
58
|
}
|
|
24
59
|
catch (error) {
|
|
25
60
|
voiceLogger.error('Failed to transcribe audio:', error);
|
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "kimaki",
|
|
3
3
|
"module": "index.ts",
|
|
4
4
|
"type": "module",
|
|
5
|
-
"version": "0.1.3",
|
|
5
|
+
"version": "0.1.4",
|
|
6
6
|
"repository": "https://github.com/remorses/kimaki",
|
|
7
7
|
"bin": "bin.js",
|
|
8
8
|
"files": [
|
|
@@ -19,7 +19,7 @@
|
|
|
19
19
|
"tsx": "^4.20.5"
|
|
20
20
|
},
|
|
21
21
|
"dependencies": {
|
|
22
|
-
"@ai-sdk/
|
|
22
|
+
"@ai-sdk/google": "^2.0.16",
|
|
23
23
|
"@clack/prompts": "^0.11.0",
|
|
24
24
|
"@discordjs/opus": "^0.10.0",
|
|
25
25
|
"@discordjs/voice": "^0.19.0",
|
package/src/cli.ts
CHANGED
|
@@ -223,9 +223,9 @@ async function run({ restart, addChannels }: CliOptions) {
|
|
|
223
223
|
"3. Copy the token (you won't be able to see it again!)",
|
|
224
224
|
'Step 3: Get Bot Token',
|
|
225
225
|
)
|
|
226
|
-
|
|
227
226
|
const tokenInput = await password({
|
|
228
|
-
message:
|
|
227
|
+
message:
|
|
228
|
+
'Enter your Discord Bot Token (from "Bot" section - click "Reset Token" if needed):',
|
|
229
229
|
validate(value) {
|
|
230
230
|
if (!value) return 'Bot token is required'
|
|
231
231
|
if (value.length < 50) return 'Invalid token format (too short)'
|
|
@@ -238,6 +238,33 @@ async function run({ restart, addChannels }: CliOptions) {
|
|
|
238
238
|
}
|
|
239
239
|
token = tokenInput
|
|
240
240
|
|
|
241
|
+
note(
|
|
242
|
+
`You can get a Gemini api Key at https://aistudio.google.com/apikey`,
|
|
243
|
+
`Gemini API Key`,
|
|
244
|
+
)
|
|
245
|
+
|
|
246
|
+
const geminiApiKey = await password({
|
|
247
|
+
message:
|
|
248
|
+
'Enter your Gemini API Key for voice channels and audio transcription (optional, press Enter to skip):',
|
|
249
|
+
validate(value) {
|
|
250
|
+
if (value && value.length < 10) return 'Invalid API key format'
|
|
251
|
+
return undefined
|
|
252
|
+
},
|
|
253
|
+
})
|
|
254
|
+
|
|
255
|
+
if (isCancel(geminiApiKey)) {
|
|
256
|
+
cancel('Setup cancelled')
|
|
257
|
+
process.exit(0)
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
// Store API key in database
|
|
261
|
+
if (geminiApiKey) {
|
|
262
|
+
db.prepare(
|
|
263
|
+
'INSERT OR REPLACE INTO bot_api_keys (app_id, gemini_api_key) VALUES (?, ?)',
|
|
264
|
+
).run(appId, geminiApiKey || null)
|
|
265
|
+
note('API key saved successfully', 'API Key Stored')
|
|
266
|
+
}
|
|
267
|
+
|
|
241
268
|
note(
|
|
242
269
|
`Bot install URL:\n${generateBotInstallUrl({ clientId: appId })}\n\nYou MUST install the bot in your Discord server before continuing.`,
|
|
243
270
|
'Step 4: Install Bot to Server',
|
package/src/discordBot.ts
CHANGED
|
@@ -152,10 +152,12 @@ async function setupVoiceHandling({
|
|
|
152
152
|
connection,
|
|
153
153
|
guildId,
|
|
154
154
|
channelId,
|
|
155
|
+
appId,
|
|
155
156
|
}: {
|
|
156
157
|
connection: VoiceConnection
|
|
157
158
|
guildId: string
|
|
158
159
|
channelId: string
|
|
160
|
+
appId: string
|
|
159
161
|
}) {
|
|
160
162
|
voiceLogger.log(
|
|
161
163
|
`Setting up voice handling for guild ${guildId}, channel ${channelId}`,
|
|
@@ -188,11 +190,18 @@ async function setupVoiceHandling({
|
|
|
188
190
|
// Create user audio stream for debugging
|
|
189
191
|
voiceData.userAudioStream = await createUserAudioLogStream(guildId, channelId)
|
|
190
192
|
|
|
193
|
+
// Get API keys from database
|
|
194
|
+
const apiKeys = getDatabase()
|
|
195
|
+
.prepare('SELECT gemini_api_key FROM bot_api_keys WHERE app_id = ?')
|
|
196
|
+
.get(appId) as { gemini_api_key: string | null } | undefined
|
|
197
|
+
|
|
191
198
|
// Create GenAI worker
|
|
192
199
|
const genAiWorker = await createGenAIWorker({
|
|
193
200
|
directory,
|
|
194
201
|
guildId,
|
|
195
202
|
channelId,
|
|
203
|
+
appId,
|
|
204
|
+
geminiApiKey: apiKeys?.gemini_api_key,
|
|
196
205
|
systemMessage: dedent`
|
|
197
206
|
You are Kimaki, an AI similar to Jarvis: you help your user (an engineer) controlling his coding agent, just like Jarvis controls Ironman armor and machines. Speak fast.
|
|
198
207
|
|
|
@@ -480,6 +489,14 @@ export function getDatabase(): Database.Database {
|
|
|
480
489
|
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
|
481
490
|
)
|
|
482
491
|
`)
|
|
492
|
+
|
|
493
|
+
db.exec(`
|
|
494
|
+
CREATE TABLE IF NOT EXISTS bot_api_keys (
|
|
495
|
+
app_id TEXT PRIMARY KEY,
|
|
496
|
+
gemini_api_key TEXT,
|
|
497
|
+
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
|
498
|
+
)
|
|
499
|
+
`)
|
|
483
500
|
}
|
|
484
501
|
|
|
485
502
|
return db
|
|
@@ -614,11 +631,13 @@ async function processVoiceAttachment({
|
|
|
614
631
|
thread,
|
|
615
632
|
projectDirectory,
|
|
616
633
|
isNewThread = false,
|
|
634
|
+
appId,
|
|
617
635
|
}: {
|
|
618
636
|
message: Message
|
|
619
637
|
thread: ThreadChannel
|
|
620
638
|
projectDirectory?: string
|
|
621
639
|
isNewThread?: boolean
|
|
640
|
+
appId?: string
|
|
622
641
|
}): Promise<string | null> {
|
|
623
642
|
const audioAttachment = Array.from(message.attachments.values()).find(
|
|
624
643
|
(attachment) => attachment.contentType?.startsWith('audio/'),
|
|
@@ -660,9 +679,22 @@ async function processVoiceAttachment({
|
|
|
660
679
|
}
|
|
661
680
|
}
|
|
662
681
|
|
|
682
|
+
// Get Gemini API key from database if appId is provided
|
|
683
|
+
let geminiApiKey: string | undefined
|
|
684
|
+
if (appId) {
|
|
685
|
+
const apiKeys = getDatabase()
|
|
686
|
+
.prepare('SELECT gemini_api_key FROM bot_api_keys WHERE app_id = ?')
|
|
687
|
+
.get(appId) as { gemini_api_key: string | null } | undefined
|
|
688
|
+
|
|
689
|
+
if (apiKeys?.gemini_api_key) {
|
|
690
|
+
geminiApiKey = apiKeys.gemini_api_key
|
|
691
|
+
}
|
|
692
|
+
}
|
|
693
|
+
|
|
663
694
|
const transcription = await transcribeAudio({
|
|
664
695
|
audio: audioBuffer,
|
|
665
696
|
prompt: transcriptionPrompt,
|
|
697
|
+
geminiApiKey,
|
|
666
698
|
})
|
|
667
699
|
|
|
668
700
|
voiceLogger.log(
|
|
@@ -1635,6 +1667,7 @@ export async function startDiscordBot({
|
|
|
1635
1667
|
message,
|
|
1636
1668
|
thread,
|
|
1637
1669
|
projectDirectory,
|
|
1670
|
+
appId: currentAppId,
|
|
1638
1671
|
})
|
|
1639
1672
|
if (transcription) {
|
|
1640
1673
|
messageContent = transcription
|
|
@@ -1727,6 +1760,7 @@ export async function startDiscordBot({
|
|
|
1727
1760
|
thread,
|
|
1728
1761
|
projectDirectory,
|
|
1729
1762
|
isNewThread: true,
|
|
1763
|
+
appId: currentAppId,
|
|
1730
1764
|
})
|
|
1731
1765
|
if (transcription) {
|
|
1732
1766
|
messageContent = transcription
|
|
@@ -2224,6 +2258,7 @@ export async function startDiscordBot({
|
|
|
2224
2258
|
connection,
|
|
2225
2259
|
guildId: newState.guild.id,
|
|
2226
2260
|
channelId: voiceChannel.id,
|
|
2261
|
+
appId: currentAppId!,
|
|
2227
2262
|
})
|
|
2228
2263
|
|
|
2229
2264
|
// Handle connection state changes
|
|
@@ -11,6 +11,8 @@ export interface GenAIWorkerOptions {
|
|
|
11
11
|
systemMessage?: string
|
|
12
12
|
guildId: string
|
|
13
13
|
channelId: string
|
|
14
|
+
appId: string
|
|
15
|
+
geminiApiKey?: string | null
|
|
14
16
|
onAssistantOpusPacket: (packet: ArrayBuffer) => void
|
|
15
17
|
onAssistantStartSpeaking?: () => void
|
|
16
18
|
onAssistantStopSpeaking?: () => void
|
|
@@ -146,6 +148,8 @@ export function createGenAIWorker(
|
|
|
146
148
|
systemMessage: options.systemMessage,
|
|
147
149
|
guildId: options.guildId,
|
|
148
150
|
channelId: options.channelId,
|
|
151
|
+
appId: options.appId,
|
|
152
|
+
geminiApiKey: options.geminiApiKey,
|
|
149
153
|
}
|
|
150
154
|
worker.postMessage(initMessage)
|
|
151
155
|
})
|
package/src/genai-worker.ts
CHANGED
|
@@ -271,6 +271,7 @@ parentPort.on('message', async (message: WorkerInMessage) => {
|
|
|
271
271
|
session = await startGenAiSession({
|
|
272
272
|
tools,
|
|
273
273
|
systemMessage: message.systemMessage,
|
|
274
|
+
geminiApiKey: message.geminiApiKey,
|
|
274
275
|
onAssistantAudioChunk({ data }) {
|
|
275
276
|
// Write to audio log if enabled
|
|
276
277
|
if (audioLogStream && !audioLogStream.destroyed) {
|
package/src/genai.ts
CHANGED
|
@@ -113,6 +113,7 @@ export async function startGenAiSession({
|
|
|
113
113
|
onAssistantInterruptSpeaking,
|
|
114
114
|
systemMessage,
|
|
115
115
|
tools,
|
|
116
|
+
geminiApiKey,
|
|
116
117
|
}: {
|
|
117
118
|
onAssistantAudioChunk?: (args: { data: Buffer; mimeType: string }) => void
|
|
118
119
|
onAssistantStartSpeaking?: () => void
|
|
@@ -120,6 +121,7 @@ export async function startGenAiSession({
|
|
|
120
121
|
onAssistantInterruptSpeaking?: () => void
|
|
121
122
|
systemMessage?: string
|
|
122
123
|
tools?: Record<string, AITool<any, any>>
|
|
124
|
+
geminiApiKey?: string | null
|
|
123
125
|
} = {}) {
|
|
124
126
|
let session: Session | undefined = undefined
|
|
125
127
|
const callableTools: Array<CallableTool & { name: string }> = []
|
|
@@ -242,8 +244,15 @@ export async function startGenAiSession({
|
|
|
242
244
|
}
|
|
243
245
|
}
|
|
244
246
|
|
|
247
|
+
const apiKey = geminiApiKey || process.env.GEMINI_API_KEY
|
|
248
|
+
|
|
249
|
+
if (!apiKey) {
|
|
250
|
+
genaiLogger.error('No Gemini API key provided')
|
|
251
|
+
throw new Error('Gemini API key is required for voice interactions')
|
|
252
|
+
}
|
|
253
|
+
|
|
245
254
|
const ai = new GoogleGenAI({
|
|
246
|
-
apiKey
|
|
255
|
+
apiKey,
|
|
247
256
|
})
|
|
248
257
|
|
|
249
258
|
const model = 'models/gemini-2.5-flash-live-preview'
|
package/src/voice.ts
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { experimental_transcribe as transcribe } from 'ai'
|
|
1
|
+
import { GoogleGenAI } from '@google/genai'
|
|
3
2
|
import { createLogger } from './logger.js'
|
|
4
3
|
|
|
5
4
|
const voiceLogger = createLogger('VOICE')
|
|
@@ -9,30 +8,70 @@ export async function transcribeAudio({
|
|
|
9
8
|
prompt,
|
|
10
9
|
language,
|
|
11
10
|
temperature,
|
|
11
|
+
geminiApiKey,
|
|
12
12
|
}: {
|
|
13
13
|
audio: Buffer | Uint8Array | ArrayBuffer | string
|
|
14
14
|
prompt?: string
|
|
15
15
|
language?: string
|
|
16
16
|
temperature?: number
|
|
17
|
+
geminiApiKey?: string
|
|
17
18
|
}): Promise<string> {
|
|
18
19
|
try {
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
20
|
+
// Use provided API key or fall back to environment variable
|
|
21
|
+
const apiKey = geminiApiKey || process.env.GEMINI_API_KEY
|
|
22
|
+
|
|
23
|
+
if (!apiKey) {
|
|
24
|
+
throw new Error('Gemini API key is required for audio transcription')
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
// Initialize Google Generative AI
|
|
28
|
+
const genAI = new GoogleGenAI({ apiKey })
|
|
29
|
+
|
|
30
|
+
// Convert audio to base64 string if it's not already
|
|
31
|
+
let audioBase64: string
|
|
32
|
+
if (typeof audio === 'string') {
|
|
33
|
+
audioBase64 = audio
|
|
34
|
+
} else if (audio instanceof Buffer) {
|
|
35
|
+
audioBase64 = audio.toString('base64')
|
|
36
|
+
} else if (audio instanceof Uint8Array) {
|
|
37
|
+
audioBase64 = Buffer.from(audio).toString('base64')
|
|
38
|
+
} else if (audio instanceof ArrayBuffer) {
|
|
39
|
+
audioBase64 = Buffer.from(audio).toString('base64')
|
|
40
|
+
} else {
|
|
41
|
+
throw new Error('Invalid audio format')
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
// Build the transcription prompt
|
|
45
|
+
let transcriptionPrompt = `Please transcribe this audio file accurately. Here is some relevant information and filenames that may be present in the audio:\n<context>\n${prompt}\n</context>\n`
|
|
46
|
+
if (language) {
|
|
47
|
+
transcriptionPrompt += `\nThe audio is in ${language}.`
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
// Create the content with audio using the inline data format
|
|
51
|
+
const response = await genAI.models.generateContent({
|
|
52
|
+
model: 'gemini-2.5-flash',
|
|
53
|
+
contents: [
|
|
54
|
+
{
|
|
55
|
+
parts: [
|
|
56
|
+
{ text: transcriptionPrompt },
|
|
57
|
+
{
|
|
58
|
+
inlineData: {
|
|
59
|
+
data: audioBase64,
|
|
60
|
+
mimeType: 'audio/mpeg',
|
|
29
61
|
},
|
|
30
62
|
},
|
|
31
|
-
|
|
32
|
-
|
|
63
|
+
],
|
|
64
|
+
},
|
|
65
|
+
],
|
|
66
|
+
config:
|
|
67
|
+
temperature !== undefined
|
|
68
|
+
? {
|
|
69
|
+
temperature,
|
|
70
|
+
}
|
|
71
|
+
: undefined,
|
|
33
72
|
})
|
|
34
73
|
|
|
35
|
-
return
|
|
74
|
+
return response.text || ''
|
|
36
75
|
} catch (error) {
|
|
37
76
|
voiceLogger.error('Failed to transcribe audio:', error)
|
|
38
77
|
throw new Error(
|