kimaki 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +49 -17
- package/dist/discordBot.js +33 -4
- package/dist/genai-worker-wrapper.js +2 -0
- package/dist/genai-worker.js +1 -0
- package/dist/genai.js +7 -2
- package/dist/utils.js +10 -21
- package/dist/voice.js +51 -16
- package/package.json +2 -2
- package/src/cli.ts +71 -20
- package/src/discordBot.ts +40 -5
- package/src/genai-worker-wrapper.ts +4 -0
- package/src/genai-worker.ts +1 -0
- package/src/genai.ts +10 -1
- package/src/utils.ts +10 -21
- package/src/voice.ts +54 -15
- package/src/worker-types.ts +2 -0
package/dist/cli.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { cac } from 'cac';
|
|
3
|
-
import { intro, outro, text, password, note, cancel, isCancel, log, multiselect, spinner, } from '@clack/prompts';
|
|
4
|
-
import { generateBotInstallUrl } from './utils.js';
|
|
3
|
+
import { intro, outro, text, password, note, cancel, isCancel, confirm, log, multiselect, spinner, } from '@clack/prompts';
|
|
4
|
+
import { deduplicateByKey, generateBotInstallUrl } from './utils.js';
|
|
5
5
|
import { getChannelsWithDescriptions, createDiscordClient, getDatabase, startDiscordBot, initializeOpencodeForDirectory, } from './discordBot.js';
|
|
6
6
|
import { Events, ChannelType, REST, Routes, SlashCommandBuilder, } from 'discord.js';
|
|
7
7
|
import path from 'node:path';
|
|
@@ -74,7 +74,6 @@ async function ensureKimakiCategory(guild) {
|
|
|
74
74
|
}
|
|
75
75
|
async function run({ restart, addChannels }) {
|
|
76
76
|
const forceSetup = Boolean(restart);
|
|
77
|
-
const shouldAddChannels = Boolean(addChannels);
|
|
78
77
|
intro('🤖 Discord Bot Setup');
|
|
79
78
|
const db = getDatabase();
|
|
80
79
|
let appId;
|
|
@@ -82,6 +81,7 @@ async function run({ restart, addChannels }) {
|
|
|
82
81
|
const existingBot = db
|
|
83
82
|
.prepare('SELECT app_id, token FROM bot_tokens ORDER BY created_at DESC LIMIT 1')
|
|
84
83
|
.get();
|
|
84
|
+
const shouldAddChannels = !existingBot?.token || forceSetup || Boolean(addChannels);
|
|
85
85
|
if (existingBot && !forceSetup) {
|
|
86
86
|
appId = existingBot.app_id;
|
|
87
87
|
token = existingBot.token;
|
|
@@ -112,10 +112,24 @@ async function run({ restart, addChannels }) {
|
|
|
112
112
|
}
|
|
113
113
|
appId = appIdInput;
|
|
114
114
|
note('1. Go to the "Bot" section in the left sidebar\n' +
|
|
115
|
-
'2.
|
|
116
|
-
|
|
115
|
+
'2. Scroll down to "Privileged Gateway Intents"\n' +
|
|
116
|
+
'3. Enable these intents by toggling them ON:\n' +
|
|
117
|
+
' • SERVER MEMBERS INTENT\n' +
|
|
118
|
+
' • MESSAGE CONTENT INTENT\n' +
|
|
119
|
+
'4. Click "Save Changes" at the bottom', 'Step 2: Enable Required Intents');
|
|
120
|
+
const intentsConfirmed = await text({
|
|
121
|
+
message: 'Press Enter after enabling both intents:',
|
|
122
|
+
placeholder: 'Enter',
|
|
123
|
+
});
|
|
124
|
+
if (isCancel(intentsConfirmed)) {
|
|
125
|
+
cancel('Setup cancelled');
|
|
126
|
+
process.exit(0);
|
|
127
|
+
}
|
|
128
|
+
note('1. Still in the "Bot" section\n' +
|
|
129
|
+
'2. Click "Reset Token" to generate a new bot token (in case of errors try again)\n' +
|
|
130
|
+
"3. Copy the token (you won't be able to see it again!)", 'Step 3: Get Bot Token');
|
|
117
131
|
const tokenInput = await password({
|
|
118
|
-
message: 'Enter your Discord Bot Token (
|
|
132
|
+
message: 'Enter your Discord Bot Token (from "Bot" section - click "Reset Token" if needed):',
|
|
119
133
|
validate(value) {
|
|
120
134
|
if (!value)
|
|
121
135
|
return 'Bot token is required';
|
|
@@ -128,16 +142,29 @@ async function run({ restart, addChannels }) {
|
|
|
128
142
|
process.exit(0);
|
|
129
143
|
}
|
|
130
144
|
token = tokenInput;
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
validate() {
|
|
145
|
+
note(`You can get a Gemini api Key at https://aistudio.google.com/apikey`, `Gemini API Key`);
|
|
146
|
+
const geminiApiKey = await password({
|
|
147
|
+
message: 'Enter your Gemini API Key for voice channels and audio transcription (optional, press Enter to skip):',
|
|
148
|
+
validate(value) {
|
|
149
|
+
if (value && value.length < 10)
|
|
150
|
+
return 'Invalid API key format';
|
|
138
151
|
return undefined;
|
|
139
152
|
},
|
|
140
153
|
});
|
|
154
|
+
if (isCancel(geminiApiKey)) {
|
|
155
|
+
cancel('Setup cancelled');
|
|
156
|
+
process.exit(0);
|
|
157
|
+
}
|
|
158
|
+
// Store API key in database
|
|
159
|
+
if (geminiApiKey) {
|
|
160
|
+
db.prepare('INSERT OR REPLACE INTO bot_api_keys (app_id, gemini_api_key) VALUES (?, ?)').run(appId, geminiApiKey || null);
|
|
161
|
+
note('API key saved successfully', 'API Key Stored');
|
|
162
|
+
}
|
|
163
|
+
note(`Bot install URL:\n${generateBotInstallUrl({ clientId: appId })}\n\nYou MUST install the bot in your Discord server before continuing.`, 'Step 4: Install Bot to Server');
|
|
164
|
+
const installed = await text({
|
|
165
|
+
message: 'Press Enter AFTER you have installed the bot in your server:',
|
|
166
|
+
placeholder: 'Enter',
|
|
167
|
+
});
|
|
141
168
|
if (isCancel(installed)) {
|
|
142
169
|
cancel('Setup cancelled');
|
|
143
170
|
process.exit(0);
|
|
@@ -172,6 +199,7 @@ async function run({ restart, addChannels }) {
|
|
|
172
199
|
cliLogger.error('Error: ' + (error instanceof Error ? error.message : String(error)));
|
|
173
200
|
process.exit(EXIT_NO_RESTART);
|
|
174
201
|
}
|
|
202
|
+
db.prepare('INSERT OR REPLACE INTO bot_tokens (app_id, token) VALUES (?, ?)').run(appId, token);
|
|
175
203
|
for (const { guild, channels } of kimakiChannels) {
|
|
176
204
|
for (const channel of channels) {
|
|
177
205
|
if (channel.kimakiDirectory) {
|
|
@@ -216,12 +244,16 @@ async function run({ restart, addChannels }) {
|
|
|
216
244
|
discordClient.destroy();
|
|
217
245
|
process.exit(EXIT_NO_RESTART);
|
|
218
246
|
}
|
|
219
|
-
const existingDirs = kimakiChannels.flatMap(({ channels }) => channels
|
|
220
|
-
|
|
247
|
+
const existingDirs = kimakiChannels.flatMap(({ channels }) => channels
|
|
248
|
+
.filter((ch) => ch.kimakiDirectory && ch.kimakiApp === appId)
|
|
249
|
+
.map((ch) => ch.kimakiDirectory)
|
|
250
|
+
.filter(Boolean));
|
|
251
|
+
const availableProjects = deduplicateByKey(projects.filter((project) => !existingDirs.includes(project.worktree)), (x) => x.worktree);
|
|
221
252
|
if (availableProjects.length === 0) {
|
|
222
253
|
note('All OpenCode projects already have Discord channels', 'No New Projects');
|
|
223
254
|
}
|
|
224
|
-
if (
|
|
255
|
+
if ((!existingDirs?.length && availableProjects.length > 0) ||
|
|
256
|
+
shouldAddChannels) {
|
|
225
257
|
const selectedProjects = await multiselect({
|
|
226
258
|
message: 'Select projects to create Discord channels for:',
|
|
227
259
|
options: availableProjects.map((project) => ({
|
|
@@ -262,7 +294,7 @@ async function run({ restart, addChannels }) {
|
|
|
262
294
|
if (!project)
|
|
263
295
|
continue;
|
|
264
296
|
const baseName = path.basename(project.worktree);
|
|
265
|
-
const channelName =
|
|
297
|
+
const channelName = `${baseName}`
|
|
266
298
|
.toLowerCase()
|
|
267
299
|
.replace(/[^a-z0-9-]/g, '-')
|
|
268
300
|
.slice(0, 100);
|
package/dist/discordBot.js
CHANGED
|
@@ -78,7 +78,7 @@ async function createUserAudioLogStream(guildId, channelId) {
|
|
|
78
78
|
}
|
|
79
79
|
}
|
|
80
80
|
// Set up voice handling for a connection (called once per connection)
|
|
81
|
-
async function setupVoiceHandling({ connection, guildId, channelId, }) {
|
|
81
|
+
async function setupVoiceHandling({ connection, guildId, channelId, appId, }) {
|
|
82
82
|
voiceLogger.log(`Setting up voice handling for guild ${guildId}, channel ${channelId}`);
|
|
83
83
|
// Check if this voice channel has an associated directory
|
|
84
84
|
const channelDirRow = getDatabase()
|
|
@@ -98,11 +98,17 @@ async function setupVoiceHandling({ connection, guildId, channelId, }) {
|
|
|
98
98
|
}
|
|
99
99
|
// Create user audio stream for debugging
|
|
100
100
|
voiceData.userAudioStream = await createUserAudioLogStream(guildId, channelId);
|
|
101
|
+
// Get API keys from database
|
|
102
|
+
const apiKeys = getDatabase()
|
|
103
|
+
.prepare('SELECT gemini_api_key FROM bot_api_keys WHERE app_id = ?')
|
|
104
|
+
.get(appId);
|
|
101
105
|
// Create GenAI worker
|
|
102
106
|
const genAiWorker = await createGenAIWorker({
|
|
103
107
|
directory,
|
|
104
108
|
guildId,
|
|
105
109
|
channelId,
|
|
110
|
+
appId,
|
|
111
|
+
geminiApiKey: apiKeys?.gemini_api_key,
|
|
106
112
|
systemMessage: dedent `
|
|
107
113
|
You are Kimaki, an AI similar to Jarvis: you help your user (an engineer) controlling his coding agent, just like Jarvis controls Ironman armor and machines. Speak fast.
|
|
108
114
|
|
|
@@ -227,14 +233,16 @@ async function setupVoiceHandling({ connection, guildId, channelId, }) {
|
|
|
227
233
|
.on('data', (frame) => {
|
|
228
234
|
// Check if a newer speaking session has started
|
|
229
235
|
if (currentSessionCount !== speakingSessionCount) {
|
|
230
|
-
voiceLogger.log(
|
|
236
|
+
// voiceLogger.log(
|
|
237
|
+
// `Skipping audio frame from session ${currentSessionCount} because newer session ${speakingSessionCount} has started`,
|
|
238
|
+
// )
|
|
231
239
|
return;
|
|
232
240
|
}
|
|
233
241
|
if (!voiceData.genAiWorker) {
|
|
234
242
|
voiceLogger.warn(`[VOICE] Received audio frame but no GenAI worker active for guild ${guildId}`);
|
|
235
243
|
return;
|
|
236
244
|
}
|
|
237
|
-
voiceLogger.debug('User audio chunk length', frame.length)
|
|
245
|
+
// voiceLogger.debug('User audio chunk length', frame.length)
|
|
238
246
|
// Write to PCM file if stream exists
|
|
239
247
|
voiceData.userAudioStream?.write(frame);
|
|
240
248
|
// stream incrementally — low latency
|
|
@@ -345,6 +353,13 @@ export function getDatabase() {
|
|
|
345
353
|
channel_type TEXT NOT NULL,
|
|
346
354
|
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
|
347
355
|
)
|
|
356
|
+
`);
|
|
357
|
+
db.exec(`
|
|
358
|
+
CREATE TABLE IF NOT EXISTS bot_api_keys (
|
|
359
|
+
app_id TEXT PRIMARY KEY,
|
|
360
|
+
gemini_api_key TEXT,
|
|
361
|
+
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
|
362
|
+
)
|
|
348
363
|
`);
|
|
349
364
|
}
|
|
350
365
|
return db;
|
|
@@ -456,7 +471,7 @@ async function waitForServer(port, maxAttempts = 30) {
|
|
|
456
471
|
}
|
|
457
472
|
throw new Error(`Server did not start on port ${port} after ${maxAttempts} seconds`);
|
|
458
473
|
}
|
|
459
|
-
async function processVoiceAttachment({ message, thread, projectDirectory, isNewThread = false, }) {
|
|
474
|
+
async function processVoiceAttachment({ message, thread, projectDirectory, isNewThread = false, appId, }) {
|
|
460
475
|
const audioAttachment = Array.from(message.attachments.values()).find((attachment) => attachment.contentType?.startsWith('audio/'));
|
|
461
476
|
if (!audioAttachment)
|
|
462
477
|
return null;
|
|
@@ -486,9 +501,20 @@ async function processVoiceAttachment({ message, thread, projectDirectory, isNew
|
|
|
486
501
|
voiceLogger.log(`Could not get project tree:`, e);
|
|
487
502
|
}
|
|
488
503
|
}
|
|
504
|
+
// Get Gemini API key from database if appId is provided
|
|
505
|
+
let geminiApiKey;
|
|
506
|
+
if (appId) {
|
|
507
|
+
const apiKeys = getDatabase()
|
|
508
|
+
.prepare('SELECT gemini_api_key FROM bot_api_keys WHERE app_id = ?')
|
|
509
|
+
.get(appId);
|
|
510
|
+
if (apiKeys?.gemini_api_key) {
|
|
511
|
+
geminiApiKey = apiKeys.gemini_api_key;
|
|
512
|
+
}
|
|
513
|
+
}
|
|
489
514
|
const transcription = await transcribeAudio({
|
|
490
515
|
audio: audioBuffer,
|
|
491
516
|
prompt: transcriptionPrompt,
|
|
517
|
+
geminiApiKey,
|
|
492
518
|
});
|
|
493
519
|
voiceLogger.log(`Transcription successful: "${transcription.slice(0, 50)}${transcription.length > 50 ? '...' : ''}"`);
|
|
494
520
|
// Update thread name with transcribed content only for new threads
|
|
@@ -1231,6 +1257,7 @@ export async function startDiscordBot({ token, appId, discordClient, }) {
|
|
|
1231
1257
|
message,
|
|
1232
1258
|
thread,
|
|
1233
1259
|
projectDirectory,
|
|
1260
|
+
appId: currentAppId,
|
|
1234
1261
|
});
|
|
1235
1262
|
if (transcription) {
|
|
1236
1263
|
messageContent = transcription;
|
|
@@ -1289,6 +1316,7 @@ export async function startDiscordBot({ token, appId, discordClient, }) {
|
|
|
1289
1316
|
thread,
|
|
1290
1317
|
projectDirectory,
|
|
1291
1318
|
isNewThread: true,
|
|
1319
|
+
appId: currentAppId,
|
|
1292
1320
|
});
|
|
1293
1321
|
if (transcription) {
|
|
1294
1322
|
messageContent = transcription;
|
|
@@ -1649,6 +1677,7 @@ export async function startDiscordBot({ token, appId, discordClient, }) {
|
|
|
1649
1677
|
connection,
|
|
1650
1678
|
guildId: newState.guild.id,
|
|
1651
1679
|
channelId: voiceChannel.id,
|
|
1680
|
+
appId: currentAppId,
|
|
1652
1681
|
});
|
|
1653
1682
|
// Handle connection state changes
|
|
1654
1683
|
connection.on(VoiceConnectionStatus.Disconnected, async () => {
|
|
@@ -98,6 +98,8 @@ export function createGenAIWorker(options) {
|
|
|
98
98
|
systemMessage: options.systemMessage,
|
|
99
99
|
guildId: options.guildId,
|
|
100
100
|
channelId: options.channelId,
|
|
101
|
+
appId: options.appId,
|
|
102
|
+
geminiApiKey: options.geminiApiKey,
|
|
101
103
|
};
|
|
102
104
|
worker.postMessage(initMessage);
|
|
103
105
|
});
|
package/dist/genai-worker.js
CHANGED
|
@@ -210,6 +210,7 @@ parentPort.on('message', async (message) => {
|
|
|
210
210
|
session = await startGenAiSession({
|
|
211
211
|
tools,
|
|
212
212
|
systemMessage: message.systemMessage,
|
|
213
|
+
geminiApiKey: message.geminiApiKey,
|
|
213
214
|
onAssistantAudioChunk({ data }) {
|
|
214
215
|
// Write to audio log if enabled
|
|
215
216
|
if (audioLogStream && !audioLogStream.destroyed) {
|
package/dist/genai.js
CHANGED
|
@@ -68,7 +68,7 @@ function defaultAudioChunkHandler({ data, mimeType, }) {
|
|
|
68
68
|
const buffer = convertToWav(audioParts, mimeType);
|
|
69
69
|
saveBinaryFile(fileName, buffer);
|
|
70
70
|
}
|
|
71
|
-
export async function startGenAiSession({ onAssistantAudioChunk, onAssistantStartSpeaking, onAssistantStopSpeaking, onAssistantInterruptSpeaking, systemMessage, tools, } = {}) {
|
|
71
|
+
export async function startGenAiSession({ onAssistantAudioChunk, onAssistantStartSpeaking, onAssistantStopSpeaking, onAssistantInterruptSpeaking, systemMessage, tools, geminiApiKey, } = {}) {
|
|
72
72
|
let session = undefined;
|
|
73
73
|
const callableTools = [];
|
|
74
74
|
let isAssistantSpeaking = false;
|
|
@@ -161,8 +161,13 @@ export async function startGenAiSession({ onAssistantAudioChunk, onAssistantStar
|
|
|
161
161
|
}
|
|
162
162
|
}
|
|
163
163
|
}
|
|
164
|
+
const apiKey = geminiApiKey || process.env.GEMINI_API_KEY;
|
|
165
|
+
if (!apiKey) {
|
|
166
|
+
genaiLogger.error('No Gemini API key provided');
|
|
167
|
+
throw new Error('Gemini API key is required for voice interactions');
|
|
168
|
+
}
|
|
164
169
|
const ai = new GoogleGenAI({
|
|
165
|
-
apiKey
|
|
170
|
+
apiKey,
|
|
166
171
|
});
|
|
167
172
|
const model = 'models/gemini-2.5-flash-live-preview';
|
|
168
173
|
session = await ai.live.connect({
|
package/dist/utils.js
CHANGED
|
@@ -28,25 +28,14 @@ export function generateBotInstallUrl({ clientId, permissions = [
|
|
|
28
28
|
}
|
|
29
29
|
return url.toString();
|
|
30
30
|
}
|
|
31
|
-
function
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
PermissionsBitField.Flags.ManageMessages,
|
|
42
|
-
PermissionsBitField.Flags.UseExternalEmojis,
|
|
43
|
-
PermissionsBitField.Flags.AttachFiles,
|
|
44
|
-
PermissionsBitField.Flags.Connect,
|
|
45
|
-
PermissionsBitField.Flags.Speak,
|
|
46
|
-
];
|
|
47
|
-
}
|
|
48
|
-
function getPermissionNames() {
|
|
49
|
-
const permissions = getRequiredBotPermissions();
|
|
50
|
-
const permissionsBitField = new PermissionsBitField(permissions);
|
|
51
|
-
return permissionsBitField.toArray();
|
|
31
|
+
export function deduplicateByKey(arr, keyFn) {
|
|
32
|
+
const seen = new Set();
|
|
33
|
+
return arr.filter(item => {
|
|
34
|
+
const key = keyFn(item);
|
|
35
|
+
if (seen.has(key)) {
|
|
36
|
+
return false;
|
|
37
|
+
}
|
|
38
|
+
seen.add(key);
|
|
39
|
+
return true;
|
|
40
|
+
});
|
|
52
41
|
}
|
package/dist/voice.js
CHANGED
|
@@ -1,25 +1,60 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { experimental_transcribe as transcribe } from 'ai';
|
|
1
|
+
import { GoogleGenAI } from '@google/genai';
|
|
3
2
|
import { createLogger } from './logger.js';
|
|
4
3
|
const voiceLogger = createLogger('VOICE');
|
|
5
|
-
export async function transcribeAudio({ audio, prompt, language, temperature, }) {
|
|
4
|
+
export async function transcribeAudio({ audio, prompt, language, temperature, geminiApiKey, }) {
|
|
6
5
|
try {
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
6
|
+
// Use provided API key or fall back to environment variable
|
|
7
|
+
const apiKey = geminiApiKey || process.env.GEMINI_API_KEY;
|
|
8
|
+
if (!apiKey) {
|
|
9
|
+
throw new Error('Gemini API key is required for audio transcription');
|
|
10
|
+
}
|
|
11
|
+
// Initialize Google Generative AI
|
|
12
|
+
const genAI = new GoogleGenAI({ apiKey });
|
|
13
|
+
// Convert audio to base64 string if it's not already
|
|
14
|
+
let audioBase64;
|
|
15
|
+
if (typeof audio === 'string') {
|
|
16
|
+
audioBase64 = audio;
|
|
17
|
+
}
|
|
18
|
+
else if (audio instanceof Buffer) {
|
|
19
|
+
audioBase64 = audio.toString('base64');
|
|
20
|
+
}
|
|
21
|
+
else if (audio instanceof Uint8Array) {
|
|
22
|
+
audioBase64 = Buffer.from(audio).toString('base64');
|
|
23
|
+
}
|
|
24
|
+
else if (audio instanceof ArrayBuffer) {
|
|
25
|
+
audioBase64 = Buffer.from(audio).toString('base64');
|
|
26
|
+
}
|
|
27
|
+
else {
|
|
28
|
+
throw new Error('Invalid audio format');
|
|
29
|
+
}
|
|
30
|
+
// Build the transcription prompt
|
|
31
|
+
let transcriptionPrompt = `Please transcribe this audio file accurately. Here is some relevant information and filenames that may be present in the audio:\n<context>\n${prompt}\n</context>\n`;
|
|
32
|
+
if (language) {
|
|
33
|
+
transcriptionPrompt += `\nThe audio is in ${language}.`;
|
|
34
|
+
}
|
|
35
|
+
// Create the content with audio using the inline data format
|
|
36
|
+
const response = await genAI.models.generateContent({
|
|
37
|
+
model: 'gemini-2.5-flash',
|
|
38
|
+
contents: [
|
|
39
|
+
{
|
|
40
|
+
parts: [
|
|
41
|
+
{ text: transcriptionPrompt },
|
|
42
|
+
{
|
|
43
|
+
inlineData: {
|
|
44
|
+
data: audioBase64,
|
|
45
|
+
mimeType: 'audio/mpeg',
|
|
46
|
+
},
|
|
17
47
|
},
|
|
18
|
-
|
|
48
|
+
],
|
|
49
|
+
},
|
|
50
|
+
],
|
|
51
|
+
config: temperature !== undefined
|
|
52
|
+
? {
|
|
53
|
+
temperature,
|
|
19
54
|
}
|
|
20
|
-
:
|
|
55
|
+
: undefined,
|
|
21
56
|
});
|
|
22
|
-
return
|
|
57
|
+
return response.text || '';
|
|
23
58
|
}
|
|
24
59
|
catch (error) {
|
|
25
60
|
voiceLogger.error('Failed to transcribe audio:', error);
|
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "kimaki",
|
|
3
3
|
"module": "index.ts",
|
|
4
4
|
"type": "module",
|
|
5
|
-
"version": "0.1.
|
|
5
|
+
"version": "0.1.4",
|
|
6
6
|
"repository": "https://github.com/remorses/kimaki",
|
|
7
7
|
"bin": "bin.js",
|
|
8
8
|
"files": [
|
|
@@ -19,7 +19,7 @@
|
|
|
19
19
|
"tsx": "^4.20.5"
|
|
20
20
|
},
|
|
21
21
|
"dependencies": {
|
|
22
|
-
"@ai-sdk/
|
|
22
|
+
"@ai-sdk/google": "^2.0.16",
|
|
23
23
|
"@clack/prompts": "^0.11.0",
|
|
24
24
|
"@discordjs/opus": "^0.10.0",
|
|
25
25
|
"@discordjs/voice": "^0.19.0",
|
package/src/cli.ts
CHANGED
|
@@ -8,11 +8,12 @@ import {
|
|
|
8
8
|
note,
|
|
9
9
|
cancel,
|
|
10
10
|
isCancel,
|
|
11
|
+
confirm,
|
|
11
12
|
log,
|
|
12
13
|
multiselect,
|
|
13
14
|
spinner,
|
|
14
15
|
} from '@clack/prompts'
|
|
15
|
-
import { generateBotInstallUrl } from './utils.js'
|
|
16
|
+
import { deduplicateByKey, generateBotInstallUrl } from './utils.js'
|
|
16
17
|
import {
|
|
17
18
|
getChannelsWithDescriptions,
|
|
18
19
|
createDiscordClient,
|
|
@@ -138,7 +139,6 @@ async function ensureKimakiCategory(guild: Guild): Promise<CategoryChannel> {
|
|
|
138
139
|
|
|
139
140
|
async function run({ restart, addChannels }: CliOptions) {
|
|
140
141
|
const forceSetup = Boolean(restart)
|
|
141
|
-
const shouldAddChannels = Boolean(addChannels)
|
|
142
142
|
|
|
143
143
|
intro('🤖 Discord Bot Setup')
|
|
144
144
|
|
|
@@ -152,6 +152,9 @@ async function run({ restart, addChannels }: CliOptions) {
|
|
|
152
152
|
)
|
|
153
153
|
.get() as { app_id: string; token: string } | undefined
|
|
154
154
|
|
|
155
|
+
const shouldAddChannels =
|
|
156
|
+
!existingBot?.token || forceSetup || Boolean(addChannels)
|
|
157
|
+
|
|
155
158
|
if (existingBot && !forceSetup) {
|
|
156
159
|
appId = existingBot.app_id
|
|
157
160
|
token = existingBot.token
|
|
@@ -196,13 +199,33 @@ async function run({ restart, addChannels }: CliOptions) {
|
|
|
196
199
|
|
|
197
200
|
note(
|
|
198
201
|
'1. Go to the "Bot" section in the left sidebar\n' +
|
|
199
|
-
'2.
|
|
200
|
-
|
|
201
|
-
|
|
202
|
+
'2. Scroll down to "Privileged Gateway Intents"\n' +
|
|
203
|
+
'3. Enable these intents by toggling them ON:\n' +
|
|
204
|
+
' • SERVER MEMBERS INTENT\n' +
|
|
205
|
+
' • MESSAGE CONTENT INTENT\n' +
|
|
206
|
+
'4. Click "Save Changes" at the bottom',
|
|
207
|
+
'Step 2: Enable Required Intents',
|
|
202
208
|
)
|
|
203
209
|
|
|
210
|
+
const intentsConfirmed = await text({
|
|
211
|
+
message: 'Press Enter after enabling both intents:',
|
|
212
|
+
placeholder: 'Enter',
|
|
213
|
+
})
|
|
214
|
+
|
|
215
|
+
if (isCancel(intentsConfirmed)) {
|
|
216
|
+
cancel('Setup cancelled')
|
|
217
|
+
process.exit(0)
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
note(
|
|
221
|
+
'1. Still in the "Bot" section\n' +
|
|
222
|
+
'2. Click "Reset Token" to generate a new bot token (in case of errors try again)\n' +
|
|
223
|
+
"3. Copy the token (you won't be able to see it again!)",
|
|
224
|
+
'Step 3: Get Bot Token',
|
|
225
|
+
)
|
|
204
226
|
const tokenInput = await password({
|
|
205
|
-
message:
|
|
227
|
+
message:
|
|
228
|
+
'Enter your Discord Bot Token (from "Bot" section - click "Reset Token" if needed):',
|
|
206
229
|
validate(value) {
|
|
207
230
|
if (!value) return 'Bot token is required'
|
|
208
231
|
if (value.length < 50) return 'Invalid token format (too short)'
|
|
@@ -215,23 +238,41 @@ async function run({ restart, addChannels }: CliOptions) {
|
|
|
215
238
|
}
|
|
216
239
|
token = tokenInput
|
|
217
240
|
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
241
|
+
note(
|
|
242
|
+
`You can get a Gemini api Key at https://aistudio.google.com/apikey`,
|
|
243
|
+
`Gemini API Key`,
|
|
244
|
+
)
|
|
221
245
|
|
|
222
|
-
|
|
246
|
+
const geminiApiKey = await password({
|
|
247
|
+
message:
|
|
248
|
+
'Enter your Gemini API Key for voice channels and audio transcription (optional, press Enter to skip):',
|
|
249
|
+
validate(value) {
|
|
250
|
+
if (value && value.length < 10) return 'Invalid API key format'
|
|
251
|
+
return undefined
|
|
252
|
+
},
|
|
253
|
+
})
|
|
254
|
+
|
|
255
|
+
if (isCancel(geminiApiKey)) {
|
|
256
|
+
cancel('Setup cancelled')
|
|
257
|
+
process.exit(0)
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
// Store API key in database
|
|
261
|
+
if (geminiApiKey) {
|
|
262
|
+
db.prepare(
|
|
263
|
+
'INSERT OR REPLACE INTO bot_api_keys (app_id, gemini_api_key) VALUES (?, ?)',
|
|
264
|
+
).run(appId, geminiApiKey || null)
|
|
265
|
+
note('API key saved successfully', 'API Key Stored')
|
|
266
|
+
}
|
|
223
267
|
|
|
224
268
|
note(
|
|
225
269
|
`Bot install URL:\n${generateBotInstallUrl({ clientId: appId })}\n\nYou MUST install the bot in your Discord server before continuing.`,
|
|
226
|
-
'Step
|
|
270
|
+
'Step 4: Install Bot to Server',
|
|
227
271
|
)
|
|
228
272
|
|
|
229
273
|
const installed = await text({
|
|
230
274
|
message: 'Press Enter AFTER you have installed the bot in your server:',
|
|
231
|
-
placeholder: '
|
|
232
|
-
validate() {
|
|
233
|
-
return undefined
|
|
234
|
-
},
|
|
275
|
+
placeholder: 'Enter',
|
|
235
276
|
})
|
|
236
277
|
|
|
237
278
|
if (isCancel(installed)) {
|
|
@@ -282,6 +323,9 @@ async function run({ restart, addChannels }: CliOptions) {
|
|
|
282
323
|
)
|
|
283
324
|
process.exit(EXIT_NO_RESTART)
|
|
284
325
|
}
|
|
326
|
+
db.prepare(
|
|
327
|
+
'INSERT OR REPLACE INTO bot_tokens (app_id, token) VALUES (?, ?)',
|
|
328
|
+
).run(appId, token)
|
|
285
329
|
|
|
286
330
|
for (const { guild, channels } of kimakiChannels) {
|
|
287
331
|
for (const channel of channels) {
|
|
@@ -350,11 +394,15 @@ async function run({ restart, addChannels }: CliOptions) {
|
|
|
350
394
|
}
|
|
351
395
|
|
|
352
396
|
const existingDirs = kimakiChannels.flatMap(({ channels }) =>
|
|
353
|
-
channels
|
|
397
|
+
channels
|
|
398
|
+
.filter((ch) => ch.kimakiDirectory && ch.kimakiApp === appId)
|
|
399
|
+
.map((ch) => ch.kimakiDirectory)
|
|
400
|
+
.filter(Boolean),
|
|
354
401
|
)
|
|
355
402
|
|
|
356
|
-
const availableProjects =
|
|
357
|
-
(project) => !existingDirs.includes(project.worktree),
|
|
403
|
+
const availableProjects = deduplicateByKey(
|
|
404
|
+
projects.filter((project) => !existingDirs.includes(project.worktree)),
|
|
405
|
+
(x) => x.worktree,
|
|
358
406
|
)
|
|
359
407
|
|
|
360
408
|
if (availableProjects.length === 0) {
|
|
@@ -364,7 +412,10 @@ async function run({ restart, addChannels }: CliOptions) {
|
|
|
364
412
|
)
|
|
365
413
|
}
|
|
366
414
|
|
|
367
|
-
if (
|
|
415
|
+
if (
|
|
416
|
+
(!existingDirs?.length && availableProjects.length > 0) ||
|
|
417
|
+
shouldAddChannels
|
|
418
|
+
) {
|
|
368
419
|
const selectedProjects = await multiselect({
|
|
369
420
|
message: 'Select projects to create Discord channels for:',
|
|
370
421
|
options: availableProjects.map((project) => ({
|
|
@@ -410,7 +461,7 @@ async function run({ restart, addChannels }: CliOptions) {
|
|
|
410
461
|
if (!project) continue
|
|
411
462
|
|
|
412
463
|
const baseName = path.basename(project.worktree)
|
|
413
|
-
const channelName =
|
|
464
|
+
const channelName = `${baseName}`
|
|
414
465
|
.toLowerCase()
|
|
415
466
|
.replace(/[^a-z0-9-]/g, '-')
|
|
416
467
|
.slice(0, 100)
|
package/src/discordBot.ts
CHANGED
|
@@ -152,10 +152,12 @@ async function setupVoiceHandling({
|
|
|
152
152
|
connection,
|
|
153
153
|
guildId,
|
|
154
154
|
channelId,
|
|
155
|
+
appId,
|
|
155
156
|
}: {
|
|
156
157
|
connection: VoiceConnection
|
|
157
158
|
guildId: string
|
|
158
159
|
channelId: string
|
|
160
|
+
appId: string
|
|
159
161
|
}) {
|
|
160
162
|
voiceLogger.log(
|
|
161
163
|
`Setting up voice handling for guild ${guildId}, channel ${channelId}`,
|
|
@@ -188,11 +190,18 @@ async function setupVoiceHandling({
|
|
|
188
190
|
// Create user audio stream for debugging
|
|
189
191
|
voiceData.userAudioStream = await createUserAudioLogStream(guildId, channelId)
|
|
190
192
|
|
|
193
|
+
// Get API keys from database
|
|
194
|
+
const apiKeys = getDatabase()
|
|
195
|
+
.prepare('SELECT gemini_api_key FROM bot_api_keys WHERE app_id = ?')
|
|
196
|
+
.get(appId) as { gemini_api_key: string | null } | undefined
|
|
197
|
+
|
|
191
198
|
// Create GenAI worker
|
|
192
199
|
const genAiWorker = await createGenAIWorker({
|
|
193
200
|
directory,
|
|
194
201
|
guildId,
|
|
195
202
|
channelId,
|
|
203
|
+
appId,
|
|
204
|
+
geminiApiKey: apiKeys?.gemini_api_key,
|
|
196
205
|
systemMessage: dedent`
|
|
197
206
|
You are Kimaki, an AI similar to Jarvis: you help your user (an engineer) controlling his coding agent, just like Jarvis controls Ironman armor and machines. Speak fast.
|
|
198
207
|
|
|
@@ -334,9 +343,9 @@ async function setupVoiceHandling({
|
|
|
334
343
|
.on('data', (frame: Buffer) => {
|
|
335
344
|
// Check if a newer speaking session has started
|
|
336
345
|
if (currentSessionCount !== speakingSessionCount) {
|
|
337
|
-
voiceLogger.log(
|
|
338
|
-
|
|
339
|
-
)
|
|
346
|
+
// voiceLogger.log(
|
|
347
|
+
// `Skipping audio frame from session ${currentSessionCount} because newer session ${speakingSessionCount} has started`,
|
|
348
|
+
// )
|
|
340
349
|
return
|
|
341
350
|
}
|
|
342
351
|
|
|
@@ -346,7 +355,7 @@ async function setupVoiceHandling({
|
|
|
346
355
|
)
|
|
347
356
|
return
|
|
348
357
|
}
|
|
349
|
-
voiceLogger.debug('User audio chunk length', frame.length)
|
|
358
|
+
// voiceLogger.debug('User audio chunk length', frame.length)
|
|
350
359
|
|
|
351
360
|
// Write to PCM file if stream exists
|
|
352
361
|
voiceData.userAudioStream?.write(frame)
|
|
@@ -480,6 +489,14 @@ export function getDatabase(): Database.Database {
|
|
|
480
489
|
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
|
481
490
|
)
|
|
482
491
|
`)
|
|
492
|
+
|
|
493
|
+
db.exec(`
|
|
494
|
+
CREATE TABLE IF NOT EXISTS bot_api_keys (
|
|
495
|
+
app_id TEXT PRIMARY KEY,
|
|
496
|
+
gemini_api_key TEXT,
|
|
497
|
+
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
|
498
|
+
)
|
|
499
|
+
`)
|
|
483
500
|
}
|
|
484
501
|
|
|
485
502
|
return db
|
|
@@ -614,11 +631,13 @@ async function processVoiceAttachment({
|
|
|
614
631
|
thread,
|
|
615
632
|
projectDirectory,
|
|
616
633
|
isNewThread = false,
|
|
634
|
+
appId,
|
|
617
635
|
}: {
|
|
618
636
|
message: Message
|
|
619
637
|
thread: ThreadChannel
|
|
620
638
|
projectDirectory?: string
|
|
621
639
|
isNewThread?: boolean
|
|
640
|
+
appId?: string
|
|
622
641
|
}): Promise<string | null> {
|
|
623
642
|
const audioAttachment = Array.from(message.attachments.values()).find(
|
|
624
643
|
(attachment) => attachment.contentType?.startsWith('audio/'),
|
|
@@ -660,9 +679,22 @@ async function processVoiceAttachment({
|
|
|
660
679
|
}
|
|
661
680
|
}
|
|
662
681
|
|
|
682
|
+
// Get Gemini API key from database if appId is provided
|
|
683
|
+
let geminiApiKey: string | undefined
|
|
684
|
+
if (appId) {
|
|
685
|
+
const apiKeys = getDatabase()
|
|
686
|
+
.prepare('SELECT gemini_api_key FROM bot_api_keys WHERE app_id = ?')
|
|
687
|
+
.get(appId) as { gemini_api_key: string | null } | undefined
|
|
688
|
+
|
|
689
|
+
if (apiKeys?.gemini_api_key) {
|
|
690
|
+
geminiApiKey = apiKeys.gemini_api_key
|
|
691
|
+
}
|
|
692
|
+
}
|
|
693
|
+
|
|
663
694
|
const transcription = await transcribeAudio({
|
|
664
695
|
audio: audioBuffer,
|
|
665
696
|
prompt: transcriptionPrompt,
|
|
697
|
+
geminiApiKey,
|
|
666
698
|
})
|
|
667
699
|
|
|
668
700
|
voiceLogger.log(
|
|
@@ -1635,6 +1667,7 @@ export async function startDiscordBot({
|
|
|
1635
1667
|
message,
|
|
1636
1668
|
thread,
|
|
1637
1669
|
projectDirectory,
|
|
1670
|
+
appId: currentAppId,
|
|
1638
1671
|
})
|
|
1639
1672
|
if (transcription) {
|
|
1640
1673
|
messageContent = transcription
|
|
@@ -1727,6 +1760,7 @@ export async function startDiscordBot({
|
|
|
1727
1760
|
thread,
|
|
1728
1761
|
projectDirectory,
|
|
1729
1762
|
isNewThread: true,
|
|
1763
|
+
appId: currentAppId,
|
|
1730
1764
|
})
|
|
1731
1765
|
if (transcription) {
|
|
1732
1766
|
messageContent = transcription
|
|
@@ -1950,7 +1984,7 @@ export async function startDiscordBot({
|
|
|
1950
1984
|
return ''
|
|
1951
1985
|
})
|
|
1952
1986
|
.filter((t) => t.trim())
|
|
1953
|
-
|
|
1987
|
+
|
|
1954
1988
|
const userText = userTexts.join('\n\n')
|
|
1955
1989
|
if (userText) {
|
|
1956
1990
|
// Escape backticks in user messages to prevent formatting issues
|
|
@@ -2224,6 +2258,7 @@ export async function startDiscordBot({
|
|
|
2224
2258
|
connection,
|
|
2225
2259
|
guildId: newState.guild.id,
|
|
2226
2260
|
channelId: voiceChannel.id,
|
|
2261
|
+
appId: currentAppId!,
|
|
2227
2262
|
})
|
|
2228
2263
|
|
|
2229
2264
|
// Handle connection state changes
|
|
@@ -11,6 +11,8 @@ export interface GenAIWorkerOptions {
|
|
|
11
11
|
systemMessage?: string
|
|
12
12
|
guildId: string
|
|
13
13
|
channelId: string
|
|
14
|
+
appId: string
|
|
15
|
+
geminiApiKey?: string | null
|
|
14
16
|
onAssistantOpusPacket: (packet: ArrayBuffer) => void
|
|
15
17
|
onAssistantStartSpeaking?: () => void
|
|
16
18
|
onAssistantStopSpeaking?: () => void
|
|
@@ -146,6 +148,8 @@ export function createGenAIWorker(
|
|
|
146
148
|
systemMessage: options.systemMessage,
|
|
147
149
|
guildId: options.guildId,
|
|
148
150
|
channelId: options.channelId,
|
|
151
|
+
appId: options.appId,
|
|
152
|
+
geminiApiKey: options.geminiApiKey,
|
|
149
153
|
}
|
|
150
154
|
worker.postMessage(initMessage)
|
|
151
155
|
})
|
package/src/genai-worker.ts
CHANGED
|
@@ -271,6 +271,7 @@ parentPort.on('message', async (message: WorkerInMessage) => {
|
|
|
271
271
|
session = await startGenAiSession({
|
|
272
272
|
tools,
|
|
273
273
|
systemMessage: message.systemMessage,
|
|
274
|
+
geminiApiKey: message.geminiApiKey,
|
|
274
275
|
onAssistantAudioChunk({ data }) {
|
|
275
276
|
// Write to audio log if enabled
|
|
276
277
|
if (audioLogStream && !audioLogStream.destroyed) {
|
package/src/genai.ts
CHANGED
|
@@ -113,6 +113,7 @@ export async function startGenAiSession({
|
|
|
113
113
|
onAssistantInterruptSpeaking,
|
|
114
114
|
systemMessage,
|
|
115
115
|
tools,
|
|
116
|
+
geminiApiKey,
|
|
116
117
|
}: {
|
|
117
118
|
onAssistantAudioChunk?: (args: { data: Buffer; mimeType: string }) => void
|
|
118
119
|
onAssistantStartSpeaking?: () => void
|
|
@@ -120,6 +121,7 @@ export async function startGenAiSession({
|
|
|
120
121
|
onAssistantInterruptSpeaking?: () => void
|
|
121
122
|
systemMessage?: string
|
|
122
123
|
tools?: Record<string, AITool<any, any>>
|
|
124
|
+
geminiApiKey?: string | null
|
|
123
125
|
} = {}) {
|
|
124
126
|
let session: Session | undefined = undefined
|
|
125
127
|
const callableTools: Array<CallableTool & { name: string }> = []
|
|
@@ -242,8 +244,15 @@ export async function startGenAiSession({
|
|
|
242
244
|
}
|
|
243
245
|
}
|
|
244
246
|
|
|
247
|
+
const apiKey = geminiApiKey || process.env.GEMINI_API_KEY
|
|
248
|
+
|
|
249
|
+
if (!apiKey) {
|
|
250
|
+
genaiLogger.error('No Gemini API key provided')
|
|
251
|
+
throw new Error('Gemini API key is required for voice interactions')
|
|
252
|
+
}
|
|
253
|
+
|
|
245
254
|
const ai = new GoogleGenAI({
|
|
246
|
-
apiKey
|
|
255
|
+
apiKey,
|
|
247
256
|
})
|
|
248
257
|
|
|
249
258
|
const model = 'models/gemini-2.5-flash-live-preview'
|
package/src/utils.ts
CHANGED
|
@@ -48,26 +48,15 @@ export function generateBotInstallUrl({
|
|
|
48
48
|
return url.toString()
|
|
49
49
|
}
|
|
50
50
|
|
|
51
|
-
function getRequiredBotPermissions(): bigint[] {
|
|
52
|
-
return [
|
|
53
|
-
PermissionsBitField.Flags.ViewChannel,
|
|
54
|
-
PermissionsBitField.Flags.ManageChannels,
|
|
55
|
-
PermissionsBitField.Flags.SendMessages,
|
|
56
|
-
PermissionsBitField.Flags.SendMessagesInThreads,
|
|
57
|
-
PermissionsBitField.Flags.CreatePublicThreads,
|
|
58
|
-
PermissionsBitField.Flags.ManageThreads,
|
|
59
|
-
PermissionsBitField.Flags.ReadMessageHistory,
|
|
60
|
-
PermissionsBitField.Flags.AddReactions,
|
|
61
|
-
PermissionsBitField.Flags.ManageMessages,
|
|
62
|
-
PermissionsBitField.Flags.UseExternalEmojis,
|
|
63
|
-
PermissionsBitField.Flags.AttachFiles,
|
|
64
|
-
PermissionsBitField.Flags.Connect,
|
|
65
|
-
PermissionsBitField.Flags.Speak,
|
|
66
|
-
]
|
|
67
|
-
}
|
|
68
51
|
|
|
69
|
-
function
|
|
70
|
-
const
|
|
71
|
-
|
|
72
|
-
|
|
52
|
+
export function deduplicateByKey<T, K>(arr: T[], keyFn: (item: T) => K): T[] {
|
|
53
|
+
const seen = new Set<K>()
|
|
54
|
+
return arr.filter(item => {
|
|
55
|
+
const key = keyFn(item)
|
|
56
|
+
if (seen.has(key)) {
|
|
57
|
+
return false
|
|
58
|
+
}
|
|
59
|
+
seen.add(key)
|
|
60
|
+
return true
|
|
61
|
+
})
|
|
73
62
|
}
|
package/src/voice.ts
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { experimental_transcribe as transcribe } from 'ai'
|
|
1
|
+
import { GoogleGenAI } from '@google/genai'
|
|
3
2
|
import { createLogger } from './logger.js'
|
|
4
3
|
|
|
5
4
|
const voiceLogger = createLogger('VOICE')
|
|
@@ -9,30 +8,70 @@ export async function transcribeAudio({
|
|
|
9
8
|
prompt,
|
|
10
9
|
language,
|
|
11
10
|
temperature,
|
|
11
|
+
geminiApiKey,
|
|
12
12
|
}: {
|
|
13
13
|
audio: Buffer | Uint8Array | ArrayBuffer | string
|
|
14
14
|
prompt?: string
|
|
15
15
|
language?: string
|
|
16
16
|
temperature?: number
|
|
17
|
+
geminiApiKey?: string
|
|
17
18
|
}): Promise<string> {
|
|
18
19
|
try {
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
20
|
+
// Use provided API key or fall back to environment variable
|
|
21
|
+
const apiKey = geminiApiKey || process.env.GEMINI_API_KEY
|
|
22
|
+
|
|
23
|
+
if (!apiKey) {
|
|
24
|
+
throw new Error('Gemini API key is required for audio transcription')
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
// Initialize Google Generative AI
|
|
28
|
+
const genAI = new GoogleGenAI({ apiKey })
|
|
29
|
+
|
|
30
|
+
// Convert audio to base64 string if it's not already
|
|
31
|
+
let audioBase64: string
|
|
32
|
+
if (typeof audio === 'string') {
|
|
33
|
+
audioBase64 = audio
|
|
34
|
+
} else if (audio instanceof Buffer) {
|
|
35
|
+
audioBase64 = audio.toString('base64')
|
|
36
|
+
} else if (audio instanceof Uint8Array) {
|
|
37
|
+
audioBase64 = Buffer.from(audio).toString('base64')
|
|
38
|
+
} else if (audio instanceof ArrayBuffer) {
|
|
39
|
+
audioBase64 = Buffer.from(audio).toString('base64')
|
|
40
|
+
} else {
|
|
41
|
+
throw new Error('Invalid audio format')
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
// Build the transcription prompt
|
|
45
|
+
let transcriptionPrompt = `Please transcribe this audio file accurately. Here is some relevant information and filenames that may be present in the audio:\n<context>\n${prompt}\n</context>\n`
|
|
46
|
+
if (language) {
|
|
47
|
+
transcriptionPrompt += `\nThe audio is in ${language}.`
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
// Create the content with audio using the inline data format
|
|
51
|
+
const response = await genAI.models.generateContent({
|
|
52
|
+
model: 'gemini-2.5-flash',
|
|
53
|
+
contents: [
|
|
54
|
+
{
|
|
55
|
+
parts: [
|
|
56
|
+
{ text: transcriptionPrompt },
|
|
57
|
+
{
|
|
58
|
+
inlineData: {
|
|
59
|
+
data: audioBase64,
|
|
60
|
+
mimeType: 'audio/mpeg',
|
|
29
61
|
},
|
|
30
62
|
},
|
|
31
|
-
|
|
32
|
-
|
|
63
|
+
],
|
|
64
|
+
},
|
|
65
|
+
],
|
|
66
|
+
config:
|
|
67
|
+
temperature !== undefined
|
|
68
|
+
? {
|
|
69
|
+
temperature,
|
|
70
|
+
}
|
|
71
|
+
: undefined,
|
|
33
72
|
})
|
|
34
73
|
|
|
35
|
-
return
|
|
74
|
+
return response.text || ''
|
|
36
75
|
} catch (error) {
|
|
37
76
|
voiceLogger.error('Failed to transcribe audio:', error)
|
|
38
77
|
throw new Error(
|