kimaki 0.4.21 → 0.4.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/channel-management.js +92 -0
- package/dist/cli.js +10 -2
- package/dist/database.js +130 -0
- package/dist/discord-bot.js +381 -0
- package/dist/discord-utils.js +151 -0
- package/dist/discordBot.js +60 -31
- package/dist/escape-backticks.test.js +1 -1
- package/dist/fork.js +163 -0
- package/dist/format-tables.js +93 -0
- package/dist/format-tables.test.js +418 -0
- package/dist/interaction-handler.js +750 -0
- package/dist/markdown.js +3 -3
- package/dist/message-formatting.js +188 -0
- package/dist/model-command.js +293 -0
- package/dist/opencode.js +135 -0
- package/dist/session-handler.js +467 -0
- package/dist/system-message.js +92 -0
- package/dist/tools.js +3 -5
- package/dist/utils.js +31 -0
- package/dist/voice-handler.js +528 -0
- package/dist/voice.js +257 -35
- package/package.json +3 -2
- package/src/channel-management.ts +145 -0
- package/src/cli.ts +10 -2
- package/src/database.ts +155 -0
- package/src/discord-bot.ts +506 -0
- package/src/discord-utils.ts +208 -0
- package/src/escape-backticks.test.ts +1 -1
- package/src/fork.ts +224 -0
- package/src/format-tables.test.ts +440 -0
- package/src/format-tables.ts +106 -0
- package/src/interaction-handler.ts +1000 -0
- package/src/markdown.ts +3 -3
- package/src/message-formatting.ts +227 -0
- package/src/model-command.ts +380 -0
- package/src/opencode.ts +180 -0
- package/src/session-handler.ts +601 -0
- package/src/system-message.ts +92 -0
- package/src/tools.ts +3 -5
- package/src/utils.ts +37 -0
- package/src/voice-handler.ts +745 -0
- package/src/voice.ts +354 -36
- package/src/discordBot.ts +0 -3643
package/src/voice.ts
CHANGED
@@ -1,33 +1,324 @@
-import {
+import {
+  GoogleGenAI,
+  Type,
+  type Content,
+  type Part,
+  type Tool,
+} from '@google/genai'
 import { createLogger } from './logger.js'
+import { glob } from 'glob'
+import { ripGrep } from 'ripgrep-js'
 
 const voiceLogger = createLogger('VOICE')
 
+export type TranscriptionToolRunner = ({
+  name,
+  args,
+}: {
+  name: string
+  args: Record<string, string> | undefined
+}) => Promise<
+  | { type: 'result'; transcription: string }
+  | { type: 'toolResponse'; name: string; output: string }
+  | { type: 'skip' }
+>
+
+async function runGrep({
+  pattern,
+  directory,
+}: {
+  pattern: string
+  directory: string
+}): Promise<string> {
+  try {
+    const results = await ripGrep(directory, {
+      string: pattern,
+      globs: ['!node_modules/**', '!.git/**', '!dist/**', '!build/**'],
+    })
+
+    if (results.length === 0) {
+      return 'No matches found'
+    }
+
+    const output = results
+      .slice(0, 10)
+      .map((match) => {
+        return `${match.path.text}:${match.line_number}: ${match.lines.text.trim()}`
+      })
+      .join('\n')
+
+    return output.slice(0, 2000)
+  } catch {
+    return 'grep search failed'
+  }
+}
+
+async function runGlob({
+  pattern,
+  directory,
+}: {
+  pattern: string
+  directory: string
+}): Promise<string> {
+  try {
+    const files = await glob(pattern, {
+      cwd: directory,
+      nodir: false,
+      ignore: ['node_modules/**', '.git/**', 'dist/**', 'build/**'],
+      maxDepth: 10,
+    })
+
+    if (files.length === 0) {
+      return 'No files found'
+    }
+
+    return files.slice(0, 30).join('\n')
+  } catch (error) {
+    return `Glob search failed: ${error instanceof Error ? error.message : 'Unknown error'}`
+  }
+}
+
+const grepToolDeclaration = {
+  name: 'grep',
+  description:
+    'Search for a pattern in file contents to verify if a technical term, function name, or variable exists in the code. Use this to check if transcribed words match actual code.',
+  parameters: {
+    type: Type.OBJECT,
+    properties: {
+      pattern: {
+        type: Type.STRING,
+        description:
+          'The search pattern (case-insensitive). Can be a word, function name, or partial match.',
+      },
+    },
+    required: ['pattern'],
+  },
+}
+
+const globToolDeclaration = {
+  name: 'glob',
+  description:
+    'Search for files by name pattern. Use this to verify if a filename or directory mentioned in the audio actually exists in the project.',
+  parameters: {
+    type: Type.OBJECT,
+    properties: {
+      pattern: {
+        type: Type.STRING,
+        description:
+          'The glob pattern to match files. Examples: "*.ts", "**/*.json", "**/config*", "src/**/*.tsx"',
+      },
+    },
+    required: ['pattern'],
+  },
+}
+
+const transcriptionResultToolDeclaration = {
+  name: 'transcriptionResult',
+  description:
+    'MANDATORY: You MUST call this tool to complete the task. This is the ONLY way to return results - text responses are ignored. Call this with your transcription, even if imperfect. An imperfect transcription is better than none.',
+  parameters: {
+    type: Type.OBJECT,
+    properties: {
+      transcription: {
+        type: Type.STRING,
+        description:
+          'The final transcription of the audio. MUST be non-empty. If audio is unclear, transcribe your best interpretation. If silent, use "[inaudible audio]".',
+      },
+    },
+    required: ['transcription'],
+  },
+}
+
+function createToolRunner({
+  directory,
+}: {
+  directory?: string
+}): TranscriptionToolRunner {
+  const hasDirectory = directory && directory.trim().length > 0
+
+  return async ({ name, args }) => {
+    if (name === 'transcriptionResult') {
+      return {
+        type: 'result',
+        transcription: args?.transcription || '',
+      }
+    }
+
+    if (name === 'grep' && hasDirectory) {
+      const pattern = args?.pattern || ''
+      voiceLogger.log(`Grep search: "${pattern}"`)
+      const output = await runGrep({ pattern, directory })
+      voiceLogger.log(`Grep result: ${output.slice(0, 100)}...`)
+      return { type: 'toolResponse', name: 'grep', output }
+    }
+
+    if (name === 'glob' && hasDirectory) {
+      const pattern = args?.pattern || ''
+      voiceLogger.log(`Glob search: "${pattern}"`)
+      const output = await runGlob({ pattern, directory })
+      voiceLogger.log(`Glob result: ${output.slice(0, 100)}...`)
+      return { type: 'toolResponse', name: 'glob', output }
+    }
+
+    return { type: 'skip' }
+  }
+}
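
The runner added above maps Gemini function calls onto local implementations and tells the loop whether a call produced the final result, a tool response, or nothing. A minimal sketch of that contract, assuming the module-private createToolRunner were reachable and using a hypothetical project path:

    const runner = createToolRunner({ directory: '/home/user/project' }) // hypothetical path
    const result = await runner({ name: 'glob', args: { pattern: '**/*.ts' } })
    if (result.type === 'toolResponse') {
      console.log(result.output) // newline-separated matches, capped at 30 files by runGlob
    } else if (result.type === 'skip') {
      // unknown tool name, or no directory configured: the loop records no response
    }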
+
+export async function runTranscriptionLoop({
+  genAI,
+  model,
+  initialContents,
+  tools,
+  temperature,
+  toolRunner,
+  maxSteps = 10,
+}: {
+  genAI: GoogleGenAI
+  model: string
+  initialContents: Content[]
+  tools: Tool[]
+  temperature: number
+  toolRunner: TranscriptionToolRunner
+  maxSteps?: number
+}): Promise<string> {
+  let response = await genAI.models.generateContent({
+    model,
+    contents: initialContents,
+    config: {
+      temperature,
+      thinkingConfig: {
+        thinkingBudget: 1024,
+      },
+      tools,
+    },
+  })
+
+  const conversationHistory: Content[] = [...initialContents]
+  let stepsRemaining = maxSteps
+
+  while (true) {
+    const candidate = response.candidates?.[0]
+    if (!candidate?.content?.parts) {
+      const text = response.text?.trim()
+      if (text) {
+        voiceLogger.log(`No parts but got text response: "${text.slice(0, 100)}..."`)
+        return text
+      }
+      throw new Error('Transcription failed: No response content from model')
+    }
+
+    const functionCalls = candidate.content.parts.filter(
+      (part): part is Part & { functionCall: NonNullable<Part['functionCall']> } =>
+        'functionCall' in part && !!part.functionCall,
+    )
+
+    if (functionCalls.length === 0) {
+      const text = response.text?.trim()
+      if (text) {
+        voiceLogger.log(`No function calls but got text: "${text.slice(0, 100)}..."`)
+        return text
+      }
+      throw new Error('Transcription failed: Model did not produce a transcription')
+    }
+
+    conversationHistory.push({
+      role: 'model',
+      parts: candidate.content.parts,
+    })
+
+    const functionResponseParts: Array<{
+      functionResponse: { name: string; response: { output: string } }
+    }> = []
+
+    for (const part of functionCalls) {
+      const call = part.functionCall
+      const args = call.args as Record<string, string> | undefined
+      const result = await toolRunner({ name: call.name || '', args })
+
+      if (result.type === 'result') {
+        const transcription = result.transcription?.trim() || ''
+        voiceLogger.log(
+          `Transcription result received: "${transcription.slice(0, 100)}..."`,
+        )
+        if (!transcription) {
+          throw new Error('Transcription failed: Model returned empty transcription')
+        }
+        return transcription
+      }
+
+      if (result.type === 'toolResponse') {
+        stepsRemaining--
+        const stepsWarning: string = (() => {
+          if (stepsRemaining <= 0) {
+            return '\n\n[CRITICAL: Tool limit reached. You MUST call transcriptionResult NOW. No more grep/glob allowed. Call transcriptionResult immediately with your best transcription.]'
+          }
+          if (stepsRemaining === 1) {
+            return '\n\n[URGENT: FINAL STEP. You MUST call transcriptionResult NOW. Do NOT call grep or glob. Call transcriptionResult with your transcription immediately.]'
+          }
+          if (stepsRemaining <= 3) {
+            return `\n\n[WARNING: Only ${stepsRemaining} steps remaining. Finish searching soon and call transcriptionResult. Do not wait until the last step.]`
+          }
+          return ''
+        })()
+
+        functionResponseParts.push({
+          functionResponse: {
+            name: result.name,
+            response: { output: result.output + stepsWarning },
+          },
+        })
+      }
+    }
+
+    if (functionResponseParts.length === 0) {
+      throw new Error('Transcription failed: No valid tool responses')
+    }
+
+    conversationHistory.push({
+      role: 'user',
+      parts: functionResponseParts,
+    } as Content)
+
+    response = await genAI.models.generateContent({
+      model,
+      contents: conversationHistory,
+      config: {
+        temperature,
+        thinkingConfig: {
+          thinkingBudget: 512,
+        },
+        tools: stepsRemaining <= 0 ? [{ functionDeclarations: [transcriptionResultToolDeclaration] }] : tools,
+      },
+    })
+  }
+}
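
The loop above alternates generateContent calls with local tool execution: each grep/glob response costs one step, warnings are appended to tool output as the budget shrinks, and once stepsRemaining hits zero the tools array is narrowed to transcriptionResult alone so the model cannot keep searching. A hedged sketch of driving the exported loop directly, assuming initialContents and tools are built the way transcribeAudio builds them below:

    const genAI = new GoogleGenAI({ apiKey: process.env.GEMINI_API_KEY! })
    const transcription = await runTranscriptionLoop({
      genAI,
      model: 'gemini-2.5-flash',
      initialContents, // user turn: prompt text + inline audio part
      tools,           // transcriptionResult, plus grep/glob when a directory is set
      temperature: 0.3,
      toolRunner: createToolRunner({ directory: process.cwd() }), // assumes a project checkout
      maxSteps: 5,     // tighter search budget than the default 10
    })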
+
 export async function transcribeAudio({
   audio,
   prompt,
   language,
   temperature,
   geminiApiKey,
+  directory,
+  sessionMessages,
 }: {
   audio: Buffer | Uint8Array | ArrayBuffer | string
   prompt?: string
   language?: string
   temperature?: number
   geminiApiKey?: string
+  directory?: string
+  sessionMessages?: string
 }): Promise<string> {
   try {
-    // Use provided API key or fall back to environment variable
     const apiKey = geminiApiKey || process.env.GEMINI_API_KEY
 
     if (!apiKey) {
       throw new Error('Gemini API key is required for audio transcription')
     }
 
-    // Initialize Google Generative AI
     const genAI = new GoogleGenAI({ apiKey })
 
-    // Convert audio to base64 string if it's not already
     let audioBase64: string
     if (typeof audio === 'string') {
       audioBase64 = audio
@@ -41,47 +332,74 @@ export async function transcribeAudio({
       throw new Error('Invalid audio format')
     }
 
-
-
+    const languageHint = language ? `The audio is in ${language}.\n\n` : ''
+
+    const transcriptionPrompt = `${languageHint}Transcribe this audio for a coding agent (like Claude Code or OpenCode).
+
+CRITICAL REQUIREMENT: You MUST call the "transcriptionResult" tool to complete this task.
+- The transcriptionResult tool is the ONLY way to return results
+- Text responses are completely ignored - only tool calls work
+- You MUST call transcriptionResult even if you run out of tool calls
+- An imperfect transcription is better than no transcription
+- DO NOT end without calling transcriptionResult
+
+This is a software development environment. The speaker is giving instructions to an AI coding assistant. Expect:
+- File paths, function names, CLI commands, package names, API endpoints
 
-
+RULES:
+1. You have LIMITED tool calls - use grep/glob sparingly, call them in parallel
+2. If audio is unclear, transcribe your best interpretation
+3. If audio seems silent/empty, call transcriptionResult with "[inaudible audio]"
+4. When warned about remaining steps, STOP searching and call transcriptionResult immediately
 
-
+Common corrections (apply without tool calls):
+- "reacked" → "React", "jason" → "JSON", "get hub" → "GitHub", "no JS" → "Node.js", "dacker" → "Docker"
 
-
+Project context for reference:
 <context>
 ${prompt}
 </context>
-
-    if (language) {
-      transcriptionPrompt += `\nThe audio is in ${language}.`
-    }
+${sessionMessages ? `\nRecent session messages:\n<session_messages>\n${sessionMessages}\n</session_messages>` : ''}
 
-
-
-
-
-
-
-
-
-
-
-
-
+REMEMBER: Call "transcriptionResult" tool with your transcription. This is mandatory.
+
+Note: "critique" is a CLI tool for showing diffs in the browser.`
+
+    const hasDirectory = directory && directory.trim().length > 0
+    const tools = [
+      {
+        functionDeclarations: [
+          transcriptionResultToolDeclaration,
+          ...(hasDirectory ? [grepToolDeclaration, globToolDeclaration] : []),
+        ],
+      },
+    ]
+
+    const initialContents: Content[] = [
+      {
+        role: 'user',
+        parts: [
+          { text: transcriptionPrompt },
+          {
+            inlineData: {
+              data: audioBase64,
+              mimeType: 'audio/mpeg',
             },
-
-
-
-
-
-
-          temperature,
-        }
-      : undefined,
-    })
+          },
+        ],
+      },
+    ]
+
+    const toolRunner = createToolRunner({ directory })
 
-    return
+    return await runTranscriptionLoop({
+      genAI,
+      model: 'gemini-2.5-flash',
+      initialContents,
+      tools,
+      temperature: temperature ?? 0.3,
+      toolRunner,
+    })
   } catch (error) {
     voiceLogger.error('Failed to transcribe audio:', error)
     throw new Error(