kimaki 0.4.21 → 0.4.23

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/dist/channel-management.js +92 -0
  2. package/dist/cli.js +10 -2
  3. package/dist/database.js +130 -0
  4. package/dist/discord-bot.js +381 -0
  5. package/dist/discord-utils.js +151 -0
  6. package/dist/discordBot.js +60 -31
  7. package/dist/escape-backticks.test.js +1 -1
  8. package/dist/fork.js +163 -0
  9. package/dist/format-tables.js +93 -0
  10. package/dist/format-tables.test.js +418 -0
  11. package/dist/interaction-handler.js +750 -0
  12. package/dist/markdown.js +3 -3
  13. package/dist/message-formatting.js +188 -0
  14. package/dist/model-command.js +293 -0
  15. package/dist/opencode.js +135 -0
  16. package/dist/session-handler.js +467 -0
  17. package/dist/system-message.js +92 -0
  18. package/dist/tools.js +3 -5
  19. package/dist/utils.js +31 -0
  20. package/dist/voice-handler.js +528 -0
  21. package/dist/voice.js +257 -35
  22. package/package.json +3 -2
  23. package/src/channel-management.ts +145 -0
  24. package/src/cli.ts +10 -2
  25. package/src/database.ts +155 -0
  26. package/src/discord-bot.ts +506 -0
  27. package/src/discord-utils.ts +208 -0
  28. package/src/escape-backticks.test.ts +1 -1
  29. package/src/fork.ts +224 -0
  30. package/src/format-tables.test.ts +440 -0
  31. package/src/format-tables.ts +106 -0
  32. package/src/interaction-handler.ts +1000 -0
  33. package/src/markdown.ts +3 -3
  34. package/src/message-formatting.ts +227 -0
  35. package/src/model-command.ts +380 -0
  36. package/src/opencode.ts +180 -0
  37. package/src/session-handler.ts +601 -0
  38. package/src/system-message.ts +92 -0
  39. package/src/tools.ts +3 -5
  40. package/src/utils.ts +37 -0
  41. package/src/voice-handler.ts +745 -0
  42. package/src/voice.ts +354 -36
  43. package/src/discordBot.ts +0 -3643
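
For orientation, the sketch below shows how the reworked transcribeAudio entry point might be invoked after this release (see the voice.ts diff that follows). Only the option names and the 0.3 temperature default come from the diff; the import path and the sample values are assumptions for illustration.

  // Sketch only — the import path is an assumption; kimaki's public
  // entry points are not documented in this diff.
  import { transcribeAudio } from 'kimaki/dist/voice.js'
  import { readFile } from 'node:fs/promises'

  const audio = await readFile('./clip.mp3') // MP3 audio as a Buffer

  const text = await transcribeAudio({
    audio,
    prompt: 'src/voice.ts, src/discord-bot.ts', // filenames likely to be spoken
    directory: process.cwd(), // new in this release: enables the grep/glob verification tools
    sessionMessages: 'user: rename the voice handler', // new: recent chat context (hypothetical value)
    geminiApiKey: process.env.GEMINI_API_KEY, // falls back to the GEMINI_API_KEY env var
    // temperature defaults to 0.3 when omitted
  })

  console.log(text)

With directory unset, the model is offered only the transcriptionResult tool; with it set, grep and glob are also offered so the model can verify spoken identifiers against the actual codebase before finalizing the transcription.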
package/src/voice.ts CHANGED
@@ -1,33 +1,324 @@
- import { GoogleGenAI } from '@google/genai'
+ import {
+   GoogleGenAI,
+   Type,
+   type Content,
+   type Part,
+   type Tool,
+ } from '@google/genai'
  import { createLogger } from './logger.js'
+ import { glob } from 'glob'
+ import { ripGrep } from 'ripgrep-js'

  const voiceLogger = createLogger('VOICE')

+ export type TranscriptionToolRunner = ({
+   name,
+   args,
+ }: {
+   name: string
+   args: Record<string, string> | undefined
+ }) => Promise<
+   | { type: 'result'; transcription: string }
+   | { type: 'toolResponse'; name: string; output: string }
+   | { type: 'skip' }
+ >
+
+ async function runGrep({
+   pattern,
+   directory,
+ }: {
+   pattern: string
+   directory: string
+ }): Promise<string> {
+   try {
+     const results = await ripGrep(directory, {
+       string: pattern,
+       globs: ['!node_modules/**', '!.git/**', '!dist/**', '!build/**'],
+     })
+
+     if (results.length === 0) {
+       return 'No matches found'
+     }
+
+     const output = results
+       .slice(0, 10)
+       .map((match) => {
+         return `${match.path.text}:${match.line_number}: ${match.lines.text.trim()}`
+       })
+       .join('\n')
+
+     return output.slice(0, 2000)
+   } catch {
+     return 'grep search failed'
+   }
+ }
+
+ async function runGlob({
+   pattern,
+   directory,
+ }: {
+   pattern: string
+   directory: string
+ }): Promise<string> {
+   try {
+     const files = await glob(pattern, {
+       cwd: directory,
+       nodir: false,
+       ignore: ['node_modules/**', '.git/**', 'dist/**', 'build/**'],
+       maxDepth: 10,
+     })
+
+     if (files.length === 0) {
+       return 'No files found'
+     }
+
+     return files.slice(0, 30).join('\n')
+   } catch (error) {
+     return `Glob search failed: ${error instanceof Error ? error.message : 'Unknown error'}`
+   }
+ }
+
+ const grepToolDeclaration = {
+   name: 'grep',
+   description:
+     'Search for a pattern in file contents to verify if a technical term, function name, or variable exists in the code. Use this to check if transcribed words match actual code.',
+   parameters: {
+     type: Type.OBJECT,
+     properties: {
+       pattern: {
+         type: Type.STRING,
+         description:
+           'The search pattern (case-insensitive). Can be a word, function name, or partial match.',
+       },
+     },
+     required: ['pattern'],
+   },
+ }
+
+ const globToolDeclaration = {
+   name: 'glob',
+   description:
+     'Search for files by name pattern. Use this to verify if a filename or directory mentioned in the audio actually exists in the project.',
+   parameters: {
+     type: Type.OBJECT,
+     properties: {
+       pattern: {
+         type: Type.STRING,
+         description:
+           'The glob pattern to match files. Examples: "*.ts", "**/*.json", "**/config*", "src/**/*.tsx"',
+       },
+     },
+     required: ['pattern'],
+   },
+ }
+
+ const transcriptionResultToolDeclaration = {
+   name: 'transcriptionResult',
+   description:
+     'MANDATORY: You MUST call this tool to complete the task. This is the ONLY way to return results - text responses are ignored. Call this with your transcription, even if imperfect. An imperfect transcription is better than none.',
+   parameters: {
+     type: Type.OBJECT,
+     properties: {
+       transcription: {
+         type: Type.STRING,
+         description:
+           'The final transcription of the audio. MUST be non-empty. If audio is unclear, transcribe your best interpretation. If silent, use "[inaudible audio]".',
+       },
+     },
+     required: ['transcription'],
+   },
+ }
+
+ function createToolRunner({
+   directory,
+ }: {
+   directory?: string
+ }): TranscriptionToolRunner {
+   const hasDirectory = directory && directory.trim().length > 0
+
+   return async ({ name, args }) => {
+     if (name === 'transcriptionResult') {
+       return {
+         type: 'result',
+         transcription: args?.transcription || '',
+       }
+     }
+
+     if (name === 'grep' && hasDirectory) {
+       const pattern = args?.pattern || ''
+       voiceLogger.log(`Grep search: "${pattern}"`)
+       const output = await runGrep({ pattern, directory })
+       voiceLogger.log(`Grep result: ${output.slice(0, 100)}...`)
+       return { type: 'toolResponse', name: 'grep', output }
+     }
+
+     if (name === 'glob' && hasDirectory) {
+       const pattern = args?.pattern || ''
+       voiceLogger.log(`Glob search: "${pattern}"`)
+       const output = await runGlob({ pattern, directory })
+       voiceLogger.log(`Glob result: ${output.slice(0, 100)}...`)
+       return { type: 'toolResponse', name: 'glob', output }
+     }
+
+     return { type: 'skip' }
+   }
+ }
+
+ export async function runTranscriptionLoop({
+   genAI,
+   model,
+   initialContents,
+   tools,
+   temperature,
+   toolRunner,
+   maxSteps = 10,
+ }: {
+   genAI: GoogleGenAI
+   model: string
+   initialContents: Content[]
+   tools: Tool[]
+   temperature: number
+   toolRunner: TranscriptionToolRunner
+   maxSteps?: number
+ }): Promise<string> {
+   let response = await genAI.models.generateContent({
+     model,
+     contents: initialContents,
+     config: {
+       temperature,
+       thinkingConfig: {
+         thinkingBudget: 1024,
+       },
+       tools,
+     },
+   })
+
+   const conversationHistory: Content[] = [...initialContents]
+   let stepsRemaining = maxSteps
+
+   while (true) {
+     const candidate = response.candidates?.[0]
+     if (!candidate?.content?.parts) {
+       const text = response.text?.trim()
+       if (text) {
+         voiceLogger.log(`No parts but got text response: "${text.slice(0, 100)}..."`)
+         return text
+       }
+       throw new Error('Transcription failed: No response content from model')
+     }
+
+     const functionCalls = candidate.content.parts.filter(
+       (part): part is Part & { functionCall: NonNullable<Part['functionCall']> } =>
+         'functionCall' in part && !!part.functionCall,
+     )
+
+     if (functionCalls.length === 0) {
+       const text = response.text?.trim()
+       if (text) {
+         voiceLogger.log(`No function calls but got text: "${text.slice(0, 100)}..."`)
+         return text
+       }
+       throw new Error('Transcription failed: Model did not produce a transcription')
+     }
+
+     conversationHistory.push({
+       role: 'model',
+       parts: candidate.content.parts,
+     })
+
+     const functionResponseParts: Array<{
+       functionResponse: { name: string; response: { output: string } }
+     }> = []
+
+     for (const part of functionCalls) {
+       const call = part.functionCall
+       const args = call.args as Record<string, string> | undefined
+       const result = await toolRunner({ name: call.name || '', args })
+
+       if (result.type === 'result') {
+         const transcription = result.transcription?.trim() || ''
+         voiceLogger.log(
+           `Transcription result received: "${transcription.slice(0, 100)}..."`,
+         )
+         if (!transcription) {
+           throw new Error('Transcription failed: Model returned empty transcription')
+         }
+         return transcription
+       }
+
+       if (result.type === 'toolResponse') {
+         stepsRemaining--
+         const stepsWarning: string = (() => {
+           if (stepsRemaining <= 0) {
+             return '\n\n[CRITICAL: Tool limit reached. You MUST call transcriptionResult NOW. No more grep/glob allowed. Call transcriptionResult immediately with your best transcription.]'
+           }
+           if (stepsRemaining === 1) {
+             return '\n\n[URGENT: FINAL STEP. You MUST call transcriptionResult NOW. Do NOT call grep or glob. Call transcriptionResult with your transcription immediately.]'
+           }
+           if (stepsRemaining <= 3) {
+             return `\n\n[WARNING: Only ${stepsRemaining} steps remaining. Finish searching soon and call transcriptionResult. Do not wait until the last step.]`
+           }
+           return ''
+         })()
+
+         functionResponseParts.push({
+           functionResponse: {
+             name: result.name,
+             response: { output: result.output + stepsWarning },
+           },
+         })
+       }
+     }
+
+     if (functionResponseParts.length === 0) {
+       throw new Error('Transcription failed: No valid tool responses')
+     }
+
+     conversationHistory.push({
+       role: 'user',
+       parts: functionResponseParts,
+     } as Content)
+
+     response = await genAI.models.generateContent({
+       model,
+       contents: conversationHistory,
+       config: {
+         temperature,
+         thinkingConfig: {
+           thinkingBudget: 512,
+         },
+         tools: stepsRemaining <= 0 ? [{ functionDeclarations: [transcriptionResultToolDeclaration] }] : tools,
+       },
+     })
+   }
+ }
+
  export async function transcribeAudio({
    audio,
    prompt,
    language,
    temperature,
    geminiApiKey,
+   directory,
+   sessionMessages,
  }: {
    audio: Buffer | Uint8Array | ArrayBuffer | string
    prompt?: string
    language?: string
    temperature?: number
    geminiApiKey?: string
+   directory?: string
+   sessionMessages?: string
  }): Promise<string> {
    try {
-     // Use provided API key or fall back to environment variable
      const apiKey = geminiApiKey || process.env.GEMINI_API_KEY

      if (!apiKey) {
        throw new Error('Gemini API key is required for audio transcription')
      }

-     // Initialize Google Generative AI
      const genAI = new GoogleGenAI({ apiKey })

-     // Convert audio to base64 string if it's not already
      let audioBase64: string
      if (typeof audio === 'string') {
        audioBase64 = audio
@@ -41,47 +332,74 @@ export async function transcribeAudio({
        throw new Error('Invalid audio format')
      }

-     // Build the transcription prompt
-     let transcriptionPrompt = `Transcribe this audio accurately. The transcription will be sent to a coding agent (like Claude Code) to execute programming tasks.
+     const languageHint = language ? `The audio is in ${language}.\n\n` : ''
+
+     const transcriptionPrompt = `${languageHint}Transcribe this audio for a coding agent (like Claude Code or OpenCode).
+
+ CRITICAL REQUIREMENT: You MUST call the "transcriptionResult" tool to complete this task.
+ - The transcriptionResult tool is the ONLY way to return results
+ - Text responses are completely ignored - only tool calls work
+ - You MUST call transcriptionResult even if you run out of tool calls
+ - An imperfect transcription is better than no transcription
+ - DO NOT end without calling transcriptionResult
+
+ This is a software development environment. The speaker is giving instructions to an AI coding assistant. Expect:
+ - File paths, function names, CLI commands, package names, API endpoints

- Assume the speaker is using technical and programming terminology: file paths, function names, CLI commands, package names, API names, programming concepts, etc. Prioritize technical accuracy over literal transcription - if a word sounds like a common programming term, prefer that interpretation.
+ RULES:
+ 1. You have LIMITED tool calls - use grep/glob sparingly, call them in parallel
+ 2. If audio is unclear, transcribe your best interpretation
+ 3. If audio seems silent/empty, call transcriptionResult with "[inaudible audio]"
+ 4. When warned about remaining steps, STOP searching and call transcriptionResult immediately

- If the spoken message is unclear or ambiguous, rephrase it to better convey the intended meaning for a coding agent. The goal is effective communication of the user's programming intent, not a word-for-word transcription.
+ Common corrections (apply without tool calls):
+ - "reacked" → "React", "jason" → "JSON", "get hub" → "GitHub", "no JS" → "Node.js", "dacker" → "Docker"

- Here are relevant filenames and context that may appear in the audio:
+ Project context for reference:
  <context>
  ${prompt}
  </context>
-     `
-     if (language) {
-       transcriptionPrompt += `\nThe audio is in ${language}.`
-     }
+ ${sessionMessages ? `\nRecent session messages:\n<session_messages>\n${sessionMessages}\n</session_messages>` : ''}

-     // Create the content with audio using the inline data format
-     const response = await genAI.models.generateContent({
-       model: 'gemini-2.5-flash',
-       contents: [
-         {
-           parts: [
-             { text: transcriptionPrompt },
-             {
-               inlineData: {
-                 data: audioBase64,
-                 mimeType: 'audio/mpeg',
-               },
+ REMEMBER: Call "transcriptionResult" tool with your transcription. This is mandatory.
+
+ Note: "critique" is a CLI tool for showing diffs in the browser.`
+
+     const hasDirectory = directory && directory.trim().length > 0
+     const tools = [
+       {
+         functionDeclarations: [
+           transcriptionResultToolDeclaration,
+           ...(hasDirectory ? [grepToolDeclaration, globToolDeclaration] : []),
+         ],
+       },
+     ]
+
+     const initialContents: Content[] = [
+       {
+         role: 'user',
+         parts: [
+           { text: transcriptionPrompt },
+           {
+             inlineData: {
+               data: audioBase64,
+               mimeType: 'audio/mpeg',
              },
-           ],
-         },
-       ],
-       config:
-         temperature !== undefined
-           ? {
-               temperature,
-             }
-           : undefined,
-     })
+           },
+         ],
+       },
+     ]
+
+     const toolRunner = createToolRunner({ directory })

-     return response.text || ''
+     return await runTranscriptionLoop({
+       genAI,
+       model: 'gemini-2.5-flash',
+       initialContents,
+       tools,
+       temperature: temperature ?? 0.3,
+       toolRunner,
+     })
    } catch (error) {
      voiceLogger.error('Failed to transcribe audio:', error)
      throw new Error(