shuvmaki 0.4.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin.js +70 -0
- package/dist/ai-tool-to-genai.js +210 -0
- package/dist/ai-tool-to-genai.test.js +267 -0
- package/dist/channel-management.js +97 -0
- package/dist/cli.js +709 -0
- package/dist/commands/abort.js +78 -0
- package/dist/commands/add-project.js +98 -0
- package/dist/commands/agent.js +152 -0
- package/dist/commands/ask-question.js +183 -0
- package/dist/commands/create-new-project.js +78 -0
- package/dist/commands/fork.js +186 -0
- package/dist/commands/model.js +313 -0
- package/dist/commands/permissions.js +126 -0
- package/dist/commands/queue.js +129 -0
- package/dist/commands/resume.js +145 -0
- package/dist/commands/session.js +142 -0
- package/dist/commands/share.js +80 -0
- package/dist/commands/types.js +2 -0
- package/dist/commands/undo-redo.js +161 -0
- package/dist/commands/user-command.js +145 -0
- package/dist/database.js +184 -0
- package/dist/discord-bot.js +384 -0
- package/dist/discord-utils.js +217 -0
- package/dist/escape-backticks.test.js +410 -0
- package/dist/format-tables.js +96 -0
- package/dist/format-tables.test.js +418 -0
- package/dist/genai-worker-wrapper.js +109 -0
- package/dist/genai-worker.js +297 -0
- package/dist/genai.js +232 -0
- package/dist/interaction-handler.js +144 -0
- package/dist/logger.js +51 -0
- package/dist/markdown.js +310 -0
- package/dist/markdown.test.js +262 -0
- package/dist/message-formatting.js +273 -0
- package/dist/message-formatting.test.js +73 -0
- package/dist/openai-realtime.js +228 -0
- package/dist/opencode.js +216 -0
- package/dist/session-handler.js +580 -0
- package/dist/system-message.js +61 -0
- package/dist/tools.js +356 -0
- package/dist/utils.js +85 -0
- package/dist/voice-handler.js +541 -0
- package/dist/voice.js +314 -0
- package/dist/worker-types.js +4 -0
- package/dist/xml.js +92 -0
- package/dist/xml.test.js +32 -0
- package/package.json +60 -0
- package/src/__snapshots__/compact-session-context-no-system.md +35 -0
- package/src/__snapshots__/compact-session-context.md +47 -0
- package/src/ai-tool-to-genai.test.ts +296 -0
- package/src/ai-tool-to-genai.ts +255 -0
- package/src/channel-management.ts +161 -0
- package/src/cli.ts +1010 -0
- package/src/commands/abort.ts +94 -0
- package/src/commands/add-project.ts +139 -0
- package/src/commands/agent.ts +201 -0
- package/src/commands/ask-question.ts +276 -0
- package/src/commands/create-new-project.ts +111 -0
- package/src/commands/fork.ts +257 -0
- package/src/commands/model.ts +402 -0
- package/src/commands/permissions.ts +146 -0
- package/src/commands/queue.ts +181 -0
- package/src/commands/resume.ts +230 -0
- package/src/commands/session.ts +184 -0
- package/src/commands/share.ts +96 -0
- package/src/commands/types.ts +25 -0
- package/src/commands/undo-redo.ts +213 -0
- package/src/commands/user-command.ts +178 -0
- package/src/database.ts +220 -0
- package/src/discord-bot.ts +513 -0
- package/src/discord-utils.ts +282 -0
- package/src/escape-backticks.test.ts +447 -0
- package/src/format-tables.test.ts +440 -0
- package/src/format-tables.ts +110 -0
- package/src/genai-worker-wrapper.ts +160 -0
- package/src/genai-worker.ts +366 -0
- package/src/genai.ts +321 -0
- package/src/interaction-handler.ts +187 -0
- package/src/logger.ts +57 -0
- package/src/markdown.test.ts +358 -0
- package/src/markdown.ts +365 -0
- package/src/message-formatting.test.ts +81 -0
- package/src/message-formatting.ts +340 -0
- package/src/openai-realtime.ts +363 -0
- package/src/opencode.ts +277 -0
- package/src/session-handler.ts +758 -0
- package/src/system-message.ts +62 -0
- package/src/tools.ts +428 -0
- package/src/utils.ts +118 -0
- package/src/voice-handler.ts +760 -0
- package/src/voice.ts +432 -0
- package/src/worker-types.ts +66 -0
- package/src/xml.test.ts +37 -0
- package/src/xml.ts +121 -0
package/src/voice.ts
ADDED
@@ -0,0 +1,432 @@
// Audio transcription service using Google Gemini.
// Transcribes voice messages with code-aware context, using grep/glob tools
// to verify technical terms, filenames, and function names in the codebase.

import {
  GoogleGenAI,
  Type,
  type Content,
  type Part,
  type Tool,
} from '@google/genai'
import { createLogger } from './logger.js'
import { glob } from 'glob'
import { ripGrep } from 'ripgrep-js'

const voiceLogger = createLogger('VOICE')

export type TranscriptionToolRunner = ({
  name,
  args,
}: {
  name: string
  args: Record<string, string> | undefined
}) => Promise<
  | { type: 'result'; transcription: string }
  | { type: 'toolResponse'; name: string; output: string }
  | { type: 'skip' }
>

async function runGrep({
  pattern,
  directory,
}: {
  pattern: string
  directory: string
}): Promise<string> {
  try {
    const results = await ripGrep(directory, {
      string: pattern,
      globs: ['!node_modules/**', '!.git/**', '!dist/**', '!build/**'],
    })

    if (results.length === 0) {
      return 'No matches found'
    }

    const output = results
      .slice(0, 10)
      .map((match) => {
        return `${match.path.text}:${match.line_number}: ${match.lines.text.trim()}`
      })
      .join('\n')

    return output.slice(0, 2000)
  } catch (e) {
    voiceLogger.error('grep search failed:', e)
    return 'grep search failed'
  }
}

async function runGlob({
  pattern,
  directory,
}: {
  pattern: string
  directory: string
}): Promise<string> {
  try {
    const files = await glob(pattern, {
      cwd: directory,
      nodir: false,
      ignore: ['node_modules/**', '.git/**', 'dist/**', 'build/**'],
      maxDepth: 10,
    })

    if (files.length === 0) {
      return 'No files found'
    }

    return files.slice(0, 30).join('\n')
  } catch (error) {
    return `Glob search failed: ${error instanceof Error ? error.message : 'Unknown error'}`
  }
}

const grepToolDeclaration = {
  name: 'grep',
  description:
    'Search for a pattern in file contents to verify if a technical term, function name, or variable exists in the code. Use this to check if transcribed words match actual code.',
  parameters: {
    type: Type.OBJECT,
    properties: {
      pattern: {
        type: Type.STRING,
        description:
          'The search pattern (case-insensitive). Can be a word, function name, or partial match.',
      },
    },
    required: ['pattern'],
  },
}

const globToolDeclaration = {
  name: 'glob',
  description:
    'Search for files by name pattern. Use this to verify if a filename or directory mentioned in the audio actually exists in the project.',
  parameters: {
    type: Type.OBJECT,
    properties: {
      pattern: {
        type: Type.STRING,
        description:
          'The glob pattern to match files. Examples: "*.ts", "**/*.json", "**/config*", "src/**/*.tsx"',
      },
    },
    required: ['pattern'],
  },
}

const transcriptionResultToolDeclaration = {
  name: 'transcriptionResult',
  description:
    'MANDATORY: You MUST call this tool to complete the task. This is the ONLY way to return results - text responses are ignored. Call this with your transcription, even if imperfect. An imperfect transcription is better than none.',
  parameters: {
    type: Type.OBJECT,
    properties: {
      transcription: {
        type: Type.STRING,
        description:
          'The final transcription of the audio. MUST be non-empty. If audio is unclear, transcribe your best interpretation. If silent, use "[inaudible audio]".',
      },
    },
    required: ['transcription'],
  },
}
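// Dispatches model tool calls to local implementations: transcriptionResult
// terminates the loop, grep/glob run only when a project directory is
// available, and anything else is skipped.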
function createToolRunner({
  directory,
}: {
  directory?: string
}): TranscriptionToolRunner {
  const hasDirectory = directory && directory.trim().length > 0

  return async ({ name, args }) => {
    if (name === 'transcriptionResult') {
      return {
        type: 'result',
        transcription: args?.transcription || '',
      }
    }

    if (name === 'grep' && hasDirectory) {
      const pattern = args?.pattern || ''
      voiceLogger.log(`Grep search: "${pattern}"`)
      const output = await runGrep({ pattern, directory })
      voiceLogger.log(`Grep result: ${output.slice(0, 100)}...`)
      return { type: 'toolResponse', name: 'grep', output }
    }

    if (name === 'glob' && hasDirectory) {
      const pattern = args?.pattern || ''
      voiceLogger.log(`Glob search: "${pattern}"`)
      const output = await runGlob({ pattern, directory })
      voiceLogger.log(`Glob result: ${output.slice(0, 100)}...`)
      return { type: 'toolResponse', name: 'glob', output }
    }

    return { type: 'skip' }
  }
}
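// Agentic transcription loop: call the model, execute any grep/glob tool
// calls locally, feed the outputs back (with escalating step-budget
// warnings), and repeat until the model calls transcriptionResult.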
export async function runTranscriptionLoop({
  genAI,
  model,
  initialContents,
  tools,
  temperature,
  toolRunner,
  maxSteps = 10,
}: {
  genAI: GoogleGenAI
  model: string
  initialContents: Content[]
  tools: Tool[]
  temperature: number
  toolRunner: TranscriptionToolRunner
  maxSteps?: number
}): Promise<string> {
  let response = await genAI.models.generateContent({
    model,
    contents: initialContents,
    config: {
      temperature,
      thinkingConfig: {
        thinkingBudget: 1024,
      },
      tools,
    },
  })

  const conversationHistory: Content[] = [...initialContents]
  let stepsRemaining = maxSteps

  while (true) {
    const candidate = response.candidates?.[0]
    if (!candidate?.content?.parts) {
      const text = response.text?.trim()
      if (text) {
        voiceLogger.log(`No parts but got text response: "${text.slice(0, 100)}..."`)
        return text
      }
      throw new Error('Transcription failed: No response content from model')
    }

    const functionCalls = candidate.content.parts.filter(
      (part): part is Part & { functionCall: NonNullable<Part['functionCall']> } =>
        'functionCall' in part && !!part.functionCall,
    )

    if (functionCalls.length === 0) {
      const text = response.text?.trim()
      if (text) {
        voiceLogger.log(`No function calls but got text: "${text.slice(0, 100)}..."`)
        return text
      }
      throw new Error('Transcription failed: Model did not produce a transcription')
    }

    conversationHistory.push({
      role: 'model',
      parts: candidate.content.parts,
    })

    const functionResponseParts: Array<{
      functionResponse: { name: string; response: { output: string } }
    }> = []

    for (const part of functionCalls) {
      const call = part.functionCall
      const args = call.args as Record<string, string> | undefined
      const result = await toolRunner({ name: call.name || '', args })

      if (result.type === 'result') {
        const transcription = result.transcription?.trim() || ''
        voiceLogger.log(
          `Transcription result received: "${transcription.slice(0, 100)}..."`,
        )
        if (!transcription) {
          throw new Error('Transcription failed: Model returned empty transcription')
        }
        return transcription
      }

      if (result.type === 'toolResponse') {
        stepsRemaining--
        const stepsWarning: string = (() => {
          if (stepsRemaining <= 0) {
            return '\n\n[CRITICAL: Tool limit reached. You MUST call transcriptionResult NOW. No more grep/glob allowed. Call transcriptionResult immediately with your best transcription.]'
          }
          if (stepsRemaining === 1) {
            return '\n\n[URGENT: FINAL STEP. You MUST call transcriptionResult NOW. Do NOT call grep or glob. Call transcriptionResult with your transcription immediately.]'
          }
          if (stepsRemaining <= 3) {
            return `\n\n[WARNING: Only ${stepsRemaining} steps remaining. Finish searching soon and call transcriptionResult. Do not wait until the last step.]`
          }
          return ''
        })()

        functionResponseParts.push({
          functionResponse: {
            name: result.name,
            response: { output: result.output + stepsWarning },
          },
        })
      }
    }

    if (functionResponseParts.length === 0) {
      throw new Error('Transcription failed: No valid tool responses')
    }

    conversationHistory.push({
      role: 'user',
      parts: functionResponseParts,
    } as Content)

    response = await genAI.models.generateContent({
      model,
      contents: conversationHistory,
      config: {
        temperature,
        thinkingConfig: {
          thinkingBudget: 512,
        },
        tools: stepsRemaining <= 0 ? [{ functionDeclarations: [transcriptionResultToolDeclaration] }] : tools,
      },
    })
  }
}
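// Public entry point: normalizes the audio payload to base64, assembles the
// transcription prompt (file tree plus optional session context), and runs
// the tool loop against Gemini 2.5 Flash.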
export async function transcribeAudio({
  audio,
  prompt,
  language,
  temperature,
  geminiApiKey,
  directory,
  currentSessionContext,
  lastSessionContext,
}: {
  audio: Buffer | Uint8Array | ArrayBuffer | string
  prompt?: string
  language?: string
  temperature?: number
  geminiApiKey?: string
  directory?: string
  currentSessionContext?: string
  lastSessionContext?: string
}): Promise<string> {
  try {
    const apiKey = geminiApiKey || process.env.GEMINI_API_KEY

    if (!apiKey) {
      throw new Error('Gemini API key is required for audio transcription')
    }

    const genAI = new GoogleGenAI({ apiKey })

    let audioBase64: string
    if (typeof audio === 'string') {
      audioBase64 = audio
    } else if (audio instanceof Buffer) {
      audioBase64 = audio.toString('base64')
    } else if (audio instanceof Uint8Array) {
      audioBase64 = Buffer.from(audio).toString('base64')
    } else if (audio instanceof ArrayBuffer) {
      audioBase64 = Buffer.from(audio).toString('base64')
    } else {
      throw new Error('Invalid audio format')
    }

    const languageHint = language ? `The audio is in ${language}.\n\n` : ''

    // build session context section
    const sessionContextParts: string[] = []
    if (lastSessionContext) {
      sessionContextParts.push(`<last_session>
${lastSessionContext}
</last_session>`)
    }
    if (currentSessionContext) {
      sessionContextParts.push(`<current_session>
${currentSessionContext}
</current_session>`)
    }
    const sessionContextSection = sessionContextParts.length > 0
      ? `\nSession context (use to understand references to files, functions, tools used):\n${sessionContextParts.join('\n\n')}`
      : ''

    const transcriptionPrompt = `${languageHint}Transcribe this audio for a coding agent (like Claude Code or OpenCode).

CRITICAL REQUIREMENT: You MUST call the "transcriptionResult" tool to complete this task.
- The transcriptionResult tool is the ONLY way to return results
- Text responses are completely ignored - only tool calls work
- You MUST call transcriptionResult even if you run out of tool calls
- An imperfect transcription is better than no transcription
- DO NOT end without calling transcriptionResult

This is a software development environment. The speaker is giving instructions to an AI coding assistant. Expect:
- File paths, function names, CLI commands, package names, API endpoints

RULES:
1. If audio is unclear, transcribe your best interpretation, interpreting words even when strong accents are present; identify the accent being used first so you can guess what the words mean
2. If audio seems silent/empty, call transcriptionResult with "[inaudible audio]"
3. Use the session context below to understand technical terms, file names, function names mentioned

Common corrections (apply without tool calls):
- "reacked" → "React", "jason" → "JSON", "get hub" → "GitHub", "no JS" → "Node.js", "dacker" → "Docker"

Project file structure:
<file_tree>
${prompt}
</file_tree>
${sessionContextSection}

REMEMBER: Call "transcriptionResult" tool with your transcription. This is mandatory.

Note: "critique" is a CLI tool for showing diffs in the browser.`

    // const hasDirectory = directory && directory.trim().length > 0
    const tools = [
      {
        functionDeclarations: [
          transcriptionResultToolDeclaration,
          // grep/glob disabled - was causing transcription to hang
          // ...(hasDirectory ? [grepToolDeclaration, globToolDeclaration] : []),
        ],
      },
    ]

    const initialContents: Content[] = [
      {
        role: 'user',
        parts: [
          { text: transcriptionPrompt },
          {
            inlineData: {
              data: audioBase64,
              mimeType: 'audio/mpeg',
            },
          },
        ],
      },
    ]

    const toolRunner = createToolRunner({ directory })

    return await runTranscriptionLoop({
      genAI,
      model: 'gemini-2.5-flash',
      initialContents,
      tools,
      temperature: temperature ?? 0.3,
      toolRunner,
    })
  } catch (error) {
    voiceLogger.error('Failed to transcribe audio:', error)
    throw new Error(
      `Audio transcription failed: ${error instanceof Error ? error.message : 'Unknown error'}`,
    )
  }
}
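A minimal usage sketch of the transcribeAudio export above; the sample file path and file-tree string are illustrative assumptions, while the option names mirror the function's own signature:

import { readFile } from 'node:fs/promises'
import { transcribeAudio } from './voice.js'

// Hypothetical caller: transcribe an MP3 voice note with a small file tree as context.
const audio = await readFile('./voice-note.mp3') // assumed sample file
const text = await transcribeAudio({
  audio, // Buffer/Uint8Array/ArrayBuffer payloads are base64-encoded internally
  prompt: 'src/\n  voice.ts\n  xml.ts', // injected into the <file_tree> block
  language: 'English', // optional hint prepended to the prompt
  directory: process.cwd(), // wired to the tool runner (grep/glob currently disabled)
})
console.log(text) // geminiApiKey falls back to process.env.GEMINI_API_KEY when omitted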
package/src/worker-types.ts
ADDED
@@ -0,0 +1,66 @@
// Type definitions for worker thread message passing.
// Defines the protocol between main thread and GenAI worker for
// audio streaming, tool calls, and session lifecycle management.

import type { Tool as AITool } from 'ai'

// Messages sent from main thread to worker
export type WorkerInMessage =
  | {
      type: 'init'
      directory: string // Project directory for tools
      systemMessage?: string
      guildId: string
      channelId: string
      appId: string
      geminiApiKey?: string | null
    }
  | {
      type: 'sendRealtimeInput'
      audio?: {
        mimeType: string
        data: string // base64
      }
      audioStreamEnd?: boolean
    }
  | {
      type: 'sendTextInput'
      text: string
    }
  | {
      type: 'interrupt'
    }
  | {
      type: 'stop'
    }

// Messages sent from worker to main thread via parentPort
export type WorkerOutMessage =
  | {
      type: 'assistantOpusPacket'
      packet: ArrayBuffer // Opus encoded audio packet
    }
  | {
      type: 'assistantStartSpeaking'
    }
  | {
      type: 'assistantStopSpeaking'
    }
  | {
      type: 'assistantInterruptSpeaking'
    }
  | {
      type: 'toolCallCompleted'
      sessionId: string
      messageId: string
      data?: any
      error?: any
      markdown?: string
    }
  | {
      type: 'error'
      error: string
    }
  | {
      type: 'ready'
    }
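Because WorkerOutMessage is a discriminated union on type, a switch in the main thread narrows each variant automatically. A sketch of a receiving handler, where the worker entry path is an assumption and only the message shapes come from the file above:

import { Worker } from 'node:worker_threads'
import type { WorkerOutMessage } from './worker-types.js'

const worker = new Worker('./dist/genai-worker.js') // assumed entry point

worker.on('message', (msg: WorkerOutMessage) => {
  switch (msg.type) {
    case 'assistantOpusPacket':
      // msg.packet is narrowed to ArrayBuffer here
      break
    case 'toolCallCompleted':
      console.log(msg.sessionId, msg.markdown ?? '')
      break
    case 'error':
      console.error(msg.error)
      break
  }
})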
package/src/xml.test.ts
ADDED
@@ -0,0 +1,37 @@
import { describe, test, expect } from 'vitest'
import { extractNonXmlContent } from './xml.js'

describe('extractNonXmlContent', () => {
  test('removes xml tags and returns only text content', () => {
    const xml = 'Hello <tag>content</tag> world <nested><inner>deep</inner></nested> end'
    expect(extractNonXmlContent(xml)).toMatchInlineSnapshot(`
      "Hello
      world
      end"
    `)
  })

  test('handles multiple text segments', () => {
    const xml = 'Start <a>tag1</a> middle <b>tag2</b> finish'
    expect(extractNonXmlContent(xml)).toMatchInlineSnapshot(`
      "Start
      middle
      finish"
    `)
  })

  test('handles only xml without text', () => {
    const xml = '<root><child>content</child></root>'
    expect(extractNonXmlContent(xml)).toMatchInlineSnapshot(`""`)
  })

  test('handles only text without xml', () => {
    const xml = 'Just plain text'
    expect(extractNonXmlContent(xml)).toMatchInlineSnapshot(`"Just plain text"`)
  })

  test('handles empty string', () => {
    const xml = ''
    expect(extractNonXmlContent(xml)).toMatchInlineSnapshot(`""`)
  })
})
package/src/xml.ts
ADDED
@@ -0,0 +1,121 @@
// XML/HTML tag content extractor.
// Parses XML-like tags from strings (e.g., channel topics) to extract
// Kimaki configuration like directory paths and app IDs.

import { DomHandler, Parser, ElementType } from 'htmlparser2'
import type { ChildNode, Element, Text } from 'domhandler'
import { createLogger } from './logger.js'

const xmlLogger = createLogger('XML')

export function extractTagsArrays<T extends string>({
  xml,
  tags,
}: {
  xml: string
  tags: T[]
}): Record<T, string[]> & { others: string[] } {
  const result: Record<string, string[]> = {
    others: [],
  }

  // Initialize arrays for each tag
  tags.forEach((tag) => {
    result[tag] = []
  })

  try {
    const handler = new DomHandler(
      (error, dom) => {
        if (error) {
          xmlLogger.error('Error parsing XML:', error)
        } else {
          const findTags = (nodes: ChildNode[], path: string[] = []) => {
            nodes.forEach((node) => {
              if (node.type === ElementType.Tag) {
                const element = node as Element
                const currentPath = [...path, element.name]
                const pathString = currentPath.join('.')

                // Extract content using original string positions
                const extractContent = (): string => {
                  // Use element's own indices but exclude the tags
                  if (
                    element.startIndex !== null &&
                    element.endIndex !== null
                  ) {
                    // Extract the full element including tags
                    const fullElement = xml.substring(
                      element.startIndex,
                      element.endIndex + 1,
                    )
                    // Find where content starts (after opening tag)
                    const contentStart = fullElement.indexOf('>') + 1
                    // Find where content ends (before this element's closing tag)
                    const closingTag = `</${element.name}>`
                    const contentEnd = fullElement.lastIndexOf(closingTag)

                    if (contentStart > 0 && contentEnd > contentStart) {
                      return fullElement.substring(contentStart, contentEnd)
                    }

                    return ''
                  }
                  return ''
                }

                // Check both single tag names and nested paths
                if (tags.includes(element.name as T)) {
                  const content = extractContent()
                  result[element.name as T]?.push(content)
                }

                // Check for nested path matches
                if (tags.includes(pathString as T)) {
                  const content = extractContent()
                  result[pathString as T]?.push(content)
                }

                if (element.children) {
                  findTags(element.children, currentPath)
                }
              } else if (
                node.type === ElementType.Text &&
                node.parent?.type === ElementType.Root
              ) {
                const textNode = node as Text
                if (textNode.data.trim()) {
                  result.others?.push(textNode.data.trim())
                }
              }
            })
          }

          findTags(dom)
        }
      },
      {
        withStartIndices: true,
        withEndIndices: true,
        xmlMode: true,
      },
    )

    const parser = new Parser(handler, {
      xmlMode: true,
      decodeEntities: false,
    })
    parser.write(xml)
    parser.end()
  } catch (error) {
    xmlLogger.error('Unexpected error in extractTags:', error)
  }

  return result as Record<T, string[]> & { others: string[] }
}

export function extractNonXmlContent(xml: string): string {
  const result = extractTagsArrays({ xml, tags: [] })
  return result.others.join('\n')
}
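A usage sketch for extractTagsArrays; the channel-topic string and tag names below are illustrative, but the return shape (one string array per requested tag, plus others for untagged text) is exactly what the function builds above:

import { extractTagsArrays, extractNonXmlContent } from './xml.js'

// Hypothetical channel topic mixing free text with config tags.
const topic = 'My project <directory>/home/me/proj</directory> <app>12345</app>'

const parsed = extractTagsArrays({ xml: topic, tags: ['directory', 'app'] })
parsed.directory // ['/home/me/proj']
parsed.app // ['12345']
parsed.others // ['My project'] (root-level text outside any tag)

extractNonXmlContent(topic) // 'My project'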