@bprp/flockcode 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,222 @@
1
+ /**
2
+ * Walking-mode voice prompt handler.
3
+ *
4
+ * When the user is in "walking" hands-free mode and sends audio:
5
+ * 1. Transcribe the audio (existing Gemini transcription).
6
+ * 2. Use Gemini Flash to decide whether to forward to the agent or respond directly.
7
+ * 3. If responding directly, generate TTS audio via Gemini TTS and return it.
8
+ * 4. If forwarding, call sendPrompt() in the background.
9
+ */
10
+
11
+ import { chat } from "@tanstack/ai"
12
+ import { geminiText } from "@tanstack/ai-gemini"
13
+ import { generateSpeech } from "@tanstack/ai"
14
+ import { geminiSpeech } from "@tanstack/ai-gemini"
15
+ import type { OpencodeClient } from "./opencode"
16
+ import { mapMessage } from "./opencode"
17
+ import { transcribeAudio } from "./transcribe"
18
+ import { sendPrompt } from "./prompt"
19
+ import type { Message } from "./types"
20
+
21
/**
 * Outcome of handling a walking-mode voice prompt: either the transcription
 * was forwarded to the coding agent, or we answered directly — `text` is the
 * spoken reply and `audioData` is its base64 TTS audio with `mimeType`.
 */
export type VoicePromptResult =
  | { action: "forwarded" }
  | { action: "responded"; text: string; audioData: string; mimeType: string }
24
+
25
+ /**
26
+ * Handles a voice prompt in walking mode.
27
+ *
28
+ * Transcribes the audio, then uses a single Gemini Flash call to decide
29
+ * whether to forward to the agent or respond directly. If responding,
30
+ * generates TTS audio and returns it.
31
+ */
32
+ export async function handleVoicePrompt(
33
+ client: OpencodeClient,
34
+ sessionId: string,
35
+ audioData: string,
36
+ mimeType: string,
37
+ directory?: string,
38
+ model?: { providerID: string; modelID: string },
39
+ ): Promise<VoicePromptResult> {
40
+ // 1. Fetch conversation context
41
+ let conversationContext: Message[] | undefined
42
+ try {
43
+ const res = await client.session.messages({ sessionID: sessionId, directory })
44
+ if (!res.error && res.data) {
45
+ conversationContext = res.data.map(mapMessage)
46
+ }
47
+ } catch {}
48
+
49
+ // 2. Transcribe the audio
50
+ const transcription = await transcribeAudio(audioData, mimeType, conversationContext)
51
+ if (!transcription?.trim()) {
52
+ // Nothing audible — don't do anything
53
+ return { action: "forwarded" }
54
+ }
55
+
56
+ console.log(`[voice-prompt] session=${sessionId} transcription: "${transcription.slice(0, 200)}"`)
57
+
58
+ // 3. Build context summary for the routing call
59
+ const contextSummary = buildContextSummary(conversationContext)
60
+
61
+ // 4. Single Gemini Flash call to route AND optionally respond
62
+ const routingPrompt = `You are a voice assistant proxy for an AI coding agent. The user has sent a voice message (transcribed below).
63
+ Decide whether to handle the message yourself or forward it to the coding agent.
64
+
65
+ FORWARD to the agent if the user is asking it to do something, giving a new instruction, or continuing/modifying a task.
66
+ RESPOND yourself if the user is asking a question about what the agent just did, asking for a status update, or asking what changed.
67
+
68
+ If forwarding, respond with ONLY:
69
+ <forward/>
70
+
71
+ If responding, reply with:
72
+ <respond>
73
+ [your terse response here, suitable for text-to-speech — 1-3 sentences max]
74
+ </respond>
75
+
76
+ Conversation context:
77
+ ${contextSummary}
78
+
79
+ User message: "${transcription}"`
80
+
81
+ const routingResult = await chat({
82
+ adapter: geminiText("gemini-2.5-flash"),
83
+ messages: [{ role: "user", content: routingPrompt }],
84
+ stream: false,
85
+ })
86
+
87
+ const routingText = routingResult.trim()
88
+ console.log(`[voice-prompt] session=${sessionId} routing result: "${routingText.slice(0, 300)}"`)
89
+
90
+ // 5. Parse the response
91
+ if (routingText.includes("<forward")) {
92
+ // Forward to agent
93
+ console.log(`[voice-prompt] session=${sessionId} forwarding to agent`)
94
+ sendPrompt(
95
+ client,
96
+ sessionId,
97
+ [{ type: "text", text: transcription }],
98
+ directory,
99
+ model,
100
+ ).catch((err) => {
101
+ console.error(`[voice-prompt] session=${sessionId} sendPrompt failed:`, err)
102
+ })
103
+ return { action: "forwarded" }
104
+ }
105
+
106
+ // Extract the response text from <respond>...</respond>
107
+ const respondMatch = routingText.match(/<respond>([\s\S]*?)<\/respond>/)
108
+ const responseText = respondMatch?.[1]?.trim() ?? routingText.replace(/<\/?respond>/g, "").trim()
109
+
110
+ if (!responseText) {
111
+ // No response text — just forward
112
+ sendPrompt(
113
+ client,
114
+ sessionId,
115
+ [{ type: "text", text: transcription }],
116
+ directory,
117
+ model,
118
+ ).catch((err) => {
119
+ console.error(`[voice-prompt] session=${sessionId} sendPrompt failed:`, err)
120
+ })
121
+ return { action: "forwarded" }
122
+ }
123
+
124
+ console.log(`[voice-prompt] session=${sessionId} responding directly: "${responseText.slice(0, 200)}"`)
125
+
126
+ // 6. Generate TTS audio
127
+ try {
128
+ const ttsResult = await generateSpeech({
129
+ adapter: geminiSpeech("gemini-2.5-flash-preview-tts"),
130
+ text: responseText,
131
+ })
132
+
133
+ const rawFormat = ttsResult.format || "wav"
134
+ console.log(
135
+ `[voice-prompt] session=${sessionId} TTS complete: format=${rawFormat}, audioLength=${ttsResult.audio?.length ?? 0} chars`,
136
+ )
137
+
138
+ // Gemini TTS returns raw PCM with a format string like
139
+ // "L16;codec=pcm;rate=24000". AVAudioPlayer can't play raw PCM,
140
+ // so wrap it in a WAV header.
141
+ let resultAudio = ttsResult.audio
142
+ let audioMime = `audio/${rawFormat}`
143
+ const isRawPcm = /l16|pcm|raw/i.test(rawFormat)
144
+ if (isRawPcm) {
145
+ // Extract sample rate from format string (e.g. "rate=24000"), default 24kHz
146
+ const rateMatch = rawFormat.match(/rate=(\d+)/)
147
+ const sampleRate = rateMatch ? parseInt(rateMatch[1], 10) : 24000
148
+ resultAudio = wrapPcmInWav(ttsResult.audio, sampleRate, 1, 16)
149
+ audioMime = "audio/wav"
150
+ console.log(`[voice-prompt] session=${sessionId} wrapped raw PCM in WAV header (sampleRate=${sampleRate})`)
151
+ }
152
+
153
+ return {
154
+ action: "responded",
155
+ text: responseText,
156
+ audioData: resultAudio,
157
+ mimeType: audioMime,
158
+ }
159
+ } catch (err) {
160
+ console.error(`[voice-prompt] session=${sessionId} TTS failed:`, err)
161
+ // Fall back to forwarding if TTS fails
162
+ sendPrompt(
163
+ client,
164
+ sessionId,
165
+ [{ type: "text", text: transcription }],
166
+ directory,
167
+ model,
168
+ ).catch(() => {})
169
+ return { action: "forwarded" }
170
+ }
171
+ }
172
+
173
+ /**
174
+ * Wraps raw PCM (signed 16-bit LE) base64 audio data in a WAV container
175
+ * so AVAudioPlayer can decode it.
176
+ */
177
+ function wrapPcmInWav(
178
+ pcmBase64: string,
179
+ sampleRate: number,
180
+ numChannels: number,
181
+ bitsPerSample: number,
182
+ ): string {
183
+ const pcmData = Buffer.from(pcmBase64, "base64")
184
+ const byteRate = sampleRate * numChannels * (bitsPerSample / 8)
185
+ const blockAlign = numChannels * (bitsPerSample / 8)
186
+ const dataSize = pcmData.length
187
+
188
+ // 44-byte WAV header
189
+ const header = Buffer.alloc(44)
190
+ header.write("RIFF", 0)
191
+ header.writeUInt32LE(36 + dataSize, 4)
192
+ header.write("WAVE", 8)
193
+ header.write("fmt ", 12)
194
+ header.writeUInt32LE(16, 16) // PCM subchunk size
195
+ header.writeUInt16LE(1, 20) // PCM format
196
+ header.writeUInt16LE(numChannels, 22)
197
+ header.writeUInt32LE(sampleRate, 24)
198
+ header.writeUInt32LE(byteRate, 28)
199
+ header.writeUInt16LE(blockAlign, 32)
200
+ header.writeUInt16LE(bitsPerSample, 34)
201
+ header.write("data", 36)
202
+ header.writeUInt32LE(dataSize, 40)
203
+
204
+ return Buffer.concat([header, pcmData]).toString("base64")
205
+ }
206
+
207
+ function buildContextSummary(messages?: Message[]): string {
208
+ if (!messages?.length) return "(no conversation history)"
209
+
210
+ const recent = messages.slice(-10)
211
+ return recent
212
+ .map((m) => {
213
+ const textParts = m.parts
214
+ .filter((p): p is { type: "text"; id: string; text: string } => p.type === "text")
215
+ .map((p) => p.text)
216
+ .join(" ")
217
+ if (!textParts) return null
218
+ return `${m.role}: ${textParts.slice(0, 300)}`
219
+ })
220
+ .filter(Boolean)
221
+ .join("\n")
222
+ }
@@ -0,0 +1,62 @@
1
+ import { chat } from "@tanstack/ai"
2
+ import { geminiText } from "@tanstack/ai-gemini"
3
+ import { env } from "./env"
4
+
5
// System instruction for the Gemini call in generateWorktreeSlug: constrains
// the model to emit a bare 3-6 word kebab-case slug (few-shot examples below).
// NOTE: the slug is still sanitized after the call — the prompt alone does not
// guarantee valid output.
const SYSTEM_PROMPT = `\
You are a git branch name generator for a coding assistant.

Given a description of a coding task, generate a concise kebab-case slug (3-6 words) that describes the task.

Rules:
- Output ONLY the slug, nothing else — no explanation, no punctuation, no quotes
- Use lowercase letters, numbers, and hyphens only
- 3-6 words separated by hyphens
- Be specific and descriptive about the task
- Do not include words like "task", "feature", "branch", "worktree", "implement", "add", "fix" unless they are truly essential to the meaning

Examples:
"Add a button to the home screen for settings" -> home-screen-settings-button
"Fix the login bug where users can't sign in" -> login-sign-in-bug-fix
"Refactor the database connection pooling logic" -> database-connection-pool-refactor
"Update the user profile page to show avatars" -> user-profile-avatar-display`
22
+
23
+ /**
24
+ * Generate a descriptive kebab-case slug for a worktree branch name using Gemini.
25
+ *
26
+ * @param promptText The user's initial prompt text describing the coding task.
27
+ * @returns A 3-6 word kebab-case slug, e.g. `"home-screen-settings-button"`.
28
+ * @throws If `GEMINI_API_KEY` is not set or the Gemini API call fails.
29
+ */
30
+ export async function generateWorktreeSlug(promptText: string): Promise<string> {
31
+ if (!env.GEMINI_API_KEY) {
32
+ throw new Error("GEMINI_API_KEY is not set — cannot generate worktree name")
33
+ }
34
+
35
+ const result = await chat({
36
+ adapter: geminiText("gemini-3-flash-preview"),
37
+ systemPrompts: [SYSTEM_PROMPT],
38
+ messages: [
39
+ {
40
+ role: "user",
41
+ content: promptText,
42
+ },
43
+ ],
44
+ stream: false,
45
+ })
46
+
47
+ // Sanitize: lowercase, strip anything not [a-z0-9-], collapse repeated hyphens,
48
+ // trim leading/trailing hyphens, truncate to 60 chars
49
+ const slug = result
50
+ .trim()
51
+ .toLowerCase()
52
+ .replace(/[^a-z0-9-]+/g, "-")
53
+ .replace(/-{2,}/g, "-")
54
+ .replace(/^-+|-+$/g, "")
55
+ .slice(0, 60)
56
+
57
+ if (!slug) {
58
+ throw new Error("Gemini returned an empty slug for worktree name generation")
59
+ }
60
+
61
+ return slug
62
+ }