@codexstar/pi-listen 1.0.11 → 1.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/extensions/voice/config.ts +4 -0
- package/extensions/voice.ts +572 -124
- package/package.json +1 -1
|
@@ -31,6 +31,8 @@ export interface VoiceConfig {
|
|
|
31
31
|
scope: VoiceSettingsScope;
|
|
32
32
|
btwEnabled: boolean;
|
|
33
33
|
onboarding: VoiceOnboardingState;
|
|
34
|
+
/** Deepgram API key — stored in config so it's available even when env var isn't set */
|
|
35
|
+
deepgramApiKey?: string;
|
|
34
36
|
}
|
|
35
37
|
|
|
36
38
|
export interface LoadedVoiceConfig {
|
|
@@ -60,6 +62,7 @@ export const DEFAULT_CONFIG: VoiceConfig = {
|
|
|
60
62
|
model: "small",
|
|
61
63
|
scope: "global",
|
|
62
64
|
btwEnabled: true,
|
|
65
|
+
deepgramApiKey: undefined,
|
|
63
66
|
onboarding: {
|
|
64
67
|
completed: false,
|
|
65
68
|
schemaVersion: VOICE_CONFIG_VERSION,
|
|
@@ -121,6 +124,7 @@ function migrateConfig(rawVoice: any, source: VoiceConfigSource): VoiceConfig {
|
|
|
121
124
|
model: typeof rawVoice.model === "string" ? rawVoice.model : DEFAULT_CONFIG.model,
|
|
122
125
|
scope: (rawVoice.scope as VoiceSettingsScope | undefined) ?? (source === "project" ? "project" : "global"),
|
|
123
126
|
btwEnabled: typeof rawVoice.btwEnabled === "boolean" ? rawVoice.btwEnabled : DEFAULT_CONFIG.btwEnabled,
|
|
127
|
+
deepgramApiKey: typeof rawVoice.deepgramApiKey === "string" ? rawVoice.deepgramApiKey : undefined,
|
|
124
128
|
onboarding: normalizeOnboarding(rawVoice.onboarding, fallbackCompleted),
|
|
125
129
|
};
|
|
126
130
|
}
|
package/extensions/voice.ts
CHANGED
|
@@ -1,22 +1,27 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* pi-voice —
|
|
2
|
+
* pi-voice — Deepgram WebSocket streaming STT for Pi CLI.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
* 1.
|
|
6
|
-
*
|
|
7
|
-
* 2.
|
|
8
|
-
* 3.
|
|
4
|
+
* Architecture (modeled after Claude Code's voice pipeline):
|
|
5
|
+
* 1. SoX `rec` captures mic audio as raw PCM (16kHz, mono, 16-bit)
|
|
6
|
+
* and pipes it to stdout (no file).
|
|
7
|
+
* 2. Raw PCM chunks are streamed over a WebSocket to Deepgram Nova 3.
|
|
8
|
+
* 3. Deepgram returns interim + final transcripts in real-time.
|
|
9
|
+
* 4. Interim transcripts update a live widget above the editor.
|
|
10
|
+
* 5. On key-release (or toggle stop), a CloseStream message is sent;
|
|
11
|
+
* final transcript is injected into the editor.
|
|
9
12
|
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
13
|
+
* Activation:
|
|
14
|
+
* - Hold SPACE (empty editor) → release to finalize
|
|
15
|
+
* - Ctrl+Shift+V → toggle start/stop (fallback for non-Kitty terminals)
|
|
16
|
+
* - Ctrl+Shift+B → hold to record → auto-send as /btw
|
|
12
17
|
*
|
|
13
|
-
* Config in ~/.pi/agent/settings.json
|
|
18
|
+
* Config in ~/.pi/agent/settings.json:
|
|
14
19
|
* {
|
|
15
20
|
* "voice": {
|
|
16
21
|
* "enabled": true,
|
|
17
22
|
* "language": "en",
|
|
18
|
-
* "backend": "
|
|
19
|
-
* "model": "
|
|
23
|
+
* "backend": "deepgram",
|
|
24
|
+
* "model": "nova-3"
|
|
20
25
|
* }
|
|
21
26
|
* }
|
|
22
27
|
*/
|
|
@@ -65,6 +70,14 @@ interface BtwExchange {
|
|
|
65
70
|
// ─── Constants ───────────────────────────────────────────────────────────────
|
|
66
71
|
|
|
67
72
|
const SAMPLE_RATE = 16000;
|
|
73
|
+
const CHANNELS = 1;
|
|
74
|
+
const ENCODING = "linear16";
|
|
75
|
+
const DEEPGRAM_WS_URL = "wss://api.deepgram.com/v1/listen";
|
|
76
|
+
const KEEPALIVE_INTERVAL_MS = 8000;
|
|
77
|
+
const FINALIZE_SAFETY_TIMEOUT_MS = 5000;
|
|
78
|
+
const FINALIZE_NO_DATA_TIMEOUT_MS = 1500;
|
|
79
|
+
const MAX_RECORDING_SECS = 120; // 2 minutes safety cap (streaming is efficient)
|
|
80
|
+
|
|
68
81
|
const EXT_DIR = path.dirname(new URL(import.meta.url).pathname);
|
|
69
82
|
const PROJECT_ROOT = path.join(EXT_DIR, "..");
|
|
70
83
|
const DAEMON_SCRIPT = path.join(PROJECT_ROOT, "daemon.py");
|
|
@@ -74,7 +87,7 @@ function commandExists(cmd: string): boolean {
|
|
|
74
87
|
return spawnSync("which", [cmd], { stdio: "pipe", timeout: 3000 }).status === 0;
|
|
75
88
|
}
|
|
76
89
|
|
|
77
|
-
// ─── Daemon Communication
|
|
90
|
+
// ─── Daemon Communication (kept for non-deepgram local backends) ─────────────
|
|
78
91
|
|
|
79
92
|
let activeSocketPath = getSocketPath({
|
|
80
93
|
scope: DEFAULT_CONFIG.scope,
|
|
@@ -135,8 +148,6 @@ async function isDaemonRunning(socketPath = activeSocketPath): Promise<boolean>
|
|
|
135
148
|
async function ensureDaemon(config: VoiceConfig): Promise<boolean> {
|
|
136
149
|
if (await isDaemonRunning(activeSocketPath)) {
|
|
137
150
|
const status = await daemonSend({ cmd: "status" }, 3000, activeSocketPath);
|
|
138
|
-
// When backend is 'auto', accept any loaded backend — the daemon already
|
|
139
|
-
// resolved 'auto' to a concrete backend, so we don't need to reload.
|
|
140
151
|
if (config.backend === "auto" || (status.backend === config.backend && status.model === config.model)) return true;
|
|
141
152
|
const reloaded = await daemonSend({
|
|
142
153
|
cmd: "load",
|
|
@@ -175,7 +186,6 @@ async function ensureDaemon(config: VoiceConfig): Promise<boolean> {
|
|
|
175
186
|
|
|
176
187
|
proc.on("error", () => resolve(false));
|
|
177
188
|
|
|
178
|
-
// Timeout: if daemon doesn't start in 10s, kill orphan and fall back
|
|
179
189
|
setTimeout(() => {
|
|
180
190
|
if (!started) {
|
|
181
191
|
try { proc.kill(); } catch {}
|
|
@@ -185,46 +195,40 @@ async function ensureDaemon(config: VoiceConfig): Promise<boolean> {
|
|
|
185
195
|
});
|
|
186
196
|
}
|
|
187
197
|
|
|
188
|
-
// ───
|
|
198
|
+
// ─── Legacy file-based transcription (for non-deepgram backends) ─────────────
|
|
189
199
|
|
|
190
|
-
let
|
|
200
|
+
let legacyRecProcess: ChildProcess | null = null;
|
|
191
201
|
|
|
192
|
-
function
|
|
193
|
-
if (
|
|
194
|
-
|
|
195
|
-
|
|
202
|
+
function startLegacyRecordingToFile(outPath: string): boolean {
|
|
203
|
+
if (legacyRecProcess) {
|
|
204
|
+
legacyRecProcess.kill("SIGTERM");
|
|
205
|
+
legacyRecProcess = null;
|
|
196
206
|
}
|
|
197
|
-
|
|
198
207
|
if (!commandExists("rec")) return false;
|
|
199
|
-
|
|
200
|
-
recProcess = spawn("rec", [
|
|
208
|
+
legacyRecProcess = spawn("rec", [
|
|
201
209
|
"-q", "-r", String(SAMPLE_RATE), "-c", "1", "-b", "16", outPath,
|
|
202
210
|
], { stdio: ["pipe", "pipe", "pipe"] });
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
recProcess.on("error", () => { recProcess = null; });
|
|
211
|
+
legacyRecProcess.stderr?.on("data", () => {});
|
|
212
|
+
legacyRecProcess.on("error", () => { legacyRecProcess = null; });
|
|
206
213
|
return true;
|
|
207
214
|
}
|
|
208
215
|
|
|
209
|
-
function
|
|
216
|
+
function stopLegacyRecording(): Promise<void> {
|
|
210
217
|
return new Promise((resolve) => {
|
|
211
|
-
if (!
|
|
212
|
-
|
|
213
|
-
|
|
218
|
+
if (!legacyRecProcess) { resolve(); return; }
|
|
219
|
+
legacyRecProcess.on("close", () => { legacyRecProcess = null; resolve(); });
|
|
220
|
+
legacyRecProcess.kill("SIGTERM");
|
|
214
221
|
setTimeout(() => {
|
|
215
|
-
if (
|
|
222
|
+
if (legacyRecProcess) { legacyRecProcess.kill("SIGKILL"); legacyRecProcess = null; }
|
|
216
223
|
resolve();
|
|
217
224
|
}, 2000);
|
|
218
225
|
});
|
|
219
226
|
}
|
|
220
227
|
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
async function transcribeAudio(
|
|
228
|
+
async function transcribeAudioFile(
|
|
224
229
|
audioPath: string,
|
|
225
230
|
config: VoiceConfig,
|
|
226
231
|
): Promise<{ text: string; duration: number; error?: string }> {
|
|
227
|
-
// Try daemon first
|
|
228
232
|
if (await isDaemonRunning()) {
|
|
229
233
|
const resp = await daemonSend({
|
|
230
234
|
cmd: "transcribe",
|
|
@@ -238,13 +242,10 @@ async function transcribeAudio(
|
|
|
238
242
|
return resp as { text: string; duration: number };
|
|
239
243
|
}
|
|
240
244
|
}
|
|
241
|
-
|
|
242
|
-
// Fallback: direct subprocess
|
|
243
245
|
return new Promise((resolve) => {
|
|
244
246
|
const args = [TRANSCRIBE_SCRIPT, "--language", config.language, audioPath];
|
|
245
247
|
if (config.backend !== "auto") args.splice(1, 0, "--backend", config.backend);
|
|
246
248
|
if (config.model) args.splice(1, 0, "--model", config.model);
|
|
247
|
-
|
|
248
249
|
const proc = spawn("python3", args, { stdio: ["pipe", "pipe", "pipe"] });
|
|
249
250
|
let stdout = "";
|
|
250
251
|
let stderr = "";
|
|
@@ -258,6 +259,250 @@ async function transcribeAudio(
|
|
|
258
259
|
});
|
|
259
260
|
}
|
|
260
261
|
|
|
262
|
+
// ─── Deepgram WebSocket Streaming ────────────────────────────────────────────
|
|
263
|
+
|
|
264
|
+
interface StreamingSession {
|
|
265
|
+
ws: WebSocket;
|
|
266
|
+
recProcess: ChildProcess;
|
|
267
|
+
interimText: string; // Current interim (partial) transcript
|
|
268
|
+
finalizedParts: string[]; // All finalized transcript segments
|
|
269
|
+
keepAliveTimer: ReturnType<typeof setInterval> | null;
|
|
270
|
+
closed: boolean;
|
|
271
|
+
onTranscript: (interim: string, finals: string[]) => void;
|
|
272
|
+
onDone: (fullText: string) => void;
|
|
273
|
+
onError: (err: string) => void;
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
function getDeepgramApiKey(): string | null {
|
|
277
|
+
// Env var only — the config-file fallback lives in resolveDeepgramApiKey()
|
|
278
|
+
return process.env.DEEPGRAM_API_KEY || null;
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
/**
|
|
282
|
+
* Resolve the Deepgram API key from all sources:
|
|
283
|
+
* 1. process.env.DEEPGRAM_API_KEY (shell)
|
|
284
|
+
* 2. config.deepgramApiKey (settings.json, persisted at setup time)
|
|
285
|
+
*/
|
|
286
|
+
function resolveDeepgramApiKey(config: VoiceConfig): string | null {
|
|
287
|
+
return process.env.DEEPGRAM_API_KEY || config.deepgramApiKey || null;
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
function isDeepgramStreaming(config: VoiceConfig): boolean {
|
|
291
|
+
const key = resolveDeepgramApiKey(config);
|
|
292
|
+
if (!key) return false;
|
|
293
|
+
// Use streaming for deepgram backend, or auto mode when deepgram key is available
|
|
294
|
+
return config.backend === "deepgram" || (config.backend === "auto" && !!key);
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
function buildDeepgramWsUrl(config: VoiceConfig): string {
|
|
298
|
+
const params = new URLSearchParams({
|
|
299
|
+
encoding: ENCODING,
|
|
300
|
+
sample_rate: String(SAMPLE_RATE),
|
|
301
|
+
channels: String(CHANNELS),
|
|
302
|
+
endpointing: "300", // ms of silence before phrase boundary
|
|
303
|
+
utterance_end_ms: "1000", // ms of silence before utterance is complete
|
|
304
|
+
language: config.language || "en",
|
|
305
|
+
model: config.model || "nova-3",
|
|
306
|
+
smart_format: "true",
|
|
307
|
+
interim_results: "true",
|
|
308
|
+
});
|
|
309
|
+
return `${DEEPGRAM_WS_URL}?${params.toString()}`;
|
|
310
|
+
}
|
|
311
|
+
|
|
312
|
+
function startStreamingSession(
|
|
313
|
+
config: VoiceConfig,
|
|
314
|
+
callbacks: {
|
|
315
|
+
onTranscript: (interim: string, finals: string[]) => void;
|
|
316
|
+
onDone: (fullText: string) => void;
|
|
317
|
+
onError: (err: string) => void;
|
|
318
|
+
},
|
|
319
|
+
): StreamingSession | null {
|
|
320
|
+
const apiKey = resolveDeepgramApiKey(config);
|
|
321
|
+
if (!apiKey) {
|
|
322
|
+
callbacks.onError("DEEPGRAM_API_KEY not set");
|
|
323
|
+
return null;
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
if (!commandExists("rec")) {
|
|
327
|
+
callbacks.onError("Voice requires SoX. Install: brew install sox");
|
|
328
|
+
return null;
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
// Start SoX streaming raw PCM to stdout (no file)
|
|
332
|
+
const recProc = spawn("rec", [
|
|
333
|
+
"-q",
|
|
334
|
+
"-r", String(SAMPLE_RATE),
|
|
335
|
+
"-c", String(CHANNELS),
|
|
336
|
+
"-b", "16",
|
|
337
|
+
"-e", "signed-integer",
|
|
338
|
+
"-t", "raw",
|
|
339
|
+
"-", // output to stdout
|
|
340
|
+
], { stdio: ["pipe", "pipe", "pipe"] });
|
|
341
|
+
|
|
342
|
+
recProc.stderr?.on("data", () => {}); // suppress SoX warnings
|
|
343
|
+
|
|
344
|
+
// Connect WebSocket to Deepgram
|
|
345
|
+
const wsUrl = buildDeepgramWsUrl(config);
|
|
346
|
+
const ws = new WebSocket(wsUrl, {
|
|
347
|
+
headers: {
|
|
348
|
+
"Authorization": `Token ${apiKey}`,
|
|
349
|
+
},
|
|
350
|
+
} as any);
|
|
351
|
+
|
|
352
|
+
const session: StreamingSession = {
|
|
353
|
+
ws,
|
|
354
|
+
recProcess: recProc,
|
|
355
|
+
interimText: "",
|
|
356
|
+
finalizedParts: [],
|
|
357
|
+
keepAliveTimer: null,
|
|
358
|
+
closed: false,
|
|
359
|
+
onTranscript: callbacks.onTranscript,
|
|
360
|
+
onDone: callbacks.onDone,
|
|
361
|
+
onError: callbacks.onError,
|
|
362
|
+
};
|
|
363
|
+
|
|
364
|
+
ws.onopen = () => {
|
|
365
|
+
// Send initial KeepAlive
|
|
366
|
+
try { ws.send(JSON.stringify({ type: "KeepAlive" })); } catch {}
|
|
367
|
+
|
|
368
|
+
// Start keepalive timer
|
|
369
|
+
session.keepAliveTimer = setInterval(() => {
|
|
370
|
+
if (ws.readyState === WebSocket.OPEN) {
|
|
371
|
+
try { ws.send(JSON.stringify({ type: "KeepAlive" })); } catch {}
|
|
372
|
+
}
|
|
373
|
+
}, KEEPALIVE_INTERVAL_MS);
|
|
374
|
+
|
|
375
|
+
// Pipe SoX stdout → WebSocket as binary frames
|
|
376
|
+
recProc.stdout?.on("data", (chunk: Buffer) => {
|
|
377
|
+
if (ws.readyState === WebSocket.OPEN) {
|
|
378
|
+
try { ws.send(chunk); } catch {}
|
|
379
|
+
}
|
|
380
|
+
});
|
|
381
|
+
};
|
|
382
|
+
|
|
383
|
+
ws.onmessage = (event: MessageEvent) => {
|
|
384
|
+
try {
|
|
385
|
+
const msg = typeof event.data === "string" ? JSON.parse(event.data) : null;
|
|
386
|
+
if (!msg) return;
|
|
387
|
+
|
|
388
|
+
if (msg.type === "Results") {
|
|
389
|
+
const alt = msg.channel?.alternatives?.[0];
|
|
390
|
+
const transcript = alt?.transcript || "";
|
|
391
|
+
|
|
392
|
+
if (msg.is_final) {
|
|
393
|
+
// Final result for this audio segment
|
|
394
|
+
if (transcript.trim()) {
|
|
395
|
+
session.finalizedParts.push(transcript.trim());
|
|
396
|
+
}
|
|
397
|
+
session.interimText = "";
|
|
398
|
+
} else {
|
|
399
|
+
// Interim result — live update
|
|
400
|
+
session.interimText = transcript;
|
|
401
|
+
}
|
|
402
|
+
|
|
403
|
+
session.onTranscript(session.interimText, session.finalizedParts);
|
|
404
|
+
|
|
405
|
+
// If speech_final is true, it's the end of an utterance
|
|
406
|
+
// (similar to TranscriptEndpoint in Claude Code's protocol)
|
|
407
|
+
if (msg.speech_final && transcript.trim()) {
|
|
408
|
+
// Already added to finalizedParts above when is_final was true
|
|
409
|
+
}
|
|
410
|
+
} else if (msg.type === "Metadata") {
|
|
411
|
+
// Connection metadata — ignore
|
|
412
|
+
} else if (msg.type === "UtteranceEnd") {
|
|
413
|
+
// Utterance boundary — Deepgram detected end of speech
|
|
414
|
+
// Nothing extra needed, is_final already handles finalization
|
|
415
|
+
} else if (msg.type === "Error" || msg.type === "error") {
|
|
416
|
+
session.onError(msg.message || msg.description || "Deepgram error");
|
|
417
|
+
}
|
|
418
|
+
} catch (e: any) {
|
|
419
|
+
// Best-effort: ignore malformed JSON frames (non-string frames are already skipped above)
|
|
420
|
+
}
|
|
421
|
+
};
|
|
422
|
+
|
|
423
|
+
ws.onerror = (event: Event) => {
|
|
424
|
+
if (!session.closed) {
|
|
425
|
+
session.onError("WebSocket connection error");
|
|
426
|
+
}
|
|
427
|
+
};
|
|
428
|
+
|
|
429
|
+
ws.onclose = () => {
|
|
430
|
+
if (!session.closed) {
|
|
431
|
+
finalizeSession(session);
|
|
432
|
+
}
|
|
433
|
+
};
|
|
434
|
+
|
|
435
|
+
recProc.on("error", (err) => {
|
|
436
|
+
session.onError(`SoX error: ${err.message}`);
|
|
437
|
+
});
|
|
438
|
+
|
|
439
|
+
recProc.on("close", () => {
|
|
440
|
+
// SoX stopped — send CloseStream to Deepgram
|
|
441
|
+
if (ws.readyState === WebSocket.OPEN) {
|
|
442
|
+
try { ws.send(JSON.stringify({ type: "CloseStream" })); } catch {}
|
|
443
|
+
}
|
|
444
|
+
});
|
|
445
|
+
|
|
446
|
+
return session;
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
function stopStreamingSession(session: StreamingSession): void {
|
|
450
|
+
if (session.closed) return;
|
|
451
|
+
|
|
452
|
+
// Stop the microphone
|
|
453
|
+
try { session.recProcess.kill("SIGTERM"); } catch {}
|
|
454
|
+
|
|
455
|
+
// CloseStream tells Deepgram to flush remaining audio
|
|
456
|
+
if (session.ws.readyState === WebSocket.OPEN) {
|
|
457
|
+
try { session.ws.send(JSON.stringify({ type: "CloseStream" })); } catch {}
|
|
458
|
+
}
|
|
459
|
+
|
|
460
|
+
// Safety: finalize after timeout even if Deepgram doesn't respond
|
|
461
|
+
setTimeout(() => {
|
|
462
|
+
if (!session.closed) {
|
|
463
|
+
finalizeSession(session);
|
|
464
|
+
}
|
|
465
|
+
}, FINALIZE_SAFETY_TIMEOUT_MS);
|
|
466
|
+
|
|
467
|
+
// Shorter timeout: if no new data arrives for 1.5s, assume done
|
|
468
|
+
let lastDataTime = Date.now();
|
|
469
|
+
const origOnMessage = session.ws.onmessage;
|
|
470
|
+
session.ws.onmessage = (event: MessageEvent) => {
|
|
471
|
+
lastDataTime = Date.now();
|
|
472
|
+
if (origOnMessage) origOnMessage.call(session.ws, event);
|
|
473
|
+
};
|
|
474
|
+
|
|
475
|
+
const noDataCheck = setInterval(() => {
|
|
476
|
+
if (Date.now() - lastDataTime > FINALIZE_NO_DATA_TIMEOUT_MS) {
|
|
477
|
+
clearInterval(noDataCheck);
|
|
478
|
+
if (!session.closed) {
|
|
479
|
+
finalizeSession(session);
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
}, 500);
|
|
483
|
+
}
|
|
484
|
+
|
|
485
|
+
function finalizeSession(session: StreamingSession): void {
|
|
486
|
+
if (session.closed) return;
|
|
487
|
+
session.closed = true;
|
|
488
|
+
|
|
489
|
+
// Clean up keepalive
|
|
490
|
+
if (session.keepAliveTimer) {
|
|
491
|
+
clearInterval(session.keepAliveTimer);
|
|
492
|
+
session.keepAliveTimer = null;
|
|
493
|
+
}
|
|
494
|
+
|
|
495
|
+
// Close WebSocket
|
|
496
|
+
try { session.ws.close(); } catch {}
|
|
497
|
+
|
|
498
|
+
// Kill SoX if still running
|
|
499
|
+
try { session.recProcess.kill("SIGKILL"); } catch {}
|
|
500
|
+
|
|
501
|
+
// Deliver final transcript
|
|
502
|
+
const fullText = session.finalizedParts.join(" ").trim();
|
|
503
|
+
session.onDone(fullText);
|
|
504
|
+
}
|
|
505
|
+
|
|
261
506
|
// ─── Extension ───────────────────────────────────────────────────────────────
|
|
262
507
|
|
|
263
508
|
export default function (pi: ExtensionAPI) {
|
|
@@ -272,6 +517,10 @@ export default function (pi: ExtensionAPI) {
|
|
|
272
517
|
let terminalInputUnsub: (() => void) | null = null;
|
|
273
518
|
let isHolding = false;
|
|
274
519
|
|
|
520
|
+
// Streaming session state
|
|
521
|
+
let activeSession: StreamingSession | null = null;
|
|
522
|
+
let currentTarget: "editor" | "btw" = "editor";
|
|
523
|
+
|
|
275
524
|
// ─── BTW State ───────────────────────────────────────────────────────────
|
|
276
525
|
|
|
277
526
|
let btwThread: BtwExchange[] = [];
|
|
@@ -289,17 +538,19 @@ export default function (pi: ExtensionAPI) {
|
|
|
289
538
|
}
|
|
290
539
|
const modeTag = !config.onboarding.completed
|
|
291
540
|
? "SETUP"
|
|
292
|
-
: config
|
|
293
|
-
? "
|
|
294
|
-
: config.mode === "
|
|
295
|
-
? "
|
|
296
|
-
: "
|
|
541
|
+
: isDeepgramStreaming(config)
|
|
542
|
+
? "STREAM"
|
|
543
|
+
: config.mode === "api"
|
|
544
|
+
? "API"
|
|
545
|
+
: config.mode === "local"
|
|
546
|
+
? "LOCAL"
|
|
547
|
+
: "AUTO";
|
|
297
548
|
ctx.ui.setStatus("voice", `MIC ${modeTag}`);
|
|
298
549
|
break;
|
|
299
550
|
}
|
|
300
551
|
case "recording": {
|
|
301
552
|
const secs = Math.round((Date.now() - recordingStart) / 1000);
|
|
302
|
-
ctx.ui.setStatus("voice",
|
|
553
|
+
ctx.ui.setStatus("voice", `🔴 REC ${secs}s`);
|
|
303
554
|
break;
|
|
304
555
|
}
|
|
305
556
|
case "transcribing":
|
|
@@ -315,7 +566,11 @@ export default function (pi: ExtensionAPI) {
|
|
|
315
566
|
|
|
316
567
|
function voiceCleanup() {
|
|
317
568
|
if (statusTimer) { clearInterval(statusTimer); statusTimer = null; }
|
|
318
|
-
if (
|
|
569
|
+
if (activeSession) {
|
|
570
|
+
finalizeSession(activeSession);
|
|
571
|
+
activeSession = null;
|
|
572
|
+
}
|
|
573
|
+
if (legacyRecProcess) { legacyRecProcess.kill("SIGTERM"); legacyRecProcess = null; }
|
|
319
574
|
if (tempFile) { try { fs.unlinkSync(tempFile); } catch {} tempFile = null; }
|
|
320
575
|
isHolding = false;
|
|
321
576
|
setVoiceState("idle");
|
|
@@ -332,7 +587,7 @@ export default function (pi: ExtensionAPI) {
|
|
|
332
587
|
const diagnostics = scanEnvironment(TRANSCRIBE_SCRIPT);
|
|
333
588
|
const provisioningPlan = buildProvisioningPlan(nextConfig, diagnostics);
|
|
334
589
|
let validated = provisioningPlan.ready;
|
|
335
|
-
if (validated && nextConfig.enabled) {
|
|
590
|
+
if (validated && nextConfig.enabled && !isDeepgramStreaming(nextConfig)) {
|
|
336
591
|
validated = await ensureDaemon(nextConfig);
|
|
337
592
|
}
|
|
338
593
|
|
|
@@ -349,43 +604,173 @@ export default function (pi: ExtensionAPI) {
|
|
|
349
604
|
].join("\n"), validated ? "info" : "warning");
|
|
350
605
|
}
|
|
351
606
|
|
|
352
|
-
// ───
|
|
607
|
+
// ─── Live Transcript Widget ──────────────────────────────────────────────
|
|
353
608
|
|
|
354
|
-
|
|
355
|
-
if (
|
|
609
|
+
function updateLiveTranscriptWidget(interim: string, finals: string[]) {
|
|
610
|
+
if (!ctx?.hasUI) return;
|
|
611
|
+
|
|
612
|
+
const finalized = finals.join(" ");
|
|
613
|
+
const displayText = finalized + (interim ? (finalized ? " " : "") + interim : "");
|
|
356
614
|
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
615
|
+
if (!displayText.trim()) {
|
|
616
|
+
ctx.ui.setWidget("voice-recording", [
|
|
617
|
+
" 🎙 Listening... (speak now)",
|
|
618
|
+
], { placement: "aboveEditor" });
|
|
619
|
+
return;
|
|
620
|
+
}
|
|
621
|
+
|
|
622
|
+
// Show the live transcript — word-wrapped, last 4 lines max
|
|
623
|
+
const words = displayText.split(" ");
|
|
624
|
+
const lines: string[] = [];
|
|
625
|
+
let currentLine = " 🎙 ";
|
|
626
|
+
const maxLineLen = 70;
|
|
627
|
+
|
|
628
|
+
for (const word of words) {
|
|
629
|
+
if ((currentLine + word).length > maxLineLen) {
|
|
630
|
+
lines.push(currentLine);
|
|
631
|
+
currentLine = " " + word + " ";
|
|
632
|
+
} else {
|
|
633
|
+
currentLine += word + " ";
|
|
634
|
+
}
|
|
635
|
+
}
|
|
636
|
+
if (currentLine.trim()) lines.push(currentLine);
|
|
637
|
+
|
|
638
|
+
// Keep only last 4 lines to avoid widget overflow
|
|
639
|
+
const visibleLines = lines.slice(-4);
|
|
640
|
+
if (interim) {
|
|
641
|
+
// Append a cursor glyph to mark interim (not yet finalized) text
|
|
642
|
+
const lastIdx = visibleLines.length - 1;
|
|
643
|
+
visibleLines[lastIdx] = visibleLines[lastIdx].trimEnd() + "▍";
|
|
361
644
|
}
|
|
362
645
|
|
|
646
|
+
ctx.ui.setWidget("voice-recording", visibleLines, { placement: "aboveEditor" });
|
|
647
|
+
}
|
|
648
|
+
|
|
649
|
+
// ─── Voice: Start / Stop (Streaming or Legacy) ───────────────────────────
|
|
650
|
+
|
|
651
|
+
async function startVoiceRecording(target: "editor" | "btw" = "editor"): Promise<boolean> {
|
|
652
|
+
if (voiceState !== "idle" || !ctx) return false;
|
|
653
|
+
|
|
654
|
+
currentTarget = target;
|
|
363
655
|
recordingStart = Date.now();
|
|
364
|
-
setVoiceState("recording");
|
|
365
|
-
statusTimer = setInterval(() => {
|
|
366
|
-
if (voiceState === "recording") updateVoiceStatus();
|
|
367
|
-
}, 1000);
|
|
368
656
|
|
|
369
|
-
if (
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
657
|
+
if (isDeepgramStreaming(config)) {
|
|
658
|
+
// === STREAMING PATH === (Deepgram WebSocket)
|
|
659
|
+
setVoiceState("recording");
|
|
660
|
+
|
|
661
|
+
const session = startStreamingSession(config, {
|
|
662
|
+
onTranscript: (interim, finals) => {
|
|
663
|
+
updateLiveTranscriptWidget(interim, finals);
|
|
664
|
+
updateVoiceStatus();
|
|
665
|
+
},
|
|
666
|
+
onDone: (fullText) => {
|
|
667
|
+
activeSession = null;
|
|
668
|
+
ctx?.ui.setWidget("voice-recording", undefined);
|
|
669
|
+
|
|
670
|
+
if (!fullText.trim()) {
|
|
671
|
+
ctx?.ui.notify("No speech detected.", "warning");
|
|
672
|
+
setVoiceState("idle");
|
|
673
|
+
return;
|
|
674
|
+
}
|
|
675
|
+
|
|
676
|
+
if (target === "btw") {
|
|
677
|
+
handleBtw(fullText);
|
|
678
|
+
} else {
|
|
679
|
+
if (ctx?.hasUI) {
|
|
680
|
+
const existing = ctx.ui.getEditorText();
|
|
681
|
+
ctx.ui.setEditorText(existing ? existing + " " + fullText : fullText);
|
|
682
|
+
const elapsed = ((Date.now() - recordingStart) / 1000).toFixed(1);
|
|
683
|
+
ctx.ui.notify(
|
|
684
|
+
`STT (${elapsed}s): ${fullText.slice(0, 80)}${fullText.length > 80 ? "..." : ""}`,
|
|
685
|
+
"info",
|
|
686
|
+
);
|
|
687
|
+
}
|
|
688
|
+
}
|
|
689
|
+
setVoiceState("idle");
|
|
690
|
+
},
|
|
691
|
+
onError: (err) => {
|
|
692
|
+
activeSession = null;
|
|
693
|
+
ctx?.ui.setWidget("voice-recording", undefined);
|
|
694
|
+
ctx?.ui.notify(`STT error: ${err}`, "error");
|
|
695
|
+
setVoiceState("idle");
|
|
696
|
+
},
|
|
697
|
+
});
|
|
698
|
+
|
|
699
|
+
if (!session) {
|
|
700
|
+
setVoiceState("idle");
|
|
701
|
+
return false;
|
|
702
|
+
}
|
|
703
|
+
|
|
704
|
+
activeSession = session;
|
|
705
|
+
|
|
706
|
+
// Status timer for elapsed time
|
|
707
|
+
statusTimer = setInterval(() => {
|
|
708
|
+
if (voiceState === "recording") {
|
|
709
|
+
updateVoiceStatus();
|
|
710
|
+
const elapsed = (Date.now() - recordingStart) / 1000;
|
|
711
|
+
if (elapsed >= MAX_RECORDING_SECS) {
|
|
712
|
+
isHolding = false;
|
|
713
|
+
stopVoiceRecording(target);
|
|
714
|
+
}
|
|
715
|
+
}
|
|
716
|
+
}, 1000);
|
|
717
|
+
|
|
718
|
+
if (ctx.hasUI) {
|
|
719
|
+
ctx.ui.setWidget("voice-recording", [
|
|
720
|
+
" 🎙 Listening... speak now — press SPACE again to stop",
|
|
721
|
+
], { placement: "aboveEditor" });
|
|
722
|
+
}
|
|
723
|
+
return true;
|
|
724
|
+
|
|
725
|
+
} else {
|
|
726
|
+
// === LEGACY PATH === (file-based for local backends)
|
|
727
|
+
tempFile = path.join(os.tmpdir(), `pi-voice-${Date.now()}.wav`);
|
|
728
|
+
if (!startLegacyRecordingToFile(tempFile)) {
|
|
729
|
+
ctx.ui.notify("Voice requires SoX. Install: brew install sox", "error");
|
|
730
|
+
return false;
|
|
731
|
+
}
|
|
732
|
+
|
|
733
|
+
setVoiceState("recording");
|
|
734
|
+
statusTimer = setInterval(() => {
|
|
735
|
+
if (voiceState === "recording") {
|
|
736
|
+
updateVoiceStatus();
|
|
737
|
+
const elapsed = (Date.now() - recordingStart) / 1000;
|
|
738
|
+
if (elapsed >= MAX_RECORDING_SECS) {
|
|
739
|
+
isHolding = false;
|
|
740
|
+
stopVoiceRecording(target);
|
|
741
|
+
}
|
|
742
|
+
}
|
|
743
|
+
}, 1000);
|
|
744
|
+
|
|
745
|
+
if (ctx.hasUI) {
|
|
746
|
+
ctx.ui.setWidget("voice-recording", [
|
|
747
|
+
target === "btw"
|
|
748
|
+
? " 🎙 BTW Recording... Ctrl+Shift+V to stop"
|
|
749
|
+
: " 🎙 Recording... Ctrl+Shift+V to stop (or release SPACE)",
|
|
750
|
+
], { placement: "aboveEditor" });
|
|
751
|
+
}
|
|
752
|
+
return true;
|
|
375
753
|
}
|
|
376
|
-
return true;
|
|
377
754
|
}
|
|
378
755
|
|
|
379
756
|
async function stopVoiceRecording(target: "editor" | "btw" = "editor") {
|
|
380
757
|
if (voiceState !== "recording" || !ctx) return;
|
|
381
758
|
if (statusTimer) { clearInterval(statusTimer); statusTimer = null; }
|
|
382
759
|
|
|
760
|
+
if (activeSession) {
|
|
761
|
+
// === STREAMING PATH === Stop the stream, finalize will call onDone
|
|
762
|
+
setVoiceState("transcribing");
|
|
763
|
+
stopStreamingSession(activeSession);
|
|
764
|
+
return;
|
|
765
|
+
}
|
|
766
|
+
|
|
767
|
+
// === LEGACY PATH ===
|
|
383
768
|
const elapsed = ((Date.now() - recordingStart) / 1000).toFixed(1);
|
|
384
|
-
const audioFile = tempFile;
|
|
769
|
+
const audioFile = tempFile;
|
|
385
770
|
setVoiceState("transcribing");
|
|
386
771
|
ctx.ui.setWidget("voice-recording", undefined);
|
|
387
772
|
|
|
388
|
-
await
|
|
773
|
+
await stopLegacyRecording();
|
|
389
774
|
|
|
390
775
|
if (!audioFile || !fs.existsSync(audioFile)) {
|
|
391
776
|
ctx.ui.notify("No audio recorded.", "warning");
|
|
@@ -402,12 +787,9 @@ export default function (pi: ExtensionAPI) {
|
|
|
402
787
|
return;
|
|
403
788
|
}
|
|
404
789
|
|
|
405
|
-
// Ensure daemon is up before transcribing — await so the warm path
|
|
406
|
-
// is available for this request instead of falling through to the
|
|
407
|
-
// cold subprocess fallback.
|
|
408
790
|
await ensureDaemon(config).catch(() => {});
|
|
409
791
|
|
|
410
|
-
const result = await
|
|
792
|
+
const result = await transcribeAudioFile(audioFile, config);
|
|
411
793
|
try { fs.unlinkSync(audioFile); } catch {}
|
|
412
794
|
if (tempFile === audioFile) tempFile = null;
|
|
413
795
|
|
|
@@ -427,7 +809,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
427
809
|
if (target === "btw") {
|
|
428
810
|
await handleBtw(transcript);
|
|
429
811
|
} else {
|
|
430
|
-
// Inject into editor
|
|
431
812
|
if (ctx.hasUI) {
|
|
432
813
|
const existing = ctx.ui.getEditorText();
|
|
433
814
|
ctx.ui.setEditorText(existing ? existing + " " + transcript : transcript);
|
|
@@ -441,25 +822,38 @@ export default function (pi: ExtensionAPI) {
|
|
|
441
822
|
setVoiceState("idle");
|
|
442
823
|
}
|
|
443
824
|
|
|
444
|
-
// ─── Hold-to-talk
|
|
825
|
+
// ─── Hold-to-talk / Toggle-to-talk ──────────────────────────────────────
|
|
826
|
+
//
|
|
827
|
+
// Kitty protocol terminals (Ghostty, WezTerm, Kitty) send key-release
|
|
828
|
+
// events (":3u" sequences), enabling true hold-to-talk.
|
|
829
|
+
//
|
|
830
|
+
// Non-Kitty terminals (Apple Terminal, iTerm2 without config, basic xterm)
|
|
831
|
+
// only send key-press. We detect this and fall back to toggle:
|
|
832
|
+
// 1st SPACE press → start recording
|
|
833
|
+
// 2nd SPACE press → stop recording + transcribe
|
|
834
|
+
//
|
|
835
|
+
// We auto-detect Kitty support: if we see a key-release within the first
|
|
836
|
+
// recording, we know hold-to-talk works. Otherwise, we stay in toggle mode.
|
|
837
|
+
|
|
838
|
+
let kittyReleaseDetected = false; // have we ever seen a Kitty release event?
|
|
445
839
|
|
|
446
840
|
function setupHoldToTalk() {
|
|
447
841
|
if (!ctx?.hasUI) return;
|
|
448
842
|
|
|
449
|
-
// Remove previous listener
|
|
450
843
|
if (terminalInputUnsub) { terminalInputUnsub(); terminalInputUnsub = null; }
|
|
451
844
|
|
|
452
845
|
terminalInputUnsub = ctx.ui.onTerminalInput((data: string) => {
|
|
453
846
|
if (!config.enabled) return undefined;
|
|
454
847
|
|
|
455
|
-
//
|
|
848
|
+
// ── SPACE handling ──
|
|
456
849
|
if (matchesKey(data, "space")) {
|
|
457
|
-
// Only activate when editor is empty (avoid conflicting with typing)
|
|
458
850
|
const editorText = ctx?.hasUI ? ctx.ui.getEditorText() : "";
|
|
459
851
|
if (editorText && editorText.trim().length > 0) return undefined;
|
|
460
852
|
|
|
853
|
+
// Kitty key-release: stop recording
|
|
461
854
|
if (isKeyRelease(data)) {
|
|
462
|
-
|
|
855
|
+
kittyReleaseDetected = true;
|
|
856
|
+
if (isHolding && voiceState === "recording") {
|
|
463
857
|
isHolding = false;
|
|
464
858
|
stopVoiceRecording("editor");
|
|
465
859
|
return { consume: true };
|
|
@@ -467,12 +861,27 @@ export default function (pi: ExtensionAPI) {
|
|
|
467
861
|
return undefined;
|
|
468
862
|
}
|
|
469
863
|
|
|
864
|
+
// Kitty key-repeat: suppress while holding
|
|
470
865
|
if (isKeyRepeat(data)) {
|
|
471
866
|
if (isHolding) return { consume: true };
|
|
472
867
|
return undefined;
|
|
473
868
|
}
|
|
474
869
|
|
|
475
|
-
// Key
|
|
870
|
+
// === Key PRESS ===
|
|
871
|
+
|
|
872
|
+
// Currently recording? → this is the "stop" press (toggle mode)
|
|
873
|
+
if (voiceState === "recording") {
|
|
874
|
+
isHolding = false;
|
|
875
|
+
stopVoiceRecording("editor");
|
|
876
|
+
return { consume: true };
|
|
877
|
+
}
|
|
878
|
+
|
|
879
|
+
// Currently transcribing? → ignore, wait for it to finish
|
|
880
|
+
if (voiceState === "transcribing") {
|
|
881
|
+
return { consume: true };
|
|
882
|
+
}
|
|
883
|
+
|
|
884
|
+
// Idle → start recording
|
|
476
885
|
if (voiceState === "idle" && !isHolding) {
|
|
477
886
|
isHolding = true;
|
|
478
887
|
startVoiceRecording("editor").then((ok) => {
|
|
@@ -485,10 +894,11 @@ export default function (pi: ExtensionAPI) {
|
|
|
485
894
|
return undefined;
|
|
486
895
|
}
|
|
487
896
|
|
|
488
|
-
//
|
|
897
|
+
// ── Ctrl+Shift+B handling (BTW voice) ──
|
|
489
898
|
if (matchesKey(data, "ctrl+shift+b")) {
|
|
490
899
|
if (isKeyRelease(data)) {
|
|
491
|
-
|
|
900
|
+
kittyReleaseDetected = true;
|
|
901
|
+
if (isHolding && voiceState === "recording") {
|
|
492
902
|
isHolding = false;
|
|
493
903
|
stopVoiceRecording("btw");
|
|
494
904
|
return { consume: true };
|
|
@@ -501,6 +911,13 @@ export default function (pi: ExtensionAPI) {
|
|
|
501
911
|
return undefined;
|
|
502
912
|
}
|
|
503
913
|
|
|
914
|
+
// Toggle: stop if recording
|
|
915
|
+
if (voiceState === "recording") {
|
|
916
|
+
isHolding = false;
|
|
917
|
+
stopVoiceRecording("btw");
|
|
918
|
+
return { consume: true };
|
|
919
|
+
}
|
|
920
|
+
|
|
504
921
|
if (voiceState === "idle" && !isHolding) {
|
|
505
922
|
isHolding = true;
|
|
506
923
|
startVoiceRecording("btw").then((ok) => {
|
|
@@ -513,12 +930,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
513
930
|
return undefined;
|
|
514
931
|
}
|
|
515
932
|
|
|
516
|
-
// Any other key while holding = cancel
|
|
517
|
-
if (isHolding && voiceState === "recording") {
|
|
518
|
-
// Don't cancel on modifier-only events
|
|
519
|
-
return undefined;
|
|
520
|
-
}
|
|
521
|
-
|
|
522
933
|
return undefined;
|
|
523
934
|
});
|
|
524
935
|
}
|
|
@@ -526,7 +937,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
526
937
|
// ─── BTW: Side Conversations ─────────────────────────────────────────────
|
|
527
938
|
|
|
528
939
|
function buildBtwContext(): string {
|
|
529
|
-
// Build context from main session + btw thread
|
|
530
940
|
const systemPrompt = ctx?.getSystemPrompt() ?? "";
|
|
531
941
|
let btwContext = "You are a helpful side-channel assistant. ";
|
|
532
942
|
btwContext += "The user is having a parallel conversation while their main Pi agent works. ";
|
|
@@ -560,7 +970,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
560
970
|
"",
|
|
561
971
|
];
|
|
562
972
|
|
|
563
|
-
// Show last exchange
|
|
564
973
|
lines.push(` Q: ${last.question.slice(0, 100)}${last.question.length > 100 ? "..." : ""}`);
|
|
565
974
|
const answerLines = last.answer.split("\n");
|
|
566
975
|
for (const line of answerLines.slice(0, 8)) {
|
|
@@ -579,7 +988,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
579
988
|
|
|
580
989
|
btwWidgetVisible = true;
|
|
581
990
|
|
|
582
|
-
// Show thinking state
|
|
583
991
|
ctx.ui.setWidget("btw", [
|
|
584
992
|
" BTW",
|
|
585
993
|
"",
|
|
@@ -588,10 +996,8 @@ export default function (pi: ExtensionAPI) {
|
|
|
588
996
|
" Thinking...",
|
|
589
997
|
], { placement: "aboveEditor" });
|
|
590
998
|
|
|
591
|
-
// Build context for LLM
|
|
592
999
|
const btwContext = buildBtwContext();
|
|
593
1000
|
|
|
594
|
-
// Use the model registry to get current model
|
|
595
1001
|
const model = ctx.model;
|
|
596
1002
|
if (!model) {
|
|
597
1003
|
const exchange: BtwExchange = {
|
|
@@ -606,7 +1012,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
606
1012
|
}
|
|
607
1013
|
|
|
608
1014
|
try {
|
|
609
|
-
// Stream the response
|
|
610
1015
|
let answer = "";
|
|
611
1016
|
const eventStream = streamSimple(model, {
|
|
612
1017
|
systemPrompt: btwContext,
|
|
@@ -623,7 +1028,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
623
1028
|
break;
|
|
624
1029
|
}
|
|
625
1030
|
|
|
626
|
-
// Update widget with streaming response
|
|
627
1031
|
const displayLines: string[] = [
|
|
628
1032
|
` BTW`,
|
|
629
1033
|
"",
|
|
@@ -647,7 +1051,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
647
1051
|
pi.appendEntry("btw", exchange);
|
|
648
1052
|
updateBtwWidget();
|
|
649
1053
|
} catch (err: any) {
|
|
650
|
-
// Fallback: send as a follow-up message to the main agent
|
|
651
1054
|
const exchange: BtwExchange = {
|
|
652
1055
|
question: message,
|
|
653
1056
|
answer: `(BTW streaming failed: ${err.message}. Falling back to sendUserMessage.)`,
|
|
@@ -657,7 +1060,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
657
1060
|
pi.appendEntry("btw", exchange);
|
|
658
1061
|
updateBtwWidget();
|
|
659
1062
|
|
|
660
|
-
// Use sendUserMessage as alternative
|
|
661
1063
|
pi.sendUserMessage(
|
|
662
1064
|
`[BTW question]: ${message}`,
|
|
663
1065
|
{ deliverAs: "followUp" },
|
|
@@ -667,7 +1069,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
667
1069
|
|
|
668
1070
|
// ─── Shortcuts ───────────────────────────────────────────────────────────
|
|
669
1071
|
|
|
670
|
-
// Ctrl+Shift+V = toggle voice (fallback for non-Kitty terminals)
|
|
671
1072
|
pi.registerShortcut("ctrl+shift+v", {
|
|
672
1073
|
description: "Toggle voice recording (start/stop)",
|
|
673
1074
|
handler: async (handlerCtx) => {
|
|
@@ -679,6 +1080,7 @@ export default function (pi: ExtensionAPI) {
|
|
|
679
1080
|
if (voiceState === "idle") {
|
|
680
1081
|
await startVoiceRecording("editor");
|
|
681
1082
|
} else if (voiceState === "recording") {
|
|
1083
|
+
isHolding = false;
|
|
682
1084
|
await stopVoiceRecording("editor");
|
|
683
1085
|
}
|
|
684
1086
|
},
|
|
@@ -694,12 +1096,42 @@ export default function (pi: ExtensionAPI) {
|
|
|
694
1096
|
configSource = loaded.source;
|
|
695
1097
|
updateSocketPath(config, currentCwd);
|
|
696
1098
|
|
|
697
|
-
//
|
|
698
|
-
//
|
|
1099
|
+
// Auto-capture DEEPGRAM_API_KEY from env into config if not already stored.
|
|
1100
|
+
// This ensures streaming works even when Pi is launched from a context
|
|
1101
|
+
// that doesn't source .zshrc (GUI app, tmux, etc.)
|
|
1102
|
+
if (process.env.DEEPGRAM_API_KEY && !config.deepgramApiKey) {
|
|
1103
|
+
config.deepgramApiKey = process.env.DEEPGRAM_API_KEY;
|
|
1104
|
+
if (configSource !== "default") {
|
|
1105
|
+
saveConfig(config, config.scope, currentCwd);
|
|
1106
|
+
}
|
|
1107
|
+
}
|
|
1108
|
+
|
|
1109
|
+
// Also try to load DEEPGRAM_API_KEY from shell if not in process.env and not in config
|
|
1110
|
+
if (!resolveDeepgramApiKey(config) && config.backend === "deepgram") {
|
|
1111
|
+
try {
|
|
1112
|
+
const result = spawnSync("zsh", ["-ic", "echo $DEEPGRAM_API_KEY"], {
|
|
1113
|
+
stdio: ["pipe", "pipe", "pipe"],
|
|
1114
|
+
timeout: 3000,
|
|
1115
|
+
env: { ...process.env, HOME: os.homedir() },
|
|
1116
|
+
});
|
|
1117
|
+
const shellKey = result.stdout?.toString().trim();
|
|
1118
|
+
if (shellKey && shellKey.length > 5) {
|
|
1119
|
+
config.deepgramApiKey = shellKey;
|
|
1120
|
+
process.env.DEEPGRAM_API_KEY = shellKey; // Also set for child processes
|
|
1121
|
+
if (configSource !== "default") {
|
|
1122
|
+
saveConfig(config, config.scope, currentCwd);
|
|
1123
|
+
}
|
|
1124
|
+
}
|
|
1125
|
+
} catch {}
|
|
1126
|
+
}
|
|
1127
|
+
|
|
699
1128
|
if (config.enabled && config.onboarding.completed) {
|
|
700
1129
|
updateVoiceStatus();
|
|
701
1130
|
setupHoldToTalk();
|
|
702
|
-
|
|
1131
|
+
// Only start daemon for non-streaming backends
|
|
1132
|
+
if (!isDeepgramStreaming(config)) {
|
|
1133
|
+
ensureDaemon(config).catch(() => {});
|
|
1134
|
+
}
|
|
703
1135
|
}
|
|
704
1136
|
});
|
|
705
1137
|
|
|
@@ -744,7 +1176,7 @@ export default function (pi: ExtensionAPI) {
|
|
|
744
1176
|
// ─── /voice command ──────────────────────────────────────────────────────
|
|
745
1177
|
|
|
746
1178
|
pi.registerCommand("voice", {
|
|
747
|
-
description: "Voice input: /voice [on|off|test|info|setup|reconfigure|doctor|backends|daemon]",
|
|
1179
|
+
description: "Voice input: /voice [on|off|stop|test|info|setup|reconfigure|doctor|backends|daemon]",
|
|
748
1180
|
handler: async (args, cmdCtx) => {
|
|
749
1181
|
ctx = cmdCtx;
|
|
750
1182
|
const sub = (args || "").trim().toLowerCase();
|
|
@@ -753,8 +1185,11 @@ export default function (pi: ExtensionAPI) {
|
|
|
753
1185
|
config.enabled = true;
|
|
754
1186
|
updateVoiceStatus();
|
|
755
1187
|
setupHoldToTalk();
|
|
756
|
-
|
|
757
|
-
|
|
1188
|
+
if (!isDeepgramStreaming(config)) {
|
|
1189
|
+
ensureDaemon(config).catch(() => {});
|
|
1190
|
+
}
|
|
1191
|
+
const mode = isDeepgramStreaming(config) ? "Deepgram streaming" : config.backend;
|
|
1192
|
+
cmdCtx.ui.notify(`Voice enabled (${mode}).\n Hold SPACE (empty editor) → release to transcribe\n Ctrl+Shift+V → toggle recording on/off\n Live transcription shown while speaking`, "info");
|
|
758
1193
|
return;
|
|
759
1194
|
}
|
|
760
1195
|
|
|
@@ -767,9 +1202,22 @@ export default function (pi: ExtensionAPI) {
|
|
|
767
1202
|
return;
|
|
768
1203
|
}
|
|
769
1204
|
|
|
1205
|
+
if (sub === "stop") {
|
|
1206
|
+
if (voiceState === "recording") {
|
|
1207
|
+
isHolding = false;
|
|
1208
|
+
await stopVoiceRecording("editor");
|
|
1209
|
+
cmdCtx.ui.notify("Recording stopped and transcribed.", "info");
|
|
1210
|
+
} else {
|
|
1211
|
+
cmdCtx.ui.notify("No recording in progress.", "info");
|
|
1212
|
+
}
|
|
1213
|
+
return;
|
|
1214
|
+
}
|
|
1215
|
+
|
|
770
1216
|
if (sub === "test") {
|
|
771
1217
|
cmdCtx.ui.notify("Testing voice setup...", "info");
|
|
772
1218
|
const diagnostics = scanEnvironment(TRANSCRIBE_SCRIPT);
|
|
1219
|
+
const dgKey = resolveDeepgramApiKey(config);
|
|
1220
|
+
const streaming = isDeepgramStreaming(config);
|
|
773
1221
|
const daemonUp = await isDaemonRunning();
|
|
774
1222
|
const provisioningPlan = buildProvisioningPlan(config, diagnostics);
|
|
775
1223
|
const selectedBackend = diagnostics.backends.find((backend) => backend.name === config.backend);
|
|
@@ -782,6 +1230,8 @@ export default function (pi: ExtensionAPI) {
|
|
|
782
1230
|
` model: ${config.model}`,
|
|
783
1231
|
` model status: ${modelReadiness}`,
|
|
784
1232
|
` language: ${config.language}`,
|
|
1233
|
+
` streaming: ${streaming ? "YES (Deepgram WS)" : "NO (batch)"}`,
|
|
1234
|
+
` DEEPGRAM_API_KEY: ${dgKey ? "set (" + dgKey.slice(0, 8) + "...)" : "NOT SET"}`,
|
|
785
1235
|
` onboarding: ${config.onboarding.completed ? "complete" : "incomplete"}`,
|
|
786
1236
|
` python3: ${diagnostics.hasPython ? "OK" : "missing"}`,
|
|
787
1237
|
` sox/rec: ${diagnostics.hasSox ? "OK" : "missing"}`,
|
|
@@ -803,11 +1253,10 @@ export default function (pi: ExtensionAPI) {
|
|
|
803
1253
|
}
|
|
804
1254
|
}
|
|
805
1255
|
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
lines.push("
|
|
810
|
-
lines.push(...provisioningPlan.manualSteps.map((step) => ` - ${step}`));
|
|
1256
|
+
if (!dgKey && config.backend === "deepgram") {
|
|
1257
|
+
lines.push("");
|
|
1258
|
+
lines.push("⚠️ DEEPGRAM_API_KEY not set! Add to ~/.zshrc or ~/.env.secrets");
|
|
1259
|
+
lines.push(" export DEEPGRAM_API_KEY=your_key_here");
|
|
811
1260
|
}
|
|
812
1261
|
|
|
813
1262
|
cmdCtx.ui.notify(lines.join("\n"), provisioningPlan.ready ? "info" : "warning");
|
|
@@ -824,22 +1273,24 @@ export default function (pi: ExtensionAPI) {
|
|
|
824
1273
|
const diagnostics = scanEnvironment(TRANSCRIBE_SCRIPT);
|
|
825
1274
|
const selectedBackend = diagnostics.backends.find((backend) => backend.name === config.backend);
|
|
826
1275
|
const modelReadiness = getModelReadiness(selectedBackend, config.model);
|
|
1276
|
+
const streaming = isDeepgramStreaming(config);
|
|
827
1277
|
|
|
828
1278
|
cmdCtx.ui.notify([
|
|
829
1279
|
`Voice config:`,
|
|
830
|
-
` enabled:
|
|
831
|
-
` mode:
|
|
832
|
-
` scope:
|
|
833
|
-
` backend:
|
|
834
|
-
` model:
|
|
835
|
-
` model
|
|
836
|
-
` language:
|
|
837
|
-
`
|
|
838
|
-
`
|
|
839
|
-
`
|
|
840
|
-
`
|
|
841
|
-
`
|
|
842
|
-
`
|
|
1280
|
+
` enabled: ${config.enabled}`,
|
|
1281
|
+
` mode: ${config.mode}`,
|
|
1282
|
+
` scope: ${config.scope}`,
|
|
1283
|
+
` backend: ${config.backend}`,
|
|
1284
|
+
` model: ${config.model}`,
|
|
1285
|
+
` model stat: ${modelReadiness}`,
|
|
1286
|
+
` language: ${config.language}`,
|
|
1287
|
+
` streaming: ${streaming ? "YES (Deepgram WebSocket)" : "NO (batch)"}`,
|
|
1288
|
+
` state: ${voiceState}`,
|
|
1289
|
+
` setup: ${config.onboarding.completed ? `complete (${config.onboarding.source ?? "unknown"})` : "incomplete"}`,
|
|
1290
|
+
` socket: ${activeSocketPath}`,
|
|
1291
|
+
` daemon: ${daemonUp ? "running" : "stopped"}${daemonInfo}`,
|
|
1292
|
+
` hold-key: SPACE (editor empty) or Ctrl+Shift+V (toggle)`,
|
|
1293
|
+
` btw-key: Ctrl+Shift+B (hold to record → auto-btw)`,
|
|
843
1294
|
].join("\n"), "info");
|
|
844
1295
|
return;
|
|
845
1296
|
}
|
|
@@ -882,7 +1333,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
882
1333
|
cmdCtx.ui.notify("Voice setup cancelled.", "warning");
|
|
883
1334
|
return;
|
|
884
1335
|
}
|
|
885
|
-
|
|
886
1336
|
await finalizeAndSaveSetup(cmdCtx, result.config, result.selectedScope, result.summaryLines, "setup-command");
|
|
887
1337
|
return;
|
|
888
1338
|
}
|
|
@@ -990,7 +1440,7 @@ export default function (pi: ExtensionAPI) {
|
|
|
990
1440
|
},
|
|
991
1441
|
});
|
|
992
1442
|
|
|
993
|
-
// ─── Dedicated setup command
|
|
1443
|
+
// ─── Dedicated setup command ─────────────────────────────────────────────
|
|
994
1444
|
|
|
995
1445
|
pi.registerCommand("voice-setup", {
|
|
996
1446
|
description: "Configure voice input — select backend, model, and language",
|
|
@@ -1058,7 +1508,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
1058
1508
|
|
|
1059
1509
|
pi.sendUserMessage(content, { deliverAs: "followUp" });
|
|
1060
1510
|
|
|
1061
|
-
// Clear after injection
|
|
1062
1511
|
btwThread = [];
|
|
1063
1512
|
btwWidgetVisible = false;
|
|
1064
1513
|
cmdCtx.ui.setWidget("btw", undefined);
|
|
@@ -1083,7 +1532,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
1083
1532
|
threadText += `Q: ${ex.question}\nA: ${ex.answer}\n\n`;
|
|
1084
1533
|
}
|
|
1085
1534
|
|
|
1086
|
-
// Ask the model to summarize
|
|
1087
1535
|
const model = ctx.model;
|
|
1088
1536
|
if (!model) {
|
|
1089
1537
|
cmdCtx.ui.notify("No model available for summarization.", "error");
|