@codexstar/pi-listen 1.0.12 → 1.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/extensions/voice/config.ts +4 -0
- package/extensions/voice.ts +556 -131
- package/package.json +1 -1
|
@@ -31,6 +31,8 @@ export interface VoiceConfig {
|
|
|
31
31
|
scope: VoiceSettingsScope;
|
|
32
32
|
btwEnabled: boolean;
|
|
33
33
|
onboarding: VoiceOnboardingState;
|
|
34
|
+
/** Deepgram API key — stored in config so it's available even when env var isn't set */
|
|
35
|
+
deepgramApiKey?: string;
|
|
34
36
|
}
|
|
35
37
|
|
|
36
38
|
export interface LoadedVoiceConfig {
|
|
@@ -60,6 +62,7 @@ export const DEFAULT_CONFIG: VoiceConfig = {
|
|
|
60
62
|
model: "small",
|
|
61
63
|
scope: "global",
|
|
62
64
|
btwEnabled: true,
|
|
65
|
+
deepgramApiKey: undefined,
|
|
63
66
|
onboarding: {
|
|
64
67
|
completed: false,
|
|
65
68
|
schemaVersion: VOICE_CONFIG_VERSION,
|
|
@@ -121,6 +124,7 @@ function migrateConfig(rawVoice: any, source: VoiceConfigSource): VoiceConfig {
|
|
|
121
124
|
model: typeof rawVoice.model === "string" ? rawVoice.model : DEFAULT_CONFIG.model,
|
|
122
125
|
scope: (rawVoice.scope as VoiceSettingsScope | undefined) ?? (source === "project" ? "project" : "global"),
|
|
123
126
|
btwEnabled: typeof rawVoice.btwEnabled === "boolean" ? rawVoice.btwEnabled : DEFAULT_CONFIG.btwEnabled,
|
|
127
|
+
deepgramApiKey: typeof rawVoice.deepgramApiKey === "string" ? rawVoice.deepgramApiKey : undefined,
|
|
124
128
|
onboarding: normalizeOnboarding(rawVoice.onboarding, fallbackCompleted),
|
|
125
129
|
};
|
|
126
130
|
}
|
package/extensions/voice.ts
CHANGED
|
@@ -1,22 +1,27 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* pi-voice —
|
|
2
|
+
* pi-voice — Deepgram WebSocket streaming STT for Pi CLI.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
* 1.
|
|
6
|
-
*
|
|
7
|
-
* 2.
|
|
8
|
-
* 3.
|
|
4
|
+
* Architecture (modeled after Claude Code's voice pipeline):
|
|
5
|
+
* 1. SoX `rec` captures mic audio as raw PCM (16kHz, mono, 16-bit)
|
|
6
|
+
* and pipes it to stdout (no file).
|
|
7
|
+
* 2. Raw PCM chunks are streamed over a WebSocket to Deepgram Nova 3.
|
|
8
|
+
* 3. Deepgram returns interim + final transcripts in real-time.
|
|
9
|
+
* 4. Interim transcripts update a live widget above the editor.
|
|
10
|
+
* 5. On key-release (or toggle stop), a CloseStream message is sent;
|
|
11
|
+
* final transcript is injected into the editor.
|
|
9
12
|
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
13
|
+
* Activation:
|
|
14
|
+
* - Hold SPACE (empty editor) → release to finalize
|
|
15
|
+
* - Ctrl+Shift+V → toggle start/stop (fallback for non-Kitty terminals)
|
|
16
|
+
* - Ctrl+Shift+B → hold to record → auto-send as /btw
|
|
12
17
|
*
|
|
13
|
-
* Config in ~/.pi/agent/settings.json
|
|
18
|
+
* Config in ~/.pi/agent/settings.json:
|
|
14
19
|
* {
|
|
15
20
|
* "voice": {
|
|
16
21
|
* "enabled": true,
|
|
17
22
|
* "language": "en",
|
|
18
|
-
* "backend": "
|
|
19
|
-
* "model": "
|
|
23
|
+
* "backend": "deepgram",
|
|
24
|
+
* "model": "nova-3"
|
|
20
25
|
* }
|
|
21
26
|
* }
|
|
22
27
|
*/
|
|
@@ -65,6 +70,14 @@ interface BtwExchange {
|
|
|
65
70
|
// ─── Constants ───────────────────────────────────────────────────────────────
|
|
66
71
|
|
|
67
72
|
const SAMPLE_RATE = 16000;
|
|
73
|
+
const CHANNELS = 1;
|
|
74
|
+
const ENCODING = "linear16";
|
|
75
|
+
const DEEPGRAM_WS_URL = "wss://api.deepgram.com/v1/listen";
|
|
76
|
+
const KEEPALIVE_INTERVAL_MS = 8000;
|
|
77
|
+
const FINALIZE_SAFETY_TIMEOUT_MS = 5000;
|
|
78
|
+
const FINALIZE_NO_DATA_TIMEOUT_MS = 1500;
|
|
79
|
+
const MAX_RECORDING_SECS = 120; // 2 minutes safety cap (streaming is efficient)
|
|
80
|
+
|
|
68
81
|
const EXT_DIR = path.dirname(new URL(import.meta.url).pathname);
|
|
69
82
|
const PROJECT_ROOT = path.join(EXT_DIR, "..");
|
|
70
83
|
const DAEMON_SCRIPT = path.join(PROJECT_ROOT, "daemon.py");
|
|
@@ -74,7 +87,7 @@ function commandExists(cmd: string): boolean {
|
|
|
74
87
|
return spawnSync("which", [cmd], { stdio: "pipe", timeout: 3000 }).status === 0;
|
|
75
88
|
}
|
|
76
89
|
|
|
77
|
-
// ─── Daemon Communication
|
|
90
|
+
// ─── Daemon Communication (kept for non-deepgram local backends) ─────────────
|
|
78
91
|
|
|
79
92
|
let activeSocketPath = getSocketPath({
|
|
80
93
|
scope: DEFAULT_CONFIG.scope,
|
|
@@ -135,8 +148,6 @@ async function isDaemonRunning(socketPath = activeSocketPath): Promise<boolean>
|
|
|
135
148
|
async function ensureDaemon(config: VoiceConfig): Promise<boolean> {
|
|
136
149
|
if (await isDaemonRunning(activeSocketPath)) {
|
|
137
150
|
const status = await daemonSend({ cmd: "status" }, 3000, activeSocketPath);
|
|
138
|
-
// When backend is 'auto', accept any loaded backend — the daemon already
|
|
139
|
-
// resolved 'auto' to a concrete backend, so we don't need to reload.
|
|
140
151
|
if (config.backend === "auto" || (status.backend === config.backend && status.model === config.model)) return true;
|
|
141
152
|
const reloaded = await daemonSend({
|
|
142
153
|
cmd: "load",
|
|
@@ -175,7 +186,6 @@ async function ensureDaemon(config: VoiceConfig): Promise<boolean> {
|
|
|
175
186
|
|
|
176
187
|
proc.on("error", () => resolve(false));
|
|
177
188
|
|
|
178
|
-
// Timeout: if daemon doesn't start in 10s, kill orphan and fall back
|
|
179
189
|
setTimeout(() => {
|
|
180
190
|
if (!started) {
|
|
181
191
|
try { proc.kill(); } catch {}
|
|
@@ -185,46 +195,40 @@ async function ensureDaemon(config: VoiceConfig): Promise<boolean> {
|
|
|
185
195
|
});
|
|
186
196
|
}
|
|
187
197
|
|
|
188
|
-
// ───
|
|
198
|
+
// ─── Legacy file-based transcription (for non-deepgram backends) ─────────────
|
|
189
199
|
|
|
190
|
-
let
|
|
200
|
+
let legacyRecProcess: ChildProcess | null = null;
|
|
191
201
|
|
|
192
|
-
function
|
|
193
|
-
if (
|
|
194
|
-
|
|
195
|
-
|
|
202
|
+
/**
 * Start a SoX `rec` process that records the microphone into `outPath`
 * (legacy file-based path used by the non-streaming backends).
 *
 * A recorder that is still tracked in `legacyRecProcess` is sent SIGTERM first.
 *
 * @param outPath Destination audio file handed to `rec`.
 * @returns `false` when the `rec` binary cannot be found, `true` once the
 *          recorder has been spawned.
 */
function startLegacyRecordingToFile(outPath: string): boolean {
  if (legacyRecProcess) {
    legacyRecProcess.kill("SIGTERM");
    legacyRecProcess = null;
  }
  if (!commandExists("rec")) return false;

  const proc = spawn("rec", [
    "-q", "-r", String(SAMPLE_RATE), "-c", "1", "-b", "16", outPath,
  ], { stdio: ["pipe", "pipe", "pipe"] });
  legacyRecProcess = proc;

  // Discard whatever SoX prints on stderr.
  proc.stderr?.on("data", () => {});

  // Only forget the recorder if it is still the one that failed; a newer
  // recording may already have replaced it by the time this event fires.
  proc.on("error", () => {
    if (legacyRecProcess === proc) legacyRecProcess = null;
  });
  return true;
}
|
|
208
215
|
|
|
209
|
-
function
|
|
216
|
+
/**
 * Stop the legacy file recorder and wait for it to exit.
 *
 * Sends SIGTERM, then resolves when the process emits `close`. If it has not
 * closed within 2 seconds it is sent SIGKILL and the promise resolves anyway.
 * Resolves immediately when no recorder is running.
 */
function stopLegacyRecording(): Promise<void> {
  return new Promise((resolve) => {
    // Work on the process that is current *now*; the shared variable may point
    // at a different recorder by the time the timer below fires.
    const proc = legacyRecProcess;
    if (!proc) { resolve(); return; }

    const release = () => {
      if (legacyRecProcess === proc) legacyRecProcess = null;
    };

    const forceKillTimer = setTimeout(() => {
      try { proc.kill("SIGKILL"); } catch {}
      release();
      resolve();
    }, 2000);

    proc.once("close", () => {
      // Cancel the escalation so it cannot touch a later recording.
      clearTimeout(forceKillTimer);
      release();
      resolve();
    });

    proc.kill("SIGTERM");
  });
}
|
|
220
227
|
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
async function transcribeAudio(
|
|
228
|
+
async function transcribeAudioFile(
|
|
224
229
|
audioPath: string,
|
|
225
230
|
config: VoiceConfig,
|
|
226
231
|
): Promise<{ text: string; duration: number; error?: string }> {
|
|
227
|
-
// Try daemon first
|
|
228
232
|
if (await isDaemonRunning()) {
|
|
229
233
|
const resp = await daemonSend({
|
|
230
234
|
cmd: "transcribe",
|
|
@@ -238,13 +242,10 @@ async function transcribeAudio(
|
|
|
238
242
|
return resp as { text: string; duration: number };
|
|
239
243
|
}
|
|
240
244
|
}
|
|
241
|
-
|
|
242
|
-
// Fallback: direct subprocess
|
|
243
245
|
return new Promise((resolve) => {
|
|
244
246
|
const args = [TRANSCRIBE_SCRIPT, "--language", config.language, audioPath];
|
|
245
247
|
if (config.backend !== "auto") args.splice(1, 0, "--backend", config.backend);
|
|
246
248
|
if (config.model) args.splice(1, 0, "--model", config.model);
|
|
247
|
-
|
|
248
249
|
const proc = spawn("python3", args, { stdio: ["pipe", "pipe", "pipe"] });
|
|
249
250
|
let stdout = "";
|
|
250
251
|
let stderr = "";
|
|
@@ -258,6 +259,250 @@ async function transcribeAudio(
|
|
|
258
259
|
});
|
|
259
260
|
}
|
|
260
261
|
|
|
262
|
+
// ─── Deepgram WebSocket Streaming ────────────────────────────────────────────
|
|
263
|
+
|
|
264
|
+
/** State shared by the functions that run one Deepgram streaming transcription. */
interface StreamingSession {
  /** WebSocket the audio chunks and control messages are sent over. */
  ws: WebSocket;
  /** SoX `rec` child process producing the audio. */
  recProcess: ChildProcess;
  /** Current interim (partial) transcript. */
  interimText: string;
  /** All finalized transcript segments. */
  finalizedParts: string[];
  /** Handle of the periodic KeepAlive sender, or `null` when none is running. */
  keepAliveTimer: ReturnType<typeof setInterval> | null;
  /** Set once the session has been finalized. */
  closed: boolean;
  /** Called with the current interim text and the finalized segments after each result. */
  onTranscript: (interim: string, finals: string[]) => void;
  /** Called with the complete transcript when the session is finalized. */
  onDone: (fullText: string) => void;
  /** Called with a human-readable message when something goes wrong. */
  onError: (err: string) => void;
}
|
|
275
|
+
|
|
276
|
+
/**
 * Look up the Deepgram API key.
 *
 * Priority: `DEEPGRAM_API_KEY` environment variable → `config.deepgramApiKey`
 * (only when a config is passed) → `null`. Empty strings count as "not set",
 * which is why `||` is used rather than `??`.
 *
 * @param config Optional settings object to fall back to when the environment
 *               variable is missing. Omitting it keeps the env-only lookup.
 * @returns The key, or `null` when none is configured.
 */
function getDeepgramApiKey(config?: { deepgramApiKey?: string }): string | null {
  return process.env.DEEPGRAM_API_KEY || config?.deepgramApiKey || null;
}
|
|
280
|
+
|
|
281
|
+
/**
 * Pick the Deepgram API key to use for this config.
 *
 * Sources, in order of precedence:
 *   1. `process.env.DEEPGRAM_API_KEY`
 *   2. `config.deepgramApiKey`
 *
 * An empty value in either place is treated as missing.
 *
 * @returns The first non-empty key, or `null` when neither source has one.
 */
function resolveDeepgramApiKey(config: VoiceConfig): string | null {
  const envKey = process.env.DEEPGRAM_API_KEY;
  if (envKey) return envKey;

  const storedKey = config.deepgramApiKey;
  return storedKey ? storedKey : null;
}
|
|
289
|
+
|
|
290
|
+
/**
 * Decide whether recordings should go through the Deepgram streaming path.
 *
 * Streaming needs an API key, and is used when the backend is "deepgram"
 * or when it is "auto" (a key being available is already established by then).
 */
function isDeepgramStreaming(config: VoiceConfig): boolean {
  const hasKey = Boolean(resolveDeepgramApiKey(config));
  if (!hasKey) return false;

  return config.backend === "deepgram" || config.backend === "auto";
}
|
|
296
|
+
|
|
297
|
+
/**
 * Build the Deepgram listen URL, with the audio format and recognition
 * options encoded as query parameters.
 *
 * Language falls back to "en" and model to "nova-3" when the config leaves
 * them empty.
 */
function buildDeepgramWsUrl(config: VoiceConfig): string {
  const query = new URLSearchParams();

  // Audio format — must describe what the recorder actually emits.
  query.set("encoding", ENCODING);
  query.set("sample_rate", String(SAMPLE_RATE));
  query.set("channels", String(CHANNELS));

  // Segmentation.
  query.set("endpointing", "300"); // ms of silence before phrase boundary
  query.set("utterance_end_ms", "1000"); // ms of silence before utterance is complete

  // Recognition options.
  query.set("language", config.language || "en");
  // NOTE(review): in "auto" mode config.model may still hold a local-backend
  // model name — confirm such a value is acceptable to Deepgram.
  query.set("model", config.model || "nova-3");
  query.set("smart_format", "true");
  query.set("interim_results", "true");

  return DEEPGRAM_WS_URL + "?" + query.toString();
}
|
|
311
|
+
|
|
312
|
+
/**
 * Start a live transcription: spawn SoX `rec` writing raw PCM to stdout and
 * stream that audio to Deepgram over a WebSocket.
 *
 * Once a session has been returned, failures (socket error, Deepgram error
 * message, recorder error) tear the session down — socket closed, recorder
 * killed, keep-alive stopped — and are reported through `onError` exactly
 * once; `onDone` is not called for a session that failed.
 *
 * @param config    Voice settings (API key source, language, model).
 * @param callbacks `onTranscript` receives live updates, `onDone` the final
 *                  text (via `finalizeSession`), `onError` a failure message.
 * @returns The session, or `null` when it could not be started (`onError` has
 *          already been called in that case).
 */
function startStreamingSession(
  config: VoiceConfig,
  callbacks: {
    onTranscript: (interim: string, finals: string[]) => void;
    onDone: (fullText: string) => void;
    onError: (err: string) => void;
  },
): StreamingSession | null {
  const apiKey = resolveDeepgramApiKey(config);
  if (!apiKey) {
    callbacks.onError("DEEPGRAM_API_KEY not set");
    return null;
  }

  if (!commandExists("rec")) {
    callbacks.onError("Voice requires SoX. Install: brew install sox");
    return null;
  }

  // Start SoX streaming raw PCM to stdout (no file)
  const recProc = spawn("rec", [
    "-q",
    "-r", String(SAMPLE_RATE),
    "-c", String(CHANNELS),
    "-b", "16",
    "-e", "signed-integer",
    "-t", "raw",
    "-", // output to stdout
  ], { stdio: ["pipe", "pipe", "pipe"] });

  recProc.stderr?.on("data", () => {}); // suppress SoX warnings

  // Connect WebSocket to Deepgram. If construction throws, the recorder that
  // was just spawned must not be left running.
  let ws: WebSocket;
  try {
    ws = new WebSocket(buildDeepgramWsUrl(config), {
      headers: {
        "Authorization": `Token ${apiKey}`,
      },
    } as any);
  } catch (err) {
    recProc.on("error", () => {}); // avoid an unhandled 'error' event after we bail out
    try { recProc.kill("SIGKILL"); } catch {}
    const detail = err instanceof Error ? err.message : String(err);
    callbacks.onError(`WebSocket connection error: ${detail}`);
    return null;
  }

  const session: StreamingSession = {
    ws,
    recProcess: recProc,
    interimText: "",
    finalizedParts: [],
    keepAliveTimer: null,
    closed: false,
    onTranscript: callbacks.onTranscript,
    onDone: callbacks.onDone,
    onError: callbacks.onError,
  };

  // Single failure path: release every resource, then report once.
  const fail = (message: string): void => {
    if (session.closed) return;
    session.closed = true;
    if (session.keepAliveTimer) {
      clearInterval(session.keepAliveTimer);
      session.keepAliveTimer = null;
    }
    try { ws.close(); } catch {}
    try { recProc.kill("SIGKILL"); } catch {}
    session.onError(message);
  };

  ws.onopen = () => {
    // The session may already have ended while the socket was connecting.
    if (session.closed) {
      try { ws.close(); } catch {}
      return;
    }

    // Send initial KeepAlive
    try { ws.send(JSON.stringify({ type: "KeepAlive" })); } catch {}

    // Start keepalive timer
    session.keepAliveTimer = setInterval(() => {
      if (ws.readyState === WebSocket.OPEN) {
        try { ws.send(JSON.stringify({ type: "KeepAlive" })); } catch {}
      }
    }, KEEPALIVE_INTERVAL_MS);

    // Pipe SoX stdout → WebSocket as binary frames
    recProc.stdout?.on("data", (chunk: Buffer) => {
      if (ws.readyState === WebSocket.OPEN) {
        try { ws.send(chunk); } catch {}
      }
    });
  };

  ws.onmessage = (event: MessageEvent) => {
    // Ignore anything that arrives after the session ended, so a late result
    // cannot trigger another onTranscript once onDone/onError has run.
    if (session.closed) return;
    try {
      // Only text frames are parsed.
      if (typeof event.data !== "string") return;
      const msg = JSON.parse(event.data) as {
        type?: string;
        is_final?: boolean;
        channel?: { alternatives?: Array<{ transcript?: string }> };
        message?: string;
        description?: string;
      } | null;
      if (!msg) return;

      if (msg.type === "Results") {
        const transcript = msg.channel?.alternatives?.[0]?.transcript || "";

        if (msg.is_final) {
          // Final result for this audio segment
          const text = transcript.trim();
          if (text) session.finalizedParts.push(text);
          session.interimText = "";
        } else {
          // Interim result — live update
          session.interimText = transcript;
        }

        session.onTranscript(session.interimText, session.finalizedParts);
      } else if (msg.type === "Error" || msg.type === "error") {
        fail(msg.message || msg.description || "Deepgram error");
      }
      // "Metadata" and "UtteranceEnd" need no handling: finalization is
      // driven entirely by is_final on "Results".
    } catch {
      // Best-effort: a malformed frame must not break the stream.
    }
  };

  ws.onerror = () => {
    fail("WebSocket connection error");
  };

  ws.onclose = () => {
    if (!session.closed) {
      finalizeSession(session);
    }
  };

  recProc.on("error", (err) => {
    fail(`SoX error: ${err.message}`);
  });

  recProc.on("close", () => {
    // SoX stopped — send CloseStream to Deepgram
    if (ws.readyState === WebSocket.OPEN) {
      try { ws.send(JSON.stringify({ type: "CloseStream" })); } catch {}
    }
  });

  return session;
}
|
|
448
|
+
|
|
449
|
+
/**
 * Ask a running session to wind down: stop the microphone, send CloseStream,
 * then finalize once Deepgram has gone quiet.
 *
 * Finalization happens at the first of:
 *   - no message received for FINALIZE_NO_DATA_TIMEOUT_MS (polled every 500 ms),
 *   - FINALIZE_SAFETY_TIMEOUT_MS elapsed since this call.
 * If the session is finalized elsewhere in the meantime, the timers started
 * here notice and cancel themselves.
 *
 * Does nothing when the session is already closed.
 */
function stopStreamingSession(session: StreamingSession): void {
  if (session.closed) return;

  // Stop the microphone
  try { session.recProcess.kill("SIGTERM"); } catch {}

  // CloseStream tells Deepgram to flush remaining audio
  if (session.ws.readyState === WebSocket.OPEN) {
    try { session.ws.send(JSON.stringify({ type: "CloseStream" })); } catch {}
  }

  // Track when the last message arrived, keeping the existing handler.
  let lastDataTime = Date.now();
  const origOnMessage = session.ws.onmessage;
  session.ws.onmessage = (event: MessageEvent) => {
    lastDataTime = Date.now();
    if (origOnMessage) origOnMessage.call(session.ws, event);
  };

  let safetyTimer: ReturnType<typeof setTimeout> | null = null;
  let noDataCheck: ReturnType<typeof setInterval> | null = null;

  // Cancel both timers together so neither outlives the session.
  const finish = (): void => {
    if (safetyTimer) { clearTimeout(safetyTimer); safetyTimer = null; }
    if (noDataCheck) { clearInterval(noDataCheck); noDataCheck = null; }
    if (!session.closed) {
      finalizeSession(session);
    }
  };

  // Safety: finalize after timeout even if Deepgram doesn't respond
  safetyTimer = setTimeout(finish, FINALIZE_SAFETY_TIMEOUT_MS);

  // Shorter timeout: if no new data arrives for a while, assume done
  noDataCheck = setInterval(() => {
    const quiet = Date.now() - lastDataTime > FINALIZE_NO_DATA_TIMEOUT_MS;
    if (session.closed || quiet) finish();
  }, 500);
}
|
|
484
|
+
|
|
485
|
+
/**
 * End a session and deliver its transcript through `onDone`.
 *
 * Marks the session closed, stops the keep-alive timer, closes the socket and
 * kills the recorder. The delivered text is every finalized segment joined by
 * spaces, followed by any interim text that was still pending — without it the
 * last words would be lost when finalization happens before the matching final
 * result arrives.
 *
 * Safe to call more than once; only the first call has any effect.
 */
function finalizeSession(session: StreamingSession): void {
  if (session.closed) return;
  session.closed = true;

  // Clean up keepalive
  if (session.keepAliveTimer) {
    clearInterval(session.keepAliveTimer);
    session.keepAliveTimer = null;
  }

  // Close WebSocket
  try { session.ws.close(); } catch {}

  // Kill SoX if still running
  try { session.recProcess.kill("SIGKILL"); } catch {}

  // Deliver final transcript (finalized segments + pending interim tail)
  const parts = [...session.finalizedParts];
  const pending = session.interimText.trim();
  if (pending) parts.push(pending);
  session.onDone(parts.join(" ").trim());
}
|
|
505
|
+
|
|
261
506
|
// ─── Extension ───────────────────────────────────────────────────────────────
|
|
262
507
|
|
|
263
508
|
export default function (pi: ExtensionAPI) {
|
|
@@ -272,6 +517,10 @@ export default function (pi: ExtensionAPI) {
|
|
|
272
517
|
let terminalInputUnsub: (() => void) | null = null;
|
|
273
518
|
let isHolding = false;
|
|
274
519
|
|
|
520
|
+
// Streaming session state
|
|
521
|
+
let activeSession: StreamingSession | null = null;
|
|
522
|
+
let currentTarget: "editor" | "btw" = "editor";
|
|
523
|
+
|
|
275
524
|
// ─── BTW State ───────────────────────────────────────────────────────────
|
|
276
525
|
|
|
277
526
|
let btwThread: BtwExchange[] = [];
|
|
@@ -289,17 +538,19 @@ export default function (pi: ExtensionAPI) {
|
|
|
289
538
|
}
|
|
290
539
|
const modeTag = !config.onboarding.completed
|
|
291
540
|
? "SETUP"
|
|
292
|
-
: config
|
|
293
|
-
? "
|
|
294
|
-
: config.mode === "
|
|
295
|
-
? "
|
|
296
|
-
: "
|
|
541
|
+
: isDeepgramStreaming(config)
|
|
542
|
+
? "STREAM"
|
|
543
|
+
: config.mode === "api"
|
|
544
|
+
? "API"
|
|
545
|
+
: config.mode === "local"
|
|
546
|
+
? "LOCAL"
|
|
547
|
+
: "AUTO";
|
|
297
548
|
ctx.ui.setStatus("voice", `MIC ${modeTag}`);
|
|
298
549
|
break;
|
|
299
550
|
}
|
|
300
551
|
case "recording": {
|
|
301
552
|
const secs = Math.round((Date.now() - recordingStart) / 1000);
|
|
302
|
-
ctx.ui.setStatus("voice",
|
|
553
|
+
ctx.ui.setStatus("voice", `🔴 REC ${secs}s`);
|
|
303
554
|
break;
|
|
304
555
|
}
|
|
305
556
|
case "transcribing":
|
|
@@ -315,7 +566,11 @@ export default function (pi: ExtensionAPI) {
|
|
|
315
566
|
|
|
316
567
|
function voiceCleanup() {
|
|
317
568
|
if (statusTimer) { clearInterval(statusTimer); statusTimer = null; }
|
|
318
|
-
if (
|
|
569
|
+
if (activeSession) {
|
|
570
|
+
finalizeSession(activeSession);
|
|
571
|
+
activeSession = null;
|
|
572
|
+
}
|
|
573
|
+
if (legacyRecProcess) { legacyRecProcess.kill("SIGTERM"); legacyRecProcess = null; }
|
|
319
574
|
if (tempFile) { try { fs.unlinkSync(tempFile); } catch {} tempFile = null; }
|
|
320
575
|
isHolding = false;
|
|
321
576
|
setVoiceState("idle");
|
|
@@ -332,7 +587,7 @@ export default function (pi: ExtensionAPI) {
|
|
|
332
587
|
const diagnostics = scanEnvironment(TRANSCRIBE_SCRIPT);
|
|
333
588
|
const provisioningPlan = buildProvisioningPlan(nextConfig, diagnostics);
|
|
334
589
|
let validated = provisioningPlan.ready;
|
|
335
|
-
if (validated && nextConfig.enabled) {
|
|
590
|
+
if (validated && nextConfig.enabled && !isDeepgramStreaming(nextConfig)) {
|
|
336
591
|
validated = await ensureDaemon(nextConfig);
|
|
337
592
|
}
|
|
338
593
|
|
|
@@ -349,53 +604,173 @@ export default function (pi: ExtensionAPI) {
|
|
|
349
604
|
].join("\n"), validated ? "info" : "warning");
|
|
350
605
|
}
|
|
351
606
|
|
|
352
|
-
// ───
|
|
607
|
+
// ─── Live Transcript Widget ──────────────────────────────────────────────
|
|
353
608
|
|
|
354
|
-
|
|
609
|
+
/**
 * Render the in-progress transcript in the "voice-recording" widget above the
 * editor. Shows a "Listening..." placeholder while there is no text yet,
 * otherwise the word-wrapped transcript (last 4 lines), with a cursor mark
 * appended when interim text is present.
 */
function updateLiveTranscriptWidget(interim: string, finals: string[]) {
  if (!ctx?.hasUI) return;

  const WRAP_AT = 70;
  const MAX_VISIBLE = 4;

  const finalized = finals.join(" ");
  // Finalized text first, then the interim tail, separated by one space.
  const displayText = [finalized, interim].filter(Boolean).join(" ");

  if (!displayText.trim()) {
    ctx.ui.setWidget("voice-recording", [
      " 🎙 Listening... (speak now)",
    ], { placement: "aboveEditor" });
    return;
  }

  // Greedy word wrap; the first line carries the microphone prefix.
  const wrapped: string[] = [];
  let row = " 🎙 ";
  for (const token of displayText.split(" ")) {
    const fits = (row + token).length <= WRAP_AT;
    if (fits) {
      row += token + " ";
    } else {
      wrapped.push(row);
      row = " " + token + " ";
    }
  }
  if (row.trim()) wrapped.push(row);

  // Keep only the tail so the widget does not overflow.
  const shown = wrapped.slice(-MAX_VISIBLE);
  if (interim) {
    // Mark the end of the still-changing text with a cursor.
    const tail = shown.length - 1;
    shown[tail] = shown[tail].trimEnd() + "▍";
  }

  ctx.ui.setWidget("voice-recording", shown, { placement: "aboveEditor" });
}
|
|
648
|
+
|
|
649
|
+
// ─── Voice: Start / Stop (Streaming or Legacy) ───────────────────────────
|
|
650
|
+
|
|
651
|
+
/**
 * Begin a voice recording, either streaming to Deepgram or recording to a
 * temporary file for the legacy backends, depending on
 * `isDeepgramStreaming(config)`.
 *
 * While recording, a one-second timer refreshes the status line and stops the
 * recording automatically after MAX_RECORDING_SECS.
 *
 * @param target Where the transcript goes: the editor, or a /btw side message.
 * @returns `true` when recording started; `false` when voice is not idle,
 *          there is no context, or the recorder/stream could not be started.
 */
async function startVoiceRecording(target: "editor" | "btw" = "editor"): Promise<boolean> {
  if (voiceState !== "idle" || !ctx) return false;

  currentTarget = target;
  recordingStart = Date.now();

  const clearStatusTimer = (): void => {
    if (statusTimer) { clearInterval(statusTimer); statusTimer = null; }
  };

  // Shared by both paths: elapsed-time display plus the safety cap.
  const startStatusTimer = (): void => {
    clearStatusTimer();
    statusTimer = setInterval(() => {
      if (voiceState !== "recording") return;
      updateVoiceStatus();
      const elapsed = (Date.now() - recordingStart) / 1000;
      if (elapsed >= MAX_RECORDING_SECS) {
        isHolding = false;
        stopVoiceRecording(target);
      }
    }, 1000);
  };

  if (isDeepgramStreaming(config)) {
    // === STREAMING PATH === (Deepgram WebSocket)
    setVoiceState("recording");

    // The session reports its end through onDone or onError; accept only the
    // first of them so a late callback cannot disturb a newer recording.
    let ended = false;

    const session = startStreamingSession(config, {
      onTranscript: (interim, finals) => {
        if (ended) return;
        updateLiveTranscriptWidget(interim, finals);
        updateVoiceStatus();
      },
      onDone: (fullText) => {
        if (ended) return;
        ended = true;
        activeSession = null;
        // The stream can end without stopVoiceRecording() having run
        // (e.g. the socket closes), so the timer is released here as well.
        clearStatusTimer();
        ctx?.ui.setWidget("voice-recording", undefined);

        if (!fullText.trim()) {
          ctx?.ui.notify("No speech detected.", "warning");
          setVoiceState("idle");
          return;
        }

        if (target === "btw") {
          handleBtw(fullText);
        } else if (ctx?.hasUI) {
          const existing = ctx.ui.getEditorText();
          ctx.ui.setEditorText(existing ? existing + " " + fullText : fullText);
          const elapsed = ((Date.now() - recordingStart) / 1000).toFixed(1);
          ctx.ui.notify(
            `STT (${elapsed}s): ${fullText.slice(0, 80)}${fullText.length > 80 ? "..." : ""}`,
            "info",
          );
        }
        setVoiceState("idle");
      },
      onError: (err) => {
        if (ended) return;
        ended = true;
        activeSession = null;
        clearStatusTimer();
        ctx?.ui.setWidget("voice-recording", undefined);
        ctx?.ui.notify(`STT error: ${err}`, "error");
        setVoiceState("idle");
      },
    });

    if (!session) {
      setVoiceState("idle");
      return false;
    }

    activeSession = session;

    // Status timer for elapsed time
    startStatusTimer();

    if (ctx.hasUI) {
      ctx.ui.setWidget("voice-recording", [
        " 🎙 Listening... speak now — press SPACE again to stop",
      ], { placement: "aboveEditor" });
    }
    return true;
  }

  // === LEGACY PATH === (file-based for local backends)
  tempFile = path.join(os.tmpdir(), `pi-voice-${Date.now()}.wav`);
  if (!startLegacyRecordingToFile(tempFile)) {
    // Nothing was recorded, so do not leave a path behind.
    tempFile = null;
    ctx.ui.notify("Voice requires SoX. Install: brew install sox", "error");
    return false;
  }

  setVoiceState("recording");
  startStatusTimer();

  if (ctx.hasUI) {
    ctx.ui.setWidget("voice-recording", [
      target === "btw"
        ? " 🎙 BTW Recording... Ctrl+Shift+V to stop"
        : " 🎙 Recording... Ctrl+Shift+V to stop (or release SPACE)",
    ], { placement: "aboveEditor" });
  }
  return true;
}
|
|
388
755
|
|
|
389
756
|
async function stopVoiceRecording(target: "editor" | "btw" = "editor") {
|
|
390
757
|
if (voiceState !== "recording" || !ctx) return;
|
|
391
758
|
if (statusTimer) { clearInterval(statusTimer); statusTimer = null; }
|
|
392
759
|
|
|
760
|
+
if (activeSession) {
|
|
761
|
+
// === STREAMING PATH === Stop the stream, finalize will call onDone
|
|
762
|
+
setVoiceState("transcribing");
|
|
763
|
+
stopStreamingSession(activeSession);
|
|
764
|
+
return;
|
|
765
|
+
}
|
|
766
|
+
|
|
767
|
+
// === LEGACY PATH ===
|
|
393
768
|
const elapsed = ((Date.now() - recordingStart) / 1000).toFixed(1);
|
|
394
|
-
const audioFile = tempFile;
|
|
769
|
+
const audioFile = tempFile;
|
|
395
770
|
setVoiceState("transcribing");
|
|
396
771
|
ctx.ui.setWidget("voice-recording", undefined);
|
|
397
772
|
|
|
398
|
-
await
|
|
773
|
+
await stopLegacyRecording();
|
|
399
774
|
|
|
400
775
|
if (!audioFile || !fs.existsSync(audioFile)) {
|
|
401
776
|
ctx.ui.notify("No audio recorded.", "warning");
|
|
@@ -412,12 +787,9 @@ export default function (pi: ExtensionAPI) {
|
|
|
412
787
|
return;
|
|
413
788
|
}
|
|
414
789
|
|
|
415
|
-
// Ensure daemon is up before transcribing — await so the warm path
|
|
416
|
-
// is available for this request instead of falling through to the
|
|
417
|
-
// cold subprocess fallback.
|
|
418
790
|
await ensureDaemon(config).catch(() => {});
|
|
419
791
|
|
|
420
|
-
const result = await
|
|
792
|
+
const result = await transcribeAudioFile(audioFile, config);
|
|
421
793
|
try { fs.unlinkSync(audioFile); } catch {}
|
|
422
794
|
if (tempFile === audioFile) tempFile = null;
|
|
423
795
|
|
|
@@ -437,7 +809,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
437
809
|
if (target === "btw") {
|
|
438
810
|
await handleBtw(transcript);
|
|
439
811
|
} else {
|
|
440
|
-
// Inject into editor
|
|
441
812
|
if (ctx.hasUI) {
|
|
442
813
|
const existing = ctx.ui.getEditorText();
|
|
443
814
|
ctx.ui.setEditorText(existing ? existing + " " + transcript : transcript);
|
|
@@ -451,25 +822,38 @@ export default function (pi: ExtensionAPI) {
|
|
|
451
822
|
setVoiceState("idle");
|
|
452
823
|
}
|
|
453
824
|
|
|
454
|
-
// ─── Hold-to-talk
|
|
825
|
+
// ─── Hold-to-talk / Toggle-to-talk ──────────────────────────────────────
|
|
826
|
+
//
|
|
827
|
+
// Kitty protocol terminals (Ghostty, WezTerm, Kitty) send key-release
|
|
828
|
+
// events (":3u" sequences), enabling true hold-to-talk.
|
|
829
|
+
//
|
|
830
|
+
// Non-Kitty terminals (Apple Terminal, iTerm2 without config, basic xterm)
|
|
831
|
+
// only send key-press. We detect this and fall back to toggle:
|
|
832
|
+
// 1st SPACE press → start recording
|
|
833
|
+
// 2nd SPACE press → stop recording + transcribe
|
|
834
|
+
//
|
|
835
|
+
// We auto-detect Kitty support: if we see a key-release within the first
|
|
836
|
+
// recording, we know hold-to-talk works. Otherwise, we stay in toggle mode.
|
|
837
|
+
|
|
838
|
+
let kittyReleaseDetected = false; // have we ever seen a Kitty release event?
|
|
455
839
|
|
|
456
840
|
function setupHoldToTalk() {
|
|
457
841
|
if (!ctx?.hasUI) return;
|
|
458
842
|
|
|
459
|
-
// Remove previous listener
|
|
460
843
|
if (terminalInputUnsub) { terminalInputUnsub(); terminalInputUnsub = null; }
|
|
461
844
|
|
|
462
845
|
terminalInputUnsub = ctx.ui.onTerminalInput((data: string) => {
|
|
463
846
|
if (!config.enabled) return undefined;
|
|
464
847
|
|
|
465
|
-
//
|
|
848
|
+
// ── SPACE handling ──
|
|
466
849
|
if (matchesKey(data, "space")) {
|
|
467
|
-
// Only activate when editor is empty (avoid conflicting with typing)
|
|
468
850
|
const editorText = ctx?.hasUI ? ctx.ui.getEditorText() : "";
|
|
469
851
|
if (editorText && editorText.trim().length > 0) return undefined;
|
|
470
852
|
|
|
853
|
+
// Kitty key-release: stop recording
|
|
471
854
|
if (isKeyRelease(data)) {
|
|
472
|
-
|
|
855
|
+
kittyReleaseDetected = true;
|
|
856
|
+
if (isHolding && voiceState === "recording") {
|
|
473
857
|
isHolding = false;
|
|
474
858
|
stopVoiceRecording("editor");
|
|
475
859
|
return { consume: true };
|
|
@@ -477,12 +861,27 @@ export default function (pi: ExtensionAPI) {
|
|
|
477
861
|
return undefined;
|
|
478
862
|
}
|
|
479
863
|
|
|
864
|
+
// Kitty key-repeat: suppress while holding
|
|
480
865
|
if (isKeyRepeat(data)) {
|
|
481
866
|
if (isHolding) return { consume: true };
|
|
482
867
|
return undefined;
|
|
483
868
|
}
|
|
484
869
|
|
|
485
|
-
// Key
|
|
870
|
+
// === Key PRESS ===
|
|
871
|
+
|
|
872
|
+
// Currently recording? → this is the "stop" press (toggle mode)
|
|
873
|
+
if (voiceState === "recording") {
|
|
874
|
+
isHolding = false;
|
|
875
|
+
stopVoiceRecording("editor");
|
|
876
|
+
return { consume: true };
|
|
877
|
+
}
|
|
878
|
+
|
|
879
|
+
// Currently transcribing? → ignore, wait for it to finish
|
|
880
|
+
if (voiceState === "transcribing") {
|
|
881
|
+
return { consume: true };
|
|
882
|
+
}
|
|
883
|
+
|
|
884
|
+
// Idle → start recording
|
|
486
885
|
if (voiceState === "idle" && !isHolding) {
|
|
487
886
|
isHolding = true;
|
|
488
887
|
startVoiceRecording("editor").then((ok) => {
|
|
@@ -495,10 +894,11 @@ export default function (pi: ExtensionAPI) {
|
|
|
495
894
|
return undefined;
|
|
496
895
|
}
|
|
497
896
|
|
|
498
|
-
//
|
|
897
|
+
// ── Ctrl+Shift+B handling (BTW voice) ──
|
|
499
898
|
if (matchesKey(data, "ctrl+shift+b")) {
|
|
500
899
|
if (isKeyRelease(data)) {
|
|
501
|
-
|
|
900
|
+
kittyReleaseDetected = true;
|
|
901
|
+
if (isHolding && voiceState === "recording") {
|
|
502
902
|
isHolding = false;
|
|
503
903
|
stopVoiceRecording("btw");
|
|
504
904
|
return { consume: true };
|
|
@@ -511,6 +911,13 @@ export default function (pi: ExtensionAPI) {
|
|
|
511
911
|
return undefined;
|
|
512
912
|
}
|
|
513
913
|
|
|
914
|
+
// Toggle: stop if recording
|
|
915
|
+
if (voiceState === "recording") {
|
|
916
|
+
isHolding = false;
|
|
917
|
+
stopVoiceRecording("btw");
|
|
918
|
+
return { consume: true };
|
|
919
|
+
}
|
|
920
|
+
|
|
514
921
|
if (voiceState === "idle" && !isHolding) {
|
|
515
922
|
isHolding = true;
|
|
516
923
|
startVoiceRecording("btw").then((ok) => {
|
|
@@ -523,12 +930,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
523
930
|
return undefined;
|
|
524
931
|
}
|
|
525
932
|
|
|
526
|
-
// Any other key while holding = cancel
|
|
527
|
-
if (isHolding && voiceState === "recording") {
|
|
528
|
-
// Don't cancel on modifier-only events
|
|
529
|
-
return undefined;
|
|
530
|
-
}
|
|
531
|
-
|
|
532
933
|
return undefined;
|
|
533
934
|
});
|
|
534
935
|
}
|
|
@@ -536,7 +937,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
536
937
|
// ─── BTW: Side Conversations ─────────────────────────────────────────────
|
|
537
938
|
|
|
538
939
|
function buildBtwContext(): string {
|
|
539
|
-
// Build context from main session + btw thread
|
|
540
940
|
const systemPrompt = ctx?.getSystemPrompt() ?? "";
|
|
541
941
|
let btwContext = "You are a helpful side-channel assistant. ";
|
|
542
942
|
btwContext += "The user is having a parallel conversation while their main Pi agent works. ";
|
|
@@ -570,7 +970,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
570
970
|
"",
|
|
571
971
|
];
|
|
572
972
|
|
|
573
|
-
// Show last exchange
|
|
574
973
|
lines.push(` Q: ${last.question.slice(0, 100)}${last.question.length > 100 ? "..." : ""}`);
|
|
575
974
|
const answerLines = last.answer.split("\n");
|
|
576
975
|
for (const line of answerLines.slice(0, 8)) {
|
|
@@ -589,7 +988,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
589
988
|
|
|
590
989
|
btwWidgetVisible = true;
|
|
591
990
|
|
|
592
|
-
// Show thinking state
|
|
593
991
|
ctx.ui.setWidget("btw", [
|
|
594
992
|
" BTW",
|
|
595
993
|
"",
|
|
@@ -598,10 +996,8 @@ export default function (pi: ExtensionAPI) {
|
|
|
598
996
|
" Thinking...",
|
|
599
997
|
], { placement: "aboveEditor" });
|
|
600
998
|
|
|
601
|
-
// Build context for LLM
|
|
602
999
|
const btwContext = buildBtwContext();
|
|
603
1000
|
|
|
604
|
-
// Use the model registry to get current model
|
|
605
1001
|
const model = ctx.model;
|
|
606
1002
|
if (!model) {
|
|
607
1003
|
const exchange: BtwExchange = {
|
|
@@ -616,7 +1012,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
616
1012
|
}
|
|
617
1013
|
|
|
618
1014
|
try {
|
|
619
|
-
// Stream the response
|
|
620
1015
|
let answer = "";
|
|
621
1016
|
const eventStream = streamSimple(model, {
|
|
622
1017
|
systemPrompt: btwContext,
|
|
@@ -633,7 +1028,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
633
1028
|
break;
|
|
634
1029
|
}
|
|
635
1030
|
|
|
636
|
-
// Update widget with streaming response
|
|
637
1031
|
const displayLines: string[] = [
|
|
638
1032
|
` BTW`,
|
|
639
1033
|
"",
|
|
@@ -657,7 +1051,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
657
1051
|
pi.appendEntry("btw", exchange);
|
|
658
1052
|
updateBtwWidget();
|
|
659
1053
|
} catch (err: any) {
|
|
660
|
-
// Fallback: send as a follow-up message to the main agent
|
|
661
1054
|
const exchange: BtwExchange = {
|
|
662
1055
|
question: message,
|
|
663
1056
|
answer: `(BTW streaming failed: ${err.message}. Falling back to sendUserMessage.)`,
|
|
@@ -667,7 +1060,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
667
1060
|
pi.appendEntry("btw", exchange);
|
|
668
1061
|
updateBtwWidget();
|
|
669
1062
|
|
|
670
|
-
// Use sendUserMessage as alternative
|
|
671
1063
|
pi.sendUserMessage(
|
|
672
1064
|
`[BTW question]: ${message}`,
|
|
673
1065
|
{ deliverAs: "followUp" },
|
|
@@ -677,7 +1069,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
677
1069
|
|
|
678
1070
|
// ─── Shortcuts ───────────────────────────────────────────────────────────
|
|
679
1071
|
|
|
680
|
-
// Ctrl+Shift+V = toggle voice (fallback for non-Kitty terminals)
|
|
681
1072
|
pi.registerShortcut("ctrl+shift+v", {
|
|
682
1073
|
description: "Toggle voice recording (start/stop)",
|
|
683
1074
|
handler: async (handlerCtx) => {
|
|
@@ -705,12 +1096,42 @@ export default function (pi: ExtensionAPI) {
|
|
|
705
1096
|
configSource = loaded.source;
|
|
706
1097
|
updateSocketPath(config, currentCwd);
|
|
707
1098
|
|
|
708
|
-
//
|
|
709
|
-
//
|
|
1099
|
+
// Auto-capture DEEPGRAM_API_KEY from env into config if not already stored.
|
|
1100
|
+
// This ensures streaming works even when Pi is launched from a context
|
|
1101
|
+
// that doesn't source .zshrc (GUI app, tmux, etc.)
|
|
1102
|
+
if (process.env.DEEPGRAM_API_KEY && !config.deepgramApiKey) {
|
|
1103
|
+
config.deepgramApiKey = process.env.DEEPGRAM_API_KEY;
|
|
1104
|
+
if (configSource !== "default") {
|
|
1105
|
+
saveConfig(config, config.scope, currentCwd);
|
|
1106
|
+
}
|
|
1107
|
+
}
|
|
1108
|
+
|
|
1109
|
+
// Also try to load DEEPGRAM_API_KEY from shell if not in process.env and not in config
|
|
1110
|
+
if (!resolveDeepgramApiKey(config) && config.backend === "deepgram") {
|
|
1111
|
+
try {
|
|
1112
|
+
const result = spawnSync("zsh", ["-ic", "echo $DEEPGRAM_API_KEY"], {
|
|
1113
|
+
stdio: ["pipe", "pipe", "pipe"],
|
|
1114
|
+
timeout: 3000,
|
|
1115
|
+
env: { ...process.env, HOME: os.homedir() },
|
|
1116
|
+
});
|
|
1117
|
+
const shellKey = result.stdout?.toString().trim();
|
|
1118
|
+
if (shellKey && shellKey.length > 5) {
|
|
1119
|
+
config.deepgramApiKey = shellKey;
|
|
1120
|
+
process.env.DEEPGRAM_API_KEY = shellKey; // Also set for child processes
|
|
1121
|
+
if (configSource !== "default") {
|
|
1122
|
+
saveConfig(config, config.scope, currentCwd);
|
|
1123
|
+
}
|
|
1124
|
+
}
|
|
1125
|
+
} catch {}
|
|
1126
|
+
}
|
|
1127
|
+
|
|
710
1128
|
if (config.enabled && config.onboarding.completed) {
|
|
711
1129
|
updateVoiceStatus();
|
|
712
1130
|
setupHoldToTalk();
|
|
713
|
-
|
|
1131
|
+
// Only start daemon for non-streaming backends
|
|
1132
|
+
if (!isDeepgramStreaming(config)) {
|
|
1133
|
+
ensureDaemon(config).catch(() => {});
|
|
1134
|
+
}
|
|
714
1135
|
}
|
|
715
1136
|
});
|
|
716
1137
|
|
|
@@ -764,8 +1185,11 @@ export default function (pi: ExtensionAPI) {
|
|
|
764
1185
|
config.enabled = true;
|
|
765
1186
|
updateVoiceStatus();
|
|
766
1187
|
setupHoldToTalk();
|
|
767
|
-
|
|
768
|
-
|
|
1188
|
+
if (!isDeepgramStreaming(config)) {
|
|
1189
|
+
ensureDaemon(config).catch(() => {});
|
|
1190
|
+
}
|
|
1191
|
+
const mode = isDeepgramStreaming(config) ? "Deepgram streaming" : config.backend;
|
|
1192
|
+
cmdCtx.ui.notify(`Voice enabled (${mode}).\n Hold SPACE (empty editor) → release to transcribe\n Ctrl+Shift+V → toggle recording on/off\n Live transcription shown while speaking`, "info");
|
|
769
1193
|
return;
|
|
770
1194
|
}
|
|
771
1195
|
|
|
@@ -779,7 +1203,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
779
1203
|
}
|
|
780
1204
|
|
|
781
1205
|
if (sub === "stop") {
|
|
782
|
-
// Emergency stop — cancel any active recording
|
|
783
1206
|
if (voiceState === "recording") {
|
|
784
1207
|
isHolding = false;
|
|
785
1208
|
await stopVoiceRecording("editor");
|
|
@@ -793,6 +1216,8 @@ export default function (pi: ExtensionAPI) {
|
|
|
793
1216
|
if (sub === "test") {
|
|
794
1217
|
cmdCtx.ui.notify("Testing voice setup...", "info");
|
|
795
1218
|
const diagnostics = scanEnvironment(TRANSCRIBE_SCRIPT);
|
|
1219
|
+
const dgKey = resolveDeepgramApiKey(config);
|
|
1220
|
+
const streaming = isDeepgramStreaming(config);
|
|
796
1221
|
const daemonUp = await isDaemonRunning();
|
|
797
1222
|
const provisioningPlan = buildProvisioningPlan(config, diagnostics);
|
|
798
1223
|
const selectedBackend = diagnostics.backends.find((backend) => backend.name === config.backend);
|
|
@@ -805,6 +1230,8 @@ export default function (pi: ExtensionAPI) {
|
|
|
805
1230
|
` model: ${config.model}`,
|
|
806
1231
|
` model status: ${modelReadiness}`,
|
|
807
1232
|
` language: ${config.language}`,
|
|
1233
|
+
` streaming: ${streaming ? "YES (Deepgram WS)" : "NO (batch)"}`,
|
|
1234
|
+
` DEEPGRAM_API_KEY: ${dgKey ? "set (" + dgKey.slice(0, 8) + "...)" : "NOT SET"}`,
|
|
808
1235
|
` onboarding: ${config.onboarding.completed ? "complete" : "incomplete"}`,
|
|
809
1236
|
` python3: ${diagnostics.hasPython ? "OK" : "missing"}`,
|
|
810
1237
|
` sox/rec: ${diagnostics.hasSox ? "OK" : "missing"}`,
|
|
@@ -826,11 +1253,10 @@ export default function (pi: ExtensionAPI) {
|
|
|
826
1253
|
}
|
|
827
1254
|
}
|
|
828
1255
|
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
lines.push("
|
|
833
|
-
lines.push(...provisioningPlan.manualSteps.map((step) => ` - ${step}`));
|
|
1256
|
+
if (!dgKey && config.backend === "deepgram") {
|
|
1257
|
+
lines.push("");
|
|
1258
|
+
lines.push("⚠️ DEEPGRAM_API_KEY not set! Add to ~/.zshrc or ~/.env.secrets");
|
|
1259
|
+
lines.push(" export DEEPGRAM_API_KEY=your_key_here");
|
|
834
1260
|
}
|
|
835
1261
|
|
|
836
1262
|
cmdCtx.ui.notify(lines.join("\n"), provisioningPlan.ready ? "info" : "warning");
|
|
@@ -847,22 +1273,24 @@ export default function (pi: ExtensionAPI) {
|
|
|
847
1273
|
const diagnostics = scanEnvironment(TRANSCRIBE_SCRIPT);
|
|
848
1274
|
const selectedBackend = diagnostics.backends.find((backend) => backend.name === config.backend);
|
|
849
1275
|
const modelReadiness = getModelReadiness(selectedBackend, config.model);
|
|
1276
|
+
const streaming = isDeepgramStreaming(config);
|
|
850
1277
|
|
|
851
1278
|
cmdCtx.ui.notify([
|
|
852
1279
|
`Voice config:`,
|
|
853
|
-
` enabled:
|
|
854
|
-
` mode:
|
|
855
|
-
` scope:
|
|
856
|
-
` backend:
|
|
857
|
-
` model:
|
|
858
|
-
` model
|
|
859
|
-
` language:
|
|
860
|
-
`
|
|
861
|
-
`
|
|
862
|
-
`
|
|
863
|
-
`
|
|
864
|
-
`
|
|
865
|
-
`
|
|
1280
|
+
` enabled: ${config.enabled}`,
|
|
1281
|
+
` mode: ${config.mode}`,
|
|
1282
|
+
` scope: ${config.scope}`,
|
|
1283
|
+
` backend: ${config.backend}`,
|
|
1284
|
+
` model: ${config.model}`,
|
|
1285
|
+
` model stat: ${modelReadiness}`,
|
|
1286
|
+
` language: ${config.language}`,
|
|
1287
|
+
` streaming: ${streaming ? "YES (Deepgram WebSocket)" : "NO (batch)"}`,
|
|
1288
|
+
` state: ${voiceState}`,
|
|
1289
|
+
` setup: ${config.onboarding.completed ? `complete (${config.onboarding.source ?? "unknown"})` : "incomplete"}`,
|
|
1290
|
+
` socket: ${activeSocketPath}`,
|
|
1291
|
+
` daemon: ${daemonUp ? "running" : "stopped"}${daemonInfo}`,
|
|
1292
|
+
` hold-key: SPACE (editor empty) or Ctrl+Shift+V (toggle)`,
|
|
1293
|
+
` btw-key: Ctrl+Shift+B (hold to record → auto-btw)`,
|
|
866
1294
|
].join("\n"), "info");
|
|
867
1295
|
return;
|
|
868
1296
|
}
|
|
@@ -905,7 +1333,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
905
1333
|
cmdCtx.ui.notify("Voice setup cancelled.", "warning");
|
|
906
1334
|
return;
|
|
907
1335
|
}
|
|
908
|
-
|
|
909
1336
|
await finalizeAndSaveSetup(cmdCtx, result.config, result.selectedScope, result.summaryLines, "setup-command");
|
|
910
1337
|
return;
|
|
911
1338
|
}
|
|
@@ -1013,7 +1440,7 @@ export default function (pi: ExtensionAPI) {
|
|
|
1013
1440
|
},
|
|
1014
1441
|
});
|
|
1015
1442
|
|
|
1016
|
-
// ─── Dedicated setup command
|
|
1443
|
+
// ─── Dedicated setup command ─────────────────────────────────────────────
|
|
1017
1444
|
|
|
1018
1445
|
pi.registerCommand("voice-setup", {
|
|
1019
1446
|
description: "Configure voice input — select backend, model, and language",
|
|
@@ -1081,7 +1508,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
1081
1508
|
|
|
1082
1509
|
pi.sendUserMessage(content, { deliverAs: "followUp" });
|
|
1083
1510
|
|
|
1084
|
-
// Clear after injection
|
|
1085
1511
|
btwThread = [];
|
|
1086
1512
|
btwWidgetVisible = false;
|
|
1087
1513
|
cmdCtx.ui.setWidget("btw", undefined);
|
|
@@ -1106,7 +1532,6 @@ export default function (pi: ExtensionAPI) {
|
|
|
1106
1532
|
threadText += `Q: ${ex.question}\nA: ${ex.answer}\n\n`;
|
|
1107
1533
|
}
|
|
1108
1534
|
|
|
1109
|
-
// Ask the model to summarize
|
|
1110
1535
|
const model = ctx.model;
|
|
1111
1536
|
if (!model) {
|
|
1112
1537
|
cmdCtx.ui.notify("No model available for summarization.", "error");
|