@p8n.ai/pi-listens 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -6,6 +6,29 @@ This project follows [Semantic Versioning](https://semver.org/).
6
6
 
7
7
  ## [Unreleased]
8
8
 
9
+ ## [0.2.0] - 2026-05-09
10
+
11
+ ### Added
12
+
13
+ - `/init` command to create a global settings file (`~/.pi/pi-listens.json`) with sensible defaults.
14
+ - `/voice-check` command (replaces `/voice-status`) with improved diagnostic output.
15
+
16
+ ### Changed
17
+
18
+ - `/voice-on` now enables auto-speak by default for a full hands-free experience. Use `--no-speak` to opt out.
19
+ - Rename `/voice-status` to `/voice-check` to better communicate its diagnostic purpose.
20
+
21
+ ### Removed
22
+
23
+ - `/listen` slash command. Use `/voice-on` for the hands-free voice loop, or the `voice_input` agent tool for programmatic speech input.
24
+ ## [0.1.2] - 2026-05-09
25
+
26
+ ### Changed
27
+
28
+ - Stream TTS audio directly to the local player so speech starts sooner.
29
+ - Make `voice_output` non-blocking by default; pass `wait_for_playback: true` to wait.
30
+ - Replace the `R` voice-panel shortcut with Space for easier listen/stop control.
31
+
9
32
  ## [0.1.1] - 2026-05-09
10
33
 
11
34
  ### Fixed
@@ -31,6 +54,8 @@ This project follows [Semantic Versioning](https://semver.org/).
31
54
  - Stop active audio capture/playback subprocesses when voice mode is closed or the Pi session shuts down.
32
55
  - Clean up generated audio files when spoken playback is interrupted.
33
56
 
34
- [Unreleased]: https://github.com/p8n-ai/pi-listens/compare/v0.1.1...HEAD
57
+ [Unreleased]: https://github.com/p8n-ai/pi-listens/compare/v0.2.0...HEAD
35
58
  [0.1.0]: https://github.com/p8n-ai/pi-listens/releases/tag/v0.1.0
36
59
  [0.1.1]: https://github.com/p8n-ai/pi-listens/releases/tag/v0.1.1
60
+ [0.1.2]: https://github.com/p8n-ai/pi-listens/releases/tag/v0.1.2
61
+ [0.2.0]: https://github.com/p8n-ai/pi-listens/releases/tag/v0.2.0
package/README.md CHANGED
@@ -7,14 +7,27 @@ Speech-first Pi package powered by [Sarvam AI](https://www.sarvam.ai/). It gives
7
7
  - voice-first clarification loops where the agent speaks a question, listens, transcribes, and continues
8
8
  - interactive TUI and headless/RPC usage through Pi extension tools and UI fallback
9
9
 
10
- ## Install
10
+ ## Quick start
11
11
 
12
12
  ```bash
13
13
  pi install npm:@p8n.ai/pi-listens
14
- export SARVAM_API_KEY="your-sarvam-api-key"
15
14
  pi
16
15
  ```
17
16
 
17
+ Inside Pi, run `/init` to create a global settings file with sensible defaults:
18
+
19
+ ```
20
+ /init
21
+ ```
22
+
23
+ Then open `~/.pi/pi-listens.json` and replace the `apiKey` placeholder with your [Sarvam AI API key](https://dashboard.sarvam.ai).
24
+
25
+ Alternatively, set the key via environment variable:
26
+
27
+ ```bash
28
+ export SARVAM_API_KEY="your-sarvam-api-key"
29
+ ```
30
+
18
31
  For local development from this checkout:
19
32
 
20
33
  ```bash
@@ -88,16 +101,15 @@ The extension also injects voice guidance into the system prompt:
88
101
 
89
102
  | Command | Purpose |
90
103
  | --- | --- |
91
- | `/listen [seconds]` | Stream one utterance over Sarvam WebSocket STT, wait for a sustained silence boundary, transcribe, and send it to Pi as a user message. |
104
+ | `/init` | Create a global settings file at `~/.pi/pi-listens.json` with sensible defaults. Use `--overwrite` to replace an existing file. |
92
105
  | `/speak <text>` | Speak text with Sarvam TTS. |
93
- | `/voice-on [--speak] [--manual] [--no-listen] [seconds]` | Open the hands-free TUI panel. By default it listens now and auto-listens again after each agent response. `--speak` reads short assistant replies aloud. `--manual` leaves the panel active but only listens when you press R. |
94
- | `/voice-on --no-speak` | Open the panel without auto-reading assistant replies. |
95
- | `/voice-status` | Show setup and voice-mode status. |
106
+ | `/voice-on [--no-speak] [--manual] [--no-listen] [seconds]` | Start the hands-free voice loop. Auto-speaks assistant replies and auto-listens by default. `--no-speak` disables reading replies aloud. `--manual` disables auto-listen (press Space to listen). |
107
+ | `/voice-check` | Show setup diagnostics and voice-mode status. |
96
108
 
97
109
  Voice panel controls in interactive mode:
98
- - R: listen now; press again while listening to stop listening; if Pi is speaking, R stops playback before listening
99
- - A: auto-listen on/off (listen again after each assistant reply)
100
- - S: read aloud on/off (speak assistant replies)
110
+ - Space: listen now; press again while listening to stop; if Pi is speaking, stops playback first
111
+ - A: toggle auto-listen (listen again after each assistant reply)
112
+ - S: toggle read-aloud (speak assistant replies)
101
113
  - Q: close the panel and stop any active listening or speaking
102
114
  - Click the orb: visual ripple feedback (terminals with mouse reporting)
103
115
 
@@ -143,7 +155,7 @@ Example config file:
143
155
  "ttsSampleRate": 24000,
144
156
  "ttsOutputCodec": "wav",
145
157
  "textFallback": true,
146
- "autoSpeakAssistant": false,
158
+ "autoSpeakAssistant": true,
147
159
  "maxAutoSpeakChars": 320
148
160
  }
149
161
  ```
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@p8n.ai/pi-listens",
3
- "version": "0.1.1",
3
+ "version": "0.2.0",
4
4
  "description": "Pi package for speech-first interaction using Sarvam AI speech-to-text and text-to-speech.",
5
5
  "author": "Ravindra Barthwal",
6
6
  "license": "MIT",
@@ -15,6 +15,13 @@ This Pi package provides voice tools backed by Sarvam AI.
15
15
  - `voice_transcribe_file`: transcribe an existing audio file.
16
16
  - `voice_setup_check`: diagnose API key, recorder, player, and voice settings.
17
17
 
18
+ ## Commands
19
+
20
+ - `/init`: create a global settings file with defaults. The user only needs to set their Sarvam API key.
21
+ - `/speak <text>`: speak text with Sarvam TTS.
22
+ - `/voice-on`: start hands-free voice loop (auto-speaks replies and auto-listens by default).
23
+ - `/voice-check`: show setup diagnostics and voice-mode status.
24
+
18
25
  ## Usage rules
19
26
 
20
27
  1. When you need user input, clarification, or confirmation, use `voice_ask` instead of asking only in text.
package/src/audio.ts CHANGED
@@ -1,7 +1,7 @@
1
1
  import { mkdir, rm } from "node:fs/promises";
2
2
  import { randomUUID } from "node:crypto";
3
3
  import { join } from "node:path";
4
- import { spawn } from "node:child_process";
4
+ import { spawn, type StdioOptions } from "node:child_process";
5
5
  import { accessSync, constants } from "node:fs";
6
6
  import { once } from "node:events";
7
7
  import type { PiListensConfig } from "./config.js";
@@ -10,15 +10,17 @@ export interface AudioRuntime {
10
10
  record(seconds?: number, signal?: AbortSignal): Promise<string>;
11
11
  streamPcm(signal?: AbortSignal): AsyncIterable<Buffer>;
12
12
  play(path: string, signal?: AbortSignal): Promise<void>;
13
+ playStream(stream: ReadableStream<Uint8Array>, signal?: AbortSignal): Promise<void>;
13
14
  cleanup(path: string): Promise<void>;
14
15
  stopPlayback(): void;
15
16
  stopAll(): void;
16
- describe(): { recorder: string; player: string };
17
+ describe(): { recorder: string; player: string; streamingPlayer: string };
17
18
  }
18
19
 
19
20
  export function createAudioRuntime(config: PiListensConfig): AudioRuntime {
20
21
  const recorder = config.recordCommand ? "custom" : detectRecorder();
21
22
  const player = config.playCommand ? "custom" : detectPlayer();
23
+ const streamingPlayer = detectStreamingPlayer();
22
24
 
23
25
  return {
24
26
  async record(seconds = config.recordSeconds, signal?: AbortSignal): Promise<string> {
@@ -68,6 +70,16 @@ export function createAudioRuntime(config: PiListensConfig): AudioRuntime {
68
70
  await run(command.command, command.args, signal, { kind: "play" });
69
71
  },
70
72
 
73
+ async playStream(stream: ReadableStream<Uint8Array>, signal?: AbortSignal): Promise<void> {
74
+ if (!streamingPlayer) {
75
+ throw new Error(
76
+ "No streaming audio player found. Install ffplay or sox (`play`) for low-latency TTS playback, or use file playback fallback.",
77
+ );
78
+ }
79
+ const command = streamingPlayerCommand(streamingPlayer, config.ttsOutputCodec, config.ttsSampleRate);
80
+ await pipeStreamToCommand(stream, command.command, command.args, signal);
81
+ },
82
+
71
83
  async cleanup(path: string): Promise<void> {
72
84
  if (!config.deleteAudio) return;
73
85
  await rm(path, { force: true }).catch(() => undefined);
@@ -82,7 +94,7 @@ export function createAudioRuntime(config: PiListensConfig): AudioRuntime {
82
94
  },
83
95
 
84
96
  describe() {
85
- return { recorder: recorder ?? "missing", player: player ?? "missing" };
97
+ return { recorder: recorder ?? "missing", player: player ?? "missing", streamingPlayer: streamingPlayer ?? "missing" };
86
98
  },
87
99
  };
88
100
  }
@@ -205,6 +217,13 @@ function detectPlayer(): string | null {
205
217
  return null;
206
218
  }
207
219
 
220
+ function detectStreamingPlayer(): string | null {
221
+ if (isCommandAvailable("ffplay")) return "ffplay";
222
+ if (isCommandAvailable("play")) return "play";
223
+ if (isCommandAvailable("aplay")) return "aplay";
224
+ return null;
225
+ }
226
+
208
227
  function isCommandAvailable(command: string): boolean {
209
228
  const paths = (process.env.PATH ?? "").split(":").filter(Boolean);
210
229
  for (const dir of paths) {
@@ -303,6 +322,76 @@ async function* streamCommandOutput(command: string, args: string[], signal?: Ab
303
322
  }
304
323
  }
305
324
 
325
+ async function pipeStreamToCommand(stream: ReadableStream<Uint8Array>, command: string, args: string[], signal?: AbortSignal): Promise<void> {
326
+ if (signal?.aborted) throw new Error("Cancelled");
327
+ const child = spawnManaged(command, args, "play", ["pipe", "pipe", "pipe"]);
328
+ let stderr = "";
329
+ let stdout = "";
330
+ let exitCode: number | null = null;
331
+ let exitSignal: NodeJS.Signals | null = null;
332
+ let spawnError: Error | undefined;
333
+
334
+ const stop = () => terminateChild(child);
335
+ signal?.addEventListener("abort", stop, { once: true });
336
+ child.stdout?.on("data", (chunk) => { stdout += chunk.toString(); });
337
+ child.stderr?.on("data", (chunk) => { stderr += chunk.toString(); });
338
+ child.on("error", (err) => { spawnError = err; });
339
+ child.on("close", (code, termSignal) => { exitCode = code; exitSignal = termSignal; });
340
+
341
+ try {
342
+ if (!child.stdin) throw new Error(`${command} did not provide stdin for streaming audio playback`);
343
+ const stdin = child.stdin;
344
+ const reader = stream.getReader();
345
+ try {
346
+ while (true) {
347
+ if (signal?.aborted) throw new Error("Cancelled");
348
+ if (spawnError) throw spawnError;
349
+ const { done, value } = await reader.read();
350
+ if (done) break;
351
+ if (!value?.byteLength) continue;
352
+ if (!stdin.write(Buffer.from(value))) await once(stdin, "drain");
353
+ }
354
+ } finally {
355
+ reader.releaseLock();
356
+ }
357
+ stdin.end();
358
+ if (exitCode === null && !spawnError) await once(child, "close");
359
+ if (signal?.aborted) throw new Error("Cancelled");
360
+ if (spawnError) throw spawnError;
361
+ if (exitCode !== 0) {
362
+ const output = [stderr.trim(), stdout.trim()].filter(Boolean).join("\n");
363
+ throw new Error(`${command} failed${exitSignal ? ` (${exitSignal})` : ""}${exitCode === null ? "" : ` with exit code ${exitCode}`}${output ? `: ${output}` : ""}`);
364
+ }
365
+ } finally {
366
+ signal?.removeEventListener("abort", stop);
367
+ if (!child.killed && exitCode === null) stop();
368
+ }
369
+ }
370
+
371
+ function streamingPlayerCommand(player: string, codec: PiListensConfig["ttsOutputCodec"], sampleRate: number): CommandSpec {
372
+ if (player === "ffplay") {
373
+ const args = ["-nodisp", "-autoexit", "-loglevel", "error"];
374
+ if (codec === "linear16") args.push("-f", "s16le", "-ar", String(sampleRate), "-ac", "1");
375
+ if (codec === "mulaw") args.push("-f", "mulaw", "-ar", String(sampleRate), "-ac", "1");
376
+ if (codec === "alaw") args.push("-f", "alaw", "-ar", String(sampleRate), "-ac", "1");
377
+ args.push("-i", "pipe:0");
378
+ return { command: "ffplay", args };
379
+ }
380
+ if (player === "play") {
381
+ if (codec === "linear16") return { command: "play", args: ["-q", "-r", String(sampleRate), "-c", "1", "-b", "16", "-e", "signed-integer", "-t", "raw", "-"] };
382
+ if (codec === "mulaw" || codec === "alaw") return { command: "play", args: ["-q", "-r", String(sampleRate), "-c", "1", "-t", codec, "-"] };
383
+ return { command: "play", args: ["-q", "-t", soxTypeForCodec(codec), "-"] };
384
+ }
385
+ if (player === "aplay" && codec === "wav") return { command: "aplay", args: ["-q", "-"] };
386
+ throw new Error(`Unsupported streaming player ${player} for codec ${codec}`);
387
+ }
388
+
389
+ function soxTypeForCodec(codec: PiListensConfig["ttsOutputCodec"]): string {
390
+ if (codec === "aac") return "adts";
391
+ if (codec === "linear16") return "raw";
392
+ return codec;
393
+ }
394
+
306
395
  type AudioProcessKind = "record" | "play" | "other";
307
396
 
308
397
  type ManagedChild = ReturnType<typeof spawn>;
@@ -318,10 +407,10 @@ export function stopActiveAudioProcesses(options: { kind?: AudioProcessKind; for
318
407
  }
319
408
  }
320
409
 
321
- function spawnManaged(command: string, args: string[], kind: AudioProcessKind): ManagedChild {
410
+ function spawnManaged(command: string, args: string[], kind: AudioProcessKind, stdio: StdioOptions = ["ignore", "pipe", "pipe"]): ManagedChild {
322
411
  installProcessExitCleanup();
323
412
  const child = spawn(command, args, {
324
- stdio: ["ignore", "pipe", "pipe"],
413
+ stdio,
325
414
  detached: process.platform !== "win32",
326
415
  });
327
416
  activeChildren.add(child);
package/src/commands.ts CHANGED
@@ -1,9 +1,11 @@
1
- import { mkdir } from "node:fs/promises";
1
+ import { existsSync, readFileSync } from "node:fs";
2
+ import { mkdir, writeFile } from "node:fs/promises";
3
+ import { homedir } from "node:os";
2
4
  import { join } from "node:path";
3
5
  import type { ExtensionAPI, ExtensionCommandContext, ExtensionContext } from "@earendil-works/pi-coding-agent";
4
6
  import type { VoiceToolServices } from "./tools.js";
5
7
  import { conciseTranscript, prepareSpokenText } from "./text.js";
6
- import { audioExtensionForCodec } from "./config.js";
8
+ import { audioExtensionForCodec, maskSecret } from "./config.js";
7
9
  import { applyVoiceChrome, installVoiceUi, uninstallVoiceUi } from "./voice-ui.js";
8
10
 
9
11
  export type VoiceLoopStatus = "idle" | "listening" | "transcribing" | "agent" | "speaking" | "error";
@@ -26,10 +28,10 @@ export interface VoiceModeState {
26
28
  }
27
29
 
28
30
  export function registerVoiceCommands(pi: ExtensionAPI, services: VoiceToolServices, state: VoiceModeState) {
29
- pi.registerCommand("listen", {
30
- description: "Record speech, transcribe with Sarvam AI, and send it to pi as a user message",
31
+ pi.registerCommand("init", {
32
+ description: "Create a global pi-listens settings file at ~/.pi/pi-listens.json with sensible defaults",
31
33
  handler: async (args, ctx) => {
32
- await listenAndSend(pi, services, ctx, parseSeconds(args));
34
+ await initSettings(services, ctx, args.includes("--overwrite"));
33
35
  },
34
36
  });
35
37
 
@@ -46,11 +48,10 @@ export function registerVoiceCommands(pi: ExtensionAPI, services: VoiceToolServi
46
48
  });
47
49
 
48
50
  pi.registerCommand("voice-on", {
49
- description: "Enable hands-free voice loop. Use --speak to read short assistant replies aloud.",
51
+ description: "Enable hands-free voice loop with auto-speak and auto-listen. Use --no-speak to disable reading replies aloud, --manual to only listen on demand.",
50
52
  handler: async (args, ctx) => {
51
53
  state.enabled = true;
52
- if (args.includes("--speak")) state.autoSpeakAssistant = true;
53
- if (args.includes("--no-speak")) state.autoSpeakAssistant = false;
54
+ state.autoSpeakAssistant = !args.includes("--no-speak");
54
55
  state.autoListen = !args.includes("--manual");
55
56
  installVoiceUi(ctx, state, createVoiceUiCallbacks(pi, services, state, ctx));
56
57
  applyVoiceChrome(ctx, state);
@@ -60,29 +61,97 @@ export function registerVoiceCommands(pi: ExtensionAPI, services: VoiceToolServi
60
61
  });
61
62
 
62
63
 
63
- pi.registerCommand("voice-status", {
64
- description: "Show pi-listens Sarvam AI, recorder, player, and voice-mode status",
64
+ pi.registerCommand("voice-check", {
65
+ description: "Check pi-listens setup: Sarvam AI key, recorder, player, and voice-mode status",
65
66
  handler: async (_args, ctx) => {
66
67
  const config = services.getConfig();
67
68
  const audio = services.getAudio().describe();
69
+ const ready = Boolean(config.apiKey) && audio.recorder !== "missing" && audio.player !== "missing";
68
70
  ctx.ui.notify(
69
71
  [
70
- `Voice mode: ${state.enabled ? "on" : "off"}`,
71
- `Auto-speak assistant: ${state.autoSpeakAssistant ? "on" : "off"}`,
72
- `Auto-listen: ${state.autoListen ? "on" : "off"}`,
73
- `Status: ${state.status}`,
74
- `Sarvam API key: ${config.apiKey ? "set" : "missing"}`,
72
+ ready ? "✓ pi-listens is ready." : "⚠ pi-listens needs attention.",
73
+ "",
74
+ `Sarvam API key: ${maskSecret(config.apiKey)}`,
75
75
  `Recorder: ${audio.recorder}`,
76
76
  `Player: ${audio.player}`,
77
+ `Streaming player: ${audio.streamingPlayer}`,
77
78
  `STT: ${config.sttModel} (${config.translateInputToEnglish ? "translate→English" : config.sttMode}, ${config.sttLanguageCode})`,
78
79
  `TTS: ${config.ttsModel} (${config.ttsLanguageCode}, speaker ${config.ttsSpeaker})`,
80
+ "",
81
+ `Voice mode: ${state.enabled ? "on" : "off"}`,
82
+ `Auto-speak: ${state.autoSpeakAssistant ? "on" : "off"}`,
83
+ `Auto-listen: ${state.autoListen ? "on" : "off"}`,
79
84
  ].join("\n"),
80
- config.apiKey && audio.recorder !== "missing" && audio.player !== "missing" ? "info" : "warning",
85
+ ready ? "info" : "warning",
81
86
  );
82
87
  },
83
88
  });
84
89
  }
85
90
 
91
+ const INIT_SETTINGS_TEMPLATE = {
92
+ apiKey: "paste-your-sarvam-api-key-here",
93
+ sttModel: "saaras:v3",
94
+ sttMode: "transcribe",
95
+ sttLanguageCode: "unknown",
96
+ translateInputToEnglish: true,
97
+ ttsModel: "bulbul:v3",
98
+ ttsLanguageCode: "en-IN",
99
+ ttsSpeaker: "shubh",
100
+ recordSeconds: 300,
101
+ recordSampleRate: 16000,
102
+ streamChunkMs: 250,
103
+ streamMaxSeconds: 300,
104
+ silenceStartSeconds: 0.2,
105
+ silenceStopSeconds: 3.5,
106
+ silenceThreshold: "1%",
107
+ ttsSampleRate: 24000,
108
+ ttsOutputCodec: "wav",
109
+ textFallback: true,
110
+ autoSpeakAssistant: true,
111
+ maxAutoSpeakChars: 320,
112
+ };
113
+
114
+ async function initSettings(services: VoiceToolServices, ctx: ExtensionCommandContext, overwrite: boolean) {
115
+ const dir = join(homedir(), ".pi");
116
+ const filePath = join(dir, "pi-listens.json");
117
+
118
+ if (existsSync(filePath) && !overwrite) {
119
+ const existing = readFileSync(filePath, "utf8");
120
+ let parsed: Record<string, unknown> = {};
121
+ try { parsed = JSON.parse(existing) as Record<string, unknown>; } catch { /* ignore */ }
122
+ const hasKey = typeof parsed.apiKey === "string" && parsed.apiKey !== "paste-your-sarvam-api-key-here" && parsed.apiKey.length > 0;
123
+ ctx.ui.notify(
124
+ [
125
+ `Settings file already exists: ${filePath}`,
126
+ hasKey ? "Sarvam API key: set" : "Sarvam API key: not yet configured",
127
+ "",
128
+ "Use /init --overwrite to replace it with fresh defaults.",
129
+ ].join("\n"),
130
+ "info",
131
+ );
132
+ return;
133
+ }
134
+
135
+ await mkdir(dir, { recursive: true });
136
+ await writeFile(filePath, `${JSON.stringify(INIT_SETTINGS_TEMPLATE, null, 2)}\n`, "utf8");
137
+
138
+ const audio = services.getAudio().describe();
139
+ ctx.ui.notify(
140
+ [
141
+ `✓ Created settings file: ${filePath}`,
142
+ "",
143
+ "Next step: open the file and replace the apiKey value with your Sarvam AI API key.",
144
+ "Get a key at: https://dashboard.sarvam.ai",
145
+ "",
146
+ `Recorder: ${audio.recorder}`,
147
+ `Player: ${audio.player}`,
148
+ audio.recorder === "missing" || audio.player === "missing"
149
+ ? "⚠ Install SoX (rec/play) or ffmpeg for microphone and audio playback."
150
+ : "✓ Audio recorder and player detected.",
151
+ ].join("\n"),
152
+ "info",
153
+ );
154
+ }
86
155
  export async function maybeContinueVoiceLoop(pi: ExtensionAPI, services: VoiceToolServices, state: VoiceModeState, ctx: ExtensionContext) {
87
156
  if (!state.enabled || state.isListening) return;
88
157
  if (state.autoSpeakAssistant && state.lastAssistantText) {
@@ -177,10 +246,8 @@ async function listenAndSend(
177
246
  }
178
247
 
179
248
  async function speakText(services: VoiceToolServices, text: string, signal?: AbortSignal, state?: VoiceModeState, ctx?: ExtensionContext) {
180
- const config = services.getConfig();
181
249
  const speakAbortController = state ? new AbortController() : undefined;
182
250
  const speakSignal = combineSignals(signal, speakAbortController?.signal);
183
- let path: string | undefined;
184
251
 
185
252
  if (state) {
186
253
  state.speakAbortController?.abort();
@@ -190,14 +257,9 @@ async function speakText(services: VoiceToolServices, text: string, signal?: Abo
190
257
  }
191
258
 
192
259
  try {
193
- await mkdir(config.audioDir, { recursive: true });
194
- path = join(config.audioDir, `pi-listens-command-${Date.now()}.${audioExtensionForCodec(config.ttsOutputCodec)}`);
195
- const result = await services.getSpeech().synthesizeToFile(text, path, speakSignal.signal);
196
- path = result.path;
197
- await services.getAudio().play(result.path, speakSignal.signal);
260
+ await playSpeechBest(services, text, speakSignal.signal);
198
261
  } finally {
199
262
  speakSignal.cleanup();
200
- if (path) await services.getAudio().cleanup(path);
201
263
  if (state && state.speakAbortController === speakAbortController) state.speakAbortController = undefined;
202
264
  if (state && state.status === "speaking") {
203
265
  state.status = "idle";
@@ -206,6 +268,25 @@ async function speakText(services: VoiceToolServices, text: string, signal?: Abo
206
268
  }
207
269
  }
208
270
 
271
+ async function playSpeechBest(services: VoiceToolServices, text: string, signal?: AbortSignal) {
272
+ const audio = services.getAudio();
273
+ if (audio.describe().streamingPlayer !== "missing") {
274
+ const result = await services.getSpeech().synthesizeStream(text, signal);
275
+ await audio.playStream(result.stream, signal);
276
+ return;
277
+ }
278
+
279
+ const config = services.getConfig();
280
+ await mkdir(config.audioDir, { recursive: true });
281
+ const path = join(config.audioDir, `pi-listens-command-${Date.now()}.${audioExtensionForCodec(config.ttsOutputCodec)}`);
282
+ try {
283
+ const result = await services.getSpeech().synthesizeToFile(text, path, signal);
284
+ await audio.play(result.path, signal);
285
+ } finally {
286
+ await audio.cleanup(path);
287
+ }
288
+ }
289
+
209
290
  function parseSeconds(args: string): number | undefined {
210
291
  const match = args.match(/(?:^|\s)(\d{1,4})(?:\s|$)/);
211
292
  if (!match) return undefined;
package/src/index.ts CHANGED
@@ -52,7 +52,7 @@ export default function piListensExtension(pi: ExtensionAPI) {
52
52
  `Sarvam API key: ${maskSecret(config.apiKey)}`,
53
53
  `Recorder: ${audioInfo.recorder}`,
54
54
  `Player: ${audioInfo.player}`,
55
- "Run /voice-status or call voice_setup_check for details.",
55
+ "Run /init to create a settings file, or /voice-check for details.",
56
56
  ].join("\n"),
57
57
  "warning",
58
58
  );
package/src/sarvam.ts CHANGED
@@ -15,6 +15,10 @@ export interface SynthesisResult {
15
15
  bytes: number;
16
16
  }
17
17
 
18
+ export interface SynthesisStreamResult {
19
+ stream: ReadableStream<Uint8Array>;
20
+ }
21
+
18
22
  type StreamingData = {
19
23
  transcript?: string;
20
24
  request_id?: string;
@@ -127,6 +131,28 @@ export class SarvamSpeechClient {
127
131
  return { path, bytes: buffer.byteLength };
128
132
  }
129
133
 
134
+ async synthesizeStream(text: string, signal?: AbortSignal): Promise<SynthesisStreamResult> {
135
+ const config = this.getConfig();
136
+ const client = this.getClient(config);
137
+ const response = await client.textToSpeech.convertStream(
138
+ {
139
+ text,
140
+ target_language_code: config.ttsLanguageCode as never,
141
+ speaker: config.ttsSpeaker as never,
142
+ model: config.ttsModel as never,
143
+ pace: config.ttsPace,
144
+ temperature: config.ttsTemperature,
145
+ speech_sample_rate: config.ttsSampleRate as never,
146
+ enable_preprocessing: true,
147
+ output_audio_codec: config.ttsOutputCodec as never,
148
+ },
149
+ { abortSignal: signal },
150
+ );
151
+ const stream = response.stream();
152
+ if (!stream) throw new Error("Sarvam TTS response did not include a readable audio stream");
153
+ return { stream };
154
+ }
155
+
130
156
  private async withStreamingSocket(
131
157
  signal: AbortSignal | undefined,
132
158
  mode: SttMode | undefined,
package/src/tools.ts CHANGED
@@ -17,7 +17,7 @@ export interface VoiceToolServices {
17
17
 
18
18
  const VoiceOutputParams = Type.Object({
19
19
  text: Type.String({ description: "Short text to speak to the user. Keep it concise; do not speak code blocks or long logs." }),
20
- wait_for_playback: Type.Optional(Type.Boolean({ description: "Wait until audio playback completes before returning. Default true." })),
20
+ wait_for_playback: Type.Optional(Type.Boolean({ description: "Wait until audio playback completes before returning. Default false." })),
21
21
  });
22
22
 
23
23
  const VoiceInputParams = Type.Object({
@@ -54,21 +54,20 @@ export function registerVoiceTools(pi: ExtensionAPI, services: VoiceToolServices
54
54
  ],
55
55
  parameters: VoiceOutputParams,
56
56
  async execute(_toolCallId, params: VoiceOutputInput, signal, onUpdate) {
57
- onUpdate?.({ content: [{ type: "text", text: "Synthesizing speech with Sarvam AI…" }], details: {} });
58
- const result = await speak(params.text, services, signal);
59
- const playback = services.getAudio().play(result.path, signal).finally(() => services.getAudio().cleanup(result.path));
60
- if (params.wait_for_playback === false) {
57
+ onUpdate?.({ content: [{ type: "text", text: "Starting streamed speech with Sarvam AI…" }], details: {} });
58
+ const playback = playSpeechBest(params.text, services, signal);
59
+ if (params.wait_for_playback !== true) {
61
60
  void playback.catch(() => undefined);
62
61
  return {
63
62
  content: [{ type: "text", text: `Started speaking to user: ${params.text}` }],
64
- details: { ...result, played: "started", text: params.text },
63
+ details: { played: "started", text: params.text },
65
64
  };
66
65
  }
67
66
  onUpdate?.({ content: [{ type: "text", text: "Playing audio…" }], details: {} });
68
- await playback;
67
+ const details = await playback;
69
68
  return {
70
69
  content: [{ type: "text", text: `Spoke to user: ${params.text}` }],
71
- details: { ...result, played: true, text: params.text },
70
+ details: { ...details, played: true, text: params.text },
72
71
  };
73
72
  },
74
73
  renderCall(args: VoiceOutputInput, theme) {
@@ -117,12 +116,7 @@ export function registerVoiceTools(pi: ExtensionAPI, services: VoiceToolServices
117
116
  parameters: VoiceAskParams,
118
117
  async execute(_toolCallId, params: VoiceAskInput, signal, onUpdate, ctx) {
119
118
  onUpdate?.({ content: [{ type: "text", text: "Speaking question…" }], details: {} });
120
- const spoken = await speak(params.question, services, signal);
121
- try {
122
- await services.getAudio().play(spoken.path, signal);
123
- } finally {
124
- await services.getAudio().cleanup(spoken.path);
125
- }
119
+ await playSpeechBest(params.question, services, signal);
126
120
  const answer = await listenAndMaybeFallback(
127
121
  params,
128
122
  services,
@@ -173,6 +167,7 @@ export function registerVoiceTools(pi: ExtensionAPI, services: VoiceToolServices
173
167
  `Sarvam API key: ${config.apiKey ? "set" : "missing"}`,
174
168
  `Recorder: ${audio.recorder}`,
175
169
  `Player: ${audio.player}`,
170
+ `Streaming player: ${audio.streamingPlayer}`,
176
171
  `STT: ${config.sttModel} (${config.translateInputToEnglish ? "translate→English" : config.sttMode}, ${config.sttLanguageCode})`,
177
172
  `TTS: ${config.ttsModel} (${config.ttsLanguageCode}, speaker ${config.ttsSpeaker})`,
178
173
  ].join("\n"),
@@ -184,7 +179,24 @@ export function registerVoiceTools(pi: ExtensionAPI, services: VoiceToolServices
184
179
  });
185
180
  }
186
181
 
187
- async function speak(text: string, services: VoiceToolServices, signal?: AbortSignal) {
182
+ async function playSpeechBest(text: string, services: VoiceToolServices, signal?: AbortSignal): Promise<Record<string, unknown>> {
183
+ const audio = services.getAudio();
184
+ if (audio.describe().streamingPlayer !== "missing") {
185
+ const result = await services.getSpeech().synthesizeStream(text, signal);
186
+ await audio.playStream(result.stream, signal);
187
+ return { playback: "stream" };
188
+ }
189
+
190
+ const result = await speakToFile(text, services, signal);
191
+ try {
192
+ await audio.play(result.path, signal);
193
+ return { ...result, playback: "file" };
194
+ } finally {
195
+ await audio.cleanup(result.path);
196
+ }
197
+ }
198
+
199
+ async function speakToFile(text: string, services: VoiceToolServices, signal?: AbortSignal) {
188
200
  const config = services.getConfig();
189
201
  await mkdir(config.audioDir, { recursive: true });
190
202
  const path = join(config.audioDir, `pi-listens-output-${Date.now()}-${randomUUID()}.${audioExtensionForCodec(config.ttsOutputCodec)}`);
package/src/voice-ui.ts CHANGED
@@ -88,7 +88,7 @@ class VoiceLoopEditor extends CustomEditor {
88
88
  if (mouse.pressed && mouse.button === 0) this.triggerMouseOrbClick(mouse);
89
89
  return;
90
90
  }
91
- if (data.toLowerCase() === "r") {
91
+ if (data === " ") {
92
92
  this.triggerOrbClick(1);
93
93
  this.callbacks.startListening();
94
94
  return;
@@ -309,7 +309,7 @@ function frameIntervalForStatus(status: VoiceModeState["status"]): number {
309
309
  function controlRail(state: VoiceModeState, palette: OrbPalette, width: number): string[] {
310
310
  const listenLabel = state.isListening ? "stop" : "listen";
311
311
  const pills = [
312
- controlPill("R", listenLabel, state.isListening ? "active" : "primary", palette),
312
+ controlPill("Space", listenLabel, state.isListening ? "active" : "primary", palette),
313
313
  controlPill("A", state.autoListen ? "auto-listen on" : "auto-listen off", state.autoListen ? "active" : "muted", palette),
314
314
  controlPill("S", state.autoSpeakAssistant ? "read aloud on" : "read aloud off", state.autoSpeakAssistant ? "active" : "muted", palette),
315
315
  controlPill("Q", "close", "danger", palette),