npm - @p8n.ai/pi-listens - Versions diffs - 0.1.0 → 0.1.2 - Mend

@p8n.ai/pi-listens 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

package/CHANGELOG.md +19 -1
package/README.md +5 -5
package/package.json +2 -2
package/skills/pi-listens/SKILL.md +5 -4
package/src/audio.ts +115 -15
package/src/commands.ts +30 -12
package/src/config.ts +1 -1
package/src/index.ts +1 -1
package/src/sarvam.ts +73 -16
package/src/text.ts +17 -2
package/src/tools.ts +29 -17
package/src/voice-ui.ts +2 -2

package/CHANGELOG.md CHANGED Viewed

@@ -6,6 +6,22 @@ This project follows [Semantic Versioning](https://semver.org/).
 ## [Unreleased]
+## [0.1.2] - 2026-05-09
+### Changed
+- Stream TTS audio directly to the local player so speech starts sooner.
+- Make `voice_output` non-blocking by default; pass `wait_for_playback: true` to wait.
+- Replace the `R` voice-panel shortcut with Space for easier listen/stop control.
+## [0.1.1] - 2026-05-09
+### Fixed
+- Return Sarvam STT results faster after flushing microphone audio.
+- Stop current speech playback before starting a new listen, without cancelling the new recording.
+- Keep spoken auto-summaries concise and avoid headings, hashtags, bullet lists, and boilerplate recaps.
 ## [0.1.0] - 2026-05-09
 ### Added
@@ -23,5 +39,7 @@ This project follows [Semantic Versioning](https://semver.org/).
 - Stop active audio capture/playback subprocesses when voice mode is closed or the Pi session shuts down.
 - Clean up generated audio files when spoken playback is interrupted.
-[Unreleased]: https://github.com/p8n-ai/pi-listens/compare/v0.1.0...HEAD
+[Unreleased]: https://github.com/p8n-ai/pi-listens/compare/v0.1.2...HEAD
 [0.1.0]: https://github.com/p8n-ai/pi-listens/releases/tag/v0.1.0
+[0.1.1]: https://github.com/p8n-ai/pi-listens/releases/tag/v0.1.1
+[0.1.2]: https://github.com/p8n-ai/pi-listens/releases/tag/v0.1.2

package/README.md CHANGED Viewed

@@ -80,9 +80,9 @@ The package registers these tools for Pi's agent:
 The extension also injects voice guidance into the system prompt:
 - use `voice_ask` whenever user input is needed in voice-first sessions
-- use `voice_output` for short spoken status or response snippets
+- use `voice_output` only for short spoken status or response snippets
+- keep spoken replies to 1-2 short sentences with no headings, hashtags, bullet lists, boilerplate recaps, or full task summaries
 - do not speak code blocks, logs, diffs, stack traces, or long explanations
-- keep spoken questions concise and answerable in a short response
 ## Commands
@@ -95,10 +95,10 @@ The extension also injects voice guidance into the system prompt:
 | `/voice-status` | Show setup and voice-mode status. |
 Voice panel controls in interactive mode:
-- R: listen now; press again while listening to stop listening
+- Space: listen now; press again while listening to stop listening; if Pi is speaking, Space stops playback before listening
 - A: auto-listen on/off (listen again after each assistant reply)
 - S: read aloud on/off (speak assistant replies)
-- Q: close the panel (and stop listening first if needed)
+- Q: close the panel and stop any active listening or speaking
 - Click the orb: visual ripple feedback (terminals with mouse reporting)
 ## Headless/RPC behavior
@@ -144,7 +144,7 @@ Example config file:
   "ttsOutputCodec": "wav",
   "textFallback": true,
   "autoSpeakAssistant": false,
-  "maxAutoSpeakChars": 900
+  "maxAutoSpeakChars": 320
 }
 ```

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@p8n.ai/pi-listens",
-  "version": "0.1.0",
+  "version": "0.1.2",
   "description": "Pi package for speech-first interaction using Sarvam AI speech-to-text and text-to-speech.",
   "author": "Ravindra Barthwal",
   "license": "MIT",
@@ -33,7 +33,7 @@
   ],
   "scripts": {
     "typecheck": "tsc --noEmit",
-    "test": "npm run typecheck"
+    "test": "npm run typecheck && node --import tsx --test test/**/*.test.ts"
   },
   "pi": {
     "extensions": [

package/skills/pi-listens/SKILL.md CHANGED Viewed

@@ -19,10 +19,11 @@ This Pi package provides voice tools backed by Sarvam AI.
 1. When you need user input, clarification, or confirmation, use `voice_ask` instead of asking only in text.
 2. Before using `voice_input`, make sure the user already knows you are listening. If not, use `voice_ask`.
-3. Use `voice_output` for concise spoken status updates or spoken summaries that matter to the user.
-4. Do not speak code blocks, diffs, stack traces, logs, long tables, or lengthy explanations. Summarize briefly and leave details in text.
-5. Treat transcripts returned by `voice_input` or `voice_ask` as user input, while allowing for speech-recognition mistakes. If the transcript is ambiguous, ask a short follow-up with `voice_ask`.
-6. If speech is not recognized, rely on the tool's text fallback when available, or ask again with a shorter prompt.
+3. Use `voice_output` only for concise spoken status updates or spoken summaries that matter to the user.
+4. Spoken output must be brief: 1-2 short sentences, no markdown headings, no hashtags, no bullet lists, no boilerplate recap, and no full task summaries. Leave details in text.
+5. Do not speak code blocks, diffs, stack traces, logs, long tables, or lengthy explanations. Summarize briefly and leave details in text.
+6. Treat transcripts returned by `voice_input` or `voice_ask` as user input, while allowing for speech-recognition mistakes. If the transcript is ambiguous, ask a short follow-up with `voice_ask`.
+7. If speech is not recognized, rely on the tool's text fallback when available, or ask again with a shorter prompt.
 ## Good voice question style

package/src/audio.ts CHANGED Viewed

@@ -1,7 +1,7 @@
 import { mkdir, rm } from "node:fs/promises";
 import { randomUUID } from "node:crypto";
 import { join } from "node:path";
-import { spawn } from "node:child_process";
+import { spawn, type StdioOptions } from "node:child_process";
 import { accessSync, constants } from "node:fs";
 import { once } from "node:events";
 import type { PiListensConfig } from "./config.js";
@@ -10,14 +10,17 @@ export interface AudioRuntime {
 	record(seconds?: number, signal?: AbortSignal): Promise<string>;
 	streamPcm(signal?: AbortSignal): AsyncIterable<Buffer>;
 	play(path: string, signal?: AbortSignal): Promise<void>;
+	playStream(stream: ReadableStream<Uint8Array>, signal?: AbortSignal): Promise<void>;
 	cleanup(path: string): Promise<void>;
+	stopPlayback(): void;
 	stopAll(): void;
-	describe(): { recorder: string; player: string };
+	describe(): { recorder: string; player: string; streamingPlayer: string };
 }
 export function createAudioRuntime(config: PiListensConfig): AudioRuntime {
 	const recorder = config.recordCommand ? "custom" : detectRecorder();
 	const player = config.playCommand ? "custom" : detectPlayer();
+	const streamingPlayer = detectStreamingPlayer();
 	return {
 		async record(seconds = config.recordSeconds, signal?: AbortSignal): Promise<string> {
@@ -41,7 +44,7 @@ export function createAudioRuntime(config: PiListensConfig): AudioRuntime {
 				: useUtteranceMode
 					? utteranceRecorderCommand(recorder, path, config.recordSampleRate, config.silenceStartSeconds, config.silenceStopSeconds, config.silenceThreshold)
 					: recorderCommand(recorder, path, seconds, config.recordSampleRate);
-			await run(command.command, command.args, signal, useUtteranceMode ? { timeoutMs: seconds * 1000, resolveOnTimeout: true } : undefined);
+			await run(command.command, command.args, signal, { ...(useUtteranceMode ? { timeoutMs: seconds * 1000, resolveOnTimeout: true } : {}), kind: "record" });
 			return path;
 		},
@@ -54,7 +57,7 @@ export function createAudioRuntime(config: PiListensConfig): AudioRuntime {
 			const command = config.streamCommand
 				? customCommand(config.streamCommand, { sampleRate: config.recordSampleRate })
 				: pcmStreamCommand(recorder, config.recordSampleRate);
-			return streamCommandOutput(command.command, command.args, signal);
+			return streamCommandOutput(command.command, command.args, signal, "record");
 		},
 		async play(path: string, signal?: AbortSignal): Promise<void> {
@@ -64,7 +67,17 @@ export function createAudioRuntime(config: PiListensConfig): AudioRuntime {
 				);
 			}
 			const command = config.playCommand ? customCommand(config.playCommand, { path }) : playerCommand(player, path);
-			await run(command.command, command.args, signal);
+			await run(command.command, command.args, signal, { kind: "play" });
+		},
+		async playStream(stream: ReadableStream<Uint8Array>, signal?: AbortSignal): Promise<void> {
+			if (!streamingPlayer) {
+				throw new Error(
+					"No streaming audio player found. Install ffplay or sox (`play`) for low-latency TTS playback, or use file playback fallback.",
+				);
+			}
+			const command = streamingPlayerCommand(streamingPlayer, config.ttsOutputCodec, config.ttsSampleRate);
+			await pipeStreamToCommand(stream, command.command, command.args, signal);
 		},
 		async cleanup(path: string): Promise<void> {
@@ -72,12 +85,16 @@ export function createAudioRuntime(config: PiListensConfig): AudioRuntime {
 			await rm(path, { force: true }).catch(() => undefined);
 		},
+		stopPlayback(): void {
+			stopActiveAudioProcesses({ kind: "play" });
+		},
 		stopAll(): void {
 			stopActiveAudioProcesses();
 		},
 		describe() {
-			return { recorder: recorder ?? "missing", player: player ?? "missing" };
+			return { recorder: recorder ?? "missing", player: player ?? "missing", streamingPlayer: streamingPlayer ?? "missing" };
 		},
 	};
 }
@@ -200,6 +217,13 @@ function detectPlayer(): string | null {
 	return null;
 }
+/**
+ * Picks the first stdin-capable player found on PATH, in preference order
+ * ffplay, play (SoX), aplay; returns null when none is installed.
+ *
+ * NOTE(review): the choice ignores the configured TTS codec, yet
+ * streamingPlayerCommand only accepts aplay for "wav" — confirm callers fall
+ * back to file playback when aplay is the only player and the codec differs.
+ */
+function detectStreamingPlayer(): string | null {
+	const candidates = ["ffplay", "play", "aplay"];
+	return candidates.find((name) => isCommandAvailable(name)) ?? null;
+}
 function isCommandAvailable(command: string): boolean {
 	const paths = (process.env.PATH ?? "").split(":").filter(Boolean);
 	for (const dir of paths) {
@@ -213,14 +237,14 @@ function isCommandAvailable(command: string): boolean {
 	return false;
 }
-function run(command: string, args: string[], signal?: AbortSignal, options: { timeoutMs?: number; resolveOnTimeout?: boolean } = {}): Promise<void> {
+function run(command: string, args: string[], signal?: AbortSignal, options: { timeoutMs?: number; resolveOnTimeout?: boolean; kind?: AudioProcessKind } = {}): Promise<void> {
 	return new Promise((resolve, reject) => {
 		if (signal?.aborted) {
 			reject(new Error("Cancelled"));
 			return;
 		}
-		const child = spawnManaged(command, args);
+		const child = spawnManaged(command, args, options.kind ?? "other");
 		let stderr = "";
 		let stdout = "";
 		let timedOut = false;
@@ -265,9 +289,9 @@ function run(command: string, args: string[], signal?: AbortSignal, options: { t
 	});
 }
-async function* streamCommandOutput(command: string, args: string[], signal?: AbortSignal): AsyncIterable<Buffer> {
+async function* streamCommandOutput(command: string, args: string[], signal?: AbortSignal, kind: AudioProcessKind = "other"): AsyncIterable<Buffer> {
 	if (signal?.aborted) throw new Error("Cancelled");
-	const child = spawnManaged(command, args);
+	const child = spawnManaged(command, args, kind);
 	let stderr = "";
 	let exitCode: number | null = null;
 	let exitSignal: NodeJS.Signals | null = null;
@@ -298,23 +322,99 @@ async function* streamCommandOutput(command: string, args: string[], signal?: Ab
 	}
 }
+/**
+ * Spawns `command` as a managed "play" process and pumps `stream` into its
+ * stdin, resolving once the player has exited with code 0.
+ *
+ * @param stream  Audio bytes to play. The stream is cancelled and its reader
+ *                released before this function settles, on every path.
+ * @param command Player executable.
+ * @param args    Player arguments (the player is expected to read from stdin).
+ * @param signal  Optional abort signal; aborting terminates the player.
+ * @throws Error("Cancelled") when aborted, the spawn error when the player
+ *         could not start, or "<command> failed…" on a non-zero or signal exit.
+ */
+async function pipeStreamToCommand(stream: ReadableStream<Uint8Array>, command: string, args: string[], signal?: AbortSignal): Promise<void> {
+	if (signal?.aborted) throw new Error("Cancelled");
+	const child = spawnManaged(command, args, "play", ["pipe", "pipe", "pipe"]);
+	// Everything written from event callbacks lives on one object.
+	const state: {
+		stderr: string;
+		stdout: string;
+		exitCode: number | null;
+		exitSignal: NodeJS.Signals | null;
+		closed: boolean;
+		spawnError?: Error;
+		stdinError?: Error;
+	} = { stderr: "", stdout: "", exitCode: null, exitSignal: null, closed: false };
+	const stop = () => terminateChild(child);
+	signal?.addEventListener("abort", stop, { once: true });
+	child.stdout?.on("data", (chunk) => { state.stdout += chunk.toString(); });
+	child.stderr?.on("data", (chunk) => { state.stderr += chunk.toString(); });
+	// Settles once the player is gone for good: it closed, or it never spawned.
+	// A signal-killed player closes with code === null, so "closed" is tracked
+	// separately instead of being inferred from the exit code.
+	const exited = new Promise<void>((resolve) => {
+		child.on("error", (err) => { state.spawnError = err; resolve(); });
+		child.on("close", (code, termSignal) => {
+			state.exitCode = code;
+			state.exitSignal = termSignal;
+			state.closed = true;
+			resolve();
+		});
+	});
+	const reader = stream.getReader();
+	try {
+		if (!child.stdin) throw new Error(`${command} did not provide stdin for streaming audio playback`);
+		const stdin = child.stdin;
+		// Without a listener, EPIPE from a player that exits early is raised as an
+		// uncaught exception. The player's exit status stays the authoritative result.
+		stdin.on("error", (err) => { state.stdinError = err; });
+		// Resolves on drain, or as soon as the pipe or the player goes away.
+		const waitForDrainOrEnd = (): Promise<void> => new Promise((resolve) => {
+			const settle = () => {
+				stdin.off("drain", settle);
+				stdin.off("close", settle);
+				stdin.off("error", settle);
+				child.off("close", settle);
+				resolve();
+			};
+			stdin.on("drain", settle);
+			stdin.on("close", settle);
+			stdin.on("error", settle);
+			child.on("close", settle);
+		});
+		while (!state.closed && !state.stdinError) {
+			if (signal?.aborted) throw new Error("Cancelled");
+			if (state.spawnError) throw state.spawnError;
+			const { done, value } = await reader.read();
+			if (done) break;
+			if (!value?.byteLength) continue;
+			if (state.closed || state.stdinError || stdin.destroyed) break;
+			if (!stdin.write(Buffer.from(value))) await waitForDrainOrEnd();
+		}
+		if (!stdin.destroyed) stdin.end();
+		if (!state.closed && !state.spawnError) await exited;
+		if (signal?.aborted) throw new Error("Cancelled");
+		if (state.spawnError) throw state.spawnError;
+		if (state.exitCode !== 0) {
+			const output = [state.stderr.trim(), state.stdout.trim()].filter(Boolean).join("\n");
+			throw new Error(`${command} failed${state.exitSignal ? ` (${state.exitSignal})` : ""}${state.exitCode === null ? "" : ` with exit code ${state.exitCode}`}${output ? `: ${output}` : ""}`);
+		}
+	} finally {
+		signal?.removeEventListener("abort", stop);
+		// Cancel so the upstream audio stream stops producing once playback is over or failed.
+		void reader.cancel().catch(() => undefined);
+		reader.releaseLock();
+		if (!child.killed && !state.closed) stop();
+	}
+}
+/**
+ * Builds the command line that makes `player` read `codec` audio from stdin.
+ *
+ * Headerless codecs (linear16, mulaw, alaw) get explicit format, sample-rate
+ * and mono-channel arguments; other codecs are left to the player to detect.
+ *
+ * @param player     "ffplay", "play" (SoX) or "aplay".
+ * @param codec      Configured TTS output codec.
+ * @param sampleRate Sample rate used for the headerless codecs.
+ * @throws Error when the player/codec pair is unsupported (aplay is wav-only).
+ */
+function streamingPlayerCommand(player: string, codec: PiListensConfig["ttsOutputCodec"], sampleRate: number): CommandSpec {
+	const rate = String(sampleRate);
+	if (player === "ffplay") {
+		const args = ["-nodisp", "-autoexit", "-loglevel", "error"];
+		if (codec === "linear16") args.push("-f", "s16le", "-ar", rate, "-ac", "1");
+		if (codec === "mulaw") args.push("-f", "mulaw", "-ar", rate, "-ac", "1");
+		if (codec === "alaw") args.push("-f", "alaw", "-ar", rate, "-ac", "1");
+		args.push("-i", "pipe:0");
+		return { command: "ffplay", args };
+	}
+	if (player === "play") {
+		if (codec === "linear16") return { command: "play", args: ["-q", "-r", rate, "-c", "1", "-b", "16", "-e", "signed-integer", "-t", "raw", "-"] };
+		// SoX has no "mulaw"/"alaw" file types; its raw mu-law and A-law handlers are "ul" and "al".
+		if (codec === "mulaw") return { command: "play", args: ["-q", "-r", rate, "-c", "1", "-t", "ul", "-"] };
+		if (codec === "alaw") return { command: "play", args: ["-q", "-r", rate, "-c", "1", "-t", "al", "-"] };
+		return { command: "play", args: ["-q", "-t", soxTypeForCodec(codec), "-"] };
+	}
+	if (player === "aplay" && codec === "wav") return { command: "aplay", args: ["-q", "-"] };
+	throw new Error(`Unsupported streaming player ${player} for codec ${codec}`);
+}
+/**
+ * Maps a TTS codec name to the type string handed to SoX `play -t`.
+ * "aac" becomes "adts", "linear16" becomes "raw", anything else passes through.
+ */
+function soxTypeForCodec(codec: PiListensConfig["ttsOutputCodec"]): string {
+	switch (codec) {
+		case "aac":
+			return "adts";
+		case "linear16":
+			return "raw";
+		default:
+			return codec;
+	}
+}
+// Tag recorded for each spawned audio process so playback can be stopped without touching recording.
+type AudioProcessKind = "record" | "play" | "other";
 type ManagedChild = ReturnType<typeof spawn>;
+// Children added by spawnManaged; each one is removed again on its "close" or "error" event.
 const activeChildren = new Set<ManagedChild>();
+// Kind tag per child, read by stopActiveAudioProcesses to filter which children to terminate.
+const childKinds = new WeakMap<ManagedChild, AudioProcessKind>();
+// NOTE(review): presumably marks children already being terminated — its use is not visible in this diff.
 const terminatingChildren = new WeakSet<ManagedChild>();
+// Guards installProcessExitCleanup so the process "exit" hook is registered only once.
 let processExitCleanupInstalled = false;
-export function stopActiveAudioProcesses(force = false): void {
-	for (const child of [...activeChildren]) terminateChild(child, force);
+/**
+ * Terminates tracked audio child processes.
+ *
+ * @param options `kind` limits termination to children spawned with that kind
+ *                (all children when omitted); `force` is forwarded to
+ *                terminateChild. A bare boolean is still accepted and treated
+ *                as `force`, matching the 0.1.0 signature.
+ */
+export function stopActiveAudioProcesses(options: { kind?: AudioProcessKind; force?: boolean } | boolean = {}): void {
+	// 0.1.0 callers passed `force` positionally; without this branch `true` would be read as an empty options bag.
+	const { kind, force = false } = typeof options === "boolean" ? { kind: undefined, force: options } : options;
+	// Iterate over a copy: terminating a child may remove it from the live set.
+	for (const child of [...activeChildren]) {
+		if (!kind || childKinds.get(child) === kind) terminateChild(child, force);
+	}
 }
-function spawnManaged(command: string, args: string[]): ManagedChild {
+function spawnManaged(command: string, args: string[], kind: AudioProcessKind, stdio: StdioOptions = ["ignore", "pipe", "pipe"]): ManagedChild {
 	installProcessExitCleanup();
 	const child = spawn(command, args, {
-		stdio: ["ignore", "pipe", "pipe"],
+		stdio,
 		detached: process.platform !== "win32",
 	});
 	activeChildren.add(child);
+	childKinds.set(child, kind);
 	const untrack = () => activeChildren.delete(child);
 	child.once("close", untrack);
 	child.once("error", untrack);
@@ -324,7 +424,7 @@ function spawnManaged(command: string, args: string[]): ManagedChild {
+/** Registers, at most once, a process "exit" listener that force-stops every tracked audio child. */
 function installProcessExitCleanup(): void {
 	if (processExitCleanupInstalled) return;
 	processExitCleanupInstalled = true;
-	process.once("exit", () => stopActiveAudioProcesses(true));
+	process.once("exit", () => stopActiveAudioProcesses({ force: true }));
 }
 function terminateChild(child: ManagedChild, force = false): void {

package/src/commands.ts CHANGED Viewed

@@ -74,6 +74,7 @@ export function registerVoiceCommands(pi: ExtensionAPI, services: VoiceToolServi
 					`Sarvam API key: ${config.apiKey ? "set" : "missing"}`,
 					`Recorder: ${audio.recorder}`,
 					`Player: ${audio.player}`,
+					`Streaming player: ${audio.streamingPlayer}`,
 					`STT: ${config.sttModel} (${config.translateInputToEnglish ? "translate→English" : config.sttMode}, ${config.sttLanguageCode})`,
 					`TTS: ${config.ttsModel} (${config.ttsLanguageCode}, speaker ${config.ttsSpeaker})`,
 				].join("\n"),
@@ -121,6 +122,7 @@ async function listenAndSend(
 		state.listenAbortController?.abort();
 		return;
 	}
+	stopSpeaking(services, state);
 	state.recordSeconds = seconds ?? services.getConfig().recordSeconds;
 	state.silenceStopSeconds = services.getConfig().silenceStopSeconds;
 	state.isListening = true;
@@ -176,10 +178,8 @@ async function listenAndSend(
 }
 async function speakText(services: VoiceToolServices, text: string, signal?: AbortSignal, state?: VoiceModeState, ctx?: ExtensionContext) {
-	const config = services.getConfig();
 	const speakAbortController = state ? new AbortController() : undefined;
 	const speakSignal = combineSignals(signal, speakAbortController?.signal);
-	let path: string | undefined;
 	if (state) {
 		state.speakAbortController?.abort();
@@ -189,14 +189,9 @@ async function speakText(services: VoiceToolServices, text: string, signal?: Abo
 	}
 	try {
-		await mkdir(config.audioDir, { recursive: true });
-		path = join(config.audioDir, `pi-listens-command-${Date.now()}.${audioExtensionForCodec(config.ttsOutputCodec)}`);
-		const result = await services.getSpeech().synthesizeToFile(text, path, speakSignal.signal);
-		path = result.path;
-		await services.getAudio().play(result.path, speakSignal.signal);
+		await playSpeechBest(services, text, speakSignal.signal);
 	} finally {
 		speakSignal.cleanup();
-		if (path) await services.getAudio().cleanup(path);
 		if (state && state.speakAbortController === speakAbortController) state.speakAbortController = undefined;
 		if (state && state.status === "speaking") {
 			state.status = "idle";
@@ -205,6 +200,25 @@ async function speakText(services: VoiceToolServices, text: string, signal?: Abo
 	}
 }
+/**
+ * Speaks `text` through the lowest-latency path available.
+ *
+ * When a streaming player was detected, TTS audio is streamed straight into
+ * it. Otherwise the audio is synthesized to a file under `audioDir`, played,
+ * and the file is removed afterwards, including when playback throws.
+ */
+async function playSpeechBest(services: VoiceToolServices, text: string, signal?: AbortSignal) {
+	const audio = services.getAudio();
+	if (audio.describe().streamingPlayer !== "missing") {
+		const result = await services.getSpeech().synthesizeStream(text, signal);
+		await audio.playStream(result.stream, signal);
+		return;
+	}
+	const config = services.getConfig();
+	await mkdir(config.audioDir, { recursive: true });
+	const requestedPath = join(config.audioDir, `pi-listens-command-${Date.now()}.${audioExtensionForCodec(config.ttsOutputCodec)}`);
+	// Clean up the file that was actually written: synthesizeToFile reports its own
+	// output path, and the 0.1.0 code removed that one rather than the requested path.
+	let writtenPath = requestedPath;
+	try {
+		const result = await services.getSpeech().synthesizeToFile(text, requestedPath, signal);
+		writtenPath = result.path;
+		await audio.play(writtenPath, signal);
+	} finally {
+		await audio.cleanup(writtenPath);
+	}
+}
 function parseSeconds(args: string): number | undefined {
 	const match = args.match(/(?:^|\s)(\d{1,4})(?:\s|$)/);
 	if (!match) return undefined;
@@ -253,6 +267,13 @@ function isCancelled(err: unknown): boolean {
 	return err instanceof Error && /cancelled|aborted/i.test(err.message);
 }
+/** Aborts the in-flight speak request (if any), clears it from `state`, and stops playback processes. */
+function stopSpeaking(services: VoiceToolServices, state: VoiceModeState) {
+	const { speakAbortController: pending } = state;
+	// Clear the slot before aborting so abort handlers never observe the stale controller.
+	state.speakAbortController = undefined;
+	if (pending) pending.abort();
+	const audio = services.getAudio();
+	audio.stopPlayback();
+}
 export function stopVoiceMode(services: VoiceToolServices, state: VoiceModeState, ctx?: ExtensionContext | ExtensionCommandContext) {
 	state.enabled = false;
 	state.autoListen = false;
@@ -264,10 +285,7 @@ export function stopVoiceMode(services: VoiceToolServices, state: VoiceModeState
 	state.listenAbortController = undefined;
 	listenAbortController?.abort();
-	const speakAbortController = state.speakAbortController;
-	state.speakAbortController = undefined;
-	speakAbortController?.abort();
+	stopSpeaking(services, state);
 	services.getAudio().stopAll();
 	if (ctx) uninstallVoiceUi(ctx, state);

package/src/config.ts CHANGED Viewed

@@ -60,7 +60,7 @@ const DEFAULT_CONFIG: PiListensConfig = {
 	deleteAudio: true,
 	textFallback: true,
 	autoSpeakAssistant: false,
-	maxAutoSpeakChars: 900,
+	maxAutoSpeakChars: 320,
 };
 type RawConfig = Partial<PiListensConfig>;

package/src/index.ts CHANGED Viewed

@@ -65,7 +65,7 @@ export default function piListensExtension(pi: ExtensionAPI) {
 	pi.on("before_agent_start", async (event) => {
 		return {
-			systemPrompt: `${event.systemPrompt}\n\nPi Listens voice guidance:\n- The user may primarily interact by speech through Sarvam AI. Text input is still possible.\n- When voice mode is active, treat it as a hands-free conversation: listen only while the voice UI/input tool is active, then pause listening while you work.\n- Use voice_output for concise spoken progress, completion, or status updates that matter to the user.\n- When you need clarification, confirmation, or any user input, prefer voice_ask with a concise spoken question instead of asking only in text.\n- Use voice_input only after the user already knows you are listening.\n- Do not speak code blocks, logs, diffs, stack traces, or long explanations; summarize them briefly and leave detail in text.`,
+			systemPrompt: `${event.systemPrompt}\n\nPi Listens voice guidance:\n- The user may primarily interact by speech through Sarvam AI. Text input is still possible.\n- When voice mode is active, treat it as a hands-free conversation: listen only while the voice UI/input tool is active, then pause listening while you work.\n- Use voice_output only for concise spoken progress, completion, or status updates that matter to the user.\n- Spoken replies must be brief: 1-2 short sentences, no headings, no hashtags, no bullet lists, no boilerplate recap, and no full task summaries. Leave details in text.\n- When you need clarification, confirmation, or any user input, prefer voice_ask with a concise spoken question instead of asking only in text.\n- Use voice_input only after the user already knows you are listening.\n- Do not speak code blocks, logs, diffs, stack traces, or long explanations; summarize briefly and leave detail in text.`,
 		};
 	});

package/src/sarvam.ts CHANGED Viewed

@@ -1,5 +1,4 @@
 import { readFile, writeFile } from "node:fs/promises";
-import { setTimeout as delay } from "node:timers/promises";
 import { SarvamAIClient } from "sarvamai";
 import type { AudioRuntime } from "./audio.js";
 import type { PiListensConfig, SttMode } from "./config.js";
@@ -16,6 +15,10 @@ export interface SynthesisResult {
 	bytes: number;
 }
+/** Result of a streamed TTS request: the audio bytes as a readable stream. */
+export interface SynthesisStreamResult {
+	stream: ReadableStream<Uint8Array>;
+}
 type StreamingData = {
 	transcript?: string;
 	request_id?: string;
@@ -128,6 +131,28 @@ export class SarvamSpeechClient {
 		return { path, bytes: buffer.byteLength };
 	}
+	/**
+	 * Requests streamed text-to-speech audio for `text` using the current config
+	 * (language, speaker, model, pace, temperature, sample rate, output codec),
+	 * with preprocessing enabled.
+	 *
+	 * @param text   Text to synthesize.
+	 * @param signal Optional abort signal, passed to the client as `abortSignal`.
+	 * @returns The readable audio stream from the response.
+	 * @throws Error when the response exposes no readable stream.
+	 */
+	async synthesizeStream(text: string, signal?: AbortSignal): Promise<SynthesisStreamResult> {
+		const config = this.getConfig();
+		const client = this.getClient(config);
+		const response = await client.textToSpeech.convertStream(
+			{
+				text,
+				target_language_code: config.ttsLanguageCode as never,
+				speaker: config.ttsSpeaker as never,
+				model: config.ttsModel as never,
+				pace: config.ttsPace,
+				temperature: config.ttsTemperature,
+				speech_sample_rate: config.ttsSampleRate as never,
+				enable_preprocessing: true,
+				output_audio_codec: config.ttsOutputCodec as never,
+			},
+			{ abortSignal: signal },
+		);
+		const stream = response.stream();
+		if (!stream) throw new Error("Sarvam TTS response did not include a readable audio stream");
+		return { stream };
+	}
 	private async withStreamingSocket(
 		signal: AbortSignal | undefined,
 		mode: SttMode | undefined,
@@ -145,35 +170,46 @@ export class SarvamSpeechClient {
 		let languageProbability: number | undefined;
 		let streamError: Error | undefined;
 		let lastMessageAt = Date.now();
+		const messageWaiters = new Set<() => void>();
 		const socket = connectStreamingSocket(config, mode ?? (config.translateInputToEnglish ? "translate" : config.sttMode), inputAudioCodec);
 		const closeOnAbort = () => socket.close();
 		signal?.addEventListener("abort", closeOnAbort, { once: true });
+		const notifyMessageWaiters = () => {
+			const waiters = [...messageWaiters];
+			messageWaiters.clear();
+			for (const waiter of waiters) waiter();
+		};
 		socket.onMessage((message: StreamingResponse) => {
 			lastMessageAt = Date.now();
-			if (message.type === "error") {
-				streamError = new Error(message.data?.error ?? message.data?.code ?? "Sarvam streaming STT failed");
-				return;
+			try {
+				if (message.type === "error") {
+					streamError = new Error(message.data?.error ?? message.data?.code ?? "Sarvam streaming STT failed");
+					return;
+				}
+				if (message.type !== "data") return;
+				const data = message.data;
+				if (!data) return;
+				transcript = mergeTranscript(transcript, data.transcript ?? "");
+				requestId = data.request_id ?? requestId;
+				languageCode = data.language_code ?? languageCode;
+				languageProbability = data.language_probability ?? languageProbability;
+			} finally {
+				notifyMessageWaiters();
 			}
-			if (message.type !== "data") return;
-			const data = message.data;
-			if (!data) return;
-			transcript = mergeTranscript(transcript, data.transcript ?? "");
-			requestId = data.request_id ?? requestId;
-			languageCode = data.language_code ?? languageCode;
-			languageProbability = data.language_probability ?? languageProbability;
 		});
-		socket.onError((error: Error) => { streamError = error; });
+		socket.onError((error: Error) => { streamError = error; notifyMessageWaiters(); });
 		try {
 			await socket.waitForOpen();
 			await streamAudio(socket, async () => {
 				const startedWaitingAt = Date.now();
-				while (Date.now() - startedWaitingAt < 3000) {
+				const maxWaitMs = transcript.trim() ? 900 : 1600;
+				const settleMs = 250;
+				while (Date.now() - startedWaitingAt < maxWaitMs) {
 					if (streamError) throw streamError;
-					if (Date.now() - lastMessageAt > 850 && transcript.trim()) break;
-					await delay(100, undefined, { signal }).catch((err) => { throw err; });
+					if (transcript.trim() && Date.now() - lastMessageAt >= settleMs) break;
+					await waitForMessageOrTimeout(messageWaiters, 50, signal);
 				}
 			});
 			if (streamError) throw streamError;
@@ -288,6 +324,27 @@ function connectStreamingSocket(config: PiListensConfig, mode: SttMode, inputAud
 	};
 }
+/**
+ * Resolves when a registered waiter is invoked or after `timeoutMs`, whichever
+ * comes first; rejects with Error("Cancelled") if `signal` is or becomes aborted.
+ *
+ * While pending, the wake-up callback is held in `waiters`; it is removed again
+ * (together with the timer and the abort listener) as soon as the promise settles.
+ */
+function waitForMessageOrTimeout(waiters: Set<() => void>, timeoutMs: number, signal?: AbortSignal): Promise<void> {
+	if (signal?.aborted) return Promise.reject(new Error("Cancelled"));
+	return new Promise<void>((resolve, reject) => {
+		let timer: ReturnType<typeof setTimeout> | undefined;
+		function release(): void {
+			clearTimeout(timer);
+			waiters.delete(wake);
+			signal?.removeEventListener("abort", cancel);
+		}
+		function wake(): void {
+			release();
+			resolve();
+		}
+		function cancel(): void {
+			release();
+			reject(new Error("Cancelled"));
+		}
+		timer = setTimeout(wake, timeoutMs);
+		waiters.add(wake);
+		signal?.addEventListener("abort", cancel, { once: true });
+	});
+}
 type CombinedSignal = { signal?: AbortSignal; cleanup: () => void };
 function combineSignals(...signals: Array<AbortSignal | undefined>): CombinedSignal {

package/src/text.ts CHANGED Viewed

@@ -16,17 +16,32 @@ export function firstTextContent(message: unknown): string {
+/**
+ * Flattens assistant text into something short enough to read aloud.
+ *
+ * Fenced code blocks are replaced by a spoken placeholder, markdown heading,
+ * bullet and numbered-list markers are dropped, inline code is unwrapped, URLs
+ * become the word "link", leftover markdown punctuation becomes a space, and
+ * whitespace is collapsed. The result is passed through conciseSpokenSummary
+ * and, if still longer than `maxChars`, cut to `maxChars - 32` characters with
+ * "… More on screen." appended.
+ */
 export function prepareSpokenText(text: string, maxChars: number): string {
 	let prepared = text
-		.replace(/```[\s\S]*?```/g, " I am skipping a code block. ")
+		.replace(/```[\s\S]*?```/g, " I skipped a code block. ")
+		// Line-anchored passes: heading markers, bullet markers, numbered-list markers.
+		.replace(/^\s{0,3}#{1,6}\s+/gm, "")
+		.replace(/^\s*[-*+]\s+/gm, "")
+		.replace(/^\s*\d+[.)]\s+/gm, "")
 		.replace(/`([^`]+)`/g, "$1")
 		.replace(/https?:\/\/\S+/g, "link")
+		// Any markdown punctuation still left (emphasis, quotes, tables) is replaced by a space.
+		.replace(/[#*_>~|]+/g, " ")
 		.replace(/\s+/g, " ")
 		.trim();
+	prepared = conciseSpokenSummary(prepared);
 	if (prepared.length > maxChars) {
-		prepared = `${prepared.slice(0, Math.max(0, maxChars - 80)).trim()}… I have more details on screen.`;
+		prepared = `${prepared.slice(0, Math.max(0, maxChars - 32)).trim()}… More on screen.`;
 	}
 	return prepared;
 }
+/**
+ * Reduces flattened text to at most two sentences for speech.
+ *
+ * Sentences that open with a boilerplate lead-in ("Sure", "Here's", "Summary",
+ * "Done", "I've", …) are skipped unless nothing else is left. Returns the
+ * input unchanged when no sentence can be extracted.
+ */
+function conciseSpokenSummary(text: string): string {
+	// Break only where terminal punctuation is followed by whitespace, so tokens
+	// such as "v0.1.2", "3.5" or "index.ts" are not cut into bogus sentences.
+	const sentences = text.split(/(?<=[.!?])\s+/).map((part) => part.trim()).filter(Boolean);
+	if (sentences.length === 0) return text;
+	const useful = sentences.filter((sentence) => !/^(sure|here('|’)s|summary|in summary|done|completed|i('|’)ve|i have)\b/i.test(sentence));
+	const picked = (useful.length ? useful : sentences).slice(0, 2).join(" ").trim();
+	return picked || text;
+}
 export function conciseTranscript(transcript: string): string {
 	const trimmed = transcript.trim();
 	return trimmed.length === 0 ? "(no speech recognized)" : trimmed;

package/src/tools.ts CHANGED Viewed

@@ -17,7 +17,7 @@ export interface VoiceToolServices {
 const VoiceOutputParams = Type.Object({
 	text: Type.String({ description: "Short text to speak to the user. Keep it concise; do not speak code blocks or long logs." }),
-	wait_for_playback: Type.Optional(Type.Boolean({ description: "Wait until audio playback completes before returning. Default true." })),
+	wait_for_playback: Type.Optional(Type.Boolean({ description: "Wait until audio playback completes before returning. Default false." })),
 });
 const VoiceInputParams = Type.Object({
@@ -49,26 +49,25 @@ export function registerVoiceTools(pi: ExtensionAPI, services: VoiceToolServices
 		description: "Speak a short message to the user using Sarvam AI text-to-speech and local audio playback.",
 		promptSnippet: "Speak short user-facing messages with Sarvam AI TTS",
 		promptGuidelines: [
-			"Use voice_output when a spoken user-facing message matters, especially before waiting for voice input.",
-			"Keep voice_output text brief and conversational; do not speak code blocks, command output, stack traces, or long explanations.",
+			"Use voice_output only when a spoken user-facing message matters, especially before waiting for voice input.",
+			"Keep voice_output to 1-2 short conversational sentences. Do not speak headings, hashtags, bullet lists, boilerplate recaps, code, command output, stack traces, or long explanations.",
 		],
 		parameters: VoiceOutputParams,
 		async execute(_toolCallId, params: VoiceOutputInput, signal, onUpdate) {
-			onUpdate?.({ content: [{ type: "text", text: "Synthesizing speech with Sarvam AI…" }], details: {} });
-			const result = await speak(params.text, services, signal);
-			const playback = services.getAudio().play(result.path, signal).finally(() => services.getAudio().cleanup(result.path));
-			if (params.wait_for_playback === false) {
+			onUpdate?.({ content: [{ type: "text", text: "Starting streamed speech with Sarvam AI…" }], details: {} });
+			const playback = playSpeechBest(params.text, services, signal);
+			if (params.wait_for_playback !== true) {
 				void playback.catch(() => undefined);
 				return {
 					content: [{ type: "text", text: `Started speaking to user: ${params.text}` }],
-					details: { ...result, played: "started", text: params.text },
+					details: { played: "started", text: params.text },
 				};
 			}
 			onUpdate?.({ content: [{ type: "text", text: "Playing audio…" }], details: {} });
-			await playback;
+			const details = await playback;
 			return {
 				content: [{ type: "text", text: `Spoke to user: ${params.text}` }],
-				details: { ...result, played: true, text: params.text },
+				details: { ...details, played: true, text: params.text },
 			};
 		},
 		renderCall(args: VoiceOutputInput, theme) {
@@ -117,12 +116,7 @@ export function registerVoiceTools(pi: ExtensionAPI, services: VoiceToolServices
 		parameters: VoiceAskParams,
 		async execute(_toolCallId, params: VoiceAskInput, signal, onUpdate, ctx) {
 			onUpdate?.({ content: [{ type: "text", text: "Speaking question…" }], details: {} });
-			const spoken = await speak(params.question, services, signal);
-			try {
-				await services.getAudio().play(spoken.path, signal);
-			} finally {
-				await services.getAudio().cleanup(spoken.path);
-			}
+			await playSpeechBest(params.question, services, signal);
 			const answer = await listenAndMaybeFallback(
 				params,
 				services,
@@ -173,6 +167,7 @@ export function registerVoiceTools(pi: ExtensionAPI, services: VoiceToolServices
 							`Sarvam API key: ${config.apiKey ? "set" : "missing"}`,
 							`Recorder: ${audio.recorder}`,
 							`Player: ${audio.player}`,
+							`Streaming player: ${audio.streamingPlayer}`,
 							`STT: ${config.sttModel} (${config.translateInputToEnglish ? "translate→English" : config.sttMode}, ${config.sttLanguageCode})`,
 							`TTS: ${config.ttsModel} (${config.ttsLanguageCode}, speaker ${config.ttsSpeaker})`,
 						].join("\n"),
@@ -184,7 +179,24 @@ export function registerVoiceTools(pi: ExtensionAPI, services: VoiceToolServices
 	});
 }
-async function speak(text: string, services: VoiceToolServices, signal?: AbortSignal) {
+/**
+ * Speaks `text` using the streaming path when a streaming player is reported,
+ * otherwise synthesizes to a file, plays it, and removes the file afterwards.
+ *
+ * The returned promise settles only after the awaited playback call settles,
+ * so callers that want fire-and-forget behaviour must not await it.
+ *
+ * @param text - Text to synthesize and play.
+ * @param services - Provides the audio and speech services used here.
+ * @param signal - Optional abort signal, forwarded to every synthesis and playback call.
+ * @returns `{ playback: "stream" }` for the streaming path, or the file synthesis
+ *   result spread together with `playback: "file"` for the fallback path.
+ */
+async function playSpeechBest(text: string, services: VoiceToolServices, signal?: AbortSignal): Promise<Record<string, unknown>> {
+	const audio = services.getAudio();
+	// Streaming is chosen whenever describe() reports anything other than "missing".
+	if (audio.describe().streamingPlayer !== "missing") {
+		const result = await services.getSpeech().synthesizeStream(text, signal);
+		await audio.playStream(result.stream, signal);
+		return { playback: "stream" };
+	}
+	// Fallback: synthesize to a file first. This call sits outside the try block,
+	// so cleanup is not attempted when synthesis itself fails.
+	const result = await speakToFile(text, services, signal);
+	try {
+		await audio.play(result.path, signal);
+		return { ...result, playback: "file" };
+	} finally {
+		// Runs whether playback resolved or threw, so the generated file is always removed.
+		await audio.cleanup(result.path);
+	}
+}
+async function speakToFile(text: string, services: VoiceToolServices, signal?: AbortSignal) {
 	const config = services.getConfig();
 	await mkdir(config.audioDir, { recursive: true });
 	const path = join(config.audioDir, `pi-listens-output-${Date.now()}-${randomUUID()}.${audioExtensionForCodec(config.ttsOutputCodec)}`);

package/src/voice-ui.ts CHANGED Viewed

@@ -88,7 +88,7 @@ class VoiceLoopEditor extends CustomEditor {
 			if (mouse.pressed && mouse.button === 0) this.triggerMouseOrbClick(mouse);
 			return;
 		}
-		if (data.toLowerCase() === "r") {
+		if (data === " ") {
 			this.triggerOrbClick(1);
 			this.callbacks.startListening();
 			return;
@@ -309,7 +309,7 @@ function frameIntervalForStatus(status: VoiceModeState["status"]): number {
 function controlRail(state: VoiceModeState, palette: OrbPalette, width: number): string[] {
 	const listenLabel = state.isListening ? "stop" : "listen";
 	const pills = [
-		controlPill("R", listenLabel, state.isListening ? "active" : "primary", palette),
+		controlPill("Space", listenLabel, state.isListening ? "active" : "primary", palette),
 		controlPill("A", state.autoListen ? "auto-listen on" : "auto-listen off", state.autoListen ? "active" : "muted", palette),
 		controlPill("S", state.autoSpeakAssistant ? "read aloud on" : "read aloud off", state.autoSpeakAssistant ? "active" : "muted", palette),
 		controlPill("Q", "close", "danger", palette),