npm - @p8n.ai/pi-listens - Versions diffs - 0.1.1 → 0.1.2 - Mend

@p8n.ai/pi-listens 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

package/CHANGELOG.md CHANGED Viewed

@@ -6,6 +6,14 @@ This project follows [Semantic Versioning](https://semver.org/).
 ## [Unreleased]
+## [0.1.2] - 2026-05-09
+### Changed
+- Stream TTS audio directly to the local player so speech starts sooner.
+- Make `voice_output` non-blocking by default; pass `wait_for_playback: true` to wait.
+- Replace the `R` voice-panel shortcut with Space for easier listen/stop control.
 ## [0.1.1] - 2026-05-09
 ### Fixed
@@ -31,6 +39,7 @@ This project follows [Semantic Versioning](https://semver.org/).
 - Stop active audio capture/playback subprocesses when voice mode is closed or the Pi session shuts down.
 - Clean up generated audio files when spoken playback is interrupted.
-[Unreleased]: https://github.com/p8n-ai/pi-listens/compare/v0.1.1...HEAD
+[Unreleased]: https://github.com/p8n-ai/pi-listens/compare/v0.1.2...HEAD
 [0.1.0]: https://github.com/p8n-ai/pi-listens/releases/tag/v0.1.0
 [0.1.1]: https://github.com/p8n-ai/pi-listens/releases/tag/v0.1.1
+[0.1.2]: https://github.com/p8n-ai/pi-listens/releases/tag/v0.1.2

package/README.md CHANGED Viewed

@@ -95,7 +95,7 @@ The extension also injects voice guidance into the system prompt:
 | `/voice-status` | Show setup and voice-mode status. |
 Voice panel controls in interactive mode:
-- R: listen now; press again while listening to stop listening; if Pi is speaking, R stops playback before listening
+- Space: listen now; press again while listening to stop listening; if Pi is speaking, Space stops playback before listening
 - A: auto-listen on/off (listen again after each assistant reply)
 - S: read aloud on/off (speak assistant replies)
 - Q: close the panel and stop any active listening or speaking

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@p8n.ai/pi-listens",
-  "version": "0.1.1",
+  "version": "0.1.2",
   "description": "Pi package for speech-first interaction using Sarvam AI speech-to-text and text-to-speech.",
   "author": "Ravindra Barthwal",
   "license": "MIT",

package/src/audio.ts CHANGED Viewed

@@ -1,7 +1,7 @@
 import { mkdir, rm } from "node:fs/promises";
 import { randomUUID } from "node:crypto";
 import { join } from "node:path";
-import { spawn } from "node:child_process";
+import { spawn, type StdioOptions } from "node:child_process";
 import { accessSync, constants } from "node:fs";
 import { once } from "node:events";
 import type { PiListensConfig } from "./config.js";
@@ -10,15 +10,17 @@ export interface AudioRuntime {
 	record(seconds?: number, signal?: AbortSignal): Promise<string>;
 	streamPcm(signal?: AbortSignal): AsyncIterable<Buffer>;
 	play(path: string, signal?: AbortSignal): Promise<void>;
+	playStream(stream: ReadableStream<Uint8Array>, signal?: AbortSignal): Promise<void>;
 	cleanup(path: string): Promise<void>;
 	stopPlayback(): void;
 	stopAll(): void;
-	describe(): { recorder: string; player: string };
+	describe(): { recorder: string; player: string; streamingPlayer: string };
 }
 export function createAudioRuntime(config: PiListensConfig): AudioRuntime {
 	const recorder = config.recordCommand ? "custom" : detectRecorder();
 	const player = config.playCommand ? "custom" : detectPlayer();
+	const streamingPlayer = detectStreamingPlayer();
 	return {
 		async record(seconds = config.recordSeconds, signal?: AbortSignal): Promise<string> {
@@ -68,6 +70,16 @@ export function createAudioRuntime(config: PiListensConfig): AudioRuntime {
 			await run(command.command, command.args, signal, { kind: "play" });
 		},
+		async playStream(stream: ReadableStream<Uint8Array>, signal?: AbortSignal): Promise<void> {
+			if (!streamingPlayer) {
+				throw new Error(
+					"No streaming audio player found. Install ffplay or sox (`play`) for low-latency TTS playback, or use file playback fallback.",
+				);
+			}
+			const command = streamingPlayerCommand(streamingPlayer, config.ttsOutputCodec, config.ttsSampleRate);
+			await pipeStreamToCommand(stream, command.command, command.args, signal);
+		},
 		async cleanup(path: string): Promise<void> {
 			if (!config.deleteAudio) return;
 			await rm(path, { force: true }).catch(() => undefined);
@@ -82,7 +94,7 @@ export function createAudioRuntime(config: PiListensConfig): AudioRuntime {
 		},
 		describe() {
-			return { recorder: recorder ?? "missing", player: player ?? "missing" };
+			return { recorder: recorder ?? "missing", player: player ?? "missing", streamingPlayer: streamingPlayer ?? "missing" };
 		},
 	};
 }
@@ -205,6 +217,13 @@ function detectPlayer(): string | null {
 	return null;
 }
+function detectStreamingPlayer(): string | null {
+	if (isCommandAvailable("ffplay")) return "ffplay";
+	if (isCommandAvailable("play")) return "play";
+	if (isCommandAvailable("aplay")) return "aplay";
+	return null;
+}
 function isCommandAvailable(command: string): boolean {
 	const paths = (process.env.PATH ?? "").split(":").filter(Boolean);
 	for (const dir of paths) {
@@ -303,6 +322,76 @@ async function* streamCommandOutput(command: string, args: string[], signal?: Ab
 	}
 }
+async function pipeStreamToCommand(stream: ReadableStream<Uint8Array>, command: string, args: string[], signal?: AbortSignal): Promise<void> {
+	if (signal?.aborted) throw new Error("Cancelled");
+	const child = spawnManaged(command, args, "play", ["pipe", "pipe", "pipe"]);
+	let stderr = "";
+	let stdout = "";
+	let exitCode: number | null = null;
+	let exitSignal: NodeJS.Signals | null = null;
+	let spawnError: Error | undefined;
+	const stop = () => terminateChild(child);
+	signal?.addEventListener("abort", stop, { once: true });
+	child.stdout?.on("data", (chunk) => { stdout += chunk.toString(); });
+	child.stderr?.on("data", (chunk) => { stderr += chunk.toString(); });
+	child.on("error", (err) => { spawnError = err; });
+	child.on("close", (code, termSignal) => { exitCode = code; exitSignal = termSignal; });
+	try {
+		if (!child.stdin) throw new Error(`${command} did not provide stdin for streaming audio playback`);
+		const stdin = child.stdin;
+		const reader = stream.getReader();
+		try {
+			while (true) {
+				if (signal?.aborted) throw new Error("Cancelled");
+				if (spawnError) throw spawnError;
+				const { done, value } = await reader.read();
+				if (done) break;
+				if (!value?.byteLength) continue;
+				if (!stdin.write(Buffer.from(value))) await once(stdin, "drain");
+			}
+		} finally {
+			reader.releaseLock();
+		}
+		stdin.end();
+		if (exitCode === null && !spawnError) await once(child, "close");
+		if (signal?.aborted) throw new Error("Cancelled");
+		if (spawnError) throw spawnError;
+		if (exitCode !== 0) {
+			const output = [stderr.trim(), stdout.trim()].filter(Boolean).join("\n");
+			throw new Error(`${command} failed${exitSignal ? ` (${exitSignal})` : ""}${exitCode === null ? "" : ` with exit code ${exitCode}`}${output ? `: ${output}` : ""}`);
+		}
+	} finally {
+		signal?.removeEventListener("abort", stop);
+		if (!child.killed && exitCode === null) stop();
+	}
+}
+function streamingPlayerCommand(player: string, codec: PiListensConfig["ttsOutputCodec"], sampleRate: number): CommandSpec {
+	if (player === "ffplay") {
+		const args = ["-nodisp", "-autoexit", "-loglevel", "error"];
+		if (codec === "linear16") args.push("-f", "s16le", "-ar", String(sampleRate), "-ac", "1");
+		if (codec === "mulaw") args.push("-f", "mulaw", "-ar", String(sampleRate), "-ac", "1");
+		if (codec === "alaw") args.push("-f", "alaw", "-ar", String(sampleRate), "-ac", "1");
+		args.push("-i", "pipe:0");
+		return { command: "ffplay", args };
+	}
+	if (player === "play") {
+		if (codec === "linear16") return { command: "play", args: ["-q", "-r", String(sampleRate), "-c", "1", "-b", "16", "-e", "signed-integer", "-t", "raw", "-"] };
+		if (codec === "mulaw" || codec === "alaw") return { command: "play", args: ["-q", "-r", String(sampleRate), "-c", "1", "-t", codec, "-"] };
+		return { command: "play", args: ["-q", "-t", soxTypeForCodec(codec), "-"] };
+	}
+	if (player === "aplay" && codec === "wav") return { command: "aplay", args: ["-q", "-"] };
+	throw new Error(`Unsupported streaming player ${player} for codec ${codec}`);
+}
+function soxTypeForCodec(codec: PiListensConfig["ttsOutputCodec"]): string {
+	if (codec === "aac") return "adts";
+	if (codec === "linear16") return "raw";
+	return codec;
+}
 type AudioProcessKind = "record" | "play" | "other";
 type ManagedChild = ReturnType<typeof spawn>;
@@ -318,10 +407,10 @@ export function stopActiveAudioProcesses(options: { kind?: AudioProcessKind; for
 	}
 }
-function spawnManaged(command: string, args: string[], kind: AudioProcessKind): ManagedChild {
+function spawnManaged(command: string, args: string[], kind: AudioProcessKind, stdio: StdioOptions = ["ignore", "pipe", "pipe"]): ManagedChild {
 	installProcessExitCleanup();
 	const child = spawn(command, args, {
-		stdio: ["ignore", "pipe", "pipe"],
+		stdio,
 		detached: process.platform !== "win32",
 	});
 	activeChildren.add(child);

package/src/commands.ts CHANGED Viewed

@@ -74,6 +74,7 @@ export function registerVoiceCommands(pi: ExtensionAPI, services: VoiceToolServi
 					`Sarvam API key: ${config.apiKey ? "set" : "missing"}`,
 					`Recorder: ${audio.recorder}`,
 					`Player: ${audio.player}`,
+					`Streaming player: ${audio.streamingPlayer}`,
 					`STT: ${config.sttModel} (${config.translateInputToEnglish ? "translate→English" : config.sttMode}, ${config.sttLanguageCode})`,
 					`TTS: ${config.ttsModel} (${config.ttsLanguageCode}, speaker ${config.ttsSpeaker})`,
 				].join("\n"),
@@ -177,10 +178,8 @@ async function listenAndSend(
 }
 async function speakText(services: VoiceToolServices, text: string, signal?: AbortSignal, state?: VoiceModeState, ctx?: ExtensionContext) {
-	const config = services.getConfig();
 	const speakAbortController = state ? new AbortController() : undefined;
 	const speakSignal = combineSignals(signal, speakAbortController?.signal);
-	let path: string | undefined;
 	if (state) {
 		state.speakAbortController?.abort();
@@ -190,14 +189,9 @@ async function speakText(services: VoiceToolServices, text: string, signal?: Abo
 	}
 	try {
-		await mkdir(config.audioDir, { recursive: true });
-		path = join(config.audioDir, `pi-listens-command-${Date.now()}.${audioExtensionForCodec(config.ttsOutputCodec)}`);
-		const result = await services.getSpeech().synthesizeToFile(text, path, speakSignal.signal);
-		path = result.path;
-		await services.getAudio().play(result.path, speakSignal.signal);
+		await playSpeechBest(services, text, speakSignal.signal);
 	} finally {
 		speakSignal.cleanup();
-		if (path) await services.getAudio().cleanup(path);
 		if (state && state.speakAbortController === speakAbortController) state.speakAbortController = undefined;
 		if (state && state.status === "speaking") {
 			state.status = "idle";
@@ -206,6 +200,25 @@ async function speakText(services: VoiceToolServices, text: string, signal?: Abo
 	}
 }
+async function playSpeechBest(services: VoiceToolServices, text: string, signal?: AbortSignal) {
+	const audio = services.getAudio();
+	if (audio.describe().streamingPlayer !== "missing") {
+		const result = await services.getSpeech().synthesizeStream(text, signal);
+		await audio.playStream(result.stream, signal);
+		return;
+	}
+	const config = services.getConfig();
+	await mkdir(config.audioDir, { recursive: true });
+	const path = join(config.audioDir, `pi-listens-command-${Date.now()}.${audioExtensionForCodec(config.ttsOutputCodec)}`);
+	try {
+		const result = await services.getSpeech().synthesizeToFile(text, path, signal);
+		await audio.play(result.path, signal);
+	} finally {
+		await audio.cleanup(path);
+	}
+}
 function parseSeconds(args: string): number | undefined {
 	const match = args.match(/(?:^|\s)(\d{1,4})(?:\s|$)/);
 	if (!match) return undefined;

package/src/sarvam.ts CHANGED Viewed

@@ -15,6 +15,10 @@ export interface SynthesisResult {
 	bytes: number;
 }
+export interface SynthesisStreamResult {
+	stream: ReadableStream<Uint8Array>;
+}
 type StreamingData = {
 	transcript?: string;
 	request_id?: string;
@@ -127,6 +131,28 @@ export class SarvamSpeechClient {
 		return { path, bytes: buffer.byteLength };
 	}
+	async synthesizeStream(text: string, signal?: AbortSignal): Promise<SynthesisStreamResult> {
+		const config = this.getConfig();
+		const client = this.getClient(config);
+		const response = await client.textToSpeech.convertStream(
+			{
+				text,
+				target_language_code: config.ttsLanguageCode as never,
+				speaker: config.ttsSpeaker as never,
+				model: config.ttsModel as never,
+				pace: config.ttsPace,
+				temperature: config.ttsTemperature,
+				speech_sample_rate: config.ttsSampleRate as never,
+				enable_preprocessing: true,
+				output_audio_codec: config.ttsOutputCodec as never,
+			},
+			{ abortSignal: signal },
+		);
+		const stream = response.stream();
+		if (!stream) throw new Error("Sarvam TTS response did not include a readable audio stream");
+		return { stream };
+	}
 	private async withStreamingSocket(
 		signal: AbortSignal | undefined,
 		mode: SttMode | undefined,

package/src/tools.ts CHANGED Viewed

@@ -17,7 +17,7 @@ export interface VoiceToolServices {
 const VoiceOutputParams = Type.Object({
 	text: Type.String({ description: "Short text to speak to the user. Keep it concise; do not speak code blocks or long logs." }),
-	wait_for_playback: Type.Optional(Type.Boolean({ description: "Wait until audio playback completes before returning. Default true." })),
+	wait_for_playback: Type.Optional(Type.Boolean({ description: "Wait until audio playback completes before returning. Default false." })),
 });
 const VoiceInputParams = Type.Object({
@@ -54,21 +54,20 @@ export function registerVoiceTools(pi: ExtensionAPI, services: VoiceToolServices
 		],
 		parameters: VoiceOutputParams,
 		async execute(_toolCallId, params: VoiceOutputInput, signal, onUpdate) {
-			onUpdate?.({ content: [{ type: "text", text: "Synthesizing speech with Sarvam AI…" }], details: {} });
-			const result = await speak(params.text, services, signal);
-			const playback = services.getAudio().play(result.path, signal).finally(() => services.getAudio().cleanup(result.path));
-			if (params.wait_for_playback === false) {
+			onUpdate?.({ content: [{ type: "text", text: "Starting streamed speech with Sarvam AI…" }], details: {} });
+			const playback = playSpeechBest(params.text, services, signal);
+			if (params.wait_for_playback !== true) {
 				void playback.catch(() => undefined);
 				return {
 					content: [{ type: "text", text: `Started speaking to user: ${params.text}` }],
-					details: { ...result, played: "started", text: params.text },
+					details: { played: "started", text: params.text },
 				};
 			}
 			onUpdate?.({ content: [{ type: "text", text: "Playing audio…" }], details: {} });
-			await playback;
+			const details = await playback;
 			return {
 				content: [{ type: "text", text: `Spoke to user: ${params.text}` }],
-				details: { ...result, played: true, text: params.text },
+				details: { ...details, played: true, text: params.text },
 			};
 		},
 		renderCall(args: VoiceOutputInput, theme) {
@@ -117,12 +116,7 @@ export function registerVoiceTools(pi: ExtensionAPI, services: VoiceToolServices
 		parameters: VoiceAskParams,
 		async execute(_toolCallId, params: VoiceAskInput, signal, onUpdate, ctx) {
 			onUpdate?.({ content: [{ type: "text", text: "Speaking question…" }], details: {} });
-			const spoken = await speak(params.question, services, signal);
-			try {
-				await services.getAudio().play(spoken.path, signal);
-			} finally {
-				await services.getAudio().cleanup(spoken.path);
-			}
+			await playSpeechBest(params.question, services, signal);
 			const answer = await listenAndMaybeFallback(
 				params,
 				services,
@@ -173,6 +167,7 @@ export function registerVoiceTools(pi: ExtensionAPI, services: VoiceToolServices
 							`Sarvam API key: ${config.apiKey ? "set" : "missing"}`,
 							`Recorder: ${audio.recorder}`,
 							`Player: ${audio.player}`,
+							`Streaming player: ${audio.streamingPlayer}`,
 							`STT: ${config.sttModel} (${config.translateInputToEnglish ? "translate→English" : config.sttMode}, ${config.sttLanguageCode})`,
 							`TTS: ${config.ttsModel} (${config.ttsLanguageCode}, speaker ${config.ttsSpeaker})`,
 						].join("\n"),
@@ -184,7 +179,24 @@ export function registerVoiceTools(pi: ExtensionAPI, services: VoiceToolServices
 	});
 }
-async function speak(text: string, services: VoiceToolServices, signal?: AbortSignal) {
+async function playSpeechBest(text: string, services: VoiceToolServices, signal?: AbortSignal): Promise<Record<string, unknown>> {
+	const audio = services.getAudio();
+	if (audio.describe().streamingPlayer !== "missing") {
+		const result = await services.getSpeech().synthesizeStream(text, signal);
+		await audio.playStream(result.stream, signal);
+		return { playback: "stream" };
+	}
+	const result = await speakToFile(text, services, signal);
+	try {
+		await audio.play(result.path, signal);
+		return { ...result, playback: "file" };
+	} finally {
+		await audio.cleanup(result.path);
+	}
+}
+async function speakToFile(text: string, services: VoiceToolServices, signal?: AbortSignal) {
 	const config = services.getConfig();
 	await mkdir(config.audioDir, { recursive: true });
 	const path = join(config.audioDir, `pi-listens-output-${Date.now()}-${randomUUID()}.${audioExtensionForCodec(config.ttsOutputCodec)}`);

package/src/voice-ui.ts CHANGED Viewed

@@ -88,7 +88,7 @@ class VoiceLoopEditor extends CustomEditor {
 			if (mouse.pressed && mouse.button === 0) this.triggerMouseOrbClick(mouse);
 			return;
 		}
-		if (data.toLowerCase() === "r") {
+		if (data === " ") {
 			this.triggerOrbClick(1);
 			this.callbacks.startListening();
 			return;
@@ -309,7 +309,7 @@ function frameIntervalForStatus(status: VoiceModeState["status"]): number {
 function controlRail(state: VoiceModeState, palette: OrbPalette, width: number): string[] {
 	const listenLabel = state.isListening ? "stop" : "listen";
 	const pills = [
-		controlPill("R", listenLabel, state.isListening ? "active" : "primary", palette),
+		controlPill("Space", listenLabel, state.isListening ? "active" : "primary", palette),
 		controlPill("A", state.autoListen ? "auto-listen on" : "auto-listen off", state.autoListen ? "active" : "muted", palette),
 		controlPill("S", state.autoSpeakAssistant ? "read aloud on" : "read aloud off", state.autoSpeakAssistant ? "active" : "muted", palette),
 		controlPill("Q", "close", "danger", palette),