@p8n.ai/pi-listens 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -6,6 +6,14 @@ This project follows [Semantic Versioning](https://semver.org/).
6
6
 
7
7
  ## [Unreleased]
8
8
 
9
+ ## [0.1.1] - 2026-05-09
10
+
11
+ ### Fixed
12
+
13
+ - Return Sarvam STT results faster after flushing microphone audio.
14
+ - Stop current speech playback before starting a new listen, without cancelling the new recording.
15
+ - Keep spoken auto-summaries concise and avoid headings, hashtags, bullet lists, and boilerplate recaps.
16
+
9
17
  ## [0.1.0] - 2026-05-09
10
18
 
11
19
  ### Added
@@ -23,5 +31,6 @@ This project follows [Semantic Versioning](https://semver.org/).
23
31
  - Stop active audio capture/playback subprocesses when voice mode is closed or the Pi session shuts down.
24
32
  - Clean up generated audio files when spoken playback is interrupted.
25
33
 
26
- [Unreleased]: https://github.com/p8n-ai/pi-listens/compare/v0.1.0...HEAD
34
+ [Unreleased]: https://github.com/p8n-ai/pi-listens/compare/v0.1.1...HEAD
27
35
  [0.1.0]: https://github.com/p8n-ai/pi-listens/releases/tag/v0.1.0
36
+ [0.1.1]: https://github.com/p8n-ai/pi-listens/releases/tag/v0.1.1
package/README.md CHANGED
@@ -80,9 +80,9 @@ The package registers these tools for Pi's agent:
80
80
  The extension also injects voice guidance into the system prompt:
81
81
 
82
82
  - use `voice_ask` whenever user input is needed in voice-first sessions
83
- - use `voice_output` for short spoken status or response snippets
83
+ - use `voice_output` only for short spoken status or response snippets
84
+ - keep spoken replies to 1-2 short sentences with no headings, hashtags, bullet lists, boilerplate recaps, or full task summaries
84
85
  - do not speak code blocks, logs, diffs, stack traces, or long explanations
85
- - keep spoken questions concise and answerable in a short response
86
86
 
87
87
  ## Commands
88
88
 
@@ -95,10 +95,10 @@ The extension also injects voice guidance into the system prompt:
95
95
  | `/voice-status` | Show setup and voice-mode status. |
96
96
 
97
97
  Voice panel controls in interactive mode:
98
- - R: listen now; press again while listening to stop listening
98
+ - R: listen now; press again while listening to stop listening; if Pi is speaking, R stops playback before listening
99
99
  - A: auto-listen on/off (listen again after each assistant reply)
100
100
  - S: read aloud on/off (speak assistant replies)
101
- - Q: close the panel (and stop listening first if needed)
101
+ - Q: close the panel and stop any active listening or speaking
102
102
  - Click the orb: visual ripple feedback (terminals with mouse reporting)
103
103
 
104
104
  ## Headless/RPC behavior
@@ -144,7 +144,7 @@ Example config file:
144
144
  "ttsOutputCodec": "wav",
145
145
  "textFallback": true,
146
146
  "autoSpeakAssistant": false,
147
- "maxAutoSpeakChars": 900
147
+ "maxAutoSpeakChars": 320
148
148
  }
149
149
  ```
150
150
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@p8n.ai/pi-listens",
3
- "version": "0.1.0",
3
+ "version": "0.1.1",
4
4
  "description": "Pi package for speech-first interaction using Sarvam AI speech-to-text and text-to-speech.",
5
5
  "author": "Ravindra Barthwal",
6
6
  "license": "MIT",
@@ -33,7 +33,7 @@
33
33
  ],
34
34
  "scripts": {
35
35
  "typecheck": "tsc --noEmit",
36
- "test": "npm run typecheck"
36
+ "test": "npm run typecheck && node --import tsx --test test/**/*.test.ts"
37
37
  },
38
38
  "pi": {
39
39
  "extensions": [
@@ -19,10 +19,11 @@ This Pi package provides voice tools backed by Sarvam AI.
19
19
 
20
20
  1. When you need user input, clarification, or confirmation, use `voice_ask` instead of asking only in text.
21
21
  2. Before using `voice_input`, make sure the user already knows you are listening. If not, use `voice_ask`.
22
- 3. Use `voice_output` for concise spoken status updates or spoken summaries that matter to the user.
23
- 4. Do not speak code blocks, diffs, stack traces, logs, long tables, or lengthy explanations. Summarize briefly and leave details in text.
24
- 5. Treat transcripts returned by `voice_input` or `voice_ask` as user input, while allowing for speech-recognition mistakes. If the transcript is ambiguous, ask a short follow-up with `voice_ask`.
25
- 6. If speech is not recognized, rely on the tool's text fallback when available, or ask again with a shorter prompt.
22
+ 3. Use `voice_output` only for concise spoken status updates or spoken summaries that matter to the user.
23
+ 4. Spoken output must be brief: 1-2 short sentences, no markdown headings, no hashtags, no bullet lists, no boilerplate recap, and no full task summaries. Leave details in text.
24
+ 5. Do not speak code blocks, diffs, stack traces, logs, long tables, or lengthy explanations. Summarize briefly and leave details in text.
25
+ 6. Treat transcripts returned by `voice_input` or `voice_ask` as user input, while allowing for speech-recognition mistakes. If the transcript is ambiguous, ask a short follow-up with `voice_ask`.
26
+ 7. If speech is not recognized, rely on the tool's text fallback when available, or ask again with a shorter prompt.
26
27
 
27
28
  ## Good voice question style
28
29
 
package/src/audio.ts CHANGED
@@ -11,6 +11,7 @@ export interface AudioRuntime {
11
11
  streamPcm(signal?: AbortSignal): AsyncIterable<Buffer>;
12
12
  play(path: string, signal?: AbortSignal): Promise<void>;
13
13
  cleanup(path: string): Promise<void>;
14
+ stopPlayback(): void;
14
15
  stopAll(): void;
15
16
  describe(): { recorder: string; player: string };
16
17
  }
@@ -41,7 +42,7 @@ export function createAudioRuntime(config: PiListensConfig): AudioRuntime {
41
42
  : useUtteranceMode
42
43
  ? utteranceRecorderCommand(recorder, path, config.recordSampleRate, config.silenceStartSeconds, config.silenceStopSeconds, config.silenceThreshold)
43
44
  : recorderCommand(recorder, path, seconds, config.recordSampleRate);
44
- await run(command.command, command.args, signal, useUtteranceMode ? { timeoutMs: seconds * 1000, resolveOnTimeout: true } : undefined);
45
+ await run(command.command, command.args, signal, { ...(useUtteranceMode ? { timeoutMs: seconds * 1000, resolveOnTimeout: true } : {}), kind: "record" });
45
46
  return path;
46
47
  },
47
48
 
@@ -54,7 +55,7 @@ export function createAudioRuntime(config: PiListensConfig): AudioRuntime {
54
55
  const command = config.streamCommand
55
56
  ? customCommand(config.streamCommand, { sampleRate: config.recordSampleRate })
56
57
  : pcmStreamCommand(recorder, config.recordSampleRate);
57
- return streamCommandOutput(command.command, command.args, signal);
58
+ return streamCommandOutput(command.command, command.args, signal, "record");
58
59
  },
59
60
 
60
61
  async play(path: string, signal?: AbortSignal): Promise<void> {
@@ -64,7 +65,7 @@ export function createAudioRuntime(config: PiListensConfig): AudioRuntime {
64
65
  );
65
66
  }
66
67
  const command = config.playCommand ? customCommand(config.playCommand, { path }) : playerCommand(player, path);
67
- await run(command.command, command.args, signal);
68
+ await run(command.command, command.args, signal, { kind: "play" });
68
69
  },
69
70
 
70
71
  async cleanup(path: string): Promise<void> {
@@ -72,6 +73,10 @@ export function createAudioRuntime(config: PiListensConfig): AudioRuntime {
72
73
  await rm(path, { force: true }).catch(() => undefined);
73
74
  },
74
75
 
76
+ stopPlayback(): void {
77
+ stopActiveAudioProcesses({ kind: "play" });
78
+ },
79
+
75
80
  stopAll(): void {
76
81
  stopActiveAudioProcesses();
77
82
  },
@@ -213,14 +218,14 @@ function isCommandAvailable(command: string): boolean {
213
218
  return false;
214
219
  }
215
220
 
216
- function run(command: string, args: string[], signal?: AbortSignal, options: { timeoutMs?: number; resolveOnTimeout?: boolean } = {}): Promise<void> {
221
+ function run(command: string, args: string[], signal?: AbortSignal, options: { timeoutMs?: number; resolveOnTimeout?: boolean; kind?: AudioProcessKind } = {}): Promise<void> {
217
222
  return new Promise((resolve, reject) => {
218
223
  if (signal?.aborted) {
219
224
  reject(new Error("Cancelled"));
220
225
  return;
221
226
  }
222
227
 
223
- const child = spawnManaged(command, args);
228
+ const child = spawnManaged(command, args, options.kind ?? "other");
224
229
  let stderr = "";
225
230
  let stdout = "";
226
231
  let timedOut = false;
@@ -265,9 +270,9 @@ function run(command: string, args: string[], signal?: AbortSignal, options: { t
265
270
  });
266
271
  }
267
272
 
268
- async function* streamCommandOutput(command: string, args: string[], signal?: AbortSignal): AsyncIterable<Buffer> {
273
+ async function* streamCommandOutput(command: string, args: string[], signal?: AbortSignal, kind: AudioProcessKind = "other"): AsyncIterable<Buffer> {
269
274
  if (signal?.aborted) throw new Error("Cancelled");
270
- const child = spawnManaged(command, args);
275
+ const child = spawnManaged(command, args, kind);
271
276
  let stderr = "";
272
277
  let exitCode: number | null = null;
273
278
  let exitSignal: NodeJS.Signals | null = null;
@@ -298,23 +303,29 @@ async function* streamCommandOutput(command: string, args: string[], signal?: Ab
298
303
  }
299
304
  }
300
305
 
306
+ type AudioProcessKind = "record" | "play" | "other";
307
+
301
308
  type ManagedChild = ReturnType<typeof spawn>;
302
309
 
303
310
  const activeChildren = new Set<ManagedChild>();
311
+ const childKinds = new WeakMap<ManagedChild, AudioProcessKind>();
304
312
  const terminatingChildren = new WeakSet<ManagedChild>();
305
313
  let processExitCleanupInstalled = false;
306
314
 
307
- export function stopActiveAudioProcesses(force = false): void {
308
- for (const child of [...activeChildren]) terminateChild(child, force);
315
+ export function stopActiveAudioProcesses(options: { kind?: AudioProcessKind; force?: boolean } = {}): void {
316
+ for (const child of [...activeChildren]) {
317
+ if (!options.kind || childKinds.get(child) === options.kind) terminateChild(child, options.force);
318
+ }
309
319
  }
310
320
 
311
- function spawnManaged(command: string, args: string[]): ManagedChild {
321
+ function spawnManaged(command: string, args: string[], kind: AudioProcessKind): ManagedChild {
312
322
  installProcessExitCleanup();
313
323
  const child = spawn(command, args, {
314
324
  stdio: ["ignore", "pipe", "pipe"],
315
325
  detached: process.platform !== "win32",
316
326
  });
317
327
  activeChildren.add(child);
328
+ childKinds.set(child, kind);
318
329
  const untrack = () => activeChildren.delete(child);
319
330
  child.once("close", untrack);
320
331
  child.once("error", untrack);
@@ -324,7 +335,7 @@ function spawnManaged(command: string, args: string[]): ManagedChild {
324
335
  function installProcessExitCleanup(): void {
325
336
  if (processExitCleanupInstalled) return;
326
337
  processExitCleanupInstalled = true;
327
- process.once("exit", () => stopActiveAudioProcesses(true));
338
+ process.once("exit", () => stopActiveAudioProcesses({ force: true }));
328
339
  }
329
340
 
330
341
  function terminateChild(child: ManagedChild, force = false): void {
package/src/commands.ts CHANGED
@@ -121,6 +121,7 @@ async function listenAndSend(
121
121
  state.listenAbortController?.abort();
122
122
  return;
123
123
  }
124
+ stopSpeaking(services, state);
124
125
  state.recordSeconds = seconds ?? services.getConfig().recordSeconds;
125
126
  state.silenceStopSeconds = services.getConfig().silenceStopSeconds;
126
127
  state.isListening = true;
@@ -253,6 +254,13 @@ function isCancelled(err: unknown): boolean {
253
254
  return err instanceof Error && /cancelled|aborted/i.test(err.message);
254
255
  }
255
256
 
257
+ function stopSpeaking(services: VoiceToolServices, state: VoiceModeState) {
258
+ const speakAbortController = state.speakAbortController;
259
+ state.speakAbortController = undefined;
260
+ speakAbortController?.abort();
261
+ services.getAudio().stopPlayback();
262
+ }
263
+
256
264
  export function stopVoiceMode(services: VoiceToolServices, state: VoiceModeState, ctx?: ExtensionContext | ExtensionCommandContext) {
257
265
  state.enabled = false;
258
266
  state.autoListen = false;
@@ -264,10 +272,7 @@ export function stopVoiceMode(services: VoiceToolServices, state: VoiceModeState
264
272
  state.listenAbortController = undefined;
265
273
  listenAbortController?.abort();
266
274
 
267
- const speakAbortController = state.speakAbortController;
268
- state.speakAbortController = undefined;
269
- speakAbortController?.abort();
270
-
275
+ stopSpeaking(services, state);
271
276
  services.getAudio().stopAll();
272
277
 
273
278
  if (ctx) uninstallVoiceUi(ctx, state);
package/src/config.ts CHANGED
@@ -60,7 +60,7 @@ const DEFAULT_CONFIG: PiListensConfig = {
60
60
  deleteAudio: true,
61
61
  textFallback: true,
62
62
  autoSpeakAssistant: false,
63
- maxAutoSpeakChars: 900,
63
+ maxAutoSpeakChars: 320,
64
64
  };
65
65
 
66
66
  type RawConfig = Partial<PiListensConfig>;
package/src/index.ts CHANGED
@@ -65,7 +65,7 @@ export default function piListensExtension(pi: ExtensionAPI) {
65
65
 
66
66
  pi.on("before_agent_start", async (event) => {
67
67
  return {
68
- systemPrompt: `${event.systemPrompt}\n\nPi Listens voice guidance:\n- The user may primarily interact by speech through Sarvam AI. Text input is still possible.\n- When voice mode is active, treat it as a hands-free conversation: listen only while the voice UI/input tool is active, then pause listening while you work.\n- Use voice_output for concise spoken progress, completion, or status updates that matter to the user.\n- When you need clarification, confirmation, or any user input, prefer voice_ask with a concise spoken question instead of asking only in text.\n- Use voice_input only after the user already knows you are listening.\n- Do not speak code blocks, logs, diffs, stack traces, or long explanations; summarize them briefly and leave detail in text.`,
68
+ systemPrompt: `${event.systemPrompt}\n\nPi Listens voice guidance:\n- The user may primarily interact by speech through Sarvam AI. Text input is still possible.\n- When voice mode is active, treat it as a hands-free conversation: listen only while the voice UI/input tool is active, then pause listening while you work.\n- Use voice_output only for concise spoken progress, completion, or status updates that matter to the user.\n- Spoken replies must be brief: 1-2 short sentences, no headings, no hashtags, no bullet lists, no boilerplate recap, and no full task summaries. Leave details in text.\n- When you need clarification, confirmation, or any user input, prefer voice_ask with a concise spoken question instead of asking only in text.\n- Use voice_input only after the user already knows you are listening.\n- Do not speak code blocks, logs, diffs, stack traces, or long explanations; summarize briefly and leave detail in text.`,
69
69
  };
70
70
  });
71
71
 
package/src/sarvam.ts CHANGED
@@ -1,5 +1,4 @@
1
1
  import { readFile, writeFile } from "node:fs/promises";
2
- import { setTimeout as delay } from "node:timers/promises";
3
2
  import { SarvamAIClient } from "sarvamai";
4
3
  import type { AudioRuntime } from "./audio.js";
5
4
  import type { PiListensConfig, SttMode } from "./config.js";
@@ -145,35 +144,46 @@ export class SarvamSpeechClient {
145
144
  let languageProbability: number | undefined;
146
145
  let streamError: Error | undefined;
147
146
  let lastMessageAt = Date.now();
148
-
147
+ const messageWaiters = new Set<() => void>();
149
148
  const socket = connectStreamingSocket(config, mode ?? (config.translateInputToEnglish ? "translate" : config.sttMode), inputAudioCodec);
150
149
 
151
150
  const closeOnAbort = () => socket.close();
152
151
  signal?.addEventListener("abort", closeOnAbort, { once: true });
152
+ const notifyMessageWaiters = () => {
153
+ const waiters = [...messageWaiters];
154
+ messageWaiters.clear();
155
+ for (const waiter of waiters) waiter();
156
+ };
153
157
  socket.onMessage((message: StreamingResponse) => {
154
158
  lastMessageAt = Date.now();
155
- if (message.type === "error") {
156
- streamError = new Error(message.data?.error ?? message.data?.code ?? "Sarvam streaming STT failed");
157
- return;
159
+ try {
160
+ if (message.type === "error") {
161
+ streamError = new Error(message.data?.error ?? message.data?.code ?? "Sarvam streaming STT failed");
162
+ return;
163
+ }
164
+ if (message.type !== "data") return;
165
+ const data = message.data;
166
+ if (!data) return;
167
+ transcript = mergeTranscript(transcript, data.transcript ?? "");
168
+ requestId = data.request_id ?? requestId;
169
+ languageCode = data.language_code ?? languageCode;
170
+ languageProbability = data.language_probability ?? languageProbability;
171
+ } finally {
172
+ notifyMessageWaiters();
158
173
  }
159
- if (message.type !== "data") return;
160
- const data = message.data;
161
- if (!data) return;
162
- transcript = mergeTranscript(transcript, data.transcript ?? "");
163
- requestId = data.request_id ?? requestId;
164
- languageCode = data.language_code ?? languageCode;
165
- languageProbability = data.language_probability ?? languageProbability;
166
174
  });
167
- socket.onError((error: Error) => { streamError = error; });
175
+ socket.onError((error: Error) => { streamError = error; notifyMessageWaiters(); });
168
176
 
169
177
  try {
170
178
  await socket.waitForOpen();
171
179
  await streamAudio(socket, async () => {
172
180
  const startedWaitingAt = Date.now();
173
- while (Date.now() - startedWaitingAt < 3000) {
181
+ const maxWaitMs = transcript.trim() ? 900 : 1600;
182
+ const settleMs = 250;
183
+ while (Date.now() - startedWaitingAt < maxWaitMs) {
174
184
  if (streamError) throw streamError;
175
- if (Date.now() - lastMessageAt > 850 && transcript.trim()) break;
176
- await delay(100, undefined, { signal }).catch((err) => { throw err; });
185
+ if (transcript.trim() && Date.now() - lastMessageAt >= settleMs) break;
186
+ await waitForMessageOrTimeout(messageWaiters, 50, signal);
177
187
  }
178
188
  });
179
189
  if (streamError) throw streamError;
@@ -288,6 +298,27 @@ function connectStreamingSocket(config: PiListensConfig, mode: SttMode, inputAud
288
298
  };
289
299
  }
290
300
 
301
+ function waitForMessageOrTimeout(waiters: Set<() => void>, timeoutMs: number, signal?: AbortSignal): Promise<void> {
302
+ return new Promise((resolve, reject) => {
303
+ if (signal?.aborted) {
304
+ reject(new Error("Cancelled"));
305
+ return;
306
+ }
307
+
308
+ const done = () => { cleanup(); resolve(); };
309
+ const onAbort = () => { cleanup(); reject(new Error("Cancelled")); };
310
+ const timeout = setTimeout(done, timeoutMs);
311
+ const cleanup = () => {
312
+ clearTimeout(timeout);
313
+ waiters.delete(done);
314
+ signal?.removeEventListener("abort", onAbort);
315
+ };
316
+
317
+ waiters.add(done);
318
+ signal?.addEventListener("abort", onAbort, { once: true });
319
+ });
320
+ }
321
+
291
322
  type CombinedSignal = { signal?: AbortSignal; cleanup: () => void };
292
323
 
293
324
  function combineSignals(...signals: Array<AbortSignal | undefined>): CombinedSignal {
package/src/text.ts CHANGED
@@ -16,17 +16,32 @@ export function firstTextContent(message: unknown): string {
16
16
 
17
17
  export function prepareSpokenText(text: string, maxChars: number): string {
18
18
  let prepared = text
19
- .replace(/```[\s\S]*?```/g, " I am skipping a code block. ")
19
+ .replace(/```[\s\S]*?```/g, " I skipped a code block. ")
20
+ .replace(/^\s{0,3}#{1,6}\s+/gm, "")
21
+ .replace(/^\s*[-*+]\s+/gm, "")
22
+ .replace(/^\s*\d+[.)]\s+/gm, "")
20
23
  .replace(/`([^`]+)`/g, "$1")
21
24
  .replace(/https?:\/\/\S+/g, "link")
25
+ .replace(/[#*_>~|]+/g, " ")
22
26
  .replace(/\s+/g, " ")
23
27
  .trim();
28
+
29
+ prepared = conciseSpokenSummary(prepared);
24
30
  if (prepared.length > maxChars) {
25
- prepared = `${prepared.slice(0, Math.max(0, maxChars - 80)).trim()}… I have more details on screen.`;
31
+ prepared = `${prepared.slice(0, Math.max(0, maxChars - 32)).trim()}… More on screen.`;
26
32
  }
27
33
  return prepared;
28
34
  }
29
35
 
36
+ function conciseSpokenSummary(text: string): string {
37
+ const sentences = text.match(/[^.!?]+[.!?]+|[^.!?]+$/g)?.map((part) => part.trim()).filter(Boolean) ?? [];
38
+ if (sentences.length === 0) return text;
39
+
40
+ const useful = sentences.filter((sentence) => !/^(sure|here('|’)s|summary|in summary|done|completed|i('|’)ve|i have)\b/i.test(sentence));
41
+ const picked = (useful.length ? useful : sentences).slice(0, 2).join(" ").trim();
42
+ return picked || text;
43
+ }
44
+
30
45
  export function conciseTranscript(transcript: string): string {
31
46
  const trimmed = transcript.trim();
32
47
  return trimmed.length === 0 ? "(no speech recognized)" : trimmed;
package/src/tools.ts CHANGED
@@ -49,8 +49,8 @@ export function registerVoiceTools(pi: ExtensionAPI, services: VoiceToolServices
49
49
  description: "Speak a short message to the user using Sarvam AI text-to-speech and local audio playback.",
50
50
  promptSnippet: "Speak short user-facing messages with Sarvam AI TTS",
51
51
  promptGuidelines: [
52
- "Use voice_output when a spoken user-facing message matters, especially before waiting for voice input.",
53
- "Keep voice_output text brief and conversational; do not speak code blocks, command output, stack traces, or long explanations.",
52
+ "Use voice_output only when a spoken user-facing message matters, especially before waiting for voice input.",
53
+ "Keep voice_output to 1-2 short conversational sentences. Do not speak headings, hashtags, bullet lists, boilerplate recaps, code, command output, stack traces, or long explanations.",
54
54
  ],
55
55
  parameters: VoiceOutputParams,
56
56
  async execute(_toolCallId, params: VoiceOutputInput, signal, onUpdate) {