@p8n.ai/pi-listens 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -6,6 +6,36 @@ This project follows [Semantic Versioning](https://semver.org/).
6
6
 
7
7
  ## [Unreleased]
8
8
 
9
+ ## [0.2.1] - 2026-05-09
10
+
11
+ ### Changed
12
+
13
+ - Remove automatic read-aloud of assistant replies (`autoSpeakAssistant`). The agent controls speech via the `voice_output` tool — no more unsolicited summaries.
14
+ - `voice_output` now stops any in-flight playback before starting new speech, preventing overlapping audio.
15
+ - Voice loop waits for any tool-initiated playback to finish before opening the mic for the next listen cycle.
16
+ - Remove `autoSpeakAssistant`, `maxAutoSpeakChars` config options and `PI_LISTENS_AUTO_SPEAK`, `PI_LISTENS_MAX_AUTO_SPEAK_CHARS` env vars.
17
+ - Remove the "S" (read-aloud toggle) key and pill from the voice panel.
18
+ - Remove `--no-speak` flag from `/voice-on`.
19
+
20
+ ### Added
21
+
22
+ - Voice orb now shows a **speaking** state (pink/magenta palette with wave animation) when `voice_output` or `voice_ask` is playing audio. Status bar shows "speaking…" with ♪/♫ indicators.
23
+
24
+ ## [0.2.0] - 2026-05-09
25
+
26
+ ### Added
27
+
28
+ - `/init` command to create a global settings file (`~/.pi/pi-listens.json`) with sensible defaults.
29
+ - `/voice-check` command (replaces `/voice-status`) with improved diagnostic output.
30
+
31
+ ### Changed
32
+
33
+ - `/voice-on` now enables auto-speak by default for a full hands-free experience. Use `--no-speak` to opt out. _(removed in next release)_
34
+ - Rename `/voice-status` to `/voice-check` to better communicate its diagnostic purpose.
35
+
36
+ ### Removed
37
+
38
+ - `/listen` slash command. Use `/voice-on` for the hands-free voice loop, or the `voice_input` agent tool for programmatic speech input.
9
39
  ## [0.1.2] - 2026-05-09
10
40
 
11
41
  ### Changed
@@ -30,7 +60,7 @@ This project follows [Semantic Versioning](https://semver.org/).
30
60
  - Sarvam AI speech-to-text tools for microphone input and audio file transcription.
31
61
  - Sarvam AI text-to-speech tools for spoken output and spoken clarification loops.
32
62
  - `/listen`, `/speak`, `/voice-on`, and `/voice-status` slash commands.
33
- - Interactive voice panel with listen, auto-listen, read-aloud, and close controls.
63
+ - Interactive voice panel with listen, auto-listen, and close controls.
34
64
  - Config support through environment variables, user config, and project config.
35
65
  - Global config at `~/.pi/pi-listens.json`, with project-level overrides from `<project>/.pi/pi-listens.json`.
36
66
 
@@ -39,7 +69,9 @@ This project follows [Semantic Versioning](https://semver.org/).
39
69
  - Stop active audio capture/playback subprocesses when voice mode is closed or the Pi session shuts down.
40
70
  - Clean up generated audio files when spoken playback is interrupted.
41
71
 
42
- [Unreleased]: https://github.com/p8n-ai/pi-listens/compare/v0.1.2...HEAD
72
+ [Unreleased]: https://github.com/p8n-ai/pi-listens/compare/v0.2.1...HEAD
43
73
  [0.1.0]: https://github.com/p8n-ai/pi-listens/releases/tag/v0.1.0
44
74
  [0.1.1]: https://github.com/p8n-ai/pi-listens/releases/tag/v0.1.1
45
75
  [0.1.2]: https://github.com/p8n-ai/pi-listens/releases/tag/v0.1.2
76
+ [0.2.0]: https://github.com/p8n-ai/pi-listens/releases/tag/v0.2.0
77
+ [0.2.1]: https://github.com/p8n-ai/pi-listens/releases/tag/v0.2.1
package/README.md CHANGED
@@ -7,14 +7,27 @@ Speech-first Pi package powered by [Sarvam AI](https://www.sarvam.ai/). It gives
7
7
  - voice-first clarification loops where the agent speaks a question, listens, transcribes, and continues
8
8
  - interactive TUI and headless/RPC usage through Pi extension tools and UI fallback
9
9
 
10
- ## Install
10
+ ## Quick start
11
11
 
12
12
  ```bash
13
13
  pi install npm:@p8n.ai/pi-listens
14
- export SARVAM_API_KEY="your-sarvam-api-key"
15
14
  pi
16
15
  ```
17
16
 
17
+ Inside Pi, run `/init` to create a global settings file with sensible defaults:
18
+
19
+ ```
20
+ /init
21
+ ```
22
+
23
+ Then open `~/.pi/pi-listens.json` and replace the `apiKey` placeholder with your [Sarvam AI API key](https://dashboard.sarvam.ai).
24
+
25
+ Alternatively, set the key via environment variable:
26
+
27
+ ```bash
28
+ export SARVAM_API_KEY="your-sarvam-api-key"
29
+ ```
30
+
18
31
  For local development from this checkout:
19
32
 
20
33
  ```bash
@@ -88,19 +101,27 @@ The extension also injects voice guidance into the system prompt:
88
101
 
89
102
  | Command | Purpose |
90
103
  | --- | --- |
91
- | `/listen [seconds]` | Stream one utterance over Sarvam WebSocket STT, wait for a sustained silence boundary, transcribe, and send it to Pi as a user message. |
104
+ | `/init` | Create a global settings file at `~/.pi/pi-listens.json` with sensible defaults. Use `--overwrite` to replace an existing file. |
92
105
  | `/speak <text>` | Speak text with Sarvam TTS. |
93
- | `/voice-on [--speak] [--manual] [--no-listen] [seconds]` | Open the hands-free TUI panel. By default it listens now and auto-listens again after each agent response. `--speak` reads short assistant replies aloud. `--manual` leaves the panel active but only listens when you press R. |
94
- | `/voice-on --no-speak` | Open the panel without auto-reading assistant replies. |
95
- | `/voice-status` | Show setup and voice-mode status. |
106
+ | `/voice-on [--manual] [--no-listen] [seconds]` | Start the hands-free voice loop. Auto-listens for the next instruction after each agent turn. `--manual` disables auto-listen (press Space to listen). |
107
+ | `/voice-check` | Show setup diagnostics and voice-mode status. |
96
108
 
97
109
  Voice panel controls in interactive mode:
98
- - Space: listen now; press again while listening to stop listening; if Pi is speaking, Space stops playback before listening
99
- - A: auto-listen on/off (listen again after each assistant reply)
100
- - S: read aloud on/off (speak assistant replies)
110
+ - Space: listen now; press again while listening to stop; if Pi is speaking, stops playback first
111
+ - A: toggle auto-listen (listen again after each assistant reply)
101
112
  - Q: close the panel and stop any active listening or speaking
102
113
  - Click the orb: visual ripple feedback (terminals with mouse reporting)
103
114
 
115
+ The orb animates to reflect the current state:
116
+
117
+ | State | Orb Color | Animation | Status Bar |
118
+ | --- | --- | --- | --- |
119
+ | Idle | Teal | Gentle pulse | `voice on` |
120
+ | Listening | Blue | Ripple | `listening…` |
121
+ | Speaking | Pink/Magenta | Wave | `speaking…` |
122
+ | Agent working | Purple | Swirl | `agent working` |
123
+ | Error | Red | — | Shows error message |
124
+
104
125
  ## Headless/RPC behavior
105
126
 
106
127
  Pi extension tools work in interactive TUI and headless/RPC modes.
@@ -142,9 +163,7 @@ Example config file:
142
163
  "silenceThreshold": "1%",
143
164
  "ttsSampleRate": 24000,
144
165
  "ttsOutputCodec": "wav",
145
- "textFallback": true,
146
- "autoSpeakAssistant": false,
147
- "maxAutoSpeakChars": 320
166
+ "textFallback": true
148
167
  }
149
168
  ```
150
169
 
@@ -176,8 +195,6 @@ Supported environment variables:
176
195
  - `PI_LISTENS_AUDIO_DIR`
177
196
  - `PI_LISTENS_DELETE_AUDIO`
178
197
  - `PI_LISTENS_TEXT_FALLBACK`
179
- - `PI_LISTENS_AUTO_SPEAK`
180
- - `PI_LISTENS_MAX_AUTO_SPEAK_CHARS`
181
198
 
182
199
  ## Notes
183
200
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@p8n.ai/pi-listens",
3
- "version": "0.1.2",
3
+ "version": "0.2.1",
4
4
  "description": "Pi package for speech-first interaction using Sarvam AI speech-to-text and text-to-speech.",
5
5
  "author": "Ravindra Barthwal",
6
6
  "license": "MIT",
@@ -15,15 +15,23 @@ This Pi package provides voice tools backed by Sarvam AI.
15
15
  - `voice_transcribe_file`: transcribe an existing audio file.
16
16
  - `voice_setup_check`: diagnose API key, recorder, player, and voice settings.
17
17
 
18
+ ## Commands
19
+
20
+ - `/init`: create a global settings file with defaults. User only needs to set their Sarvam API key.
21
+ - `/speak <text>`: speak text with Sarvam TTS.
22
+ - `/voice-on`: start hands-free voice loop (auto-listens after each agent turn by default).
23
+ - `/voice-check`: show setup diagnostics and voice-mode status.
24
+
18
25
  ## Usage rules
19
26
 
20
27
  1. When you need user input, clarification, or confirmation, use `voice_ask` instead of asking only in text.
21
28
  2. Before using `voice_input`, make sure the user already knows you are listening. If not, use `voice_ask`.
22
- 3. Use `voice_output` only for concise spoken status updates or spoken summaries that matter to the user.
29
+ 3. The agent controls when to speak — use `voice_output` for concise spoken status updates that matter to the user. There is no automatic read-aloud; the agent decides.
23
30
  4. Spoken output must be brief: 1-2 short sentences, no markdown headings, no hashtags, no bullet lists, no boilerplate recap, and no full task summaries. Leave details in text.
24
31
  5. Do not speak code blocks, diffs, stack traces, logs, long tables, or lengthy explanations. Summarize briefly and leave details in text.
25
32
  6. Treat transcripts returned by `voice_input` or `voice_ask` as user input, while allowing for speech-recognition mistakes. If the transcript is ambiguous, ask a short follow-up with `voice_ask`.
26
33
  7. If speech is not recognized, rely on the tool's text fallback when available, or ask again with a shorter prompt.
34
+ 8. The voice orb reflects the current state: blue (listening), pink (speaking), purple (agent working). Calling `voice_output` automatically transitions the orb to the speaking state.
27
35
 
28
36
  ## Good voice question style
29
37
 
package/src/audio.ts CHANGED
@@ -14,6 +14,8 @@ export interface AudioRuntime {
14
14
  cleanup(path: string): Promise<void>;
15
15
  stopPlayback(): void;
16
16
  stopAll(): void;
17
+ hasActivePlayback(): boolean;
18
+ waitForPlaybackIdle(timeoutMs?: number): Promise<void>;
17
19
  describe(): { recorder: string; player: string; streamingPlayer: string };
18
20
  }
19
21
 
@@ -93,6 +95,17 @@ export function createAudioRuntime(config: PiListensConfig): AudioRuntime {
93
95
  stopActiveAudioProcesses();
94
96
  },
95
97
 
98
+ hasActivePlayback(): boolean {
99
+ return hasActiveProcesses("play");
100
+ },
101
+
102
+ async waitForPlaybackIdle(timeoutMs = 30_000): Promise<void> {
103
+ const start = Date.now();
104
+ while (hasActiveProcesses("play") && Date.now() - start < timeoutMs) {
105
+ await new Promise((r) => setTimeout(r, 150));
106
+ }
107
+ },
108
+
96
109
  describe() {
97
110
  return { recorder: recorder ?? "missing", player: player ?? "missing", streamingPlayer: streamingPlayer ?? "missing" };
98
111
  },
@@ -407,6 +420,13 @@ export function stopActiveAudioProcesses(options: { kind?: AudioProcessKind; for
407
420
  }
408
421
  }
409
422
 
423
+ function hasActiveProcesses(kind: AudioProcessKind): boolean {
424
+ for (const child of activeChildren) {
425
+ if (childKinds.get(child) === kind) return true;
426
+ }
427
+ return false;
428
+ }
429
+
410
430
  function spawnManaged(command: string, args: string[], kind: AudioProcessKind, stdio: StdioOptions = ["ignore", "pipe", "pipe"]): ManagedChild {
411
431
  installProcessExitCleanup();
412
432
  const child = spawn(command, args, {
package/src/commands.ts CHANGED
@@ -1,9 +1,11 @@
1
- import { mkdir } from "node:fs/promises";
1
+ import { existsSync, readFileSync } from "node:fs";
2
+ import { mkdir, writeFile } from "node:fs/promises";
3
+ import { homedir } from "node:os";
2
4
  import { join } from "node:path";
3
5
  import type { ExtensionAPI, ExtensionCommandContext, ExtensionContext } from "@earendil-works/pi-coding-agent";
4
6
  import type { VoiceToolServices } from "./tools.js";
5
- import { conciseTranscript, prepareSpokenText } from "./text.js";
6
- import { audioExtensionForCodec } from "./config.js";
7
+ import { conciseTranscript } from "./text.js";
8
+ import { audioExtensionForCodec, maskSecret } from "./config.js";
7
9
  import { applyVoiceChrome, installVoiceUi, uninstallVoiceUi } from "./voice-ui.js";
8
10
 
9
11
  export type VoiceLoopStatus = "idle" | "listening" | "transcribing" | "agent" | "speaking" | "error";
@@ -11,12 +13,10 @@ export type VoiceLoopStatus = "idle" | "listening" | "transcribing" | "agent" |
11
13
  export interface VoiceModeState {
12
14
  enabled: boolean;
13
15
  autoListen: boolean;
14
- autoSpeakAssistant: boolean;
15
16
  isListening: boolean;
16
17
  status: VoiceLoopStatus;
17
18
  uiInstalled?: boolean;
18
19
  previousEditorFactory?: unknown;
19
- lastAssistantText?: string;
20
20
  lastTranscript?: string;
21
21
  lastError?: string;
22
22
  recordSeconds?: number;
@@ -26,10 +26,10 @@ export interface VoiceModeState {
26
26
  }
27
27
 
28
28
  export function registerVoiceCommands(pi: ExtensionAPI, services: VoiceToolServices, state: VoiceModeState) {
29
- pi.registerCommand("listen", {
30
- description: "Record speech, transcribe with Sarvam AI, and send it to pi as a user message",
29
+ pi.registerCommand("init", {
30
+ description: "Create a global pi-listens settings file at ~/.pi/pi-listens.json with sensible defaults",
31
31
  handler: async (args, ctx) => {
32
- await listenAndSend(pi, services, ctx, parseSeconds(args));
32
+ await initSettings(services, ctx, args.includes("--overwrite"));
33
33
  },
34
34
  });
35
35
 
@@ -46,11 +46,9 @@ export function registerVoiceCommands(pi: ExtensionAPI, services: VoiceToolServi
46
46
  });
47
47
 
48
48
  pi.registerCommand("voice-on", {
49
- description: "Enable hands-free voice loop. Use --speak to read short assistant replies aloud.",
49
+ description: "Enable hands-free voice loop with auto-listen. Use --manual to only listen on demand.",
50
50
  handler: async (args, ctx) => {
51
51
  state.enabled = true;
52
- if (args.includes("--speak")) state.autoSpeakAssistant = true;
53
- if (args.includes("--no-speak")) state.autoSpeakAssistant = false;
54
52
  state.autoListen = !args.includes("--manual");
55
53
  installVoiceUi(ctx, state, createVoiceUiCallbacks(pi, services, state, ctx));
56
54
  applyVoiceChrome(ctx, state);
@@ -60,52 +58,99 @@ export function registerVoiceCommands(pi: ExtensionAPI, services: VoiceToolServi
60
58
  });
61
59
 
62
60
 
63
- pi.registerCommand("voice-status", {
64
- description: "Show pi-listens Sarvam AI, recorder, player, and voice-mode status",
61
+ pi.registerCommand("voice-check", {
62
+ description: "Check pi-listens setup: Sarvam AI key, recorder, player, and voice-mode status",
65
63
  handler: async (_args, ctx) => {
66
64
  const config = services.getConfig();
67
65
  const audio = services.getAudio().describe();
66
+ const ready = Boolean(config.apiKey) && audio.recorder !== "missing" && audio.player !== "missing";
68
67
  ctx.ui.notify(
69
68
  [
70
- `Voice mode: ${state.enabled ? "on" : "off"}`,
71
- `Auto-speak assistant: ${state.autoSpeakAssistant ? "on" : "off"}`,
72
- `Auto-listen: ${state.autoListen ? "on" : "off"}`,
73
- `Status: ${state.status}`,
74
- `Sarvam API key: ${config.apiKey ? "set" : "missing"}`,
69
+ ready ? "✓ pi-listens is ready." : "⚠ pi-listens needs attention.",
70
+ "",
71
+ `Sarvam API key: ${maskSecret(config.apiKey)}`,
75
72
  `Recorder: ${audio.recorder}`,
76
73
  `Player: ${audio.player}`,
77
74
  `Streaming player: ${audio.streamingPlayer}`,
78
75
  `STT: ${config.sttModel} (${config.translateInputToEnglish ? "translate→English" : config.sttMode}, ${config.sttLanguageCode})`,
79
76
  `TTS: ${config.ttsModel} (${config.ttsLanguageCode}, speaker ${config.ttsSpeaker})`,
77
+ "",
78
+ `Voice mode: ${state.enabled ? "on" : "off"}`,
79
+ `Auto-listen: ${state.autoListen ? "on" : "off"}`,
80
80
  ].join("\n"),
81
- config.apiKey && audio.recorder !== "missing" && audio.player !== "missing" ? "info" : "warning",
81
+ ready ? "info" : "warning",
82
82
  );
83
83
  },
84
84
  });
85
85
  }
86
86
 
87
+ const INIT_SETTINGS_TEMPLATE = {
88
+ apiKey: "paste-your-sarvam-api-key-here",
89
+ sttModel: "saaras:v3",
90
+ sttMode: "transcribe",
91
+ sttLanguageCode: "unknown",
92
+ translateInputToEnglish: true,
93
+ ttsModel: "bulbul:v3",
94
+ ttsLanguageCode: "en-IN",
95
+ ttsSpeaker: "shubh",
96
+ recordSeconds: 300,
97
+ recordSampleRate: 16000,
98
+ streamChunkMs: 250,
99
+ streamMaxSeconds: 300,
100
+ silenceStartSeconds: 0.2,
101
+ silenceStopSeconds: 3.5,
102
+ silenceThreshold: "1%",
103
+ ttsSampleRate: 24000,
104
+ ttsOutputCodec: "wav",
105
+ textFallback: true,
106
+ };
107
+
108
+ async function initSettings(services: VoiceToolServices, ctx: ExtensionCommandContext, overwrite: boolean) {
109
+ const dir = join(homedir(), ".pi");
110
+ const filePath = join(dir, "pi-listens.json");
111
+
112
+ if (existsSync(filePath) && !overwrite) {
113
+ const existing = readFileSync(filePath, "utf8");
114
+ let parsed: Record<string, unknown> = {};
115
+ try { parsed = JSON.parse(existing) as Record<string, unknown>; } catch { /* ignore */ }
116
+ const hasKey = typeof parsed.apiKey === "string" && parsed.apiKey !== "paste-your-sarvam-api-key-here" && parsed.apiKey.length > 0;
117
+ ctx.ui.notify(
118
+ [
119
+ `Settings file already exists: ${filePath}`,
120
+ hasKey ? "Sarvam API key: set" : "Sarvam API key: not yet configured",
121
+ "",
122
+ "Use /init --overwrite to replace it with fresh defaults.",
123
+ ].join("\n"),
124
+ "info",
125
+ );
126
+ return;
127
+ }
128
+
129
+ await mkdir(dir, { recursive: true });
130
+ await writeFile(filePath, `${JSON.stringify(INIT_SETTINGS_TEMPLATE, null, 2)}\n`, "utf8");
131
+
132
+ const audio = services.getAudio().describe();
133
+ ctx.ui.notify(
134
+ [
135
+ `✓ Created settings file: ${filePath}`,
136
+ "",
137
+ "Next step: open the file and replace the apiKey value with your Sarvam AI API key.",
138
+ "Get a key at: https://dashboard.sarvam.ai",
139
+ "",
140
+ `Recorder: ${audio.recorder}`,
141
+ `Player: ${audio.player}`,
142
+ audio.recorder === "missing" || audio.player === "missing"
143
+ ? "⚠ Install SoX (rec/play) or ffmpeg for microphone and audio playback."
144
+ : "✓ Audio recorder and player detected.",
145
+ ].join("\n"),
146
+ "info",
147
+ );
148
+ }
87
149
  export async function maybeContinueVoiceLoop(pi: ExtensionAPI, services: VoiceToolServices, state: VoiceModeState, ctx: ExtensionContext) {
88
150
  if (!state.enabled || state.isListening) return;
89
- if (state.autoSpeakAssistant && state.lastAssistantText) {
90
- const spoken = prepareSpokenText(state.lastAssistantText, services.getConfig().maxAutoSpeakChars);
91
- if (spoken) {
92
- try {
93
- await speakText(services, spoken, ctx.signal, state, ctx);
94
- } catch (err) {
95
- if (isCancelled(err)) {
96
- state.status = "idle";
97
- state.lastError = undefined;
98
- applyVoiceChrome(ctx, state);
99
- return;
100
- }
101
- state.status = "error";
102
- state.lastError = errorMessage(err);
103
- applyVoiceChrome(ctx, state);
104
- ctx.ui.notify(`pi-listens could not speak assistant response: ${errorMessage(err)}`, "warning");
105
- }
106
- }
107
- }
108
- if (!state.enabled || !state.autoListen) { state.status = "idle"; applyVoiceChrome(ctx, state); return; }
151
+ if (!state.autoListen) { state.status = "idle"; applyVoiceChrome(ctx, state); return; }
152
+ // Wait for any in-flight tool-initiated playback to finish before opening the mic
153
+ await waitForPlaybackIdle(services);
109
154
  ctx.ui.notify("Listening for your next instruction…", "info");
110
155
  await listenAndSend(pi, services, ctx, undefined, { followUpWhenBusy: true });
111
156
  }
@@ -177,27 +222,14 @@ async function listenAndSend(
177
222
  }
178
223
  }
179
224
 
180
- async function speakText(services: VoiceToolServices, text: string, signal?: AbortSignal, state?: VoiceModeState, ctx?: ExtensionContext) {
181
- const speakAbortController = state ? new AbortController() : undefined;
182
- const speakSignal = combineSignals(signal, speakAbortController?.signal);
183
-
184
- if (state) {
185
- state.speakAbortController?.abort();
186
- state.speakAbortController = speakAbortController;
187
- state.status = "speaking";
188
- if (ctx) applyVoiceChrome(ctx, state);
189
- }
225
+ async function speakText(services: VoiceToolServices, text: string, signal?: AbortSignal) {
226
+ // Stop any in-flight playback before starting new speech
227
+ services.getAudio().stopPlayback();
228
+ await playSpeechBest(services, text, signal);
229
+ }
190
230
 
191
- try {
192
- await playSpeechBest(services, text, speakSignal.signal);
193
- } finally {
194
- speakSignal.cleanup();
195
- if (state && state.speakAbortController === speakAbortController) state.speakAbortController = undefined;
196
- if (state && state.status === "speaking") {
197
- state.status = "idle";
198
- if (ctx) applyVoiceChrome(ctx, state);
199
- }
200
- }
231
+ async function waitForPlaybackIdle(services: VoiceToolServices): Promise<void> {
232
+ await services.getAudio().waitForPlaybackIdle();
201
233
  }
202
234
 
203
235
  async function playSpeechBest(services: VoiceToolServices, text: string, signal?: AbortSignal) {
@@ -236,7 +268,6 @@ function createVoiceUiCallbacks(pi: ExtensionAPI, services: VoiceToolServices, s
236
268
  disable: () => {
237
269
  stopVoiceMode(services, state, ctx);
238
270
  },
239
- toggleSpeak: () => { state.autoSpeakAssistant = !state.autoSpeakAssistant; applyVoiceChrome(ctx, state); },
240
271
  toggleAutoListen: () => { state.autoListen = !state.autoListen; applyVoiceChrome(ctx, state); },
241
272
  };
242
273
  }
@@ -292,9 +323,24 @@ export function stopVoiceMode(services: VoiceToolServices, state: VoiceModeState
292
323
  }
293
324
 
294
325
  const serviceState = new WeakMap<VoiceToolServices, VoiceModeState>();
326
+ const serviceCtx = new WeakMap<VoiceToolServices, ExtensionContext>();
295
327
 
296
328
  export function attachStateToServices(services: VoiceToolServices, state: VoiceModeState) {
297
329
  serviceState.set(services, state);
330
+ services.notifySpeaking = (speaking) => {
331
+ if (!state.enabled) return;
332
+ if (speaking) {
333
+ state.status = "speaking";
334
+ } else if (state.status === "speaking") {
335
+ state.status = "idle";
336
+ }
337
+ const ctx = serviceCtx.get(services);
338
+ if (ctx) applyVoiceChrome(ctx, state);
339
+ };
340
+ }
341
+
342
+ export function updateServiceContext(services: VoiceToolServices, ctx: ExtensionContext) {
343
+ serviceCtx.set(services, ctx);
298
344
  }
299
345
 
300
346
  function getStateFromServices(services: VoiceToolServices): VoiceModeState {
package/src/config.ts CHANGED
@@ -32,8 +32,6 @@ export interface PiListensConfig {
32
32
  audioDir: string;
33
33
  deleteAudio: boolean;
34
34
  textFallback: boolean;
35
- autoSpeakAssistant: boolean;
36
- maxAutoSpeakChars: number;
37
35
  }
38
36
 
39
37
  const DEFAULT_CONFIG: PiListensConfig = {
@@ -59,8 +57,6 @@ const DEFAULT_CONFIG: PiListensConfig = {
59
57
  audioDir: join(tmpdir(), "pi-listens"),
60
58
  deleteAudio: true,
61
59
  textFallback: true,
62
- autoSpeakAssistant: false,
63
- maxAutoSpeakChars: 320,
64
60
  };
65
61
 
66
62
  type RawConfig = Partial<PiListensConfig>;
@@ -102,8 +98,6 @@ export function resolveConfig(cwd: string): PiListensConfig {
102
98
  audioDir: env("PI_LISTENS_AUDIO_DIR"),
103
99
  deleteAudio: parseBoolean(env("PI_LISTENS_DELETE_AUDIO")),
104
100
  textFallback: parseBoolean(env("PI_LISTENS_TEXT_FALLBACK")),
105
- autoSpeakAssistant: parseBoolean(env("PI_LISTENS_AUTO_SPEAK")),
106
- maxAutoSpeakChars: parseInteger(env("PI_LISTENS_MAX_AUTO_SPEAK_CHARS")),
107
101
  };
108
102
 
109
103
  return mergeDefined(DEFAULT_CONFIG, fileConfig, envConfig);
package/src/index.ts CHANGED
@@ -2,9 +2,8 @@ import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
2
2
  import { createAudioRuntime, type AudioRuntime } from "./audio.js";
3
3
  import { maskSecret, resolveConfig, type PiListensConfig } from "./config.js";
4
4
  import { SarvamSpeechClient } from "./sarvam.js";
5
- import { attachStateToServices, maybeContinueVoiceLoop, registerVoiceCommands, stopVoiceMode, type VoiceModeState } from "./commands.js";
5
+ import { attachStateToServices, maybeContinueVoiceLoop, registerVoiceCommands, stopVoiceMode, updateServiceContext, type VoiceModeState } from "./commands.js";
6
6
  import { registerVoiceTools, type VoiceToolServices } from "./tools.js";
7
- import { firstTextContent } from "./text.js";
8
7
 
9
8
  export default function piListensExtension(pi: ExtensionAPI) {
10
9
  let config: PiListensConfig = resolveConfig(process.cwd());
@@ -13,10 +12,8 @@ export default function piListensExtension(pi: ExtensionAPI) {
13
12
 
14
13
  const speech = new SarvamSpeechClient(() => config);
15
14
  const state: VoiceModeState = {
16
-
17
15
  enabled: false,
18
16
  autoListen: false,
19
- autoSpeakAssistant: config.autoSpeakAssistant,
20
17
  isListening: false,
21
18
  status: "idle",
22
19
  recordSeconds: config.recordSeconds,
@@ -34,7 +31,7 @@ export default function piListensExtension(pi: ExtensionAPI) {
34
31
  lastCwd = cwd;
35
32
  config = resolveConfig(cwd);
36
33
  audio = createAudioRuntime(config);
37
- if (!state.enabled) { state.autoSpeakAssistant = config.autoSpeakAssistant; state.recordSeconds = config.recordSeconds; state.silenceStopSeconds = config.silenceStopSeconds; }
34
+ if (!state.enabled) { state.recordSeconds = config.recordSeconds; state.silenceStopSeconds = config.silenceStopSeconds; }
38
35
  }
39
36
 
40
37
  registerVoiceTools(pi, services);
@@ -42,6 +39,7 @@ export default function piListensExtension(pi: ExtensionAPI) {
42
39
 
43
40
  pi.on("session_start", async (_event, ctx) => {
44
41
  reloadConfig(ctx.cwd);
42
+ updateServiceContext(services, ctx);
45
43
  const audioInfo = audio.describe();
46
44
  const ready = Boolean(config.apiKey) && audioInfo.recorder !== "missing" && audioInfo.player !== "missing";
47
45
  ctx.ui.setStatus("pi-listens", state.enabled ? "voice on" : ready ? "voice ready" : "voice setup needed");
@@ -52,7 +50,7 @@ export default function piListensExtension(pi: ExtensionAPI) {
52
50
  `Sarvam API key: ${maskSecret(config.apiKey)}`,
53
51
  `Recorder: ${audioInfo.recorder}`,
54
52
  `Player: ${audioInfo.player}`,
55
- "Run /voice-status or call voice_setup_check for details.",
53
+ "Run /init to create a settings file, or /voice-check for details.",
56
54
  ].join("\n"),
57
55
  "warning",
58
56
  );
@@ -69,12 +67,9 @@ export default function piListensExtension(pi: ExtensionAPI) {
69
67
  };
70
68
  });
71
69
 
72
- pi.on("message_end", async (event) => {
73
- if (event.message.role !== "assistant") return;
74
- state.lastAssistantText = firstTextContent(event.message);
75
- });
76
70
 
77
71
  pi.on("agent_end", async (_event, ctx) => {
72
+ updateServiceContext(services, ctx);
78
73
  await maybeContinueVoiceLoop(pi, services, state, ctx);
79
74
  });
80
75
 
package/src/text.ts CHANGED
@@ -1,46 +1,3 @@
1
- export function firstTextContent(message: unknown): string {
2
- if (!message || typeof message !== "object") return "";
3
- const content = (message as { content?: unknown }).content;
4
- if (typeof content === "string") return content;
5
- if (!Array.isArray(content)) return "";
6
- return content
7
- .map((part) => {
8
- if (!part || typeof part !== "object") return "";
9
- const p = part as { type?: string; text?: string };
10
- return p.type === "text" && typeof p.text === "string" ? p.text : "";
11
- })
12
- .filter(Boolean)
13
- .join("\n")
14
- .trim();
15
- }
16
-
17
- export function prepareSpokenText(text: string, maxChars: number): string {
18
- let prepared = text
19
- .replace(/```[\s\S]*?```/g, " I skipped a code block. ")
20
- .replace(/^\s{0,3}#{1,6}\s+/gm, "")
21
- .replace(/^\s*[-*+]\s+/gm, "")
22
- .replace(/^\s*\d+[.)]\s+/gm, "")
23
- .replace(/`([^`]+)`/g, "$1")
24
- .replace(/https?:\/\/\S+/g, "link")
25
- .replace(/[#*_>~|]+/g, " ")
26
- .replace(/\s+/g, " ")
27
- .trim();
28
-
29
- prepared = conciseSpokenSummary(prepared);
30
- if (prepared.length > maxChars) {
31
- prepared = `${prepared.slice(0, Math.max(0, maxChars - 32)).trim()}… More on screen.`;
32
- }
33
- return prepared;
34
- }
35
-
36
- function conciseSpokenSummary(text: string): string {
37
- const sentences = text.match(/[^.!?]+[.!?]+|[^.!?]+$/g)?.map((part) => part.trim()).filter(Boolean) ?? [];
38
- if (sentences.length === 0) return text;
39
-
40
- const useful = sentences.filter((sentence) => !/^(sure|here('|’)s|summary|in summary|done|completed|i('|’)ve|i have)\b/i.test(sentence));
41
- const picked = (useful.length ? useful : sentences).slice(0, 2).join(" ").trim();
42
- return picked || text;
43
- }
44
1
 
45
2
  export function conciseTranscript(transcript: string): string {
46
3
  const trimmed = transcript.trim();
package/src/tools.ts CHANGED
@@ -13,6 +13,7 @@ export interface VoiceToolServices {
13
13
  getConfig: () => PiListensConfig;
14
14
  getAudio: () => AudioRuntime;
15
15
  getSpeech: () => SarvamSpeechClient;
16
+ notifySpeaking?: (speaking: boolean) => void;
16
17
  }
17
18
 
18
19
  const VoiceOutputParams = Type.Object({
@@ -54,21 +55,28 @@ export function registerVoiceTools(pi: ExtensionAPI, services: VoiceToolServices
54
55
  ],
55
56
  parameters: VoiceOutputParams,
56
57
  async execute(_toolCallId, params: VoiceOutputInput, signal, onUpdate) {
58
+ // Stop any in-flight playback before starting new speech
59
+ services.getAudio().stopPlayback();
60
+ services.notifySpeaking?.(true);
57
61
  onUpdate?.({ content: [{ type: "text", text: "Starting streamed speech with Sarvam AI…" }], details: {} });
58
62
  const playback = playSpeechBest(params.text, services, signal);
59
63
  if (params.wait_for_playback !== true) {
60
- void playback.catch(() => undefined);
64
+ void playback.then(() => services.notifySpeaking?.(false), () => services.notifySpeaking?.(false));
61
65
  return {
62
66
  content: [{ type: "text", text: `Started speaking to user: ${params.text}` }],
63
67
  details: { played: "started", text: params.text },
64
68
  };
65
69
  }
66
70
  onUpdate?.({ content: [{ type: "text", text: "Playing audio…" }], details: {} });
67
- const details = await playback;
68
- return {
69
- content: [{ type: "text", text: `Spoke to user: ${params.text}` }],
70
- details: { ...details, played: true, text: params.text },
71
- };
71
+ try {
72
+ const details = await playback;
73
+ return {
74
+ content: [{ type: "text", text: `Spoke to user: ${params.text}` }],
75
+ details: { ...details, played: true, text: params.text },
76
+ };
77
+ } finally {
78
+ services.notifySpeaking?.(false);
79
+ }
72
80
  },
73
81
  renderCall(args: VoiceOutputInput, theme) {
74
82
  return new Text(`${theme.fg("toolTitle", theme.bold("voice_output "))}${theme.fg("muted", quote(args.text))}`, 0, 0);
@@ -116,7 +124,13 @@ export function registerVoiceTools(pi: ExtensionAPI, services: VoiceToolServices
116
124
  parameters: VoiceAskParams,
117
125
  async execute(_toolCallId, params: VoiceAskInput, signal, onUpdate, ctx) {
118
126
  onUpdate?.({ content: [{ type: "text", text: "Speaking question…" }], details: {} });
119
- await playSpeechBest(params.question, services, signal);
127
+ services.getAudio().stopPlayback();
128
+ services.notifySpeaking?.(true);
129
+ try {
130
+ await playSpeechBest(params.question, services, signal);
131
+ } finally {
132
+ services.notifySpeaking?.(false);
133
+ }
120
134
  const answer = await listenAndMaybeFallback(
121
135
  params,
122
136
  services,
package/src/voice-ui.ts CHANGED
@@ -7,7 +7,6 @@ type EditorFactory = ReturnType<ExtensionContext["ui"]["getEditorComponent"]>;
7
7
  export interface VoiceUiCallbacks {
8
8
  startListening: () => void;
9
9
  disable: () => void;
10
- toggleSpeak: () => void;
11
10
  toggleAutoListen: () => void;
12
11
  }
13
12
 
@@ -46,15 +45,19 @@ export function applyVoiceChrome(ctx: ExtensionContext, state: VoiceModeState) {
46
45
  ? "listening…"
47
46
  : state.status === "agent"
48
47
  ? "agent working"
49
- : state.autoSpeakAssistant
50
- ? "voice on + speak"
48
+ : state.status === "speaking"
49
+ ? "speaking…"
51
50
  : "voice on"
52
51
  : "voice ready";
53
52
  ctx.ui.setStatus("pi-listens", status);
54
53
  if (!state.enabled) return;
55
54
  ctx.ui.setWorkingIndicator({
56
- frames: state.status === "listening" ? [ctx.ui.theme.fg("accent", "●"), ctx.ui.theme.fg("muted", "•")] : [ctx.ui.theme.fg("accent", "◌")],
57
- intervalMs: 250,
55
+ frames: state.status === "listening"
56
+ ? [ctx.ui.theme.fg("accent", "●"), ctx.ui.theme.fg("muted", "•")]
57
+ : state.status === "speaking"
58
+ ? [ctx.ui.theme.fg("accent", "♪"), ctx.ui.theme.fg("muted", "♫")]
59
+ : [ctx.ui.theme.fg("accent", "◌")],
60
+ intervalMs: state.status === "speaking" ? 200 : 250,
58
61
  });
59
62
  }
60
63
 
@@ -93,11 +96,6 @@ class VoiceLoopEditor extends CustomEditor {
93
96
  this.callbacks.startListening();
94
97
  return;
95
98
  }
96
- if (data.toLowerCase() === "s") {
97
- this.triggerOrbClick(0.5, -0.18, 0.12);
98
- this.callbacks.toggleSpeak();
99
- return;
100
- }
101
99
  if (data.toLowerCase() === "a") {
102
100
  this.triggerOrbClick(0.65, 0.18, 0.1);
103
101
  this.callbacks.toggleAutoListen();
@@ -311,7 +309,6 @@ function controlRail(state: VoiceModeState, palette: OrbPalette, width: number):
311
309
  const pills = [
312
310
  controlPill("Space", listenLabel, state.isListening ? "active" : "primary", palette),
313
311
  controlPill("A", state.autoListen ? "auto-listen on" : "auto-listen off", state.autoListen ? "active" : "muted", palette),
314
- controlPill("S", state.autoSpeakAssistant ? "read aloud on" : "read aloud off", state.autoSpeakAssistant ? "active" : "muted", palette),
315
312
  controlPill("Q", "close", "danger", palette),
316
313
  ];
317
314
  return wrapInline(pills, " ", Math.max(24, width - 2));