@p8n.ai/pi-listens 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +23 -4
- package/README.md +15 -10
- package/package.json +1 -1
- package/skills/pi-listens/SKILL.md +4 -3
- package/src/audio.ts +20 -0
- package/src/commands.ts +29 -51
- package/src/config.ts +0 -6
- package/src/index.ts +5 -10
- package/src/text.ts +0 -43
- package/src/tools.ts +21 -7
- package/src/voice-ui.ts +8 -11
package/CHANGELOG.md
CHANGED
|
@@ -6,16 +6,33 @@ This project follows [Semantic Versioning](https://semver.org/).
|
|
|
6
6
|
|
|
7
7
|
## [Unreleased]
|
|
8
8
|
|
|
9
|
+
## [0.2.2] - 2026-05-09
|
|
10
|
+
|
|
11
|
+
## [0.2.1] - 2026-05-09
|
|
12
|
+
|
|
13
|
+
### Changed
|
|
14
|
+
|
|
15
|
+
- Remove automatic read-aloud of assistant replies (`autoSpeakAssistant`). The agent controls speech via the `voice_output` tool — no more unsolicited summaries.
|
|
16
|
+
- `voice_output` now stops any in-flight playback before starting new speech, preventing overlapping audio.
|
|
17
|
+
- Voice loop waits for any tool-initiated playback to finish before opening the mic for the next listen cycle.
|
|
18
|
+
- Remove `autoSpeakAssistant`, `maxAutoSpeakChars` config options and `PI_LISTENS_AUTO_SPEAK`, `PI_LISTENS_MAX_AUTO_SPEAK_CHARS` env vars.
|
|
19
|
+
- Remove the "S" (read-aloud toggle) key and pill from the voice panel.
|
|
20
|
+
- Remove `--no-speak` flag from `/voice-on`.
|
|
21
|
+
|
|
22
|
+
### Added
|
|
23
|
+
|
|
24
|
+
- Voice orb now shows a **speaking** state (pink/magenta palette with wave animation) when `voice_output` or `voice_ask` is playing audio. Status bar shows "speaking…" with ♪/♫ indicators.
|
|
25
|
+
|
|
9
26
|
## [0.2.0] - 2026-05-09
|
|
10
27
|
|
|
11
28
|
### Added
|
|
12
29
|
|
|
13
|
-
- `/init` command to create a global settings file (`~/.pi/pi-listens.json`) with sensible defaults.
|
|
30
|
+
- `/voice-init` command to create a global settings file (`~/.pi/pi-listens.json`) with sensible defaults.
|
|
14
31
|
- `/voice-check` command (replaces `/voice-status`) with improved diagnostic output.
|
|
15
32
|
|
|
16
33
|
### Changed
|
|
17
34
|
|
|
18
|
-
- `/voice-on` now enables auto-speak by default for a full hands-free experience. Use `--no-speak` to opt out.
|
|
35
|
+
- `/voice-on` now enables auto-speak by default for a full hands-free experience. Use `--no-speak` to opt out. _(removed in next release)_
|
|
19
36
|
- Rename `/voice-status` to `/voice-check` to better communicate its diagnostic purpose.
|
|
20
37
|
|
|
21
38
|
### Removed
|
|
@@ -45,7 +62,7 @@ This project follows [Semantic Versioning](https://semver.org/).
|
|
|
45
62
|
- Sarvam AI speech-to-text tools for microphone input and audio file transcription.
|
|
46
63
|
- Sarvam AI text-to-speech tools for spoken output and spoken clarification loops.
|
|
47
64
|
- `/listen`, `/speak`, `/voice-on`, and `/voice-status` slash commands.
|
|
48
|
-
- Interactive voice panel with listen, auto-listen, read-aloud, and close controls.
|
|
65
|
+
- Interactive voice panel with listen, auto-listen, and close controls.
|
|
49
66
|
- Config support through environment variables, user config, and project config.
|
|
50
67
|
- Global config at `~/.pi/pi-listens.json`, with project-level overrides from `<project>/.pi/pi-listens.json`.
|
|
51
68
|
|
|
@@ -54,8 +71,10 @@ This project follows [Semantic Versioning](https://semver.org/).
|
|
|
54
71
|
- Stop active audio capture/playback subprocesses when voice mode is closed or the Pi session shuts down.
|
|
55
72
|
- Clean up generated audio files when spoken playback is interrupted.
|
|
56
73
|
|
|
57
|
-
[Unreleased]: https://github.com/p8n-ai/pi-listens/compare/v0.2.0...HEAD
|
|
74
|
+
[Unreleased]: https://github.com/p8n-ai/pi-listens/compare/v0.2.2...HEAD
|
|
58
75
|
[0.1.0]: https://github.com/p8n-ai/pi-listens/releases/tag/v0.1.0
|
|
59
76
|
[0.1.1]: https://github.com/p8n-ai/pi-listens/releases/tag/v0.1.1
|
|
60
77
|
[0.1.2]: https://github.com/p8n-ai/pi-listens/releases/tag/v0.1.2
|
|
61
78
|
[0.2.0]: https://github.com/p8n-ai/pi-listens/releases/tag/v0.2.0
|
|
79
|
+
[0.2.1]: https://github.com/p8n-ai/pi-listens/releases/tag/v0.2.1
|
|
80
|
+
[0.2.2]: https://github.com/p8n-ai/pi-listens/releases/tag/v0.2.2
|
package/README.md
CHANGED
|
@@ -14,10 +14,10 @@ pi install npm:@p8n.ai/pi-listens
|
|
|
14
14
|
pi
|
|
15
15
|
```
|
|
16
16
|
|
|
17
|
-
Inside Pi, run `/init` to create a global settings file with sensible defaults:
|
|
17
|
+
Inside Pi, run `/voice-init` to create a global settings file with sensible defaults:
|
|
18
18
|
|
|
19
19
|
```
|
|
20
|
-
/init
|
|
20
|
+
/voice-init
|
|
21
21
|
```
|
|
22
22
|
|
|
23
23
|
Then open `~/.pi/pi-listens.json` and replace the `apiKey` placeholder with your [Sarvam AI API key](https://dashboard.sarvam.ai).
|
|
@@ -101,18 +101,27 @@ The extension also injects voice guidance into the system prompt:
|
|
|
101
101
|
|
|
102
102
|
| Command | Purpose |
|
|
103
103
|
| --- | --- |
|
|
104
|
-
| `/init` | Create a global settings file at `~/.pi/pi-listens.json` with sensible defaults. Use `--overwrite` to replace an existing file. |
|
|
104
|
+
| `/voice-init` | Create a global settings file at `~/.pi/pi-listens.json` with sensible defaults. Use `--overwrite` to replace an existing file. |
|
|
105
105
|
| `/speak <text>` | Speak text with Sarvam TTS. |
|
|
106
|
-
| `/voice-on [--
|
|
106
|
+
| `/voice-on [--manual] [--no-listen] [seconds]` | Start the hands-free voice loop. Auto-listens for the next instruction after each agent turn. `--manual` disables auto-listen (press Space to listen). |
|
|
107
107
|
| `/voice-check` | Show setup diagnostics and voice-mode status. |
|
|
108
108
|
|
|
109
109
|
Voice panel controls in interactive mode:
|
|
110
110
|
- Space: listen now; press again while listening to stop; if Pi is speaking, stops playback first
|
|
111
111
|
- A: toggle auto-listen (listen again after each assistant reply)
|
|
112
|
-
- S: toggle read-aloud (speak assistant replies)
|
|
113
112
|
- Q: close the panel and stop any active listening or speaking
|
|
114
113
|
- Click the orb: visual ripple feedback (terminals with mouse reporting)
|
|
115
114
|
|
|
115
|
+
The orb animates to reflect the current state:
|
|
116
|
+
|
|
117
|
+
| State | Orb Color | Animation | Status Bar |
|
|
118
|
+
| --- | --- | --- | --- |
|
|
119
|
+
| Idle | Teal | Gentle pulse | `voice on` |
|
|
120
|
+
| Listening | Blue | Ripple | `listening…` |
|
|
121
|
+
| Speaking | Pink/Magenta | Wave | `speaking…` |
|
|
122
|
+
| Agent working | Purple | Swirl | `agent working` |
|
|
123
|
+
| Error | Red | — | Shows error message |
|
|
124
|
+
|
|
116
125
|
## Headless/RPC behavior
|
|
117
126
|
|
|
118
127
|
Pi extension tools work in interactive TUI and headless/RPC modes.
|
|
@@ -154,9 +163,7 @@ Example config file:
|
|
|
154
163
|
"silenceThreshold": "1%",
|
|
155
164
|
"ttsSampleRate": 24000,
|
|
156
165
|
"ttsOutputCodec": "wav",
|
|
157
|
-
"textFallback": true
|
|
158
|
-
"autoSpeakAssistant": true,
|
|
159
|
-
"maxAutoSpeakChars": 320
|
|
166
|
+
"textFallback": true
|
|
160
167
|
}
|
|
161
168
|
```
|
|
162
169
|
|
|
@@ -188,8 +195,6 @@ Supported environment variables:
|
|
|
188
195
|
- `PI_LISTENS_AUDIO_DIR`
|
|
189
196
|
- `PI_LISTENS_DELETE_AUDIO`
|
|
190
197
|
- `PI_LISTENS_TEXT_FALLBACK`
|
|
191
|
-
- `PI_LISTENS_AUTO_SPEAK`
|
|
192
|
-
- `PI_LISTENS_MAX_AUTO_SPEAK_CHARS`
|
|
193
198
|
|
|
194
199
|
## Notes
|
|
195
200
|
|
package/package.json
CHANGED
package/skills/pi-listens/SKILL.md
CHANGED
|
@@ -17,20 +17,21 @@ This Pi package provides voice tools backed by Sarvam AI.
|
|
|
17
17
|
|
|
18
18
|
## Commands
|
|
19
19
|
|
|
20
|
-
- `/init`: create a global settings file with defaults. User only needs to set their Sarvam API key.
|
|
20
|
+
- `/voice-init`: create a global settings file with defaults. User only needs to set their Sarvam API key.
|
|
21
21
|
- `/speak <text>`: speak text with Sarvam TTS.
|
|
22
|
-
- `/voice-on`: start hands-free voice loop (auto-
|
|
22
|
+
- `/voice-on`: start hands-free voice loop (auto-listens after each agent turn by default).
|
|
23
23
|
- `/voice-check`: show setup diagnostics and voice-mode status.
|
|
24
24
|
|
|
25
25
|
## Usage rules
|
|
26
26
|
|
|
27
27
|
1. When you need user input, clarification, or confirmation, use `voice_ask` instead of asking only in text.
|
|
28
28
|
2. Before using `voice_input`, make sure the user already knows you are listening. If not, use `voice_ask`.
|
|
29
|
-
3.
|
|
29
|
+
3. The agent controls when to speak — use `voice_output` for concise spoken status updates that matter to the user. There is no automatic read-aloud; the agent decides.
|
|
30
30
|
4. Spoken output must be brief: 1-2 short sentences, no markdown headings, no hashtags, no bullet lists, no boilerplate recap, and no full task summaries. Leave details in text.
|
|
31
31
|
5. Do not speak code blocks, diffs, stack traces, logs, long tables, or lengthy explanations. Summarize briefly and leave details in text.
|
|
32
32
|
6. Treat transcripts returned by `voice_input` or `voice_ask` as user input, while allowing for speech-recognition mistakes. If the transcript is ambiguous, ask a short follow-up with `voice_ask`.
|
|
33
33
|
7. If speech is not recognized, rely on the tool's text fallback when available, or ask again with a shorter prompt.
|
|
34
|
+
8. The voice orb reflects the current state: blue (listening), pink (speaking), purple (agent working). Calling `voice_output` automatically transitions the orb to the speaking state.
|
|
34
35
|
|
|
35
36
|
## Good voice question style
|
|
36
37
|
|
package/src/audio.ts
CHANGED
|
@@ -14,6 +14,8 @@ export interface AudioRuntime {
|
|
|
14
14
|
cleanup(path: string): Promise<void>;
|
|
15
15
|
stopPlayback(): void;
|
|
16
16
|
stopAll(): void;
|
|
17
|
+
hasActivePlayback(): boolean;
|
|
18
|
+
waitForPlaybackIdle(timeoutMs?: number): Promise<void>;
|
|
17
19
|
describe(): { recorder: string; player: string; streamingPlayer: string };
|
|
18
20
|
}
|
|
19
21
|
|
|
@@ -93,6 +95,17 @@ export function createAudioRuntime(config: PiListensConfig): AudioRuntime {
|
|
|
93
95
|
stopActiveAudioProcesses();
|
|
94
96
|
},
|
|
95
97
|
|
|
98
|
+
hasActivePlayback(): boolean {
|
|
99
|
+
return hasActiveProcesses("play");
|
|
100
|
+
},
|
|
101
|
+
|
|
102
|
+
async waitForPlaybackIdle(timeoutMs = 30_000): Promise<void> {
|
|
103
|
+
const start = Date.now();
|
|
104
|
+
while (hasActiveProcesses("play") && Date.now() - start < timeoutMs) {
|
|
105
|
+
await new Promise((r) => setTimeout(r, 150));
|
|
106
|
+
}
|
|
107
|
+
},
|
|
108
|
+
|
|
96
109
|
describe() {
|
|
97
110
|
return { recorder: recorder ?? "missing", player: player ?? "missing", streamingPlayer: streamingPlayer ?? "missing" };
|
|
98
111
|
},
|
|
@@ -407,6 +420,13 @@ export function stopActiveAudioProcesses(options: { kind?: AudioProcessKind; for
|
|
|
407
420
|
}
|
|
408
421
|
}
|
|
409
422
|
|
|
423
|
+
function hasActiveProcesses(kind: AudioProcessKind): boolean {
|
|
424
|
+
for (const child of activeChildren) {
|
|
425
|
+
if (childKinds.get(child) === kind) return true;
|
|
426
|
+
}
|
|
427
|
+
return false;
|
|
428
|
+
}
|
|
429
|
+
|
|
410
430
|
function spawnManaged(command: string, args: string[], kind: AudioProcessKind, stdio: StdioOptions = ["ignore", "pipe", "pipe"]): ManagedChild {
|
|
411
431
|
installProcessExitCleanup();
|
|
412
432
|
const child = spawn(command, args, {
|
package/src/commands.ts
CHANGED
|
@@ -4,7 +4,7 @@ import { homedir } from "node:os";
|
|
|
4
4
|
import { join } from "node:path";
|
|
5
5
|
import type { ExtensionAPI, ExtensionCommandContext, ExtensionContext } from "@earendil-works/pi-coding-agent";
|
|
6
6
|
import type { VoiceToolServices } from "./tools.js";
|
|
7
|
-
import { conciseTranscript, prepareSpokenText } from "./text.js";
|
|
7
|
+
import { conciseTranscript } from "./text.js";
|
|
8
8
|
import { audioExtensionForCodec, maskSecret } from "./config.js";
|
|
9
9
|
import { applyVoiceChrome, installVoiceUi, uninstallVoiceUi } from "./voice-ui.js";
|
|
10
10
|
|
|
@@ -13,12 +13,10 @@ export type VoiceLoopStatus = "idle" | "listening" | "transcribing" | "agent" |
|
|
|
13
13
|
export interface VoiceModeState {
|
|
14
14
|
enabled: boolean;
|
|
15
15
|
autoListen: boolean;
|
|
16
|
-
autoSpeakAssistant: boolean;
|
|
17
16
|
isListening: boolean;
|
|
18
17
|
status: VoiceLoopStatus;
|
|
19
18
|
uiInstalled?: boolean;
|
|
20
19
|
previousEditorFactory?: unknown;
|
|
21
|
-
lastAssistantText?: string;
|
|
22
20
|
lastTranscript?: string;
|
|
23
21
|
lastError?: string;
|
|
24
22
|
recordSeconds?: number;
|
|
@@ -28,7 +26,7 @@ export interface VoiceModeState {
|
|
|
28
26
|
}
|
|
29
27
|
|
|
30
28
|
export function registerVoiceCommands(pi: ExtensionAPI, services: VoiceToolServices, state: VoiceModeState) {
|
|
31
|
-
pi.registerCommand("init", {
|
|
29
|
+
pi.registerCommand("voice-init", {
|
|
32
30
|
description: "Create a global pi-listens settings file at ~/.pi/pi-listens.json with sensible defaults",
|
|
33
31
|
handler: async (args, ctx) => {
|
|
34
32
|
await initSettings(services, ctx, args.includes("--overwrite"));
|
|
@@ -48,10 +46,9 @@ export function registerVoiceCommands(pi: ExtensionAPI, services: VoiceToolServi
|
|
|
48
46
|
});
|
|
49
47
|
|
|
50
48
|
pi.registerCommand("voice-on", {
|
|
51
|
-
description: "Enable hands-free voice loop with auto-
|
|
49
|
+
description: "Enable hands-free voice loop with auto-listen. Use --manual to only listen on demand.",
|
|
52
50
|
handler: async (args, ctx) => {
|
|
53
51
|
state.enabled = true;
|
|
54
|
-
state.autoSpeakAssistant = !args.includes("--no-speak");
|
|
55
52
|
state.autoListen = !args.includes("--manual");
|
|
56
53
|
installVoiceUi(ctx, state, createVoiceUiCallbacks(pi, services, state, ctx));
|
|
57
54
|
applyVoiceChrome(ctx, state);
|
|
@@ -79,7 +76,6 @@ export function registerVoiceCommands(pi: ExtensionAPI, services: VoiceToolServi
|
|
|
79
76
|
`TTS: ${config.ttsModel} (${config.ttsLanguageCode}, speaker ${config.ttsSpeaker})`,
|
|
80
77
|
"",
|
|
81
78
|
`Voice mode: ${state.enabled ? "on" : "off"}`,
|
|
82
|
-
`Auto-speak: ${state.autoSpeakAssistant ? "on" : "off"}`,
|
|
83
79
|
`Auto-listen: ${state.autoListen ? "on" : "off"}`,
|
|
84
80
|
].join("\n"),
|
|
85
81
|
ready ? "info" : "warning",
|
|
@@ -107,8 +103,6 @@ const INIT_SETTINGS_TEMPLATE = {
|
|
|
107
103
|
ttsSampleRate: 24000,
|
|
108
104
|
ttsOutputCodec: "wav",
|
|
109
105
|
textFallback: true,
|
|
110
|
-
autoSpeakAssistant: true,
|
|
111
|
-
maxAutoSpeakChars: 320,
|
|
112
106
|
};
|
|
113
107
|
|
|
114
108
|
async function initSettings(services: VoiceToolServices, ctx: ExtensionCommandContext, overwrite: boolean) {
|
|
@@ -125,7 +119,7 @@ async function initSettings(services: VoiceToolServices, ctx: ExtensionCommandCo
|
|
|
125
119
|
`Settings file already exists: ${filePath}`,
|
|
126
120
|
hasKey ? "Sarvam API key: set" : "Sarvam API key: not yet configured",
|
|
127
121
|
"",
|
|
128
|
-
"Use /init --overwrite to replace it with fresh defaults.",
|
|
122
|
+
"Use /voice-init --overwrite to replace it with fresh defaults.",
|
|
129
123
|
].join("\n"),
|
|
130
124
|
"info",
|
|
131
125
|
);
|
|
@@ -154,26 +148,9 @@ async function initSettings(services: VoiceToolServices, ctx: ExtensionCommandCo
|
|
|
154
148
|
}
|
|
155
149
|
export async function maybeContinueVoiceLoop(pi: ExtensionAPI, services: VoiceToolServices, state: VoiceModeState, ctx: ExtensionContext) {
|
|
156
150
|
if (!state.enabled || state.isListening) return;
|
|
157
|
-
if (state.
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
try {
|
|
161
|
-
await speakText(services, spoken, ctx.signal, state, ctx);
|
|
162
|
-
} catch (err) {
|
|
163
|
-
if (isCancelled(err)) {
|
|
164
|
-
state.status = "idle";
|
|
165
|
-
state.lastError = undefined;
|
|
166
|
-
applyVoiceChrome(ctx, state);
|
|
167
|
-
return;
|
|
168
|
-
}
|
|
169
|
-
state.status = "error";
|
|
170
|
-
state.lastError = errorMessage(err);
|
|
171
|
-
applyVoiceChrome(ctx, state);
|
|
172
|
-
ctx.ui.notify(`pi-listens could not speak assistant response: ${errorMessage(err)}`, "warning");
|
|
173
|
-
}
|
|
174
|
-
}
|
|
175
|
-
}
|
|
176
|
-
if (!state.enabled || !state.autoListen) { state.status = "idle"; applyVoiceChrome(ctx, state); return; }
|
|
151
|
+
if (!state.autoListen) { state.status = "idle"; applyVoiceChrome(ctx, state); return; }
|
|
152
|
+
// Wait for any in-flight tool-initiated playback to finish before opening the mic
|
|
153
|
+
await waitForPlaybackIdle(services);
|
|
177
154
|
ctx.ui.notify("Listening for your next instruction…", "info");
|
|
178
155
|
await listenAndSend(pi, services, ctx, undefined, { followUpWhenBusy: true });
|
|
179
156
|
}
|
|
@@ -245,27 +222,14 @@ async function listenAndSend(
|
|
|
245
222
|
}
|
|
246
223
|
}
|
|
247
224
|
|
|
248
|
-
async function speakText(services: VoiceToolServices, text: string, signal?: AbortSignal
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
state.speakAbortController?.abort();
|
|
254
|
-
state.speakAbortController = speakAbortController;
|
|
255
|
-
state.status = "speaking";
|
|
256
|
-
if (ctx) applyVoiceChrome(ctx, state);
|
|
257
|
-
}
|
|
225
|
+
async function speakText(services: VoiceToolServices, text: string, signal?: AbortSignal) {
|
|
226
|
+
// Stop any in-flight playback before starting new speech
|
|
227
|
+
services.getAudio().stopPlayback();
|
|
228
|
+
await playSpeechBest(services, text, signal);
|
|
229
|
+
}
|
|
258
230
|
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
} finally {
|
|
262
|
-
speakSignal.cleanup();
|
|
263
|
-
if (state && state.speakAbortController === speakAbortController) state.speakAbortController = undefined;
|
|
264
|
-
if (state && state.status === "speaking") {
|
|
265
|
-
state.status = "idle";
|
|
266
|
-
if (ctx) applyVoiceChrome(ctx, state);
|
|
267
|
-
}
|
|
268
|
-
}
|
|
231
|
+
async function waitForPlaybackIdle(services: VoiceToolServices): Promise<void> {
|
|
232
|
+
await services.getAudio().waitForPlaybackIdle();
|
|
269
233
|
}
|
|
270
234
|
|
|
271
235
|
async function playSpeechBest(services: VoiceToolServices, text: string, signal?: AbortSignal) {
|
|
@@ -304,7 +268,6 @@ function createVoiceUiCallbacks(pi: ExtensionAPI, services: VoiceToolServices, s
|
|
|
304
268
|
disable: () => {
|
|
305
269
|
stopVoiceMode(services, state, ctx);
|
|
306
270
|
},
|
|
307
|
-
toggleSpeak: () => { state.autoSpeakAssistant = !state.autoSpeakAssistant; applyVoiceChrome(ctx, state); },
|
|
308
271
|
toggleAutoListen: () => { state.autoListen = !state.autoListen; applyVoiceChrome(ctx, state); },
|
|
309
272
|
};
|
|
310
273
|
}
|
|
@@ -360,9 +323,24 @@ export function stopVoiceMode(services: VoiceToolServices, state: VoiceModeState
|
|
|
360
323
|
}
|
|
361
324
|
|
|
362
325
|
const serviceState = new WeakMap<VoiceToolServices, VoiceModeState>();
|
|
326
|
+
const serviceCtx = new WeakMap<VoiceToolServices, ExtensionContext>();
|
|
363
327
|
|
|
364
328
|
export function attachStateToServices(services: VoiceToolServices, state: VoiceModeState) {
|
|
365
329
|
serviceState.set(services, state);
|
|
330
|
+
services.notifySpeaking = (speaking) => {
|
|
331
|
+
if (!state.enabled) return;
|
|
332
|
+
if (speaking) {
|
|
333
|
+
state.status = "speaking";
|
|
334
|
+
} else if (state.status === "speaking") {
|
|
335
|
+
state.status = "idle";
|
|
336
|
+
}
|
|
337
|
+
const ctx = serviceCtx.get(services);
|
|
338
|
+
if (ctx) applyVoiceChrome(ctx, state);
|
|
339
|
+
};
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
export function updateServiceContext(services: VoiceToolServices, ctx: ExtensionContext) {
|
|
343
|
+
serviceCtx.set(services, ctx);
|
|
366
344
|
}
|
|
367
345
|
|
|
368
346
|
function getStateFromServices(services: VoiceToolServices): VoiceModeState {
|
package/src/config.ts
CHANGED
|
@@ -32,8 +32,6 @@ export interface PiListensConfig {
|
|
|
32
32
|
audioDir: string;
|
|
33
33
|
deleteAudio: boolean;
|
|
34
34
|
textFallback: boolean;
|
|
35
|
-
autoSpeakAssistant: boolean;
|
|
36
|
-
maxAutoSpeakChars: number;
|
|
37
35
|
}
|
|
38
36
|
|
|
39
37
|
const DEFAULT_CONFIG: PiListensConfig = {
|
|
@@ -59,8 +57,6 @@ const DEFAULT_CONFIG: PiListensConfig = {
|
|
|
59
57
|
audioDir: join(tmpdir(), "pi-listens"),
|
|
60
58
|
deleteAudio: true,
|
|
61
59
|
textFallback: true,
|
|
62
|
-
autoSpeakAssistant: false,
|
|
63
|
-
maxAutoSpeakChars: 320,
|
|
64
60
|
};
|
|
65
61
|
|
|
66
62
|
type RawConfig = Partial<PiListensConfig>;
|
|
@@ -102,8 +98,6 @@ export function resolveConfig(cwd: string): PiListensConfig {
|
|
|
102
98
|
audioDir: env("PI_LISTENS_AUDIO_DIR"),
|
|
103
99
|
deleteAudio: parseBoolean(env("PI_LISTENS_DELETE_AUDIO")),
|
|
104
100
|
textFallback: parseBoolean(env("PI_LISTENS_TEXT_FALLBACK")),
|
|
105
|
-
autoSpeakAssistant: parseBoolean(env("PI_LISTENS_AUTO_SPEAK")),
|
|
106
|
-
maxAutoSpeakChars: parseInteger(env("PI_LISTENS_MAX_AUTO_SPEAK_CHARS")),
|
|
107
101
|
};
|
|
108
102
|
|
|
109
103
|
return mergeDefined(DEFAULT_CONFIG, fileConfig, envConfig);
|
package/src/index.ts
CHANGED
|
@@ -2,9 +2,8 @@ import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
|
|
|
2
2
|
import { createAudioRuntime, type AudioRuntime } from "./audio.js";
|
|
3
3
|
import { maskSecret, resolveConfig, type PiListensConfig } from "./config.js";
|
|
4
4
|
import { SarvamSpeechClient } from "./sarvam.js";
|
|
5
|
-
import { attachStateToServices, maybeContinueVoiceLoop, registerVoiceCommands, stopVoiceMode, type VoiceModeState } from "./commands.js";
|
|
5
|
+
import { attachStateToServices, maybeContinueVoiceLoop, registerVoiceCommands, stopVoiceMode, updateServiceContext, type VoiceModeState } from "./commands.js";
|
|
6
6
|
import { registerVoiceTools, type VoiceToolServices } from "./tools.js";
|
|
7
|
-
import { firstTextContent } from "./text.js";
|
|
8
7
|
|
|
9
8
|
export default function piListensExtension(pi: ExtensionAPI) {
|
|
10
9
|
let config: PiListensConfig = resolveConfig(process.cwd());
|
|
@@ -13,10 +12,8 @@ export default function piListensExtension(pi: ExtensionAPI) {
|
|
|
13
12
|
|
|
14
13
|
const speech = new SarvamSpeechClient(() => config);
|
|
15
14
|
const state: VoiceModeState = {
|
|
16
|
-
|
|
17
15
|
enabled: false,
|
|
18
16
|
autoListen: false,
|
|
19
|
-
autoSpeakAssistant: config.autoSpeakAssistant,
|
|
20
17
|
isListening: false,
|
|
21
18
|
status: "idle",
|
|
22
19
|
recordSeconds: config.recordSeconds,
|
|
@@ -34,7 +31,7 @@ export default function piListensExtension(pi: ExtensionAPI) {
|
|
|
34
31
|
lastCwd = cwd;
|
|
35
32
|
config = resolveConfig(cwd);
|
|
36
33
|
audio = createAudioRuntime(config);
|
|
37
|
-
if (!state.enabled) { state.
|
|
34
|
+
if (!state.enabled) { state.recordSeconds = config.recordSeconds; state.silenceStopSeconds = config.silenceStopSeconds; }
|
|
38
35
|
}
|
|
39
36
|
|
|
40
37
|
registerVoiceTools(pi, services);
|
|
@@ -42,6 +39,7 @@ export default function piListensExtension(pi: ExtensionAPI) {
|
|
|
42
39
|
|
|
43
40
|
pi.on("session_start", async (_event, ctx) => {
|
|
44
41
|
reloadConfig(ctx.cwd);
|
|
42
|
+
updateServiceContext(services, ctx);
|
|
45
43
|
const audioInfo = audio.describe();
|
|
46
44
|
const ready = Boolean(config.apiKey) && audioInfo.recorder !== "missing" && audioInfo.player !== "missing";
|
|
47
45
|
ctx.ui.setStatus("pi-listens", state.enabled ? "voice on" : ready ? "voice ready" : "voice setup needed");
|
|
@@ -52,7 +50,7 @@ export default function piListensExtension(pi: ExtensionAPI) {
|
|
|
52
50
|
`Sarvam API key: ${maskSecret(config.apiKey)}`,
|
|
53
51
|
`Recorder: ${audioInfo.recorder}`,
|
|
54
52
|
`Player: ${audioInfo.player}`,
|
|
55
|
-
"Run /init to create a settings file, or /voice-check for details.",
|
|
53
|
+
"Run /voice-init to create a settings file, or /voice-check for details.",
|
|
56
54
|
].join("\n"),
|
|
57
55
|
"warning",
|
|
58
56
|
);
|
|
@@ -69,12 +67,9 @@ export default function piListensExtension(pi: ExtensionAPI) {
|
|
|
69
67
|
};
|
|
70
68
|
});
|
|
71
69
|
|
|
72
|
-
pi.on("message_end", async (event) => {
|
|
73
|
-
if (event.message.role !== "assistant") return;
|
|
74
|
-
state.lastAssistantText = firstTextContent(event.message);
|
|
75
|
-
});
|
|
76
70
|
|
|
77
71
|
pi.on("agent_end", async (_event, ctx) => {
|
|
72
|
+
updateServiceContext(services, ctx);
|
|
78
73
|
await maybeContinueVoiceLoop(pi, services, state, ctx);
|
|
79
74
|
});
|
|
80
75
|
|
package/src/text.ts
CHANGED
|
@@ -1,46 +1,3 @@
|
|
|
1
|
-
export function firstTextContent(message: unknown): string {
|
|
2
|
-
if (!message || typeof message !== "object") return "";
|
|
3
|
-
const content = (message as { content?: unknown }).content;
|
|
4
|
-
if (typeof content === "string") return content;
|
|
5
|
-
if (!Array.isArray(content)) return "";
|
|
6
|
-
return content
|
|
7
|
-
.map((part) => {
|
|
8
|
-
if (!part || typeof part !== "object") return "";
|
|
9
|
-
const p = part as { type?: string; text?: string };
|
|
10
|
-
return p.type === "text" && typeof p.text === "string" ? p.text : "";
|
|
11
|
-
})
|
|
12
|
-
.filter(Boolean)
|
|
13
|
-
.join("\n")
|
|
14
|
-
.trim();
|
|
15
|
-
}
|
|
16
|
-
|
|
17
|
-
export function prepareSpokenText(text: string, maxChars: number): string {
|
|
18
|
-
let prepared = text
|
|
19
|
-
.replace(/```[\s\S]*?```/g, " I skipped a code block. ")
|
|
20
|
-
.replace(/^\s{0,3}#{1,6}\s+/gm, "")
|
|
21
|
-
.replace(/^\s*[-*+]\s+/gm, "")
|
|
22
|
-
.replace(/^\s*\d+[.)]\s+/gm, "")
|
|
23
|
-
.replace(/`([^`]+)`/g, "$1")
|
|
24
|
-
.replace(/https?:\/\/\S+/g, "link")
|
|
25
|
-
.replace(/[#*_>~|]+/g, " ")
|
|
26
|
-
.replace(/\s+/g, " ")
|
|
27
|
-
.trim();
|
|
28
|
-
|
|
29
|
-
prepared = conciseSpokenSummary(prepared);
|
|
30
|
-
if (prepared.length > maxChars) {
|
|
31
|
-
prepared = `${prepared.slice(0, Math.max(0, maxChars - 32)).trim()}… More on screen.`;
|
|
32
|
-
}
|
|
33
|
-
return prepared;
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
function conciseSpokenSummary(text: string): string {
|
|
37
|
-
const sentences = text.match(/[^.!?]+[.!?]+|[^.!?]+$/g)?.map((part) => part.trim()).filter(Boolean) ?? [];
|
|
38
|
-
if (sentences.length === 0) return text;
|
|
39
|
-
|
|
40
|
-
const useful = sentences.filter((sentence) => !/^(sure|here('|’)s|summary|in summary|done|completed|i('|’)ve|i have)\b/i.test(sentence));
|
|
41
|
-
const picked = (useful.length ? useful : sentences).slice(0, 2).join(" ").trim();
|
|
42
|
-
return picked || text;
|
|
43
|
-
}
|
|
44
1
|
|
|
45
2
|
export function conciseTranscript(transcript: string): string {
|
|
46
3
|
const trimmed = transcript.trim();
|
package/src/tools.ts
CHANGED
|
@@ -13,6 +13,7 @@ export interface VoiceToolServices {
|
|
|
13
13
|
getConfig: () => PiListensConfig;
|
|
14
14
|
getAudio: () => AudioRuntime;
|
|
15
15
|
getSpeech: () => SarvamSpeechClient;
|
|
16
|
+
notifySpeaking?: (speaking: boolean) => void;
|
|
16
17
|
}
|
|
17
18
|
|
|
18
19
|
const VoiceOutputParams = Type.Object({
|
|
@@ -54,21 +55,28 @@ export function registerVoiceTools(pi: ExtensionAPI, services: VoiceToolServices
|
|
|
54
55
|
],
|
|
55
56
|
parameters: VoiceOutputParams,
|
|
56
57
|
async execute(_toolCallId, params: VoiceOutputInput, signal, onUpdate) {
|
|
58
|
+
// Stop any in-flight playback before starting new speech
|
|
59
|
+
services.getAudio().stopPlayback();
|
|
60
|
+
services.notifySpeaking?.(true);
|
|
57
61
|
onUpdate?.({ content: [{ type: "text", text: "Starting streamed speech with Sarvam AI…" }], details: {} });
|
|
58
62
|
const playback = playSpeechBest(params.text, services, signal);
|
|
59
63
|
if (params.wait_for_playback !== true) {
|
|
60
|
-
void playback.
|
|
64
|
+
void playback.then(() => services.notifySpeaking?.(false), () => services.notifySpeaking?.(false));
|
|
61
65
|
return {
|
|
62
66
|
content: [{ type: "text", text: `Started speaking to user: ${params.text}` }],
|
|
63
67
|
details: { played: "started", text: params.text },
|
|
64
68
|
};
|
|
65
69
|
}
|
|
66
70
|
onUpdate?.({ content: [{ type: "text", text: "Playing audio…" }], details: {} });
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
71
|
+
try {
|
|
72
|
+
const details = await playback;
|
|
73
|
+
return {
|
|
74
|
+
content: [{ type: "text", text: `Spoke to user: ${params.text}` }],
|
|
75
|
+
details: { ...details, played: true, text: params.text },
|
|
76
|
+
};
|
|
77
|
+
} finally {
|
|
78
|
+
services.notifySpeaking?.(false);
|
|
79
|
+
}
|
|
72
80
|
},
|
|
73
81
|
renderCall(args: VoiceOutputInput, theme) {
|
|
74
82
|
return new Text(`${theme.fg("toolTitle", theme.bold("voice_output "))}${theme.fg("muted", quote(args.text))}`, 0, 0);
|
|
@@ -116,7 +124,13 @@ export function registerVoiceTools(pi: ExtensionAPI, services: VoiceToolServices
|
|
|
116
124
|
parameters: VoiceAskParams,
|
|
117
125
|
async execute(_toolCallId, params: VoiceAskInput, signal, onUpdate, ctx) {
|
|
118
126
|
onUpdate?.({ content: [{ type: "text", text: "Speaking question…" }], details: {} });
|
|
119
|
-
|
|
127
|
+
services.getAudio().stopPlayback();
|
|
128
|
+
services.notifySpeaking?.(true);
|
|
129
|
+
try {
|
|
130
|
+
await playSpeechBest(params.question, services, signal);
|
|
131
|
+
} finally {
|
|
132
|
+
services.notifySpeaking?.(false);
|
|
133
|
+
}
|
|
120
134
|
const answer = await listenAndMaybeFallback(
|
|
121
135
|
params,
|
|
122
136
|
services,
|
package/src/voice-ui.ts
CHANGED
|
@@ -7,7 +7,6 @@ type EditorFactory = ReturnType<ExtensionContext["ui"]["getEditorComponent"]>;
|
|
|
7
7
|
export interface VoiceUiCallbacks {
|
|
8
8
|
startListening: () => void;
|
|
9
9
|
disable: () => void;
|
|
10
|
-
toggleSpeak: () => void;
|
|
11
10
|
toggleAutoListen: () => void;
|
|
12
11
|
}
|
|
13
12
|
|
|
@@ -46,15 +45,19 @@ export function applyVoiceChrome(ctx: ExtensionContext, state: VoiceModeState) {
|
|
|
46
45
|
? "listening…"
|
|
47
46
|
: state.status === "agent"
|
|
48
47
|
? "agent working"
|
|
49
|
-
: state.
|
|
50
|
-
? "
|
|
48
|
+
: state.status === "speaking"
|
|
49
|
+
? "speaking…"
|
|
51
50
|
: "voice on"
|
|
52
51
|
: "voice ready";
|
|
53
52
|
ctx.ui.setStatus("pi-listens", status);
|
|
54
53
|
if (!state.enabled) return;
|
|
55
54
|
ctx.ui.setWorkingIndicator({
|
|
56
|
-
frames: state.status === "listening"
|
|
57
|
-
|
|
55
|
+
frames: state.status === "listening"
|
|
56
|
+
? [ctx.ui.theme.fg("accent", "●"), ctx.ui.theme.fg("muted", "•")]
|
|
57
|
+
: state.status === "speaking"
|
|
58
|
+
? [ctx.ui.theme.fg("accent", "♪"), ctx.ui.theme.fg("muted", "♫")]
|
|
59
|
+
: [ctx.ui.theme.fg("accent", "◌")],
|
|
60
|
+
intervalMs: state.status === "speaking" ? 200 : 250,
|
|
58
61
|
});
|
|
59
62
|
}
|
|
60
63
|
|
|
@@ -93,11 +96,6 @@ class VoiceLoopEditor extends CustomEditor {
|
|
|
93
96
|
this.callbacks.startListening();
|
|
94
97
|
return;
|
|
95
98
|
}
|
|
96
|
-
if (data.toLowerCase() === "s") {
|
|
97
|
-
this.triggerOrbClick(0.5, -0.18, 0.12);
|
|
98
|
-
this.callbacks.toggleSpeak();
|
|
99
|
-
return;
|
|
100
|
-
}
|
|
101
99
|
if (data.toLowerCase() === "a") {
|
|
102
100
|
this.triggerOrbClick(0.65, 0.18, 0.1);
|
|
103
101
|
this.callbacks.toggleAutoListen();
|
|
@@ -311,7 +309,6 @@ function controlRail(state: VoiceModeState, palette: OrbPalette, width: number):
|
|
|
311
309
|
const pills = [
|
|
312
310
|
controlPill("Space", listenLabel, state.isListening ? "active" : "primary", palette),
|
|
313
311
|
controlPill("A", state.autoListen ? "auto-listen on" : "auto-listen off", state.autoListen ? "active" : "muted", palette),
|
|
314
|
-
controlPill("S", state.autoSpeakAssistant ? "read aloud on" : "read aloud off", state.autoSpeakAssistant ? "active" : "muted", palette),
|
|
315
312
|
controlPill("Q", "close", "danger", palette),
|
|
316
313
|
];
|
|
317
314
|
return wrapInline(pills, " ", Math.max(24, width - 2));
|