pi-voice-input 0.2.10 → 0.2.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -27,6 +27,7 @@ pi extension: extensions/index.ts → extensions/voice-input.ts
27
27
  │ ├─ Linux preferred: pw-record
28
28
  │ ├─ Linux fallback: arecord
29
29
  │ └─ macOS: afrecord, or ffmpeg/AVFoundation fallback
30
+ ├─ ducks system output volume while the microphone is listening
30
31
  ├─ records a temporary 16 kHz mono 16-bit WAV
31
32
  ├─ parses the WAV container in TypeScript and extracts raw PCM
32
33
  ├─ sends PCM frames to the configured ASR provider via ws
@@ -108,12 +109,17 @@ The config file is plain JSON and can be edited directly:
108
109
  ```json
109
110
  {
110
111
  "volcApiKey": "",
111
- "polishModel": ""
112
+ "polishModel": "",
113
+ "duckSystemVolume": true,
114
+ "duckSystemVolumeFactor": 0.5,
115
+ "duckSystemVolumeFadeMs": 300
112
116
  }
113
117
  ```
114
118
 
115
119
  `polishModel` is disabled by default. Set it to any model shown by `pi --list-models` to enable transcript polish. If polishing fails, the raw ASR transcript is inserted instead.
116
120
 
121
+ `duckSystemVolume` is enabled by default. While recording, the extension lowers system output volume to `duckSystemVolumeFactor` times the original volume, then restores the saved volume when recording stops or is cancelled. `duckSystemVolumeFactor` is clamped to 0–1, and a value of 1 disables ducking. Both transitions use a short ease-in/ease-out fade lasting `duckSystemVolumeFadeMs` milliseconds (clamped to 0–3000; 0 switches instantly). Linux uses `wpctl` or `pactl`; macOS uses `osascript`.
122
+
117
123
  Verify the effective non-secret config:
118
124
 
119
125
  ```text
@@ -151,6 +157,7 @@ Slash commands:
151
157
  - On startup, legacy `~/.pi/agent/voice-input/recordings` and `~/.pi/agent/voice-input/logs` artifacts are cleaned up when they are not part of an active recording.
152
158
  - When `polishModel` is set, polishing uses the unsent editor draft and recent session messages as context, but outputs only the refined voice text to insert at the current cursor. It must not reconstruct the full draft; the final text is pasted without replacing existing editor content.
153
159
  - While recording, the status line shows `● Mic on: [device name] — press Ctrl+Shift+R again to stop/transcribe` in the current theme accent color; no separate popup is shown when recording starts.
160
+ - By default, system output volume is ducked to 50% of its previous level with a 300 ms ease-in/ease-out fade while the microphone is listening, then restored after recording stops.
154
161
 
155
162
  ## Development
156
163
 
@@ -40,7 +40,9 @@ Rules:
40
40
  - Preserve the user's information faithfully. Do not over-summarize or compress. Do not delete constraints, examples, numbers, filenames, errors, multiple requests, ordering, or emphasis.
41
41
  - Correct obvious ASR mistakes, homophones, segmentation, and punctuation. Preserve code identifiers, commands, paths, URLs, model names, package names, and proper nouns.
42
42
  - If the user self-corrects, keep only the corrected intent and remove the false start, correction process, filler, and chatter. Do not lose any other substantive information.
43
- - Make the output complete relative to the raw speech, logically clear, and actionable. Split into items or steps when helpful, but do not drop raw-speech information or repeat existing draft text.
43
+ - Make the output complete relative to the raw speech, logically clear, and actionable, but do not drop raw-speech information or repeat existing draft text.
44
+ - Preserve the raw speech layout. If the raw speech is a single line, output a single line unless the user explicitly dictates line breaks or another multiline layout, for example by saying "new line" or "换行".
45
+ - Do not introduce line breaks, bullets, numbered lists, tables, or code fences merely to improve style.
44
46
  - Do not invent requirements that the raw speech did not express. If uncertain, keep the original meaning and express it more clearly.
45
47
  - The output language must match the primary language of the raw speech, not the context language and not this English prompt. Do not translate just because the instructions are in English.`;
46
48
 
@@ -59,6 +61,9 @@ type JsonObject = Record<string, unknown>;
59
61
  type VoiceInputConfigFile = {
60
62
  volcApiKey: string;
61
63
  polishModel: string;
64
+ duckSystemVolume: boolean;
65
+ duckSystemVolumeFactor: number;
66
+ duckSystemVolumeFadeMs: number;
62
67
  };
63
68
 
64
69
  type VoiceConfig = {
@@ -84,6 +89,17 @@ type VoiceConfig = {
84
89
  postprocessTimeoutMs: number;
85
90
  postprocessMaxTokens: number;
86
91
  postprocessContextChars: number;
92
+ duckSystemVolume: boolean;
93
+ duckSystemVolumeFactor: number;
94
+ duckSystemVolumeFadeMs: number;
95
+ };
96
+
97
/**
 * Snapshot of a volume-ducking operation. It is stored alongside the
 * recording state so the original volume can be restored later.
 */
type SystemVolumeDuckingState = {
  /** Which volume-control backend read the volume and will be used to set it. */
  provider: "macos" | "wpctl" | "pactl";
  /** Output volume, in percent, captured before ducking. */
  originalVolumePercent: number;
  /** Target output volume, in percent, while recording. */
  duckedVolumePercent: number;
  /** Configured ducking factor used to derive the ducked volume. */
  factor: number;
  /** Fade duration in milliseconds for both ducking and restoring. */
  fadeMs: number;
};
88
104
 
89
105
  type RecordingState = {
@@ -93,6 +109,7 @@ type RecordingState = {
93
109
  startedAt: string;
94
110
  recorderTarget?: string;
95
111
  deviceName?: string;
112
+ systemVolume?: SystemVolumeDuckingState;
96
113
  };
97
114
 
98
115
  type DecodedFrame = {
@@ -122,6 +139,9 @@ function defaultConfigFile(): VoiceInputConfigFile {
122
139
  return {
123
140
  volcApiKey: "",
124
141
  polishModel: DEFAULT_POSTPROCESS_MODEL,
142
+ duckSystemVolume: true,
143
+ duckSystemVolumeFactor: 0.5,
144
+ duckSystemVolumeFadeMs: 300,
125
145
  };
126
146
  }
127
147
 
@@ -134,12 +154,29 @@ function stringField(source: JsonObject, name: string, fallback: string): string
134
154
  return typeof value === "string" ? value : fallback;
135
155
  }
136
156
 
157
/**
 * Reads `name` from `source` as a boolean.
 *
 * @returns the stored value when it is a real boolean, otherwise `fallback`.
 */
function booleanField(source: JsonObject, name: string, fallback: boolean): boolean {
  const candidate = source[name];
  if (typeof candidate !== "boolean") return fallback;
  return candidate;
}
161
+
162
/**
 * Reads `name` from `source` as a finite number.
 *
 * @returns the stored value when it is a finite number; `fallback` for
 *          non-numbers, `NaN`, and infinities.
 */
function numberField(source: JsonObject, name: string, fallback: number): number {
  const candidate = source[name];
  if (typeof candidate !== "number") return fallback;
  return Number.isFinite(candidate) ? candidate : fallback;
}
166
+
167
/** Limits `value` to the inclusive range `[min, max]`. */
function clamp(value: number, min: number, max: number): number {
  const raisedToMin = Math.max(min, value);
  return Math.min(max, raisedToMin);
}
170
+
137
171
  function normalizeConfigFile(input: unknown): VoiceInputConfigFile {
138
172
  const defaults = defaultConfigFile();
139
173
  const root = isObject(input) ? input : {};
140
174
  return {
141
175
  volcApiKey: stringField(root, "volcApiKey", defaults.volcApiKey).trim(),
142
176
  polishModel: stringField(root, "polishModel", defaults.polishModel).trim(),
177
+ duckSystemVolume: booleanField(root, "duckSystemVolume", defaults.duckSystemVolume),
178
+ duckSystemVolumeFactor: clamp(numberField(root, "duckSystemVolumeFactor", defaults.duckSystemVolumeFactor), 0, 1),
179
+ duckSystemVolumeFadeMs: Math.round(clamp(numberField(root, "duckSystemVolumeFadeMs", defaults.duckSystemVolumeFadeMs), 0, 3000)),
143
180
  };
144
181
  }
145
182
 
@@ -186,6 +223,9 @@ function getConfig(): VoiceConfig {
186
223
  postprocessTimeoutMs: 30000,
187
224
  postprocessMaxTokens: 2048,
188
225
  postprocessContextChars: 6000,
226
+ duckSystemVolume: fileConfig.duckSystemVolume,
227
+ duckSystemVolumeFactor: fileConfig.duckSystemVolumeFactor,
228
+ duckSystemVolumeFadeMs: fileConfig.duckSystemVolumeFadeMs,
189
229
  };
190
230
  }
191
231
 
@@ -216,6 +256,111 @@ function commandOutput(command: string, args: string[], timeoutMs = 1500): strin
216
256
  return (result.stdout || "").trim();
217
257
  }
218
258
 
259
/**
 * Runs a command synchronously with all stdio ignored.
 *
 * @param timeoutMs limit passed to `spawnSync` as its `timeout` option.
 * @returns `true` only when the process reports exit status 0.
 */
function runCommand(command: string, args: string[], timeoutMs = 1500): boolean {
  const { status } = spawnSync(command, args, { stdio: "ignore", timeout: timeoutMs });
  return status === 0;
}
262
+
263
/**
 * Formats a percentage with at most two decimal places and no trailing
 * zeros (for example `50`, `12.5`, `33.33`).
 */
function formatPercent(value: number): string {
  const roundedToHundredths = Number(value.toFixed(2));
  return roundedToHundredths.toString();
}
266
+
267
/**
 * Reads the current system output volume and reports which backend produced it.
 *
 * - macOS: asks `osascript` for `output volume of (get volume settings)` and
 *   clamps the result to 0..100.
 * - Linux: tries `wpctl get-volume @DEFAULT_AUDIO_SINK@` first (the value after
 *   `Volume:` is multiplied by 100), then falls back to
 *   `pactl get-sink-volume @DEFAULT_SINK@` (first `NN%` in the output).
 *
 * @returns the provider and volume in percent, or `null` when the platform is
 *          unsupported, no tool is available, or the output cannot be parsed.
 */
function readSystemOutputVolume(): Pick<SystemVolumeDuckingState, "provider" | "originalVolumePercent"> | null {
  if (platform() === "darwin") {
    if (!commandExists("osascript")) return null;
    const output = commandOutput("osascript", ["-e", "output volume of (get volume settings)"]).trim();
    // Number("") is 0, so an empty result (failed or silent osascript) would be
    // mistaken for a real volume of 0 and later written back to the system.
    if (output === "") return null;
    const volume = Number(output);
    return Number.isFinite(volume) ? { provider: "macos", originalVolumePercent: clamp(volume, 0, 100) } : null;
  }

  if (platform() !== "linux") return null;

  if (commandExists("wpctl")) {
    const output = commandOutput("wpctl", ["get-volume", "@DEFAULT_AUDIO_SINK@"]);
    const match = output.match(/Volume:\s*([0-9.]+)/);
    // The captured value is scaled by 100 to express it in percent.
    const volume = match ? Number(match[1]) * 100 : NaN;
    if (Number.isFinite(volume)) return { provider: "wpctl", originalVolumePercent: Math.max(0, volume) };
  }

  // Reached when wpctl is missing or its output could not be parsed.
  if (commandExists("pactl")) {
    const output = commandOutput("pactl", ["get-sink-volume", "@DEFAULT_SINK@"]);
    const match = output.match(/([0-9]+(?:\.[0-9]+)?)%/);
    const volume = match ? Number(match[1]) : NaN;
    if (Number.isFinite(volume)) return { provider: "pactl", originalVolumePercent: Math.max(0, volume) };
  }

  return null;
}
293
+
294
/**
 * Sets the system output volume through the backend named in `state.provider`.
 *
 * macOS receives an integer clamped to 0..100. The Linux backends receive a
 * non-negative percentage formatted by `formatPercent`, with no upper clamp.
 *
 * @returns `true` when the underlying command exits with status 0.
 */
function setSystemOutputVolume(state: Pick<SystemVolumeDuckingState, "provider">, volumePercent: number): boolean {
  switch (state.provider) {
    case "macos": {
      const wholePercent = Math.round(clamp(volumePercent, 0, 100));
      return runCommand("osascript", ["-e", `set volume output volume ${wholePercent}`]);
    }
    case "wpctl": {
      const safePercent = Math.max(0, volumePercent);
      return runCommand("wpctl", ["set-volume", "@DEFAULT_AUDIO_SINK@", `${formatPercent(safePercent)}%`]);
    }
    default: {
      const safePercent = Math.max(0, volumePercent);
      return runCommand("pactl", ["set-sink-volume", "@DEFAULT_SINK@", `${formatPercent(safePercent)}%`]);
    }
  }
}
306
+
307
/**
 * Cosine ease-in/ease-out curve. `t` is limited to 0..1 before evaluation,
 * so the result runs from 0 (at `t <= 0`) to 1 (at `t >= 1`).
 */
function easeInOut(t: number): number {
  const progress = Math.min(1, Math.max(0, t));
  const cosine = Math.cos(Math.PI * progress);
  return 0.5 - cosine / 2;
}
310
+
311
/**
 * Moves the system output volume from `fromPercent` to `toPercent` along the
 * `easeInOut` curve.
 *
 * When `fadeMs` is not positive, or the two volumes differ by less than 0.1,
 * the target is applied in a single call. Otherwise the fade uses between 2
 * and 20 steps (about one per 30 ms), sleeping between steps but not after the
 * last one.
 *
 * @returns `null` on success, or an error message as soon as a set call fails.
 */
async function fadeSystemOutputVolume(
  state: Pick<SystemVolumeDuckingState, "provider">,
  fromPercent: number,
  toPercent: number,
  fadeMs: number,
): Promise<string | null> {
  const failure = "failed to set system output volume";

  const applyImmediately = fadeMs <= 0 || Math.abs(fromPercent - toPercent) < 0.1;
  if (applyImmediately) {
    return setSystemOutputVolume(state, toPercent) ? null : failure;
  }

  const steps = Math.max(2, Math.min(20, Math.round(fadeMs / 30)));
  const intervalMs = fadeMs / steps;
  let step = 0;
  while (step < steps) {
    step += 1;
    const eased = easeInOut(step / steps);
    const volume = fromPercent + (toPercent - fromPercent) * eased;
    if (!setSystemOutputVolume(state, volume)) return failure;
    const isLastStep = step >= steps;
    if (!isLastStep) await sleep(intervalMs);
  }
  return null;
}
331
+
332
/**
 * Captures the current output volume and derives the ducking plan for one
 * recording.
 *
 * @returns an empty object when ducking is disabled or the factor is 1 or
 *          more; a `warning` when no volume could be read; otherwise the
 *          `state` to persist and apply.
 */
function createSystemVolumeDuckingState(config: VoiceConfig): { state?: SystemVolumeDuckingState; warning?: string } {
  const factor = config.duckSystemVolumeFactor;
  if (factor >= 1 || !config.duckSystemVolume) return {};

  const snapshot = readSystemOutputVolume();
  if (snapshot === null) {
    return { warning: "system output volume ducking is enabled, but no supported volume control was found" };
  }

  const state: SystemVolumeDuckingState = {
    provider: snapshot.provider,
    originalVolumePercent: snapshot.originalVolumePercent,
    duckedVolumePercent: snapshot.originalVolumePercent * factor,
    factor,
    fadeMs: config.duckSystemVolumeFadeMs,
  };
  return { state };
}
346
+
347
/**
 * Fades the output volume from the saved original down to the ducked level.
 *
 * @returns `null` when there is no state or the fade succeeded, otherwise a
 *          warning message describing the failure.
 */
async function applySystemVolumeDucking(state?: SystemVolumeDuckingState): Promise<string | null> {
  if (!state) return null;
  const { originalVolumePercent, duckedVolumePercent, fadeMs } = state;
  const failure = await fadeSystemOutputVolume(state, originalVolumePercent, duckedVolumePercent, fadeMs);
  if (!failure) return null;
  return `system output volume ducking failed: ${failure}`;
}
352
+
353
/**
 * Fades the output volume from the ducked level back up to the saved original.
 *
 * @returns `null` when there is no state or the fade succeeded, otherwise a
 *          warning message describing the failure.
 */
async function restoreSystemOutputVolume(state?: SystemVolumeDuckingState): Promise<string | null> {
  if (!state) return null;
  const { originalVolumePercent, duckedVolumePercent, fadeMs } = state;
  const failure = await fadeSystemOutputVolume(state, duckedVolumePercent, originalVolumePercent, fadeMs);
  if (!failure) return null;
  return `system output volume restore failed: ${failure}`;
}
358
+
359
/**
 * Synchronously sets the output volume back to the saved original, without a
 * fade.
 *
 * @returns `null` when there is no state or the set call succeeded, otherwise
 *          a warning message.
 */
function restoreSystemOutputVolumeNow(state?: SystemVolumeDuckingState): string | null {
  if (!state) return null;
  const restored = setSystemOutputVolume(state, state.originalVolumePercent);
  if (restored) return null;
  return "system output volume restore failed";
}
363
+
219
364
  function selectRecorderExecutable(): string {
220
365
  if (platform() === "darwin") {
221
366
  if (commandExists("afrecord")) return "afrecord";
@@ -924,7 +1069,18 @@ function resolvePostprocessModel(ctx: ExtensionContext, reference: string): Mode
924
1069
  }
925
1070
 
926
1071
  function extractAssistantText(message: { content: unknown }): string {
927
- return textFromContent(message.content).trim();
1072
+ const content = message.content;
1073
+ if (typeof content === "string") return content.trim();
1074
+ if (!Array.isArray(content)) return "";
1075
+ return content
1076
+ .map((part) => {
1077
+ if (!part || typeof part !== "object") return "";
1078
+ const block = part as { type?: unknown; text?: unknown };
1079
+ if (block.type === "text" && typeof block.text === "string") return block.text;
1080
+ return "";
1081
+ })
1082
+ .join("")
1083
+ .trim();
928
1084
  }
929
1085
 
930
1086
  function cleanPostprocessOutput(output: string): string {
@@ -935,6 +1091,27 @@ function cleanPostprocessOutput(output: string): string {
935
1091
  return text;
936
1092
  }
937
1093
 
1094
/**
 * Reports whether the raw transcript calls for a multi-line result: it
 * already contains a line break, or it contains one of the English or
 * Chinese phrases that dictate a new line or paragraph.
 */
function rawTextRequestsMultiline(rawText: string): boolean {
  const hasLineBreak = /\r|\n/;
  if (hasLineBreak.test(rawText)) return true;

  const englishCue =
    /\b(?:new\s*line|newline|line break|next line|new paragraph|paragraph break|carriage return|press enter|separate lines?|multi[- ]line|multiple lines)\b/i;
  if (englishCue.test(rawText)) return true;

  const chineseCue = /(?:换行|新的一行|另起一行|下一行|回车|分行|多行|逐行|每行|空一行|新段落|另起一段|分段)/u;
  return chineseCue.test(rawText);
}
1101
+
1102
/**
 * Flattens text onto a single line: CR and CRLF become LF, each run of line
 * breaks (with surrounding spaces/tabs) becomes one space, remaining runs of
 * horizontal whitespace shrink to one space, and the result is trimmed.
 */
function collapseUnexpectedLineBreaks(text: string): string {
  const unixNewlines = text.replace(/\r\n?/g, "\n");
  const singleLine = unixNewlines.replace(/[ \t\f\v]*\n+[ \t\f\v]*/g, " ");
  const singleSpaced = singleLine.replace(/[ \t\f\v]{2,}/g, " ");
  return singleSpaced.trim();
}
1109
+
1110
/**
 * Keeps the polished output's line breaks only when the raw transcript asked
 * for a multi-line layout; otherwise collapses the output onto one line.
 */
function preserveExpectedPostprocessLayout(rawText: string, output: string): string {
  const keepLineBreaks = rawTextRequestsMultiline(rawText);
  return keepLineBreaks ? output.trim() : collapseUnexpectedLineBreaks(output);
}
1114
+
938
1115
  function removeEditorDraftEcho(editorText: string, output: string): string {
939
1116
  const draft = editorText.trim();
940
1117
  const text = output.trim();
@@ -981,6 +1158,7 @@ function buildPostprocessPrompt(ctx: ExtensionContext, rawText: string, config:
981
1158
  "IMPORTANT: your output will be pasted verbatim at the current cursor position. It is not a replacement and not a rewrite of the whole editor draft.",
982
1159
  "The current editor draft is context only. Do not rewrite, repeat, complete, delete, or replace existing draft text. Do not output the full sentence after insertion.",
983
1160
  "The true cursor position is not marked in the draft shown here; the pi editor owns the actual insertion point. Do not guess the cursor and synthesize a full surrounding sentence.",
1161
+ "Preserve layout: if the raw ASR text is one line, output one line unless the user explicitly dictated line breaks or another multiline layout.",
984
1162
  "If the raw speech is an inline insertion, continuation, a few words, or a phrase, output only the newly spoken words or phrase.",
985
1163
  "Example: draft is `Please make this function async and [cursor].`, raw speech is `add error handling`, correct output is `add error handling`, not `Please make this function async and add error handling.`.",
986
1164
  "Example: draft is `This variable name is [cursor]unclear`, raw speech is `still`, correct output is `still`, not `This variable name is still unclear`.",
@@ -1038,7 +1216,9 @@ async function postprocessTranscript(ctx: ExtensionContext, rawText: string, con
1038
1216
  }
1039
1217
 
1040
1218
  const polished = cleanPostprocessOutput(extractAssistantText(response));
1041
- return polished ? removeEditorDraftEcho(getFullEditorText(ctx), polished) : rawText;
1219
+ if (!polished) return rawText;
1220
+ const insertion = removeEditorDraftEcho(getFullEditorText(ctx), polished);
1221
+ return preserveExpectedPostprocessLayout(raw, insertion) || rawText;
1042
1222
  }
1043
1223
 
1044
1224
  function insertIntoEditor(ctx: ExtensionContext, text: string) {
@@ -1055,9 +1235,10 @@ async function isRecording(config: VoiceConfig): Promise<boolean> {
1055
1235
  function cleanupStaleRecordingState(config: VoiceConfig): string[] {
1056
1236
  const state = readState(config);
1057
1237
  if (!state || pidAlive(state.pid)) return [];
1238
+ const volumeWarning = restoreSystemOutputVolumeNow(state.systemVolume);
1058
1239
  const cleanupWarnings = cleanupRecordingArtifacts(state);
1059
1240
  clearState(config);
1060
- return cleanupWarnings;
1241
+ return [volumeWarning, ...cleanupWarnings].filter((message): message is string => Boolean(message));
1061
1242
  }
1062
1243
 
1063
1244
  function requireInteractiveUi(ctx: ExtensionContext, action: string): boolean {
@@ -1091,6 +1272,7 @@ async function startRecording(ctx: ExtensionContext) {
1091
1272
  throw error;
1092
1273
  }
1093
1274
  const deviceName = recordingDeviceName(config, cmd[0]);
1275
+ const volumeDucking = createSystemVolumeDuckingState(config);
1094
1276
 
1095
1277
  ctx.ui.setStatus("voice-input", ctx.ui.theme.fg("warning", "● starting mic"));
1096
1278
  let child: ReturnType<typeof spawn>;
@@ -1115,7 +1297,11 @@ async function startRecording(ctx: ExtensionContext) {
1115
1297
  startedAt: new Date().toISOString(),
1116
1298
  recorderTarget: config.recorderTarget || undefined,
1117
1299
  deviceName,
1300
+ systemVolume: volumeDucking.state,
1118
1301
  });
1302
+ if (volumeDucking.warning) ctx.ui.notify(`Voice input warning: ${volumeDucking.warning}`, "warning");
1303
+ const duckingWarning = await applySystemVolumeDucking(volumeDucking.state);
1304
+ if (duckingWarning) ctx.ui.notify(`Voice input warning: ${duckingWarning}`, "warning");
1119
1305
 
1120
1306
  ctx.ui.setStatus("voice-input", ctx.ui.theme.fg("accent", recordingStatusText(deviceName)));
1121
1307
  }
@@ -1132,7 +1318,9 @@ async function stopRecording(ctx: ExtensionContext, transcribe = true) {
1132
1318
 
1133
1319
  ctx.ui.setStatus("voice-input", ctx.ui.theme.fg("warning", transcribe ? "● transcribing" : "● stopping"));
1134
1320
  if (pidAlive(state.pid)) await stopProcessGroup(state.pid);
1321
+ const volumeRestoreWarning = await restoreSystemOutputVolume(state.systemVolume);
1135
1322
  clearState(config);
1323
+ if (volumeRestoreWarning) ctx.ui.notify(`Voice input warning: ${volumeRestoreWarning}`, "warning");
1136
1324
  if (config.finalizeDelayMs > 0) await sleep(config.finalizeDelayMs);
1137
1325
 
1138
1326
  if (!transcribe) {
@@ -1174,8 +1362,8 @@ async function stopRecording(ctx: ExtensionContext, transcribe = true) {
1174
1362
  if (!result.text.trim()) {
1175
1363
  ctx.ui.setStatus("voice-input", undefined);
1176
1364
  ctx.ui.notify(
1177
- `Transcription finished but no text was returned. audio=${(durationMs / 1000).toFixed(2)}s total=${result.timings.totalMs}ms`,
1178
- "warning",
1365
+ `No speech detected. audio=${(durationMs / 1000).toFixed(2)}s total=${result.timings.totalMs}ms`,
1366
+ "info",
1179
1367
  );
1180
1368
  return;
1181
1369
  }
@@ -1225,6 +1413,7 @@ function setupHelp(config = getConfig()): string {
1225
1413
  "- To create/update the JSON config file, run: /voice init",
1226
1414
  "- To save/update the key, run: /voice key",
1227
1415
  `- Polish: ${config.postprocessEnabled ? config.postprocessModel : "disabled"}`,
1416
+ `- System volume ducking: ${config.duckSystemVolume ? `${Math.round(config.duckSystemVolumeFactor * 100)}% over ${config.duckSystemVolumeFadeMs}ms` : "disabled"}`,
1228
1417
  `- Get/create a VolcEngine Speech API key here: ${VOLC_API_KEY_URL}`,
1229
1418
  "- After saving the key, run: /voice config",
1230
1419
  ].join("\n");
@@ -1261,8 +1450,11 @@ function configSummary(config: VoiceConfig): string {
1261
1450
  `- config file: ${config.configPath}${existsSync(config.configPath) ? "" : " (missing; run /voice init to create it)"}`,
1262
1451
  `- volcApiKey: ${config.apiKey ? "set" : "missing"} (update with /voice key)`,
1263
1452
  `- polishModel: ${config.postprocessEnabled ? config.postprocessModel : "disabled"}`,
1453
+ `- duckSystemVolume: ${config.duckSystemVolume ? "enabled" : "disabled"}`,
1454
+ `- duckSystemVolumeFactor: ${config.duckSystemVolumeFactor}`,
1455
+ `- duckSystemVolumeFadeMs: ${config.duckSystemVolumeFadeMs}`,
1264
1456
  `- current recording device: ${currentDevice}`,
1265
- "Config keys: volcApiKey, polishModel. Leave polishModel empty to disable polish.",
1457
+ "Config keys: volcApiKey, polishModel, duckSystemVolume, duckSystemVolumeFactor, duckSystemVolumeFadeMs. Leave polishModel empty to disable polish.",
1266
1458
  `VolcEngine API key URL: ${VOLC_API_KEY_URL}`,
1267
1459
  ].join("\n");
1268
1460
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pi-voice-input",
3
- "version": "0.2.10",
3
+ "version": "0.2.12",
4
4
  "description": "Press Ctrl+Shift+R to dictate prompts into Pi using VolcEngine ASR",
5
5
  "type": "module",
6
6
  "keywords": [