pi-voice-input 0.2.7 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -10,13 +10,13 @@ A publishable, pure TypeScript [pi](https://pi.dev/) extension for Linux and mac
10
10
  Current scope:
11
11
 
12
12
  - Linux uses `pw-record` from PipeWire tools or `arecord` from alsa-utils.
13
- - macOS uses the system `afrecord` command. This path is implemented but not yet validated by the maintainer on macOS hardware.
13
+ - macOS uses `afrecord` when present, otherwise `ffmpeg` with AVFoundation.
14
14
  - A VolcEngine Speech API key is required.
15
15
  - This is not a local/offline ASR engine.
16
16
 
17
17
  The provider layer is intended to be extensible. **Current version supports only VolcEngine WebSocket ASR.**
18
18
 
19
- No Python, `uv`, upload service, or `ffmpeg` is required for normal shortcut usage.
19
+ No Python, `uv`, or upload service is required for normal shortcut usage. On macOS systems without `afrecord`, install `ffmpeg` for recording.
20
20
 
21
21
  ## Architecture
22
22
 
@@ -26,8 +26,8 @@ pi extension: extensions/voice-input.ts
26
26
  ├─ starts/stops a local recorder process
27
27
  │ ├─ Linux preferred: pw-record
28
28
  │ ├─ Linux fallback: arecord
29
- │ └─ macOS: afrecord
30
- ├─ records 16 kHz mono 16-bit WAV
29
+ │ └─ macOS: afrecord, or ffmpeg/AVFoundation fallback
30
+ ├─ records a temporary 16 kHz mono 16-bit WAV
31
31
  ├─ parses the WAV container in TypeScript and extracts raw PCM
32
32
  ├─ sends PCM frames to the configured ASR provider via ws
33
33
  │ └─ current provider: VolcEngine /api/v3/sauc/bigmodel_nostream
@@ -44,9 +44,9 @@ System dependency, one of:
44
44
 
45
45
  - Linux: `pw-record` from PipeWire tools, preferred
46
46
  - Linux: `arecord` from alsa-utils, fallback
47
- - macOS: `afrecord`, included with macOS
47
+ - macOS: `afrecord` when present, or `ffmpeg` from Homebrew (`brew install ffmpeg`) as the AVFoundation fallback
48
48
 
49
- On macOS, grant Terminal or your pi host app microphone permission when prompted. If macOS has previously denied microphone access, enable it in System Settings → Privacy & Security → Microphone.
49
+ On macOS, grant Terminal, ffmpeg, or your pi host app microphone permission when prompted. If macOS has previously denied microphone access, enable it in System Settings → Privacy & Security → Microphone.
50
50
 
51
51
  ## Install / Update
52
52
 
@@ -134,7 +134,7 @@ Slash commands:
134
134
  /voice start # start recording
135
135
  /voice stop # stop, transcribe, insert text
136
136
  /voice toggle # start if idle, stop if recording
137
- /voice cancel # stop recording without transcribing
137
+ /voice cancel # stop recording and discard local audio without transcribing
138
138
  /voice status # show recorder state
139
139
  /voice config # show effective non-secret config and whether API key is detected
140
140
  /voice init # create or normalize ~/.pi/agent/voice-input.config.json
@@ -144,10 +144,12 @@ Slash commands:
144
144
 
145
145
  ## Notes
146
146
 
147
- - The extension uses post-recording WebSocket ASR: it records locally first, then sends the stopped recording in chunks. It is optimized for fast voice input, not live subtitles.
147
+ - The extension uses post-recording WebSocket ASR: it records locally to a per-run temporary WAV, sends the stopped recording in chunks, then deletes the temporary audio. It is optimized for fast voice input, not live subtitles.
148
148
  - The default ASR segment size is intentionally larger than realtime packet sizes because this workflow sends already-recorded audio.
149
149
  - The transcript is inserted into the editor only; it is not submitted automatically.
150
- - When `polishModel` is set, polishing uses the unsent editor draft and recent session messages as context, but outputs only the refined voice text. The final text is still pasted at the current cursor position without replacing the draft.
150
+ - Recorder stdout/stderr is not logged to disk, to avoid retaining potentially sensitive runtime data.
151
+ - On startup, legacy `~/.pi/agent/voice-input/recordings` and `~/.pi/agent/voice-input/logs` artifacts are cleaned up when they are not part of an active recording.
152
+ - When `polishModel` is set, polishing uses the unsent editor draft and recent session messages as context, but outputs only the refined voice text to insert at the current cursor. It must not reconstruct the full draft; the final text is pasted without replacing existing editor content.
151
153
  - While recording, the status line shows `● Mic on: [device name] — press Ctrl+Shift+R again to stop/transcribe` in the current theme accent color; no separate popup is shown when recording starts.
152
154
 
153
155
  ## Development
@@ -5,16 +5,17 @@ import { spawn, spawnSync } from "node:child_process";
5
5
  import { randomUUID } from "node:crypto";
6
6
  import {
7
7
  chmodSync,
8
- closeSync,
9
8
  existsSync,
10
9
  mkdirSync,
11
- openSync,
10
+ mkdtempSync,
12
11
  readFileSync,
12
+ readdirSync,
13
+ rmdirSync,
13
14
  statSync,
14
15
  unlinkSync,
15
16
  writeFileSync,
16
17
  } from "node:fs";
17
- import { homedir, platform } from "node:os";
18
+ import { homedir, platform, tmpdir } from "node:os";
18
19
  import path from "node:path";
19
20
  import { gzipSync, gunzipSync } from "node:zlib";
20
21
  import WebSocket from "ws";
@@ -23,18 +24,25 @@ const CONFIG_PATH = path.join(homedir(), ".pi", "agent", "voice-input.config.jso
23
24
  const VOLC_API_KEY_URL = "https://console.volcengine.com/speech/new/setting/apikeys?projectName=default";
24
25
  const DEFAULT_SHORTCUT = Key.ctrlShift("r");
25
26
  const DEFAULT_POSTPROCESS_MODEL = "";
26
- const POSTPROCESS_SYSTEM_PROMPT = `你是 pi 语音输入插件的语音识别后处理器。你的唯一任务是润色原始 ASR 文本,使其成为可直接提交给编码智能体的用户指令。
27
-
28
- 规则:
29
- - 只输出润色后的用户指令正文,不要输出解释、标题、前后缀、引号、代码围栏或寒暄。
30
- - 绝对不要回答、执行或解决用户语音中提出的问题;即使原始语音是问题,也只能把这个问题本身整理成清晰文本,不要给出答案、方案、代码或结论。
31
- - 以忠实保留用户信息为最高优先级。不要一味概括、压缩或简述;不要删除条件、约束、例子、数值、文件名、错误信息、多个请求、前后顺序或语气重点。
32
- - 结合上下文理解省略指代、当前任务、文件/项目名称和用户意图;上下文仅用于理解,不要重复上下文内容,除非原始语音明确要求引用或修改它。
33
- - 修正明显的语音识别错误、同音/近音错误、断句和标点错误;保留代码标识符、命令、路径、URL、模型名、包名和专有名词。
34
- - 如果用户口误后自我更正(例如“不是……是……”“不对……”“算了改成……”),只保留更正后的正确指令,删除错误说法和更正过程。
35
- - 让结果完整、符合逻辑、指令明确、有指导性;必要时拆成条目或步骤,但不得丢失原始信息。
36
- - 不要凭空添加原始语音没有表达的新需求;不确定时保留原意并用更清晰的措辞表达。
37
- - 输出语言必须跟随用户原始语音的主要语言,而不是上下文语言;不要因为上下文是中文/英文就把用户语音翻译成上下文语言。`;
27
+ const POSTPROCESS_SYSTEM_PROMPT = `You are the speech-recognition postprocessor for the pi voice input extension. Your only job is to polish the raw ASR text into text that the plugin can paste verbatim at the current cursor position in the pi editor.
28
+
29
+ Interaction contract:
30
+ - The plugin does not replace editor content with your output. It only pastes/inserts your output at the user's current cursor position.
31
+ - The current editor draft and recent conversation are context only. Use them to understand omitted references, the current task, file/project names, and intent. They are not text for you to rewrite and output as a whole.
32
+ - Do not output the draft, a context sentence, or a full sentence/paragraph that represents the draft after insertion. Doing so would duplicate existing editor content.
33
+ - You may not know the real cursor position. Do not guess the cursor location and synthesize a full surrounding sentence; the editor owns the real insertion point.
34
+ - If the raw speech is adding a few words, half a sentence, a phrase, a condition, or a modifier, output only those newly spoken words. Let the paste operation merge them with the existing draft.
35
+ - Only when the raw speech itself explicitly dictates a complete passage to insert may you output that complete passage. Even then, do not add draft text that the user did not speak.
36
+
37
+ Rules:
38
+ - Output only the polished insertion text. Do not output explanations, headings, prefixes, suffixes, quotes, code fences, or greetings.
39
+ - Never answer, execute, or solve anything asked in the user's speech. If the raw speech is a question, only clean up the question text itself; do not provide an answer, plan, code, or conclusion.
40
+ - Preserve the user's information faithfully. Do not over-summarize or compress. Do not delete constraints, examples, numbers, filenames, errors, multiple requests, ordering, or emphasis.
41
+ - Correct obvious ASR mistakes, homophones, segmentation, and punctuation. Preserve code identifiers, commands, paths, URLs, model names, package names, and proper nouns.
42
+ - If the user self-corrects, keep only the corrected intent and remove the false start, correction process, filler, and chatter. Do not lose any other substantive information.
43
+ - Make the output complete relative to the raw speech, logically clear, and actionable. Split into items or steps when helpful, but do not drop raw-speech information or repeat existing draft text.
44
+ - Do not invent requirements that the raw speech did not express. If uncertain, keep the original meaning and express it more clearly.
45
+ - The output language must match the primary language of the raw speech, not the context language and not this English prompt. Do not translate just because the instructions are in English.`;
38
46
 
39
47
  const MSG_TYPE_CLIENT_FULL_REQUEST = 0b0001;
40
48
  const MSG_TYPE_CLIENT_AUDIO_ONLY_REQUEST = 0b0010;
@@ -65,9 +73,7 @@ type VoiceConfig = {
65
73
  requestTimeoutMs: number;
66
74
  finalizeDelayMs: number;
67
75
  recorderTarget: string;
68
- recordingsDir: string;
69
76
  statePath: string;
70
- logDir: string;
71
77
  shortcut: string;
72
78
  enableItn: boolean;
73
79
  enablePunc: boolean;
@@ -83,7 +89,7 @@ type VoiceConfig = {
83
89
  type RecordingState = {
84
90
  pid: number;
85
91
  path: string;
86
- logPath: string;
92
+ logPath?: string;
87
93
  startedAt: string;
88
94
  recorderTarget?: string;
89
95
  deviceName?: string;
@@ -169,9 +175,7 @@ function getConfig(): VoiceConfig {
169
175
  requestTimeoutMs: 90000,
170
176
  finalizeDelayMs: 100,
171
177
  recorderTarget: "",
172
- recordingsDir: path.join(voiceHome, "recordings"),
173
178
  statePath: path.join(voiceHome, "recording.json"),
174
- logDir: path.join(voiceHome, "logs"),
175
179
  shortcut: DEFAULT_SHORTCUT,
176
180
  enableItn: true,
177
181
  enablePunc: true,
@@ -213,7 +217,10 @@ function commandOutput(command: string, args: string[], timeoutMs = 1500): strin
213
217
  }
214
218
 
215
219
  function selectRecorderExecutable(): string {
216
- if (platform() === "darwin" && commandExists("afrecord")) return "afrecord";
220
+ if (platform() === "darwin") {
221
+ if (commandExists("afrecord")) return "afrecord";
222
+ if (commandExists("ffmpeg")) return "ffmpeg";
223
+ }
217
224
  if (commandExists("pw-record")) return "pw-record";
218
225
  if (commandExists("arecord")) return "arecord";
219
226
  return "";
@@ -233,7 +240,31 @@ function recorderCommand(config: VoiceConfig, outputPath: string): string[] {
233
240
  if (executable === "afrecord") {
234
241
  return ["afrecord", "-f", "WAVE", "-d", "LEI16@16000", "-c", "1", outputPath];
235
242
  }
236
- throw new Error("No recorder found. On Linux, install PipeWire tools (pw-record) or alsa-utils (arecord). On macOS, afrecord should be available with the system.");
243
+ if (executable === "ffmpeg" && platform() === "darwin") {
244
+ return [
245
+ "ffmpeg",
246
+ "-hide_banner",
247
+ "-loglevel",
248
+ "error",
249
+ "-nostdin",
250
+ "-y",
251
+ "-f",
252
+ "avfoundation",
253
+ "-i",
254
+ config.recorderTarget || "none:default",
255
+ "-vn",
256
+ "-acodec",
257
+ "pcm_s16le",
258
+ "-ar",
259
+ "16000",
260
+ "-ac",
261
+ "1",
262
+ "-f",
263
+ "wav",
264
+ outputPath,
265
+ ];
266
+ }
267
+ throw new Error("No recorder found. On Linux, install PipeWire tools (pw-record) or alsa-utils (arecord). On macOS, install ffmpeg (brew install ffmpeg) if afrecord is not available.");
237
268
  }
238
269
 
239
270
  type PipeWireSource = {
@@ -322,6 +353,7 @@ function recordingDeviceName(config: VoiceConfig, recorderExecutable: string): s
322
353
  if (recorderExecutable === "pw-record") return pipeWireSourceName(config.recorderTarget);
323
354
  if (recorderExecutable === "arecord") return "ALSA default microphone";
324
355
  if (recorderExecutable === "afrecord") return "macOS default microphone";
356
+ if (recorderExecutable === "ffmpeg" && platform() === "darwin") return "macOS default microphone (ffmpeg/AVFoundation)";
325
357
  return config.recorderTarget || "default microphone";
326
358
  }
327
359
 
@@ -347,6 +379,85 @@ function clearState(config: VoiceConfig) {
347
379
  }
348
380
  }
349
381
 
382
/**
 * Allocates a fresh per-run temporary directory and returns the path of the WAV
 * file that should be recorded into it.
 *
 * The directory is created under the OS temp dir with the `pi-voice-input-`
 * prefix and its mode is set to 0o700. Only the directory is created here; the
 * WAV file itself is not.
 *
 * @returns Path of the form `<tmp>/pi-voice-input-XXXXXX/recording-<timestamp>.wav`.
 */
function createRecordingPath(): string {
  const directoryPrefix = path.join(tmpdir(), "pi-voice-input-");
  const recordingDir = mkdtempSync(directoryPrefix);
  // Set owner-only permissions explicitly on the freshly created directory.
  chmodSync(recordingDir, 0o700);
  const fileName = `recording-${timestampForFilename()}.wav`;
  return path.join(recordingDir, fileName);
}
387
+
388
/**
 * Best-effort removal of a single file.
 *
 * @param filePath File to unlink; `undefined` or an empty string is a no-op.
 * @returns `null` when nothing needed reporting (no path given, file removed,
 *   or file already absent), otherwise a human-readable warning string.
 */
function deleteFileIfExists(filePath?: string): string | null {
  if (filePath) {
    try {
      unlinkSync(filePath);
    } catch (error) {
      // A file that is already gone is the desired end state, not a failure.
      const alreadyGone = (error as NodeJS.ErrnoException).code === "ENOENT";
      if (!alreadyGone) {
        return `failed to delete ${filePath}: ${error instanceof Error ? error.message : String(error)}`;
      }
    }
  }
  return null;
}
398
+
399
/**
 * Removes the per-run temporary directory that contains `filePath`.
 *
 * As a safety guard, the directory is only touched when it sits directly under
 * the OS temp dir and its name starts with `pi-voice-input-`; any other
 * location is left alone and reported as success.
 *
 * @param filePath Path of the recording file whose parent directory should go.
 * @returns `null` on success, when the guard rejects the path, or when the
 *   directory is already absent; otherwise a warning string.
 */
function deleteTemporaryRecordingDir(filePath: string): string | null {
  const recordingDir = path.dirname(filePath);
  const sitsDirectlyInTmp = path.resolve(path.dirname(recordingDir)) === path.resolve(tmpdir());
  const hasOwnPrefix = path.basename(recordingDir).startsWith("pi-voice-input-");
  if (!(sitsDirectlyInTmp && hasOwnPrefix)) return null;

  try {
    // rmdirSync does not remove non-empty directories, so leftover files surface as a warning.
    rmdirSync(recordingDir);
  } catch (error) {
    if ((error as NodeJS.ErrnoException).code !== "ENOENT") {
      return `failed to remove temporary directory ${recordingDir}: ${error instanceof Error ? error.message : String(error)}`;
    }
  }
  return null;
}
415
+
416
/**
 * Deletes everything a recording left on disk: the audio file, the optional
 * log file, and finally the temporary directory that held the audio.
 *
 * The directory is handled last so that it is attempted after the file
 * deletions.
 *
 * @param state Recording state; only `path` and `logPath` are read.
 * @returns Warning messages for each step that failed (empty when all succeeded).
 */
function cleanupRecordingArtifacts(state: Pick<RecordingState, "path" | "logPath">): string[] {
  const audioWarning = deleteFileIfExists(state.path);
  const logWarning = deleteFileIfExists(state.logPath);
  const directoryWarning = deleteTemporaryRecordingDir(state.path);

  const warnings: string[] = [];
  for (const outcome of [audioWarning, logWarning, directoryWarning]) {
    if (outcome) warnings.push(outcome);
  }
  return warnings;
}
421
+
422
/**
 * Best-effort cleanup of a legacy artifact directory.
 *
 * Deletes every regular file in `dir` whose name matches `filePattern`, except
 * files listed in `protectedPaths`, then removes the directory itself if it
 * ended up empty. Nothing is thrown: every failure is returned as a warning
 * string so callers (for example a startup hook) can surface it without
 * aborting.
 *
 * @param dir Directory to clean; a missing directory is a no-op.
 * @param filePattern Pattern tested against each entry's file name.
 * @param protectedPaths Absolute (resolved) paths that must not be deleted.
 * @returns Warning messages, empty when everything succeeded.
 */
function cleanupLegacyDirectory(dir: string, filePattern: RegExp, protectedPaths: Set<string>): string[] {
  if (!existsSync(dir)) return [];
  const warnings: string[] = [];

  try {
    for (const entry of readdirSync(dir, { withFileTypes: true })) {
      if (!entry.isFile() || !filePattern.test(entry.name)) continue;
      const filePath = path.join(dir, entry.name);
      if (protectedPaths.has(path.resolve(filePath))) continue;
      const warning = deleteFileIfExists(filePath);
      if (warning) warnings.push(warning);
    }
  } catch (error) {
    // The path can exist without being a readable directory (regular file,
    // permission problem, removed concurrently). Report it instead of throwing,
    // and do not attempt to remove something that could not be listed.
    if ((error as NodeJS.ErrnoException).code !== "ENOENT") {
      warnings.push(`failed to read legacy directory ${dir}: ${error instanceof Error ? error.message : String(error)}`);
    }
    return warnings;
  }

  try {
    rmdirSync(dir);
  } catch (error) {
    const code = (error as NodeJS.ErrnoException).code;
    // ENOTEMPTY is expected when protected or unrelated files remain.
    if (code !== "ENOENT" && code !== "ENOTEMPTY") {
      warnings.push(`failed to remove legacy directory ${dir}: ${error instanceof Error ? error.message : String(error)}`);
    }
  }

  return warnings;
}
445
+
446
/**
 * Cleans up the legacy `recordings` and `logs` directories that sit next to
 * the state file, sparing the files of a recording that is still running.
 *
 * @param config Effective configuration; `statePath` locates the voice home directory.
 * @returns Warning messages collected from both directory cleanups.
 */
function cleanupLegacyStoredArtifacts(config: VoiceConfig): string[] {
  const protectedPaths = new Set<string>();
  const savedState = readState(config);
  // Protect artifacts only when the recorded pid is still reported alive.
  if (savedState && pidAlive(savedState.pid)) {
    protectedPaths.add(path.resolve(savedState.path));
    if (savedState.logPath) protectedPaths.add(path.resolve(savedState.logPath));
  }

  const voiceHome = path.dirname(config.statePath);
  const recordingWarnings = cleanupLegacyDirectory(path.join(voiceHome, "recordings"), /^recording-.*\.wav$/, protectedPaths);
  const logWarnings = cleanupLegacyDirectory(path.join(voiceHome, "logs"), /^recording-.*\.log$/, protectedPaths);
  return recordingWarnings.concat(logWarnings);
}
460
+
350
461
  function pidAlive(pid: number): boolean {
351
462
  try {
352
463
  process.kill(pid, 0);
@@ -820,30 +931,67 @@ function cleanPostprocessOutput(output: string): string {
820
931
  let text = output.trim();
821
932
  const fence = text.match(/^```[a-zA-Z0-9_-]*\s*\n([\s\S]*?)\n```$/);
822
933
  if (fence) text = fence[1].trim();
823
- text = text.replace(/^(?:优化后的(?:用户)?指令|整理后的(?:用户)?指令|改写后的(?:用户)?指令)\s*[::]\s*/u, "").trim();
934
+ text = text.replace(/^(?:polished(?: user)? instruction|refined(?: user)? instruction|rewritten(?: user)? instruction|final(?: insertion)? text)\s*:\s*/iu, "").trim();
824
935
  return text;
825
936
  }
826
937
 
938
/**
 * Strips an echoed editor draft from polished output.
 *
 * If `output` is exactly the trimmed draft with one contiguous run of text
 * inserted somewhere (start, middle or end), only that inserted run is
 * returned, because the caller pastes the result into the existing draft.
 * In every other case `output` is returned unchanged.
 *
 * The comparison runs on Unicode code points rather than UTF-16 code units, so
 * a character outside the BMP (e.g. an emoji) is never cut in half when the
 * inserted text and the adjacent draft character share a leading surrogate.
 *
 * @param editorText Full current editor draft.
 * @param output Polished text produced by the postprocess model.
 * @returns The inserted text alone when an echo is detected, else `output`.
 */
function removeEditorDraftEcho(editorText: string, output: string): string {
  const draft = editorText.trim();
  const text = output.trim();
  // Short drafts are too likely to match by coincidence; an echo must also be longer than the draft.
  if (draft.length < 12 || text.length <= draft.length) return output;

  const draftChars = Array.from(draft);
  const textChars = Array.from(text);

  let prefixLength = 0;
  while (
    prefixLength < draftChars.length &&
    prefixLength < textChars.length &&
    draftChars[prefixLength] === textChars[prefixLength]
  ) {
    prefixLength += 1;
  }

  // The suffix may only cover what the prefix has not already claimed.
  let suffixLength = 0;
  while (
    suffixLength < draftChars.length - prefixLength &&
    suffixLength < textChars.length - prefixLength &&
    draftChars[draftChars.length - 1 - suffixLength] === textChars[textChars.length - 1 - suffixLength]
  ) {
    suffixLength += 1;
  }

  // Prefix and suffix together must account for the whole draft, otherwise the output is not an echo.
  if (prefixLength + suffixLength !== draftChars.length) return output;
  const insertedText = textChars
    .slice(prefixLength, textChars.length - suffixLength)
    .join("")
    .trim();
  return insertedText || output;
}
961
+
962
/**
 * Reads the editor's current text through `ctx.ui.getEditorText()`.
 *
 * @param ctx Extension context providing the UI accessor.
 * @returns The editor text, or an empty string when the accessor throws.
 */
function getFullEditorText(ctx: ExtensionContext): string {
  let editorText = "";
  try {
    editorText = ctx.ui.getEditorText();
  } catch {
    // Any failure to read the editor falls back to the empty default.
  }
  return editorText;
}
969
+
827
970
  function buildPostprocessPrompt(ctx: ExtensionContext, rawText: string, config: VoiceConfig): string {
828
971
  const contextBudget = config.postprocessContextChars;
829
972
  const editorContext = getEditorContext(ctx, Math.floor(contextBudget / 2));
830
973
  const sessionContext = getRecentSessionContext(ctx, Math.ceil(contextBudget / 2));
831
974
 
832
975
  return [
833
- "请根据上下文只润色下面的原始语音识别结果。",
834
- "如果上下文为空,直接依据原始文本润色。",
835
- "不要回答原始语音里的问题,也不要执行其中的请求;只输出原始语音对应的最终用户指令文本。",
836
- "输出语言必须跟随原始语音的主要语言,不要跟随上下文语言,也不要翻译成上下文语言。",
837
- "务必忠实保留原始语音中的信息和细节,不要为了简洁而概括、压缩或删减。",
838
- "当前输入框草稿只是上下文:语音文本会由插件插入到用户当前光标位置。不要重写、重复、补全、删除或替换草稿里的既有内容。",
976
+ "Polish only the raw ASR text below, using context only when it helps disambiguate the user's intent.",
977
+ "If context is empty or irrelevant, polish the raw text directly.",
978
+ "Do not answer the raw speech, and do not execute its request. Output only the final text that should be inserted into the editor.",
979
+ "The output language must match the primary language of the raw speech, not the context language and not this English prompt. Do not translate.",
980
+ "Faithfully preserve the information and details in the raw speech. Do not summarize, compress, or delete details merely for brevity.",
981
+ "IMPORTANT: your output will be pasted verbatim at the current cursor position. It is not a replacement and not a rewrite of the whole editor draft.",
982
+ "The current editor draft is context only. Do not rewrite, repeat, complete, delete, or replace existing draft text. Do not output the full sentence after insertion.",
983
+ "The true cursor position is not marked in the draft shown here; the pi editor owns the actual insertion point. Do not guess the cursor and synthesize a full surrounding sentence.",
984
+ "If the raw speech is an inline insertion, continuation, a few words, or a phrase, output only the newly spoken words or phrase.",
985
+ "Example: draft is `Please make this function async and [cursor].`, raw speech is `add error handling`, correct output is `add error handling`, not `Please make this function async and add error handling.`.",
986
+ "Example: draft is `This variable name is [cursor]unclear`, raw speech is `still`, correct output is `still`, not `This variable name is still unclear`.",
839
987
  "",
840
- "--- 上下文:当前输入框未发送草稿 ---",
841
- editorContext.trim() || "(空)",
988
+ "--- Context: current unsent editor draft (context only; do not output wholesale) ---",
989
+ editorContext.trim() || "(empty)",
842
990
  "",
843
- "--- 上下文:最近会话 ---",
844
- sessionContext || "(空)",
991
+ "--- Context: recent conversation ---",
992
+ sessionContext || "(empty)",
845
993
  "",
846
- "--- 原始语音识别结果 ---",
994
+ "--- Raw ASR text ---",
847
995
  rawText.trim(),
848
996
  ].join("\n");
849
997
  }
@@ -890,7 +1038,7 @@ async function postprocessTranscript(ctx: ExtensionContext, rawText: string, con
890
1038
  }
891
1039
 
892
1040
  const polished = cleanPostprocessOutput(extractAssistantText(response));
893
- return polished || rawText;
1041
+ return polished ? removeEditorDraftEcho(getFullEditorText(ctx), polished) : rawText;
894
1042
  }
895
1043
 
896
1044
  function insertIntoEditor(ctx: ExtensionContext, text: string) {
@@ -904,6 +1052,14 @@ async function isRecording(config: VoiceConfig): Promise<boolean> {
904
1052
  return Boolean(state && pidAlive(state.pid));
905
1053
  }
906
1054
 
1055
/**
 * Discards a leftover recording whose recorder process is no longer alive.
 *
 * When saved state exists and its pid is not reported alive, the recording's
 * on-disk artifacts are removed and the state is cleared. With no saved state,
 * or with a recorder that is still running, nothing is touched.
 *
 * @param config Effective configuration used to read and clear the state.
 * @returns Warning messages from artifact cleanup (empty when nothing was done).
 */
function cleanupStaleRecordingState(config: VoiceConfig): string[] {
  const savedState = readState(config);
  if (savedState && !pidAlive(savedState.pid)) {
    const warnings = cleanupRecordingArtifacts(savedState);
    clearState(config);
    return warnings;
  }
  return [];
}
1062
+
907
1063
  function requireInteractiveUi(ctx: ExtensionContext, action: string): boolean {
908
1064
  if (ctx.hasUI) return true;
909
1065
  ctx.ui.notify(`Voice ${action} requires interactive pi UI. Use /voice config or /voice help for setup information.`, "error");
@@ -920,29 +1076,42 @@ async function startRecording(ctx: ExtensionContext) {
920
1076
  ctx.ui.setStatus("voice-input", ctx.ui.theme.fg("accent", recordingStatusText(deviceName)));
921
1077
  return;
922
1078
  }
923
- if (existing) clearState(config);
1079
+ if (existing) {
1080
+ const cleanupWarnings = cleanupRecordingArtifacts(existing);
1081
+ clearState(config);
1082
+ if (cleanupWarnings.length) ctx.ui.notify(`Voice input cleanup warning:\n${cleanupWarnings.join("\n")}`, "warning");
1083
+ }
924
1084
 
925
- ensureDir(config.recordingsDir);
926
- ensureDir(config.logDir);
927
- const outputPath = path.join(config.recordingsDir, `recording-${timestampForFilename()}.wav`);
928
- const logPath = path.join(config.logDir, `recording-${timestampForFilename()}.log`);
929
- const cmd = recorderCommand(config, outputPath);
1085
+ const outputPath = createRecordingPath();
1086
+ let cmd: string[];
1087
+ try {
1088
+ cmd = recorderCommand(config, outputPath);
1089
+ } catch (error) {
1090
+ cleanupRecordingArtifacts({ path: outputPath });
1091
+ throw error;
1092
+ }
930
1093
  const deviceName = recordingDeviceName(config, cmd[0]);
931
1094
 
932
1095
  ctx.ui.setStatus("voice-input", ctx.ui.theme.fg("warning", "● starting mic"));
933
- const logFd = openSync(logPath, "a");
934
- const child = spawn(cmd[0], cmd.slice(1), {
935
- detached: true,
936
- stdio: ["ignore", logFd, logFd],
937
- });
1096
+ let child: ReturnType<typeof spawn>;
1097
+ try {
1098
+ child = spawn(cmd[0], cmd.slice(1), {
1099
+ detached: true,
1100
+ stdio: ["ignore", "ignore", "ignore"],
1101
+ });
1102
+ } catch (error) {
1103
+ cleanupRecordingArtifacts({ path: outputPath });
1104
+ throw error;
1105
+ }
938
1106
  child.unref();
939
- closeSync(logFd);
940
1107
 
941
- if (!child.pid) throw new Error("Recorder failed to start: no pid returned");
1108
+ if (!child.pid) {
1109
+ cleanupRecordingArtifacts({ path: outputPath });
1110
+ throw new Error("Recorder failed to start: no pid returned");
1111
+ }
942
1112
  writeState(config, {
943
1113
  pid: child.pid,
944
1114
  path: outputPath,
945
- logPath,
946
1115
  startedAt: new Date().toISOString(),
947
1116
  recorderTarget: config.recorderTarget || undefined,
948
1117
  deviceName,
@@ -966,21 +1135,41 @@ async function stopRecording(ctx: ExtensionContext, transcribe = true) {
966
1135
  clearState(config);
967
1136
  if (config.finalizeDelayMs > 0) await sleep(config.finalizeDelayMs);
968
1137
 
969
- if (!existsSync(state.path) || statSync(state.path).size === 0) {
970
- const log = existsSync(state.logPath) ? readFileSync(state.logPath, "utf8") : "";
971
- throw new Error(`Recording file missing/empty: ${state.path}\nRecorder log:\n${log}`);
972
- }
973
-
974
1138
  if (!transcribe) {
1139
+ const cleanupWarnings = cleanupRecordingArtifacts(state);
975
1140
  ctx.ui.setStatus("voice-input", undefined);
976
- ctx.ui.notify(`Voice recording stopped: ${state.path}`, "info");
1141
+ ctx.ui.notify(
1142
+ cleanupWarnings.length
1143
+ ? `Voice recording cancelled; local audio discard attempted, but cleanup had warnings:\n${cleanupWarnings.join("\n")}`
1144
+ : "Voice recording cancelled; local audio discarded.",
1145
+ cleanupWarnings.length ? "warning" : "info",
1146
+ );
977
1147
  return;
978
1148
  }
979
1149
 
1150
+ if (!existsSync(state.path) || statSync(state.path).size === 0) {
1151
+ const cleanupWarnings = cleanupRecordingArtifacts(state);
1152
+ throw new Error(
1153
+ `Recording file missing/empty: ${state.path}. Recorder output is not persisted for privacy.${
1154
+ cleanupWarnings.length ? `\nCleanup warnings:\n${cleanupWarnings.join("\n")}` : ""
1155
+ }`,
1156
+ );
1157
+ }
1158
+
1159
+ let decodeMs = 0;
1160
+ let durationMs = 0;
1161
+ let result: TranscriptionResult | undefined;
980
1162
  const decodeStart = Date.now();
981
- const { pcm, durationMs } = parseRecordedWav(state.path);
982
- const decodeMs = Date.now() - decodeStart;
983
- const result = await transcribePcm(pcm, durationMs, config);
1163
+ try {
1164
+ const recording = parseRecordedWav(state.path);
1165
+ durationMs = recording.durationMs;
1166
+ decodeMs = Date.now() - decodeStart;
1167
+ result = await transcribePcm(recording.pcm, recording.durationMs, config);
1168
+ } finally {
1169
+ const cleanupWarnings = cleanupRecordingArtifacts(state);
1170
+ if (cleanupWarnings.length) ctx.ui.notify(`Voice input cleanup warning:\n${cleanupWarnings.join("\n")}`, "warning");
1171
+ }
1172
+ if (!result) throw new Error("Transcription failed before a result was produced");
984
1173
 
985
1174
  if (!result.text.trim()) {
986
1175
  ctx.ui.setStatus("voice-input", undefined);
@@ -1148,7 +1337,14 @@ export default function (pi: ExtensionAPI) {
1148
1337
  });
1149
1338
 
1150
1339
  pi.on("session_start", (_event, ctx) => {
1151
- if (getConfig().apiKey) {
1340
+ const currentConfig = getConfig();
1341
+ const cleanupWarnings = [
1342
+ ...cleanupStaleRecordingState(currentConfig),
1343
+ ...cleanupLegacyStoredArtifacts(currentConfig),
1344
+ ];
1345
+ if (cleanupWarnings.length) ctx.ui.notify(`Voice input cleanup warning:\n${cleanupWarnings.join("\n")}`, "warning");
1346
+
1347
+ if (currentConfig.apiKey) {
1152
1348
  ctx.ui.notify(`Voice input loaded: ${startupConfig.shortcut} toggles recording.`, "info");
1153
1349
  return;
1154
1350
  }
@@ -1156,7 +1352,7 @@ export default function (pi: ExtensionAPI) {
1156
1352
  [
1157
1353
  `Voice input loaded: ${startupConfig.shortcut} toggles recording.`,
1158
1354
  "API key is missing. Run /voice key to set it up, or edit the JSON config file.",
1159
- `Config file: ${startupConfig.configPath}`,
1355
+ `Config file: ${currentConfig.configPath}`,
1160
1356
  `Get/create a VolcEngine Speech API key here: ${VOLC_API_KEY_URL}`,
1161
1357
  ].join("\n"),
1162
1358
  "warning",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pi-voice-input",
3
- "version": "0.2.7",
3
+ "version": "0.2.9",
4
4
  "description": "Press Ctrl+Shift+R to dictate prompts into Pi using VolcEngine ASR",
5
5
  "type": "module",
6
6
  "keywords": [