npm - pi-voice-input - Versions diffs - 0.2.7 → 0.2.8 - Mend

pi-voice-input 0.2.7 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/README.md +6 -4
package/extensions/voice-input.ts +226 -58
package/package.json +1 -1

package/README.md CHANGED

@@ -27,7 +27,7 @@ pi extension: extensions/voice-input.ts
   │    ├─ Linux preferred: pw-record
   │    ├─ Linux fallback: arecord
   │    └─ macOS: afrecord
-  ├─ records 16 kHz mono 16-bit WAV
+  ├─ records a temporary 16 kHz mono 16-bit WAV
   ├─ parses the WAV container in TypeScript and extracts raw PCM
   ├─ sends PCM frames to the configured ASR provider via ws
   │    └─ current provider: VolcEngine /api/v3/sauc/bigmodel_nostream
@@ -134,7 +134,7 @@ Slash commands:
 /voice start    # start recording
 /voice stop     # stop, transcribe, insert text
 /voice toggle   # start if idle, stop if recording
-/voice cancel   # stop recording without transcribing
+/voice cancel   # stop recording and discard local audio without transcribing
 /voice status   # show recorder state
 /voice config   # show effective non-secret config and whether API key is detected
 /voice init     # create or normalize ~/.pi/agent/voice-input.config.json
@@ -144,10 +144,12 @@ Slash commands:
 ## Notes
-- The extension uses post-recording WebSocket ASR: it records locally first, then sends the stopped recording in chunks. It is optimized for fast voice input, not live subtitles.
+- The extension uses post-recording WebSocket ASR: it records locally to a per-run temporary WAV, sends the stopped recording in chunks, then deletes the temporary audio. It is optimized for fast voice input, not live subtitles.
 - The default ASR segment size is intentionally larger than realtime packet sizes because this workflow sends already-recorded audio.
 - The transcript is inserted into the editor only; it is not submitted automatically.
-- When `polishModel` is set, polishing uses the unsent editor draft and recent session messages as context, but outputs only the refined voice text. The final text is still pasted at the current cursor position without replacing the draft.
+- Recorder stdout/stderr is not logged to disk, to avoid retaining potentially sensitive runtime data.
+- On startup, legacy `~/.pi/agent/voice-input/recordings` and `~/.pi/agent/voice-input/logs` artifacts are cleaned up when they are not part of an active recording.
+- When `polishModel` is set, polishing uses the unsent editor draft and recent session messages as context, but outputs only the refined voice text to insert at the current cursor. It must not reconstruct the full draft; the final text is pasted without replacing existing editor content.
 - While recording, the status line shows `● Mic on: [device name] — press Ctrl+Shift+R again to stop/transcribe` in the current theme accent color; no separate popup is shown when recording starts.
 ## Development

package/extensions/voice-input.ts CHANGED

@@ -5,16 +5,17 @@ import { spawn, spawnSync } from "node:child_process";
 import { randomUUID } from "node:crypto";
 import {
   chmodSync,
-  closeSync,
   existsSync,
   mkdirSync,
-  openSync,
+  mkdtempSync,
   readFileSync,
+  readdirSync,
+  rmdirSync,
   statSync,
   unlinkSync,
   writeFileSync,
 } from "node:fs";
-import { homedir, platform } from "node:os";
+import { homedir, platform, tmpdir } from "node:os";
 import path from "node:path";
 import { gzipSync, gunzipSync } from "node:zlib";
 import WebSocket from "ws";
@@ -23,18 +24,25 @@ const CONFIG_PATH = path.join(homedir(), ".pi", "agent", "voice-input.config.jso
 const VOLC_API_KEY_URL = "https://console.volcengine.com/speech/new/setting/apikeys?projectName=default";
 const DEFAULT_SHORTCUT = Key.ctrlShift("r");
 const DEFAULT_POSTPROCESS_MODEL = "";
-const POSTPROCESS_SYSTEM_PROMPT = `你是 pi 语音输入插件的语音识别后处理器。你的唯一任务是润色原始 ASR 文本，使其成为可直接提交给编码智能体的用户指令。
-规则：
-- 只输出润色后的用户指令正文，不要输出解释、标题、前后缀、引号、代码围栏或寒暄。
-- 绝对不要回答、执行或解决用户语音中提出的问题；即使原始语音是问题，也只能把这个问题本身整理成清晰文本，不要给出答案、方案、代码或结论。
-- 以忠实保留用户信息为最高优先级。不要一味概括、压缩或简述；不要删除条件、约束、例子、数值、文件名、错误信息、多个请求、前后顺序或语气重点。
-- 结合上下文理解省略指代、当前任务、文件/项目名称和用户意图；上下文仅用于理解，不要重复上下文内容，除非原始语音明确要求引用或修改它。
-- 修正明显的语音识别错误、同音/近音错误、断句和标点错误；保留代码标识符、命令、路径、URL、模型名、包名和专有名词。
-- 如果用户口误后自我更正（例如“不是……是……”“不对……”“算了改成……”），只保留更正后的正确指令，删除错误说法和更正过程。
-- 让结果完整、符合逻辑、指令明确、有指导性；必要时拆成条目或步骤，但不得丢失原始信息。
-- 不要凭空添加原始语音没有表达的新需求；不确定时保留原意并用更清晰的措辞表达。
-- 输出语言必须跟随用户原始语音的主要语言，而不是上下文语言；不要因为上下文是中文/英文就把用户语音翻译成上下文语言。`;
+const POSTPROCESS_SYSTEM_PROMPT = `You are the speech-recognition postprocessor for the pi voice input extension. Your only job is to polish the raw ASR text into text that the plugin can paste verbatim at the current cursor position in the pi editor.
+Interaction contract:
+- The plugin does not replace editor content with your output. It only pastes/inserts your output at the user's current cursor position.
+- The current editor draft and recent conversation are context only. Use them to understand omitted references, the current task, file/project names, and intent. They are not text for you to rewrite and output as a whole.
+- Do not output the draft, a context sentence, or a full sentence/paragraph that represents the draft after insertion. Doing so would duplicate existing editor content.
+- You may not know the real cursor position. Do not guess the cursor location and synthesize a full surrounding sentence; the editor owns the real insertion point.
+- If the raw speech is adding a few words, half a sentence, a phrase, a condition, or a modifier, output only those newly spoken words. Let the paste operation merge them with the existing draft.
+- Only when the raw speech itself explicitly dictates a complete passage to insert may you output that complete passage. Even then, do not add draft text that the user did not speak.
+Rules:
+- Output only the polished insertion text. Do not output explanations, headings, prefixes, suffixes, quotes, code fences, or greetings.
+- Never answer, execute, or solve anything asked in the user's speech. If the raw speech is a question, only clean up the question text itself; do not provide an answer, plan, code, or conclusion.
+- Preserve the user's information faithfully. Do not over-summarize or compress. Do not delete constraints, examples, numbers, filenames, errors, multiple requests, ordering, or emphasis.
+- Correct obvious ASR mistakes, homophones, segmentation, and punctuation. Preserve code identifiers, commands, paths, URLs, model names, package names, and proper nouns.
+- If the user self-corrects, keep only the corrected intent and remove the false start, correction process, filler, and chatter. Do not lose any other substantive information.
+- Make the output complete relative to the raw speech, logically clear, and actionable. Split into items or steps when helpful, but do not drop raw-speech information or repeat existing draft text.
+- Do not invent requirements that the raw speech did not express. If uncertain, keep the original meaning and express it more clearly.
+- The output language must match the primary language of the raw speech, not the context language and not this English prompt. Do not translate just because the instructions are in English.`;
 const MSG_TYPE_CLIENT_FULL_REQUEST = 0b0001;
 const MSG_TYPE_CLIENT_AUDIO_ONLY_REQUEST = 0b0010;
@@ -65,9 +73,7 @@ type VoiceConfig = {
   requestTimeoutMs: number;
   finalizeDelayMs: number;
   recorderTarget: string;
-  recordingsDir: string;
   statePath: string;
-  logDir: string;
   shortcut: string;
   enableItn: boolean;
   enablePunc: boolean;
@@ -83,7 +89,7 @@ type VoiceConfig = {
 type RecordingState = {
   pid: number;
   path: string;
-  logPath: string;
+  logPath?: string;
   startedAt: string;
   recorderTarget?: string;
   deviceName?: string;
@@ -169,9 +175,7 @@ function getConfig(): VoiceConfig {
     requestTimeoutMs: 90000,
     finalizeDelayMs: 100,
     recorderTarget: "",
-    recordingsDir: path.join(voiceHome, "recordings"),
     statePath: path.join(voiceHome, "recording.json"),
-    logDir: path.join(voiceHome, "logs"),
     shortcut: DEFAULT_SHORTCUT,
     enableItn: true,
     enablePunc: true,
@@ -347,6 +351,85 @@ function clearState(config: VoiceConfig) {
   }
 }
+function createRecordingPath(): string {
+  const dir = mkdtempSync(path.join(tmpdir(), "pi-voice-input-"));
+  chmodSync(dir, 0o700);
+  return path.join(dir, `recording-${timestampForFilename()}.wav`);
+}
+function deleteFileIfExists(filePath?: string): string | null {
+  if (!filePath) return null;
+  try {
+    unlinkSync(filePath);
+    return null;
+  } catch (error) {
+    if ((error as NodeJS.ErrnoException).code === "ENOENT") return null;
+    return `failed to delete ${filePath}: ${error instanceof Error ? error.message : String(error)}`;
+  }
+}
+function deleteTemporaryRecordingDir(filePath: string): string | null {
+  const dir = path.dirname(filePath);
+  const parent = path.dirname(dir);
+  if (path.resolve(parent) !== path.resolve(tmpdir()) || !path.basename(dir).startsWith("pi-voice-input-")) {
+    return null;
+  }
+  try {
+    rmdirSync(dir);
+    return null;
+  } catch (error) {
+    const code = (error as NodeJS.ErrnoException).code;
+    if (code === "ENOENT") return null;
+    return `failed to remove temporary directory ${dir}: ${error instanceof Error ? error.message : String(error)}`;
+  }
+}
+function cleanupRecordingArtifacts(state: Pick<RecordingState, "path" | "logPath">): string[] {
+  return [deleteFileIfExists(state.path), deleteFileIfExists(state.logPath), deleteTemporaryRecordingDir(state.path)].filter(
+    (message): message is string => Boolean(message),
+  );
+}
+function cleanupLegacyDirectory(dir: string, filePattern: RegExp, protectedPaths: Set<string>): string[] {
+  if (!existsSync(dir)) return [];
+  const warnings: string[] = [];
+  for (const entry of readdirSync(dir, { withFileTypes: true })) {
+    if (!entry.isFile() || !filePattern.test(entry.name)) continue;
+    const filePath = path.join(dir, entry.name);
+    if (protectedPaths.has(path.resolve(filePath))) continue;
+    const warning = deleteFileIfExists(filePath);
+    if (warning) warnings.push(warning);
+  }
+  try {
+    rmdirSync(dir);
+  } catch (error) {
+    const code = (error as NodeJS.ErrnoException).code;
+    if (code !== "ENOENT" && code !== "ENOTEMPTY") {
+      warnings.push(`failed to remove legacy directory ${dir}: ${error instanceof Error ? error.message : String(error)}`);
+    }
+  }
+  return warnings;
+}
+function cleanupLegacyStoredArtifacts(config: VoiceConfig): string[] {
+  const state = readState(config);
+  const protectedPaths = new Set<string>();
+  if (state && pidAlive(state.pid)) {
+    protectedPaths.add(path.resolve(state.path));
+    if (state.logPath) protectedPaths.add(path.resolve(state.logPath));
+  }
+  const voiceHome = path.dirname(config.statePath);
+  return [
+    ...cleanupLegacyDirectory(path.join(voiceHome, "recordings"), /^recording-.*\.wav$/, protectedPaths),
+    ...cleanupLegacyDirectory(path.join(voiceHome, "logs"), /^recording-.*\.log$/, protectedPaths),
+  ];
+}
 function pidAlive(pid: number): boolean {
   try {
     process.kill(pid, 0);
@@ -820,30 +903,67 @@ function cleanPostprocessOutput(output: string): string {
   let text = output.trim();
   const fence = text.match(/^```[a-zA-Z0-9_-]*\s*\n([\s\S]*?)\n```$/);
   if (fence) text = fence[1].trim();
-  text = text.replace(/^(?:优化后的(?:用户)?指令|整理后的(?:用户)?指令|改写后的(?:用户)?指令)\s*[：:]\s*/u, "").trim();
+  text = text.replace(/^(?:polished(?: user)? instruction|refined(?: user)? instruction|rewritten(?: user)? instruction|final(?: insertion)? text)\s*:\s*/iu, "").trim();
   return text;
 }
+function removeEditorDraftEcho(editorText: string, output: string): string {
+  const draft = editorText.trim();
+  const text = output.trim();
+  if (draft.length < 12 || text.length <= draft.length) return output;
+  let prefixLength = 0;
+  while (prefixLength < draft.length && prefixLength < text.length && draft[prefixLength] === text[prefixLength]) {
+    prefixLength += 1;
+  }
+  let suffixLength = 0;
+  while (
+    suffixLength < draft.length - prefixLength &&
+    suffixLength < text.length - prefixLength &&
+    draft[draft.length - 1 - suffixLength] === text[text.length - 1 - suffixLength]
+  ) {
+    suffixLength += 1;
+  }
+  if (prefixLength + suffixLength !== draft.length) return output;
+  const insertedText = text.slice(prefixLength, text.length - suffixLength).trim();
+  return insertedText || output;
+}
+function getFullEditorText(ctx: ExtensionContext): string {
+  try {
+    return ctx.ui.getEditorText();
+  } catch {
+    return "";
+  }
+}
 function buildPostprocessPrompt(ctx: ExtensionContext, rawText: string, config: VoiceConfig): string {
   const contextBudget = config.postprocessContextChars;
   const editorContext = getEditorContext(ctx, Math.floor(contextBudget / 2));
   const sessionContext = getRecentSessionContext(ctx, Math.ceil(contextBudget / 2));
   return [
-    "请根据上下文只润色下面的原始语音识别结果。",
-    "如果上下文为空，直接依据原始文本润色。",
-    "不要回答原始语音里的问题，也不要执行其中的请求；只输出原始语音对应的最终用户指令文本。",
-    "输出语言必须跟随原始语音的主要语言，不要跟随上下文语言，也不要翻译成上下文语言。",
-    "务必忠实保留原始语音中的信息和细节，不要为了简洁而概括、压缩或删减。",
-    "当前输入框草稿只是上下文：语音文本会由插件插入到用户当前光标位置。不要重写、重复、补全、删除或替换草稿里的既有内容。",
+    "Polish only the raw ASR text below, using context only when it helps disambiguate the user's intent.",
+    "If context is empty or irrelevant, polish the raw text directly.",
+    "Do not answer the raw speech, and do not execute its request. Output only the final text that should be inserted into the editor.",
+    "The output language must match the primary language of the raw speech, not the context language and not this English prompt. Do not translate.",
+    "Faithfully preserve the information and details in the raw speech. Do not summarize, compress, or delete details merely for brevity.",
+    "IMPORTANT: your output will be pasted verbatim at the current cursor position. It is not a replacement and not a rewrite of the whole editor draft.",
+    "The current editor draft is context only. Do not rewrite, repeat, complete, delete, or replace existing draft text. Do not output the full sentence after insertion.",
+    "The true cursor position is not marked in the draft shown here; the pi editor owns the actual insertion point. Do not guess the cursor and synthesize a full surrounding sentence.",
+    "If the raw speech is an inline insertion, continuation, a few words, or a phrase, output only the newly spoken words or phrase.",
+    "Example: draft is `Please make this function async and [cursor].`, raw speech is `add error handling`, correct output is `add error handling`, not `Please make this function async and add error handling.`.",
+    "Example: draft is `This variable name is [cursor]unclear`, raw speech is `still`, correct output is `still`, not `This variable name is still unclear`.",
     "",
-    "--- 上下文：当前输入框未发送草稿 ---",
-    editorContext.trim() || "（空）",
+    "--- Context: current unsent editor draft (context only; do not output wholesale) ---",
+    editorContext.trim() || "(empty)",
     "",
-    "--- 上下文：最近会话 ---",
-    sessionContext || "（空）",
+    "--- Context: recent conversation ---",
+    sessionContext || "(empty)",
     "",
-    "--- 原始语音识别结果 ---",
+    "--- Raw ASR text ---",
     rawText.trim(),
   ].join("\n");
 }
@@ -890,7 +1010,7 @@ async function postprocessTranscript(ctx: ExtensionContext, rawText: string, con
   }
   const polished = cleanPostprocessOutput(extractAssistantText(response));
-  return polished || rawText;
+  return polished ? removeEditorDraftEcho(getFullEditorText(ctx), polished) : rawText;
 }
 function insertIntoEditor(ctx: ExtensionContext, text: string) {
@@ -904,6 +1024,14 @@ async function isRecording(config: VoiceConfig): Promise<boolean> {
   return Boolean(state && pidAlive(state.pid));
 }
+function cleanupStaleRecordingState(config: VoiceConfig): string[] {
+  const state = readState(config);
+  if (!state || pidAlive(state.pid)) return [];
+  const cleanupWarnings = cleanupRecordingArtifacts(state);
+  clearState(config);
+  return cleanupWarnings;
+}
 function requireInteractiveUi(ctx: ExtensionContext, action: string): boolean {
   if (ctx.hasUI) return true;
   ctx.ui.notify(`Voice ${action} requires interactive pi UI. Use /voice config or /voice help for setup information.`, "error");
@@ -920,29 +1048,42 @@ async function startRecording(ctx: ExtensionContext) {
     ctx.ui.setStatus("voice-input", ctx.ui.theme.fg("accent", recordingStatusText(deviceName)));
     return;
   }
-  if (existing) clearState(config);
+  if (existing) {
+    const cleanupWarnings = cleanupRecordingArtifacts(existing);
+    clearState(config);
+    if (cleanupWarnings.length) ctx.ui.notify(`Voice input cleanup warning:\n${cleanupWarnings.join("\n")}`, "warning");
+  }
-  ensureDir(config.recordingsDir);
-  ensureDir(config.logDir);
-  const outputPath = path.join(config.recordingsDir, `recording-${timestampForFilename()}.wav`);
-  const logPath = path.join(config.logDir, `recording-${timestampForFilename()}.log`);
-  const cmd = recorderCommand(config, outputPath);
+  const outputPath = createRecordingPath();
+  let cmd: string[];
+  try {
+    cmd = recorderCommand(config, outputPath);
+  } catch (error) {
+    cleanupRecordingArtifacts({ path: outputPath });
+    throw error;
+  }
   const deviceName = recordingDeviceName(config, cmd[0]);
   ctx.ui.setStatus("voice-input", ctx.ui.theme.fg("warning", "● starting mic"));
-  const logFd = openSync(logPath, "a");
-  const child = spawn(cmd[0], cmd.slice(1), {
-    detached: true,
-    stdio: ["ignore", logFd, logFd],
-  });
+  let child: ReturnType<typeof spawn>;
+  try {
+    child = spawn(cmd[0], cmd.slice(1), {
+      detached: true,
+      stdio: ["ignore", "ignore", "ignore"],
+    });
+  } catch (error) {
+    cleanupRecordingArtifacts({ path: outputPath });
+    throw error;
+  }
   child.unref();
-  closeSync(logFd);
-  if (!child.pid) throw new Error("Recorder failed to start: no pid returned");
+  if (!child.pid) {
+    cleanupRecordingArtifacts({ path: outputPath });
+    throw new Error("Recorder failed to start: no pid returned");
+  }
   writeState(config, {
     pid: child.pid,
     path: outputPath,
-    logPath,
     startedAt: new Date().toISOString(),
     recorderTarget: config.recorderTarget || undefined,
     deviceName,
@@ -966,21 +1107,41 @@ async function stopRecording(ctx: ExtensionContext, transcribe = true) {
   clearState(config);
   if (config.finalizeDelayMs > 0) await sleep(config.finalizeDelayMs);
-  if (!existsSync(state.path) || statSync(state.path).size === 0) {
-    const log = existsSync(state.logPath) ? readFileSync(state.logPath, "utf8") : "";
-    throw new Error(`Recording file missing/empty: ${state.path}\nRecorder log:\n${log}`);
-  }
   if (!transcribe) {
+    const cleanupWarnings = cleanupRecordingArtifacts(state);
     ctx.ui.setStatus("voice-input", undefined);
-    ctx.ui.notify(`Voice recording stopped: ${state.path}`, "info");
+    ctx.ui.notify(
+      cleanupWarnings.length
+        ? `Voice recording cancelled; local audio discard attempted, but cleanup had warnings:\n${cleanupWarnings.join("\n")}`
+        : "Voice recording cancelled; local audio discarded.",
+      cleanupWarnings.length ? "warning" : "info",
+    );
     return;
   }
+  if (!existsSync(state.path) || statSync(state.path).size === 0) {
+    const cleanupWarnings = cleanupRecordingArtifacts(state);
+    throw new Error(
+      `Recording file missing/empty: ${state.path}. Recorder output is not persisted for privacy.${
+        cleanupWarnings.length ? `\nCleanup warnings:\n${cleanupWarnings.join("\n")}` : ""
+      }`,
+    );
+  }
+  let decodeMs = 0;
+  let durationMs = 0;
+  let result: TranscriptionResult | undefined;
   const decodeStart = Date.now();
-  const { pcm, durationMs } = parseRecordedWav(state.path);
-  const decodeMs = Date.now() - decodeStart;
-  const result = await transcribePcm(pcm, durationMs, config);
+  try {
+    const recording = parseRecordedWav(state.path);
+    durationMs = recording.durationMs;
+    decodeMs = Date.now() - decodeStart;
+    result = await transcribePcm(recording.pcm, recording.durationMs, config);
+  } finally {
+    const cleanupWarnings = cleanupRecordingArtifacts(state);
+    if (cleanupWarnings.length) ctx.ui.notify(`Voice input cleanup warning:\n${cleanupWarnings.join("\n")}`, "warning");
+  }
+  if (!result) throw new Error("Transcription failed before a result was produced");
   if (!result.text.trim()) {
     ctx.ui.setStatus("voice-input", undefined);
@@ -1148,7 +1309,14 @@ export default function (pi: ExtensionAPI) {
   });
   pi.on("session_start", (_event, ctx) => {
-    if (getConfig().apiKey) {
+    const currentConfig = getConfig();
+    const cleanupWarnings = [
+      ...cleanupStaleRecordingState(currentConfig),
+      ...cleanupLegacyStoredArtifacts(currentConfig),
+    ];
+    if (cleanupWarnings.length) ctx.ui.notify(`Voice input cleanup warning:\n${cleanupWarnings.join("\n")}`, "warning");
+    if (currentConfig.apiKey) {
       ctx.ui.notify(`Voice input loaded: ${startupConfig.shortcut} toggles recording.`, "info");
       return;
     }
@@ -1156,7 +1324,7 @@ export default function (pi: ExtensionAPI) {
       [
         `Voice input loaded: ${startupConfig.shortcut} toggles recording.`,
         "API key is missing. Run /voice key to set it up, or edit the JSON config file.",
-        `Config file: ${startupConfig.configPath}`,
+        `Config file: ${currentConfig.configPath}`,
         `Get/create a VolcEngine Speech API key here: ${VOLC_API_KEY_URL}`,
       ].join("\n"),
       "warning",

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "pi-voice-input",
-  "version": "0.2.7",
+  "version": "0.2.8",
   "description": "Press Ctrl+Shift+R to dictate prompts into Pi using VolcEngine ASR",
   "type": "module",
   "keywords": [