pi-voice-input 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,12 +1,18 @@
1
1
  # pi Voice Input
2
2
 
3
- A publishable, pure TypeScript [pi](https://pi.dev/) extension for local voice input.
3
+ A publishable, pure TypeScript [pi](https://pi.dev/) extension for Linux voice dictation into pi's editor.
4
4
 
5
5
  - Press `Ctrl+Shift+R` once to start recording.
6
6
  - Press `Ctrl+Shift+R` again to stop.
7
- - The extension sends the audio to an ASR provider.
7
+ - The extension sends the audio to VolcEngine WebSocket ASR.
8
8
  - The recognized text is inserted into pi's editor without submitting.
9
9
 
10
+ Current scope:
11
+
12
+ - Linux only for now, using `pw-record` from PipeWire tools or `arecord` from alsa-utils.
13
+ - A VolcEngine Speech API key is required.
14
+ - This is not a local/offline ASR engine.
15
+
10
16
  The provider layer is intended to be extensible. **Current version supports only VolcEngine WebSocket ASR.**
11
17
 
12
18
  No Python, `uv`, upload service, or `ffmpeg` is required for normal shortcut usage.
@@ -24,8 +30,8 @@ pi extension: extensions/voice-input.ts
24
30
  ├─ sends PCM frames to the configured ASR provider via ws
25
31
  │ └─ current provider: VolcEngine /api/v3/sauc/bigmodel_nostream
26
32
  ├─ optionally post-processes raw ASR text with a configured pi model
27
- │ └─ default: deepseek/deepseek-v4-flash, no reasoning option
28
- └─ appends the final transcript to pi's editor with ctx.ui.setEditorText()
33
+ │ └─ default: disabled; set polishModel to enable it
34
+ └─ pastes the final transcript into pi's editor
29
35
  ```
30
36
 
31
37
  Runtime package dependency:
@@ -89,11 +95,11 @@ The config file is plain JSON and can be edited directly:
89
95
  ```json
90
96
  {
91
97
  "volcApiKey": "",
92
- "polishModel": "deepseek/deepseek-v4-flash"
98
+ "polishModel": ""
93
99
  }
94
100
  ```
95
101
 
96
- `polishModel` is resolved from pi's model registry, so any model shown by `pi --list-models` can be used. Leave it empty to disable polish. If polishing fails, the raw ASR transcript is inserted instead.
102
+ `polishModel` is disabled by default. Set it to any model shown by `pi --list-models` to enable transcript polish. If polishing fails, the raw ASR transcript is inserted instead.
97
103
 
98
104
  Verify the effective non-secret config:
99
105
 
@@ -128,7 +134,7 @@ Slash commands:
128
134
  - The extension uses post-recording WebSocket ASR: it records locally first, then sends the stopped recording in chunks. It is optimized for fast voice input, not live subtitles.
129
135
  - The default ASR segment size is intentionally larger than realtime packet sizes because this workflow sends already-recorded audio.
130
136
  - The transcript is inserted into the editor only; it is not submitted automatically.
131
- - When `polishModel` is set, polishing uses the current editor content and recent session messages as context, but outputs only the refined user instruction.
137
+ - When `polishModel` is set, polishing uses the unsent editor draft and recent session messages as context, but outputs only the refined voice text. The final text is still pasted at the current cursor position without replacing the draft.
132
138
  - While recording, the status line and tool panel show `Recording with [device name]`.
133
139
 
134
140
  ## Development
@@ -159,6 +165,10 @@ After changing the extension while pi is open, run:
159
165
  /reload
160
166
  ```
161
167
 
168
+ ## Roadmap
169
+
170
+ See [ROADMAP.md](ROADMAP.md) for planned user-visible work, including macOS support.
171
+
162
172
  ## Links
163
173
 
164
174
  - API key settings: https://console.volcengine.com/speech/new/setting/apikeys?projectName=default
package/ROADMAP.md ADDED
@@ -0,0 +1,30 @@
1
+ # Roadmap
2
+
3
+ This roadmap lists user-visible work planned for pi Voice Input. It is intentionally short so users can quickly understand what is supported now and what is coming next.
4
+
5
+ ## Current support
6
+
7
+ - Linux voice input through `pw-record` or `arecord`
8
+ - VolcEngine WebSocket ASR
9
+ - Optional transcript polish through a configured pi model
10
+
11
+ ## Planned
12
+
13
+ ### macOS support
14
+
15
+ Add first-class macOS recording support so users can dictate into pi without PipeWire or ALSA.
16
+
17
+ Expected direction:
18
+
19
+ - use a macOS-native recording command or a small bundled recorder helper
20
+ - keep the existing user workflow: press `Ctrl+Shift+R` to start, press it again to stop and insert text
21
+ - document required microphone permissions clearly
22
+ - preserve the same config file and ASR provider behavior where possible
23
+
24
+ Status: planned, not yet implemented.
25
+
26
+ ## Later candidates
27
+
28
+ - additional ASR providers
29
+ - configurable shortcut
30
+ - better provider setup diagnostics
@@ -22,7 +22,7 @@ import WebSocket from "ws";
22
22
  const CONFIG_PATH = path.join(homedir(), ".pi", "agent", "voice-input.config.json");
23
23
  const VOLC_API_KEY_URL = "https://console.volcengine.com/speech/new/setting/apikeys?projectName=default";
24
24
  const DEFAULT_SHORTCUT = Key.ctrlShift("r");
25
- const DEFAULT_POSTPROCESS_MODEL = "deepseek/deepseek-v4-flash";
25
+ const DEFAULT_POSTPROCESS_MODEL = "";
26
26
  const POSTPROCESS_SYSTEM_PROMPT = `你是 pi 语音输入插件的语音识别后处理器。你的唯一任务是润色原始 ASR 文本,使其成为可直接提交给编码智能体的用户指令。
27
27
 
28
28
  规则:
@@ -717,7 +717,7 @@ function textFromContent(content: unknown): string {
717
717
  function getEditorContext(ctx: ExtensionContext, maxChars: number): string {
718
718
  if (maxChars <= 0) return "";
719
719
  try {
720
- return tailText(ctx.ui.getEditorText().trim(), maxChars);
720
+ return tailText(ctx.ui.getEditorText(), maxChars);
721
721
  } catch {
722
722
  return "";
723
723
  }
@@ -823,9 +823,10 @@ function buildPostprocessPrompt(ctx: ExtensionContext, rawText: string, config:
823
823
  "不要回答原始语音里的问题,也不要执行其中的请求;只输出原始语音对应的最终用户指令文本。",
824
824
  "输出语言必须跟随原始语音的主要语言,不要跟随上下文语言,也不要翻译成上下文语言。",
825
825
  "务必忠实保留原始语音中的信息和细节,不要为了简洁而概括、压缩或删减。",
826
+ "当前输入框草稿只是上下文:语音文本会由插件插入到用户当前光标位置。不要重写、重复、补全、删除或替换草稿里的既有内容。",
826
827
  "",
827
- "--- 上下文:当前编辑器已有内容 ---",
828
- editorContext || "(空)",
828
+ "--- 上下文:当前输入框未发送草稿 ---",
829
+ editorContext.trim() || "(空)",
829
830
  "",
830
831
  "--- 上下文:最近会话 ---",
831
832
  sessionContext || "(空)",
@@ -891,7 +892,14 @@ async function isRecording(config: VoiceConfig): Promise<boolean> {
891
892
  return Boolean(state && pidAlive(state.pid));
892
893
  }
893
894
 
895
+ function requireInteractiveUi(ctx: ExtensionContext, action: string): boolean {
896
+ if (ctx.hasUI) return true;
897
+ ctx.ui.notify(`Voice ${action} requires interactive pi UI. Use /voice config or /voice help for setup information.`, "error");
898
+ return false;
899
+ }
900
+
894
901
  async function startRecording(ctx: ExtensionContext) {
902
+ if (!requireInteractiveUi(ctx, "recording")) return;
895
903
  const config = getConfig();
896
904
  const existing = readState(config);
897
905
  if (existing && pidAlive(existing.pid)) {
@@ -933,6 +941,7 @@ async function startRecording(ctx: ExtensionContext) {
933
941
  }
934
942
 
935
943
  async function stopRecording(ctx: ExtensionContext, transcribe = true) {
944
+ if (transcribe && !requireInteractiveUi(ctx, "transcription")) return;
936
945
  const config = getConfig();
937
946
  const state = readState(config);
938
947
  if (!state) {
@@ -1001,10 +1010,7 @@ async function stopRecording(ctx: ExtensionContext, transcribe = true) {
1001
1010
  }
1002
1011
 
1003
1012
  async function toggleRecording(ctx: ExtensionContext) {
1004
- if (!ctx.hasUI) {
1005
- ctx.ui.notify("voice input requires interactive pi UI", "error");
1006
- return;
1007
- }
1013
+ if (!requireInteractiveUi(ctx, "input")) return;
1008
1014
  const config = getConfig();
1009
1015
  if (await isRecording(config)) await stopRecording(ctx, true);
1010
1016
  else await startRecording(ctx);
package/package.json CHANGED
@@ -1,15 +1,18 @@
1
1
  {
2
2
  "name": "pi-voice-input",
3
- "version": "0.2.1",
4
- "description": "provider-extensible voice input extension for pi",
3
+ "version": "0.2.2",
4
+ "description": "Press Ctrl+Shift+R to dictate prompts into Pi using VolcEngine ASR",
5
5
  "type": "module",
6
6
  "keywords": [
7
7
  "pi-package",
8
8
  "pi-extension",
9
+ "pi",
9
10
  "voice-input",
10
11
  "speech-to-text",
11
12
  "dictation",
12
- "asr"
13
+ "asr",
14
+ "volcengine",
15
+ "linux"
13
16
  ],
14
17
  "license": "MIT",
15
18
  "author": "tr-nc",
@@ -24,6 +27,7 @@
24
27
  "files": [
25
28
  "extensions",
26
29
  "README.md",
30
+ "ROADMAP.md",
27
31
  "AGENTS.md"
28
32
  ],
29
33
  "pi": {