pi-voice-input 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -7
- package/ROADMAP.md +30 -0
- package/extensions/voice-input.ts +28 -20
- package/package.json +7 -3
package/README.md
CHANGED
|
@@ -1,12 +1,18 @@
|
|
|
1
1
|
# pi Voice Input
|
|
2
2
|
|
|
3
|
-
A publishable, pure TypeScript [pi](https://pi.dev/) extension for
|
|
3
|
+
A publishable, pure TypeScript [pi](https://pi.dev/) extension for Linux voice dictation into pi's editor.
|
|
4
4
|
|
|
5
5
|
- Press `Ctrl+Shift+R` once to start recording.
|
|
6
6
|
- Press `Ctrl+Shift+R` again to stop.
|
|
7
|
-
- The extension sends the audio to
|
|
7
|
+
- The extension sends the audio to VolcEngine WebSocket ASR.
|
|
8
8
|
- The recognized text is inserted into pi's editor without submitting.
|
|
9
9
|
|
|
10
|
+
Current scope:
|
|
11
|
+
|
|
12
|
+
- Linux only for now, using `pw-record` from PipeWire tools or `arecord` from alsa-utils.
|
|
13
|
+
- A VolcEngine Speech API key is required.
|
|
14
|
+
- This is not a local/offline ASR engine.
|
|
15
|
+
|
|
10
16
|
The provider layer is intended to be extensible. **Current version supports only VolcEngine WebSocket ASR.**
|
|
11
17
|
|
|
12
18
|
No Python, `uv`, upload service, or `ffmpeg` is required for normal shortcut usage.
|
|
@@ -24,8 +30,8 @@ pi extension: extensions/voice-input.ts
|
|
|
24
30
|
├─ sends PCM frames to the configured ASR provider via ws
|
|
25
31
|
│ └─ current provider: VolcEngine /api/v3/sauc/bigmodel_nostream
|
|
26
32
|
├─ optionally post-processes raw ASR text with a configured pi model
|
|
27
|
-
│ └─ default:
|
|
28
|
-
└─
|
|
33
|
+
│ └─ default: disabled; set polishModel to enable it
|
|
34
|
+
└─ pastes the final transcript into pi's editor
|
|
29
35
|
```
|
|
30
36
|
|
|
31
37
|
Runtime package dependency:
|
|
@@ -89,11 +95,11 @@ The config file is plain JSON and can be edited directly:
|
|
|
89
95
|
```json
|
|
90
96
|
{
|
|
91
97
|
"volcApiKey": "",
|
|
92
|
-
"polishModel": "
|
|
98
|
+
"polishModel": ""
|
|
93
99
|
}
|
|
94
100
|
```
|
|
95
101
|
|
|
96
|
-
`polishModel` is
|
|
102
|
+
`polishModel` is disabled by default. Set it to any model shown by `pi --list-models` to enable transcript polish. If polishing fails, the raw ASR transcript is inserted instead.
|
|
97
103
|
|
|
98
104
|
Verify the effective non-secret config:
|
|
99
105
|
|
|
@@ -128,7 +134,7 @@ Slash commands:
|
|
|
128
134
|
- The extension uses post-recording WebSocket ASR: it records locally first, then sends the stopped recording in chunks. It is optimized for fast voice input, not live subtitles.
|
|
129
135
|
- The default ASR segment size is intentionally larger than realtime packet sizes because this workflow sends already-recorded audio.
|
|
130
136
|
- The transcript is inserted into the editor only; it is not submitted automatically.
|
|
131
|
-
- When `polishModel` is set, polishing uses the
|
|
137
|
+
- When `polishModel` is set, polishing uses the unsent editor draft and recent session messages as context, but outputs only the refined voice text. The final text is still pasted at the current cursor position without replacing the draft.
|
|
132
138
|
- While recording, the status line and tool panel show `Recording with [device name]`.
|
|
133
139
|
|
|
134
140
|
## Development
|
|
@@ -159,6 +165,10 @@ After changing the extension while pi is open, run:
|
|
|
159
165
|
/reload
|
|
160
166
|
```
|
|
161
167
|
|
|
168
|
+
## Roadmap
|
|
169
|
+
|
|
170
|
+
See [ROADMAP.md](ROADMAP.md) for planned user-visible work, including macOS support.
|
|
171
|
+
|
|
162
172
|
## Links
|
|
163
173
|
|
|
164
174
|
- API key settings: https://console.volcengine.com/speech/new/setting/apikeys?projectName=default
|
package/ROADMAP.md
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Roadmap
|
|
2
|
+
|
|
3
|
+
This roadmap lists user-visible work planned for pi Voice Input. It is intentionally short so users can quickly understand what is supported now and what is coming next.
|
|
4
|
+
|
|
5
|
+
## Current support
|
|
6
|
+
|
|
7
|
+
- Linux voice input through `pw-record` or `arecord`
|
|
8
|
+
- VolcEngine WebSocket ASR
|
|
9
|
+
- Optional transcript polish through a configured pi model
|
|
10
|
+
|
|
11
|
+
## Planned
|
|
12
|
+
|
|
13
|
+
### macOS support
|
|
14
|
+
|
|
15
|
+
Add first-class macOS recording support so users can dictate into pi without PipeWire or ALSA.
|
|
16
|
+
|
|
17
|
+
Expected direction:
|
|
18
|
+
|
|
19
|
+
- use a macOS-native recording command or a small bundled recorder helper
|
|
20
|
+
- keep the existing user workflow: press `Ctrl+Shift+R` to start, press it again to stop and insert text
|
|
21
|
+
- document required microphone permissions clearly
|
|
22
|
+
- preserve the same config file and ASR provider behavior where possible
|
|
23
|
+
|
|
24
|
+
Status: planned, not yet implemented.
|
|
25
|
+
|
|
26
|
+
## Later candidates
|
|
27
|
+
|
|
28
|
+
- additional ASR providers
|
|
29
|
+
- configurable shortcut
|
|
30
|
+
- better provider setup diagnostics
|
package/extensions/voice-input.ts
CHANGED
|
@@ -22,17 +22,19 @@ import WebSocket from "ws";
|
|
|
22
22
|
const CONFIG_PATH = path.join(homedir(), ".pi", "agent", "voice-input.config.json");
|
|
23
23
|
const VOLC_API_KEY_URL = "https://console.volcengine.com/speech/new/setting/apikeys?projectName=default";
|
|
24
24
|
const DEFAULT_SHORTCUT = Key.ctrlShift("r");
|
|
25
|
-
const DEFAULT_POSTPROCESS_MODEL = "
|
|
26
|
-
const POSTPROCESS_SYSTEM_PROMPT = `你是 pi
|
|
25
|
+
const DEFAULT_POSTPROCESS_MODEL = "";
|
|
26
|
+
const POSTPROCESS_SYSTEM_PROMPT = `你是 pi 语音输入插件的语音识别后处理器。你的唯一任务是润色原始 ASR 文本,使其成为可直接提交给编码智能体的用户指令。
|
|
27
27
|
|
|
28
28
|
规则:
|
|
29
|
-
-
|
|
29
|
+
- 只输出润色后的用户指令正文,不要输出解释、标题、前后缀、引号、代码围栏或寒暄。
|
|
30
|
+
- 绝对不要回答、执行或解决用户语音中提出的问题;即使原始语音是问题,也只能把这个问题本身整理成清晰文本,不要给出答案、方案、代码或结论。
|
|
31
|
+
- 以忠实保留用户信息为最高优先级。不要一味概括、压缩或简述;不要删除条件、约束、例子、数值、文件名、错误信息、多个请求、前后顺序或语气重点。
|
|
30
32
|
- 结合上下文理解省略指代、当前任务、文件/项目名称和用户意图;上下文仅用于理解,不要重复上下文内容,除非原始语音明确要求引用或修改它。
|
|
31
33
|
- 修正明显的语音识别错误、同音/近音错误、断句和标点错误;保留代码标识符、命令、路径、URL、模型名、包名和专有名词。
|
|
32
34
|
- 如果用户口误后自我更正(例如“不是……是……”“不对……”“算了改成……”),只保留更正后的正确指令,删除错误说法和更正过程。
|
|
33
|
-
-
|
|
35
|
+
- 让结果完整、符合逻辑、指令明确、有指导性;必要时拆成条目或步骤,但不得丢失原始信息。
|
|
34
36
|
- 不要凭空添加原始语音没有表达的新需求;不确定时保留原意并用更清晰的措辞表达。
|
|
35
|
-
-
|
|
37
|
+
- 输出语言必须跟随用户原始语音的主要语言,而不是上下文语言;不要因为上下文是中文/英文就把用户语音翻译成上下文语言。`;
|
|
36
38
|
|
|
37
39
|
const MSG_TYPE_CLIENT_FULL_REQUEST = 0b0001;
|
|
38
40
|
const MSG_TYPE_CLIENT_AUDIO_ONLY_REQUEST = 0b0010;
|
|
@@ -715,7 +717,7 @@ function textFromContent(content: unknown): string {
|
|
|
715
717
|
function getEditorContext(ctx: ExtensionContext, maxChars: number): string {
|
|
716
718
|
if (maxChars <= 0) return "";
|
|
717
719
|
try {
|
|
718
|
-
return tailText(ctx.ui.getEditorText()
|
|
720
|
+
return tailText(ctx.ui.getEditorText(), maxChars);
|
|
719
721
|
} catch {
|
|
720
722
|
return "";
|
|
721
723
|
}
|
|
@@ -816,12 +818,15 @@ function buildPostprocessPrompt(ctx: ExtensionContext, rawText: string, config:
|
|
|
816
818
|
const sessionContext = getRecentSessionContext(ctx, Math.ceil(contextBudget / 2));
|
|
817
819
|
|
|
818
820
|
return [
|
|
819
|
-
"
|
|
820
|
-
"
|
|
821
|
-
"
|
|
821
|
+
"请根据上下文只润色下面的原始语音识别结果。",
|
|
822
|
+
"如果上下文为空,直接依据原始文本润色。",
|
|
823
|
+
"不要回答原始语音里的问题,也不要执行其中的请求;只输出原始语音对应的最终用户指令文本。",
|
|
824
|
+
"输出语言必须跟随原始语音的主要语言,不要跟随上下文语言,也不要翻译成上下文语言。",
|
|
825
|
+
"务必忠实保留原始语音中的信息和细节,不要为了简洁而概括、压缩或删减。",
|
|
826
|
+
"当前输入框草稿只是上下文:语音文本会由插件插入到用户当前光标位置。不要重写、重复、补全、删除或替换草稿里的既有内容。",
|
|
822
827
|
"",
|
|
823
|
-
"---
|
|
824
|
-
editorContext || "(空)",
|
|
828
|
+
"--- 上下文:当前输入框未发送草稿 ---",
|
|
829
|
+
editorContext.trim() || "(空)",
|
|
825
830
|
"",
|
|
826
831
|
"--- 上下文:最近会话 ---",
|
|
827
832
|
sessionContext || "(空)",
|
|
@@ -876,12 +881,10 @@ async function postprocessTranscript(ctx: ExtensionContext, rawText: string, con
|
|
|
876
881
|
return polished || rawText;
|
|
877
882
|
}
|
|
878
883
|
|
|
879
|
-
function
|
|
884
|
+
function insertIntoEditor(ctx: ExtensionContext, text: string) {
|
|
880
885
|
const trimmed = text.trim();
|
|
881
886
|
if (!trimmed) return;
|
|
882
|
-
|
|
883
|
-
const separator = current.trim().length > 0 && !current.endsWith("\n") ? "\n" : "";
|
|
884
|
-
ctx.ui.setEditorText(`${current}${separator}${trimmed}`);
|
|
887
|
+
ctx.ui.pasteToEditor(trimmed);
|
|
885
888
|
}
|
|
886
889
|
|
|
887
890
|
async function isRecording(config: VoiceConfig): Promise<boolean> {
|
|
@@ -889,7 +892,14 @@ async function isRecording(config: VoiceConfig): Promise<boolean> {
|
|
|
889
892
|
return Boolean(state && pidAlive(state.pid));
|
|
890
893
|
}
|
|
891
894
|
|
|
895
|
+
function requireInteractiveUi(ctx: ExtensionContext, action: string): boolean {
|
|
896
|
+
if (ctx.hasUI) return true;
|
|
897
|
+
ctx.ui.notify(`Voice ${action} requires interactive pi UI. Use /voice config or /voice help for setup information.`, "error");
|
|
898
|
+
return false;
|
|
899
|
+
}
|
|
900
|
+
|
|
892
901
|
async function startRecording(ctx: ExtensionContext) {
|
|
902
|
+
if (!requireInteractiveUi(ctx, "recording")) return;
|
|
893
903
|
const config = getConfig();
|
|
894
904
|
const existing = readState(config);
|
|
895
905
|
if (existing && pidAlive(existing.pid)) {
|
|
@@ -931,6 +941,7 @@ async function startRecording(ctx: ExtensionContext) {
|
|
|
931
941
|
}
|
|
932
942
|
|
|
933
943
|
async function stopRecording(ctx: ExtensionContext, transcribe = true) {
|
|
944
|
+
if (transcribe && !requireInteractiveUi(ctx, "transcription")) return;
|
|
934
945
|
const config = getConfig();
|
|
935
946
|
const state = readState(config);
|
|
936
947
|
if (!state) {
|
|
@@ -989,7 +1000,7 @@ async function stopRecording(ctx: ExtensionContext, transcribe = true) {
|
|
|
989
1000
|
}
|
|
990
1001
|
|
|
991
1002
|
ctx.ui.setStatus("voice-input", undefined);
|
|
992
|
-
|
|
1003
|
+
insertIntoEditor(ctx, finalText);
|
|
993
1004
|
ctx.ui.notify(
|
|
994
1005
|
`Voice text inserted. audio=${(durationMs / 1000).toFixed(2)}s decode=${decodeMs}ms asr=${result.timings.totalMs}ms${
|
|
995
1006
|
config.postprocessEnabled ? ` postprocess=${postprocessMs}ms${postprocessUsed ? " polished" : ""}` : ""
|
|
@@ -999,10 +1010,7 @@ async function stopRecording(ctx: ExtensionContext, transcribe = true) {
|
|
|
999
1010
|
}
|
|
1000
1011
|
|
|
1001
1012
|
async function toggleRecording(ctx: ExtensionContext) {
|
|
1002
|
-
if (!ctx
|
|
1003
|
-
ctx.ui.notify("voice input requires interactive pi UI", "error");
|
|
1004
|
-
return;
|
|
1005
|
-
}
|
|
1013
|
+
if (!requireInteractiveUi(ctx, "input")) return;
|
|
1006
1014
|
const config = getConfig();
|
|
1007
1015
|
if (await isRecording(config)) await stopRecording(ctx, true);
|
|
1008
1016
|
else await startRecording(ctx);
|
package/package.json
CHANGED
|
@@ -1,15 +1,18 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pi-voice-input",
|
|
3
|
-
"version": "0.2.
|
|
4
|
-
"description": "
|
|
3
|
+
"version": "0.2.2",
|
|
4
|
+
"description": "Press Ctrl+Shift+R to dictate prompts into Pi using VolcEngine ASR",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"keywords": [
|
|
7
7
|
"pi-package",
|
|
8
8
|
"pi-extension",
|
|
9
|
+
"pi",
|
|
9
10
|
"voice-input",
|
|
10
11
|
"speech-to-text",
|
|
11
12
|
"dictation",
|
|
12
|
-
"asr"
|
|
13
|
+
"asr",
|
|
14
|
+
"volcengine",
|
|
15
|
+
"linux"
|
|
13
16
|
],
|
|
14
17
|
"license": "MIT",
|
|
15
18
|
"author": "tr-nc",
|
|
@@ -24,6 +27,7 @@
|
|
|
24
27
|
"files": [
|
|
25
28
|
"extensions",
|
|
26
29
|
"README.md",
|
|
30
|
+
"ROADMAP.md",
|
|
27
31
|
"AGENTS.md"
|
|
28
32
|
],
|
|
29
33
|
"pi": {
|