pi-voice-input 0.2.12 → 0.2.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +92 -128
- package/extensions/voice-input.ts +38 -10
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,193 +1,158 @@
|
|
|
1
1
|
# pi Voice Input
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Voice dictation for [pi](https://pi.dev/). Press one shortcut, speak naturally, and insert the transcript into the editor without sending the prompt automatically.
|
|
4
4
|
|
|
5
|
-
|
|
6
|
-
- Press `Ctrl+Shift+R` again to stop.
|
|
7
|
-
- The extension sends the audio to VolcEngine WebSocket ASR.
|
|
8
|
-
- The recognized text is inserted into pi's editor without submitting.
|
|
5
|
+
## Why use it?
|
|
9
6
|
|
|
10
|
-
|
|
7
|
+
Typing long prompts can slow you down. `pi-voice-input` lets you:
|
|
11
8
|
|
|
12
|
-
-
|
|
13
|
-
-
|
|
14
|
-
-
|
|
15
|
-
-
|
|
9
|
+
- capture ideas quickly while you are thinking out loud
|
|
10
|
+
- dictate long instructions, notes, bug reports, or code review comments
|
|
11
|
+
- speak naturally in Chinese, English, or a mix of both
|
|
12
|
+
- keep your hands on the keyboard with a simple toggle shortcut
|
|
13
|
+
- review or edit the inserted text before you submit it
|
|
14
|
+
- optionally polish dictated text with one of your configured pi models
|
|
16
15
|
|
|
17
|
-
|
|
16
|
+
## Features
|
|
18
17
|
|
|
19
|
-
|
|
18
|
+
- **One-key dictation**: `Ctrl+Shift+R` starts recording; press it again to stop and insert text.
|
|
19
|
+
- **Editor-safe workflow**: transcription is pasted into the current editor only. It does not auto-submit.
|
|
20
|
+
- **Chinese/English mixed input**: handles prompts that switch between Chinese, English, product names, and technical terms.
|
|
21
|
+
- **Works on Linux and macOS**: uses common system recording tools.
|
|
22
|
+
- **Lowers sound while you speak**: automatically turns down system audio during recording, then restores it afterwards.
|
|
23
|
+
- **Optional transcript polish**: use a pi model to clean up punctuation and wording before insertion.
|
|
24
|
+
- **Simple setup commands**: configure from inside pi with `/voice init` and `/voice key`.
|
|
20
25
|
|
|
21
|
-
|
|
26
|
+
Current speech provider: **VolcEngine Speech ASR**. A VolcEngine Speech API key is required.
|
|
22
27
|
|
|
23
|
-
|
|
24
|
-
pi extension: extensions/index.ts → extensions/voice-input.ts
|
|
25
|
-
├─ registers Ctrl+Shift+R and /voice commands
|
|
26
|
-
├─ starts/stops a local recorder process
|
|
27
|
-
│ ├─ Linux preferred: pw-record
|
|
28
|
-
│ ├─ Linux fallback: arecord
|
|
29
|
-
│ └─ macOS: afrecord, or ffmpeg/AVFoundation fallback
|
|
30
|
-
├─ ducks system output volume while the microphone is listening
|
|
31
|
-
├─ records a temporary 16 kHz mono 16-bit WAV
|
|
32
|
-
├─ parses the WAV container in TypeScript and extracts raw PCM
|
|
33
|
-
├─ sends PCM frames to the configured ASR provider via ws
|
|
34
|
-
│ └─ current provider: VolcEngine /api/v3/sauc/bigmodel_nostream
|
|
35
|
-
├─ optionally post-processes raw ASR text with a configured pi model
|
|
36
|
-
│ └─ default: disabled; set polishModel to enable it
|
|
37
|
-
└─ pastes the final transcript into pi's editor
|
|
38
|
-
```
|
|
39
|
-
|
|
40
|
-
Runtime package dependency:
|
|
41
|
-
|
|
42
|
-
- `ws`
|
|
43
|
-
|
|
44
|
-
System dependency, one of:
|
|
45
|
-
|
|
46
|
-
- Linux: `pw-record` from PipeWire tools, preferred
|
|
47
|
-
- Linux: `arecord` from alsa-utils, fallback
|
|
48
|
-
- macOS: `afrecord` when present, or `ffmpeg` from Homebrew (`brew install ffmpeg`) as the AVFoundation fallback
|
|
49
|
-
|
|
50
|
-
On macOS, grant Terminal, ffmpeg, or your pi host app microphone permission when prompted. If macOS has previously denied microphone access, enable it in System Settings → Privacy & Security → Microphone.
|
|
51
|
-
|
|
52
|
-
## Install / Update
|
|
53
|
-
|
|
54
|
-
Install the published package with pi:
|
|
28
|
+
## Install
|
|
55
29
|
|
|
56
30
|
```bash
|
|
57
31
|
pi install npm:pi-voice-input
|
|
58
32
|
```
|
|
59
33
|
|
|
60
|
-
Update
|
|
34
|
+
Update later with:
|
|
61
35
|
|
|
62
36
|
```bash
|
|
63
37
|
pi update npm:pi-voice-input
|
|
64
38
|
```
|
|
65
39
|
|
|
66
|
-
|
|
40
|
+
Restart pi after installing or updating.
|
|
67
41
|
|
|
68
|
-
##
|
|
42
|
+
## First-time setup
|
|
69
43
|
|
|
70
|
-
|
|
44
|
+
1. Install the extension:
|
|
71
45
|
|
|
72
|
-
|
|
46
|
+
```bash
|
|
47
|
+
pi install npm:pi-voice-input
|
|
48
|
+
```
|
|
73
49
|
|
|
74
|
-
|
|
50
|
+
2. Restart pi.
|
|
75
51
|
|
|
76
|
-
|
|
52
|
+
3. Create the local config:
|
|
77
53
|
|
|
78
|
-
|
|
79
|
-
|
|
54
|
+
```text
|
|
55
|
+
/voice init
|
|
56
|
+
```
|
|
80
57
|
|
|
81
|
-
|
|
58
|
+
4. Add your VolcEngine Speech API key:
|
|
82
59
|
|
|
83
|
-
|
|
60
|
+
```text
|
|
61
|
+
/voice key
|
|
62
|
+
```
|
|
84
63
|
|
|
85
|
-
|
|
86
|
-
~/.pi/agent/voice-input.config.json
|
|
87
|
-
```
|
|
64
|
+
Get your key here:
|
|
88
65
|
|
|
89
|
-
|
|
66
|
+
https://console.volcengine.com/speech/new/setting/apikeys?projectName=default
|
|
90
67
|
|
|
91
|
-
|
|
68
|
+
5. Check that pi sees your setup:
|
|
92
69
|
|
|
93
|
-
```text
|
|
94
|
-
/voice
|
|
95
|
-
```
|
|
70
|
+
```text
|
|
71
|
+
/voice config
|
|
72
|
+
```
|
|
96
73
|
|
|
97
|
-
|
|
74
|
+
6. Press `Ctrl+Shift+R`, speak, then press `Ctrl+Shift+R` again to insert the transcript.
|
|
75
|
+
|
|
76
|
+
## Use
|
|
77
|
+
|
|
78
|
+
Press:
|
|
98
79
|
|
|
99
80
|
```text
|
|
100
|
-
|
|
81
|
+
Ctrl+Shift+R
|
|
101
82
|
```
|
|
102
83
|
|
|
103
|
-
|
|
84
|
+
Then speak naturally in Chinese, English, or both. Press `Ctrl+Shift+R` again to stop recording. The recognized text appears in the editor at your cursor.
|
|
104
85
|
|
|
105
|
-
|
|
86
|
+
Useful commands:
|
|
106
87
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
}
|
|
88
|
+
```text
|
|
89
|
+
/voice start start recording
|
|
90
|
+
/voice stop stop, transcribe, and insert text
|
|
91
|
+
/voice toggle start or stop recording
|
|
92
|
+
/voice cancel stop and discard the recording
|
|
93
|
+
/voice status show current recorder state
|
|
94
|
+
/voice config show non-secret configuration
|
|
95
|
+
/voice key set or replace the API key
|
|
96
|
+
/voice help show setup help
|
|
117
97
|
```
|
|
118
98
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
`duckSystemVolume` is enabled by default. While recording, the extension lowers system output volume to `duckSystemVolumeFactor` of the original volume using a short ease-in/ease-out fade (`duckSystemVolumeFadeMs`), then restores the saved volume when recording stops or is cancelled. Linux uses `wpctl` or `pactl`; macOS uses `osascript`.
|
|
99
|
+
## Optional: polish dictated text
|
|
122
100
|
|
|
123
|
-
|
|
101
|
+
By default, pi inserts the raw transcript. To let a pi model clean up punctuation and wording, set `polishModel` in:
|
|
124
102
|
|
|
125
103
|
```text
|
|
126
|
-
/voice
|
|
104
|
+
~/.pi/agent/voice-input.config.json
|
|
127
105
|
```
|
|
128
106
|
|
|
129
|
-
|
|
107
|
+
Use any model name shown by:
|
|
130
108
|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
```text
|
|
134
|
-
Ctrl+Shift+R
|
|
109
|
+
```bash
|
|
110
|
+
pi --list-models
|
|
135
111
|
```
|
|
136
112
|
|
|
137
|
-
|
|
113
|
+
Example:
|
|
138
114
|
|
|
139
|
-
```
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
/voice status # show recorder state
|
|
145
|
-
/voice config # show effective non-secret config and whether API key is detected
|
|
146
|
-
/voice init # create or normalize ~/.pi/agent/voice-input.config.json
|
|
147
|
-
/voice key # prompt for and save the current provider API key
|
|
148
|
-
/voice help # show setup help, including the explicit VolcEngine API key URL
|
|
115
|
+
```json
|
|
116
|
+
{
|
|
117
|
+
"volcApiKey": "",
|
|
118
|
+
"polishModel": "your-model-name"
|
|
119
|
+
}
|
|
149
120
|
```
|
|
150
121
|
|
|
151
|
-
|
|
122
|
+
If polishing fails, the raw transcript is inserted instead.
|
|
152
123
|
|
|
153
|
-
|
|
154
|
-
- The default ASR segment size is intentionally larger than realtime packet sizes because this workflow sends already-recorded audio.
|
|
155
|
-
- The transcript is inserted into the editor only; it is not submitted automatically.
|
|
156
|
-
- Recorder stdout/stderr is not logged to disk, to avoid retaining potentially sensitive runtime data.
|
|
157
|
-
- On startup, legacy `~/.pi/agent/voice-input/recordings` and `~/.pi/agent/voice-input/logs` artifacts are cleaned up when they are not part of an active recording.
|
|
158
|
-
- When `polishModel` is set, polishing uses the unsent editor draft and recent session messages as context, but outputs only the refined voice text to insert at the current cursor. It must not reconstruct the full draft; the final text is pasted without replacing existing editor content.
|
|
159
|
-
- While recording, the status line shows `● Mic on: [device name] — press Ctrl+Shift+R again to stop/transcribe` in the current theme accent color; no separate popup is shown when recording starts.
|
|
160
|
-
- By default, system output volume is ducked to 50% of its previous level with a 300 ms ease-in/ease-out fade while the microphone is listening, then restored after recording stops.
|
|
124
|
+
## System requirements
|
|
161
125
|
|
|
162
|
-
|
|
126
|
+
Linux needs one recording tool:
|
|
163
127
|
|
|
164
|
-
|
|
128
|
+
- `pw-record` from PipeWire tools, recommended
|
|
129
|
+
- or `arecord` from alsa-utils
|
|
165
130
|
|
|
166
|
-
|
|
131
|
+
macOS uses the built-in recorder when available. If recording does not work, install ffmpeg:
|
|
167
132
|
|
|
168
133
|
```bash
|
|
169
|
-
|
|
170
|
-
cd pi-voice-input
|
|
171
|
-
npm install
|
|
134
|
+
brew install ffmpeg
|
|
172
135
|
```
|
|
173
136
|
|
|
174
|
-
|
|
137
|
+
On macOS, allow microphone access for your terminal or pi host app when prompted. You can also check System Settings → Privacy & Security → Microphone.
|
|
175
138
|
|
|
176
|
-
|
|
177
|
-
pi -e .
|
|
178
|
-
```
|
|
139
|
+
## Privacy notes
|
|
179
140
|
|
|
180
|
-
|
|
141
|
+
- Your API key is stored locally in `~/.pi/agent/voice-input.config.json`.
|
|
142
|
+
- Recordings are temporary local files and are removed after use; the recorded audio is sent to VolcEngine Speech ASR for transcription.
|
|
143
|
+
- Transcribed text is inserted into the editor so you can review it before submitting.
|
|
181
144
|
|
|
182
|
-
|
|
183
|
-
pi install .
|
|
184
|
-
```
|
|
145
|
+
## Troubleshooting
|
|
185
146
|
|
|
186
|
-
|
|
147
|
+
- Run `/voice status` to see whether recording is active.
|
|
148
|
+
- Run `/voice config` to confirm the API key is detected.
|
|
149
|
+
- Run `/voice key` again if the key was changed or expired.
|
|
150
|
+
- On macOS, check microphone permission if recording immediately fails.
|
|
151
|
+
- On Linux, make sure `pw-record` or `arecord` is installed and your microphone works in other apps.
|
|
187
152
|
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
153
|
+
## Development
|
|
154
|
+
|
|
155
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for contribution guidelines.
|
|
191
156
|
|
|
192
157
|
## Roadmap
|
|
193
158
|
|
|
@@ -196,5 +161,4 @@ See [ROADMAP.md](ROADMAP.md) for planned user-visible work.
|
|
|
196
161
|
## Links
|
|
197
162
|
|
|
198
163
|
- API key settings: https://console.volcengine.com/speech/new/setting/apikeys?projectName=default
|
|
199
|
-
- ASR
|
|
200
|
-
- WebSocket ASR docs: https://www.volcengine.com/docs/6561/1354869?lang=zh
|
|
164
|
+
- VolcEngine ASR: https://www.volcengine.com/product/asr
|
|
@@ -1091,22 +1091,46 @@ function cleanPostprocessOutput(output: string): string {
|
|
|
1091
1091
|
return text;
|
|
1092
1092
|
}
|
|
1093
1093
|
|
|
1094
|
+
/** Spoken English layout commands ("new line", "next paragraph", ...), matched case-insensitively on word boundaries. */
const EXPLICIT_ENGLISH_MULTILINE_PATTERN =
  /\b(?:new\s*line|newline|line break|next line|new paragraph|paragraph break|carriage return|press enter|separate lines?|multi[- ]line|multiple lines)\b/i;

/** Spoken Chinese layout commands (line break, new line, new paragraph, ...). */
const EXPLICIT_CHINESE_MULTILINE_PATTERN = /(?:换行|新的一行|另起一行|下一行|回车|分行|多行|逐行|每行|空一行|新段落|另起一段|分段)/u;

/** Any character written in a Han, Hiragana, Katakana, or Hangul script. */
const CJK_LIKE_PATTERN = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/u;

/**
 * Full-width / CJK punctuation only. ASCII punctuation is deliberately NOT in
 * this class: an ASCII comma or parenthesis in Latin text must still be
 * followed by a space when two lines are joined.
 *
 * The full-width forms are spelled as \u escapes so they cannot be silently
 * folded to their ASCII look-alikes by tooling that normalizes width.
 *   \uFF0C ,  \u3002 。  \uFF01 !  \uFF1F ?  \u3001 、  \uFF1B ;  \uFF1A :
 *   \uFF08 (  \uFF09 )  \u300A 《  \u300B 》  \u300C 「  \u300D 」
 *   \u300E 『  \u300F 』  \u201C “  \u201D ”  \u2018 ‘  \u2019 ’
 */
const CJK_PUNCTUATION_PATTERN =
  /[\uFF0C\u3002\uFF01\uFF1F\u3001\uFF1B\uFF1A\uFF08\uFF09\u300A\u300B\u300C\u300D\u300E\u300F\u201C\u201D\u2018\u2019]/u;

/** Text that STARTS with closing punctuation (ASCII or full-width); nothing should be inserted before it. */
const CLOSING_PUNCTUATION_PATTERN =
  /^[,.;:!?\uFF0C\u3002\uFF01\uFF1F\u3001\uFF1B\uFF1A)\uFF09\]}\u300B\u300D\u300F\u201D\u2019]/u;

/** Text that ENDS with opening punctuation (ASCII or full-width); nothing should be inserted after it. */
const OPENING_PUNCTUATION_PATTERN = /[(\uFF08\[{\u300A\u300C\u300E\u201C\u2018]$/u;
|
|
1101
|
+
|
|
1094
1102
|
/**
 * Reports whether the raw transcript contains a spoken layout command
 * (English or Chinese) asking for line or paragraph breaks.
 *
 * @param rawText Transcript text to inspect.
 * @returns `true` when either explicit multiline pattern matches.
 */
function rawTextRequestsMultiline(rawText: string): boolean {
  // Line breaks already present in the ASR output are ignored on purpose:
  // a provider may add segment or sentence breaks by itself, so they do not
  // prove the speaker wanted multiple lines. Only spoken layout commands count.
  const layoutCommandPatterns = [EXPLICIT_ENGLISH_MULTILINE_PATTERN, EXPLICIT_CHINESE_MULTILINE_PATTERN];
  return layoutCommandPatterns.some((pattern) => pattern.test(rawText));
}
|
|
1108
|
+
|
|
1109
|
+
/**
 * Chooses the text that replaces a removed line break, given the text on
 * either side of it.
 *
 * @param left Text immediately before the break ("" when there is none).
 * @param right Text immediately after the break ("" when there is none).
 * @returns `""` when the two sides should be glued together, otherwise `" "`.
 */
function lineBreakJoiner(left: string, right: string): string {
  const glueTogether =
    // Nothing on one side: the break sits at the start or end of the text.
    left.length === 0 ||
    right.length === 0 ||
    // No space before closing punctuation or after opening punctuation.
    CLOSING_PUNCTUATION_PATTERN.test(right) ||
    OPENING_PUNCTUATION_PATTERN.test(left) ||
    // CJK punctuation on either side needs no extra spacing.
    CJK_PUNCTUATION_PATTERN.test(left) ||
    CJK_PUNCTUATION_PATTERN.test(right) ||
    // CJK-like scripts do not separate adjacent characters with spaces.
    (CJK_LIKE_PATTERN.test(left) && CJK_LIKE_PATTERN.test(right));

  return glueTogether ? "" : " ";
}
|
|
1101
1116
|
|
|
1102
1117
|
/** Returns the whole code point that ends just before `index`, or "" at the start of `source`. */
function codePointBefore(source: string, index: number): string {
  if (index <= 0) return "";
  const lastUnit = source.charCodeAt(index - 1);
  const endsWithLowSurrogate = lastUnit >= 0xdc00 && lastUnit <= 0xdfff;
  if (endsWithLowSurrogate && index >= 2) {
    const previousUnit = source.charCodeAt(index - 2);
    // Keep a surrogate pair together so Unicode script tests see the real character.
    if (previousUnit >= 0xd800 && previousUnit <= 0xdbff) return source.slice(index - 2, index);
  }
  return source.slice(index - 1, index);
}

/** Returns the whole code point that starts at `index`, or "" when `index` is at or past the end. */
function codePointAfter(source: string, index: number): string {
  const codePoint = source.codePointAt(index);
  return codePoint === undefined ? "" : String.fromCodePoint(codePoint);
}

/**
 * Flattens text onto a single line.
 *
 * CR and CRLF are first normalized to LF. Each line break is then replaced by
 * whatever `lineBreakJoiner` returns for the characters on either side of it.
 * Finally, runs of horizontal whitespace collapse to one space and the result
 * is trimmed.
 *
 * A "line break" here is the newline plus all horizontal whitespace and
 * further newlines around it, so blank or whitespace-only lines count as ONE
 * break. Otherwise the joiner would be asked about a neighbouring "\n" rather
 * than the real adjacent character, leaving a stray space between CJK
 * characters separated by an empty line.
 *
 * @param text Text that may contain unwanted line breaks.
 * @returns The single-line form of `text`.
 */
function collapseUnexpectedLineBreaks(text: string): string {
  const normalized = text.replace(/\r\n?/g, "\n");
  return normalized
    .replace(/[ \t\f\v]*\n[ \t\f\v\n]*/g, (match: string, offset: number, source: string) => {
      // Neighbours are taken as full code points (not UTF-16 units) so that
      // supplementary-plane characters are classified correctly by the joiner.
      const left = codePointBefore(source, offset);
      const right = codePointAfter(source, offset + match.length);
      return lineBreakJoiner(left, right);
    })
    .replace(/[ \t\f\v]{2,}/g, " ")
    .trim();
}
|
|
1109
1128
|
|
|
1129
|
+
/**
 * Prepares the raw transcript before it is placed into the post-processing prompt.
 *
 * @param rawText Raw transcript text.
 * @returns The trimmed text unchanged when it contains a spoken multiline
 *   command; otherwise the trimmed text with its line breaks collapsed.
 */
function normalizeRawTextForPostprocess(rawText: string): string {
  const trimmed = rawText.trim();
  if (rawTextRequestsMultiline(trimmed)) {
    // The speaker asked for line breaks, so the layout is left as-is.
    return trimmed;
  }
  return collapseUnexpectedLineBreaks(trimmed);
}
|
|
1133
|
+
|
|
1110
1134
|
function preserveExpectedPostprocessLayout(rawText: string, output: string): string {
|
|
1111
1135
|
if (rawTextRequestsMultiline(rawText)) return output.trim();
|
|
1112
1136
|
return collapseUnexpectedLineBreaks(output);
|
|
@@ -1193,7 +1217,7 @@ async function postprocessTranscript(ctx: ExtensionContext, rawText: string, con
|
|
|
1193
1217
|
messages: [
|
|
1194
1218
|
{
|
|
1195
1219
|
role: "user",
|
|
1196
|
-
content: buildPostprocessPrompt(ctx, raw, config),
|
|
1220
|
+
content: buildPostprocessPrompt(ctx, normalizeRawTextForPostprocess(raw), config),
|
|
1197
1221
|
timestamp: Date.now(),
|
|
1198
1222
|
},
|
|
1199
1223
|
],
|
|
@@ -1370,6 +1394,7 @@ async function stopRecording(ctx: ExtensionContext, transcribe = true) {
|
|
|
1370
1394
|
|
|
1371
1395
|
let finalText = result.text;
|
|
1372
1396
|
let postprocessMs = 0;
|
|
1397
|
+
let postprocessSucceeded = false;
|
|
1373
1398
|
let postprocessUsed = false;
|
|
1374
1399
|
if (config.postprocessEnabled) {
|
|
1375
1400
|
ctx.ui.setStatus("voice-input", ctx.ui.theme.fg("warning", "● polishing"));
|
|
@@ -1377,7 +1402,7 @@ async function stopRecording(ctx: ExtensionContext, transcribe = true) {
|
|
|
1377
1402
|
try {
|
|
1378
1403
|
finalText = await postprocessTranscript(ctx, result.text, config);
|
|
1379
1404
|
postprocessMs = Date.now() - postprocessStart;
|
|
1380
|
-
|
|
1405
|
+
postprocessSucceeded = true;
|
|
1381
1406
|
} catch (error) {
|
|
1382
1407
|
postprocessMs = Date.now() - postprocessStart;
|
|
1383
1408
|
ctx.ui.notify(
|
|
@@ -1387,6 +1412,9 @@ async function stopRecording(ctx: ExtensionContext, transcribe = true) {
|
|
|
1387
1412
|
}
|
|
1388
1413
|
}
|
|
1389
1414
|
|
|
1415
|
+
finalText = preserveExpectedPostprocessLayout(result.text, finalText);
|
|
1416
|
+
postprocessUsed = postprocessSucceeded && finalText.trim() !== result.text.trim();
|
|
1417
|
+
|
|
1390
1418
|
ctx.ui.setStatus("voice-input", undefined);
|
|
1391
1419
|
insertIntoEditor(ctx, finalText);
|
|
1392
1420
|
ctx.ui.notify(
|