pi-voice-input 0.2.11 → 0.2.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +92 -121
- package/extensions/voice-input.ts +197 -14
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,186 +1,158 @@
|
|
|
1
1
|
# pi Voice Input
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Voice dictation for [pi](https://pi.dev/). Press one shortcut, speak naturally, and insert the transcript into the editor without sending the prompt automatically.
|
|
4
4
|
|
|
5
|
-
|
|
6
|
-
- Press `Ctrl+Shift+R` again to stop.
|
|
7
|
-
- The extension sends the audio to VolcEngine WebSocket ASR.
|
|
8
|
-
- The recognized text is inserted into pi's editor without submitting.
|
|
5
|
+
## Why use it?
|
|
9
6
|
|
|
10
|
-
|
|
7
|
+
Typing long prompts can slow you down. `pi-voice-input` lets you:
|
|
11
8
|
|
|
12
|
-
-
|
|
13
|
-
-
|
|
14
|
-
-
|
|
15
|
-
-
|
|
9
|
+
- capture ideas quickly while you are thinking out loud
|
|
10
|
+
- dictate long instructions, notes, bug reports, or code review comments
|
|
11
|
+
- speak naturally in Chinese, English, or a mix of both
|
|
12
|
+
- keep your hands on the keyboard with a simple toggle shortcut
|
|
13
|
+
- review or edit the inserted text before you submit it
|
|
14
|
+
- optionally polish dictated text with one of your configured pi models
|
|
16
15
|
|
|
17
|
-
|
|
16
|
+
## Features
|
|
18
17
|
|
|
19
|
-
|
|
18
|
+
- **One-key dictation**: `Ctrl+Shift+R` starts recording; press it again to stop and insert text.
|
|
19
|
+
- **Editor-safe workflow**: transcription is pasted into the current editor only. It does not auto-submit.
|
|
20
|
+
- **Chinese/English mixed input**: handles prompts that switch between Chinese, English, product names, and technical terms.
|
|
21
|
+
- **Works on Linux and macOS**: uses common system recording tools.
|
|
22
|
+
- **Lowers sound while you speak**: automatically turns down system audio during recording, then restores it afterwards.
|
|
23
|
+
- **Optional transcript polish**: use a pi model to clean up punctuation and wording before insertion.
|
|
24
|
+
- **Simple setup commands**: configure from inside pi with `/voice init` and `/voice key`.
|
|
20
25
|
|
|
21
|
-
|
|
26
|
+
Current speech provider: **VolcEngine Speech ASR**. A VolcEngine Speech API key is required.
|
|
22
27
|
|
|
23
|
-
|
|
24
|
-
pi extension: extensions/index.ts → extensions/voice-input.ts
|
|
25
|
-
├─ registers Ctrl+Shift+R and /voice commands
|
|
26
|
-
├─ starts/stops a local recorder process
|
|
27
|
-
│ ├─ Linux preferred: pw-record
|
|
28
|
-
│ ├─ Linux fallback: arecord
|
|
29
|
-
│ └─ macOS: afrecord, or ffmpeg/AVFoundation fallback
|
|
30
|
-
├─ records a temporary 16 kHz mono 16-bit WAV
|
|
31
|
-
├─ parses the WAV container in TypeScript and extracts raw PCM
|
|
32
|
-
├─ sends PCM frames to the configured ASR provider via ws
|
|
33
|
-
│ └─ current provider: VolcEngine /api/v3/sauc/bigmodel_nostream
|
|
34
|
-
├─ optionally post-processes raw ASR text with a configured pi model
|
|
35
|
-
│ └─ default: disabled; set polishModel to enable it
|
|
36
|
-
└─ pastes the final transcript into pi's editor
|
|
37
|
-
```
|
|
38
|
-
|
|
39
|
-
Runtime package dependency:
|
|
40
|
-
|
|
41
|
-
- `ws`
|
|
42
|
-
|
|
43
|
-
System dependency, one of:
|
|
44
|
-
|
|
45
|
-
- Linux: `pw-record` from PipeWire tools, preferred
|
|
46
|
-
- Linux: `arecord` from alsa-utils, fallback
|
|
47
|
-
- macOS: `afrecord` when present, or `ffmpeg` from Homebrew (`brew install ffmpeg`) as the AVFoundation fallback
|
|
48
|
-
|
|
49
|
-
On macOS, grant Terminal, ffmpeg, or your pi host app microphone permission when prompted. If macOS has previously denied microphone access, enable it in System Settings → Privacy & Security → Microphone.
|
|
50
|
-
|
|
51
|
-
## Install / Update
|
|
52
|
-
|
|
53
|
-
Install the published package with pi:
|
|
28
|
+
## Install
|
|
54
29
|
|
|
55
30
|
```bash
|
|
56
31
|
pi install npm:pi-voice-input
|
|
57
32
|
```
|
|
58
33
|
|
|
59
|
-
Update
|
|
34
|
+
Update later with:
|
|
60
35
|
|
|
61
36
|
```bash
|
|
62
37
|
pi update npm:pi-voice-input
|
|
63
38
|
```
|
|
64
39
|
|
|
65
|
-
|
|
40
|
+
Restart pi after installing or updating.
|
|
66
41
|
|
|
67
|
-
##
|
|
42
|
+
## First-time setup
|
|
68
43
|
|
|
69
|
-
|
|
44
|
+
1. Install the extension:
|
|
70
45
|
|
|
71
|
-
|
|
46
|
+
```bash
|
|
47
|
+
pi install npm:pi-voice-input
|
|
48
|
+
```
|
|
72
49
|
|
|
73
|
-
|
|
50
|
+
2. Restart pi.
|
|
74
51
|
|
|
75
|
-
|
|
52
|
+
3. Create the local config:
|
|
76
53
|
|
|
77
|
-
|
|
78
|
-
|
|
54
|
+
```text
|
|
55
|
+
/voice init
|
|
56
|
+
```
|
|
79
57
|
|
|
80
|
-
|
|
58
|
+
4. Add your VolcEngine Speech API key:
|
|
81
59
|
|
|
82
|
-
|
|
60
|
+
```text
|
|
61
|
+
/voice key
|
|
62
|
+
```
|
|
83
63
|
|
|
84
|
-
|
|
85
|
-
~/.pi/agent/voice-input.config.json
|
|
86
|
-
```
|
|
64
|
+
Get your key here:
|
|
87
65
|
|
|
88
|
-
|
|
66
|
+
https://console.volcengine.com/speech/new/setting/apikeys?projectName=default
|
|
89
67
|
|
|
90
|
-
|
|
68
|
+
5. Check that pi sees your setup:
|
|
91
69
|
|
|
92
|
-
```text
|
|
93
|
-
/voice
|
|
94
|
-
```
|
|
70
|
+
```text
|
|
71
|
+
/voice config
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
6. Press `Ctrl+Shift+R`, speak, then press `Ctrl+Shift+R` again to insert the transcript.
|
|
75
|
+
|
|
76
|
+
## Use
|
|
95
77
|
|
|
96
|
-
|
|
78
|
+
Press:
|
|
97
79
|
|
|
98
80
|
```text
|
|
99
|
-
|
|
81
|
+
Ctrl+Shift+R
|
|
100
82
|
```
|
|
101
83
|
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
https://console.volcengine.com/speech/new/setting/apikeys?projectName=default
|
|
84
|
+
Then speak naturally in Chinese, English, or both. Press `Ctrl+Shift+R` again to stop recording. The recognized text appears in the editor at your cursor.
|
|
105
85
|
|
|
106
|
-
|
|
86
|
+
Useful commands:
|
|
107
87
|
|
|
108
|
-
```
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
88
|
+
```text
|
|
89
|
+
/voice start start recording
|
|
90
|
+
/voice stop stop, transcribe, and insert text
|
|
91
|
+
/voice toggle start or stop recording
|
|
92
|
+
/voice cancel stop and discard the recording
|
|
93
|
+
/voice status show current recorder state
|
|
94
|
+
/voice config show non-secret configuration
|
|
95
|
+
/voice key set or replace the API key
|
|
96
|
+
/voice help show setup help
|
|
113
97
|
```
|
|
114
98
|
|
|
115
|
-
|
|
99
|
+
## Optional: polish dictated text
|
|
116
100
|
|
|
117
|
-
|
|
101
|
+
By default, pi inserts the raw transcript. To let a pi model clean up punctuation and wording, set `polishModel` in:
|
|
118
102
|
|
|
119
103
|
```text
|
|
120
|
-
/voice
|
|
104
|
+
~/.pi/agent/voice-input.config.json
|
|
121
105
|
```
|
|
122
106
|
|
|
123
|
-
|
|
107
|
+
Use any model name shown by:
|
|
124
108
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
```text
|
|
128
|
-
Ctrl+Shift+R
|
|
109
|
+
```bash
|
|
110
|
+
pi --list-models
|
|
129
111
|
```
|
|
130
112
|
|
|
131
|
-
|
|
113
|
+
Example:
|
|
132
114
|
|
|
133
|
-
```
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
/voice status # show recorder state
|
|
139
|
-
/voice config # show effective non-secret config and whether API key is detected
|
|
140
|
-
/voice init # create or normalize ~/.pi/agent/voice-input.config.json
|
|
141
|
-
/voice key # prompt for and save the current provider API key
|
|
142
|
-
/voice help # show setup help, including the explicit VolcEngine API key URL
|
|
115
|
+
```json
|
|
116
|
+
{
|
|
117
|
+
"volcApiKey": "",
|
|
118
|
+
"polishModel": "your-model-name"
|
|
119
|
+
}
|
|
143
120
|
```
|
|
144
121
|
|
|
145
|
-
|
|
122
|
+
If polishing fails, the raw transcript is inserted instead.
|
|
146
123
|
|
|
147
|
-
|
|
148
|
-
- The default ASR segment size is intentionally larger than realtime packet sizes because this workflow sends already-recorded audio.
|
|
149
|
-
- The transcript is inserted into the editor only; it is not submitted automatically.
|
|
150
|
-
- Recorder stdout/stderr is not logged to disk, to avoid retaining potentially sensitive runtime data.
|
|
151
|
-
- On startup, legacy `~/.pi/agent/voice-input/recordings` and `~/.pi/agent/voice-input/logs` artifacts are cleaned up when they are not part of an active recording.
|
|
152
|
-
- When `polishModel` is set, polishing uses the unsent editor draft and recent session messages as context, but outputs only the refined voice text to insert at the current cursor. It must not reconstruct the full draft; the final text is pasted without replacing existing editor content.
|
|
153
|
-
- While recording, the status line shows `● Mic on: [device name] — press Ctrl+Shift+R again to stop/transcribe` in the current theme accent color; no separate popup is shown when recording starts.
|
|
124
|
+
## System requirements
|
|
154
125
|
|
|
155
|
-
|
|
126
|
+
Linux needs one recording tool:
|
|
156
127
|
|
|
157
|
-
|
|
128
|
+
- `pw-record` from PipeWire tools, recommended
|
|
129
|
+
- or `arecord` from alsa-utils
|
|
158
130
|
|
|
159
|
-
|
|
131
|
+
macOS uses the built-in recorder when available. If recording does not work, install ffmpeg:
|
|
160
132
|
|
|
161
133
|
```bash
|
|
162
|
-
|
|
163
|
-
cd pi-voice-input
|
|
164
|
-
npm install
|
|
134
|
+
brew install ffmpeg
|
|
165
135
|
```
|
|
166
136
|
|
|
167
|
-
|
|
137
|
+
On macOS, allow microphone access for your terminal or pi host app when prompted. You can also check System Settings → Privacy & Security → Microphone.
|
|
168
138
|
|
|
169
|
-
|
|
170
|
-
pi -e .
|
|
171
|
-
```
|
|
139
|
+
## Privacy notes
|
|
172
140
|
|
|
173
|
-
|
|
141
|
+
- Your API key is stored locally in `~/.pi/agent/voice-input.config.json`.
|
|
142
|
+
- Recordings are temporary and are removed after use.
|
|
143
|
+
- Transcribed text is inserted into the editor so you can review it before submitting.
|
|
174
144
|
|
|
175
|
-
|
|
176
|
-
pi install .
|
|
177
|
-
```
|
|
145
|
+
## Troubleshooting
|
|
178
146
|
|
|
179
|
-
|
|
147
|
+
- Run `/voice status` to see whether recording is active.
|
|
148
|
+
- Run `/voice config` to confirm the API key is detected.
|
|
149
|
+
- Run `/voice key` again if the key was changed or expired.
|
|
150
|
+
- On macOS, check microphone permission if recording immediately fails.
|
|
151
|
+
- On Linux, make sure `pw-record` or `arecord` is installed and your microphone works in other apps.
|
|
180
152
|
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
153
|
+
## Development
|
|
154
|
+
|
|
155
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for contribution guidelines.
|
|
184
156
|
|
|
185
157
|
## Roadmap
|
|
186
158
|
|
|
@@ -189,5 +161,4 @@ See [ROADMAP.md](ROADMAP.md) for planned user-visible work.
|
|
|
189
161
|
## Links
|
|
190
162
|
|
|
191
163
|
- API key settings: https://console.volcengine.com/speech/new/setting/apikeys?projectName=default
|
|
192
|
-
- ASR
|
|
193
|
-
- WebSocket ASR docs: https://www.volcengine.com/docs/6561/1354869?lang=zh
|
|
164
|
+
- VolcEngine ASR: https://www.volcengine.com/product/asr
|
|
@@ -61,6 +61,9 @@ type JsonObject = Record<string, unknown>;
|
|
|
61
61
|
/**
 * Shape of the user-editable JSON config file after normalization.
 *
 * - volcApiKey: VolcEngine Speech API key (trimmed string; defaults to "").
 * - polishModel: model name used to polish transcripts; leaving it empty disables polish.
 * - duckSystemVolume: whether system output volume is lowered while recording (defaults to true).
 * - duckSystemVolumeFactor: multiplier applied to the original volume, clamped to 0..1 (defaults to 0.5).
 * - duckSystemVolumeFadeMs: fade duration in milliseconds, clamped to 0..3000 and rounded (defaults to 300).
 */
type VoiceInputConfigFile = {
|
|
62
62
|
volcApiKey: string;
|
|
63
63
|
polishModel: string;
|
|
64
|
+
duckSystemVolume: boolean;
|
|
65
|
+
duckSystemVolumeFactor: number;
|
|
66
|
+
duckSystemVolumeFadeMs: number;
|
|
64
67
|
};
|
|
65
68
|
|
|
66
69
|
type VoiceConfig = {
|
|
@@ -86,6 +89,17 @@ type VoiceConfig = {
|
|
|
86
89
|
postprocessTimeoutMs: number;
|
|
87
90
|
postprocessMaxTokens: number;
|
|
88
91
|
postprocessContextChars: number;
|
|
92
|
+
duckSystemVolume: boolean;
|
|
93
|
+
duckSystemVolumeFactor: number;
|
|
94
|
+
duckSystemVolumeFadeMs: number;
|
|
95
|
+
};
|
|
96
|
+
|
|
97
|
+
/**
 * Snapshot describing one volume-ducking session; stored on the recording
 * state so the original volume can be restored when recording stops.
 *
 * - provider: which tool reads/sets the volume (osascript on macOS, or wpctl / pactl on Linux).
 * - originalVolumePercent: output volume read before ducking, in percent.
 * - duckedVolumePercent: originalVolumePercent multiplied by factor.
 * - factor: the configured duckSystemVolumeFactor at the time of the snapshot.
 * - fadeMs: the configured duckSystemVolumeFadeMs at the time of the snapshot.
 */
type SystemVolumeDuckingState = {
|
|
98
|
+
provider: "macos" | "wpctl" | "pactl";
|
|
99
|
+
originalVolumePercent: number;
|
|
100
|
+
duckedVolumePercent: number;
|
|
101
|
+
factor: number;
|
|
102
|
+
fadeMs: number;
|
|
89
103
|
};
|
|
90
104
|
|
|
91
105
|
type RecordingState = {
|
|
@@ -95,6 +109,7 @@ type RecordingState = {
|
|
|
95
109
|
startedAt: string;
|
|
96
110
|
recorderTarget?: string;
|
|
97
111
|
deviceName?: string;
|
|
112
|
+
systemVolume?: SystemVolumeDuckingState;
|
|
98
113
|
};
|
|
99
114
|
|
|
100
115
|
type DecodedFrame = {
|
|
@@ -124,6 +139,9 @@ function defaultConfigFile(): VoiceInputConfigFile {
|
|
|
124
139
|
return {
|
|
125
140
|
volcApiKey: "",
|
|
126
141
|
polishModel: DEFAULT_POSTPROCESS_MODEL,
|
|
142
|
+
duckSystemVolume: true,
|
|
143
|
+
duckSystemVolumeFactor: 0.5,
|
|
144
|
+
duckSystemVolumeFadeMs: 300,
|
|
127
145
|
};
|
|
128
146
|
}
|
|
129
147
|
|
|
@@ -136,12 +154,29 @@ function stringField(source: JsonObject, name: string, fallback: string): string
|
|
|
136
154
|
return typeof value === "string" ? value : fallback;
|
|
137
155
|
}
|
|
138
156
|
|
|
157
|
+
/**
 * Reads a boolean property from a parsed JSON object.
 *
 * @param source - Object to read from.
 * @param name - Property key to look up.
 * @param fallback - Value returned when the property is missing or is not a boolean.
 * @returns The stored boolean, or `fallback`.
 */
function booleanField(source: JsonObject, name: string, fallback: boolean): boolean {
  const candidate = source[name];
  if (typeof candidate !== "boolean") return fallback;
  return candidate;
}
|
|
161
|
+
|
|
162
|
+
/**
 * Reads a finite numeric property from a parsed JSON object.
 *
 * @param source - Object to read from.
 * @param name - Property key to look up.
 * @param fallback - Value returned when the property is missing, not a number,
 *   or not finite (NaN / ±Infinity).
 * @returns The stored number, or `fallback`.
 */
function numberField(source: JsonObject, name: string, fallback: number): number {
  const candidate = source[name];
  if (typeof candidate === "number" && Number.isFinite(candidate)) return candidate;
  return fallback;
}
|
|
166
|
+
|
|
167
|
+
/**
 * Restricts `value` to the inclusive range `[min, max]`.
 *
 * The lower bound is applied first and the upper bound last, so when
 * `min > max` the result is `max`. NaN propagates unchanged.
 */
function clamp(value: number, min: number, max: number): number {
  const raisedToMin = Math.max(min, value);
  return Math.min(max, raisedToMin);
}
|
|
170
|
+
|
|
139
171
|
/**
 * Converts arbitrary parsed JSON into a fully populated config-file object.
 *
 * Non-object input is treated as an empty object. Every field falls back to
 * the value from `defaultConfigFile()` when it is missing or has the wrong
 * type; string fields are trimmed, the ducking factor is clamped to 0..1 and
 * the fade duration is clamped to 0..3000 ms and rounded to an integer.
 *
 * @param input - Parsed config content of unknown shape.
 * @returns A normalized `VoiceInputConfigFile`.
 */
function normalizeConfigFile(input: unknown): VoiceInputConfigFile {
  const defaults = defaultConfigFile();
  const root = isObject(input) ? input : {};

  const volcApiKey = stringField(root, "volcApiKey", defaults.volcApiKey).trim();
  const polishModel = stringField(root, "polishModel", defaults.polishModel).trim();
  const duckSystemVolume = booleanField(root, "duckSystemVolume", defaults.duckSystemVolume);

  const rawFactor = numberField(root, "duckSystemVolumeFactor", defaults.duckSystemVolumeFactor);
  const duckSystemVolumeFactor = clamp(rawFactor, 0, 1);

  const rawFadeMs = numberField(root, "duckSystemVolumeFadeMs", defaults.duckSystemVolumeFadeMs);
  const duckSystemVolumeFadeMs = Math.round(clamp(rawFadeMs, 0, 3000));

  return { volcApiKey, polishModel, duckSystemVolume, duckSystemVolumeFactor, duckSystemVolumeFadeMs };
}
|
|
147
182
|
|
|
@@ -188,6 +223,9 @@ function getConfig(): VoiceConfig {
|
|
|
188
223
|
postprocessTimeoutMs: 30000,
|
|
189
224
|
postprocessMaxTokens: 2048,
|
|
190
225
|
postprocessContextChars: 6000,
|
|
226
|
+
duckSystemVolume: fileConfig.duckSystemVolume,
|
|
227
|
+
duckSystemVolumeFactor: fileConfig.duckSystemVolumeFactor,
|
|
228
|
+
duckSystemVolumeFadeMs: fileConfig.duckSystemVolumeFadeMs,
|
|
191
229
|
};
|
|
192
230
|
}
|
|
193
231
|
|
|
@@ -218,6 +256,111 @@ function commandOutput(command: string, args: string[], timeoutMs = 1500): strin
|
|
|
218
256
|
return (result.stdout || "").trim();
|
|
219
257
|
}
|
|
220
258
|
|
|
259
|
+
/**
 * Runs a command synchronously with all stdio discarded.
 *
 * @param command - Executable to run.
 * @param args - Arguments passed to the executable.
 * @param timeoutMs - Maximum run time in milliseconds (default 1500).
 * @returns `true` only when the process exits with status 0; any other
 *   outcome (non-zero exit, spawn failure, timeout) yields `false`.
 */
function runCommand(command: string, args: string[], timeoutMs = 1500): boolean {
  const { status } = spawnSync(command, args, { stdio: "ignore", timeout: timeoutMs });
  return status === 0;
}
|
|
262
|
+
|
|
263
|
+
/**
 * Formats a percentage with at most two decimal places and no trailing zeros
 * (for example `50`, `33.33`, `0.5`).
 */
function formatPercent(value: number): string {
  const rounded = Number(value.toFixed(2));
  return String(rounded);
}
|
|
266
|
+
|
|
267
|
+
/**
 * Reads the current system output volume using the first available tool.
 *
 * - macOS: `osascript` (result clamped to 0..100).
 * - Linux: `wpctl` first, then `pactl` as a fallback.
 * - Any other platform: unsupported.
 *
 * @returns The provider that answered together with the volume in percent,
 *   or `null` when no supported tool produced a usable value.
 */
function readSystemOutputVolume(): Pick<SystemVolumeDuckingState, "provider" | "originalVolumePercent"> | null {
  const os = platform();

  if (os === "darwin") {
    if (!commandExists("osascript")) return null;
    const output = commandOutput("osascript", ["-e", "output volume of (get volume settings)"]).trim();
    // Number("") is 0, so empty output (e.g. the command failed or timed out)
    // must be rejected explicitly. Otherwise 0 would be recorded as the
    // "original" volume and restoring it later would mute the system.
    if (output === "") return null;
    const volume = Number(output);
    return Number.isFinite(volume) ? { provider: "macos", originalVolumePercent: clamp(volume, 0, 100) } : null;
  }

  if (os !== "linux") return null;

  if (commandExists("wpctl")) {
    const output = commandOutput("wpctl", ["get-volume", "@DEFAULT_AUDIO_SINK@"]);
    // wpctl reports a fraction ("Volume: 0.50"); convert it to percent.
    const match = output.match(/Volume:\s*([0-9.]+)/);
    const volume = match ? Number(match[1]) * 100 : NaN;
    if (Number.isFinite(volume)) return { provider: "wpctl", originalVolumePercent: Math.max(0, volume) };
  }

  if (commandExists("pactl")) {
    const output = commandOutput("pactl", ["get-sink-volume", "@DEFAULT_SINK@"]);
    // Only the first percentage in the output is used.
    const match = output.match(/([0-9]+(?:\.[0-9]+)?)%/);
    const volume = match ? Number(match[1]) : NaN;
    if (Number.isFinite(volume)) return { provider: "pactl", originalVolumePercent: Math.max(0, volume) };
  }

  return null;
}
|
|
293
|
+
|
|
294
|
+
/**
 * Sets the system output volume through the tool named by `state.provider`.
 *
 * macOS values are clamped to 0..100 and rounded to an integer; the Linux
 * tools receive a non-negative percentage with up to two decimals. Any
 * provider other than "macos" or "wpctl" is handled with `pactl`.
 *
 * @param state - Object carrying the provider to use.
 * @param volumePercent - Target volume in percent.
 * @returns `true` when the underlying command exited successfully.
 */
function setSystemOutputVolume(state: Pick<SystemVolumeDuckingState, "provider">, volumePercent: number): boolean {
  switch (state.provider) {
    case "macos": {
      const level = Math.round(clamp(volumePercent, 0, 100));
      return runCommand("osascript", ["-e", `set volume output volume ${level}`]);
    }
    case "wpctl": {
      const percent = formatPercent(Math.max(0, volumePercent));
      return runCommand("wpctl", ["set-volume", "@DEFAULT_AUDIO_SINK@", `${percent}%`]);
    }
    default: {
      const percent = formatPercent(Math.max(0, volumePercent));
      return runCommand("pactl", ["set-sink-volume", "@DEFAULT_SINK@", `${percent}%`]);
    }
  }
}
|
|
306
|
+
|
|
307
|
+
/**
 * Cosine ease-in-out curve.
 *
 * @param t - Progress; values outside 0..1 are clamped to that range.
 * @returns Eased progress, 0 at `t = 0` and 1 at `t = 1`.
 */
function easeInOut(t: number): number {
  const progress = Math.min(1, Math.max(0, t));
  const halfCosine = Math.cos(Math.PI * progress) / 2;
  return 0.5 - halfCosine;
}
|
|
310
|
+
|
|
311
|
+
/**
 * Moves the system output volume from `fromPercent` to `toPercent`, either in
 * one step or as an eased fade.
 *
 * The volume is set directly when `fadeMs` is not a positive finite number or
 * when the two volumes differ by less than 0.1 percentage points. Otherwise
 * the change is spread over 2..20 steps (roughly one per 30 ms), pausing
 * `fadeMs / steps` between steps.
 *
 * @param state - Object carrying the volume provider to use.
 * @param fromPercent - Volume the fade starts from.
 * @param toPercent - Volume the fade ends at.
 * @param fadeMs - Requested fade duration in milliseconds.
 * @returns `null` on success, or an error message as soon as one volume
 *   change fails (the volume is then left wherever the fade had reached).
 */
async function fadeSystemOutputVolume(
  state: Pick<SystemVolumeDuckingState, "provider">,
  fromPercent: number,
  toPercent: number,
  fadeMs: number,
): Promise<string | null> {
  const failure = "failed to set system output volume";

  // A NaN duration would make the step count NaN, skip the loop entirely and
  // report success without ever touching the volume; an infinite duration
  // would stall on the first pause. Both take the immediate path instead.
  const canFade = Number.isFinite(fadeMs) && fadeMs > 0;
  if (!canFade || Math.abs(fromPercent - toPercent) < 0.1) {
    return setSystemOutputVolume(state, toPercent) ? null : failure;
  }

  const steps = Math.max(2, Math.min(20, Math.round(fadeMs / 30)));
  const intervalMs = fadeMs / steps;
  for (let step = 1; step <= steps; step += 1) {
    const eased = easeInOut(step / steps);
    const volume = fromPercent + (toPercent - fromPercent) * eased;
    if (!setSystemOutputVolume(state, volume)) return failure;
    // No pause after the final step: the target has been reached.
    if (step < steps) await sleep(intervalMs);
  }
  return null;
}
|
|
331
|
+
|
|
332
|
+
/**
 * Prepares the ducking snapshot for a new recording.
 *
 * @param config - Effective voice configuration.
 * @returns An empty object when ducking is disabled or the factor is 1 or
 *   more; `{ warning }` when no volume could be read; otherwise `{ state }`
 *   holding the current volume, the ducked target (current × factor) and the
 *   configured factor and fade duration.
 */
function createSystemVolumeDuckingState(config: VoiceConfig): { state?: SystemVolumeDuckingState; warning?: string } {
  const { duckSystemVolume, duckSystemVolumeFactor: factor, duckSystemVolumeFadeMs: fadeMs } = config;
  if (!duckSystemVolume) return {};
  if (factor >= 1) return {};

  const snapshot = readSystemOutputVolume();
  if (snapshot === null) {
    return { warning: "system output volume ducking is enabled, but no supported volume control was found" };
  }

  const state: SystemVolumeDuckingState = {
    ...snapshot,
    duckedVolumePercent: snapshot.originalVolumePercent * factor,
    factor,
    fadeMs,
  };
  return { state };
}
|
|
346
|
+
|
|
347
|
+
/**
 * Fades the system output volume from the original level down to the ducked
 * level described by `state`.
 *
 * @param state - Ducking snapshot; when absent nothing happens.
 * @returns `null` on success or when there is no state, otherwise a warning
 *   message describing the failure.
 */
async function applySystemVolumeDucking(state?: SystemVolumeDuckingState): Promise<string | null> {
  if (!state) return null;

  const { originalVolumePercent, duckedVolumePercent, fadeMs } = state;
  const failure = await fadeSystemOutputVolume(state, originalVolumePercent, duckedVolumePercent, fadeMs);
  if (!failure) return null;
  return `system output volume ducking failed: ${failure}`;
}
|
|
352
|
+
|
|
353
|
+
/**
 * Fades the system output volume from the ducked level back up to the
 * original level recorded in `state`.
 *
 * @param state - Ducking snapshot; when absent nothing happens.
 * @returns `null` on success or when there is no state, otherwise a warning
 *   message describing the failure.
 */
async function restoreSystemOutputVolume(state?: SystemVolumeDuckingState): Promise<string | null> {
  if (!state) return null;

  const { duckedVolumePercent, originalVolumePercent, fadeMs } = state;
  const failure = await fadeSystemOutputVolume(state, duckedVolumePercent, originalVolumePercent, fadeMs);
  if (!failure) return null;
  return `system output volume restore failed: ${failure}`;
}
|
|
358
|
+
|
|
359
|
+
/**
 * Synchronously sets the system output volume back to the original level
 * recorded in `state`, without fading.
 *
 * @param state - Ducking snapshot; when absent nothing happens.
 * @returns `null` on success or when there is no state, otherwise a warning
 *   message.
 */
function restoreSystemOutputVolumeNow(state?: SystemVolumeDuckingState): string | null {
  if (!state) return null;

  const restored = setSystemOutputVolume(state, state.originalVolumePercent);
  if (restored) return null;
  return "system output volume restore failed";
}
|
|
363
|
+
|
|
221
364
|
function selectRecorderExecutable(): string {
|
|
222
365
|
if (platform() === "darwin") {
|
|
223
366
|
if (commandExists("afrecord")) return "afrecord";
|
|
@@ -948,22 +1091,46 @@ function cleanPostprocessOutput(output: string): string {
|
|
|
948
1091
|
return text;
|
|
949
1092
|
}
|
|
950
1093
|
|
|
1094
|
+
// Spoken English layout commands ("new line", "next line", "new paragraph", ...).
const EXPLICIT_ENGLISH_MULTILINE_PATTERN =
  /\b(?:new\s*line|newline|line break|next line|new paragraph|paragraph break|carriage return|press enter|separate lines?|multi[- ]line|multiple lines)\b/i;

// Spoken Chinese layout commands (line break, new line, new paragraph, ...).
const EXPLICIT_CHINESE_MULTILINE_PATTERN = /(?:换行|新的一行|另起一行|下一行|回车|分行|多行|逐行|每行|空一行|新段落|另起一段|分段)/u;

// Any Han, Hiragana, Katakana or Hangul character.
const CJK_LIKE_PATTERN = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/u;

// Fullwidth / CJK punctuation only. The fullwidth forms are written as \u
// escapes so they cannot be silently folded into their ASCII look-alikes.
// ASCII punctuation is deliberately NOT part of this class: English text
// still needs a space after "," or "?" when a line break is collapsed.
//   U+FF0C ,  U+3002 。  U+FF01 !  U+FF1F ?  U+3001 、  U+FF1B ;  U+FF1A :
//   U+FF08 (  U+FF09 )  U+300A 《  U+300B 》  U+300C 「  U+300D 」
//   U+300E 『  U+300F 』  U+201C “  U+201D ”  U+2018 ‘  U+2019 ’
const CJK_PUNCTUATION_PATTERN =
  /[\uFF0C\u3002\uFF01\uFF1F\u3001\uFF1B\uFF1A\uFF08\uFF09\u300A\u300B\u300C\u300D\u300E\u300F\u201C\u201D\u2018\u2019]/u;

// Text that STARTS with closing punctuation (ASCII or fullwidth/CJK): nothing
// is inserted in front of it.
const CLOSING_PUNCTUATION_PATTERN =
  /^[,.;:!?\uFF0C\u3002\uFF01\uFF1F\u3001\uFF1B\uFF1A)\uFF09\]}\u300B\u300D\u300F\u201D\u2019]/u;

// Text that ENDS with opening punctuation (ASCII or fullwidth/CJK): nothing
// is inserted after it.
// NOTE(review): U+FF3B (fullwidth "[") is assumed to be intended alongside the
// ASCII bracket — confirm against the upstream source.
const OPENING_PUNCTUATION_PATTERN = /[(\uFF08\[\uFF3B{\u300A\u300C\u300E\u201C\u2018]$/u;
|
|
1101
|
+
|
|
951
1102
|
/**
 * Tells whether the speaker explicitly asked for a multi-line layout.
 *
 * Newlines already present in raw ASR text are ignored on purpose: providers
 * may insert segment or sentence breaks by themselves. Only spoken layout
 * commands (English or Chinese) count as intent.
 *
 * @param rawText - Raw transcript text.
 * @returns `true` when a spoken layout command is found.
 */
function rawTextRequestsMultiline(rawText: string): boolean {
  const layoutCommandPatterns = [EXPLICIT_ENGLISH_MULTILINE_PATTERN, EXPLICIT_CHINESE_MULTILINE_PATTERN];
  return layoutCommandPatterns.some((pattern) => pattern.test(rawText));
}
|
|
1108
|
+
|
|
1109
|
+
/**
 * Chooses what replaces a removed line break: nothing or a single space.
 *
 * @param left - Character immediately before the break ("" at the start of the text).
 * @param right - Character immediately after the break ("" at the end of the text).
 * @returns "" when either side is empty, when `right` starts with closing
 *   punctuation, when `left` ends with opening punctuation, when either side
 *   is CJK punctuation, or when both sides are CJK-like characters;
 *   otherwise " ".
 */
function lineBreakJoiner(left: string, right: string): string {
  if (left.length === 0 || right.length === 0) return "";

  const joinsTightly =
    CLOSING_PUNCTUATION_PATTERN.test(right) ||
    OPENING_PUNCTUATION_PATTERN.test(left) ||
    CJK_PUNCTUATION_PATTERN.test(left) ||
    CJK_PUNCTUATION_PATTERN.test(right) ||
    (CJK_LIKE_PATTERN.test(left) && CJK_LIKE_PATTERN.test(right));

  return joinsTightly ? "" : " ";
}
|
|
958
1116
|
|
|
959
1117
|
/**
 * Flattens text onto one line.
 *
 * CR and CRLF are normalized to LF, every run of line breaks (with the
 * horizontal whitespace around it) is replaced by whatever `lineBreakJoiner`
 * picks for the neighbouring characters, repeated horizontal whitespace is
 * squeezed to one space, and the result is trimmed.
 *
 * Neighbouring characters are read as whole Unicode code points, so CJK
 * characters outside the Basic Multilingual Plane are classified correctly
 * instead of being seen as lone surrogate halves. They are located by index
 * rather than by slicing the whole prefix/suffix for every match.
 *
 * @param text - Text that may contain unwanted line breaks.
 * @returns The single-line text.
 */
function collapseUnexpectedLineBreaks(text: string): string {
  const isHorizontalSpace = (ch: string): boolean => ch === " " || ch === "\t" || ch === "\f" || ch === "\v";

  // Last full code point that ends before `end`, skipping trailing horizontal whitespace.
  const codePointBefore = (source: string, end: number): string => {
    let index = end;
    while (index > 0 && isHorizontalSpace(source.charAt(index - 1))) index -= 1;
    if (index === 0) return "";
    const last = source.charCodeAt(index - 1);
    const isLowSurrogate = last >= 0xdc00 && last <= 0xdfff;
    if (isLowSurrogate && index >= 2) {
      const previous = source.charCodeAt(index - 2);
      if (previous >= 0xd800 && previous <= 0xdbff) return source.slice(index - 2, index);
    }
    return source.charAt(index - 1);
  };

  // First full code point at or after `start`, skipping leading horizontal whitespace.
  const codePointAfter = (source: string, start: number): string => {
    let index = start;
    while (index < source.length && isHorizontalSpace(source.charAt(index))) index += 1;
    const codePoint = source.codePointAt(index);
    return codePoint === undefined ? "" : String.fromCodePoint(codePoint);
  };

  return text
    .replace(/\r\n?/g, "\n")
    .replace(/[ \t\f\v]*\n+[ \t\f\v]*/g, (match: string, offset: number, source: string) =>
      lineBreakJoiner(codePointBefore(source, offset), codePointAfter(source, offset + match.length)),
    )
    .replace(/[ \t\f\v]{2,}/g, " ")
    .trim();
}
|
|
966
1128
|
|
|
1129
|
+
/**
 * Prepares raw transcript text for the polish prompt.
 *
 * The text is trimmed; unless the speaker explicitly asked for a multi-line
 * layout, its line breaks are collapsed as well.
 *
 * @param rawText - Raw transcript text.
 * @returns The trimmed (and possibly single-line) text.
 */
function normalizeRawTextForPostprocess(rawText: string): string {
  const trimmed = rawText.trim();
  if (rawTextRequestsMultiline(trimmed)) return trimmed;
  return collapseUnexpectedLineBreaks(trimmed);
}
|
|
1133
|
+
|
|
967
1134
|
function preserveExpectedPostprocessLayout(rawText: string, output: string): string {
|
|
968
1135
|
if (rawTextRequestsMultiline(rawText)) return output.trim();
|
|
969
1136
|
return collapseUnexpectedLineBreaks(output);
|
|
@@ -1050,7 +1217,7 @@ async function postprocessTranscript(ctx: ExtensionContext, rawText: string, con
|
|
|
1050
1217
|
messages: [
|
|
1051
1218
|
{
|
|
1052
1219
|
role: "user",
|
|
1053
|
-
content: buildPostprocessPrompt(ctx, raw, config),
|
|
1220
|
+
content: buildPostprocessPrompt(ctx, normalizeRawTextForPostprocess(raw), config),
|
|
1054
1221
|
timestamp: Date.now(),
|
|
1055
1222
|
},
|
|
1056
1223
|
],
|
|
@@ -1092,9 +1259,10 @@ async function isRecording(config: VoiceConfig): Promise<boolean> {
|
|
|
1092
1259
|
/**
 * Cleans up after a recording whose recorder process is no longer alive.
 *
 * When a saved state exists and its process is gone, the system volume is
 * restored immediately from the saved snapshot, the recording artifacts are
 * removed and the saved state is cleared.
 *
 * @param config - Effective voice configuration.
 * @returns Warnings collected along the way; empty when there was nothing to
 *   clean up or everything succeeded.
 */
function cleanupStaleRecordingState(config: VoiceConfig): string[] {
  const state = readState(config);
  if (!state) return [];
  if (pidAlive(state.pid)) return [];

  const volumeWarning = restoreSystemOutputVolumeNow(state.systemVolume);
  const artifactWarnings = cleanupRecordingArtifacts(state);
  clearState(config);

  const warnings: string[] = [];
  if (volumeWarning) warnings.push(volumeWarning);
  for (const warning of artifactWarnings) {
    if (warning) warnings.push(warning);
  }
  return warnings;
}
|
|
1099
1267
|
|
|
1100
1268
|
function requireInteractiveUi(ctx: ExtensionContext, action: string): boolean {
|
|
@@ -1128,6 +1296,7 @@ async function startRecording(ctx: ExtensionContext) {
|
|
|
1128
1296
|
throw error;
|
|
1129
1297
|
}
|
|
1130
1298
|
const deviceName = recordingDeviceName(config, cmd[0]);
|
|
1299
|
+
const volumeDucking = createSystemVolumeDuckingState(config);
|
|
1131
1300
|
|
|
1132
1301
|
ctx.ui.setStatus("voice-input", ctx.ui.theme.fg("warning", "● starting mic"));
|
|
1133
1302
|
let child: ReturnType<typeof spawn>;
|
|
@@ -1152,7 +1321,11 @@ async function startRecording(ctx: ExtensionContext) {
|
|
|
1152
1321
|
startedAt: new Date().toISOString(),
|
|
1153
1322
|
recorderTarget: config.recorderTarget || undefined,
|
|
1154
1323
|
deviceName,
|
|
1324
|
+
systemVolume: volumeDucking.state,
|
|
1155
1325
|
});
|
|
1326
|
+
if (volumeDucking.warning) ctx.ui.notify(`Voice input warning: ${volumeDucking.warning}`, "warning");
|
|
1327
|
+
const duckingWarning = await applySystemVolumeDucking(volumeDucking.state);
|
|
1328
|
+
if (duckingWarning) ctx.ui.notify(`Voice input warning: ${duckingWarning}`, "warning");
|
|
1156
1329
|
|
|
1157
1330
|
ctx.ui.setStatus("voice-input", ctx.ui.theme.fg("accent", recordingStatusText(deviceName)));
|
|
1158
1331
|
}
|
|
@@ -1169,7 +1342,9 @@ async function stopRecording(ctx: ExtensionContext, transcribe = true) {
|
|
|
1169
1342
|
|
|
1170
1343
|
ctx.ui.setStatus("voice-input", ctx.ui.theme.fg("warning", transcribe ? "● transcribing" : "● stopping"));
|
|
1171
1344
|
if (pidAlive(state.pid)) await stopProcessGroup(state.pid);
|
|
1345
|
+
const volumeRestoreWarning = await restoreSystemOutputVolume(state.systemVolume);
|
|
1172
1346
|
clearState(config);
|
|
1347
|
+
if (volumeRestoreWarning) ctx.ui.notify(`Voice input warning: ${volumeRestoreWarning}`, "warning");
|
|
1173
1348
|
if (config.finalizeDelayMs > 0) await sleep(config.finalizeDelayMs);
|
|
1174
1349
|
|
|
1175
1350
|
if (!transcribe) {
|
|
@@ -1211,14 +1386,15 @@ async function stopRecording(ctx: ExtensionContext, transcribe = true) {
|
|
|
1211
1386
|
if (!result.text.trim()) {
|
|
1212
1387
|
ctx.ui.setStatus("voice-input", undefined);
|
|
1213
1388
|
ctx.ui.notify(
|
|
1214
|
-
`
|
|
1215
|
-
"
|
|
1389
|
+
`No speech detected. audio=${(durationMs / 1000).toFixed(2)}s total=${result.timings.totalMs}ms`,
|
|
1390
|
+
"info",
|
|
1216
1391
|
);
|
|
1217
1392
|
return;
|
|
1218
1393
|
}
|
|
1219
1394
|
|
|
1220
1395
|
let finalText = result.text;
|
|
1221
1396
|
let postprocessMs = 0;
|
|
1397
|
+
let postprocessSucceeded = false;
|
|
1222
1398
|
let postprocessUsed = false;
|
|
1223
1399
|
if (config.postprocessEnabled) {
|
|
1224
1400
|
ctx.ui.setStatus("voice-input", ctx.ui.theme.fg("warning", "● polishing"));
|
|
@@ -1226,7 +1402,7 @@ async function stopRecording(ctx: ExtensionContext, transcribe = true) {
|
|
|
1226
1402
|
try {
|
|
1227
1403
|
finalText = await postprocessTranscript(ctx, result.text, config);
|
|
1228
1404
|
postprocessMs = Date.now() - postprocessStart;
|
|
1229
|
-
|
|
1405
|
+
postprocessSucceeded = true;
|
|
1230
1406
|
} catch (error) {
|
|
1231
1407
|
postprocessMs = Date.now() - postprocessStart;
|
|
1232
1408
|
ctx.ui.notify(
|
|
@@ -1236,6 +1412,9 @@ async function stopRecording(ctx: ExtensionContext, transcribe = true) {
|
|
|
1236
1412
|
}
|
|
1237
1413
|
}
|
|
1238
1414
|
|
|
1415
|
+
finalText = preserveExpectedPostprocessLayout(result.text, finalText);
|
|
1416
|
+
postprocessUsed = postprocessSucceeded && finalText.trim() !== result.text.trim();
|
|
1417
|
+
|
|
1239
1418
|
ctx.ui.setStatus("voice-input", undefined);
|
|
1240
1419
|
insertIntoEditor(ctx, finalText);
|
|
1241
1420
|
ctx.ui.notify(
|
|
@@ -1262,6 +1441,7 @@ function setupHelp(config = getConfig()): string {
|
|
|
1262
1441
|
"- To create/update the JSON config file, run: /voice init",
|
|
1263
1442
|
"- To save/update the key, run: /voice key",
|
|
1264
1443
|
`- Polish: ${config.postprocessEnabled ? config.postprocessModel : "disabled"}`,
|
|
1444
|
+
`- System volume ducking: ${config.duckSystemVolume ? `${Math.round(config.duckSystemVolumeFactor * 100)}% over ${config.duckSystemVolumeFadeMs}ms` : "disabled"}`,
|
|
1265
1445
|
`- Get/create a VolcEngine Speech API key here: ${VOLC_API_KEY_URL}`,
|
|
1266
1446
|
"- After saving the key, run: /voice config",
|
|
1267
1447
|
].join("\n");
|
|
@@ -1298,8 +1478,11 @@ function configSummary(config: VoiceConfig): string {
|
|
|
1298
1478
|
`- config file: ${config.configPath}${existsSync(config.configPath) ? "" : " (missing; run /voice init to create it)"}`,
|
|
1299
1479
|
`- volcApiKey: ${config.apiKey ? "set" : "missing"} (update with /voice key)`,
|
|
1300
1480
|
`- polishModel: ${config.postprocessEnabled ? config.postprocessModel : "disabled"}`,
|
|
1481
|
+
`- duckSystemVolume: ${config.duckSystemVolume ? "enabled" : "disabled"}`,
|
|
1482
|
+
`- duckSystemVolumeFactor: ${config.duckSystemVolumeFactor}`,
|
|
1483
|
+
`- duckSystemVolumeFadeMs: ${config.duckSystemVolumeFadeMs}`,
|
|
1301
1484
|
`- current recording device: ${currentDevice}`,
|
|
1302
|
-
"Config keys: volcApiKey, polishModel. Leave polishModel empty to disable polish.",
|
|
1485
|
+
"Config keys: volcApiKey, polishModel, duckSystemVolume, duckSystemVolumeFactor, duckSystemVolumeFadeMs. Leave polishModel empty to disable polish.",
|
|
1303
1486
|
`VolcEngine API key URL: ${VOLC_API_KEY_URL}`,
|
|
1304
1487
|
].join("\n");
|
|
1305
1488
|
}
|