pi-voice-input 0.1.3 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +3 -4
- package/README.md +34 -8
- package/extensions/voice-input.ts +443 -146
- package/package.json +7 -3
- package/.env.example +0 -27
package/AGENTS.md
CHANGED
|
@@ -13,8 +13,8 @@ Development workflow for this repo.
|
|
|
13
13
|
|
|
14
14
|
## Secrets and local data
|
|
15
15
|
|
|
16
|
-
- Never commit API keys, `.env`, recordings, logs, caches, or `node_modules`.
|
|
17
|
-
- User credentials belong in `~/.pi/agent/voice-input.
|
|
16
|
+
- Never commit API keys, `.env`, local config JSON, recordings, logs, caches, or `node_modules`.
|
|
17
|
+
- User credentials and plugin settings belong in `~/.pi/agent/voice-input.config.json`, usually written by `/voice key` or `/voice init`.
|
|
18
18
|
- Do not print or copy real API keys into commits, docs, tests, or command output.
|
|
19
19
|
- The explicit VolcEngine API key URL that should be shown to users is:
|
|
20
20
|
`https://console.volcengine.com/speech/new/setting/apikeys?projectName=default`
|
|
@@ -33,7 +33,6 @@ npm pack --dry-run
|
|
|
33
33
|
Check that `npm pack --dry-run` includes only publishable files, normally:
|
|
34
34
|
|
|
35
35
|
```text
|
|
36
|
-
.env.example
|
|
37
36
|
AGENTS.md
|
|
38
37
|
README.md
|
|
39
38
|
extensions/voice-input.ts
|
|
@@ -50,7 +49,7 @@ Then check:
|
|
|
50
49
|
|
|
51
50
|
```bash
|
|
52
51
|
git status --short
|
|
53
|
-
rg -n "VOLC_API_KEY=|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}
|
|
52
|
+
rg -n '"volcApiKey"\\s*:\\s*"[^"]+"|VOLC_API_KEY=|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}' \
|
|
54
53
|
--glob '!node_modules/**' --glob '!package-lock.json' . || true
|
|
55
54
|
```
|
|
56
55
|
|
package/README.md
CHANGED
|
@@ -23,6 +23,8 @@ pi extension: extensions/voice-input.ts
|
|
|
23
23
|
├─ parses the WAV container in TypeScript and extracts raw PCM
|
|
24
24
|
├─ sends PCM frames to the configured ASR provider via ws
|
|
25
25
|
│ └─ current provider: VolcEngine /api/v3/sauc/bigmodel_nostream
|
|
26
|
+
├─ optionally post-processes raw ASR text with a configured pi model
|
|
27
|
+
│ └─ default: deepseek/deepseek-v4-flash, no reasoning option
|
|
26
28
|
└─ appends the final transcript to pi's editor with ctx.ui.setEditorText()
|
|
27
29
|
```
|
|
28
30
|
|
|
@@ -56,23 +58,44 @@ Planned provider direction:
|
|
|
56
58
|
- add more ASR providers without changing the shortcut/user workflow
|
|
57
59
|
- keep provider credentials and options isolated in config
|
|
58
60
|
|
|
59
|
-
## Configure
|
|
61
|
+
## Configure
|
|
60
62
|
|
|
61
|
-
|
|
63
|
+
All plugin settings live in one JSON file:
|
|
62
64
|
|
|
63
65
|
```text
|
|
64
|
-
/voice
|
|
66
|
+
~/.pi/agent/voice-input.config.json
|
|
65
67
|
```
|
|
66
68
|
|
|
67
|
-
|
|
69
|
+
Package-local and project-local env files are not read.
|
|
68
70
|
|
|
69
|
-
|
|
71
|
+
Create or normalize the file from inside pi:
|
|
70
72
|
|
|
71
|
-
|
|
73
|
+
```text
|
|
74
|
+
/voice init
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Then set the VolcEngine Speech API key:
|
|
78
|
+
|
|
79
|
+
```text
|
|
80
|
+
/voice key
|
|
81
|
+
```
|
|
72
82
|
|
|
73
83
|
The key URL is also shown inside pi when the key is missing, when you run `/voice key`, and in `/voice help`:
|
|
74
84
|
|
|
75
|
-
|
|
85
|
+
https://console.volcengine.com/speech/new/setting/apikeys?projectName=default
|
|
86
|
+
|
|
87
|
+
The config file is plain JSON and can be edited directly:
|
|
88
|
+
|
|
89
|
+
```json
|
|
90
|
+
{
|
|
91
|
+
"volcApiKey": "",
|
|
92
|
+
"polishModel": "deepseek/deepseek-v4-flash"
|
|
93
|
+
}
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
`polishModel` is resolved from pi's model registry, so any model shown by `pi --list-models` can be used. Leave it empty to disable polish. If polishing fails, the raw ASR transcript is inserted instead.
|
|
97
|
+
|
|
98
|
+
Verify the effective non-secret config:
|
|
76
99
|
|
|
77
100
|
```text
|
|
78
101
|
/voice config
|
|
@@ -95,6 +118,7 @@ Slash commands:
|
|
|
95
118
|
/voice cancel # stop recording without transcribing
|
|
96
119
|
/voice status # show recorder state
|
|
97
120
|
/voice config # show effective non-secret config and whether API key is detected
|
|
121
|
+
/voice init # create or normalize ~/.pi/agent/voice-input.config.json
|
|
98
122
|
/voice key # prompt for and save the current provider API key
|
|
99
123
|
/voice help # show setup help, including the explicit VolcEngine API key URL
|
|
100
124
|
```
|
|
@@ -102,8 +126,10 @@ Slash commands:
|
|
|
102
126
|
## Notes
|
|
103
127
|
|
|
104
128
|
- The extension uses post-recording WebSocket ASR: it records locally first, then sends the stopped recording in chunks. It is optimized for fast voice input, not live subtitles.
|
|
105
|
-
- The default
|
|
129
|
+
- The default ASR segment size is intentionally larger than realtime packet sizes because this workflow sends already-recorded audio.
|
|
106
130
|
- The transcript is inserted into the editor only; it is not submitted automatically.
|
|
131
|
+
- When `polishModel` is set, polishing uses the current editor content and recent session messages as context, but outputs only the refined user instruction.
|
|
132
|
+
- While recording, the status line and tool panel show `Recording with [device name]`.
|
|
107
133
|
|
|
108
134
|
## Development
|
|
109
135
|
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import type { ExtensionAPI, ExtensionContext } from "@earendil-works/pi-coding-agent";
|
|
2
|
+
import { completeSimple, type Api, type Model } from "@earendil-works/pi-ai";
|
|
2
3
|
import { Key } from "@earendil-works/pi-tui";
|
|
3
4
|
import { spawn, spawnSync } from "node:child_process";
|
|
4
5
|
import { randomUUID } from "node:crypto";
|
|
@@ -15,15 +16,25 @@ import {
|
|
|
15
16
|
} from "node:fs";
|
|
16
17
|
import { homedir } from "node:os";
|
|
17
18
|
import path from "node:path";
|
|
18
|
-
import { fileURLToPath } from "node:url";
|
|
19
19
|
import { gzipSync, gunzipSync } from "node:zlib";
|
|
20
20
|
import WebSocket from "ws";
|
|
21
21
|
|
|
22
|
-
const
|
|
23
|
-
const PACKAGE_ROOT = path.resolve(EXTENSION_DIR, "..");
|
|
24
|
-
const PRIVATE_CONFIG_PATH = path.join(homedir(), ".pi", "agent", "voice-input.env");
|
|
22
|
+
const CONFIG_PATH = path.join(homedir(), ".pi", "agent", "voice-input.config.json");
|
|
25
23
|
const VOLC_API_KEY_URL = "https://console.volcengine.com/speech/new/setting/apikeys?projectName=default";
|
|
26
24
|
const DEFAULT_SHORTCUT = Key.ctrlShift("r");
|
|
25
|
+
const DEFAULT_POSTPROCESS_MODEL = "deepseek/deepseek-v4-flash";
|
|
26
|
+
const POSTPROCESS_SYSTEM_PROMPT = `你是 pi 语音输入插件的语音识别后处理器。你的唯一任务是润色原始 ASR 文本,使其成为可直接提交给编码智能体的用户指令。
|
|
27
|
+
|
|
28
|
+
规则:
|
|
29
|
+
- 只输出润色后的用户指令正文,不要输出解释、标题、前后缀、引号、代码围栏或寒暄。
|
|
30
|
+
- 绝对不要回答、执行或解决用户语音中提出的问题;即使原始语音是问题,也只能把这个问题本身整理成清晰文本,不要给出答案、方案、代码或结论。
|
|
31
|
+
- 以忠实保留用户信息为最高优先级。不要一味概括、压缩或简述;不要删除条件、约束、例子、数值、文件名、错误信息、多个请求、前后顺序或语气重点。
|
|
32
|
+
- 结合上下文理解省略指代、当前任务、文件/项目名称和用户意图;上下文仅用于理解,不要重复上下文内容,除非原始语音明确要求引用或修改它。
|
|
33
|
+
- 修正明显的语音识别错误、同音/近音错误、断句和标点错误;保留代码标识符、命令、路径、URL、模型名、包名和专有名词。
|
|
34
|
+
- 如果用户口误后自我更正(例如“不是……是……”“不对……”“算了改成……”),只保留更正后的正确指令,删除错误说法和更正过程。
|
|
35
|
+
- 让结果完整、符合逻辑、指令明确、有指导性;必要时拆成条目或步骤,但不得丢失原始信息。
|
|
36
|
+
- 不要凭空添加原始语音没有表达的新需求;不确定时保留原意并用更清晰的措辞表达。
|
|
37
|
+
- 输出语言必须跟随用户原始语音的主要语言,而不是上下文语言;不要因为上下文是中文/英文就把用户语音翻译成上下文语言。`;
|
|
27
38
|
|
|
28
39
|
const MSG_TYPE_CLIENT_FULL_REQUEST = 0b0001;
|
|
29
40
|
const MSG_TYPE_CLIENT_AUDIO_ONLY_REQUEST = 0b0010;
|
|
@@ -35,9 +46,15 @@ const SERIALIZATION_NONE = 0b0000;
|
|
|
35
46
|
const SERIALIZATION_JSON = 0b0001;
|
|
36
47
|
const COMPRESSION_GZIP = 0b0001;
|
|
37
48
|
|
|
38
|
-
type
|
|
49
|
+
type JsonObject = Record<string, unknown>;
|
|
50
|
+
|
|
51
|
+
/** On-disk shape of the voice input JSON config file. */
type VoiceInputConfigFile = {
  /** VolcEngine Speech API key; an empty string means no key is configured. */
  volcApiKey: string;
  /** Model reference used to polish raw ASR text; an empty string disables polishing. */
  polishModel: string;
};
|
|
39
55
|
|
|
40
56
|
type VoiceConfig = {
|
|
57
|
+
configPath: string;
|
|
41
58
|
apiKey: string;
|
|
42
59
|
wsUrl: string;
|
|
43
60
|
resourceId: string;
|
|
@@ -56,6 +73,11 @@ type VoiceConfig = {
|
|
|
56
73
|
enablePunc: boolean;
|
|
57
74
|
enableDdc: boolean;
|
|
58
75
|
showUtterances: boolean;
|
|
76
|
+
postprocessEnabled: boolean;
|
|
77
|
+
postprocessModel: string;
|
|
78
|
+
postprocessTimeoutMs: number;
|
|
79
|
+
postprocessMaxTokens: number;
|
|
80
|
+
postprocessContextChars: number;
|
|
59
81
|
};
|
|
60
82
|
|
|
61
83
|
type RecordingState = {
|
|
@@ -64,6 +86,7 @@ type RecordingState = {
|
|
|
64
86
|
logPath: string;
|
|
65
87
|
startedAt: string;
|
|
66
88
|
recorderTarget?: string;
|
|
89
|
+
deviceName?: string;
|
|
67
90
|
};
|
|
68
91
|
|
|
69
92
|
type DecodedFrame = {
|
|
@@ -85,139 +108,94 @@ type TranscriptionResult = {
|
|
|
85
108
|
};
|
|
86
109
|
};
|
|
87
110
|
|
|
88
|
-
function
|
|
89
|
-
|
|
90
|
-
for (const rawLine of text.split(/\r?\n/)) {
|
|
91
|
-
const line = rawLine.trim();
|
|
92
|
-
if (!line || line.startsWith("#")) continue;
|
|
93
|
-
const match = line.match(/^([A-Za-z_][A-Za-z0-9_]*)\s*=\s*(.*)$/);
|
|
94
|
-
if (!match) continue;
|
|
95
|
-
const key = match[1];
|
|
96
|
-
let value = match[2] ?? "";
|
|
97
|
-
if ((value.startsWith('"') && value.endsWith('"')) || (value.startsWith("'") && value.endsWith("'"))) {
|
|
98
|
-
value = value.slice(1, -1);
|
|
99
|
-
}
|
|
100
|
-
env[key] = value;
|
|
101
|
-
}
|
|
102
|
-
return env;
|
|
103
|
-
}
|
|
104
|
-
|
|
105
|
-
function loadEnvFiles(): EnvMap {
|
|
106
|
-
const candidates = [
|
|
107
|
-
PRIVATE_CONFIG_PATH,
|
|
108
|
-
path.join(PACKAGE_ROOT, ".env"),
|
|
109
|
-
path.join(process.cwd(), ".env"),
|
|
110
|
-
];
|
|
111
|
-
const merged: EnvMap = {};
|
|
112
|
-
for (const file of candidates) {
|
|
113
|
-
if (!existsSync(file)) continue;
|
|
114
|
-
Object.assign(merged, parseEnvText(readFileSync(file, "utf8")));
|
|
115
|
-
}
|
|
116
|
-
return merged;
|
|
117
|
-
}
|
|
118
|
-
|
|
119
|
-
function setting(env: EnvMap, name: string, fallback = ""): string {
|
|
120
|
-
const value = process.env[name] ?? env[name];
|
|
121
|
-
return value == null ? fallback : value;
|
|
111
|
+
/**
 * Create `dir` (and any missing parent directories) if it does not exist yet.
 */
function ensureDir(dir: string) {
  const options = { recursive: true };
  mkdirSync(dir, options);
}
|
|
123
114
|
|
|
124
|
-
function
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
}
|
|
129
|
-
return fallback;
|
|
115
|
+
/**
 * Build a fresh config object holding the default values:
 * no API key and the default post-processing model.
 */
function defaultConfigFile(): VoiceInputConfigFile {
  const defaults: VoiceInputConfigFile = {
    volcApiKey: "",
    polishModel: DEFAULT_POSTPROCESS_MODEL,
  };
  return defaults;
}
|
|
131
121
|
|
|
132
|
-
function
|
|
133
|
-
|
|
134
|
-
if (["1", "true", "yes", "on"].includes(raw)) return true;
|
|
135
|
-
if (["0", "false", "no", "off"].includes(raw)) return false;
|
|
136
|
-
return fallback;
|
|
122
|
+
/**
 * Type guard: true only for non-null, non-array values whose `typeof` is "object".
 */
function isObject(value: unknown): value is JsonObject {
  if (typeof value !== "object") return false;
  if (value === null) return false;
  return !Array.isArray(value);
}
|
|
138
125
|
|
|
139
|
-
function
|
|
140
|
-
const
|
|
141
|
-
|
|
142
|
-
return Number.isFinite(value) ? value : fallback;
|
|
126
|
+
/**
 * Read `source[name]` and return it when it is a string; otherwise return `fallback`.
 */
function stringField(source: JsonObject, name: string, fallback: string): string {
  const { [name]: candidate } = source;
  if (typeof candidate !== "string") return fallback;
  return candidate;
}
|
|
144
130
|
|
|
145
|
-
function
|
|
146
|
-
|
|
131
|
+
/**
 * Coerce arbitrary parsed JSON into a well-formed config object.
 *
 * Non-object input is treated as an empty object. Each known field falls back
 * to its default when it is missing or not a string, and is whitespace-trimmed.
 * Unknown fields are dropped.
 */
function normalizeConfigFile(input: unknown): VoiceInputConfigFile {
  const fallback = defaultConfigFile();
  const source: JsonObject = isObject(input) ? input : {};
  const volcApiKey = stringField(source, "volcApiKey", fallback.volcApiKey).trim();
  const polishModel = stringField(source, "polishModel", fallback.polishModel).trim();
  return { volcApiKey, polishModel };
}
|
|
148
139
|
|
|
149
|
-
function
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
140
|
+
/**
 * Normalize `config` and persist it to CONFIG_PATH as pretty-printed JSON
 * followed by a newline.
 *
 * The file is written with mode 0o600 and then chmod-ed to 0o600 as well, so
 * a file that already existed with other permissions is tightened too.
 */
function writeConfigFile(config: unknown) {
  const ownerOnly = 0o600;
  ensureDir(path.dirname(CONFIG_PATH));
  const serialized = `${JSON.stringify(normalizeConfigFile(config), null, 2)}\n`;
  writeFileSync(CONFIG_PATH, serialized, { mode: ownerOnly });
  chmodSync(CONFIG_PATH, ownerOnly);
}
|
|
154
145
|
|
|
155
|
-
function
|
|
156
|
-
|
|
157
|
-
|
|
146
|
+
/**
 * Load and normalize the config file at CONFIG_PATH.
 *
 * Returns the defaults when the file does not exist.
 *
 * @throws Error when the file exists but cannot be read or parsed. The
 *   original failure is attached as `cause` so its stack is not lost.
 */
function loadConfigFile(): VoiceInputConfigFile {
  if (!existsSync(CONFIG_PATH)) return defaultConfigFile();
  try {
    return normalizeConfigFile(JSON.parse(readFileSync(CONFIG_PATH, "utf8")));
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to read voice input config ${CONFIG_PATH}: ${reason}`, { cause: error });
  }
}
|
|
159
154
|
|
|
160
155
|
/**
 * Build the effective runtime configuration.
 *
 * Only the API key and the polish model come from the config file; every
 * other value is a fixed default. Recordings, state, and logs live under
 * `~/.pi/agent/voice-input`. Polishing is enabled exactly when the configured
 * polish model is non-empty after trimming.
 */
function getConfig(): VoiceConfig {
  const { volcApiKey, polishModel: configuredModel } = loadConfigFile();
  const dataRoot = path.join(homedir(), ".pi", "agent", "voice-input");
  const postprocessModel = configuredModel.trim();
  const underDataRoot = (leaf: string) => path.join(dataRoot, leaf);

  return {
    configPath: CONFIG_PATH,
    apiKey: volcApiKey.trim(),
    wsUrl: "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_nostream",
    resourceId: "volc.seedasr.sauc.duration",
    language: "",
    uid: "pi-voice-input",
    prompt: "",
    segmentMs: 5000,
    requestTimeoutMs: 90000,
    finalizeDelayMs: 100,
    recorderTarget: "",
    recordingsDir: underDataRoot("recordings"),
    statePath: underDataRoot("recording.json"),
    logDir: underDataRoot("logs"),
    shortcut: DEFAULT_SHORTCUT,
    enableItn: true,
    enablePunc: true,
    enableDdc: false,
    showUtterances: false,
    postprocessEnabled: postprocessModel.length > 0,
    postprocessModel,
    postprocessTimeoutMs: 30000,
    postprocessMaxTokens: 2048,
    postprocessContextChars: 6000,
  };
}
|
|
186
187
|
|
|
187
|
-
function
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
function envValue(value: string): string {
|
|
192
|
-
if (/^[A-Za-z0-9_./:@+-]*$/.test(value)) return value;
|
|
193
|
-
return JSON.stringify(value);
|
|
188
|
+
/**
 * Create the config file if it is missing, or rewrite it in normalized form
 * if it already exists.
 *
 * @returns true when the file did not exist before this call.
 */
function ensureConfigFile(): boolean {
  const alreadyPresent = existsSync(CONFIG_PATH);
  const current = loadConfigFile();
  writeConfigFile(current);
  return !alreadyPresent;
}
|
|
195
193
|
|
|
196
|
-
function
|
|
197
|
-
if (/\r|\n/.test(
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
const lines = original ? original.split(/\r?\n/) : [];
|
|
202
|
-
const replacement = `${name}=${envValue(value)}`;
|
|
203
|
-
let replaced = false;
|
|
204
|
-
|
|
205
|
-
const nextLines = lines.map((line) => {
|
|
206
|
-
if (new RegExp(`^\\s*${name}\\s*=`).test(line)) {
|
|
207
|
-
replaced = true;
|
|
208
|
-
return replacement;
|
|
209
|
-
}
|
|
210
|
-
return line;
|
|
211
|
-
});
|
|
212
|
-
|
|
213
|
-
if (!replaced) {
|
|
214
|
-
if (nextLines.length > 0 && nextLines[nextLines.length - 1] !== "") nextLines.push("");
|
|
215
|
-
nextLines.push("# Managed by pi-voice-input. You can also update this with /voice key.");
|
|
216
|
-
nextLines.push(replacement);
|
|
217
|
-
}
|
|
218
|
-
|
|
219
|
-
writeFileSync(PRIVATE_CONFIG_PATH, nextLines.join("\n").replace(/\n*$/, "\n"), { mode: 0o600 });
|
|
220
|
-
chmodSync(PRIVATE_CONFIG_PATH, 0o600);
|
|
194
|
+
/**
 * Store `apiKey` as `volcApiKey` in the config file, keeping the other settings.
 *
 * Surrounding whitespace is stripped before validation, so a key pasted with
 * a trailing newline is accepted; only line breaks inside the key are rejected.
 *
 * @throws Error when the trimmed key still contains a line break.
 */
function writeConfigApiKey(apiKey: string) {
  const trimmedKey = apiKey.trim();
  if (/\r|\n/.test(trimmedKey)) throw new Error("volcApiKey must be a single-line value");
  const config = loadConfigFile();
  config.volcApiKey = trimmedKey;
  writeConfigFile(config);
}
|
|
222
200
|
|
|
223
201
|
function timestampForFilename(): string {
|
|
@@ -228,6 +206,12 @@ function commandExists(command: string): boolean {
|
|
|
228
206
|
return spawnSync("sh", ["-lc", `command -v ${command}`], { stdio: "ignore" }).status === 0;
|
|
229
207
|
}
|
|
230
208
|
|
|
209
|
+
/**
 * Run `command` synchronously with `args` and return its trimmed stdout.
 *
 * Returns an empty string whenever the exit status is not 0.
 *
 * @param timeoutMs Passed to `spawnSync` as `timeout`; defaults to 1500 ms.
 */
function commandOutput(command: string, args: string[], timeoutMs = 1500): string {
  const { status, stdout } = spawnSync(command, args, { encoding: "utf8", timeout: timeoutMs });
  if (status === 0) {
    return (stdout || "").trim();
  }
  return "";
}
|
|
214
|
+
|
|
231
215
|
function recorderCommand(config: VoiceConfig, outputPath: string): string[] {
|
|
232
216
|
if (commandExists("pw-record")) {
|
|
233
217
|
const cmd = ["pw-record", "--rate", "16000", "--channels", "1", "--format", "s16"];
|
|
@@ -241,6 +225,98 @@ function recorderCommand(config: VoiceConfig, outputPath: string): string[] {
|
|
|
241
225
|
throw new Error("No recorder found. Install PipeWire tools (pw-record) or alsa-utils (arecord).");
|
|
242
226
|
}
|
|
243
227
|
|
|
228
|
+
/** One audio source as parsed from `pactl list sources` output. */
type PipeWireSource = {
  /** Token following `Source #` on the source's header line. */
  id: string;
  /** Value of the source's `Name:` line, or "" when none was seen. */
  name: string;
  /** Value of the source's `Description:` line, or "" when none was seen. */
  description: string;
};
|
|
233
|
+
|
|
234
|
+
/**
 * Parse the text of `pactl list sources` into a list of sources.
 *
 * A line starting with `Source #<id>` opens a new entry. Subsequent `Name:`
 * and `Description:` lines fill in that entry. Lines before the first header
 * are ignored.
 */
function parsePactlSources(text: string): PipeWireSource[] {
  const parsed: PipeWireSource[] = [];
  let active: PipeWireSource | null = null;

  for (const row of text.split(/\r?\n/)) {
    const header = /^Source #(\S+)/.exec(row);
    if (header) {
      // A new header closes the entry that was being filled in.
      if (active) parsed.push(active);
      active = { id: header[1], name: "", description: "" };
    } else if (active) {
      const nameHit = /^\s*Name:\s*(.+)$/.exec(row);
      if (nameHit) {
        active.name = nameHit[1].trim();
      } else {
        const descriptionHit = /^\s*Description:\s*(.+)$/.exec(row);
        if (descriptionHit) active.description = descriptionHit[1].trim();
      }
    }
  }

  if (active) parsed.push(active);
  return parsed;
}
|
|
256
|
+
|
|
257
|
+
/**
 * Extract the double-quoted value of `property` from `wpctl inspect` output.
 *
 * Matches a line of the form `[*] <property> = "<value>"` and returns the
 * trimmed value, or "" when the property is absent.
 */
function wpctlProperty(text: string, property: string): string {
  // Escape regex metacharacters so the property name is matched literally.
  const literalProperty = property.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const pattern = new RegExp(String.raw`(?:^|\n)\s*\*?\s*${literalProperty}\s*=\s*"([^"]+)"`);
  const hit = pattern.exec(text);
  return hit?.[1]?.trim() ?? "";
}
|
|
262
|
+
|
|
263
|
+
/**
 * Ask `wpctl inspect <target>` for a human-readable name of a node.
 *
 * Prefers `node.description`, then `node.nick`, then `node.name`. Returns ""
 * when `wpctl` is not available or none of the properties is present.
 */
function inspectPipeWireSource(target: string): string {
  if (!commandExists("wpctl")) return "";
  const report = commandOutput("wpctl", ["inspect", target]);
  for (const property of ["node.description", "node.nick", "node.name"]) {
    const value = wpctlProperty(report, property);
    if (value) return value;
  }
  return "";
}
|
|
272
|
+
|
|
273
|
+
/**
 * Find the default source's label in `wpctl status` output.
 *
 * Scans the lines after a `Sources:` heading for the entry marked with `*`
 * and returns its label, without any trailing `[...]` annotation. Scanning
 * stops at the next tree heading line (`├─` / `└─`). Returns "" when `wpctl`
 * is missing or no starred source is found.
 */
function defaultPipeWireSourceFromStatus(): string {
  if (!commandExists("wpctl")) return "";
  const rows = commandOutput("wpctl", ["status"]).split(/\r?\n/);
  let withinSources = false;

  for (const row of rows) {
    if (/Sources:/.test(row)) {
      withinSources = true;
      continue;
    }
    if (!withinSources) continue;
    // The next tree heading marks the end of the Sources section.
    if (/^\s*[├└]─/.test(row)) break;
    const starred = /^\s*│\s+\*\s+\d+\.\s+(.+?)(?:\s+\[|$)/.exec(row);
    if (starred) return starred[1].trim();
  }

  return "";
}
|
|
289
|
+
|
|
290
|
+
/**
 * Resolve a display name for the PipeWire source that will be recorded from.
 *
 * With an empty `target`, the default source is looked up through `pactl`,
 * then `wpctl inspect`, then `wpctl status`, and finally falls back to the
 * raw default name or the literal "default microphone".
 *
 * With a non-empty `target`, the `pactl` source list is searched by id, name,
 * or description. A purely numeric target is additionally tried through
 * `wpctl inspect`, and the target itself is the last resort.
 */
function pipeWireSourceName(target: string): string {
  const pactlAvailable = () => commandExists("pactl");
  const known = pactlAvailable() ? parsePactlSources(commandOutput("pactl", ["list", "sources"])) : [];

  if (target) {
    const hit = known.find((item) => item.id === target || item.name === target || item.description === target);
    if (hit?.description) return hit.description;
    if (hit?.name) return hit.name;
    const inspected = /^\d+$/.test(target) ? inspectPipeWireSource(target) : "";
    return inspected || target;
  }

  const defaultName = pactlAvailable() ? commandOutput("pactl", ["get-default-source"]) : "";
  const hit = known.find((item) => item.name === defaultName);
  if (hit?.description) return hit.description;
  if (hit?.name) return hit.name;

  const inspected = inspectPipeWireSource("@DEFAULT_SOURCE@");
  if (inspected) return inspected;

  const fromStatus = defaultPipeWireSourceFromStatus();
  if (fromStatus) return fromStatus;

  return defaultName || "default microphone";
}
|
|
309
|
+
|
|
310
|
+
/**
 * Choose the device label to show for a recording, based on which recorder
 * executable is used.
 *
 * `pw-record` resolves the PipeWire source name. `arecord` always reports
 * "ALSA default microphone". Any other executable uses the configured
 * recorder target, or "default microphone" when none is set.
 */
function recordingDeviceName(config: VoiceConfig, recorderExecutable: string): string {
  switch (recorderExecutable) {
    case "pw-record":
      return pipeWireSourceName(config.recorderTarget);
    case "arecord":
      return "ALSA default microphone";
    default:
      return config.recorderTarget || "default microphone";
  }
}
|
|
315
|
+
|
|
316
|
+
/**
 * Format the "Recording with ..." status text, substituting
 * "default microphone" when `deviceName` is empty.
 */
function recordingStatusText(deviceName: string): string {
  const label = deviceName ? deviceName : "default microphone";
  return `Recording with ${label}`;
}
|
|
319
|
+
|
|
244
320
|
function readState(config: VoiceConfig): RecordingState | null {
|
|
245
321
|
if (!existsSync(config.statePath)) return null;
|
|
246
322
|
return JSON.parse(readFileSync(config.statePath, "utf8")) as RecordingState;
|
|
@@ -457,8 +533,9 @@ function parseRecordedWav(filePath: string): { pcm: Buffer; durationMs: number }
|
|
|
457
533
|
|
|
458
534
|
function missingCredentialsMessage(): string {
|
|
459
535
|
return [
|
|
460
|
-
"Missing
|
|
536
|
+
"Missing VolcEngine API key in the pi voice input config.",
|
|
461
537
|
"Run /voice key and paste your VolcEngine Speech API key.",
|
|
538
|
+
`Config file: ${CONFIG_PATH}`,
|
|
462
539
|
`Get/create the key here: ${VOLC_API_KEY_URL}`,
|
|
463
540
|
"Run /voice config to verify whether the key is detected.",
|
|
464
541
|
].join("\n");
|
|
@@ -611,12 +688,202 @@ async function transcribePcm(pcm: Buffer, durationMs: number, config: VoiceConfi
|
|
|
611
688
|
};
|
|
612
689
|
}
|
|
613
690
|
|
|
614
|
-
function
|
|
691
|
+
/**
 * Keep at most the last `maxChars` UTF-16 code units of `text`, prefixing the
 * result with "…" when something was cut off.
 *
 * Returns "" when `maxChars` is not positive and `text` unchanged when it
 * already fits. If the cut lands in the middle of a surrogate pair, the
 * orphaned low surrogate is dropped so the result stays well-formed.
 */
function tailText(text: string, maxChars: number): string {
  if (maxChars <= 0) return "";
  if (text.length <= maxChars) return text;
  let tail = text.slice(-maxChars);
  const firstUnit = tail.charCodeAt(0);
  // A leading low surrogate means its high half was sliced away.
  if (firstUnit >= 0xdc00 && firstUnit <= 0xdfff) tail = tail.slice(1);
  return `…${tail}`;
}
|
|
696
|
+
|
|
697
|
+
/**
 * Keep at most the first `maxChars` UTF-16 code units of `text`, appending
 * "…" when something was cut off.
 *
 * Returns "" when `maxChars` is not positive and `text` unchanged when it
 * already fits. If the cut lands in the middle of a surrogate pair, the
 * orphaned high surrogate is dropped so the result stays well-formed.
 */
function truncateText(text: string, maxChars: number): string {
  if (maxChars <= 0) return "";
  if (text.length <= maxChars) return text;
  let head = text.slice(0, maxChars);
  const lastUnit = head.charCodeAt(head.length - 1);
  // A trailing high surrogate means its low half was sliced away.
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) head = head.slice(0, -1);
  return `${head}…`;
}
|
|
702
|
+
|
|
703
|
+
/**
 * Flatten message content into plain text.
 *
 * A string is returned as-is. An array is scanned for blocks shaped like
 * `{ type: "text", text: string }`; their non-empty texts are joined with
 * newlines. Anything else yields "".
 */
function textFromContent(content: unknown): string {
  if (typeof content === "string") return content;
  if (!Array.isArray(content)) return "";

  const pieces: string[] = [];
  for (const part of content) {
    if (!part || typeof part !== "object") continue;
    const { type, text } = part as { type?: unknown; text?: unknown };
    // Skip non-text blocks and empty texts so they add no blank lines.
    if (type === "text" && typeof text === "string" && text) pieces.push(text);
  }
  return pieces.join("\n");
}
|
|
716
|
+
|
|
717
|
+
/**
 * Return the tail of the current editor text, trimmed and limited to
 * `maxChars`.
 *
 * Yields "" when the budget is not positive or when reading the editor throws.
 */
function getEditorContext(ctx: ExtensionContext, maxChars: number): string {
  if (maxChars <= 0) return "";
  let editorTail = "";
  try {
    const editorText = ctx.ui.getEditorText().trim();
    editorTail = tailText(editorText, maxChars);
  } catch {
    // Best effort: an unreadable editor simply contributes no context.
    editorTail = "";
  }
  return editorTail;
}
|
|
725
|
+
|
|
726
|
+
/**
 * Summarize the most recent conversation turns as `role: text` lines.
 *
 * Only `message` entries with role `user` or `assistant` and non-empty text
 * are considered. Whitespace runs are collapsed to single spaces, each
 * message is capped at 1200 characters, the last 8 lines are kept, and the
 * joined result is limited to `maxChars` from the end.
 */
function getRecentSessionContext(ctx: ExtensionContext, maxChars: number): string {
  if (maxChars <= 0) return "";

  const turns: string[] = [];
  for (const entry of ctx.sessionManager.getBranch()) {
    if (entry.type !== "message") continue;
    const { role, content } = entry.message as { role?: unknown; content?: unknown };
    const isConversationRole = role === "user" || role === "assistant";
    if (!isConversationRole) continue;
    const flattened = textFromContent(content).replace(/\s+/g, " ").trim();
    if (flattened) turns.push(`${role}: ${truncateText(flattened, 1200)}`);
  }

  const recentTurns = turns.slice(-8);
  return tailText(recentTurns.join("\n"), maxChars);
}
|
|
739
|
+
|
|
740
|
+
/**
 * Reduce a model reference to its lowercase ASCII letters and digits, so that
 * references differing only in case or punctuation compare equal.
 */
function simplifyModelReference(value: string): string {
  return Array.from(value.toLowerCase())
    .filter((ch) => /^[a-z0-9]$/.test(ch))
    .join("");
}
|
|
743
|
+
|
|
744
|
+
/**
 * Remove one trailing `:off|minimal|low|medium|high|xhigh` suffix
 * (case-insensitive) from a model reference, if present.
 */
function stripThinkingSuffix(value: string): string {
  const suffix = /:(?:off|minimal|low|medium|high|xhigh)$/i.exec(value);
  if (!suffix) return value;
  return value.slice(0, suffix.index);
}
|
|
747
|
+
|
|
748
|
+
/** Format a model as its canonical `provider/id` label. */
function modelLabel(model: Model<Api>): string {
  const { provider, id } = model;
  return `${provider}/${id}`;
}
|
|
751
|
+
|
|
752
|
+
/**
 * Resolve the configured polish model reference to exactly one model from
 * the registry.
 *
 * Any trailing thinking-level suffix is stripped first. Matching then runs
 * through progressively looser tiers and stops at the first tier that yields
 * a match:
 *   1. exact `provider/id` (case-insensitive);
 *   2. exact bare id or name (case-insensitive);
 *   3. exact match after reducing both sides to letters and digits;
 *   4. substring match, raw or simplified.
 *
 * Simplified comparisons are skipped when the reference contains no letters
 * or digits; otherwise the empty simplified form would be a substring of
 * every model and the fuzzy tier would match the whole registry.
 *
 * @throws Error when the reference is empty, when a tier after the first
 *   matches more than one model, or when nothing matches.
 */
function resolvePostprocessModel(ctx: ExtensionContext, reference: string): Model<Api> {
  const requested = stripThinkingSuffix(reference.trim());
  if (!requested) throw new Error("polishModel is empty in voice input config");

  const models = ctx.modelRegistry.getAll();
  const lower = requested.toLowerCase();
  const simple = simplifyModelReference(requested);
  const hasSimple = simple.length > 0;

  const exactCanonical = models.filter((model) => modelLabel(model).toLowerCase() === lower);
  if (exactCanonical.length === 1) return exactCanonical[0];

  const tiers: Array<(model: Model<Api>) => boolean> = [
    // Exact bare id or display name.
    (model) => model.id.toLowerCase() === lower || model.name.toLowerCase() === lower,
    // Exact match ignoring case and punctuation.
    (model) =>
      hasSimple &&
      (simplifyModelReference(modelLabel(model)) === simple ||
        simplifyModelReference(model.id) === simple ||
        simplifyModelReference(model.name) === simple),
    // Substring match, raw first and then simplified.
    (model) =>
      modelLabel(model).toLowerCase().includes(lower) ||
      model.id.toLowerCase().includes(lower) ||
      model.name.toLowerCase().includes(lower) ||
      (hasSimple &&
        (simplifyModelReference(modelLabel(model)).includes(simple) ||
          simplifyModelReference(model.id).includes(simple) ||
          simplifyModelReference(model.name).includes(simple))),
  ];

  for (const matches of tiers) {
    const candidates = models.filter(matches);
    if (candidates.length === 1) return candidates[0];
    if (candidates.length > 1) {
      throw new Error(
        `Ambiguous postprocess model "${reference}". Use provider/model, e.g. ${candidates.map(modelLabel).slice(0, 5).join(", ")}`,
      );
    }
  }

  throw new Error(`Postprocess model "${reference}" not found. Run pi --list-models to see available models.`);
}
|
|
802
|
+
|
|
803
|
+
/** Flatten a message's content into plain text and trim surrounding whitespace. */
function extractAssistantText(message: { content: unknown }): string {
  const { content } = message;
  const flattened = textFromContent(content);
  return flattened.trim();
}
|
|
806
|
+
|
|
807
|
+
/**
 * Strip wrapper noise from the polish model's output.
 *
 * After trimming, a reply that consists entirely of one fenced code block is
 * unwrapped to the fence's content. A leading Chinese label followed by a
 * colon is then removed, and the result is trimmed again.
 */
function cleanPostprocessOutput(output: string): string {
  const trimmed = output.trim();
  const fenced = /^```[a-zA-Z0-9_-]*\s*\n([\s\S]*?)\n```$/.exec(trimmed);
  const unfenced = fenced ? fenced[1].trim() : trimmed;
  const labelPrefix = /^(?:优化后的(?:用户)?指令|整理后的(?:用户)?指令|改写后的(?:用户)?指令)\s*[::]\s*/u;
  return unfenced.replace(labelPrefix, "").trim();
}
|
|
814
|
+
|
|
815
|
+
/**
 * Assemble the user prompt sent to the polish model.
 *
 * The prompt holds the fixed instruction lines, then three labelled sections:
 * the current editor content, the recent session messages, and the trimmed
 * raw ASR text. The context budget (`config.postprocessContextChars`) is
 * split between editor (floor of half) and session (ceil of half). An empty
 * context section is rendered with a placeholder.
 */
function buildPostprocessPrompt(ctx: ExtensionContext, rawText: string, config: VoiceConfig): string {
  const budget = config.postprocessContextChars;
  const editorSection = getEditorContext(ctx, Math.floor(budget / 2));
  const sessionSection = getRecentSessionContext(ctx, Math.ceil(budget / 2));

  const instructions = [
    "请根据上下文只润色下面的原始语音识别结果。",
    "如果上下文为空,直接依据原始文本润色。",
    "不要回答原始语音里的问题,也不要执行其中的请求;只输出原始语音对应的最终用户指令文本。",
    "输出语言必须跟随原始语音的主要语言,不要跟随上下文语言,也不要翻译成上下文语言。",
    "务必忠实保留原始语音中的信息和细节,不要为了简洁而概括、压缩或删减。",
  ];
  const editorBlock = ["--- 上下文:当前编辑器已有内容 ---", editorSection || "(空)"];
  const sessionBlock = ["--- 上下文:最近会话 ---", sessionSection || "(空)"];
  const transcriptBlock = ["--- 原始语音识别结果 ---", rawText.trim()];

  // A blank line separates consecutive sections.
  return [...instructions, "", ...editorBlock, "", ...sessionBlock, "", ...transcriptBlock].join("\n");
}
|
|
837
|
+
|
|
838
|
+
/**
 * Polish a raw ASR transcript with the configured model.
 *
 * Returns `rawText` unchanged when polishing is disabled, when the transcript
 * is blank, or when the model produces no usable text. Otherwise returns the
 * cleaned model output.
 *
 * @throws Error when the model cannot be resolved, when its credentials are
 *   not ready, or when the completion stops with `error` or `aborted`.
 */
async function postprocessTranscript(ctx: ExtensionContext, rawText: string, config: VoiceConfig): Promise<string> {
  if (!config.postprocessEnabled) return rawText;

  const transcript = rawText.trim();
  if (!transcript) return rawText;

  const model = resolvePostprocessModel(ctx, config.postprocessModel);
  const auth = await ctx.modelRegistry.getApiKeyAndHeaders(model);
  if (!auth.ok) {
    throw new Error(`Postprocess model ${modelLabel(model)} is not ready: ${auth.error}`);
  }

  const response = await completeSimple(
    model,
    {
      systemPrompt: POSTPROCESS_SYSTEM_PROMPT,
      messages: [
        {
          role: "user",
          content: buildPostprocessPrompt(ctx, transcript, config),
          timestamp: Date.now(),
        },
      ],
      tools: [],
    },
    {
      apiKey: auth.apiKey,
      headers: auth.headers,
      temperature: 0,
      maxTokens: config.postprocessMaxTokens,
      timeoutMs: config.postprocessTimeoutMs,
      maxRetries: 0,
      cacheRetention: "none",
      signal: ctx.signal,
    },
  );

  const { stopReason } = response;
  const failed = stopReason === "error" || stopReason === "aborted";
  if (failed) {
    throw new Error(response.errorMessage || `Postprocess model stopped with ${stopReason}`);
  }

  // Fall back to the untouched transcript when cleaning leaves nothing.
  const cleaned = cleanPostprocessOutput(extractAssistantText(response));
  return cleaned ? cleaned : rawText;
}
|
|
882
|
+
|
|
883
|
+
/**
 * Paste `text` into the pi editor after trimming it; blank text is ignored.
 */
function insertIntoEditor(ctx: ExtensionContext, text: string) {
  const payload = text.trim();
  if (payload.length === 0) return;
  ctx.ui.pasteToEditor(payload);
}
|
|
621
888
|
|
|
622
889
|
async function isRecording(config: VoiceConfig): Promise<boolean> {
|
|
@@ -628,8 +895,9 @@ async function startRecording(ctx: ExtensionContext) {
|
|
|
628
895
|
const config = getConfig();
|
|
629
896
|
const existing = readState(config);
|
|
630
897
|
if (existing && pidAlive(existing.pid)) {
|
|
631
|
-
|
|
632
|
-
ctx.ui.
|
|
898
|
+
const deviceName = existing.deviceName || recordingDeviceName(config, commandExists("pw-record") ? "pw-record" : "arecord");
|
|
899
|
+
ctx.ui.notify(`Already recording: pid=${existing.pid}. ${recordingStatusText(deviceName)}`, "warning");
|
|
900
|
+
ctx.ui.setStatus("voice-input", ctx.ui.theme.fg("error", recordingStatusText(deviceName)));
|
|
633
901
|
return;
|
|
634
902
|
}
|
|
635
903
|
if (existing) clearState(config);
|
|
@@ -639,6 +907,7 @@ async function startRecording(ctx: ExtensionContext) {
|
|
|
639
907
|
const outputPath = path.join(config.recordingsDir, `recording-${timestampForFilename()}.wav`);
|
|
640
908
|
const logPath = path.join(config.logDir, `recording-${timestampForFilename()}.log`);
|
|
641
909
|
const cmd = recorderCommand(config, outputPath);
|
|
910
|
+
const deviceName = recordingDeviceName(config, cmd[0]);
|
|
642
911
|
|
|
643
912
|
ctx.ui.setStatus("voice-input", ctx.ui.theme.fg("warning", "● starting mic"));
|
|
644
913
|
const logFd = openSync(logPath, "a");
|
|
@@ -656,10 +925,11 @@ async function startRecording(ctx: ExtensionContext) {
|
|
|
656
925
|
logPath,
|
|
657
926
|
startedAt: new Date().toISOString(),
|
|
658
927
|
recorderTarget: config.recorderTarget || undefined,
|
|
928
|
+
deviceName,
|
|
659
929
|
});
|
|
660
930
|
|
|
661
|
-
ctx.ui.setStatus("voice-input", ctx.ui.theme.fg("error",
|
|
662
|
-
ctx.ui.notify(
|
|
931
|
+
ctx.ui.setStatus("voice-input", ctx.ui.theme.fg("error", recordingStatusText(deviceName)));
|
|
932
|
+
ctx.ui.notify(`${recordingStatusText(deviceName)}. Press Ctrl+Shift+R again to stop/transcribe.`, "info");
|
|
663
933
|
}
|
|
664
934
|
|
|
665
935
|
async function stopRecording(ctx: ExtensionContext, transcribe = true) {
|
|
@@ -691,9 +961,9 @@ async function stopRecording(ctx: ExtensionContext, transcribe = true) {
|
|
|
691
961
|
const { pcm, durationMs } = parseRecordedWav(state.path);
|
|
692
962
|
const decodeMs = Date.now() - decodeStart;
|
|
693
963
|
const result = await transcribePcm(pcm, durationMs, config);
|
|
694
|
-
ctx.ui.setStatus("voice-input", undefined);
|
|
695
964
|
|
|
696
965
|
if (!result.text.trim()) {
|
|
966
|
+
ctx.ui.setStatus("voice-input", undefined);
|
|
697
967
|
ctx.ui.notify(
|
|
698
968
|
`Transcription finished but no text was returned. audio=${(durationMs / 1000).toFixed(2)}s total=${result.timings.totalMs}ms`,
|
|
699
969
|
"warning",
|
|
@@ -701,9 +971,31 @@ async function stopRecording(ctx: ExtensionContext, transcribe = true) {
|
|
|
701
971
|
return;
|
|
702
972
|
}
|
|
703
973
|
|
|
704
|
-
|
|
974
|
+
let finalText = result.text;
|
|
975
|
+
let postprocessMs = 0;
|
|
976
|
+
let postprocessUsed = false;
|
|
977
|
+
if (config.postprocessEnabled) {
|
|
978
|
+
ctx.ui.setStatus("voice-input", ctx.ui.theme.fg("warning", "● polishing"));
|
|
979
|
+
const postprocessStart = Date.now();
|
|
980
|
+
try {
|
|
981
|
+
finalText = await postprocessTranscript(ctx, result.text, config);
|
|
982
|
+
postprocessMs = Date.now() - postprocessStart;
|
|
983
|
+
postprocessUsed = finalText.trim() !== result.text.trim();
|
|
984
|
+
} catch (error) {
|
|
985
|
+
postprocessMs = Date.now() - postprocessStart;
|
|
986
|
+
ctx.ui.notify(
|
|
987
|
+
`Voice postprocess failed; inserting raw transcript. ${error instanceof Error ? error.message : String(error)}`,
|
|
988
|
+
"warning",
|
|
989
|
+
);
|
|
990
|
+
}
|
|
991
|
+
}
|
|
992
|
+
|
|
993
|
+
ctx.ui.setStatus("voice-input", undefined);
|
|
994
|
+
insertIntoEditor(ctx, finalText);
|
|
705
995
|
ctx.ui.notify(
|
|
706
|
-
`Voice text inserted. audio=${(durationMs / 1000).toFixed(2)}s decode=${decodeMs}ms asr=${result.timings.totalMs}ms packets=${result.packets}`,
|
|
996
|
+
`Voice text inserted. audio=${(durationMs / 1000).toFixed(2)}s decode=${decodeMs}ms asr=${result.timings.totalMs}ms${
|
|
997
|
+
config.postprocessEnabled ? ` postprocess=${postprocessMs}ms${postprocessUsed ? " polished" : ""}` : ""
|
|
998
|
+
} packets=${result.packets}`,
|
|
707
999
|
"info",
|
|
708
1000
|
);
|
|
709
1001
|
}
|
|
@@ -722,8 +1014,11 @@ function setupHelp(config = getConfig()): string {
|
|
|
722
1014
|
return [
|
|
723
1015
|
"pi Voice Input setup:",
|
|
724
1016
|
"- Current provider: VolcEngine WebSocket ASR",
|
|
1017
|
+
`- Config file: ${config.configPath}`,
|
|
725
1018
|
`- API key: ${config.apiKey ? "set" : "missing"}`,
|
|
1019
|
+
"- To create/update the JSON config file, run: /voice init",
|
|
726
1020
|
"- To save/update the key, run: /voice key",
|
|
1021
|
+
`- Polish: ${config.postprocessEnabled ? config.postprocessModel : "disabled"}`,
|
|
727
1022
|
`- Get/create a VolcEngine Speech API key here: ${VOLC_API_KEY_URL}`,
|
|
728
1023
|
"- After saving the key, run: /voice config",
|
|
729
1024
|
].join("\n");
|
|
@@ -734,12 +1029,12 @@ async function configureApiKey(ctx: ExtensionContext, providedKey = "") {
|
|
|
734
1029
|
|
|
735
1030
|
if (!apiKey) {
|
|
736
1031
|
if (!ctx.hasUI) {
|
|
737
|
-
ctx.ui.notify(`Run /voice key in interactive pi, or
|
|
1032
|
+
ctx.ui.notify(`Run /voice key in interactive pi, or edit ${CONFIG_PATH}. Get a key from ${VOLC_API_KEY_URL}.`, "error");
|
|
738
1033
|
return;
|
|
739
1034
|
}
|
|
740
1035
|
ctx.ui.notify(`Get/create a VolcEngine Speech API key here:\n${VOLC_API_KEY_URL}`, "info");
|
|
741
1036
|
const current = getConfig().apiKey;
|
|
742
|
-
const placeholder = current ? "Paste a new VolcEngine API key (current key is already set)" : "Paste
|
|
1037
|
+
const placeholder = current ? "Paste a new VolcEngine API key (current key is already set)" : "Paste VolcEngine API key";
|
|
743
1038
|
apiKey = (await ctx.ui.input("VolcEngine API key", placeholder))?.trim() ?? "";
|
|
744
1039
|
}
|
|
745
1040
|
|
|
@@ -748,25 +1043,21 @@ async function configureApiKey(ctx: ExtensionContext, providedKey = "") {
|
|
|
748
1043
|
return;
|
|
749
1044
|
}
|
|
750
1045
|
|
|
751
|
-
|
|
752
|
-
ctx.ui.notify(
|
|
1046
|
+
writeConfigApiKey(apiKey);
|
|
1047
|
+
ctx.ui.notify(`VolcEngine API key saved in ${CONFIG_PATH}. Run /voice config to verify it is detected.`, "info");
|
|
753
1048
|
}
|
|
754
1049
|
|
|
755
1050
|
function configSummary(config: VoiceConfig): string {
|
|
1051
|
+
const recorderExecutable = commandExists("pw-record") ? "pw-record" : commandExists("arecord") ? "arecord" : "";
|
|
1052
|
+
const currentDevice = recorderExecutable ? recordingDeviceName(config, recorderExecutable) : "no recorder found";
|
|
756
1053
|
return [
|
|
757
1054
|
"Voice input config:",
|
|
758
|
-
`-
|
|
759
|
-
`-
|
|
760
|
-
`-
|
|
761
|
-
`-
|
|
762
|
-
|
|
763
|
-
`- segment: ${config.segmentMs}ms`,
|
|
764
|
-
`- recordings: ${config.recordingsDir}`,
|
|
765
|
-
`- state: ${config.statePath}`,
|
|
766
|
-
`- shortcut: ${config.shortcut}`,
|
|
767
|
-
"Run /voice key to save/update the current provider API key.",
|
|
1055
|
+
`- config file: ${config.configPath}${existsSync(config.configPath) ? "" : " (missing; run /voice init to create it)"}`,
|
|
1056
|
+
`- volcApiKey: ${config.apiKey ? "set" : "missing"} (update with /voice key)`,
|
|
1057
|
+
`- polishModel: ${config.postprocessEnabled ? config.postprocessModel : "disabled"}`,
|
|
1058
|
+
`- current recording device: ${currentDevice}`,
|
|
1059
|
+
"Config keys: volcApiKey, polishModel. Leave polishModel empty to disable polish.",
|
|
768
1060
|
`VolcEngine API key URL: ${VOLC_API_KEY_URL}`,
|
|
769
|
-
"Config files checked: ~/.pi/agent/voice-input.env, package .env, current .env; shell env overrides them.",
|
|
770
1061
|
].join("\n");
|
|
771
1062
|
}
|
|
772
1063
|
|
|
@@ -786,7 +1077,7 @@ export default function (pi: ExtensionAPI) {
|
|
|
786
1077
|
});
|
|
787
1078
|
|
|
788
1079
|
pi.registerCommand("voice", {
|
|
789
|
-
description: "Voice input: start | stop | status | toggle | cancel | config | key | help",
|
|
1080
|
+
description: "Voice input: start | stop | status | toggle | cancel | config | init | key | help",
|
|
790
1081
|
handler: async (args, ctx) => {
|
|
791
1082
|
const input = (args || "toggle").trim();
|
|
792
1083
|
const action = (input.split(/\s+/, 1)[0] || "toggle").toLowerCase();
|
|
@@ -814,6 +1105,11 @@ export default function (pi: ExtensionAPI) {
|
|
|
814
1105
|
ctx.ui.notify(configSummary(getConfig()), "info");
|
|
815
1106
|
return;
|
|
816
1107
|
}
|
|
1108
|
+
if (action === "init") {
|
|
1109
|
+
const created = ensureConfigFile();
|
|
1110
|
+
ctx.ui.notify(`${created ? "Created" : "Updated"} voice input config: ${CONFIG_PATH}`, "info");
|
|
1111
|
+
return;
|
|
1112
|
+
}
|
|
817
1113
|
if (["key", "api-key", "apikey", "setup", "configure"].includes(action)) {
|
|
818
1114
|
await configureApiKey(ctx, rest);
|
|
819
1115
|
return;
|
|
@@ -826,7 +1122,7 @@ export default function (pi: ExtensionAPI) {
|
|
|
826
1122
|
await toggleRecording(ctx);
|
|
827
1123
|
return;
|
|
828
1124
|
}
|
|
829
|
-
ctx.ui.notify("Usage: /voice start | stop | status | toggle | cancel | config | key | help", "error");
|
|
1125
|
+
ctx.ui.notify("Usage: /voice start | stop | status | toggle | cancel | config | init | key | help", "error");
|
|
830
1126
|
} catch (error) {
|
|
831
1127
|
ctx.ui.setStatus("voice-input", undefined);
|
|
832
1128
|
ctx.ui.notify(`Voice command error: ${error instanceof Error ? error.message : String(error)}`, "error");
|
|
@@ -842,7 +1138,8 @@ export default function (pi: ExtensionAPI) {
|
|
|
842
1138
|
ctx.ui.notify(
|
|
843
1139
|
[
|
|
844
1140
|
`Voice input loaded: ${startupConfig.shortcut} toggles recording.`,
|
|
845
|
-
"API key is missing. Run /voice key to set it up.",
|
|
1141
|
+
"API key is missing. Run /voice key to set it up, or edit the JSON config file.",
|
|
1142
|
+
`Config file: ${startupConfig.configPath}`,
|
|
846
1143
|
`Get/create a VolcEngine Speech API key here: ${VOLC_API_KEY_URL}`,
|
|
847
1144
|
].join("\n"),
|
|
848
1145
|
"warning",
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pi-voice-input",
|
|
3
|
-
"version": "0.1.3",
|
|
3
|
+
"version": "0.2.1",
|
|
4
4
|
"description": "provider-extensible voice input extension for pi",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"keywords": [
|
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
"author": "tr-nc",
|
|
16
16
|
"repository": {
|
|
17
17
|
"type": "git",
|
|
18
|
-
"url": "https://github.com/tr-nc/pi-voice-input.git"
|
|
18
|
+
"url": "git+https://github.com/tr-nc/pi-voice-input.git"
|
|
19
19
|
},
|
|
20
20
|
"bugs": {
|
|
21
21
|
"url": "https://github.com/tr-nc/pi-voice-input/issues"
|
|
@@ -23,7 +23,6 @@
|
|
|
23
23
|
"homepage": "https://github.com/tr-nc/pi-voice-input#readme",
|
|
24
24
|
"files": [
|
|
25
25
|
"extensions",
|
|
26
|
-
".env.example",
|
|
27
26
|
"README.md",
|
|
28
27
|
"AGENTS.md"
|
|
29
28
|
],
|
|
@@ -36,6 +35,7 @@
|
|
|
36
35
|
"ws": "^8.20.1"
|
|
37
36
|
},
|
|
38
37
|
"devDependencies": {
|
|
38
|
+
"@earendil-works/pi-ai": "*",
|
|
39
39
|
"@earendil-works/pi-coding-agent": "*",
|
|
40
40
|
"@earendil-works/pi-tui": "*",
|
|
41
41
|
"@types/node": "^25.8.0",
|
|
@@ -43,10 +43,14 @@
|
|
|
43
43
|
"typescript": "^6.0.3"
|
|
44
44
|
},
|
|
45
45
|
"peerDependencies": {
|
|
46
|
+
"@earendil-works/pi-ai": "*",
|
|
46
47
|
"@earendil-works/pi-coding-agent": "*",
|
|
47
48
|
"@earendil-works/pi-tui": "*"
|
|
48
49
|
},
|
|
49
50
|
"peerDependenciesMeta": {
|
|
51
|
+
"@earendil-works/pi-ai": {
|
|
52
|
+
"optional": true
|
|
53
|
+
},
|
|
50
54
|
"@earendil-works/pi-coding-agent": {
|
|
51
55
|
"optional": true
|
|
52
56
|
},
|
package/.env.example
DELETED
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
# Copy to ~/.pi/agent/voice-input.env or to this package as .env.
|
|
2
|
-
# Do not commit real credentials.
|
|
3
|
-
|
|
4
|
-
# Required for the current provider: VolcEngine speech API key.
|
|
5
|
-
VOLC_API_KEY=
|
|
6
|
-
|
|
7
|
-
# Optional ASR settings.
|
|
8
|
-
VOLC_WS_URL=wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_nostream
|
|
9
|
-
VOLC_STREAM_RESOURCE_ID=volc.seedasr.sauc.duration
|
|
10
|
-
ASR_LANGUAGE=
|
|
11
|
-
ASR_PROMPT=
|
|
12
|
-
STREAM_SEGMENT_MS=5000
|
|
13
|
-
ASR_REQUEST_TIMEOUT_MS=90000
|
|
14
|
-
|
|
15
|
-
# Optional recorder settings.
|
|
16
|
-
# Leave empty to let PipeWire choose the default microphone.
|
|
17
|
-
RECORDER_TARGET=
|
|
18
|
-
RECORDING_FINALIZE_DELAY=0.1
|
|
19
|
-
|
|
20
|
-
# Optional storage settings. Defaults to ~/.pi/agent/voice-input.
|
|
21
|
-
VOICE_INPUT_HOME=~/.pi/agent/voice-input
|
|
22
|
-
RECORDINGS_DIR=recordings
|
|
23
|
-
RECORDER_STATE=recording.json
|
|
24
|
-
RECORDER_LOG_DIR=logs
|
|
25
|
-
|
|
26
|
-
# Optional shortcut. Default is Ctrl+Shift+R.
|
|
27
|
-
VOICE_INPUT_SHORTCUT=ctrl+shift+r
|