pi-voice-input 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +27 -0
- package/README.md +201 -0
- package/extensions/voice-input.ts +761 -0
- package/package.json +55 -0
package/.env.example
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# Copy to ~/.pi/agent/voice-input.env or to this package as .env.
|
|
2
|
+
# Do not commit real credentials.
|
|
3
|
+
|
|
4
|
+
# Required for the current provider: VolcEngine speech API key.
|
|
5
|
+
VOLC_API_KEY=
|
|
6
|
+
|
|
7
|
+
# Optional ASR settings.
|
|
8
|
+
VOLC_WS_URL=wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_nostream
|
|
9
|
+
VOLC_STREAM_RESOURCE_ID=volc.seedasr.sauc.duration
|
|
10
|
+
ASR_LANGUAGE=
|
|
11
|
+
ASR_PROMPT=
|
|
12
|
+
STREAM_SEGMENT_MS=5000
|
|
13
|
+
ASR_REQUEST_TIMEOUT_MS=90000
|
|
14
|
+
|
|
15
|
+
# Optional recorder settings.
|
|
16
|
+
# Leave empty to let PipeWire choose the default microphone.
|
|
17
|
+
RECORDER_TARGET=
|
|
18
|
+
RECORDING_FINALIZE_DELAY=0.1
|
|
19
|
+
|
|
20
|
+
# Optional storage settings. Defaults to ~/.pi/agent/voice-input.
|
|
21
|
+
VOICE_INPUT_HOME=~/.pi/agent/voice-input
|
|
22
|
+
RECORDINGS_DIR=recordings
|
|
23
|
+
RECORDER_STATE=recording.json
|
|
24
|
+
RECORDER_LOG_DIR=logs
|
|
25
|
+
|
|
26
|
+
# Optional shortcut. Default is Ctrl+Shift+R.
|
|
27
|
+
VOICE_INPUT_SHORTCUT=ctrl+shift+r
|
package/README.md
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
# pi Voice Input
|
|
2
|
+
|
|
3
|
+
A publishable, pure TypeScript [pi](https://pi.dev/) extension for local voice input.
|
|
4
|
+
|
|
5
|
+
- Press `Ctrl+Shift+R` once to start recording.
|
|
6
|
+
- Press `Ctrl+Shift+R` again to stop.
|
|
7
|
+
- The extension sends the audio to an ASR provider.
|
|
8
|
+
- The recognized text is inserted into pi's editor without submitting.
|
|
9
|
+
|
|
10
|
+
The provider layer is intended to be extensible. **Current version supports only VolcEngine WebSocket ASR.**
|
|
11
|
+
|
|
12
|
+
No Python, `uv`, upload service, or `ffmpeg` is required for normal shortcut usage.
|
|
13
|
+
|
|
14
|
+
## Architecture
|
|
15
|
+
|
|
16
|
+
```text
|
|
17
|
+
pi extension: extensions/voice-input.ts
|
|
18
|
+
├─ registers Ctrl+Shift+R and /voice commands
|
|
19
|
+
├─ starts/stops a local recorder process
|
|
20
|
+
│ ├─ preferred: pw-record
|
|
21
|
+
│ └─ fallback: arecord
|
|
22
|
+
├─ records 16 kHz mono 16-bit WAV
|
|
23
|
+
├─ parses the WAV container in TypeScript and extracts raw PCM
|
|
24
|
+
├─ sends PCM frames to the configured ASR provider via ws
|
|
25
|
+
│ └─ current provider: VolcEngine /api/v3/sauc/bigmodel_nostream
|
|
26
|
+
└─ appends the final transcript to pi's editor with ctx.ui.setEditorText()
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Runtime package dependency:
|
|
30
|
+
|
|
31
|
+
- `ws`
|
|
32
|
+
|
|
33
|
+
System dependency, one of:
|
|
34
|
+
|
|
35
|
+
- `pw-record` from PipeWire tools, preferred
|
|
36
|
+
- `arecord` from alsa-utils, fallback
|
|
37
|
+
|
|
38
|
+
## Install
|
|
39
|
+
|
|
40
|
+
Install the published package with pi:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pi install npm:pi-voice-input
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
To pin a specific version:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pi install npm:pi-voice-input@0.1.0
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
If pi is already running, reload extensions after installation:
|
|
53
|
+
|
|
54
|
+
```text
|
|
55
|
+
/reload
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Providers
|
|
59
|
+
|
|
60
|
+
The extension is structured around a provider boundary: recording, editor insertion, and command handling are generic; ASR transport/protocol logic is provider-specific.
|
|
61
|
+
|
|
62
|
+
Currently implemented provider:
|
|
63
|
+
|
|
64
|
+
- VolcEngine WebSocket ASR (`bigmodel_nostream`)
|
|
65
|
+
|
|
66
|
+
Planned provider direction:
|
|
67
|
+
|
|
68
|
+
- add more ASR providers without changing the shortcut/user workflow
|
|
69
|
+
- keep provider credentials and options isolated in config
|
|
70
|
+
|
|
71
|
+
## Configure credentials
|
|
72
|
+
|
|
73
|
+
Create a config file:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
mkdir -p ~/.pi/agent
|
|
77
|
+
cp .env.example ~/.pi/agent/voice-input.env
|
|
78
|
+
$EDITOR ~/.pi/agent/voice-input.env
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
At minimum, set:
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
VOLC_API_KEY=your_volcengine_speech_api_key
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
You can get/manage the key here:
|
|
88
|
+
|
|
89
|
+
https://console.volcengine.com/speech/new/setting/apikeys?projectName=default
|
|
90
|
+
|
|
91
|
+
If `VOLC_API_KEY` is missing, the extension does not silently fail. It shows an error notification explaining:
|
|
92
|
+
|
|
93
|
+
- that `VOLC_API_KEY` is missing
|
|
94
|
+
- where to put it: `~/.pi/agent/voice-input.env`
|
|
95
|
+
- the exact config line to add
|
|
96
|
+
- the Volcengine API-key settings URL
|
|
97
|
+
- that `/voice config` can be used to verify detection
|
|
98
|
+
|
|
99
|
+
## Configuration reference
|
|
100
|
+
|
|
101
|
+
Example:
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
# Required
|
|
105
|
+
VOLC_API_KEY=your_volcengine_speech_api_key
|
|
106
|
+
|
|
107
|
+
# Current provider: VolcEngine WebSocket ASR endpoint and resource
|
|
108
|
+
VOLC_WS_URL=wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_nostream
|
|
109
|
+
VOLC_STREAM_RESOURCE_ID=volc.seedasr.sauc.duration
|
|
110
|
+
|
|
111
|
+
# Empty means auto-detect. Example: zh-CN.
|
|
112
|
+
ASR_LANGUAGE=
|
|
113
|
+
|
|
114
|
+
# Optional contextual prompt for ASR.
|
|
115
|
+
ASR_PROMPT=
|
|
116
|
+
|
|
117
|
+
# Larger segments are faster for post-recording batch transcription. Use 200 for realtime-like packet sizes.
|
|
118
|
+
STREAM_SEGMENT_MS=5000
|
|
119
|
+
ASR_REQUEST_TIMEOUT_MS=90000
|
|
120
|
+
|
|
121
|
+
# Empty means use PipeWire's default source.
|
|
122
|
+
RECORDER_TARGET=
|
|
123
|
+
RECORDING_FINALIZE_DELAY=0.1
|
|
124
|
+
|
|
125
|
+
# Storage for recordings, logs, and state.
|
|
126
|
+
VOICE_INPUT_HOME=~/.pi/agent/voice-input
|
|
127
|
+
RECORDINGS_DIR=recordings
|
|
128
|
+
RECORDER_STATE=recording.json
|
|
129
|
+
RECORDER_LOG_DIR=logs
|
|
130
|
+
|
|
131
|
+
# Shortcut. Default: ctrl+shift+r
|
|
132
|
+
VOICE_INPUT_SHORTCUT=ctrl+shift+r
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
Config loading order, later values override earlier ones:
|
|
136
|
+
|
|
137
|
+
1. `~/.pi/agent/voice-input.env`
|
|
138
|
+
2. package-local `.env`
|
|
139
|
+
3. current-working-directory `.env`
|
|
140
|
+
4. shell environment variables
|
|
141
|
+
|
|
142
|
+
Do not commit real credentials. Keep private local values in `.env` or `~/.pi/agent/voice-input.env`.
|
|
143
|
+
|
|
144
|
+
## Usage
|
|
145
|
+
|
|
146
|
+
Shortcut:
|
|
147
|
+
|
|
148
|
+
```text
|
|
149
|
+
Ctrl+Shift+R
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
Slash commands:
|
|
153
|
+
|
|
154
|
+
```text
|
|
155
|
+
/voice start # start recording
|
|
156
|
+
/voice stop # stop, transcribe, insert text
|
|
157
|
+
/voice toggle # start if idle, stop if recording
|
|
158
|
+
/voice cancel # stop recording without transcribing
|
|
159
|
+
/voice status # show recorder state
|
|
160
|
+
/voice config # show effective non-secret config and whether API key is detected
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## Notes
|
|
164
|
+
|
|
165
|
+
- The extension uses post-recording WebSocket ASR: it records locally first, then sends the stopped recording in chunks. It is optimized for fast voice input, not live subtitles.
|
|
166
|
+
- The default `STREAM_SEGMENT_MS=5000` is intentionally larger than realtime packet sizes because this workflow sends already-recorded audio.
|
|
167
|
+
- The transcript is inserted into the editor only; it is not submitted automatically.
|
|
168
|
+
|
|
169
|
+
## Development
|
|
170
|
+
|
|
171
|
+
Clone the repo and install dependencies:
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
git clone git@github.com:tr-nc/pi-voice-input.git
|
|
175
|
+
cd pi-voice-input
|
|
176
|
+
npm install
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
Run directly without installing the package:
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
pi -e ./extensions/voice-input.ts
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
Or install the local checkout while developing:
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
pi install .
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
After changing the extension while pi is open, run:
|
|
192
|
+
|
|
193
|
+
```text
|
|
194
|
+
/reload
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
## Volcengine links
|
|
198
|
+
|
|
199
|
+
- API key settings: https://console.volcengine.com/speech/new/setting/apikeys?projectName=default
|
|
200
|
+
- ASR product page: https://www.volcengine.com/product/asr
|
|
201
|
+
- WebSocket ASR docs: https://www.volcengine.com/docs/6561/1354869?lang=zh
|
|
@@ -0,0 +1,761 @@
|
|
|
1
|
+
import type { ExtensionAPI, ExtensionContext } from "@earendil-works/pi-coding-agent";
|
|
2
|
+
import { Key } from "@earendil-works/pi-tui";
|
|
3
|
+
import { spawn, spawnSync } from "node:child_process";
|
|
4
|
+
import { randomUUID } from "node:crypto";
|
|
5
|
+
import {
|
|
6
|
+
closeSync,
|
|
7
|
+
existsSync,
|
|
8
|
+
mkdirSync,
|
|
9
|
+
openSync,
|
|
10
|
+
readFileSync,
|
|
11
|
+
statSync,
|
|
12
|
+
unlinkSync,
|
|
13
|
+
writeFileSync,
|
|
14
|
+
} from "node:fs";
|
|
15
|
+
import { homedir } from "node:os";
|
|
16
|
+
import path from "node:path";
|
|
17
|
+
import { fileURLToPath } from "node:url";
|
|
18
|
+
import { gzipSync, gunzipSync } from "node:zlib";
|
|
19
|
+
import WebSocket from "ws";
|
|
20
|
+
|
|
21
|
+
/** Directory that contains this extension module, derived from the module URL. */
const EXTENSION_DIR = path.dirname(fileURLToPath(import.meta.url));
/** Parent directory of the extension directory; used to look up a package-local `.env`. */
const PACKAGE_ROOT = path.join(EXTENSION_DIR, "..");
/** Shortcut used when VOICE_INPUT_SHORTCUT is unset or blank. */
const DEFAULT_SHORTCUT = Key.ctrlShift("r");

// 4-bit message type codes (high nibble of header byte 1).
const MSG_TYPE_CLIENT_FULL_REQUEST = 0x1;
const MSG_TYPE_CLIENT_AUDIO_ONLY_REQUEST = 0x2;
const MSG_TYPE_SERVER_FULL_RESPONSE = 0x9;
const MSG_TYPE_SERVER_ERROR_RESPONSE = 0xf;
// 4-bit flag codes (low nibble of header byte 1). Bit 0 means a sequence number
// follows the header; bit 1 marks the final frame.
const FLAG_POS_SEQUENCE = 0x1;
const FLAG_NEG_WITH_SEQUENCE = 0x3;
// 4-bit serialization codes (high nibble of header byte 2).
const SERIALIZATION_NONE = 0x0;
const SERIALIZATION_JSON = 0x1;
// 4-bit compression code (low nibble of header byte 2).
const COMPRESSION_GZIP = 0x1;
|
|
34
|
+
|
|
35
|
+
/** Flat map of environment-style variable names to their raw string values. */
type EnvMap = Record<string, string>;
|
|
36
|
+
|
|
37
|
+
/** Effective extension configuration, as assembled by getConfig(). */
type VoiceConfig = {
  /** API key sent as the `X-Api-Key` header; empty when none is configured. */
  apiKey: string;
  /** WebSocket endpoint of the ASR service (VOLC_WS_URL). */
  wsUrl: string;
  /** Value of the `X-Api-Resource-Id` header (VOLC_STREAM_RESOURCE_ID). */
  resourceId: string;
  /** Optional language hint (ASR_LANGUAGE); empty means none is sent. */
  language: string;
  /** User id placed in the request payload (ASR_UID). */
  uid: string;
  /** Optional context prompt forwarded to the ASR request (ASR_PROMPT). */
  prompt: string;
  /** Audio duration carried by each packet, in milliseconds (STREAM_SEGMENT_MS). */
  segmentMs: number;
  /** Overall wait limit for the final ASR response, in milliseconds. */
  requestTimeoutMs: number;
  /** RECORDING_FINALIZE_DELAY (seconds) converted to milliseconds. */
  finalizeDelayMs: number;
  /** Optional `--target` passed to pw-record (RECORDER_TARGET). */
  recorderTarget: string;
  /** Absolute directory where recordings are written. */
  recordingsDir: string;
  /** Absolute path of the JSON file describing the active recording. */
  statePath: string;
  /** Absolute directory for recorder log files. */
  logDir: string;
  /** Keyboard shortcut identifier (VOICE_INPUT_SHORTCUT). */
  shortcut: string;
  /** Sent as `enable_itn` in the ASR request (ENABLE_ITN). */
  enableItn: boolean;
  /** Sent as `enable_punc` in the ASR request (ENABLE_PUNC). */
  enablePunc: boolean;
  /** Sent as `enable_ddc` in the ASR request (ENABLE_DDC). */
  enableDdc: boolean;
  /** Sent as `show_utterances` in the ASR request (SHOW_UTTERANCES). */
  showUtterances: boolean;
};
|
|
57
|
+
|
|
58
|
+
/** Contents of the on-disk state file that describes a recording in progress. */
type RecordingState = {
  /** Process id of the recorder process. */
  pid: number;
  /** Path of the WAV file being recorded. */
  path: string;
  /** Path of the recorder's log file. */
  logPath: string;
  /** Start time as a string (presumably an ISO timestamp; confirm against the writer). */
  startedAt: string;
  /** Recorder target in use, when one was configured. */
  recorderTarget?: string;
};
|
|
65
|
+
|
|
66
|
+
/** A server frame after header parsing and payload decoding. */
type DecodedFrame = {
  /** 4-bit message type taken from the frame header. */
  messageType: number;
  /** Sequence number when the frame carries one, otherwise null. */
  sequence: number | null;
  /** True when the header flags mark this as the final frame. */
  isLast: boolean;
  /** Decoded payload: parsed JSON, raw bytes, or null for unhandled message types. */
  payload: unknown;
};
|
|
72
|
+
|
|
73
|
+
/** Outcome of one transcription request, including timing diagnostics. */
type TranscriptionResult = {
  /** Final recognized text (may be empty). */
  text: string;
  /** Duration of the submitted audio in milliseconds. */
  durationMs: number;
  /** Number of audio packets sent. */
  packets: number;
  /** Wall-clock timings in milliseconds. */
  timings: {
    /** Time spent opening the WebSocket. */
    wsOpenMs: number;
    /** Time spent sending the request and audio frames. */
    sendMs: number;
    /** Time spent waiting for the final response after sending. */
    waitMs: number;
    /** Total time from socket creation to result. */
    totalMs: number;
  };
};
|
|
84
|
+
|
|
85
|
+
/**
 * Parse dotenv-style text into a key/value map.
 *
 * Blank lines, lines starting with `#`, and lines that are not `KEY=value`
 * are skipped. An optional leading `export ` is accepted. A value wrapped in
 * one matching pair of single or double quotes has the quotes removed.
 * Later assignments to the same key override earlier ones.
 *
 * @param text Raw file contents.
 * @returns Map of variable name to (possibly empty) value.
 */
function parseEnvText(text: string): EnvMap {
  const env: EnvMap = {};
  for (const rawLine of text.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (!line || line.startsWith("#")) continue;
    const match = line.match(/^(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)\s*=\s*(.*)$/);
    if (!match) continue;
    const key = match[1];
    let value = match[2] ?? "";
    // Require at least two characters so a lone quote character is kept
    // verbatim instead of being "unquoted" into an empty string.
    const first = value[0];
    const isQuoted = value.length >= 2 && (first === '"' || first === "'") && value.endsWith(first);
    if (isQuoted) {
      value = value.slice(1, -1);
    }
    env[key] = value;
  }
  return env;
}
|
|
101
|
+
|
|
102
|
+
/**
 * Read and merge the env files this extension recognizes.
 *
 * Files are applied in order (user config, package-local `.env`, then the
 * current working directory's `.env`), so later files override earlier ones.
 * Missing files are skipped.
 *
 * @returns The merged key/value map (empty when no file exists).
 */
function loadEnvFiles(): EnvMap {
  const searchOrder = [
    path.join(homedir(), ".pi", "agent", "voice-input.env"),
    path.join(PACKAGE_ROOT, ".env"),
    path.join(process.cwd(), ".env"),
  ];
  return searchOrder.reduce<EnvMap>((combined, candidate) => {
    if (existsSync(candidate)) {
      Object.assign(combined, parseEnvText(readFileSync(candidate, "utf8")));
    }
    return combined;
  }, {});
}
|
|
115
|
+
|
|
116
|
+
/**
 * Look up a setting, preferring the process environment over file values.
 *
 * @param env Values loaded from env files.
 * @param name Variable name.
 * @param fallback Returned when the variable is defined in neither place.
 * @returns The first defined value (an empty string counts as defined).
 */
function setting(env: EnvMap, name: string, fallback = ""): string {
  const fromProcess = process.env[name];
  if (fromProcess != null) return fromProcess;
  const fromFile = env[name];
  return fromFile != null ? fromFile : fallback;
}
|
|
120
|
+
|
|
121
|
+
/**
 * Return the first non-empty value among several alias names.
 *
 * For each name the process environment is consulted before the file values;
 * a name whose resolved value is missing or empty is skipped.
 *
 * @param env Values loaded from env files.
 * @param names Variable names in priority order.
 * @param fallback Returned when no name yields a non-empty value.
 */
function settingAny(env: EnvMap, names: string[], fallback = ""): string {
  const firstHit = names
    .map((name) => process.env[name] ?? env[name])
    .find((candidate) => candidate != null && candidate !== "");
  return firstHit ?? fallback;
}
|
|
128
|
+
|
|
129
|
+
/**
 * Read a boolean setting.
 *
 * Accepts 1/true/yes/on and 0/false/no/off, case-insensitively and ignoring
 * surrounding whitespace. Any other value, or an unset variable, yields
 * `fallback`.
 */
function boolSetting(env: EnvMap, name: string, fallback: boolean): boolean {
  const normalized = setting(env, name, String(fallback)).trim().toLowerCase();
  switch (normalized) {
    case "1":
    case "true":
    case "yes":
    case "on":
      return true;
    case "0":
    case "false":
    case "no":
    case "off":
      return false;
    default:
      return fallback;
  }
}
|
|
135
|
+
|
|
136
|
+
/**
 * Read a numeric setting.
 *
 * The process environment takes precedence over file values, matching
 * setting(). An unset, blank, or non-finite value yields `fallback`.
 *
 * @param env Values loaded from env files.
 * @param name Variable name.
 * @param fallback Value used when the setting is missing or unusable.
 */
function numberSetting(env: EnvMap, name: string, fallback: number): number {
  const raw = (process.env[name] ?? env[name] ?? "").trim();
  // Number("") is 0, so a line such as `STREAM_SEGMENT_MS=` must be treated
  // as "not configured" rather than as zero.
  if (raw === "") return fallback;
  const value = Number(raw);
  return Number.isFinite(value) ? value : fallback;
}
|
|
141
|
+
|
|
142
|
+
/**
 * Restrict `value` to the range [min, max].
 *
 * The lower bound is applied first and the upper bound second, so the upper
 * bound wins if the bounds are inverted.
 */
function clamp(value: number, min: number, max: number): number {
  const atLeastMin = Math.max(min, value);
  return Math.min(max, atLeastMin);
}
|
|
145
|
+
|
|
146
|
+
/**
 * Expand a leading `~` or `~/` to the current user's home directory.
 * Any other input, including `~user/...`, is returned unchanged.
 */
function expandHome(value: string): string {
  const refersToHome = value === "~" || value.startsWith("~/");
  if (!refersToHome) return value;
  return value === "~" ? homedir() : path.join(homedir(), value.slice(2));
}
|
|
151
|
+
|
|
152
|
+
/**
 * Turn a configured path into an absolute one.
 *
 * `~` is expanded first; an absolute result is returned as is, and a
 * relative one is resolved against `baseDir`.
 */
function resolvePath(value: string, baseDir: string): string {
  const withHome = expandHome(value);
  if (path.isAbsolute(withHome)) return withHome;
  return path.resolve(baseDir, withHome);
}
|
|
156
|
+
|
|
157
|
+
/**
 * Build the effective configuration from env files and the process
 * environment (the process environment wins).
 *
 * Settings that need a value to be usable (endpoint, resource id, uid,
 * storage paths) fall back to their defaults when set to an empty or blank
 * string, so a template line such as `RECORDER_STATE=` cannot turn the state
 * path into a directory or blank out the WebSocket URL. Settings where empty
 * is meaningful (language, prompt, recorder target, API key) keep empty.
 *
 * Numeric values are clamped: segmentMs to 100..20000, requestTimeoutMs to
 * 1s..10min, finalizeDelayMs to 0..5000. Relative storage paths are resolved
 * against VOICE_INPUT_HOME, which itself resolves against the working directory.
 */
function getConfig(): VoiceConfig {
  const env = loadEnvFiles();

  // Like setting(), but a blank value is treated as "not configured".
  const nonBlank = (name: string, fallback: string): string => {
    const raw = setting(env, name, fallback);
    return raw.trim() === "" ? fallback : raw;
  };

  const defaultHome = path.join(homedir(), ".pi", "agent", "voice-input");
  const voiceHome = resolvePath(nonBlank("VOICE_INPUT_HOME", defaultHome), process.cwd());

  return {
    apiKey: settingAny(env, ["VOLC_API_KEY", "VOLCENGINE_API_KEY", "DOUBAO_ASR_API_KEY"]).trim(),
    wsUrl: nonBlank("VOLC_WS_URL", "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_nostream").trim(),
    resourceId: nonBlank("VOLC_STREAM_RESOURCE_ID", "volc.seedasr.sauc.duration").trim(),
    language: settingAny(env, ["ASR_LANGUAGE", "VOLC_ASR_LANGUAGE"], "").trim(),
    uid: nonBlank("ASR_UID", "pi-voice-input").trim(),
    prompt: setting(env, "ASR_PROMPT", "").trim(),
    segmentMs: clamp(Math.round(numberSetting(env, "STREAM_SEGMENT_MS", 5000)), 100, 20000),
    requestTimeoutMs: clamp(Math.round(numberSetting(env, "ASR_REQUEST_TIMEOUT_MS", 90000)), 1000, 10 * 60 * 1000),
    // Configured in seconds, stored in milliseconds.
    finalizeDelayMs: clamp(numberSetting(env, "RECORDING_FINALIZE_DELAY", 0.1) * 1000, 0, 5000),
    recorderTarget: setting(env, "RECORDER_TARGET", "").trim(),
    recordingsDir: resolvePath(nonBlank("RECORDINGS_DIR", "recordings"), voiceHome),
    statePath: resolvePath(nonBlank("RECORDER_STATE", "recording.json"), voiceHome),
    logDir: resolvePath(nonBlank("RECORDER_LOG_DIR", "logs"), voiceHome),
    shortcut: setting(env, "VOICE_INPUT_SHORTCUT", DEFAULT_SHORTCUT).trim() || DEFAULT_SHORTCUT,
    enableItn: boolSetting(env, "ENABLE_ITN", true),
    enablePunc: boolSetting(env, "ENABLE_PUNC", true),
    enableDdc: boolSetting(env, "ENABLE_DDC", false),
    showUtterances: boolSetting(env, "SHOW_UTTERANCES", false),
  };
}
|
|
183
|
+
|
|
184
|
+
/** Create `dir` and any missing parents; does nothing if it already exists. */
function ensureDir(dir: string) {
  const options = { recursive: true };
  mkdirSync(dir, options);
}
|
|
187
|
+
|
|
188
|
+
/**
 * Current UTC time formatted for use in a filename, e.g. `20240131T235959`
 * (the ISO timestamp without dashes, colons, milliseconds, or the `Z`).
 */
function timestampForFilename(): string {
  const iso = new Date().toISOString();
  const withoutMillis = iso.replace(/\.\d{3}Z$/, "");
  return withoutMillis.replace(/[-:]/g, "");
}
|
|
191
|
+
|
|
192
|
+
/**
 * Report whether `command` resolves in a login shell's PATH.
 *
 * The name is passed to the shell as a positional argument instead of being
 * spliced into the script text, so characters such as `;`, spaces, or `$`
 * cannot change what the probe executes.
 *
 * @param command Executable name to look up.
 * @returns True when `command -v` succeeds for that name.
 */
function commandExists(command: string): boolean {
  // After the -c script, the next argument becomes $0 and `command` becomes $1.
  const probe = spawnSync("sh", ["-lc", 'command -v "$1"', "sh", command], { stdio: "ignore" });
  return probe.status === 0;
}
|
|
195
|
+
|
|
196
|
+
/**
 * Build the argv for the recorder process (16 kHz, mono, signed 16-bit).
 *
 * pw-record is preferred; arecord is used when pw-record is not available.
 * The configured recorder target is only applied to pw-record.
 *
 * @param config Effective configuration (only `recorderTarget` is read).
 * @param outputPath File the recorder should write.
 * @returns Command and arguments, command first.
 * @throws Error when neither recorder is installed.
 */
function recorderCommand(config: VoiceConfig, outputPath: string): string[] {
  if (commandExists("pw-record")) {
    const targetArgs = config.recorderTarget ? ["--target", config.recorderTarget] : [];
    return ["pw-record", "--rate", "16000", "--channels", "1", "--format", "s16", ...targetArgs, outputPath];
  }
  if (!commandExists("arecord")) {
    throw new Error("No recorder found. Install PipeWire tools (pw-record) or alsa-utils (arecord).");
  }
  return ["arecord", "-q", "-f", "S16_LE", "-r", "16000", "-c", "1", "-t", "wav", outputPath];
}
|
|
208
|
+
|
|
209
|
+
/**
 * Load the persisted recording state, if there is a usable one.
 *
 * Returns null when the state file is missing, is not valid JSON, or does not
 * describe a plausible recording. A corrupt file is treated as "no recording"
 * so it cannot block every later command, and a pid that is not a positive
 * integer is rejected because pid 0 and negative pids address process groups
 * when passed to process.kill.
 *
 * @param config Effective configuration (only `statePath` is read).
 * @returns The stored state, or null.
 * @throws Filesystem errors other than "file not found".
 */
function readState(config: VoiceConfig): RecordingState | null {
  let raw: string;
  try {
    // Read directly instead of exists-then-read to avoid a race with deletion.
    raw = readFileSync(config.statePath, "utf8");
  } catch (error) {
    if ((error as NodeJS.ErrnoException).code === "ENOENT") return null;
    throw error;
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch {
    return null;
  }
  if (!parsed || typeof parsed !== "object") return null;

  const candidate = parsed as Partial<RecordingState>;
  const pidOk = typeof candidate.pid === "number" && Number.isInteger(candidate.pid) && candidate.pid > 0;
  if (!pidOk || typeof candidate.path !== "string") return null;
  return candidate as RecordingState;
}
|
|
213
|
+
|
|
214
|
+
/**
 * Persist the recording state as pretty-printed JSON at `config.statePath`,
 * creating the parent directory first if needed.
 */
function writeState(config: VoiceConfig, state: RecordingState) {
  const { statePath } = config;
  ensureDir(path.dirname(statePath));
  const serialized = JSON.stringify(state, null, 2);
  writeFileSync(statePath, serialized);
}
|
|
218
|
+
|
|
219
|
+
/**
 * Delete the state file. A file that is already absent is not an error;
 * any other filesystem failure is rethrown.
 */
function clearState(config: VoiceConfig) {
  try {
    unlinkSync(config.statePath);
  } catch (error) {
    const { code } = error as NodeJS.ErrnoException;
    if (code === "ENOENT") return;
    throw error;
  }
}
|
|
226
|
+
|
|
227
|
+
/**
 * Report whether a process with the given pid currently exists.
 *
 * Uses signal 0, which performs the existence and permission check without
 * delivering a signal. EPERM means the process exists but belongs to someone
 * else, so it still counts as alive.
 *
 * Only positive integer pids are accepted: with kill semantics, pid 0 and
 * negative pids address process groups rather than a single process, so a
 * corrupted state value must never be reported as a live recorder.
 */
function pidAlive(pid: number): boolean {
  if (!Number.isInteger(pid) || pid <= 0) return false;
  try {
    process.kill(pid, 0);
    return true;
  } catch (error) {
    return (error as NodeJS.ErrnoException).code === "EPERM";
  }
}
|
|
235
|
+
|
|
236
|
+
/** Resolve after roughly `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
239
|
+
|
|
240
|
+
/**
 * Stop a recorder process, escalating through SIGINT, SIGTERM and SIGKILL.
 *
 * Each signal is sent to the process group first (negative pid) and, if that
 * fails, to the process itself. After each signal the function polls every
 * 50 ms for up to `waitMs` and returns as soon as the process is gone.
 * Signalling failures are ignored.
 *
 * @param pid Process id of the recorder.
 * @param waitMs Grace period after each signal, in milliseconds.
 */
async function stopProcessGroup(pid: number, waitMs = 1500) {
  const trySignal = (target: number, signal: NodeJS.Signals): boolean => {
    try {
      process.kill(target, signal);
      return true;
    } catch {
      return false;
    }
  };

  const escalation: NodeJS.Signals[] = ["SIGINT", "SIGTERM", "SIGKILL"];
  for (const signal of escalation) {
    if (!pidAlive(pid)) return;

    if (!trySignal(-pid, signal)) {
      trySignal(pid, signal);
    }

    const giveUpAt = Date.now() + waitMs;
    while (Date.now() < giveUpAt) {
      if (!pidAlive(pid)) return;
      await sleep(50);
    }
  }
}
|
|
261
|
+
|
|
262
|
+
/**
 * Normalize the payload shapes the `ws` library may deliver (Buffer,
 * ArrayBuffer, or an array of Buffers) into a single Buffer.
 * A Buffer input is returned as is, without copying.
 */
function bufferFromWsData(data: WebSocket.RawData): Buffer {
  if (Buffer.isBuffer(data)) return data;
  return Array.isArray(data) ? Buffer.concat(data) : Buffer.from(data as ArrayBufferLike);
}
|
|
268
|
+
|
|
269
|
+
/**
 * Build the 4-byte frame header.
 *
 * Byte 0 is 0x11. Its low nibble is the header size in 4-byte words (read
 * back that way by parseServerFrame); the high nibble is presumably the
 * protocol version. Byte 1 packs message type and flags, byte 2 packs
 * serialization and compression, and byte 3 is zero.
 */
function wsHeader(messageType: number, flags: number, serialization: number, compression: number): Buffer {
  const header = Buffer.alloc(4);
  header[0] = (0b0001 << 4) | 0b0001;
  header[1] = (messageType << 4) | flags;
  header[2] = (serialization << 4) | compression;
  header[3] = 0;
  return header;
}
|
|
277
|
+
|
|
278
|
+
/**
 * Encode the initial "full client request" frame.
 *
 * Layout: 4-byte header, big-endian int32 sequence, big-endian uint32 body
 * length, then the gzip-compressed JSON body.
 *
 * @param sequence Sequence number written after the header.
 * @param payload Request object; serialized with JSON.stringify.
 */
function wsFullClientRequest(sequence: number, payload: unknown): Buffer {
  const json = JSON.stringify(payload);
  const compressed = gzipSync(Buffer.from(json, "utf8"));

  const prefix = Buffer.alloc(8);
  prefix.writeInt32BE(sequence, 0);
  prefix.writeUInt32BE(compressed.length, 4);

  const header = wsHeader(MSG_TYPE_CLIENT_FULL_REQUEST, FLAG_POS_SEQUENCE, SERIALIZATION_JSON, COMPRESSION_GZIP);
  return Buffer.concat([header, prefix, compressed]);
}
|
|
289
|
+
|
|
290
|
+
/**
 * Encode an audio-only frame.
 *
 * Layout: 4-byte header, big-endian int32 sequence, big-endian uint32 body
 * length, then the gzip-compressed audio bytes. The final frame is marked
 * twice: its flags are FLAG_NEG_WITH_SEQUENCE and its sequence is negated.
 *
 * @param sequence Positive sequence number of this frame.
 * @param audio Raw audio bytes (may be empty).
 * @param isLast Whether this is the last audio frame of the request.
 */
function wsAudioRequest(sequence: number, audio: Buffer, isLast: boolean): Buffer {
  const compressed = gzipSync(audio);
  const wireSequence = isLast ? -sequence : sequence;
  const flags = isLast ? FLAG_NEG_WITH_SEQUENCE : FLAG_POS_SEQUENCE;

  const prefix = Buffer.alloc(8);
  prefix.writeInt32BE(wireSequence, 0);
  prefix.writeUInt32BE(compressed.length, 4);

  const header = wsHeader(MSG_TYPE_CLIENT_AUDIO_ONLY_REQUEST, flags, SERIALIZATION_NONE, COMPRESSION_GZIP);
  return Buffer.concat([header, prefix, compressed]);
}
|
|
306
|
+
|
|
307
|
+
/**
 * Decode a frame payload according to its header codes.
 *
 * A non-empty gzip payload is decompressed. The result is parsed as JSON when
 * the serialization is JSON and the bytes are non-empty; otherwise the bytes
 * are returned as a Buffer.
 *
 * @throws When decompression or JSON parsing fails.
 */
function wsDecodePayload(serialization: number, compression: number, payload: Buffer): unknown {
  let bytes = payload;
  if (compression === COMPRESSION_GZIP && payload.length > 0) {
    bytes = gunzipSync(payload);
  }
  const isJson = serialization === SERIALIZATION_JSON && bytes.length > 0;
  return isJson ? JSON.parse(bytes.toString("utf8")) : bytes;
}
|
|
314
|
+
|
|
315
|
+
/**
 * Parse one binary frame received from the ASR server.
 *
 * Layout as read here: a header of `headerSize * 4` bytes, an optional
 * big-endian int32 sequence (when flag bit 0 is set), then a body that
 * depends on the message type:
 *  - full response: uint32 payload size + payload
 *  - error response: int32 error code + uint32 payload size + payload
 * Other message types are returned with a null payload.
 *
 * Every read is bounds-checked, so a truncated or malformed frame fails with
 * a descriptive "Invalid ASR frame" error instead of an opaque RangeError or
 * a silently shortened payload.
 *
 * @param data Raw WebSocket message.
 * @returns The decoded frame.
 * @throws Error for malformed frames, for payloads that cannot be decoded,
 *   and for server error responses (message includes code and detail).
 */
function parseServerFrame(data: WebSocket.RawData): DecodedFrame {
  const msg = bufferFromWsData(data);
  if (msg.length < 4) throw new Error("Invalid ASR frame: header too short");

  const headerSize = msg[0] & 0x0f;
  const messageType = msg[1] >> 4;
  const flags = msg[1] & 0x0f;
  const serialization = msg[2] >> 4;
  const compression = msg[2] & 0x0f;

  let offset = headerSize * 4;
  if (headerSize < 1 || offset > msg.length) {
    throw new Error(`Invalid ASR frame: bad header size ${headerSize}`);
  }

  // Reserve `bytes` at the cursor and return where they start.
  const claim = (bytes: number, what: string): number => {
    if (offset + bytes > msg.length) throw new Error(`Invalid ASR frame: truncated ${what}`);
    const start = offset;
    offset += bytes;
    return start;
  };
  // Read a uint32 length followed by that many payload bytes.
  const readPayload = (): Buffer => {
    const size = msg.readUInt32BE(claim(4, "payload size"));
    const start = claim(size, "payload");
    return msg.subarray(start, start + size);
  };

  const isLast = Boolean(flags & 0b0010);
  let sequence: number | null = null;
  if (flags & 0b0001) {
    sequence = msg.readInt32BE(claim(4, "sequence"));
  }

  if (messageType === MSG_TYPE_SERVER_FULL_RESPONSE) {
    const payload = readPayload();
    return {
      messageType,
      sequence,
      isLast,
      payload: wsDecodePayload(serialization, compression, payload),
    };
  }

  if (messageType === MSG_TYPE_SERVER_ERROR_RESPONSE) {
    const errorCode = msg.readInt32BE(claim(4, "error code"));
    const payload = readPayload();
    let detail: unknown;
    try {
      detail = wsDecodePayload(serialization, compression, payload);
    } catch {
      // Fall back to the raw text so the server's message is not lost.
      detail = payload.toString("utf8");
    }
    throw new Error(`Volcengine ASR protocol error ${errorCode}: ${JSON.stringify(detail)}`);
  }

  return { messageType, sequence, isLast, payload: null };
}
|
|
362
|
+
|
|
363
|
+
/**
 * Pull the transcript out of a decoded response payload.
 *
 * A non-empty `result.text` is returned as is. Otherwise the string `text`
 * fields of `result.utterances` are concatenated and trimmed. Anything else
 * yields an empty string.
 */
function extractText(payload: unknown): string {
  if (!payload || typeof payload !== "object") return "";

  const { result } = payload as { result?: { text?: unknown; utterances?: Array<{ text?: unknown }> } };
  const wholeText = result?.text;
  if (typeof wholeText === "string" && wholeText) return wholeText;

  const utterances = result?.utterances;
  if (!Array.isArray(utterances)) return "";

  let joined = "";
  for (const utterance of utterances) {
    if (typeof utterance.text === "string") joined += utterance.text;
  }
  return joined.trim();
}
|
|
372
|
+
|
|
373
|
+
/**
 * Promise wrapper around `ws.send` for binary frames.
 * Resolves once the send callback reports success and rejects with the
 * callback's error otherwise.
 */
function sendWs(ws: WebSocket, frame: Buffer): Promise<void> {
  return new Promise<void>((done, fail) => {
    const onSent = (error?: Error | null) => {
      if (error) {
        fail(error);
        return;
      }
      done();
    };
    ws.send(frame, { binary: true }, onSent);
  });
}
|
|
381
|
+
|
|
382
|
+
/**
 * Read a recorded WAV file and return its raw PCM samples.
 *
 * Walks the RIFF chunks after the 12-byte header, using the `fmt ` and `data`
 * chunks (the last of each wins) and skipping all others. A data chunk whose
 * declared size runs past the end of the file is cut at the end of the file.
 *
 * @param filePath Path of the WAV file.
 * @returns The PCM bytes and their duration in milliseconds at 16 kHz/16-bit mono.
 * @throws Error when the file is not RIFF/WAVE, lacks a fmt or data chunk,
 *   or is not 16 kHz mono 16-bit PCM.
 */
function parseRecordedWav(filePath: string): { pcm: Buffer; durationMs: number } {
  const wav = readFileSync(filePath);
  const tagAt = (from: number): string => wav.toString("ascii", from, from + 4);

  const looksLikeWav = wav.length >= 44 && tagAt(0) === "RIFF" && tagAt(8) === "WAVE";
  if (!looksLikeWav) {
    throw new Error(`Recording is not a WAV file: ${filePath}`);
  }

  let fmt: { format: number; channels: number; rate: number; bits: number } | null = null;
  let data: Buffer | null = null;

  let cursor = 12;
  while (cursor + 8 <= wav.length) {
    const chunkId = tagAt(cursor);
    const chunkSize = wav.readUInt32LE(cursor + 4);
    const bodyStart = cursor + 8;

    switch (chunkId) {
      case "fmt ":
        fmt = {
          format: wav.readUInt16LE(bodyStart),
          channels: wav.readUInt16LE(bodyStart + 2),
          rate: wav.readUInt32LE(bodyStart + 4),
          bits: wav.readUInt16LE(bodyStart + 14),
        };
        break;
      case "data":
        data = wav.subarray(bodyStart, Math.min(bodyStart + chunkSize, wav.length));
        break;
      default:
        break;
    }

    // Chunks are padded to an even number of bytes.
    cursor = bodyStart + chunkSize + (chunkSize % 2);
  }

  if (!fmt || !data) throw new Error(`Incomplete WAV recording: ${filePath}`);

  // Format 1 is plain PCM; 0xfffe is WAVE_FORMAT_EXTENSIBLE.
  const isPcm = fmt.format === 1 || fmt.format === 0xfffe;
  const isExpectedShape = isPcm && fmt.channels === 1 && fmt.rate === 16000 && fmt.bits === 16;
  if (!isExpectedShape) {
    throw new Error(
      `Expected 16kHz mono 16-bit PCM WAV, got format=${fmt.format} channels=${fmt.channels} rate=${fmt.rate} bits=${fmt.bits}`,
    );
  }

  const bytesPerSecond = 16000 * 2;
  return { pcm: data, durationMs: Math.round((data.length / bytesPerSecond) * 1000) };
}
|
|
422
|
+
|
|
423
|
+
/**
 * Multi-line, user-facing explanation shown when no API key is configured:
 * what is missing, where to put it, and how to verify the fix.
 */
function missingCredentialsMessage(): string {
  const lines = [
    "Missing VOLC_API_KEY for Volcengine ASR.",
    "Create ~/.pi/agent/voice-input.env with:",
    "  VOLC_API_KEY=your_volcengine_speech_api_key",
    "Optional: copy .env.example from this package as a template.",
    "API key settings: https://console.volcengine.com/speech/new/setting/apikeys?projectName=default",
    "Run /voice config to verify whether the key is detected.",
  ];
  return lines.join("\n");
}
|
|
433
|
+
|
|
434
|
+
/**
 * Send recorded PCM audio to the ASR WebSocket endpoint and wait for the
 * final transcript.
 *
 * Flow: open the socket, send one JSON "full client request", send the audio
 * in segments of `config.segmentMs` (the last one flagged as final), then
 * wait for a server frame marked as last.
 *
 * Failure handling:
 *  - A server error frame, early close, socket error, or timeout that arrives
 *    while audio is still being sent aborts the send loop and is reported as
 *    the error, never as an unhandled promise rejection.
 *  - The timer, the listeners, and the socket are released on every exit path.
 *
 * @param pcm Raw 16 kHz mono 16-bit PCM samples.
 * @param durationMs Duration of `pcm`, passed through to the result.
 * @param config Effective configuration.
 * @returns Transcript text plus packet count and timing diagnostics.
 * @throws Error when the API key is missing, the connection fails, the server
 *   reports an error, the socket closes early, or the timeout elapses.
 */
async function transcribePcm(pcm: Buffer, durationMs: number, config: VoiceConfig): Promise<TranscriptionResult> {
  if (!config.apiKey) {
    throw new Error(missingCredentialsMessage());
  }

  const connectId = randomUUID();
  const startedAt = Date.now();
  const ws = new WebSocket(config.wsUrl, {
    headers: {
      "X-Api-Key": config.apiKey,
      "X-Api-Resource-Id": config.resourceId,
      "X-Api-Connect-Id": connectId,
      "X-Api-Request-Id": connectId,
    },
    handshakeTimeout: 15_000,
  });
  // Permanent sink: an "error" event with no listener is thrown by the
  // emitter, so keep one attached after the request-specific listeners are
  // removed. Errors that matter are still routed through the listeners below.
  ws.on("error", () => {});

  const closeSocket = () => {
    try {
      ws.close();
    } catch {
      // ignore: the socket is being discarded anyway
    }
  };

  const openStart = Date.now();
  await new Promise<void>((resolve, reject) => {
    const onOpen = () => {
      ws.off("error", onOpenError);
      resolve();
    };
    const onOpenError = (error: Error) => {
      ws.off("open", onOpen);
      reject(error);
    };
    ws.once("open", onOpen);
    ws.once("error", onOpenError);
  });
  const wsOpenMs = Date.now() - openStart;

  let finalText = "";
  let seenLast = false;
  // Holds the first failure so the send loop can stop early and report it.
  const outcome: { error: Error | null } = { error: null };
  let detach = () => {};

  const completion = new Promise<void>((resolve, reject) => {
    const settle = (error?: Error) => {
      detach();
      if (error) {
        outcome.error ??= error;
        reject(error);
      } else {
        resolve();
      }
    };

    const timer = setTimeout(() => {
      settle(new Error(`ASR timeout after ${config.requestTimeoutMs}ms`));
      closeSocket();
    }, config.requestTimeoutMs);

    const onMessage = (data: WebSocket.RawData) => {
      try {
        const frame = parseServerFrame(data);
        const text = extractText(frame.payload);
        if (text) finalText = text;
        if (frame.isLast) {
          seenLast = true;
          settle();
        }
      } catch (error) {
        settle(error as Error);
      }
    };
    const onError = (error: Error) => settle(error);
    const onClose = (code: number, reason: Buffer) => {
      if (!seenLast) settle(new Error(`ASR WebSocket closed before final response: ${code} ${reason.toString()}`));
    };

    detach = () => {
      clearTimeout(timer);
      ws.off("message", onMessage);
      ws.off("error", onError);
      ws.off("close", onClose);
    };

    ws.on("message", onMessage);
    ws.on("error", onError);
    ws.on("close", onClose);
  });
  // `completion` may reject while we are still sending, before anything awaits
  // it. Mark it handled here; the failure is still surfaced through
  // `outcome.error` and the `await completion` below.
  completion.catch(() => {});

  try {
    const audioPayload: Record<string, unknown> = {
      format: "pcm",
      codec: "raw",
      rate: 16000,
      bits: 16,
      channel: 1,
    };
    if (config.language && config.wsUrl.includes("bigmodel_nostream")) {
      audioPayload.language = config.language;
    }

    const requestPayload: Record<string, unknown> = {
      user: { uid: config.uid || "pi-voice-input" },
      audio: audioPayload,
      request: {
        model_name: "bigmodel",
        enable_itn: config.enableItn,
        enable_punc: config.enablePunc,
        enable_ddc: config.enableDdc,
        show_utterances: config.showUtterances,
        result_type: "full",
        ...(config.prompt ? { corpus: { context: config.prompt } } : {}),
      },
    };

    const sendStart = Date.now();
    let sequence = 1;
    let packets = 0;
    try {
      await sendWs(ws, wsFullClientRequest(sequence, requestPayload));
      sequence += 1;

      // 16000 samples/s * 2 bytes per sample.
      const segmentSize = Math.max(1, Math.floor((16000 * 2 * config.segmentMs) / 1000));
      if (pcm.length === 0) {
        // Still send one (empty) final frame so the server can finish.
        await sendWs(ws, wsAudioRequest(sequence, Buffer.alloc(0), true));
        packets = 1;
      } else {
        for (let offset = 0; offset < pcm.length; offset += segmentSize) {
          // Stop uploading as soon as the request is known to have failed.
          if (outcome.error) throw outcome.error;
          const isLast = offset + segmentSize >= pcm.length;
          await sendWs(ws, wsAudioRequest(sequence, pcm.subarray(offset, offset + segmentSize), isLast));
          packets += 1;
          if (!isLast) sequence += 1;
        }
      }
    } catch (error) {
      // A send failure is usually a symptom; prefer the server-side cause.
      throw outcome.error ?? error;
    }
    const sendMs = Date.now() - sendStart;

    const waitStart = Date.now();
    await completion;
    const waitMs = Date.now() - waitStart;

    return {
      text: finalText,
      durationMs,
      packets,
      timings: {
        wsOpenMs,
        sendMs,
        waitMs,
        totalMs: Date.now() - startedAt,
      },
    };
  } finally {
    detach();
    closeSocket();
  }
}
|
|
580
|
+
|
|
581
|
+
/**
 * Appends recognized speech to the editor buffer without submitting it.
 *
 * The text is trimmed first; if nothing remains the editor is left untouched.
 * A newline is inserted between the existing content and the new text only
 * when the editor holds non-whitespace content that does not already end
 * with a newline.
 *
 * @param ctx  Extension context whose `ui` exposes `getEditorText` / `setEditorText`.
 * @param text Raw transcription text.
 */
function appendToEditor(ctx: ExtensionContext, text: string) {
  const addition = text.trim();
  if (addition.length === 0) {
    return;
  }

  const existing = ctx.ui.getEditorText();
  const needsLineBreak = existing.trim() !== "" && !existing.endsWith("\n");
  const pieces = needsLineBreak ? [existing, "\n", addition] : [existing, addition];
  ctx.ui.setEditorText(pieces.join(""));
}
|
|
588
|
+
|
|
589
|
+
/**
 * Reports whether a recording is currently in progress.
 *
 * A recording counts as active only when a persisted state exists and
 * `pidAlive` reports its pid as alive.
 *
 * @param config Voice configuration passed through to `readState`.
 * @returns `true` when a state was read and its pid is alive, otherwise `false`.
 */
async function isRecording(config: VoiceConfig): Promise<boolean> {
  const saved = readState(config);
  if (!saved) {
    return false;
  }
  return Boolean(pidAlive(saved.pid));
}
|
|
593
|
+
|
|
594
|
+
/**
 * Starts a detached recorder process and persists its state so a later call
 * can stop it.
 *
 * If the persisted state refers to a pid that is still alive, nothing is
 * started and the user is warned instead. A state whose pid is no longer
 * alive is cleared before a new recording begins.
 *
 * @param ctx Extension context used for status-line updates and notifications.
 * @throws Error when the recorder process could not be started; the message
 *         includes the spawn failure reason when Node reported one.
 */
async function startRecording(ctx: ExtensionContext) {
  const config = getConfig();
  const existing = readState(config);
  if (existing && pidAlive(existing.pid)) {
    ctx.ui.notify(`Already recording: pid=${existing.pid}`, "warning");
    ctx.ui.setStatus("voice-input", ctx.ui.theme.fg("error", "● recording"));
    return;
  }
  // State left behind by a recorder that is no longer running.
  if (existing) clearState(config);

  ensureDir(config.recordingsDir);
  ensureDir(config.logDir);
  // Use a single timestamp so the recording and its log always share a basename.
  const stamp = timestampForFilename();
  const outputPath = path.join(config.recordingsDir, `recording-${stamp}.wav`);
  const logPath = path.join(config.logDir, `recording-${stamp}.log`);
  const cmd = recorderCommand(config, outputPath);

  ctx.ui.setStatus("voice-input", ctx.ui.theme.fg("warning", "● starting mic"));
  const logFd = openSync(logPath, "a");
  const failure: { error?: Error } = {};
  let child: ReturnType<typeof spawn>;
  try {
    child = spawn(cmd[0], cmd.slice(1), {
      detached: true,
      stdio: ["ignore", logFd, logFd],
    });
    // Without a listener, a spawn failure (e.g. recorder binary not installed)
    // surfaces as an unhandled "error" event and can take down the host process.
    child.on("error", (error: Error) => {
      failure.error = error;
    });
    child.unref();
  } finally {
    // The child holds its own copy of the descriptor; always release ours,
    // even when spawn throws synchronously.
    closeSync(logFd);
  }

  if (!child.pid) {
    // Spawn failures are reported asynchronously; yield once so the reason
    // can be included in the error shown to the user.
    await new Promise<void>((resolve) => setImmediate(resolve));
    const reason = failure.error ? failure.error.message : "no pid returned";
    throw new Error(`Recorder failed to start: ${reason}`);
  }

  writeState(config, {
    pid: child.pid,
    path: outputPath,
    logPath,
    startedAt: new Date().toISOString(),
    recorderTarget: config.recorderTarget || undefined,
  });

  ctx.ui.setStatus("voice-input", ctx.ui.theme.fg("error", "● recording"));
  // Name the configured shortcut rather than assuming the default binding.
  ctx.ui.notify(`Voice recording started. Press ${config.shortcut} again to stop/transcribe.`, "info");
}
|
|
631
|
+
|
|
632
|
+
/**
 * Stops the active recording and, unless cancelled, transcribes it and
 * inserts the recognized text into the editor.
 *
 * Steps, in order: read the persisted state, stop the recorder process group
 * if its pid is alive, clear the state, optionally wait
 * `config.finalizeDelayMs`, verify the recording file exists and is
 * non-empty, then either report the saved path (`transcribe === false`) or
 * decode and transcribe the audio.
 *
 * @param ctx        Extension context used for status, notifications and editor access.
 * @param transcribe When `false`, the recording is only stopped, not transcribed.
 * @throws Error when the recording file is missing or empty (the recorder log
 *         content is appended to the message when the log file exists).
 */
async function stopRecording(ctx: ExtensionContext, transcribe = true) {
  const STATUS_KEY = "voice-input";
  const config = getConfig();
  const session = readState(config);
  if (!session) {
    ctx.ui.setStatus(STATUS_KEY, undefined);
    ctx.ui.notify("Not recording.", "warning");
    return;
  }

  const busyLabel = transcribe ? "● transcribing" : "● stopping";
  ctx.ui.setStatus(STATUS_KEY, ctx.ui.theme.fg("warning", busyLabel));
  if (pidAlive(session.pid)) {
    await stopProcessGroup(session.pid);
  }
  clearState(config);
  if (config.finalizeDelayMs > 0) {
    await sleep(config.finalizeDelayMs);
  }

  const wavPath = session.path;
  const hasAudio = existsSync(wavPath) && statSync(wavPath).size !== 0;
  if (!hasAudio) {
    let log = "";
    if (existsSync(session.logPath)) {
      log = readFileSync(session.logPath, "utf8");
    }
    throw new Error(`Recording file missing/empty: ${wavPath}\nRecorder log:\n${log}`);
  }

  if (!transcribe) {
    ctx.ui.setStatus(STATUS_KEY, undefined);
    ctx.ui.notify(`Voice recording stopped: ${wavPath}`, "info");
    return;
  }

  // Time the WAV decode separately from the ASR round trip.
  const decodeStart = Date.now();
  const { pcm, durationMs } = parseRecordedWav(wavPath);
  const decodeMs = Date.now() - decodeStart;

  const result = await transcribePcm(pcm, durationMs, config);
  ctx.ui.setStatus(STATUS_KEY, undefined);

  const audioSeconds = (durationMs / 1000).toFixed(2);
  const asrMs = result.timings.totalMs;
  if (result.text.trim().length === 0) {
    ctx.ui.notify(
      `Transcription finished but no text was returned. audio=${audioSeconds}s total=${asrMs}ms`,
      "warning",
    );
    return;
  }

  appendToEditor(ctx, result.text);
  ctx.ui.notify(
    `Voice text inserted. audio=${audioSeconds}s decode=${decodeMs}ms asr=${asrMs}ms packets=${result.packets}`,
    "info",
  );
}
|
|
677
|
+
|
|
678
|
+
/**
 * Flips the recording state: stops and transcribes when a recording is
 * active, otherwise starts a new one.
 *
 * When `ctx.hasUI` is falsy, only an error notification is emitted and
 * nothing else happens.
 *
 * @param ctx Extension context of the invoking shortcut or command.
 */
async function toggleRecording(ctx: ExtensionContext) {
  if (!ctx.hasUI) {
    ctx.ui.notify("voice input requires interactive pi UI", "error");
    return;
  }

  const active = await isRecording(getConfig());
  if (active) {
    await stopRecording(ctx, true);
    return;
  }
  await startRecording(ctx);
}
|
|
687
|
+
|
|
688
|
+
/**
 * Renders the effective configuration as a multi-line, human-readable summary.
 *
 * The API key itself is never printed; only whether it is set. An empty
 * language is shown as "auto" and an empty recorder target as
 * "PipeWire/default".
 *
 * @param config Configuration to describe.
 * @returns Newline-joined summary text.
 */
function configSummary(config: VoiceConfig): string {
  const rows: Array<[string, unknown]> = [
    ["api key", config.apiKey ? "set" : "missing"],
    ["ws url", config.wsUrl],
    ["resource id", config.resourceId],
    ["language", config.language || "auto"],
    ["recorder target", config.recorderTarget || "PipeWire/default"],
    ["segment", `${config.segmentMs}ms`],
    ["recordings", config.recordingsDir],
    ["state", config.statePath],
    ["shortcut", config.shortcut],
  ];

  const lines = ["Voice input config:"];
  for (const [label, value] of rows) {
    lines.push(`- ${label}: ${value}`);
  }
  lines.push(
    "Config files checked: ~/.pi/agent/voice-input.env, package .env, current .env; shell env overrides them.",
  );
  return lines.join("\n");
}
|
|
703
|
+
|
|
704
|
+
/**
 * Extension entry point: wires voice input into pi.
 *
 * Registers the toggle shortcut (read from the configuration once, at load
 * time), the `/voice` command with its sub-actions, and a `session_start`
 * notification. Errors raised by any handler clear the "voice-input" status
 * entry and are reported through a notification.
 *
 * @param pi Extension API used to register the shortcut, command and event listener.
 */
export default function (pi: ExtensionAPI) {
  const startupConfig = getConfig();
  const messageOf = (error: unknown) => (error instanceof Error ? error.message : String(error));

  pi.registerShortcut(startupConfig.shortcut as ReturnType<typeof Key.ctrlShift>, {
    description: "Toggle voice recording and insert transcription into editor",
    handler: async (ctx) => {
      try {
        await toggleRecording(ctx);
      } catch (error) {
        ctx.ui.setStatus("voice-input", undefined);
        ctx.ui.notify(`Voice input error: ${messageOf(error)}`, "error");
      }
    },
  });

  pi.registerCommand("voice", {
    description: "Voice input: start | stop | status | toggle | cancel | config",
    handler: async (args, ctx) => {
      // A missing argument means "toggle"; a whitespace-only one trims to "".
      const action = (args || "toggle").trim().toLowerCase();
      try {
        switch (action) {
          case "start":
            await startRecording(ctx);
            break;
          case "stop":
            await stopRecording(ctx, true);
            break;
          case "cancel":
            await stopRecording(ctx, false);
            break;
          case "status": {
            const state = readState(getConfig());
            const recording = Boolean(state && pidAlive(state.pid));
            ctx.ui.notify(JSON.stringify({ recording, state }, null, 2), "info");
            break;
          }
          case "config":
            ctx.ui.notify(configSummary(getConfig()), "info");
            break;
          case "toggle":
          case "":
            await toggleRecording(ctx);
            break;
          default:
            ctx.ui.notify("Usage: /voice start | stop | status | toggle | cancel | config", "error");
        }
      } catch (error) {
        ctx.ui.setStatus("voice-input", undefined);
        ctx.ui.notify(`Voice command error: ${messageOf(error)}`, "error");
      }
    },
  });

  pi.on("session_start", (_event, ctx) => {
    ctx.ui.notify(`Voice input loaded: ${startupConfig.shortcut} toggles recording.`, "info");
  });
}
|
package/package.json
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "pi-voice-input",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "provider-extensible voice input extension for pi",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"keywords": [
|
|
7
|
+
"pi-package",
|
|
8
|
+
"pi-extension",
|
|
9
|
+
"voice-input",
|
|
10
|
+
"speech-to-text",
|
|
11
|
+
"dictation",
|
|
12
|
+
"asr"
|
|
13
|
+
],
|
|
14
|
+
"license": "MIT",
|
|
15
|
+
"repository": {
|
|
16
|
+
"type": "git",
|
|
17
|
+
"url": "git+ssh://git@github.com/tr-nc/pi-voice-input.git"
|
|
18
|
+
},
|
|
19
|
+
"bugs": {
|
|
20
|
+
"url": "https://github.com/tr-nc/pi-voice-input/issues"
|
|
21
|
+
},
|
|
22
|
+
"homepage": "https://github.com/tr-nc/pi-voice-input#readme",
|
|
23
|
+
"files": [
|
|
24
|
+
"extensions",
|
|
25
|
+
".env.example",
|
|
26
|
+
"README.md"
|
|
27
|
+
],
|
|
28
|
+
"pi": {
|
|
29
|
+
"extensions": [
|
|
30
|
+
"extensions"
|
|
31
|
+
]
|
|
32
|
+
},
|
|
33
|
+
"dependencies": {
|
|
34
|
+
"ws": "^8.20.1"
|
|
35
|
+
},
|
|
36
|
+
"devDependencies": {
|
|
37
|
+
"@earendil-works/pi-coding-agent": "*",
|
|
38
|
+
"@earendil-works/pi-tui": "*",
|
|
39
|
+
"@types/node": "^25.8.0",
|
|
40
|
+
"@types/ws": "^8.18.1",
|
|
41
|
+
"typescript": "^6.0.3"
|
|
42
|
+
},
|
|
43
|
+
"peerDependencies": {
|
|
44
|
+
"@earendil-works/pi-coding-agent": "*",
|
|
45
|
+
"@earendil-works/pi-tui": "*"
|
|
46
|
+
},
|
|
47
|
+
"peerDependenciesMeta": {
|
|
48
|
+
"@earendil-works/pi-coding-agent": {
|
|
49
|
+
"optional": true
|
|
50
|
+
},
|
|
51
|
+
"@earendil-works/pi-tui": {
|
|
52
|
+
"optional": true
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
}
|