pi-voice-input 0.2.7 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -9
- package/extensions/voice-input.ts +256 -60
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -10,13 +10,13 @@ A publishable, pure TypeScript [pi](https://pi.dev/) extension for Linux and mac
|
|
|
10
10
|
Current scope:
|
|
11
11
|
|
|
12
12
|
- Linux uses `pw-record` from PipeWire tools or `arecord` from alsa-utils.
|
|
13
|
-
- macOS uses
|
|
13
|
+
- macOS uses `afrecord` when present, otherwise `ffmpeg` with AVFoundation.
|
|
14
14
|
- A VolcEngine Speech API key is required.
|
|
15
15
|
- This is not a local/offline ASR engine.
|
|
16
16
|
|
|
17
17
|
The provider layer is intended to be extensible. **Current version supports only VolcEngine WebSocket ASR.**
|
|
18
18
|
|
|
19
|
-
No Python, `uv`, upload service
|
|
19
|
+
No Python, `uv`, or upload service is required for normal shortcut usage. On macOS systems without `afrecord`, install `ffmpeg` for recording.
|
|
20
20
|
|
|
21
21
|
## Architecture
|
|
22
22
|
|
|
@@ -26,8 +26,8 @@ pi extension: extensions/voice-input.ts
|
|
|
26
26
|
├─ starts/stops a local recorder process
|
|
27
27
|
│ ├─ Linux preferred: pw-record
|
|
28
28
|
│ ├─ Linux fallback: arecord
|
|
29
|
-
│ └─ macOS: afrecord
|
|
30
|
-
├─ records 16 kHz mono 16-bit WAV
|
|
29
|
+
│ └─ macOS: afrecord, or ffmpeg/AVFoundation fallback
|
|
30
|
+
├─ records a temporary 16 kHz mono 16-bit WAV
|
|
31
31
|
├─ parses the WAV container in TypeScript and extracts raw PCM
|
|
32
32
|
├─ sends PCM frames to the configured ASR provider via ws
|
|
33
33
|
│ └─ current provider: VolcEngine /api/v3/sauc/bigmodel_nostream
|
|
@@ -44,9 +44,9 @@ System dependency, one of:
|
|
|
44
44
|
|
|
45
45
|
- Linux: `pw-record` from PipeWire tools, preferred
|
|
46
46
|
- Linux: `arecord` from alsa-utils, fallback
|
|
47
|
-
- macOS: `afrecord
|
|
47
|
+
- macOS: `afrecord` when present, or `ffmpeg` from Homebrew (`brew install ffmpeg`) as the AVFoundation fallback
|
|
48
48
|
|
|
49
|
-
On macOS, grant Terminal or your pi host app microphone permission when prompted. If macOS has previously denied microphone access, enable it in System Settings → Privacy & Security → Microphone.
|
|
49
|
+
On macOS, grant Terminal, ffmpeg, or your pi host app microphone permission when prompted. If macOS has previously denied microphone access, enable it in System Settings → Privacy & Security → Microphone.
|
|
50
50
|
|
|
51
51
|
## Install / Update
|
|
52
52
|
|
|
@@ -134,7 +134,7 @@ Slash commands:
|
|
|
134
134
|
/voice start # start recording
|
|
135
135
|
/voice stop # stop, transcribe, insert text
|
|
136
136
|
/voice toggle # start if idle, stop if recording
|
|
137
|
-
/voice cancel # stop recording without transcribing
|
|
137
|
+
/voice cancel # stop recording and discard local audio without transcribing
|
|
138
138
|
/voice status # show recorder state
|
|
139
139
|
/voice config # show effective non-secret config and whether API key is detected
|
|
140
140
|
/voice init # create or normalize ~/.pi/agent/voice-input.config.json
|
|
@@ -144,10 +144,12 @@ Slash commands:
|
|
|
144
144
|
|
|
145
145
|
## Notes
|
|
146
146
|
|
|
147
|
-
- The extension uses post-recording WebSocket ASR: it records locally
|
|
147
|
+
- The extension uses post-recording WebSocket ASR: it records locally to a per-run temporary WAV, sends the stopped recording in chunks, then deletes the temporary audio. It is optimized for fast voice input, not live subtitles.
|
|
148
148
|
- The default ASR segment size is intentionally larger than realtime packet sizes because this workflow sends already-recorded audio.
|
|
149
149
|
- The transcript is inserted into the editor only; it is not submitted automatically.
|
|
150
|
-
-
|
|
150
|
+
- Recorder stdout/stderr is not logged to disk, to avoid retaining potentially sensitive runtime data.
|
|
151
|
+
- On startup, legacy `~/.pi/agent/voice-input/recordings` and `~/.pi/agent/voice-input/logs` artifacts are cleaned up when they are not part of an active recording.
|
|
152
|
+
- When `polishModel` is set, polishing uses the unsent editor draft and recent session messages as context, but outputs only the refined voice text to insert at the current cursor. The polish model must not reconstruct the full draft; the final text is pasted without replacing existing editor content.
|
|
151
153
|
- While recording, the status line shows `● Mic on: [device name] — press Ctrl+Shift+R again to stop/transcribe` in the current theme accent color; no separate popup is shown when recording starts.
|
|
152
154
|
|
|
153
155
|
## Development
|
|
@@ -5,16 +5,17 @@ import { spawn, spawnSync } from "node:child_process";
|
|
|
5
5
|
import { randomUUID } from "node:crypto";
|
|
6
6
|
import {
|
|
7
7
|
chmodSync,
|
|
8
|
-
closeSync,
|
|
9
8
|
existsSync,
|
|
10
9
|
mkdirSync,
|
|
11
|
-
|
|
10
|
+
mkdtempSync,
|
|
12
11
|
readFileSync,
|
|
12
|
+
readdirSync,
|
|
13
|
+
rmdirSync,
|
|
13
14
|
statSync,
|
|
14
15
|
unlinkSync,
|
|
15
16
|
writeFileSync,
|
|
16
17
|
} from "node:fs";
|
|
17
|
-
import { homedir, platform } from "node:os";
|
|
18
|
+
import { homedir, platform, tmpdir } from "node:os";
|
|
18
19
|
import path from "node:path";
|
|
19
20
|
import { gzipSync, gunzipSync } from "node:zlib";
|
|
20
21
|
import WebSocket from "ws";
|
|
@@ -23,18 +24,25 @@ const CONFIG_PATH = path.join(homedir(), ".pi", "agent", "voice-input.config.jso
|
|
|
23
24
|
const VOLC_API_KEY_URL = "https://console.volcengine.com/speech/new/setting/apikeys?projectName=default";
|
|
24
25
|
const DEFAULT_SHORTCUT = Key.ctrlShift("r");
|
|
25
26
|
const DEFAULT_POSTPROCESS_MODEL = "";
|
|
26
|
-
const POSTPROCESS_SYSTEM_PROMPT =
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
-
|
|
30
|
-
-
|
|
31
|
-
-
|
|
32
|
-
-
|
|
33
|
-
-
|
|
34
|
-
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
-
|
|
27
|
+
const POSTPROCESS_SYSTEM_PROMPT = `You are the speech-recognition postprocessor for the pi voice input extension. Your only job is to polish the raw ASR text into text that the plugin can paste verbatim at the current cursor position in the pi editor.
|
|
28
|
+
|
|
29
|
+
Interaction contract:
|
|
30
|
+
- The plugin does not replace editor content with your output. It only pastes/inserts your output at the user's current cursor position.
|
|
31
|
+
- The current editor draft and recent conversation are context only. Use them to understand omitted references, the current task, file/project names, and intent. They are not text for you to rewrite and output as a whole.
|
|
32
|
+
- Do not output the draft, a context sentence, or a full sentence/paragraph that represents the draft after insertion. Doing so would duplicate existing editor content.
|
|
33
|
+
- You may not know the real cursor position. Do not guess the cursor location and synthesize a full surrounding sentence; the editor owns the real insertion point.
|
|
34
|
+
- If the raw speech is adding a few words, half a sentence, a phrase, a condition, or a modifier, output only those newly spoken words. Let the paste operation merge them with the existing draft.
|
|
35
|
+
- Only when the raw speech itself explicitly dictates a complete passage to insert may you output that complete passage. Even then, do not add draft text that the user did not speak.
|
|
36
|
+
|
|
37
|
+
Rules:
|
|
38
|
+
- Output only the polished insertion text. Do not output explanations, headings, prefixes, suffixes, quotes, code fences, or greetings.
|
|
39
|
+
- Never answer, execute, or solve anything asked in the user's speech. If the raw speech is a question, only clean up the question text itself; do not provide an answer, plan, code, or conclusion.
|
|
40
|
+
- Preserve the user's information faithfully. Do not over-summarize or compress. Do not delete constraints, examples, numbers, filenames, errors, multiple requests, ordering, or emphasis.
|
|
41
|
+
- Correct obvious ASR mistakes, homophones, segmentation, and punctuation. Preserve code identifiers, commands, paths, URLs, model names, package names, and proper nouns.
|
|
42
|
+
- If the user self-corrects, keep only the corrected intent and remove the false start, correction process, filler, and chatter. Do not lose any other substantive information.
|
|
43
|
+
- Make the output complete relative to the raw speech, logically clear, and actionable. Split into items or steps when helpful, but do not drop raw-speech information or repeat existing draft text.
|
|
44
|
+
- Do not invent requirements that the raw speech did not express. If uncertain, keep the original meaning and express it more clearly.
|
|
45
|
+
- The output language must match the primary language of the raw speech, not the context language and not this English prompt. Do not translate just because the instructions are in English.`;
|
|
38
46
|
|
|
39
47
|
const MSG_TYPE_CLIENT_FULL_REQUEST = 0b0001;
|
|
40
48
|
const MSG_TYPE_CLIENT_AUDIO_ONLY_REQUEST = 0b0010;
|
|
@@ -65,9 +73,7 @@ type VoiceConfig = {
|
|
|
65
73
|
requestTimeoutMs: number;
|
|
66
74
|
finalizeDelayMs: number;
|
|
67
75
|
recorderTarget: string;
|
|
68
|
-
recordingsDir: string;
|
|
69
76
|
statePath: string;
|
|
70
|
-
logDir: string;
|
|
71
77
|
shortcut: string;
|
|
72
78
|
enableItn: boolean;
|
|
73
79
|
enablePunc: boolean;
|
|
@@ -83,7 +89,7 @@ type VoiceConfig = {
|
|
|
83
89
|
type RecordingState = {
|
|
84
90
|
pid: number;
|
|
85
91
|
path: string;
|
|
86
|
-
logPath
|
|
92
|
+
logPath?: string;
|
|
87
93
|
startedAt: string;
|
|
88
94
|
recorderTarget?: string;
|
|
89
95
|
deviceName?: string;
|
|
@@ -169,9 +175,7 @@ function getConfig(): VoiceConfig {
|
|
|
169
175
|
requestTimeoutMs: 90000,
|
|
170
176
|
finalizeDelayMs: 100,
|
|
171
177
|
recorderTarget: "",
|
|
172
|
-
recordingsDir: path.join(voiceHome, "recordings"),
|
|
173
178
|
statePath: path.join(voiceHome, "recording.json"),
|
|
174
|
-
logDir: path.join(voiceHome, "logs"),
|
|
175
179
|
shortcut: DEFAULT_SHORTCUT,
|
|
176
180
|
enableItn: true,
|
|
177
181
|
enablePunc: true,
|
|
@@ -213,7 +217,10 @@ function commandOutput(command: string, args: string[], timeoutMs = 1500): strin
|
|
|
213
217
|
}
|
|
214
218
|
|
|
215
219
|
function selectRecorderExecutable(): string {
|
|
216
|
-
if (platform() === "darwin"
|
|
220
|
+
if (platform() === "darwin") {
|
|
221
|
+
if (commandExists("afrecord")) return "afrecord";
|
|
222
|
+
if (commandExists("ffmpeg")) return "ffmpeg";
|
|
223
|
+
}
|
|
217
224
|
if (commandExists("pw-record")) return "pw-record";
|
|
218
225
|
if (commandExists("arecord")) return "arecord";
|
|
219
226
|
return "";
|
|
@@ -233,7 +240,31 @@ function recorderCommand(config: VoiceConfig, outputPath: string): string[] {
|
|
|
233
240
|
if (executable === "afrecord") {
|
|
234
241
|
return ["afrecord", "-f", "WAVE", "-d", "LEI16@16000", "-c", "1", outputPath];
|
|
235
242
|
}
|
|
236
|
-
|
|
243
|
+
if (executable === "ffmpeg" && platform() === "darwin") {
|
|
244
|
+
return [
|
|
245
|
+
"ffmpeg",
|
|
246
|
+
"-hide_banner",
|
|
247
|
+
"-loglevel",
|
|
248
|
+
"error",
|
|
249
|
+
"-nostdin",
|
|
250
|
+
"-y",
|
|
251
|
+
"-f",
|
|
252
|
+
"avfoundation",
|
|
253
|
+
"-i",
|
|
254
|
+
config.recorderTarget || "none:default",
|
|
255
|
+
"-vn",
|
|
256
|
+
"-acodec",
|
|
257
|
+
"pcm_s16le",
|
|
258
|
+
"-ar",
|
|
259
|
+
"16000",
|
|
260
|
+
"-ac",
|
|
261
|
+
"1",
|
|
262
|
+
"-f",
|
|
263
|
+
"wav",
|
|
264
|
+
outputPath,
|
|
265
|
+
];
|
|
266
|
+
}
|
|
267
|
+
throw new Error("No recorder found. On Linux, install PipeWire tools (pw-record) or alsa-utils (arecord). On macOS, install ffmpeg (brew install ffmpeg) if afrecord is not available.");
|
|
237
268
|
}
|
|
238
269
|
|
|
239
270
|
type PipeWireSource = {
|
|
@@ -322,6 +353,7 @@ function recordingDeviceName(config: VoiceConfig, recorderExecutable: string): s
|
|
|
322
353
|
if (recorderExecutable === "pw-record") return pipeWireSourceName(config.recorderTarget);
|
|
323
354
|
if (recorderExecutable === "arecord") return "ALSA default microphone";
|
|
324
355
|
if (recorderExecutable === "afrecord") return "macOS default microphone";
|
|
356
|
+
if (recorderExecutable === "ffmpeg" && platform() === "darwin") return "macOS default microphone (ffmpeg/AVFoundation)";
|
|
325
357
|
return config.recorderTarget || "default microphone";
|
|
326
358
|
}
|
|
327
359
|
|
|
@@ -347,6 +379,85 @@ function clearState(config: VoiceConfig) {
|
|
|
347
379
|
}
|
|
348
380
|
}
|
|
349
381
|
|
|
382
|
+
/**
 * Allocates a fresh per-run temporary directory and returns the WAV path to record into.
 *
 * The directory is created under the OS temp dir with the `pi-voice-input-` prefix and its
 * mode is set to 0o700. Only the directory is created; the WAV file itself is not.
 *
 * @returns Path of the recording file inside the new directory.
 */
function createRecordingPath(): string {
  const prefix = path.join(tmpdir(), "pi-voice-input-");
  const privateDir = mkdtempSync(prefix);
  chmodSync(privateDir, 0o700);

  const fileName = `recording-${timestampForFilename()}.wav`;
  return path.join(privateDir, fileName);
}
|
|
387
|
+
|
|
388
|
+
/**
 * Best-effort removal of a single file.
 *
 * @param filePath - File to unlink; an empty or missing value is a no-op.
 * @returns `null` when the file was removed, was already absent (ENOENT), or no path was
 *   given; otherwise a human-readable warning describing why the unlink failed.
 */
function deleteFileIfExists(filePath?: string): string | null {
  let warning: string | null = null;

  if (filePath) {
    try {
      unlinkSync(filePath);
    } catch (cause) {
      // A file that is already gone is the desired end state, not a failure.
      const alreadyGone = (cause as NodeJS.ErrnoException).code === "ENOENT";
      if (!alreadyGone) {
        warning = `failed to delete ${filePath}: ${cause instanceof Error ? cause.message : String(cause)}`;
      }
    }
  }

  return warning;
}
|
|
398
|
+
|
|
399
|
+
/**
 * Removes the per-run temporary directory that holds a recording, if it is one of ours.
 *
 * The directory is only touched when it sits directly under the OS temp dir and its name
 * starts with `pi-voice-input-`; any other location is left alone.
 *
 * @param filePath - Path of the recording file whose parent directory should be removed.
 * @returns `null` when nothing needed doing or the directory was removed (or was already
 *   missing); otherwise a warning string describing the failure.
 */
function deleteTemporaryRecordingDir(filePath: string): string | null {
  const dir = path.dirname(filePath);
  const underTmp = path.resolve(path.dirname(dir)) === path.resolve(tmpdir());
  const ownedByExtension = underTmp && path.basename(dir).startsWith("pi-voice-input-");
  if (!ownedByExtension) return null;

  try {
    rmdirSync(dir);
  } catch (cause) {
    if ((cause as NodeJS.ErrnoException).code !== "ENOENT") {
      return `failed to remove temporary directory ${dir}: ${cause instanceof Error ? cause.message : String(cause)}`;
    }
  }
  return null;
}
|
|
415
|
+
|
|
416
|
+
/**
 * Deletes everything a recording left on disk and reports what could not be removed.
 *
 * Steps run in order: the audio file, the optional log file, then the temporary directory
 * derived from the audio file's path.
 *
 * @param state - Recording paths to clean; `logPath` may be absent.
 * @returns Warning messages from the steps that failed (empty when all succeeded).
 */
function cleanupRecordingArtifacts(state: Pick<RecordingState, "path" | "logPath">): string[] {
  const steps: Array<() => string | null> = [
    () => deleteFileIfExists(state.path),
    () => deleteFileIfExists(state.logPath),
    () => deleteTemporaryRecordingDir(state.path),
  ];

  const warnings: string[] = [];
  for (const runStep of steps) {
    const warning = runStep();
    if (warning) warnings.push(warning);
  }
  return warnings;
}
|
|
421
|
+
|
|
422
|
+
/**
 * Best-effort cleanup of one legacy artifact directory.
 *
 * Deletes the regular files in `dir` whose names match `filePattern`, skipping any whose
 * resolved path is listed in `protectedPaths`, then tries to remove `dir` itself. Problems
 * are reported as warning strings rather than thrown, so a failed cleanup cannot abort the
 * caller.
 *
 * @param dir - Directory to clean; a missing directory is a no-op.
 * @param filePattern - Tested against each entry's file name.
 * @param protectedPaths - Resolved absolute paths that must not be deleted.
 * @returns Warning messages for anything that could not be listed or removed.
 */
function cleanupLegacyDirectory(dir: string, filePattern: RegExp, protectedPaths: Set<string>): string[] {
  if (!existsSync(dir)) return [];
  const warnings: string[] = [];

  // Listing can fail even though the path exists (it is a regular file, it is unreadable,
  // or it vanished in between); surface that as a warning instead of throwing.
  let candidateNames: string[];
  try {
    candidateNames = readdirSync(dir, { withFileTypes: true })
      .filter((entry) => entry.isFile() && filePattern.test(entry.name))
      .map((entry) => entry.name);
  } catch (error) {
    if ((error as NodeJS.ErrnoException).code === "ENOENT") return [];
    return [`failed to read legacy directory ${dir}: ${error instanceof Error ? error.message : String(error)}`];
  }

  for (const name of candidateNames) {
    const filePath = path.join(dir, name);
    if (protectedPaths.has(path.resolve(filePath))) continue;
    const warning = deleteFileIfExists(filePath);
    if (warning) warnings.push(warning);
  }

  try {
    rmdirSync(dir);
  } catch (error) {
    const code = (error as NodeJS.ErrnoException).code;
    // A directory that is already gone or still holds other files is not worth a warning.
    if (code !== "ENOENT" && code !== "ENOTEMPTY") {
      warnings.push(`failed to remove legacy directory ${dir}: ${error instanceof Error ? error.message : String(error)}`);
    }
  }

  return warnings;
}
|
|
445
|
+
|
|
446
|
+
/**
 * Removes recordings and logs left in the legacy `recordings` and `logs` directories that
 * sit next to the state file.
 *
 * Files referenced by the saved recording state are protected while that recorder's pid is
 * still alive.
 *
 * @param config - Effective configuration; only `statePath` is read directly here.
 * @returns Warning messages gathered from both legacy directories, recordings first.
 */
function cleanupLegacyStoredArtifacts(config: VoiceConfig): string[] {
  const protectedPaths = new Set<string>();
  const saved = readState(config);
  if (saved && pidAlive(saved.pid)) {
    protectedPaths.add(path.resolve(saved.path));
    if (saved.logPath) protectedPaths.add(path.resolve(saved.logPath));
  }

  const voiceHome = path.dirname(config.statePath);
  const legacyTargets: Array<[string, RegExp]> = [
    ["recordings", /^recording-.*\.wav$/],
    ["logs", /^recording-.*\.log$/],
  ];

  const warnings: string[] = [];
  for (const [folder, pattern] of legacyTargets) {
    warnings.push(...cleanupLegacyDirectory(path.join(voiceHome, folder), pattern, protectedPaths));
  }
  return warnings;
}
|
|
460
|
+
|
|
350
461
|
function pidAlive(pid: number): boolean {
|
|
351
462
|
try {
|
|
352
463
|
process.kill(pid, 0);
|
|
@@ -820,30 +931,67 @@ function cleanPostprocessOutput(output: string): string {
|
|
|
820
931
|
let text = output.trim();
|
|
821
932
|
const fence = text.match(/^```[a-zA-Z0-9_-]*\s*\n([\s\S]*?)\n```$/);
|
|
822
933
|
if (fence) text = fence[1].trim();
|
|
823
|
-
text = text.replace(/^(
|
|
934
|
+
text = text.replace(/^(?:polished(?: user)? instruction|refined(?: user)? instruction|rewritten(?: user)? instruction|final(?: insertion)? text)\s*:\s*/iu, "").trim();
|
|
824
935
|
return text;
|
|
825
936
|
}
|
|
826
937
|
|
|
938
|
+
/**
 * Strips an echoed copy of the editor draft from polished output.
 *
 * If the trimmed output is exactly the trimmed draft with extra text inserted at one
 * position, only that inserted text is returned, so pasting it does not duplicate the draft.
 * In every other case the output is returned unchanged.
 *
 * @param editorText - Current editor draft; drafts shorter than 12 characters after
 *   trimming are ignored to avoid accidental matches.
 * @param output - Polished text produced for insertion.
 * @returns The inserted portion (trimmed) when the draft echo is detected, else `output`.
 */
function removeEditorDraftEcho(editorText: string, output: string): string {
  const draft = editorText.trim();
  const text = output.trim();
  if (draft.length < 12 || text.length <= draft.length) return output;

  // True when `index` falls between the two UTF-16 code units of one surrogate pair.
  const splitsSurrogatePair = (value: string, index: number): boolean => {
    if (index <= 0 || index >= value.length) return false;
    const before = value.charCodeAt(index - 1);
    const after = value.charCodeAt(index);
    return before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
  };

  let prefixLength = 0;
  while (prefixLength < draft.length && prefixLength < text.length && draft[prefixLength] === text[prefixLength]) {
    prefixLength += 1;
  }
  // Two different astral characters (e.g. emoji) can share a high surrogate. Matching by
  // code unit would then end mid-character and leave lone surrogates in the result, so
  // back off to the code-point boundary.
  if (splitsSurrogatePair(text, prefixLength)) prefixLength -= 1;

  let suffixLength = 0;
  while (
    suffixLength < draft.length - prefixLength &&
    suffixLength < text.length - prefixLength &&
    draft[draft.length - 1 - suffixLength] === text[text.length - 1 - suffixLength]
  ) {
    suffixLength += 1;
  }

  // The draft must be fully accounted for by the matched prefix and suffix.
  if (prefixLength + suffixLength !== draft.length) return output;
  // Never cut the output inside a character.
  if (splitsSurrogatePair(text, text.length - suffixLength)) return output;

  const insertedText = text.slice(prefixLength, text.length - suffixLength).trim();
  return insertedText || output;
}
|
|
961
|
+
|
|
962
|
+
/**
 * Reads the editor's current text, treating any failure as an empty draft.
 *
 * @param ctx - Extension context whose `ui.getEditorText()` is queried.
 * @returns The editor text, or `""` when `getEditorText()` throws.
 */
function getFullEditorText(ctx: ExtensionContext): string {
  let draft = "";
  try {
    draft = ctx.ui.getEditorText();
  } catch {
    // Fall through with the empty default when the editor text cannot be read.
  }
  return draft;
}
|
|
969
|
+
|
|
827
970
|
/**
 * Assembles the user prompt sent to the polish model.
 *
 * The prompt is the fixed instruction lines followed by three labelled sections: the
 * current editor draft, the recent conversation, and the raw ASR text. The context budget
 * (`config.postprocessContextChars`) is split between draft (floor of half) and
 * conversation (ceil of half). Empty context sections are rendered as `(empty)`.
 *
 * @param ctx - Extension context used to read editor and session context.
 * @param rawText - Raw transcript to polish; trimmed before inclusion.
 * @param config - Effective configuration supplying the context budget.
 * @returns The newline-joined prompt text.
 */
function buildPostprocessPrompt(ctx: ExtensionContext, rawText: string, config: VoiceConfig): string {
  const budget = config.postprocessContextChars;
  const draftContext = getEditorContext(ctx, Math.floor(budget / 2));
  const conversationContext = getRecentSessionContext(ctx, Math.ceil(budget / 2));

  const instructions = [
    "Polish only the raw ASR text below, using context only when it helps disambiguate the user's intent.",
    "If context is empty or irrelevant, polish the raw text directly.",
    "Do not answer the raw speech, and do not execute its request. Output only the final text that should be inserted into the editor.",
    "The output language must match the primary language of the raw speech, not the context language and not this English prompt. Do not translate.",
    "Faithfully preserve the information and details in the raw speech. Do not summarize, compress, or delete details merely for brevity.",
    "IMPORTANT: your output will be pasted verbatim at the current cursor position. It is not a replacement and not a rewrite of the whole editor draft.",
    "The current editor draft is context only. Do not rewrite, repeat, complete, delete, or replace existing draft text. Do not output the full sentence after insertion.",
    "The true cursor position is not marked in the draft shown here; the pi editor owns the actual insertion point. Do not guess the cursor and synthesize a full surrounding sentence.",
    "If the raw speech is an inline insertion, continuation, a few words, or a phrase, output only the newly spoken words or phrase.",
    "Example: draft is `Please make this function async and [cursor].`, raw speech is `add error handling`, correct output is `add error handling`, not `Please make this function async and add error handling.`.",
    "Example: draft is `This variable name is [cursor]unclear`, raw speech is `still`, correct output is `still`, not `This variable name is still unclear`.",
  ];

  const draftSection = [
    "--- Context: current unsent editor draft (context only; do not output wholesale) ---",
    draftContext.trim() || "(empty)",
  ];
  const conversationSection = ["--- Context: recent conversation ---", conversationContext || "(empty)"];
  const speechSection = ["--- Raw ASR text ---", rawText.trim()];

  // Sections are separated from the instructions and from each other by one blank line.
  return [...instructions, "", ...draftSection, "", ...conversationSection, "", ...speechSection].join("\n");
}
|
|
@@ -890,7 +1038,7 @@ async function postprocessTranscript(ctx: ExtensionContext, rawText: string, con
|
|
|
890
1038
|
}
|
|
891
1039
|
|
|
892
1040
|
const polished = cleanPostprocessOutput(extractAssistantText(response));
|
|
893
|
-
return polished
|
|
1041
|
+
return polished ? removeEditorDraftEcho(getFullEditorText(ctx), polished) : rawText;
|
|
894
1042
|
}
|
|
895
1043
|
|
|
896
1044
|
function insertIntoEditor(ctx: ExtensionContext, text: string) {
|
|
@@ -904,6 +1052,14 @@ async function isRecording(config: VoiceConfig): Promise<boolean> {
|
|
|
904
1052
|
return Boolean(state && pidAlive(state.pid));
|
|
905
1053
|
}
|
|
906
1054
|
|
|
1055
|
+
/**
 * Cleans up after a recorder that is no longer running.
 *
 * When saved recording state exists and its pid is not alive, the recording artifacts are
 * deleted and the saved state is cleared. Live recordings and missing state are left alone.
 *
 * @param config - Effective configuration used to read and clear the saved state.
 * @returns Warnings from artifact cleanup, or an empty array when nothing was stale.
 */
function cleanupStaleRecordingState(config: VoiceConfig): string[] {
  const saved = readState(config);
  if (saved && !pidAlive(saved.pid)) {
    const warnings = cleanupRecordingArtifacts(saved);
    clearState(config);
    return warnings;
  }
  return [];
}
|
|
1062
|
+
|
|
907
1063
|
function requireInteractiveUi(ctx: ExtensionContext, action: string): boolean {
|
|
908
1064
|
if (ctx.hasUI) return true;
|
|
909
1065
|
ctx.ui.notify(`Voice ${action} requires interactive pi UI. Use /voice config or /voice help for setup information.`, "error");
|
|
@@ -920,29 +1076,42 @@ async function startRecording(ctx: ExtensionContext) {
|
|
|
920
1076
|
ctx.ui.setStatus("voice-input", ctx.ui.theme.fg("accent", recordingStatusText(deviceName)));
|
|
921
1077
|
return;
|
|
922
1078
|
}
|
|
923
|
-
if (existing)
|
|
1079
|
+
if (existing) {
|
|
1080
|
+
const cleanupWarnings = cleanupRecordingArtifacts(existing);
|
|
1081
|
+
clearState(config);
|
|
1082
|
+
if (cleanupWarnings.length) ctx.ui.notify(`Voice input cleanup warning:\n${cleanupWarnings.join("\n")}`, "warning");
|
|
1083
|
+
}
|
|
924
1084
|
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
1085
|
+
const outputPath = createRecordingPath();
|
|
1086
|
+
let cmd: string[];
|
|
1087
|
+
try {
|
|
1088
|
+
cmd = recorderCommand(config, outputPath);
|
|
1089
|
+
} catch (error) {
|
|
1090
|
+
cleanupRecordingArtifacts({ path: outputPath });
|
|
1091
|
+
throw error;
|
|
1092
|
+
}
|
|
930
1093
|
const deviceName = recordingDeviceName(config, cmd[0]);
|
|
931
1094
|
|
|
932
1095
|
ctx.ui.setStatus("voice-input", ctx.ui.theme.fg("warning", "● starting mic"));
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
1096
|
+
let child: ReturnType<typeof spawn>;
|
|
1097
|
+
try {
|
|
1098
|
+
child = spawn(cmd[0], cmd.slice(1), {
|
|
1099
|
+
detached: true,
|
|
1100
|
+
stdio: ["ignore", "ignore", "ignore"],
|
|
1101
|
+
});
|
|
1102
|
+
} catch (error) {
|
|
1103
|
+
cleanupRecordingArtifacts({ path: outputPath });
|
|
1104
|
+
throw error;
|
|
1105
|
+
}
|
|
938
1106
|
child.unref();
|
|
939
|
-
closeSync(logFd);
|
|
940
1107
|
|
|
941
|
-
if (!child.pid)
|
|
1108
|
+
if (!child.pid) {
|
|
1109
|
+
cleanupRecordingArtifacts({ path: outputPath });
|
|
1110
|
+
throw new Error("Recorder failed to start: no pid returned");
|
|
1111
|
+
}
|
|
942
1112
|
writeState(config, {
|
|
943
1113
|
pid: child.pid,
|
|
944
1114
|
path: outputPath,
|
|
945
|
-
logPath,
|
|
946
1115
|
startedAt: new Date().toISOString(),
|
|
947
1116
|
recorderTarget: config.recorderTarget || undefined,
|
|
948
1117
|
deviceName,
|
|
@@ -966,21 +1135,41 @@ async function stopRecording(ctx: ExtensionContext, transcribe = true) {
|
|
|
966
1135
|
clearState(config);
|
|
967
1136
|
if (config.finalizeDelayMs > 0) await sleep(config.finalizeDelayMs);
|
|
968
1137
|
|
|
969
|
-
if (!existsSync(state.path) || statSync(state.path).size === 0) {
|
|
970
|
-
const log = existsSync(state.logPath) ? readFileSync(state.logPath, "utf8") : "";
|
|
971
|
-
throw new Error(`Recording file missing/empty: ${state.path}\nRecorder log:\n${log}`);
|
|
972
|
-
}
|
|
973
|
-
|
|
974
1138
|
if (!transcribe) {
|
|
1139
|
+
const cleanupWarnings = cleanupRecordingArtifacts(state);
|
|
975
1140
|
ctx.ui.setStatus("voice-input", undefined);
|
|
976
|
-
ctx.ui.notify(
|
|
1141
|
+
ctx.ui.notify(
|
|
1142
|
+
cleanupWarnings.length
|
|
1143
|
+
? `Voice recording cancelled; local audio discard attempted, but cleanup had warnings:\n${cleanupWarnings.join("\n")}`
|
|
1144
|
+
: "Voice recording cancelled; local audio discarded.",
|
|
1145
|
+
cleanupWarnings.length ? "warning" : "info",
|
|
1146
|
+
);
|
|
977
1147
|
return;
|
|
978
1148
|
}
|
|
979
1149
|
|
|
1150
|
+
if (!existsSync(state.path) || statSync(state.path).size === 0) {
|
|
1151
|
+
const cleanupWarnings = cleanupRecordingArtifacts(state);
|
|
1152
|
+
throw new Error(
|
|
1153
|
+
`Recording file missing/empty: ${state.path}. Recorder output is not persisted for privacy.${
|
|
1154
|
+
cleanupWarnings.length ? `\nCleanup warnings:\n${cleanupWarnings.join("\n")}` : ""
|
|
1155
|
+
}`,
|
|
1156
|
+
);
|
|
1157
|
+
}
|
|
1158
|
+
|
|
1159
|
+
let decodeMs = 0;
|
|
1160
|
+
let durationMs = 0;
|
|
1161
|
+
let result: TranscriptionResult | undefined;
|
|
980
1162
|
const decodeStart = Date.now();
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
1163
|
+
try {
|
|
1164
|
+
const recording = parseRecordedWav(state.path);
|
|
1165
|
+
durationMs = recording.durationMs;
|
|
1166
|
+
decodeMs = Date.now() - decodeStart;
|
|
1167
|
+
result = await transcribePcm(recording.pcm, recording.durationMs, config);
|
|
1168
|
+
} finally {
|
|
1169
|
+
const cleanupWarnings = cleanupRecordingArtifacts(state);
|
|
1170
|
+
if (cleanupWarnings.length) ctx.ui.notify(`Voice input cleanup warning:\n${cleanupWarnings.join("\n")}`, "warning");
|
|
1171
|
+
}
|
|
1172
|
+
if (!result) throw new Error("Transcription failed before a result was produced");
|
|
984
1173
|
|
|
985
1174
|
if (!result.text.trim()) {
|
|
986
1175
|
ctx.ui.setStatus("voice-input", undefined);
|
|
@@ -1148,7 +1337,14 @@ export default function (pi: ExtensionAPI) {
|
|
|
1148
1337
|
});
|
|
1149
1338
|
|
|
1150
1339
|
pi.on("session_start", (_event, ctx) => {
|
|
1151
|
-
|
|
1340
|
+
const currentConfig = getConfig();
|
|
1341
|
+
const cleanupWarnings = [
|
|
1342
|
+
...cleanupStaleRecordingState(currentConfig),
|
|
1343
|
+
...cleanupLegacyStoredArtifacts(currentConfig),
|
|
1344
|
+
];
|
|
1345
|
+
if (cleanupWarnings.length) ctx.ui.notify(`Voice input cleanup warning:\n${cleanupWarnings.join("\n")}`, "warning");
|
|
1346
|
+
|
|
1347
|
+
if (currentConfig.apiKey) {
|
|
1152
1348
|
ctx.ui.notify(`Voice input loaded: ${startupConfig.shortcut} toggles recording.`, "info");
|
|
1153
1349
|
return;
|
|
1154
1350
|
}
|
|
@@ -1156,7 +1352,7 @@ export default function (pi: ExtensionAPI) {
|
|
|
1156
1352
|
[
|
|
1157
1353
|
`Voice input loaded: ${startupConfig.shortcut} toggles recording.`,
|
|
1158
1354
|
"API key is missing. Run /voice key to set it up, or edit the JSON config file.",
|
|
1159
|
-
`Config file: ${
|
|
1355
|
+
`Config file: ${currentConfig.configPath}`,
|
|
1160
1356
|
`Get/create a VolcEngine Speech API key here: ${VOLC_API_KEY_URL}`,
|
|
1161
1357
|
].join("\n"),
|
|
1162
1358
|
"warning",
|