@iinm/plain-agent 1.7.18 → 1.7.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,250 @@
1
+ import { spawn, spawnSync } from "node:child_process";
2
+
3
+ /**
4
+ * @typedef {Object} VoiceRecorderConfig
5
+ * @property {string} command
6
+ * @property {string[]} args
7
+ * Must write raw 16-bit little-endian mono PCM to stdout at the sample
8
+ * rate required by the chosen provider (24 kHz for OpenAI, 16 kHz for
9
+ * Gemini).
10
+ */
11
+
12
+ /**
13
+ * @typedef {Object} VoiceSessionCallbacks
14
+ * @property {(text: string) => void} onTranscript
15
+ * @property {(error: Error) => void} onError
16
+ * @property {() => void} [onClose]
17
+ */
18
+
19
+ /**
20
+ * @typedef {Object} VoiceSession
21
+ * @property {() => Promise<void>} stop
22
+ */
23
+
24
+ /**
25
+ * @typedef {Object} RecorderHandle
26
+ * @property {() => void} stop
27
+ */
28
+
29
/**
 * Voice debug flag: true only when the PLAIN_VOICE_DEBUG environment
 * variable is set to exactly "1"; any other value (or no value) yields false.
 */
export const VOICE_DEBUG = process.env["PLAIN_VOICE_DEBUG"] === "1";
30
+
31
+ /**
32
+ * @param {number} sampleRate
33
+ * @returns {VoiceRecorderConfig[]}
34
+ */
35
+ export function getRecorderCandidates(sampleRate) {
36
+ const rate = String(sampleRate);
37
+ const isMac = process.platform === "darwin";
38
+ /** @type {VoiceRecorderConfig[]} */
39
+ const candidates = [];
40
+
41
+ if (!isMac) {
42
+ candidates.push({
43
+ command: "arecord",
44
+ args: ["-q", "-f", "S16_LE", "-c", "1", "-r", rate, "-t", "raw"],
45
+ });
46
+ }
47
+
48
+ candidates.push({
49
+ command: "sox",
50
+ args: [
51
+ "-q",
52
+ "-d",
53
+ "-b",
54
+ "16",
55
+ "-c",
56
+ "1",
57
+ "-r",
58
+ rate,
59
+ "-e",
60
+ "signed-integer",
61
+ "-t",
62
+ "raw",
63
+ "-",
64
+ ],
65
+ });
66
+
67
+ const ffmpegInput = isMac
68
+ ? ["-f", "avfoundation", "-i", ":0"]
69
+ : ["-f", "alsa", "-i", "default"];
70
+ candidates.push({
71
+ command: "ffmpeg",
72
+ args: [
73
+ "-hide_banner",
74
+ "-loglevel",
75
+ "error",
76
+ ...ffmpegInput,
77
+ "-ac",
78
+ "1",
79
+ "-ar",
80
+ rate,
81
+ "-f",
82
+ "s16le",
83
+ "-",
84
+ ],
85
+ });
86
+
87
+ return candidates;
88
+ }
89
+
90
/**
 * Pick the first candidate whose command passes `isCommandAvailable`.
 * Candidates are probed in array order and probing stops at the first hit.
 *
 * @param {VoiceRecorderConfig[]} candidates
 * @returns {VoiceRecorderConfig | null} The first usable recorder, or null
 *   when none of the candidates is available.
 */
export function detectRecorder(candidates) {
  for (const candidate of candidates) {
    if (isCommandAvailable(candidate.command)) {
      return candidate;
    }
  }
  return null;
}
97
+
98
/**
 * Check whether an executable with the given name can be found.
 *
 * On Windows this runs `where <command>`; elsewhere it asks the shell via
 * `command -v`. The name is handed to the shell as a positional parameter
 * ("$1") rather than being spliced into the script text, so characters such
 * as `;`, `$(...)`, spaces or quotes in the name are never interpreted as
 * shell syntax.
 *
 * @param {string} command - Executable name (or path) to look up.
 * @returns {boolean} True when the lookup exits with status 0. An empty or
 *   non-string name is reported as unavailable without spawning anything.
 */
export function isCommandAvailable(command) {
  if (typeof command !== "string" || command.length === 0) {
    return false;
  }

  const lookup =
    process.platform === "win32"
      ? spawnSync("where", [command], { stdio: "ignore" })
      : // argv after the script: "sh" becomes $0, the command name becomes $1.
        spawnSync("sh", ["-c", 'command -v "$1"', "sh", command], {
          stdio: "ignore",
        });

  return lookup.status === 0;
}
111
+
112
+ /**
113
+ * Spawn a recorder subprocess that emits raw PCM on stdout, and wire its
114
+ * lifecycle events to the provided callbacks. This is purely transport
115
+ * plumbing — it knows nothing about any specific STT provider.
116
+ *
117
+ * @param {object} options
118
+ * @param {VoiceRecorderConfig} options.recorder
119
+ * @param {(chunk: Buffer) => void} options.onAudio
120
+ * @param {(error: Error) => void} options.onError
121
+ * @param {() => void} options.onExit - Called after the recorder subprocess exits (for any reason).
122
+ * @returns {RecorderHandle}
123
+ */
124
+ export function startRecorder({ recorder, onAudio, onError, onExit }) {
125
+ const child = spawn(recorder.command, recorder.args, {
126
+ stdio: ["ignore", "pipe", "pipe"],
127
+ });
128
+
129
+ /** @type {string[]} */
130
+ const stderrChunks = [];
131
+ child.stderr.on("data", (chunk) => {
132
+ stderrChunks.push(chunk.toString("utf8"));
133
+ });
134
+
135
+ child.on("error", (err) => {
136
+ const suffix =
137
+ /** @type {NodeJS.ErrnoException} */ (err).code === "ENOENT"
138
+ ? ` (command "${recorder.command}" not found)`
139
+ : "";
140
+ onError(new Error(`Recorder failed to start${suffix}: ${err.message}`));
141
+ });
142
+
143
+ child.on("exit", (code, signal) => {
144
+ if (code !== 0 && signal === null) {
145
+ const stderrText = stderrChunks.join("").trim();
146
+ onError(
147
+ new Error(
148
+ `Recorder "${recorder.command}" exited with code ${code}${
149
+ stderrText ? `: ${stderrText}` : ""
150
+ }`,
151
+ ),
152
+ );
153
+ }
154
+ onExit();
155
+ });
156
+
157
+ child.stdout.on("data", onAudio);
158
+
159
+ return {
160
+ stop() {
161
+ try {
162
+ child.kill("SIGTERM");
163
+ } catch {
164
+ // ignore
165
+ }
166
+ },
167
+ };
168
+ }
169
+
170
+ /**
171
+ * Report an error asynchronously and return an already-terminated session.
172
+ *
173
+ * @param {VoiceSessionCallbacks} callbacks
174
+ * @param {Error} error
175
+ * @returns {VoiceSession}
176
+ */
177
+ export function failVoiceSessionAsync(callbacks, error) {
178
+ queueMicrotask(() => {
179
+ callbacks.onError(error);
180
+ callbacks.onClose?.();
181
+ });
182
+ return { stop: async () => {} };
183
+ }
184
+
185
/**
 * Create a streaming filter that removes whitespace sitting between two CJK
 * characters. Some providers return Japanese transcripts with
 * morpheme-separating spaces ("そう 、 声 で"); mixed strings like
 * "Windows を使う" keep their inter-script spaces.
 *
 * The filter is stateful so text can be fed in arbitrary chunks: a run of
 * spaces (space, tab, or U+3000) is held back until the next non-space
 * character shows whether it should be kept or dropped. State carries over
 * between `push` calls; `flush` returns any spaces still held back and
 * resets the filter.
 *
 * @returns {{ push: (text: string) => string, flush: () => string }}
 */
export function createCJKSpaceNormalizer() {
  const SPACE_CHARS = new Set([" ", "\t", "\u3000"]);

  // Last non-space character seen ("" before any input / after flush).
  let lastVisible = "";
  // Spaces seen since `lastVisible` whose fate is not decided yet.
  let heldSpaces = "";

  /**
   * @param {string} text
   * @returns {string}
   */
  function push(text) {
    /** @type {string[]} */
    const pieces = [];
    for (const ch of text) {
      if (SPACE_CHARS.has(ch)) {
        heldSpaces += ch;
        continue;
      }
      if (heldSpaces.length > 0) {
        // Keep the held run unless it is sandwiched between two CJK chars.
        const keep = !isCJKChar(lastVisible) || !isCJKChar(ch);
        if (keep) {
          pieces.push(heldSpaces);
        }
        heldSpaces = "";
      }
      pieces.push(ch);
      lastVisible = ch;
    }
    return pieces.join("");
  }

  /** @returns {string} */
  function flush() {
    const leftover = heldSpaces;
    heldSpaces = "";
    lastVisible = "";
    return leftover;
  }

  return { push, flush };
}
225
+
226
/**
 * Report whether the first code point of `ch` belongs to a script that is
 * written without spaces between words (Chinese / Japanese), so that
 * whitespace between two such characters can be dropped safely.
 *
 * Precomposed Hangul syllables (U+AC00–U+D7AF) are deliberately NOT
 * included: Korean separates words with spaces, so treating Hangul as
 * "space-free" would glue Korean words together.
 *
 * @param {string} ch - A single character; only its first code point is inspected.
 * @returns {boolean} False for the empty string.
 */
function isCJKChar(ch) {
  const code = ch.codePointAt(0);
  if (code === undefined) return false;
  return (
    // CJK symbols and punctuation, kana, and neighbouring blocks
    (code >= 0x3000 && code <= 0x33ff) ||
    // CJK Unified Ideographs Extension A
    (code >= 0x3400 && code <= 0x4dbf) ||
    // CJK Unified Ideographs
    (code >= 0x4e00 && code <= 0x9fff) ||
    // CJK Compatibility Ideographs
    (code >= 0xf900 && code <= 0xfaff) ||
    // Halfwidth and Fullwidth Forms
    (code >= 0xff00 && code <= 0xffef) ||
    // Supplementary Ideographic Plane
    (code >= 0x20000 && code <= 0x2ffff)
  );
}
243
+
244
/**
 * Type guard for "any non-null value whose `typeof` is "object"" — plain
 * objects, arrays, dates and so on. Functions and primitives are rejected.
 *
 * @param {unknown} value
 * @returns {value is Record<string, unknown>}
 */
export function isObjectLike(value) {
  if (value === null) {
    return false;
  }
  return typeof value === "object";
}
@@ -0,0 +1,62 @@
1
+ /**
2
+ * @typedef {Object} VoiceToggleKey
3
+ * @property {number} byte
4
+ * @property {string} label
5
+ */
6
+
7
+ // Bytes reserved for other terminal/readline uses — cannot be used as a voice toggle.
8
+ // 0x03 = Ctrl-C (SIGINT)
9
+ // 0x04 = Ctrl-D (EOF / readline exit)
10
+ // 0x09 = Ctrl-I (Tab)
11
+ // 0x0a = Ctrl-J (LF / Enter)
12
+ // 0x0d = Ctrl-M (CR / Enter)
13
+ // 0x11 = Ctrl-Q (XON: resume terminal output)
14
+ // 0x13 = Ctrl-S (XOFF: suspend terminal output)
15
+ const RESERVED_TERMINAL_BYTES = new Set([
16
+ 0x03, 0x04, 0x09, 0x0a, 0x0d, 0x11, 0x13,
17
+ ]);
18
+
19
+ /**
20
+ * Parse a "ctrl-<char>" binding into the raw byte the terminal sends in
21
+ * raw mode. Only Ctrl-<char> is supported because it is the only family
22
+ * the pre-readline pipeline can recognize without a full key decoder.
23
+ *
24
+ * @param {string | undefined} spec
25
+ * @returns {VoiceToggleKey}
26
+ */
27
+ export function parseVoiceToggleKey(spec) {
28
+ const raw = (spec ?? "ctrl-o").trim().toLowerCase();
29
+
30
+ const match = /^ctrl-(.)$/.exec(raw);
31
+ if (!match) {
32
+ throw new Error(
33
+ `Invalid voiceInput.toggleKey "${spec}". Expected "ctrl-<char>".`,
34
+ );
35
+ }
36
+
37
+ const ch = match[1];
38
+ const code = ch.charCodeAt(0);
39
+
40
+ // Subtracting a fixed offset from the character's ASCII code yields the
41
+ // control byte (0x01–0x1f) the terminal sends for that Ctrl combination.
42
+ let byte;
43
+ if (code >= 0x61 && code <= 0x7a) {
44
+ // a–z (0x61–0x7a): subtract 0x60 → 0x01 (Ctrl-A) – 0x1a (Ctrl-Z)
45
+ byte = code - 0x60;
46
+ } else if (code >= 0x5b && code <= 0x5f) {
47
+ // [ \ ] ^ _ (0x5b–0x5f): subtract 0x40 → 0x1b (Ctrl-[) – 0x1f (Ctrl-_)
48
+ byte = code - 0x40;
49
+ } else {
50
+ throw new Error(
51
+ `Unsupported voiceInput.toggleKey "${spec}". Use ctrl-<letter> or ctrl-<[ \\ ] ^ _>.`,
52
+ );
53
+ }
54
+
55
+ if (RESERVED_TERMINAL_BYTES.has(byte)) {
56
+ throw new Error(
57
+ `voiceInput.toggleKey "${spec}" conflicts with a reserved terminal/readline key.`,
58
+ );
59
+ }
60
+
61
+ return { byte, label: `Ctrl-${ch.toUpperCase()}` };
62
+ }