@iinm/plain-agent 1.8.4 → 1.8.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. package/bin/plain +1 -1
  2. package/package.json +7 -5
  3. package/sandbox/bin/plain-sandbox +13 -0
  4. package/src/agent.d.ts +52 -0
  5. package/src/agent.mjs +204 -0
  6. package/src/agentLoop.mjs +419 -0
  7. package/src/agentState.mjs +41 -0
  8. package/src/claudeCodePlugin.mjs +164 -0
  9. package/src/cliArgs.mjs +175 -0
  10. package/src/cliBatch.mjs +147 -0
  11. package/src/cliCommands.mjs +283 -0
  12. package/src/cliCompleter.mjs +227 -0
  13. package/src/cliCost.mjs +309 -0
  14. package/src/cliFormatter.mjs +413 -0
  15. package/src/cliInteractive.mjs +529 -0
  16. package/src/cliInterruptTransform.mjs +51 -0
  17. package/src/cliMuteTransform.mjs +26 -0
  18. package/src/cliPasteTransform.mjs +183 -0
  19. package/src/config.d.ts +36 -0
  20. package/src/config.mjs +197 -0
  21. package/src/context/loadAgentRoles.mjs +294 -0
  22. package/src/context/loadPrompts.mjs +337 -0
  23. package/src/context/loadUserMessageContext.mjs +147 -0
  24. package/src/costTracker.mjs +210 -0
  25. package/src/env.mjs +44 -0
  26. package/src/main.mjs +281 -0
  27. package/src/mcpClient.mjs +351 -0
  28. package/src/mcpIntegration.mjs +160 -0
  29. package/src/model.d.ts +109 -0
  30. package/src/modelCaller.mjs +32 -0
  31. package/src/modelDefinition.d.ts +92 -0
  32. package/src/prompt.mjs +138 -0
  33. package/src/providers/anthropic.d.ts +248 -0
  34. package/src/providers/anthropic.mjs +587 -0
  35. package/src/providers/bedrock.d.ts +249 -0
  36. package/src/providers/bedrock.mjs +700 -0
  37. package/src/providers/gemini.d.ts +208 -0
  38. package/src/providers/gemini.mjs +754 -0
  39. package/src/providers/openai.d.ts +281 -0
  40. package/src/providers/openai.mjs +544 -0
  41. package/src/providers/openaiCompatible.d.ts +147 -0
  42. package/src/providers/openaiCompatible.mjs +652 -0
  43. package/src/providers/platform/awsSigV4.mjs +184 -0
  44. package/src/providers/platform/azure.mjs +42 -0
  45. package/src/providers/platform/bedrock.mjs +78 -0
  46. package/src/providers/platform/googleCloud.mjs +34 -0
  47. package/src/subagent.mjs +265 -0
  48. package/src/tmpfile.mjs +27 -0
  49. package/src/tool.d.ts +74 -0
  50. package/src/toolExecutor.mjs +236 -0
  51. package/src/toolInputValidator.mjs +183 -0
  52. package/src/toolUseApprover.mjs +99 -0
  53. package/src/tools/askURL.mjs +209 -0
  54. package/src/tools/askWeb.mjs +208 -0
  55. package/src/tools/compactContext.d.ts +4 -0
  56. package/src/tools/compactContext.mjs +87 -0
  57. package/src/tools/execCommand.d.ts +22 -0
  58. package/src/tools/execCommand.mjs +200 -0
  59. package/src/tools/patchFile.d.ts +4 -0
  60. package/src/tools/patchFile.mjs +133 -0
  61. package/src/tools/switchToMainAgent.d.ts +3 -0
  62. package/src/tools/switchToMainAgent.mjs +43 -0
  63. package/src/tools/switchToSubagent.d.ts +4 -0
  64. package/src/tools/switchToSubagent.mjs +59 -0
  65. package/src/tools/tmuxCommand.d.ts +14 -0
  66. package/src/tools/tmuxCommand.mjs +194 -0
  67. package/src/tools/writeFile.d.ts +4 -0
  68. package/src/tools/writeFile.mjs +56 -0
  69. package/src/usageStore.mjs +167 -0
  70. package/src/utils/evalJSONConfig.mjs +72 -0
  71. package/src/utils/matchValue.d.ts +6 -0
  72. package/src/utils/matchValue.mjs +40 -0
  73. package/src/utils/noThrow.mjs +31 -0
  74. package/src/utils/notify.mjs +29 -0
  75. package/src/utils/parseFileRange.mjs +18 -0
  76. package/src/utils/readFileRange.mjs +33 -0
  77. package/src/utils/retryOnError.mjs +41 -0
  78. package/src/voiceInput.mjs +61 -0
  79. package/src/voiceInputGemini.mjs +105 -0
  80. package/src/voiceInputOpenAI.mjs +104 -0
  81. package/src/voiceInputSession.mjs +543 -0
  82. package/src/voiceToggleKey.mjs +62 -0
  83. package/dist/main.mjs +0 -473
  84. package/dist/main.mjs.map +0 -7
@@ -0,0 +1,104 @@
1
+ import {
2
+ isObjectLike,
3
+ startWebSocketVoiceSession,
4
+ } from "./voiceInputSession.mjs";
5
+
6
+ /**
7
+ * @import { VoiceProviderHooks, VoiceRecorderConfig, VoiceSession, VoiceSessionCallbacks } from "./voiceInputSession.mjs"
8
+ */
9
+
10
+ /**
11
+ * @typedef {Object} VoiceInputOpenAIConfig
12
+ * @property {"openai"} provider
13
+ * @property {string} apiKey
14
+ * @property {string} [model] - Defaults to "gpt-4o-transcribe".
15
+ * @property {string} [language] - ISO-639-1 code (e.g. "ja", "en"). Improves accuracy and latency when set.
16
+ * @property {string} [baseURL]
17
+ * @property {VoiceRecorderConfig} [recorder]
18
+ * @property {string} [toggleKey] - "ctrl-<char>". Defaults to "ctrl-o".
19
+ */
20
+
21
const OPENAI_DEFAULT_MODEL = "gpt-4o-transcribe";
const OPENAI_DEFAULT_WS = "wss://api.openai.com/v1/realtime";
const OPENAI_SAMPLE_RATE = 24000;
const OPENAI_LABEL = "OpenAI Realtime";

/**
 * Start a voice input session backed by the OpenAI Realtime transcription
 * WebSocket. Builds the OpenAI-specific provider hooks (URL, auth headers,
 * session setup message, message parsers, audio payload) and hands them to
 * the shared `startWebSocketVoiceSession` driver.
 *
 * @param {object} options
 * @param {VoiceInputOpenAIConfig} options.config
 * @param {VoiceSessionCallbacks} options.callbacks
 * @returns {VoiceSession}
 */
export function startOpenAIVoiceSession({ config, callbacks }) {
  /** @type {VoiceProviderHooks<VoiceInputOpenAIConfig>} */
  const hooks = {
    label: OPENAI_LABEL,
    sampleRate: OPENAI_SAMPLE_RATE,
    buildWsUrl(config) {
      // Go through URL/searchParams instead of appending "?intent=..." so a
      // custom baseURL that already carries a query string (for example
      // "...?api-version=x") yields a well-formed URL rather than "...?a=b?intent=...".
      const url = new URL(config.baseURL ?? OPENAI_DEFAULT_WS);
      url.searchParams.set("intent", "transcription");
      return url.toString();
    },
    buildWsOptions(config) {
      return {
        headers: {
          Authorization: `Bearer ${config.apiKey}`,
          "OpenAI-Beta": "realtime=v1",
        },
      };
    },
    buildSetupMessage(config) {
      const model = config.model ?? OPENAI_DEFAULT_MODEL;
      /** @type {{ model: string, language?: string }} */
      const transcription = { model };
      if (config.language) transcription.language = config.language;
      // The `?intent=transcription` endpoint uses the flat transcription-session
      // schema, not the nested `session.audio.input.*` realtime schema.
      return {
        type: "transcription_session.update",
        session: {
          input_audio_format: "pcm16",
          input_audio_transcription: transcription,
          turn_detection: { type: "server_vad" },
        },
      };
    },
    isReadyMessage(message) {
      return (
        isObjectLike(message) &&
        (message.type === "transcription_session.created" ||
          message.type === "transcription_session.updated")
      );
    },
    extractError(message) {
      if (!isObjectLike(message) || message.type !== "error") return undefined;
      const error = message.error;
      if (!isObjectLike(error)) return undefined;
      // Prefer the human-readable message; fall back to the raw error object.
      return typeof error.message === "string"
        ? error.message
        : JSON.stringify(error);
    },
    extractTranscript(message) {
      if (
        isObjectLike(message) &&
        message.type === "conversation.item.input_audio_transcription.delta" &&
        typeof message.delta === "string" &&
        message.delta.length > 0
      ) {
        return message.delta;
      }
      return undefined;
    },
    buildAudioPayload(chunk, _sampleRate) {
      // Audio travels as base64 text inside a JSON message.
      return {
        type: "input_audio_buffer.append",
        audio: chunk.toString("base64"),
      };
    },
  };

  return startWebSocketVoiceSession({ hooks, config, callbacks });
}
@@ -0,0 +1,543 @@
1
+ import { spawn, spawnSync } from "node:child_process";
2
+
3
+ /**
4
+ * @typedef {Object} VoiceRecorderConfig
5
+ * @property {string} command
6
+ * @property {string[]} args
7
+ * Must write raw 16-bit little-endian mono PCM to stdout at the sample
8
+ * rate required by the chosen provider (24 kHz for OpenAI, 16 kHz for
9
+ * Gemini).
10
+ */
11
+
12
+ /**
13
+ * @typedef {Object} VoiceSessionCallbacks
14
+ * @property {(text: string) => void} onTranscript
15
+ * @property {(error: Error) => void} onError
16
+ * @property {() => void} [onClose]
17
+ */
18
+
19
+ /**
20
+ * @typedef {Object} VoiceSession
21
+ * @property {() => Promise<void>} stop
22
+ */
23
+
24
+ /**
25
+ * @typedef {Object} RecorderHandle
26
+ * @property {() => void} stop
27
+ */
28
+
29
/** True when the `PLAIN_VOICE_DEBUG` environment variable is exactly "1". */
export const VOICE_DEBUG = "1" === process.env["PLAIN_VOICE_DEBUG"];
30
+
31
/**
 * List the recorder command lines to probe, in priority order, each set up
 * to emit raw signed 16-bit mono PCM on stdout at the requested sample rate.
 * `arecord` is only offered when the platform is not macOS; `sox` and
 * `ffmpeg` are always offered (ffmpeg captures from avfoundation on macOS
 * and from ALSA elsewhere).
 *
 * @param {number} sampleRate
 * @returns {VoiceRecorderConfig[]}
 */
export function getRecorderCandidates(sampleRate) {
  const rate = String(sampleRate);
  const onMac = process.platform === "darwin";

  /** @type {VoiceRecorderConfig} */
  const arecord = {
    command: "arecord",
    args: ["-q", "-f", "S16_LE", "-c", "1", "-r", rate, "-t", "raw"],
  };

  /** @type {VoiceRecorderConfig} */
  const sox = {
    command: "sox",
    args: [
      "-q",
      "-d",
      "-b",
      "16",
      "-c",
      "1",
      "-r",
      rate,
      "-e",
      "signed-integer",
      "-t",
      "raw",
      "-",
    ],
  };

  const captureSource = onMac
    ? ["-f", "avfoundation", "-i", ":0"]
    : ["-f", "alsa", "-i", "default"];

  /** @type {VoiceRecorderConfig} */
  const ffmpeg = {
    command: "ffmpeg",
    args: ["-hide_banner", "-loglevel", "error"].concat(captureSource, [
      "-ac",
      "1",
      "-ar",
      rate,
      "-f",
      "s16le",
      "-",
    ]),
  };

  return onMac ? [sox, ffmpeg] : [arecord, sox, ffmpeg];
}
89
+
90
/**
 * Pick the first candidate whose command `isCommandAvailable` accepts.
 *
 * @param {VoiceRecorderConfig[]} candidates - Checked in array order.
 * @returns {VoiceRecorderConfig | null} The first usable candidate, or
 *   `null` when none of them is available.
 */
export function detectRecorder(candidates) {
  for (const candidate of candidates) {
    if (isCommandAvailable(candidate.command)) {
      return candidate;
    }
  }
  return null;
}
97
+
98
/**
 * Check whether `command` resolves to something runnable on this machine.
 *
 * Uses `where` on Windows and the shell's `command -v` elsewhere; the result
 * is `true` only when the lookup process exits with status 0.
 *
 * @param {string} command - Executable name or path to look up.
 * @returns {boolean}
 */
export function isCommandAvailable(command) {
  if (process.platform === "win32") {
    const result = spawnSync("where", [command], { stdio: "ignore" });
    return result.status === 0;
  }
  // Hand the name to the shell as positional parameter $1 instead of splicing
  // it into the script text. The name can come from user configuration, and
  // interpolation would let shell metacharacters run arbitrary code, make
  // "missing; true" look available, and break on names containing spaces.
  const result = spawnSync("sh", ["-c", 'command -v "$1"', "sh", command], {
    stdio: "ignore",
  });
  return result.status === 0;
}
111
+
112
/**
 * Spawn a recorder subprocess and wire its events to the given callbacks.
 * This is transport plumbing only; it contains no provider-specific logic.
 *
 * - Every stdout chunk is passed to `onAudio` unchanged.
 * - A child-process `error` event is reported through `onError`, with a
 *   "not found" hint when the error code is `ENOENT`.
 * - On the `exit` event, a non-zero exit code that was not caused by a
 *   signal is reported through `onError` (including any stderr text
 *   collected so far), and then `onExit` is called.
 *
 * @param {object} options
 * @param {VoiceRecorderConfig} options.recorder
 * @param {(chunk: Buffer) => void} options.onAudio
 * @param {(error: Error) => void} options.onError
 * @param {() => void} options.onExit - Called from the child's `exit` event.
 * @returns {RecorderHandle} Handle whose `stop()` sends SIGTERM to the child.
 */
export function startRecorder({ recorder, onAudio, onError, onExit }) {
  const child = spawn(recorder.command, recorder.args, {
    stdio: ["ignore", "pipe", "pipe"],
  });

  // Let the stream decode UTF-8 itself. Converting each Buffer chunk
  // independently corrupts any multi-byte character that happens to be split
  // across two chunks (e.g. localized recorder error messages).
  child.stderr.setEncoding("utf8");
  let stderrOutput = "";
  child.stderr.on("data", (text) => {
    stderrOutput += text;
  });

  child.on("error", (err) => {
    const suffix =
      /** @type {NodeJS.ErrnoException} */ (err).code === "ENOENT"
        ? ` (command "${recorder.command}" not found)`
        : "";
    onError(new Error(`Recorder failed to start${suffix}: ${err.message}`));
  });

  child.on("exit", (code, signal) => {
    // Termination by signal is not reported as a failure.
    if (code !== 0 && signal === null) {
      const stderrText = stderrOutput.trim();
      onError(
        new Error(
          `Recorder "${recorder.command}" exited with code ${code}${
            stderrText ? `: ${stderrText}` : ""
          }`,
        ),
      );
    }
    onExit();
  });

  child.stdout.on("data", onAudio);

  return {
    stop() {
      try {
        child.kill("SIGTERM");
      } catch {
        // Best effort: errors thrown by kill() are deliberately ignored.
      }
    },
  };
}
169
+
170
/**
 * Produce an already-finished session for a startup failure.
 *
 * The caller gets a {@link VoiceSession} back synchronously; the failure is
 * delivered afterwards from a microtask, as `onError(error)` followed by
 * `onClose()` when that callback is provided.
 *
 * @param {VoiceSessionCallbacks} callbacks
 * @param {Error} error
 * @returns {VoiceSession} A session whose `stop()` does nothing.
 */
export function failVoiceSessionAsync(callbacks, error) {
  const notifyFailure = () => {
    callbacks.onError(error);
    if (callbacks.onClose) {
      callbacks.onClose();
    }
  };
  queueMicrotask(notifyFailure);

  /** @type {VoiceSession} */
  const finishedSession = {
    async stop() {},
  };
  return finishedSession;
}
188
+
189
+ /**
190
+ * Provider-specific hook contract for {@link startWebSocketVoiceSession}.
191
+ *
192
+ * Each hook is called at a specific point in the session lifecycle:
193
+ *
194
+ * 1. **Construction** – `buildWsUrl` (and optionally `buildWsOptions`) are
195
+ * invoked immediately to create the WebSocket.
196
+ * 2. **Open** – `buildSetupMessage` is sent as the first JSON message once the
197
+ * WebSocket opens.
198
+ * 3. **Ready** – `isReadyMessage` is tested on every incoming message until it
199
+ * returns `true`. At that point the session transitions to *ready* and any
200
+ * buffered audio chunks are flushed.
201
+ * 4. **Streaming** – `buildAudioPayload` is called for every recorder chunk
202
+ * while the WebSocket is open and ready.
203
+ * 5. **Error extraction** – `extractError` is checked on every message before
204
+ * transcript extraction. If it returns a string, the session reports an
205
+ * error and drops the message.
206
+ * 6. **Transcription** – `extractTranscript` is called on every non-error
207
+ * message other than the one that first marks the session ready (this includes
208
+ * messages received before readiness). Non-empty results are pushed through the
+ * CJK space normalizer and then forwarded to `onTranscript`.
209
+ *
210
+ * @template TConfig
211
+ * @typedef {Object} VoiceProviderHooks
212
+ * @property {string} label - Human-readable provider name (used in logs and
213
+ * error messages).
214
+ * @property {number} sampleRate - PCM sample rate expected by the provider
215
+ * (e.g. 16000 for Gemini, 24000 for OpenAI). Passed to the recorder and
216
+ * `buildAudioPayload`.
217
+ * @property {(config: TConfig) => string} buildWsUrl - Returns the full
218
+ * WebSocket URL, including any query parameters.
219
+ * @property {(config: TConfig) => { headers?: Record<string, string> }} [buildWsOptions]
220
+ * - Returns optional per-provider WebSocket constructor options. Node's
221
+ * global WebSocket (undici) accepts a non-standard `headers` option that
222
+ * is not declared in the standard typings.
223
+ * @property {(config: TConfig) => object} buildSetupMessage - Returns the
224
+ * first JSON message sent immediately after the WebSocket opens.
225
+ * @property {(message: unknown) => boolean} isReadyMessage - Returns `true`
226
+ * when the given server message signals that the provider is ready to
227
+ * receive audio.
228
+ * @property {(message: unknown) => string | undefined} extractTranscript -
229
+ * Extracts a transcript delta from a server message. Return `undefined`
230
+ * when the message carries no transcript.
231
+ * @property {(message: unknown) => string | undefined} [extractError] -
232
+ * Extracts an error description from a server message. Return `undefined`
233
+ * when the message carries no error.
234
+ * @property {(chunk: Buffer, sampleRate: number) => object} buildAudioPayload -
235
+ * Wraps a raw PCM chunk into the provider-specific JSON payload. The
236
+ * `sampleRate` argument is the same value as `hooks.sampleRate`.
237
+ */
238
+
239
/**
 * Shared WebSocket voice session implementation driven by provider hooks.
 *
 * What this function does:
 * - Picks a recorder (`config.recorder`, else the first available candidate)
 *   and verifies its command is available.
 * - Opens the provider WebSocket using `hooks.buildWsUrl` / `buildWsOptions`.
 * - Sends `hooks.buildSetupMessage(config)` when the socket opens.
 * - Buffers recorder chunks until `hooks.isReadyMessage` matches, then
 *   flushes them and streams subsequent chunks directly.
 * - Runs transcripts through the CJK space normalizer before `onTranscript`.
 * - Reports failures via `callbacks.onError` and calls `callbacks.onClose`
 *   at most once.
 *
 * Startup failures (no recorder, recorder command missing, or an exception
 * while building the URL / constructing the WebSocket) do not throw; they
 * return an already-terminated session via `failVoiceSessionAsync`.
 *
 * @template TConfig
 * @param {object} options
 * @param {VoiceProviderHooks<TConfig>} options.hooks
 * @param {TConfig & { recorder?: VoiceRecorderConfig }} options.config
 * @param {VoiceSessionCallbacks} options.callbacks
 * @returns {VoiceSession}
 */
export function startWebSocketVoiceSession({ hooks, config, callbacks }) {
  const recorder =
    config.recorder ?? detectRecorder(getRecorderCandidates(hooks.sampleRate));
  if (!recorder) {
    return failVoiceSessionAsync(
      callbacks,
      new Error(
        "No voice recorder found. Install arecord, sox, or ffmpeg (or set `voiceInput.recorder`).",
      ),
    );
  }

  if (!isCommandAvailable(recorder.command)) {
    return failVoiceSessionAsync(
      callbacks,
      new Error(
        `Voice recorder command "${recorder.command}" not found on PATH.`,
      ),
    );
  }

  // Open the socket before any recorder is spawned, and route synchronous
  // failures (invalid URL, a throwing hook, no global WebSocket in this
  // runtime) through the same async error path as the checks above instead
  // of letting them escape to the caller.
  /** @type {WebSocket} */
  let ws;
  try {
    const wsUrl = hooks.buildWsUrl(config);
    const wsOptions = hooks.buildWsOptions?.(config);

    // Node's global WebSocket (undici) accepts a non-standard `headers`
    // option. The built-in typings only declare the standards-compliant
    // constructor, so cast through `WebSocket`-as-constructor.
    const Ctor = /** @type {new (url: string, opts?: unknown) => WebSocket} */ (
      /** @type {unknown} */ (WebSocket)
    );
    ws = new Ctor(wsUrl, wsOptions);
    ws.binaryType = "arraybuffer";
  } catch (err) {
    return failVoiceSessionAsync(
      callbacks,
      new Error(`Failed to open ${hooks.label} WebSocket: ${formatError(err)}`),
    );
  }

  let stopped = false;
  let closeEmitted = false;
  let ready = false;
  /** @type {Buffer[]} */
  const pendingAudio = [];
  const normalizer = createCJKSpaceNormalizer();

  // Guarantees `onClose` fires at most once, whichever path ends the session.
  function emitClose() {
    if (closeEmitted) return;
    closeEmitted = true;
    callbacks.onClose?.();
  }

  const rec = startRecorder({
    recorder,
    onAudio(chunk) {
      if (stopped) return;
      if (ready && ws.readyState === WebSocket.OPEN) {
        sendAudio(chunk);
      } else {
        // Not ready yet: hold the chunk until the ready message arrives.
        pendingAudio.push(chunk);
      }
    },
    onError(err) {
      if (!stopped) callbacks.onError(err);
      stop();
    },
    onExit() {
      stop();
    },
  });

  /**
   * Wrap a PCM chunk with the provider hook and send it as JSON. Send
   * failures are dropped (logged only when VOICE_DEBUG is on).
   *
   * @param {Buffer} chunk
   */
  function sendAudio(chunk) {
    const payload = hooks.buildAudioPayload(chunk, hooks.sampleRate);
    try {
      ws.send(JSON.stringify(payload));
    } catch (err) {
      if (VOICE_DEBUG) {
        process.stderr.write(
          `[voiceInput] sendAudio dropped: ${formatError(err)}\n`,
        );
      }
    }
  }

  ws.addEventListener("open", () => {
    const setup = hooks.buildSetupMessage(config);
    try {
      ws.send(JSON.stringify(setup));
    } catch (err) {
      callbacks.onError(
        new Error(`Failed to send setup message: ${formatError(err)}`),
      );
      stop();
    }
  });

  ws.addEventListener("message", (event) => {
    if (stopped) return;
    let raw = "";
    let message;
    try {
      // binaryType is "arraybuffer", so non-string payloads are decoded as UTF-8.
      raw =
        typeof event.data === "string"
          ? event.data
          : Buffer.from(/** @type {ArrayBuffer} */ (event.data)).toString(
              "utf8",
            );
      message = JSON.parse(raw);
    } catch (err) {
      callbacks.onError(
        new Error(`Failed to parse server message: ${formatError(err)}`),
      );
      return;
    }
    if (!isObjectLike(message)) return;
    if (VOICE_DEBUG) {
      process.stderr.write(`[voiceInput] <- ${raw.slice(0, 800)}\n`);
    }

    // Provider-reported errors are surfaced but do not end the session.
    const errorText = hooks.extractError?.(message);
    if (errorText) {
      callbacks.onError(new Error(`${hooks.label} error: ${errorText}`));
      return;
    }

    if (!ready && hooks.isReadyMessage(message)) {
      ready = true;
      // Flush everything recorded while the session was being set up.
      for (const chunk of pendingAudio.splice(0)) {
        if (ws.readyState === WebSocket.OPEN) sendAudio(chunk);
      }
      return;
    }

    const transcript = hooks.extractTranscript(message);
    if (transcript && transcript.length > 0) {
      const normalized = normalizer.push(transcript);
      if (normalized.length > 0) {
        callbacks.onTranscript(normalized);
      }
    }
  });

  ws.addEventListener("error", (event) => {
    if (stopped) return;
    const message =
      /** @type {{ message?: string }} */ (event).message ?? "WebSocket error";
    callbacks.onError(new Error(`${hooks.label} WebSocket error: ${message}`));
    stop();
  });

  ws.addEventListener("close", (event) => {
    // Codes 1000 and 1005 are not reported as errors.
    if (!stopped && event.code !== 1000 && event.code !== 1005) {
      const reason = event.reason ? `: ${event.reason}` : "";
      callbacks.onError(
        new Error(
          `${hooks.label} WebSocket closed (code ${event.code}${reason})`,
        ),
      );
    }
    stopped = true;
    rec.stop();
    emitClose();
  });

  if (VOICE_DEBUG) {
    process.stderr.write(
      `[voiceInput] driver=${hooks.label} recorder=${recorder.command} ${recorder.args.join(" ")}\n`,
    );
  }

  /**
   * Stops the recorder and closes the WebSocket.
   *
   * **Note on asynchronicity:** This function is `async` only to satisfy the
   * {@link VoiceSession} interface. It is called without `await` from event
   * listeners (recorder exit, WebSocket error/close). Callers must not rely
   * on the returned promise because unhandled rejections would crash the
   * process. If the function is ever changed to perform real async work,
   * every call site must wrap it with `.catch(() => {})`.
   */
  async function stop() {
    if (stopped) return;
    stopped = true;
    rec.stop();
    pendingAudio.length = 0;
    if (
      ws.readyState === WebSocket.OPEN ||
      ws.readyState === WebSocket.CONNECTING
    ) {
      try {
        ws.close(1000, "client stop");
      } catch (err) {
        if (VOICE_DEBUG) {
          process.stderr.write(
            `[voiceInput] ws.close failed: ${formatError(err)}\n`,
          );
        }
      }
    }
    emitClose();
  }

  return { stop };
}
463
+
464
/**
 * Build a streaming filter that removes whitespace (space, tab, U+3000)
 * when the non-space characters on both sides satisfy `isCJKChar`.
 * Whitespace next to any other character is passed through untouched, so
 * "Windows を使う" keeps its space while "そう 、 声 で" loses them.
 *
 * State persists across `push` calls: whitespace at the end of one chunk is
 * held back until the next non-space character decides its fate.
 *
 * @returns {{ push: (text: string) => string, flush: () => string }}
 *   `push` returns the filtered text for a chunk; `flush` returns any
 *   whitespace still held back and resets the filter.
 */
export function createCJKSpaceNormalizer() {
  const WHITESPACE = new Set([" ", "\t", "\u3000"]);
  let lastEmitted = "";
  let heldSpaces = "";

  /**
   * @param {string} text
   * @returns {string}
   */
  function push(text) {
    /** @type {string[]} */
    const parts = [];
    for (const ch of text) {
      if (WHITESPACE.has(ch)) {
        heldSpaces += ch;
        continue;
      }
      if (heldSpaces !== "") {
        const dropSpaces = isCJKChar(lastEmitted) && isCJKChar(ch);
        if (!dropSpaces) {
          parts.push(heldSpaces);
        }
        heldSpaces = "";
      }
      parts.push(ch);
      lastEmitted = ch;
    }
    return parts.join("");
  }

  /**
   * @returns {string}
   */
  function flush() {
    const leftover = heldSpaces;
    heldSpaces = "";
    lastEmitted = "";
    return leftover;
  }

  return { push, flush };
}
510
+
511
/**
 * Report whether the first code point of `ch` belongs to a script that is
 * written without spaces between words: CJK symbols/punctuation and kana
 * (U+3000–U+33FF), CJK ideographs and their extensions, compatibility
 * ideographs, and half/full-width forms.
 *
 * Hangul syllables (U+AC00–U+D7AF) are deliberately NOT matched: Korean
 * separates words with spaces, so treating Hangul as "space-free" would make
 * the space normalizer delete genuine word boundaries in Korean transcripts.
 *
 * @param {string} ch - A single character; an empty string yields `false`.
 * @returns {boolean}
 */
function isCJKChar(ch) {
  const code = ch.codePointAt(0);
  if (code === undefined) return false;
  return (
    (code >= 0x3000 && code <= 0x33ff) ||
    (code >= 0x3400 && code <= 0x4dbf) ||
    (code >= 0x4e00 && code <= 0x9fff) ||
    (code >= 0xf900 && code <= 0xfaff) ||
    (code >= 0xff00 && code <= 0xffef) ||
    (code >= 0x20000 && code <= 0x2ffff)
  );
}
528
+
529
/**
 * Type guard: true for any non-null value whose `typeof` is "object"
 * (plain objects, arrays, and other object instances).
 *
 * @param {unknown} value
 * @returns {value is Record<string, unknown>}
 */
export function isObjectLike(value) {
  if (value === null) {
    return false;
  }
  return typeof value === "object";
}
536
+
537
/**
 * Turn a caught value into display text: the `message` of an `Error`,
 * otherwise the value converted with `String()`.
 *
 * @param {unknown} err
 * @returns {string}
 */
function formatError(err) {
  if (err instanceof Error) {
    return err.message;
  }
  return String(err);
}
@@ -0,0 +1,62 @@
1
+ /**
2
+ * @typedef {Object} VoiceToggleKey
3
+ * @property {number} byte
4
+ * @property {string} label
5
+ */
6
+
7
// Bytes reserved for other terminal/readline uses — cannot be used as a voice toggle.
//   0x03 = Ctrl-C (SIGINT)
//   0x04 = Ctrl-D (EOF / readline exit)
//   0x09 = Ctrl-I (Tab)
//   0x0a = Ctrl-J (LF / Enter)
//   0x0d = Ctrl-M (CR / Enter)
//   0x11 = Ctrl-Q (XON: resume terminal output)
//   0x13 = Ctrl-S (XOFF: suspend terminal output)
//   0x1b = Ctrl-[ (ESC: first byte of every arrow/function-key escape
//          sequence, so a single-byte matcher would fire on those keys too)
const RESERVED_TERMINAL_BYTES = new Set([
  0x03, 0x04, 0x09, 0x0a, 0x0d, 0x11, 0x13, 0x1b,
]);

/**
 * Parse a "ctrl-<char>" binding into the raw byte the terminal sends in
 * raw mode. Only Ctrl-<char> is supported because it is the only family
 * the pre-readline pipeline can recognize without a full key decoder.
 *
 * The spec is trimmed and lower-cased before parsing, so "Ctrl-O" and
 * " ctrl-o " are equivalent. `<char>` must be a letter or one of `\ ] ^ _`
 * (`[` parses but is rejected as reserved because it is ESC).
 *
 * @param {string | undefined} spec - Binding text; defaults to "ctrl-o".
 * @returns {VoiceToggleKey} The control byte and a display label such as
 *   "Ctrl-O".
 * @throws {Error} When the spec is not "ctrl-<char>", the character has no
 *   control byte, or the byte is reserved by the terminal/readline.
 */
export function parseVoiceToggleKey(spec) {
  const raw = (spec ?? "ctrl-o").trim().toLowerCase();

  const match = /^ctrl-(.)$/.exec(raw);
  if (!match) {
    throw new Error(
      `Invalid voiceInput.toggleKey "${spec}". Expected "ctrl-<char>".`,
    );
  }

  const ch = match[1];
  const code = ch.charCodeAt(0);

  // Subtracting a fixed offset from the character's ASCII code yields the
  // control byte (0x01–0x1f) the terminal sends for that Ctrl combination.
  let byte;
  if (code >= 0x61 && code <= 0x7a) {
    // a–z (0x61–0x7a): subtract 0x60 → 0x01 (Ctrl-A) – 0x1a (Ctrl-Z)
    byte = code - 0x60;
  } else if (code >= 0x5b && code <= 0x5f) {
    // [ \ ] ^ _ (0x5b–0x5f): subtract 0x40 → 0x1b (Ctrl-[) – 0x1f (Ctrl-_)
    byte = code - 0x40;
  } else {
    throw new Error(
      `Unsupported voiceInput.toggleKey "${spec}". Use ctrl-<letter> or ctrl-<[ \\ ] ^ _>.`,
    );
  }

  if (RESERVED_TERMINAL_BYTES.has(byte)) {
    throw new Error(
      `voiceInput.toggleKey "${spec}" conflicts with a reserved terminal/readline key.`,
    );
  }

  return { byte, label: `Ctrl-${ch.toUpperCase()}` };
}