@iinm/plain-agent 1.11.9 → 1.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -48
- package/config/config.predefined.json +18 -2
- package/package.json +1 -1
- package/src/cli/interactive.mjs +2 -118
- package/src/cli/interruptTransform.mjs +7 -22
- package/src/config.d.ts +0 -2
- package/src/config.mjs +0 -3
- package/src/main.mjs +0 -1
- package/src/cli/muteTransform.mjs +0 -26
- package/src/voice/gemini.mjs +0 -102
- package/src/voice/input.mjs +0 -29
- package/src/voice/openai.mjs +0 -102
- package/src/voice/session.mjs +0 -543
- package/src/voice/toggleKey.mjs +0 -62
package/README.md
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
[](https://deepwiki.com/iinm/plain-agent)
|
|
4
4
|
[](https://www.npmjs.com/package/@iinm/plain-agent)
|
|
5
5
|
[](https://packagephobia.com/result?p=@iinm/plain-agent)
|
|
6
|
-
[](https://socket.dev/npm/package/@iinm/plain-agent)
|
|
7
7
|
[](https://github.com/iinm/plain-agent/actions/workflows/github-code-scanning/codeql)
|
|
8
8
|
|
|
9
9
|
A lightweight terminal-based coding agent focused on safety and low token cost
|
|
@@ -25,7 +25,6 @@ A lightweight terminal-based coding agent focused on safety and low token cost
|
|
|
25
25
|
- [Prompts](#prompts)
|
|
26
26
|
- [Subagents](#subagents)
|
|
27
27
|
- [Claude Code Plugin Support](#claude-code-plugin-support)
|
|
28
|
-
- [Voice Input](#voice-input)
|
|
29
28
|
- [Appendix: Creating Least-Privilege Users for Cloud Providers](#appendix-creating-least-privilege-users-for-cloud-providers)
|
|
30
29
|
- [Developer Notes](#developer-notes)
|
|
31
30
|
|
|
@@ -848,52 +847,6 @@ Example:
|
|
|
848
847
|
plain install-claude-code-plugins
|
|
849
848
|
```
|
|
850
849
|
|
|
851
|
-
## Voice Input
|
|
852
|
-
|
|
853
|
-
Press **Ctrl-O** to start recording, then press it again to stop. Partial transcripts are inserted into the prompt as you speak, so you can edit and send them like regular text.
|
|
854
|
-
|
|
855
|
-
### Requirements
|
|
856
|
-
|
|
857
|
-
- A recording command on `PATH`: `arecord`, `sox`, or `ffmpeg`.
|
|
858
|
-
- An API key for the chosen provider.
|
|
859
|
-
- Your host must have microphone access.
|
|
860
|
-
|
|
861
|
-
### Providers
|
|
862
|
-
|
|
863
|
-
**OpenAI Realtime**
|
|
864
|
-
|
|
865
|
-
```js
|
|
866
|
-
// ~/.config/plain-agent/config.local.json
|
|
867
|
-
{
|
|
868
|
-
"voiceInput": {
|
|
869
|
-
"provider": "openai",
|
|
870
|
-
"apiKey": "<OPENAI_API_KEY>"
|
|
871
|
-
// "model": "gpt-4o-transcribe", // or "gpt-4o-mini-transcribe", "whisper-1"
|
|
872
|
-
// "language": "ja" // ISO-639-1 code. Improves accuracy and latency.
|
|
873
|
-
}
|
|
874
|
-
}
|
|
875
|
-
```
|
|
876
|
-
|
|
877
|
-
**Gemini Live**
|
|
878
|
-
|
|
879
|
-
```js
|
|
880
|
-
// ~/.config/plain-agent/config.local.json
|
|
881
|
-
{
|
|
882
|
-
"voiceInput": {
|
|
883
|
-
"provider": "gemini",
|
|
884
|
-
"apiKey": "<GEMINI_API_KEY>"
|
|
885
|
-
// "model": "gemini-3.1-flash-live-preview",
|
|
886
|
-
// "language": "ja"
|
|
887
|
-
}
|
|
888
|
-
}
|
|
889
|
-
```
|
|
890
|
-
|
|
891
|
-
### Options
|
|
892
|
-
|
|
893
|
-
- `toggleKey` — Rebind the toggle key. Accepts `"ctrl-<char>"` where `<char>`
|
|
894
|
-
is a letter (a-z) or one of `[ \ ] ^ _`. Defaults to `"ctrl-o"`.
|
|
895
|
-
- `recorder` — Override automatic recorder detection, e.g. `{ "command": "sox", "args": ["-q", "-d", "-b", "16", "-c", "1", "-r", "24000", "-e", "signed-integer", "-t", "raw", "-"] }`. It must write raw 16-bit little-endian mono PCM to stdout at 24 kHz (OpenAI) or 16 kHz (Gemini).
|
|
896
|
-
|
|
897
850
|
## Appendix: Creating Least-Privilege Users for Cloud Providers
|
|
898
851
|
|
|
899
852
|
<details>
|
|
package/config/config.predefined.json
CHANGED
|
@@ -5,9 +5,15 @@
|
|
|
5
5
|
"patterns": [
|
|
6
6
|
{
|
|
7
7
|
"toolName": "exec_command",
|
|
8
|
-
"input": { "command":
|
|
8
|
+
"input": { "command": "find" },
|
|
9
9
|
"action": "deny",
|
|
10
|
-
"reason": "Use
|
|
10
|
+
"reason": "Use fd instead; fd respects .gitignore by default"
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"toolName": "exec_command",
|
|
14
|
+
"input": { "command": "grep" },
|
|
15
|
+
"action": "deny",
|
|
16
|
+
"reason": "Use rg instead; rg respects .gitignore by default"
|
|
11
17
|
},
|
|
12
18
|
{
|
|
13
19
|
"toolName": "exec_command",
|
|
@@ -146,6 +152,16 @@
|
|
|
146
152
|
}
|
|
147
153
|
],
|
|
148
154
|
"tests": [
|
|
155
|
+
{
|
|
156
|
+
"desc": "find should be denied",
|
|
157
|
+
"toolUse": { "toolName": "exec_command", "input": { "command": "find" } },
|
|
158
|
+
"expectedAction": "deny"
|
|
159
|
+
},
|
|
160
|
+
{
|
|
161
|
+
"desc": "grep should be denied",
|
|
162
|
+
"toolUse": { "toolName": "exec_command", "input": { "command": "grep" } },
|
|
163
|
+
"expectedAction": "deny"
|
|
164
|
+
},
|
|
149
165
|
{
|
|
150
166
|
"desc": "ls should be allowed",
|
|
151
167
|
"toolUse": { "toolName": "exec_command", "input": { "command": "ls" } },
|
package/package.json
CHANGED
package/src/cli/interactive.mjs
CHANGED
|
@@ -2,8 +2,6 @@
|
|
|
2
2
|
* @import { UserEventEmitter, AgentEventEmitter, AgentCommands } from "../agent"
|
|
3
3
|
* @import { ClaudeCodePlugin } from "../claudeCodePlugin.mjs"
|
|
4
4
|
* @import { Tool, SandboxModeProvider } from "../tool"
|
|
5
|
-
* @import { VoiceInputConfig } from "../voice/input.mjs"
|
|
6
|
-
* @import { VoiceSession } from "../voice/session.mjs"
|
|
7
5
|
*/
|
|
8
6
|
|
|
9
7
|
import readline from "node:readline";
|
|
@@ -11,8 +9,6 @@ import { styleText } from "node:util";
|
|
|
11
9
|
import { appendUsageRecord, buildUsageRecord } from "../usageStore.mjs";
|
|
12
10
|
import { createSequentialExecutor } from "../utils/createSequentialExecutor.mjs";
|
|
13
11
|
import { notify } from "../utils/notify.mjs";
|
|
14
|
-
import { startVoiceSession } from "../voice/input.mjs";
|
|
15
|
-
import { parseVoiceToggleKey } from "../voice/toggleKey.mjs";
|
|
16
12
|
import { createCommandHandler } from "./commands.mjs";
|
|
17
13
|
import { createCompleter, SLASH_COMMANDS } from "./completer.mjs";
|
|
18
14
|
import {
|
|
@@ -21,7 +17,6 @@ import {
|
|
|
21
17
|
printMessage,
|
|
22
18
|
} from "./formatter.mjs";
|
|
23
19
|
import { createInterruptTransform } from "./interruptTransform.mjs";
|
|
24
|
-
import { createMuteTransform } from "./muteTransform.mjs";
|
|
25
20
|
import { createPasteHandler } from "./pasteTransform.mjs";
|
|
26
21
|
import { createStreamFormatter } from "./streamFormatter.mjs";
|
|
27
22
|
|
|
@@ -67,7 +62,6 @@ const HELP_MESSAGE = [
|
|
|
67
62
|
* @property {boolean} sandbox
|
|
68
63
|
* @property {() => Promise<void>} onStop
|
|
69
64
|
* @property {ClaudeCodePlugin[]} [claudeCodePlugins]
|
|
70
|
-
* @property {VoiceInputConfig} [voiceInput]
|
|
71
65
|
* @property {Tool & SandboxModeProvider} [execCommandTool]
|
|
72
66
|
*/
|
|
73
67
|
|
|
@@ -112,7 +106,6 @@ export function startInteractiveSession({
|
|
|
112
106
|
sandbox,
|
|
113
107
|
onStop,
|
|
114
108
|
claudeCodePlugins,
|
|
115
|
-
voiceInput,
|
|
116
109
|
execCommandTool,
|
|
117
110
|
}) {
|
|
118
111
|
/** @type {{ turn: boolean, multiLineBuffer: string[] | null, subagentName: string, toolSpinnerIndex: number, toolSpinnerLastTime: number }} */
|
|
@@ -127,19 +120,9 @@ export function startInteractiveSession({
|
|
|
127
120
|
const SPINNER_FRAMES = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];
|
|
128
121
|
const SPINNER_INTERVAL_MS = 80;
|
|
129
122
|
|
|
130
|
-
/**
|
|
131
|
-
* Active voice input session, or null when not recording.
|
|
132
|
-
* @type {{ session: VoiceSession, startCursor: number, transcriptLength: number } | null}
|
|
133
|
-
*/
|
|
134
|
-
let voice = null;
|
|
135
|
-
|
|
136
123
|
// Create the stream buffer instance for this session
|
|
137
124
|
const streamBuffer = createStreamBuffer();
|
|
138
125
|
|
|
139
|
-
// Parse the voice toggle key once at startup so misconfiguration fails
|
|
140
|
-
// loudly instead of silently falling back.
|
|
141
|
-
const voiceToggle = parseVoiceToggleKey(voiceInput?.toggleKey);
|
|
142
|
-
|
|
143
126
|
const getCliPrompt = (subagentName = "", flashMessage = "") =>
|
|
144
127
|
[
|
|
145
128
|
"",
|
|
@@ -198,100 +181,7 @@ export function startInteractiveSession({
|
|
|
198
181
|
cli.prompt();
|
|
199
182
|
};
|
|
200
183
|
|
|
201
|
-
const stopVoiceSession = async () => {
|
|
202
|
-
if (!voice) return;
|
|
203
|
-
const current = voice;
|
|
204
|
-
voice = null;
|
|
205
|
-
await current.session.stop();
|
|
206
|
-
cli.setPrompt(currentCliPrompt);
|
|
207
|
-
// @ts-expect-error - internal property
|
|
208
|
-
cli._refreshLine?.();
|
|
209
|
-
};
|
|
210
|
-
|
|
211
|
-
const handleVoiceToggle = () => {
|
|
212
|
-
// Ignore while the agent is working.
|
|
213
|
-
if (!state.turn) return;
|
|
214
|
-
|
|
215
|
-
if (voice) {
|
|
216
|
-
stopVoiceSession();
|
|
217
|
-
return;
|
|
218
|
-
}
|
|
219
|
-
|
|
220
|
-
if (!voiceInput) {
|
|
221
|
-
cli.setPrompt(
|
|
222
|
-
getCliPrompt(
|
|
223
|
-
state.subagentName,
|
|
224
|
-
styleText(
|
|
225
|
-
"yellow",
|
|
226
|
-
`Voice input not configured. Set \`voiceInput\` in your config to enable ${voiceToggle.label}.`,
|
|
227
|
-
),
|
|
228
|
-
),
|
|
229
|
-
);
|
|
230
|
-
cli.prompt(true);
|
|
231
|
-
return;
|
|
232
|
-
}
|
|
233
|
-
|
|
234
|
-
const startCursor = cli.cursor;
|
|
235
|
-
const session = startVoiceSession({
|
|
236
|
-
config: voiceInput,
|
|
237
|
-
callbacks: {
|
|
238
|
-
onTranscript: (delta) => {
|
|
239
|
-
if (!voice) return;
|
|
240
|
-
const insertAt = voice.startCursor + voice.transcriptLength;
|
|
241
|
-
// Insert delta at the recording's insertion point. User input is
|
|
242
|
-
// swallowed while recording, so the buffer around `insertAt` is
|
|
243
|
-
// stable.
|
|
244
|
-
const before = cli.line.slice(0, insertAt);
|
|
245
|
-
const after = cli.line.slice(insertAt);
|
|
246
|
-
// `line` and `cursor` are declared readonly in the Node typings but
|
|
247
|
-
// are writable at runtime — the existing code already patches
|
|
248
|
-
// `_refreshLine` in the same way.
|
|
249
|
-
const mutableCli = /** @type {{ line: string, cursor: number }} */ (
|
|
250
|
-
/** @type {unknown} */ (cli)
|
|
251
|
-
);
|
|
252
|
-
mutableCli.line = before + delta + after;
|
|
253
|
-
mutableCli.cursor = insertAt + delta.length;
|
|
254
|
-
voice.transcriptLength += delta.length;
|
|
255
|
-
// @ts-expect-error - internal property
|
|
256
|
-
cli._refreshLine?.();
|
|
257
|
-
},
|
|
258
|
-
onError: (err) => {
|
|
259
|
-
voice = null;
|
|
260
|
-
cli.setPrompt(
|
|
261
|
-
getCliPrompt(
|
|
262
|
-
state.subagentName,
|
|
263
|
-
styleText("red", `Voice input error: ${err.message}`),
|
|
264
|
-
),
|
|
265
|
-
);
|
|
266
|
-
cli.prompt(true);
|
|
267
|
-
},
|
|
268
|
-
onClose: () => {
|
|
269
|
-
if (!voice) return;
|
|
270
|
-
voice = null;
|
|
271
|
-
cli.setPrompt(currentCliPrompt);
|
|
272
|
-
// @ts-expect-error - internal property
|
|
273
|
-
cli._refreshLine?.();
|
|
274
|
-
},
|
|
275
|
-
},
|
|
276
|
-
});
|
|
277
|
-
voice = { session, startCursor, transcriptLength: 0 };
|
|
278
|
-
cli.setPrompt(
|
|
279
|
-
getCliPrompt(
|
|
280
|
-
state.subagentName,
|
|
281
|
-
styleText(["red", "bold"], `● REC (${voiceToggle.label} to stop)`),
|
|
282
|
-
),
|
|
283
|
-
);
|
|
284
|
-
// @ts-expect-error - internal property
|
|
285
|
-
cli._refreshLine?.();
|
|
286
|
-
};
|
|
287
|
-
|
|
288
184
|
const handleCtrlC = () => {
|
|
289
|
-
// Stop voice recording first if active.
|
|
290
|
-
if (voice) {
|
|
291
|
-
stopVoiceSession();
|
|
292
|
-
return;
|
|
293
|
-
}
|
|
294
|
-
|
|
295
185
|
// Agent turn: pause auto-approve; do not clear input.
|
|
296
186
|
if (!state.turn) {
|
|
297
187
|
agentCommands.pauseAutoApprove();
|
|
@@ -347,20 +237,14 @@ export function startInteractiveSession({
|
|
|
347
237
|
};
|
|
348
238
|
|
|
349
239
|
// Pre-readline pipeline:
|
|
350
|
-
// stdin -> interrupt (Ctrl-C / Ctrl-D) -> mute -> paste (bracketed paste) -> readline
|
|
240
|
+
// stdin -> interrupt (Ctrl-C / Ctrl-D) -> paste (bracketed paste) -> readline
|
|
351
241
|
const interrupt = createInterruptTransform({
|
|
352
242
|
onCtrlC: handleCtrlC,
|
|
353
243
|
onCtrlD: handleCtrlD,
|
|
354
|
-
onVoiceToggle: handleVoiceToggle,
|
|
355
|
-
voiceToggleByte: voiceToggle.byte,
|
|
356
244
|
});
|
|
357
|
-
// While a voice session is recording, swallow all stdin bytes other than
|
|
358
|
-
// Ctrl-C / Ctrl-D / the voice toggle key so transcript insertion stays
|
|
359
|
-
// consistent.
|
|
360
|
-
const mute = createMuteTransform({ isMuted: () => voice !== null });
|
|
361
245
|
const paste = createPasteHandler();
|
|
362
246
|
|
|
363
|
-
process.stdin.pipe(interrupt).pipe(mute).pipe(paste.transform);
|
|
247
|
+
process.stdin.pipe(interrupt).pipe(paste.transform);
|
|
364
248
|
|
|
365
249
|
// Enable bracketed paste mode
|
|
366
250
|
if (process.stdout.isTTY) {
|
|
package/src/cli/interruptTransform.mjs
CHANGED
|
@@ -1,31 +1,21 @@
|
|
|
1
1
|
import { Transform } from "node:stream";
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
|
-
* Create a Transform that intercepts Ctrl-C (0x03)
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
4
|
+
* Create a Transform that intercepts Ctrl-C (0x03) and Ctrl-D (0x04).
|
|
5
|
+
* When one of those bytes is seen anywhere in a chunk, the corresponding
|
|
6
|
+
* callback is invoked and the entire chunk is dropped so that downstream
|
|
7
|
+
* consumers (e.g. readline) never observe it. All other input flows
|
|
8
|
+
* through unchanged.
|
|
9
9
|
*
|
|
10
10
|
* Priority when multiple handled bytes appear in the same chunk:
|
|
11
|
-
* Ctrl-C > Ctrl-D
|
|
11
|
+
* Ctrl-C > Ctrl-D.
|
|
12
12
|
*
|
|
13
13
|
* @param {object} handlers
|
|
14
14
|
* @param {() => void} handlers.onCtrlC - Called when Ctrl-C is detected
|
|
15
15
|
* @param {() => void} handlers.onCtrlD - Called when Ctrl-D is detected
|
|
16
|
-
* @param {() => void} [handlers.onVoiceToggle]
|
|
17
|
-
* Called when the voice toggle byte is detected.
|
|
18
|
-
* @param {number} [handlers.voiceToggleByte]
|
|
19
|
-
* Byte value for the voice toggle key. Defaults to 0x0f (Ctrl-O).
|
|
20
16
|
* @returns {Transform}
|
|
21
17
|
*/
|
|
22
|
-
export function createInterruptTransform({
|
|
23
|
-
onCtrlC,
|
|
24
|
-
onCtrlD,
|
|
25
|
-
onVoiceToggle,
|
|
26
|
-
voiceToggleByte = 0x0f,
|
|
27
|
-
}) {
|
|
28
|
-
const voiceToggleChar = String.fromCharCode(voiceToggleByte);
|
|
18
|
+
export function createInterruptTransform({ onCtrlC, onCtrlD }) {
|
|
29
19
|
return new Transform({
|
|
30
20
|
transform(chunk, _encoding, callback) {
|
|
31
21
|
const data = chunk.toString("utf8");
|
|
@@ -39,11 +29,6 @@ export function createInterruptTransform({
|
|
|
39
29
|
callback();
|
|
40
30
|
return;
|
|
41
31
|
}
|
|
42
|
-
if (onVoiceToggle && data.includes(voiceToggleChar)) {
|
|
43
|
-
onVoiceToggle();
|
|
44
|
-
callback();
|
|
45
|
-
return;
|
|
46
|
-
}
|
|
47
32
|
this.push(chunk);
|
|
48
33
|
callback();
|
|
49
34
|
},
|
package/src/config.d.ts
CHANGED
|
@@ -10,7 +10,6 @@ import {
|
|
|
10
10
|
WebSearchToolGeminiOptions,
|
|
11
11
|
WebSearchToolGeminiVertexAIOptions,
|
|
12
12
|
} from "./tools/webSearch.mjs";
|
|
13
|
-
import { VoiceInputConfig } from "./voice/input.mjs";
|
|
14
13
|
|
|
15
14
|
/**
|
|
16
15
|
* JSON-serializable webFetch configuration.
|
|
@@ -88,7 +87,6 @@ export type AppConfig = {
|
|
|
88
87
|
};
|
|
89
88
|
mcpServers?: Record<string, MCPServerConfig>;
|
|
90
89
|
notifyCmd?: { command: string; args?: string[] };
|
|
91
|
-
voiceInput?: VoiceInputConfig;
|
|
92
90
|
claudeCodePlugins?: ClaudeCodePluginRepo[];
|
|
93
91
|
};
|
|
94
92
|
|
package/src/config.mjs
CHANGED
|
@@ -129,9 +129,6 @@ export async function loadAppConfig(options = {}) {
|
|
|
129
129
|
...(merged.claudeCodePlugins ?? []),
|
|
130
130
|
...(config.claudeCodePlugins ?? []),
|
|
131
131
|
],
|
|
132
|
-
voiceInput: config.voiceInput
|
|
133
|
-
? { ...(merged.voiceInput ?? {}), ...config.voiceInput }
|
|
134
|
-
: merged.voiceInput,
|
|
135
132
|
};
|
|
136
133
|
}
|
|
137
134
|
|
package/src/main.mjs
CHANGED
|
package/src/cli/muteTransform.mjs
DELETED
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
import { Transform } from "node:stream";
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* Create a Transform that swallows all chunks while `isMuted()` returns true,
|
|
5
|
-
* and passes them through unchanged while it returns false.
|
|
6
|
-
*
|
|
7
|
-
* Intended to sit between `createInterruptTransform` and the paste handler so
|
|
8
|
-
* that callers can fully silence regular stdin input during special modes
|
|
9
|
-
* (e.g. while a voice input session is recording) without coupling that
|
|
10
|
-
* concern to the interrupt-detection logic.
|
|
11
|
-
*
|
|
12
|
-
* @param {object} options
|
|
13
|
-
* @param {() => boolean} options.isMuted
|
|
14
|
-
* Called for each incoming chunk; when true the chunk is dropped.
|
|
15
|
-
* @returns {Transform}
|
|
16
|
-
*/
|
|
17
|
-
export function createMuteTransform({ isMuted }) {
|
|
18
|
-
return new Transform({
|
|
19
|
-
transform(chunk, _encoding, callback) {
|
|
20
|
-
if (!isMuted()) {
|
|
21
|
-
this.push(chunk);
|
|
22
|
-
}
|
|
23
|
-
callback();
|
|
24
|
-
},
|
|
25
|
-
});
|
|
26
|
-
}
|
package/src/voice/gemini.mjs
DELETED
|
@@ -1,102 +0,0 @@
|
|
|
1
|
-
import { isObjectLike, startWebSocketVoiceSession } from "./session.mjs";
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* @import { VoiceProviderHooks, VoiceRecorderConfig, VoiceSession, VoiceSessionCallbacks } from "./session.mjs"
|
|
5
|
-
*/
|
|
6
|
-
|
|
7
|
-
/**
|
|
8
|
-
* @typedef {Object} VoiceInputGeminiConfig
|
|
9
|
-
* @property {"gemini"} provider
|
|
10
|
-
* @property {string} apiKey
|
|
11
|
-
* @property {string} [model] - Defaults to "gemini-3.1-flash-live-preview".
|
|
12
|
-
* @property {string} [language] - ISO-639-1 code (e.g. "ja", "en"). Passed to the model as a system instruction since Gemini Live has no native language hint for input transcription.
|
|
13
|
-
* @property {string} [baseURL]
|
|
14
|
-
* @property {VoiceRecorderConfig} [recorder]
|
|
15
|
-
* @property {string} [toggleKey]
|
|
16
|
-
*/
|
|
17
|
-
|
|
18
|
-
const GEMINI_DEFAULT_MODEL = "gemini-3.1-flash-live-preview";
|
|
19
|
-
const GEMINI_DEFAULT_WS =
|
|
20
|
-
"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent";
|
|
21
|
-
const GEMINI_SAMPLE_RATE = 16000;
|
|
22
|
-
const GEMINI_LABEL = "Gemini Live";
|
|
23
|
-
|
|
24
|
-
/**
|
|
25
|
-
* Start a voice input session backed by the Gemini Live BidiGenerateContent
|
|
26
|
-
* WebSocket. Spawns a recorder, streams PCM as base64 JSON messages, and
|
|
27
|
-
* forwards transcript deltas via `onTranscript`.
|
|
28
|
-
*
|
|
29
|
-
* Gemini Live was designed for voice agents, not pure STT, so the setup
|
|
30
|
-
* message forces `maxOutputTokens: 1` and disables thinking on 2.5 models
|
|
31
|
-
* to minimise wasted audio output.
|
|
32
|
-
*
|
|
33
|
-
* @param {object} options
|
|
34
|
-
* @param {VoiceInputGeminiConfig} options.config
|
|
35
|
-
* @param {VoiceSessionCallbacks} options.callbacks
|
|
36
|
-
* @returns {VoiceSession}
|
|
37
|
-
*/
|
|
38
|
-
export function startGeminiVoiceSession({ config, callbacks }) {
|
|
39
|
-
/** @type {VoiceProviderHooks<VoiceInputGeminiConfig>} */
|
|
40
|
-
const hooks = {
|
|
41
|
-
label: GEMINI_LABEL,
|
|
42
|
-
sampleRate: GEMINI_SAMPLE_RATE,
|
|
43
|
-
buildWsUrl(config) {
|
|
44
|
-
const base = config.baseURL ?? GEMINI_DEFAULT_WS;
|
|
45
|
-
return `${base}?key=${encodeURIComponent(config.apiKey)}`;
|
|
46
|
-
},
|
|
47
|
-
buildSetupMessage(config) {
|
|
48
|
-
const model = config.model ?? GEMINI_DEFAULT_MODEL;
|
|
49
|
-
/** @type {Record<string, unknown>} */
|
|
50
|
-
const generationConfig = {
|
|
51
|
-
// https://ai.google.dev/gemini-api/docs/live-api/capabilities#response-modalities
|
|
52
|
-
// > The native audio models only support `AUDIO` response modality.
|
|
53
|
-
responseModalities: ["AUDIO"],
|
|
54
|
-
maxOutputTokens: 1,
|
|
55
|
-
};
|
|
56
|
-
if (model.includes("2.5")) {
|
|
57
|
-
generationConfig.thinkingConfig = { thinkingBudget: 0 };
|
|
58
|
-
}
|
|
59
|
-
/** @type {Record<string, unknown>} */
|
|
60
|
-
const setup = {
|
|
61
|
-
model: `models/${model}`,
|
|
62
|
-
generationConfig,
|
|
63
|
-
inputAudioTranscription: {},
|
|
64
|
-
};
|
|
65
|
-
if (config.language) {
|
|
66
|
-
setup.systemInstruction = {
|
|
67
|
-
parts: [{ text: `The user is speaking in ${config.language}.` }],
|
|
68
|
-
};
|
|
69
|
-
}
|
|
70
|
-
return { setup };
|
|
71
|
-
},
|
|
72
|
-
isReadyMessage(message) {
|
|
73
|
-
return isObjectLike(message) && "setupComplete" in message;
|
|
74
|
-
},
|
|
75
|
-
extractTranscript(message) {
|
|
76
|
-
if (!isObjectLike(message)) return undefined;
|
|
77
|
-
const serverContent = message.serverContent;
|
|
78
|
-
if (!isObjectLike(serverContent)) return undefined;
|
|
79
|
-
const transcription = serverContent.inputTranscription;
|
|
80
|
-
if (
|
|
81
|
-
isObjectLike(transcription) &&
|
|
82
|
-
typeof transcription.text === "string" &&
|
|
83
|
-
transcription.text.length > 0
|
|
84
|
-
) {
|
|
85
|
-
return transcription.text;
|
|
86
|
-
}
|
|
87
|
-
return undefined;
|
|
88
|
-
},
|
|
89
|
-
buildAudioPayload(chunk, sampleRate) {
|
|
90
|
-
return {
|
|
91
|
-
realtimeInput: {
|
|
92
|
-
audio: {
|
|
93
|
-
data: chunk.toString("base64"),
|
|
94
|
-
mimeType: `audio/pcm;rate=${sampleRate}`,
|
|
95
|
-
},
|
|
96
|
-
},
|
|
97
|
-
};
|
|
98
|
-
},
|
|
99
|
-
};
|
|
100
|
-
|
|
101
|
-
return startWebSocketVoiceSession({ hooks, config, callbacks });
|
|
102
|
-
}
|
package/src/voice/input.mjs
DELETED
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
import { startGeminiVoiceSession } from "./gemini.mjs";
|
|
2
|
-
import { startOpenAIVoiceSession } from "./openai.mjs";
|
|
3
|
-
import { failVoiceSessionAsync } from "./session.mjs";
|
|
4
|
-
|
|
5
|
-
/**
|
|
6
|
-
* @typedef {import("./openai.mjs").VoiceInputOpenAIConfig | import("./gemini.mjs").VoiceInputGeminiConfig} VoiceInputConfig
|
|
7
|
-
*/
|
|
8
|
-
/**
|
|
9
|
-
* Start a voice input session. Dispatches to the provider-specific
|
|
10
|
-
* implementation based on `config.provider`.
|
|
11
|
-
*
|
|
12
|
-
* @param {object} options
|
|
13
|
-
* @param {VoiceInputConfig} options.config
|
|
14
|
-
* @param {import("./session.mjs").VoiceSessionCallbacks} options.callbacks
|
|
15
|
-
* @returns {import("./session.mjs").VoiceSession}
|
|
16
|
-
*/
|
|
17
|
-
export function startVoiceSession({ config, callbacks }) {
|
|
18
|
-
if (config.provider === "openai") {
|
|
19
|
-
return startOpenAIVoiceSession({ config, callbacks });
|
|
20
|
-
}
|
|
21
|
-
if (config.provider === "gemini") {
|
|
22
|
-
return startGeminiVoiceSession({ config, callbacks });
|
|
23
|
-
}
|
|
24
|
-
const provider = /** @type {{ provider: string }} */ (config).provider;
|
|
25
|
-
return failVoiceSessionAsync(
|
|
26
|
-
callbacks,
|
|
27
|
-
new Error(`Unsupported voiceInput.provider: ${provider}`),
|
|
28
|
-
);
|
|
29
|
-
}
|
package/src/voice/openai.mjs
DELETED
|
@@ -1,102 +0,0 @@
|
|
|
1
|
-
import { isObjectLike, startWebSocketVoiceSession } from "./session.mjs";
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* @import { VoiceProviderHooks, VoiceRecorderConfig, VoiceSession, VoiceSessionCallbacks } from "./session.mjs"
|
|
5
|
-
*/
|
|
6
|
-
|
|
7
|
-
/**
|
|
8
|
-
* @typedef {Object} VoiceInputOpenAIConfig
|
|
9
|
-
* @property {"openai"} provider
|
|
10
|
-
* @property {string} apiKey
|
|
11
|
-
* @property {string} [model] - Transcription model. Defaults to "gpt-realtime-whisper".
|
|
12
|
-
* @property {string} [language] - ISO-639-1 code (e.g. "ja", "en"). Improves accuracy and latency when set.
|
|
13
|
-
* @property {string} [baseURL]
|
|
14
|
-
* @property {VoiceRecorderConfig} [recorder]
|
|
15
|
-
* @property {string} [toggleKey] - "ctrl-<char>". Defaults to "ctrl-o".
|
|
16
|
-
*/
|
|
17
|
-
|
|
18
|
-
const OPENAI_DEFAULT_TRANSCRIPTION_MODEL = "gpt-realtime-whisper";
|
|
19
|
-
const OPENAI_DEFAULT_WS = "wss://api.openai.com/v1/realtime";
|
|
20
|
-
const OPENAI_SAMPLE_RATE = 24000;
|
|
21
|
-
const OPENAI_LABEL = "OpenAI Realtime";
|
|
22
|
-
|
|
23
|
-
/**
|
|
24
|
-
* Start a voice input session backed by the OpenAI Realtime transcription
|
|
25
|
-
* WebSocket. Spawns a recorder, streams PCM as base64 JSON messages, and
|
|
26
|
-
* forwards transcript deltas via `onTranscript`.
|
|
27
|
-
*
|
|
28
|
-
* @param {object} options
|
|
29
|
-
* @param {VoiceInputOpenAIConfig} options.config
|
|
30
|
-
* @param {VoiceSessionCallbacks} options.callbacks
|
|
31
|
-
* @returns {VoiceSession}
|
|
32
|
-
*/
|
|
33
|
-
export function startOpenAIVoiceSession({ config, callbacks }) {
|
|
34
|
-
/** @type {VoiceProviderHooks<VoiceInputOpenAIConfig>} */
|
|
35
|
-
const hooks = {
|
|
36
|
-
label: OPENAI_LABEL,
|
|
37
|
-
sampleRate: OPENAI_SAMPLE_RATE,
|
|
38
|
-
buildWsUrl(config) {
|
|
39
|
-
const base = config.baseURL ?? OPENAI_DEFAULT_WS;
|
|
40
|
-
return `${base}?intent=transcription`;
|
|
41
|
-
},
|
|
42
|
-
buildWsOptions(config) {
|
|
43
|
-
return {
|
|
44
|
-
headers: {
|
|
45
|
-
Authorization: `Bearer ${config.apiKey}`,
|
|
46
|
-
},
|
|
47
|
-
};
|
|
48
|
-
},
|
|
49
|
-
buildSetupMessage(config) {
|
|
50
|
-
const model = config.model ?? OPENAI_DEFAULT_TRANSCRIPTION_MODEL;
|
|
51
|
-
/** @type {{ model: string, language?: string }} */
|
|
52
|
-
const transcription = { model };
|
|
53
|
-
if (config.language) transcription.language = config.language;
|
|
54
|
-
return {
|
|
55
|
-
type: "session.update",
|
|
56
|
-
session: {
|
|
57
|
-
type: "transcription",
|
|
58
|
-
audio: {
|
|
59
|
-
input: {
|
|
60
|
-
format: { type: "audio/pcm", rate: OPENAI_SAMPLE_RATE },
|
|
61
|
-
transcription,
|
|
62
|
-
},
|
|
63
|
-
},
|
|
64
|
-
},
|
|
65
|
-
};
|
|
66
|
-
},
|
|
67
|
-
isReadyMessage(message) {
|
|
68
|
-
return (
|
|
69
|
-
isObjectLike(message) &&
|
|
70
|
-
(message.type === "session.created" ||
|
|
71
|
-
message.type === "session.updated")
|
|
72
|
-
);
|
|
73
|
-
},
|
|
74
|
-
extractError(message) {
|
|
75
|
-
if (!isObjectLike(message) || message.type !== "error") return undefined;
|
|
76
|
-
const error = message.error;
|
|
77
|
-
if (!isObjectLike(error)) return undefined;
|
|
78
|
-
return typeof error.message === "string"
|
|
79
|
-
? error.message
|
|
80
|
-
: JSON.stringify(error);
|
|
81
|
-
},
|
|
82
|
-
extractTranscript(message) {
|
|
83
|
-
if (
|
|
84
|
-
isObjectLike(message) &&
|
|
85
|
-
message.type === "conversation.item.input_audio_transcription.delta" &&
|
|
86
|
-
typeof message.delta === "string" &&
|
|
87
|
-
message.delta.length > 0
|
|
88
|
-
) {
|
|
89
|
-
return message.delta;
|
|
90
|
-
}
|
|
91
|
-
return undefined;
|
|
92
|
-
},
|
|
93
|
-
buildAudioPayload(chunk, _sampleRate) {
|
|
94
|
-
return {
|
|
95
|
-
type: "input_audio_buffer.append",
|
|
96
|
-
audio: chunk.toString("base64"),
|
|
97
|
-
};
|
|
98
|
-
},
|
|
99
|
-
};
|
|
100
|
-
|
|
101
|
-
return startWebSocketVoiceSession({ hooks, config, callbacks });
|
|
102
|
-
}
|
package/src/voice/session.mjs
DELETED
|
@@ -1,543 +0,0 @@
|
|
|
1
|
-
import { spawn, spawnSync } from "node:child_process";
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* @typedef {Object} VoiceRecorderConfig
|
|
5
|
-
* @property {string} command
|
|
6
|
-
* @property {string[]} args
|
|
7
|
-
* Must write raw 16-bit little-endian mono PCM to stdout at the sample
|
|
8
|
-
* rate required by the chosen provider (24 kHz for OpenAI, 16 kHz for
|
|
9
|
-
* Gemini).
|
|
10
|
-
*/
|
|
11
|
-
|
|
12
|
-
/**
|
|
13
|
-
* @typedef {Object} VoiceSessionCallbacks
|
|
14
|
-
* @property {(text: string) => void} onTranscript
|
|
15
|
-
* @property {(error: Error) => void} onError
|
|
16
|
-
* @property {() => void} [onClose]
|
|
17
|
-
*/
|
|
18
|
-
|
|
19
|
-
/**
|
|
20
|
-
* @typedef {Object} VoiceSession
|
|
21
|
-
* @property {() => Promise<void>} stop
|
|
22
|
-
*/
|
|
23
|
-
|
|
24
|
-
/**
|
|
25
|
-
* @typedef {Object} RecorderHandle
|
|
26
|
-
* @property {() => void} stop
|
|
27
|
-
*/
|
|
28
|
-
|
|
29
|
-
export const VOICE_DEBUG = process.env.PLAIN_VOICE_DEBUG === "1";
|
|
30
|
-
|
|
31
|
-
/**
|
|
32
|
-
* @param {number} sampleRate
|
|
33
|
-
* @returns {VoiceRecorderConfig[]}
|
|
34
|
-
*/
|
|
35
|
-
export function getRecorderCandidates(sampleRate) {
|
|
36
|
-
const rate = String(sampleRate);
|
|
37
|
-
const isMac = process.platform === "darwin";
|
|
38
|
-
/** @type {VoiceRecorderConfig[]} */
|
|
39
|
-
const candidates = [];
|
|
40
|
-
|
|
41
|
-
if (!isMac) {
|
|
42
|
-
candidates.push({
|
|
43
|
-
command: "arecord",
|
|
44
|
-
args: ["-q", "-f", "S16_LE", "-c", "1", "-r", rate, "-t", "raw"],
|
|
45
|
-
});
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
candidates.push({
|
|
49
|
-
command: "sox",
|
|
50
|
-
args: [
|
|
51
|
-
"-q",
|
|
52
|
-
"-d",
|
|
53
|
-
"-b",
|
|
54
|
-
"16",
|
|
55
|
-
"-c",
|
|
56
|
-
"1",
|
|
57
|
-
"-r",
|
|
58
|
-
rate,
|
|
59
|
-
"-e",
|
|
60
|
-
"signed-integer",
|
|
61
|
-
"-t",
|
|
62
|
-
"raw",
|
|
63
|
-
"-",
|
|
64
|
-
],
|
|
65
|
-
});
|
|
66
|
-
|
|
67
|
-
const ffmpegInput = isMac
|
|
68
|
-
? ["-f", "avfoundation", "-i", ":0"]
|
|
69
|
-
: ["-f", "alsa", "-i", "default"];
|
|
70
|
-
candidates.push({
|
|
71
|
-
command: "ffmpeg",
|
|
72
|
-
args: [
|
|
73
|
-
"-hide_banner",
|
|
74
|
-
"-loglevel",
|
|
75
|
-
"error",
|
|
76
|
-
...ffmpegInput,
|
|
77
|
-
"-ac",
|
|
78
|
-
"1",
|
|
79
|
-
"-ar",
|
|
80
|
-
rate,
|
|
81
|
-
"-f",
|
|
82
|
-
"s16le",
|
|
83
|
-
"-",
|
|
84
|
-
],
|
|
85
|
-
});
|
|
86
|
-
|
|
87
|
-
return candidates;
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
/**
|
|
91
|
-
* @param {VoiceRecorderConfig[]} candidates
|
|
92
|
-
* @returns {VoiceRecorderConfig | null}
|
|
93
|
-
*/
|
|
94
|
-
export function detectRecorder(candidates) {
|
|
95
|
-
return candidates.find((c) => isCommandAvailable(c.command)) ?? null;
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
/**
|
|
99
|
-
* @param {string} command
|
|
100
|
-
*/
|
|
101
|
-
export function isCommandAvailable(command) {
|
|
102
|
-
if (process.platform === "win32") {
|
|
103
|
-
const result = spawnSync("where", [command], { stdio: "ignore" });
|
|
104
|
-
return result.status === 0;
|
|
105
|
-
}
|
|
106
|
-
const result = spawnSync("sh", ["-c", `command -v ${command}`], {
|
|
107
|
-
stdio: "ignore",
|
|
108
|
-
});
|
|
109
|
-
return result.status === 0;
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
/**
 * Launch the recorder command and forward what it produces to callbacks.
 *
 * The child runs with stdin ignored and stdout/stderr piped. Every stdout
 * chunk goes to `onAudio`. Stderr output is accumulated and only used to
 * enrich the error message when the process exits with a non-zero code.
 * Nothing here is specific to a speech-to-text provider.
 *
 * @param {object} options
 * @param {VoiceRecorderConfig} options.recorder
 * @param {(chunk: Buffer) => void} options.onAudio
 * @param {(error: Error) => void} options.onError
 * @param {() => void} options.onExit - Invoked when the child emits "exit",
 *   after any `onError` call triggered by that same exit.
 * @returns {RecorderHandle}
 */
export function startRecorder({ recorder, onAudio, onError, onExit }) {
  const proc = spawn(recorder.command, recorder.args, {
    stdio: ["ignore", "pipe", "pipe"],
  });

  // Raw PCM goes straight to the consumer.
  proc.stdout.on("data", onAudio);

  // Keep stderr around so a failing exit can explain itself.
  let stderrText = "";
  proc.stderr.on("data", (chunk) => {
    stderrText += chunk.toString("utf8");
  });

  proc.on("error", (err) => {
    const notFound =
      /** @type {NodeJS.ErrnoException} */ (err).code === "ENOENT";
    const detail = notFound
      ? ` (command "${recorder.command}" not found)`
      : "";
    onError(new Error(`Recorder failed to start${detail}: ${err.message}`));
  });

  proc.on("exit", (code, signal) => {
    // A signal-terminated child (e.g. our own SIGTERM) is not a failure.
    const exitedWithFailure = signal === null && code !== 0;
    if (exitedWithFailure) {
      const trimmed = stderrText.trim();
      const tail = trimmed ? `: ${trimmed}` : "";
      onError(
        new Error(
          `Recorder "${recorder.command}" exited with code ${code}${tail}`,
        ),
      );
    }
    onExit();
  });

  /** Ask the child to terminate; failures to signal are deliberately ignored. */
  function stop() {
    try {
      proc.kill("SIGTERM");
    } catch {
      // ignore
    }
  }

  return { stop };
}
|
|
169
|
-
|
|
170
|
-
/**
 * Produce a session that is already finished, and report why on the next
 * microtask.
 *
 * The caller gets a usable {@link VoiceSession} back immediately. The
 * failure is delivered afterwards: `onError` first, then `onClose` when one
 * was supplied. The returned session's `stop` does nothing.
 *
 * @param {VoiceSessionCallbacks} callbacks
 * @param {Error} error
 * @returns {VoiceSession}
 */
export function failVoiceSessionAsync(callbacks, error) {
  const notifyFailure = () => {
    callbacks.onError(error);
    callbacks.onClose?.();
  };
  queueMicrotask(notifyFailure);

  const inertSession = {
    async stop() {},
  };
  return inertSession;
}
|
|
188
|
-
|
|
189
|
-
/**
|
|
190
|
-
* Provider-specific hook contract for {@link startWebSocketVoiceSession}.
|
|
191
|
-
*
|
|
192
|
-
* Each hook is called at a specific point in the session lifecycle:
|
|
193
|
-
*
|
|
194
|
-
* 1. **Construction** – `buildWsUrl` (and optionally `buildWsOptions`) are
|
|
195
|
-
* invoked immediately to create the WebSocket.
|
|
196
|
-
* 2. **Open** – `buildSetupMessage` is sent as the first JSON message once the
|
|
197
|
-
* WebSocket opens.
|
|
198
|
-
* 3. **Ready** – `isReadyMessage` is tested on every incoming message until it
|
|
199
|
-
* returns `true`. At that point the session transitions to *ready* and any
|
|
200
|
-
* buffered audio chunks are flushed.
|
|
201
|
-
* 4. **Streaming** – `buildAudioPayload` is called for every recorder chunk
|
|
202
|
-
* while the WebSocket is open and ready.
|
|
203
|
-
* 5. **Error extraction** – `extractError` is checked on every message before
|
|
204
|
-
* transcript extraction. If it returns a string, the session reports an
|
|
205
|
-
* error and drops the message.
|
|
206
|
-
* 6. **Transcription** – `extractTranscript` is called on every message after
|
|
207
|
-
* the session is ready. Non-empty results are pushed through the CJK
|
|
208
|
-
* space normalizer and then forwarded to `onTranscript`.
|
|
209
|
-
*
|
|
210
|
-
* @template TConfig
|
|
211
|
-
* @typedef {Object} VoiceProviderHooks
|
|
212
|
-
* @property {string} label - Human-readable provider name (used in logs and
|
|
213
|
-
* error messages).
|
|
214
|
-
* @property {number} sampleRate - PCM sample rate expected by the provider
|
|
215
|
-
* (e.g. 16000 for Gemini, 24000 for OpenAI). Passed to the recorder and
|
|
216
|
-
* `buildAudioPayload`.
|
|
217
|
-
* @property {(config: TConfig) => string} buildWsUrl - Returns the full
|
|
218
|
-
* WebSocket URL, including any query parameters.
|
|
219
|
-
* @property {(config: TConfig) => { headers?: Record<string, string> }} [buildWsOptions]
|
|
220
|
-
* - Returns optional per-provider WebSocket constructor options. Node's
|
|
221
|
-
* global WebSocket (undici) accepts a non-standard `headers` option that
|
|
222
|
-
* is not declared in the standard typings.
|
|
223
|
-
* @property {(config: TConfig) => object} buildSetupMessage - Returns the
|
|
224
|
-
* first JSON message sent immediately after the WebSocket opens.
|
|
225
|
-
* @property {(message: unknown) => boolean} isReadyMessage - Returns `true`
|
|
226
|
-
* when the given server message signals that the provider is ready to
|
|
227
|
-
* receive audio.
|
|
228
|
-
* @property {(message: unknown) => string | undefined} extractTranscript -
|
|
229
|
-
* Extracts a transcript delta from a server message. Return `undefined`
|
|
230
|
-
* when the message carries no transcript.
|
|
231
|
-
* @property {(message: unknown) => string | undefined} [extractError] -
|
|
232
|
-
* Extracts an error description from a server message. Return `undefined`
|
|
233
|
-
* when the message carries no error.
|
|
234
|
-
* @property {(chunk: Buffer, sampleRate: number) => object} buildAudioPayload -
|
|
235
|
-
* Wraps a raw PCM chunk into the provider-specific JSON payload. The
|
|
236
|
-
* `sampleRate` argument is the same value as `hooks.sampleRate`.
|
|
237
|
-
*/
|
|
238
|
-
|
|
239
|
-
/**
 * Provider-agnostic voice session over a WebSocket.
 *
 * What this function does:
 *   - Picks the recorder (`config.recorder`, otherwise auto-detected for
 *     `hooks.sampleRate`) and verifies its command exists.
 *   - Opens the provider WebSocket using the URL/options from `hooks`.
 *   - Sends the setup message on open, waits for the provider's ready
 *     message, then streams recorder audio.
 *   - Holds audio chunks that arrive before the session is ready and sends
 *     them once it is.
 *   - Runs transcript text through the CJK space normalizer before handing
 *     it to `callbacks.onTranscript`.
 *
 * What the caller supplies:
 *   - `hooks`: a {@link VoiceProviderHooks} describing the wire protocol.
 *   - `config` and `callbacks` from the user's call site.
 *
 * Recorder-related failures detected up front are reported asynchronously
 * via {@link failVoiceSessionAsync}; the function still returns a session.
 *
 * @template TConfig
 * @param {object} options
 * @param {VoiceProviderHooks<TConfig>} options.hooks
 * @param {TConfig & { recorder?: VoiceRecorderConfig }} options.config
 * @param {VoiceSessionCallbacks} options.callbacks
 * @returns {VoiceSession}
 */
export function startWebSocketVoiceSession({ hooks, config, callbacks }) {
  const recorder =
    config.recorder ?? detectRecorder(getRecorderCandidates(hooks.sampleRate));

  if (!recorder) {
    const noRecorder = new Error(
      "No voice recorder found. Install arecord, sox, or ffmpeg (or set `voiceInput.recorder`).",
    );
    return failVoiceSessionAsync(callbacks, noRecorder);
  }

  if (!isCommandAvailable(recorder.command)) {
    const missingCommand = new Error(
      `Voice recorder command "${recorder.command}" not found on PATH.`,
    );
    return failVoiceSessionAsync(callbacks, missingCommand);
  }

  // Session state shared by every handler below.
  let isStopped = false;
  let closeNotified = false;
  let isReady = false;
  /** @type {Buffer[]} */
  const audioBacklog = [];
  const spaceNormalizer = createCJKSpaceNormalizer();

  /**
   * Write a debug line to stderr when VOICE_DEBUG is on. The text is built
   * lazily so nothing is evaluated when debugging is off.
   *
   * @param {() => string} buildLine
   */
  function debugLog(buildLine) {
    if (VOICE_DEBUG) {
      process.stderr.write(buildLine());
    }
  }

  /** Fire `onClose` at most once per session. */
  function notifyClose() {
    if (closeNotified) return;
    closeNotified = true;
    callbacks.onClose?.();
  }

  const url = hooks.buildWsUrl(config);
  const socketOptions = hooks.buildWsOptions?.(config);

  // Node's global WebSocket (undici) takes a non-standard `headers` option
  // that the built-in typings do not declare, hence the double cast.
  const SocketCtor =
    /** @type {new (url: string, opts?: unknown) => WebSocket} */ (
      /** @type {unknown} */ (WebSocket)
    );
  const socket = new SocketCtor(url, socketOptions);
  socket.binaryType = "arraybuffer";

  const recorderHandle = startRecorder({
    recorder,
    onAudio(chunk) {
      if (isStopped) return;
      const canStream = isReady && socket.readyState === WebSocket.OPEN;
      if (!canStream) {
        // Not streaming yet: keep the chunk for the ready-time flush.
        audioBacklog.push(chunk);
        return;
      }
      sendAudio(chunk);
    },
    onError(err) {
      if (!isStopped) callbacks.onError(err);
      stopSession();
    },
    onExit() {
      stopSession();
    },
  });

  /**
   * Wrap a PCM chunk with the provider hook and send it. A failing send is
   * only logged (in debug mode); the chunk is dropped.
   *
   * @param {Buffer} chunk
   */
  function sendAudio(chunk) {
    const payload = hooks.buildAudioPayload(chunk, hooks.sampleRate);
    try {
      socket.send(JSON.stringify(payload));
    } catch (err) {
      debugLog(() => `[voiceInput] sendAudio dropped: ${formatError(err)}\n`);
    }
  }

  socket.addEventListener("open", () => {
    const setupMessage = hooks.buildSetupMessage(config);
    try {
      socket.send(JSON.stringify(setupMessage));
    } catch (err) {
      callbacks.onError(
        new Error(`Failed to send setup message: ${formatError(err)}`),
      );
      stopSession();
    }
  });

  socket.addEventListener("message", (event) => {
    if (isStopped) return;

    let text = "";
    let parsed;
    try {
      const { data } = event;
      text =
        typeof data === "string"
          ? data
          : Buffer.from(/** @type {ArrayBuffer} */ (data)).toString("utf8");
      parsed = JSON.parse(text);
    } catch (err) {
      callbacks.onError(
        new Error(`Failed to parse server message: ${formatError(err)}`),
      );
      return;
    }

    if (!isObjectLike(parsed)) return;
    debugLog(() => `[voiceInput] <- ${text.slice(0, 800)}\n`);

    // Provider-reported errors take precedence over everything else.
    const serverError = hooks.extractError?.(parsed);
    if (serverError) {
      callbacks.onError(new Error(`${hooks.label} error: ${serverError}`));
      return;
    }

    // First ready message: switch to streaming and drain the backlog.
    if (!isReady && hooks.isReadyMessage(parsed)) {
      isReady = true;
      const backlog = audioBacklog.splice(0);
      for (const chunk of backlog) {
        if (socket.readyState === WebSocket.OPEN) sendAudio(chunk);
      }
      return;
    }

    const transcript = hooks.extractTranscript(parsed);
    if (!transcript || !(transcript.length > 0)) return;

    const cleaned = spaceNormalizer.push(transcript);
    if (cleaned.length > 0) {
      callbacks.onTranscript(cleaned);
    }
  });

  socket.addEventListener("error", (event) => {
    if (isStopped) return;
    const detail =
      /** @type {{ message?: string }} */ (event).message ?? "WebSocket error";
    callbacks.onError(new Error(`${hooks.label} WebSocket error: ${detail}`));
    stopSession();
  });

  socket.addEventListener("close", (event) => {
    // Codes 1000 and 1005 are not reported as errors.
    const unexpected = event.code !== 1000 && event.code !== 1005;
    if (!isStopped && unexpected) {
      const reason = event.reason ? `: ${event.reason}` : "";
      callbacks.onError(
        new Error(
          `${hooks.label} WebSocket closed (code ${event.code}${reason})`,
        ),
      );
    }
    isStopped = true;
    recorderHandle.stop();
    notifyClose();
  });

  debugLog(
    () =>
      `[voiceInput] driver=${hooks.label} recorder=${recorder.command} ${recorder.args.join(" ")}\n`,
  );

  /**
   * Stop the recorder, discard buffered audio, and close the WebSocket.
   *
   * This is `async` only so that it matches the {@link VoiceSession}
   * interface. Event handlers above call it without awaiting, so it must
   * not reject: if real asynchronous work is ever added here, each call
   * site needs a `.catch(() => {})` to avoid unhandled rejections.
   */
  async function stopSession() {
    if (isStopped) return;
    isStopped = true;
    recorderHandle.stop();
    audioBacklog.length = 0;

    const state = socket.readyState;
    if (state === WebSocket.OPEN || state === WebSocket.CONNECTING) {
      try {
        socket.close(1000, "client stop");
      } catch (err) {
        debugLog(() => `[voiceInput] ws.close failed: ${formatError(err)}\n`);
      }
    }
    notifyClose();
  }

  return { stop: stopSession };
}
|
|
463
|
-
|
|
464
|
-
/**
 * Build a streaming filter that removes whitespace found between two CJK
 * characters while leaving all other whitespace alone.
 *
 * Some providers emit Japanese transcripts with spaces between morphemes
 * ("そう 、 声 で"); those spaces are dropped. Spaces at a script boundary,
 * as in "Windows を使う", are preserved. Whitespace is held back until the
 * next non-space character is seen, so the decision also works when a space
 * and its neighbours arrive in different `push` calls.
 *
 * @returns {{ push: (text: string) => string, flush: () => string }}
 */
export function createCJKSpaceNormalizer() {
  // Last non-space character emitted, and the spaces seen since then.
  let lastEmitted = "";
  let heldSpaces = "";

  /**
   * @param {string} c
   * @returns {boolean}
   */
  const isSpace = (c) => {
    switch (c) {
      case " ":
      case "\t":
      case "\u3000":
        return true;
      default:
        return false;
    }
  };

  /**
   * Feed the next piece of text; returns what can be emitted so far.
   *
   * @param {string} text
   * @returns {string}
   */
  function push(text) {
    let emitted = "";
    for (const ch of text) {
      if (isSpace(ch)) {
        heldSpaces += ch;
        continue;
      }
      if (heldSpaces.length > 0) {
        const betweenCJK = isCJKChar(lastEmitted) && isCJKChar(ch);
        if (!betweenCJK) {
          emitted += heldSpaces;
        }
        heldSpaces = "";
      }
      emitted += ch;
      lastEmitted = ch;
    }
    return emitted;
  }

  /**
   * Return any whitespace still being held and reset the normalizer.
   *
   * @returns {string}
   */
  function flush() {
    const leftover = heldSpaces;
    heldSpaces = "";
    lastEmitted = "";
    return leftover;
  }

  return { push, flush };
}
|
|
510
|
-
|
|
511
|
-
/**
 * Tell whether the first code point of `ch` lies in one of the CJK-related
 * ranges listed below. An empty string yields `false`.
 *
 * @param {string} ch
 * @returns {boolean}
 */
function isCJKChar(ch) {
  const codePoint = ch.codePointAt(0);
  if (codePoint === undefined) return false;

  /** @type {Array<[number, number]>} inclusive [first, last] pairs */
  const ranges = [
    [0x3000, 0x33ff], // CJK Symbols and Punctuation through CJK Compatibility (incl. kana)
    [0x3400, 0x4dbf], // CJK Unified Ideographs Extension A
    [0x4e00, 0x9fff], // CJK Unified Ideographs
    [0xac00, 0xd7af], // Hangul Syllables
    [0xf900, 0xfaff], // CJK Compatibility Ideographs
    [0xff00, 0xffef], // Halfwidth and Fullwidth Forms
    [0x20000, 0x2ffff], // Supplementary Ideographic Plane
  ];

  return ranges.some(
    ([first, last]) => codePoint >= first && codePoint <= last,
  );
}
|
|
528
|
-
|
|
529
|
-
/**
 * Type guard: true for any non-null value whose `typeof` is "object"
 * (plain objects, arrays, and other object instances; not functions).
 *
 * @param {unknown} value
 * @returns {value is Record<string, unknown>}
 */
export function isObjectLike(value) {
  if (value === null) {
    return false;
  }
  return typeof value === "object";
}
|
|
536
|
-
|
|
537
|
-
/**
 * Turn a caught value into display text: the `message` of an `Error`
 * instance, or the string conversion of anything else.
 *
 * @param {unknown} err
 * @returns {string}
 */
function formatError(err) {
  if (err instanceof Error) {
    return err.message;
  }
  return String(err);
}
|
package/src/voice/toggleKey.mjs
DELETED
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* @typedef {Object} VoiceToggleKey
|
|
3
|
-
* @property {number} byte
|
|
4
|
-
* @property {string} label
|
|
5
|
-
*/
|
|
6
|
-
|
|
7
|
-
// Control bytes that the terminal or readline already assigns a meaning to.
// None of them may be chosen as the voice toggle key.
const RESERVED_TERMINAL_BYTES = new Set([
  0x03, // Ctrl-C (SIGINT)
  0x04, // Ctrl-D (EOF / readline exit)
  0x09, // Ctrl-I (Tab)
  0x0a, // Ctrl-J (LF / Enter)
  0x0d, // Ctrl-M (CR / Enter)
  0x11, // Ctrl-Q (XON: resume terminal output)
  0x13, // Ctrl-S (XOFF: suspend terminal output)
]);
|
|
18
|
-
|
|
19
|
-
/**
 * Convert a "ctrl-<char>" key binding into the control byte a terminal in
 * raw mode sends for it, plus a display label.
 *
 * The spec is trimmed and lower-cased before matching; when it is missing,
 * "ctrl-o" is used. Only the Ctrl-<char> family is accepted because it is
 * the only one the pre-readline pipeline can recognize without a full key
 * decoder.
 *
 * @param {string | undefined} spec
 * @returns {VoiceToggleKey}
 * @throws {Error} When the spec is not of the form "ctrl-<char>", when the
 *   character is outside a–z and [ \ ] ^ _, or when the resulting byte is
 *   reserved for the terminal/readline.
 */
export function parseVoiceToggleKey(spec) {
  const normalized = (spec ?? "ctrl-o").trim().toLowerCase();

  const parsed = /^ctrl-(.)$/.exec(normalized);
  if (parsed === null) {
    throw new Error(
      `Invalid voiceInput.toggleKey "${spec}". Expected "ctrl-<char>".`,
    );
  }

  const [, keyChar] = parsed;
  const charCode = keyChar.charCodeAt(0);

  // a–z (0x61–0x7a) map to 0x01–0x1a by subtracting 0x60;
  // [ \ ] ^ _ (0x5b–0x5f) map to 0x1b–0x1f by subtracting 0x40.
  const isLetter = charCode >= 0x61 && charCode <= 0x7a;
  const isBracketFamily = charCode >= 0x5b && charCode <= 0x5f;
  if (!isLetter && !isBracketFamily) {
    throw new Error(
      `Unsupported voiceInput.toggleKey "${spec}". Use ctrl-<letter> or ctrl-<[ \\ ] ^ _>.`,
    );
  }
  const byte = charCode - (isLetter ? 0x60 : 0x40);

  if (RESERVED_TERMINAL_BYTES.has(byte)) {
    throw new Error(
      `voiceInput.toggleKey "${spec}" conflicts with a reserved terminal/readline key.`,
    );
  }

  const label = `Ctrl-${keyChar.toUpperCase()}`;
  return { byte, label };
}
|