@iinm/plain-agent 1.7.16 → 1.7.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -24
- package/package.json +1 -1
- package/src/cliInteractive.mjs +116 -2
- package/src/cliInterruptTransform.mjs +22 -5
- package/src/cliMuteTransform.mjs +26 -0
- package/src/config.d.ts +2 -0
- package/src/config.mjs +3 -0
- package/src/main.mjs +1 -0
- package/src/voiceInput.mjs +671 -0
package/README.md
CHANGED
|
@@ -60,19 +60,19 @@ Create the configuration.
|
|
|
60
60
|
{
|
|
61
61
|
"name": "anthropic",
|
|
62
62
|
"variant": "default",
|
|
63
|
-
"apiKey": "
|
|
63
|
+
"apiKey": "<ANTHROPIC_API_KEY>"
|
|
64
64
|
// Or
|
|
65
65
|
// "apiKey": { "$env": "ANTHROPIC_API_KEY" }
|
|
66
66
|
},
|
|
67
67
|
{
|
|
68
68
|
"name": "gemini",
|
|
69
69
|
"variant": "default",
|
|
70
|
-
"apiKey": "
|
|
70
|
+
"apiKey": "<GEMINI_API_KEY>"
|
|
71
71
|
},
|
|
72
72
|
{
|
|
73
73
|
"name": "openai",
|
|
74
74
|
"variant": "default",
|
|
75
|
-
"apiKey": "
|
|
75
|
+
"apiKey": "<OPENAI_API_KEY>"
|
|
76
76
|
},
|
|
77
77
|
],
|
|
78
78
|
|
|
@@ -81,7 +81,7 @@ Create the configuration.
|
|
|
81
81
|
// askWeb: Searches the web to answer questions requiring up-to-date information or external sources.
|
|
82
82
|
"askWeb": {
|
|
83
83
|
"provider": "gemini",
|
|
84
|
-
"apiKey": "
|
|
84
|
+
"apiKey": "<GEMINI_API_KEY>",
|
|
85
85
|
"model": "gemini-3-flash-preview"
|
|
86
86
|
// Optional
|
|
87
87
|
// "baseURL": "<proxy_url>"
|
|
@@ -98,7 +98,7 @@ Create the configuration.
|
|
|
98
98
|
// Directly injecting URL content into context is not supported to prevent prompt injection.
|
|
99
99
|
"askURL": {
|
|
100
100
|
"provider": "gemini",
|
|
101
|
-
"apiKey": "
|
|
101
|
+
"apiKey": "<GEMINI_API_KEY>",
|
|
102
102
|
"model": "gemini-3-flash-preview"
|
|
103
103
|
// Optional
|
|
104
104
|
// "baseURL": "<proxy_url>"
|
|
@@ -134,7 +134,7 @@ Create the configuration.
|
|
|
134
134
|
"name": "bedrock",
|
|
135
135
|
"variant": "default",
|
|
136
136
|
"baseURL": "https://bedrock-runtime.<region>.amazonaws.com",
|
|
137
|
-
"awsProfile": "
|
|
137
|
+
"awsProfile": "<AWS_PROFILE>"
|
|
138
138
|
},
|
|
139
139
|
{
|
|
140
140
|
// Requires gcloud CLI to get authentication token
|
|
@@ -159,19 +159,19 @@ Create the configuration.
|
|
|
159
159
|
"name": "openai-compatible",
|
|
160
160
|
"variant": "ollama",
|
|
161
161
|
"baseURL": "https://ollama.com",
|
|
162
|
-
"apiKey": "
|
|
162
|
+
"apiKey": "<API_KEY>"
|
|
163
163
|
},
|
|
164
164
|
{
|
|
165
165
|
"name": "openai-compatible",
|
|
166
166
|
"variant": "huggingface",
|
|
167
167
|
"baseURL": "https://router.huggingface.co",
|
|
168
|
-
"apiKey": "
|
|
168
|
+
"apiKey": "<HUGGINGFACE_API_KEY>"
|
|
169
169
|
},
|
|
170
170
|
{
|
|
171
171
|
"name": "openai-compatible",
|
|
172
172
|
"variant": "fireworks",
|
|
173
173
|
"baseURL": "https://api.fireworks.ai/inference",
|
|
174
|
-
"apiKey": "
|
|
174
|
+
"apiKey": "<FIREWORKS_API_KEY>"
|
|
175
175
|
}
|
|
176
176
|
]
|
|
177
177
|
}
|
|
@@ -243,7 +243,7 @@ Create the configuration.
|
|
|
243
243
|
"name": "bedrock",
|
|
244
244
|
"variant": "jp",
|
|
245
245
|
"baseURL": "https://bedrock-runtime.ap-northeast-1.amazonaws.com",
|
|
246
|
-
"awsProfile": "
|
|
246
|
+
"awsProfile": "<AWS_PROFILE>"
|
|
247
247
|
}
|
|
248
248
|
]
|
|
249
249
|
}
|
|
@@ -463,7 +463,7 @@ The agent loads configuration files in the following order. Settings in later fi
|
|
|
463
463
|
// ⚠️ Add this to config.local.json to avoid committing secrets to Git
|
|
464
464
|
"slack": {
|
|
465
465
|
"command": "npx",
|
|
466
|
-
"args": ["-y", "mcp-remote", "https://mcp.slack.com/mcp", "--header", "Authorization:Bearer
|
|
466
|
+
"args": ["-y", "mcp-remote", "https://mcp.slack.com/mcp", "--header", "Authorization:Bearer <SLACK_TOKEN>"]
|
|
467
467
|
},
|
|
468
468
|
"notion": {
|
|
469
469
|
"command": "npx",
|
|
@@ -480,12 +480,18 @@ The agent loads configuration files in the following order. Settings in later fi
|
|
|
480
480
|
// ⚠️ Add this to config.local.json to avoid committing secrets to Git
|
|
481
481
|
"google_developer-knowledge": {
|
|
482
482
|
"command": "npx",
|
|
483
|
-
"args": ["-y", "mcp-remote", "https://developerknowledge.googleapis.com/mcp", "--header", "X-Goog-Api-Key
|
|
483
|
+
"args": ["-y", "mcp-remote", "https://developerknowledge.googleapis.com/mcp", "--header", "X-Goog-Api-Key:<GOOGLE_API_KEY>"]
|
|
484
484
|
}
|
|
485
485
|
},
|
|
486
486
|
|
|
487
487
|
// Override default notification command
|
|
488
488
|
// "notifyCmd": "/path/to/notification-command"
|
|
489
|
+
|
|
490
|
+
// (Optional) Voice input. See "Voice Input" below.
|
|
491
|
+
// "voiceInput": {
|
|
492
|
+
// "provider": "openai",
|
|
493
|
+
// "apiKey": "<OPENAI_API_KEY>"
|
|
494
|
+
// }
|
|
489
495
|
}
|
|
490
496
|
```
|
|
491
497
|
</details>
|
|
@@ -606,6 +612,53 @@ Example:
|
|
|
606
612
|
plain install-claude-code-plugins
|
|
607
613
|
```
|
|
608
614
|
|
|
615
|
+
## Voice Input
|
|
616
|
+
|
|
617
|
+
Press **Ctrl-O** to start recording and press it again to stop. Partial
|
|
618
|
+
transcripts are inserted into the prompt as you speak so you can edit
|
|
619
|
+
and send them like regular text.
|
|
620
|
+
|
|
621
|
+
### Requirements
|
|
622
|
+
|
|
623
|
+
- A recording command on `PATH`: `arecord`, `sox`, or `ffmpeg`.
|
|
624
|
+
- An API key for the chosen provider.
|
|
625
|
+
- Your host must have microphone access. The sandbox does not need to.
|
|
626
|
+
|
|
627
|
+
### Providers
|
|
628
|
+
|
|
629
|
+
**OpenAI Realtime** (default, recommended):
|
|
630
|
+
|
|
631
|
+
```js
|
|
632
|
+
{
|
|
633
|
+
"voiceInput": {
|
|
634
|
+
"provider": "openai",
|
|
635
|
+
"apiKey": "<OPENAI_API_KEY>"
|
|
636
|
+
// "model": "gpt-4o-transcribe", // or "gpt-4o-mini-transcribe", "whisper-1"
|
|
637
|
+
// "language": "ja" // ISO-639-1 code. Improves accuracy and latency.
|
|
638
|
+
}
|
|
639
|
+
}
|
|
640
|
+
```
|
|
641
|
+
|
|
642
|
+
**Gemini Live** (preview API; model names and pricing may change):
|
|
643
|
+
|
|
644
|
+
```js
|
|
645
|
+
{
|
|
646
|
+
"voiceInput": {
|
|
647
|
+
"provider": "gemini",
|
|
648
|
+
"apiKey": "<GEMINI_API_KEY>"
|
|
649
|
+
// "model": "gemini-3.1-flash-live-preview",
|
|
650
|
+
// "language": "ja"
|
|
651
|
+
}
|
|
652
|
+
}
|
|
653
|
+
```
|
|
654
|
+
|
|
655
|
+
### Options
|
|
656
|
+
|
|
657
|
+
- `toggleKey` — Rebind the toggle. Accepts `"ctrl-<char>"` where `<char>`
|
|
658
|
+
is a letter (a-z) or one of `[ \ ] ^ _`, excluding the reserved keys ctrl-c, ctrl-d, ctrl-i, ctrl-j, ctrl-m, ctrl-q and ctrl-s. Defaults to `"ctrl-o"`.
|
|
659
|
+
- `recorder` — Override recorder auto-detection. Must write raw 16-bit
|
|
660
|
+
little-endian mono PCM to stdout at 24 kHz (OpenAI) or 16 kHz (Gemini).
|
|
661
|
+
|
|
609
662
|
## Development
|
|
610
663
|
|
|
611
664
|
```sh
|
|
@@ -644,9 +697,9 @@ npm publish --access public
|
|
|
644
697
|
|
|
645
698
|
```sh
|
|
646
699
|
# IAM Identity Center
|
|
647
|
-
identity_center_instance_arn="
|
|
648
|
-
identity_store_id
|
|
649
|
-
aws_account_id
|
|
700
|
+
identity_center_instance_arn="<IDENTITY_CENTER_INSTANCE_ARN>" # e.g., arn:aws:sso:::instance/ssoins-xxxxxxxxxxxxxxxx
|
|
701
|
+
identity_store_id=<IDENTITY_STORE_ID>
|
|
702
|
+
aws_account_id=<AWS_ACCOUNT_ID>
|
|
650
703
|
|
|
651
704
|
# Create a permission set
|
|
652
705
|
permission_set_arn=$(aws sso-admin create-permission-set \
|
|
@@ -681,10 +734,10 @@ aws sso-admin put-inline-policy-to-permission-set \
|
|
|
681
734
|
--inline-policy "$policy"
|
|
682
735
|
|
|
683
736
|
# Create an SSO user
|
|
684
|
-
sso_user_name
|
|
685
|
-
sso_user_email
|
|
686
|
-
sso_user_family_name
|
|
687
|
-
sso_user_given_name
|
|
737
|
+
sso_user_name=<SSO_USER_NAME>
|
|
738
|
+
sso_user_email=<SSO_USER_EMAIL>
|
|
739
|
+
sso_user_family_name=<SSO_USER_FAMILY_NAME>
|
|
740
|
+
sso_user_given_name=<SSO_USER_GIVEN_NAME>
|
|
688
741
|
|
|
689
742
|
user_id=$(aws identitystore create-user \
|
|
690
743
|
--identity-store-id "$identity_store_id" \
|
|
@@ -725,8 +778,8 @@ aws bedrock-runtime invoke-model \
|
|
|
725
778
|
<summary><b>Azure - Microsoft Foundry</b></summary>
|
|
726
779
|
|
|
727
780
|
```sh
|
|
728
|
-
resource_group
|
|
729
|
-
account_name
|
|
781
|
+
resource_group=<RESOURCE_GROUP>
|
|
782
|
+
account_name=<ACCOUNT_NAME> # resource name
|
|
730
783
|
|
|
731
784
|
# Create a service principal
|
|
732
785
|
service_principal=$(az ad sp create-for-rbac --name "CodingAgentServicePrincipal" --skip-assignment)
|
|
@@ -758,10 +811,10 @@ az login --service-principal -u "$app_id" -p "$app_secret" --tenant "$tenant_id"
|
|
|
758
811
|
<summary><b>Google Cloud Vertex AI</b></summary>
|
|
759
812
|
|
|
760
813
|
```sh
|
|
761
|
-
project_id
|
|
762
|
-
service_account_name
|
|
814
|
+
project_id=<PROJECT_ID>
|
|
815
|
+
service_account_name=<SERVICE_ACCOUNT_NAME>
|
|
763
816
|
service_account_email="${service_account_name}@${project_id}.iam.gserviceaccount.com"
|
|
764
|
-
your_account_email
|
|
817
|
+
your_account_email=<YOUR_ACCOUNT_EMAIL>
|
|
765
818
|
|
|
766
819
|
# Create a service account
|
|
767
820
|
gcloud iam service-accounts create "$service_account_name" \
|
package/package.json
CHANGED
package/src/cliInteractive.mjs
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* @import { UserEventEmitter, AgentEventEmitter, AgentCommands } from "./agent"
|
|
3
3
|
* @import { ClaudeCodePlugin } from "./claudeCodePlugin.mjs"
|
|
4
|
+
* @import { VoiceInputConfig, VoiceSession } from "./voiceInput.mjs"
|
|
4
5
|
*/
|
|
5
6
|
|
|
6
7
|
import readline from "node:readline";
|
|
@@ -13,8 +14,10 @@ import {
|
|
|
13
14
|
printMessage,
|
|
14
15
|
} from "./cliFormatter.mjs";
|
|
15
16
|
import { createInterruptTransform } from "./cliInterruptTransform.mjs";
|
|
17
|
+
import { createMuteTransform } from "./cliMuteTransform.mjs";
|
|
16
18
|
import { createPasteHandler } from "./cliPasteTransform.mjs";
|
|
17
19
|
import { notify } from "./utils/notify.mjs";
|
|
20
|
+
import { parseVoiceToggleKey, startVoiceSession } from "./voiceInput.mjs";
|
|
18
21
|
|
|
19
22
|
const HELP_MESSAGE = [
|
|
20
23
|
"Commands:",
|
|
@@ -57,6 +60,7 @@ const HELP_MESSAGE = [
|
|
|
57
60
|
* @property {boolean} sandbox
|
|
58
61
|
* @property {() => Promise<void>} onStop
|
|
59
62
|
* @property {ClaudeCodePlugin[]} [claudeCodePlugins]
|
|
63
|
+
* @property {VoiceInputConfig} [voiceInput]
|
|
60
64
|
*/
|
|
61
65
|
|
|
62
66
|
/**
|
|
@@ -72,6 +76,7 @@ export function startInteractiveSession({
|
|
|
72
76
|
sandbox,
|
|
73
77
|
onStop,
|
|
74
78
|
claudeCodePlugins,
|
|
79
|
+
voiceInput,
|
|
75
80
|
}) {
|
|
76
81
|
/** @type {{ turn: boolean, multiLineBuffer: string[] | null, subagentName: string }} */
|
|
77
82
|
const state = {
|
|
@@ -80,6 +85,16 @@ export function startInteractiveSession({
|
|
|
80
85
|
subagentName: "",
|
|
81
86
|
};
|
|
82
87
|
|
|
88
|
+
/**
|
|
89
|
+
* Active voice input session, or null when not recording.
|
|
90
|
+
* @type {{ session: VoiceSession, startCursor: number, transcriptLength: number } | null}
|
|
91
|
+
*/
|
|
92
|
+
let voice = null;
|
|
93
|
+
|
|
94
|
+
// Parse the voice toggle key once at startup so misconfiguration fails
|
|
95
|
+
// loudly instead of silently falling back.
|
|
96
|
+
const voiceToggle = parseVoiceToggleKey(voiceInput?.toggleKey);
|
|
97
|
+
|
|
83
98
|
const getCliPrompt = (subagentName = "", flashMessage = "") =>
|
|
84
99
|
[
|
|
85
100
|
"",
|
|
@@ -136,7 +151,100 @@ export function startInteractiveSession({
|
|
|
136
151
|
cli.prompt();
|
|
137
152
|
};
|
|
138
153
|
|
|
154
|
+
const stopVoiceSession = async () => {
|
|
155
|
+
if (!voice) return;
|
|
156
|
+
const current = voice;
|
|
157
|
+
voice = null;
|
|
158
|
+
await current.session.stop();
|
|
159
|
+
cli.setPrompt(currentCliPrompt);
|
|
160
|
+
// @ts-expect-error - internal property
|
|
161
|
+
cli._refreshLine?.();
|
|
162
|
+
};
|
|
163
|
+
|
|
164
|
+
const handleVoiceToggle = () => {
|
|
165
|
+
// Ignore while the agent is working.
|
|
166
|
+
if (!state.turn) return;
|
|
167
|
+
|
|
168
|
+
if (voice) {
|
|
169
|
+
stopVoiceSession();
|
|
170
|
+
return;
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
if (!voiceInput) {
|
|
174
|
+
cli.setPrompt(
|
|
175
|
+
getCliPrompt(
|
|
176
|
+
state.subagentName,
|
|
177
|
+
styleText(
|
|
178
|
+
"yellow",
|
|
179
|
+
`Voice input not configured. Set \`voiceInput\` in your config to enable ${voiceToggle.label}.`,
|
|
180
|
+
),
|
|
181
|
+
),
|
|
182
|
+
);
|
|
183
|
+
cli.prompt(true);
|
|
184
|
+
return;
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
const startCursor = cli.cursor;
|
|
188
|
+
const session = startVoiceSession({
|
|
189
|
+
config: voiceInput,
|
|
190
|
+
callbacks: {
|
|
191
|
+
onTranscript: (delta) => {
|
|
192
|
+
if (!voice) return;
|
|
193
|
+
const insertAt = voice.startCursor + voice.transcriptLength;
|
|
194
|
+
// Insert delta at the recording's insertion point. User input is
|
|
195
|
+
// swallowed while recording, so the buffer around `insertAt` is
|
|
196
|
+
// stable.
|
|
197
|
+
const before = cli.line.slice(0, insertAt);
|
|
198
|
+
const after = cli.line.slice(insertAt);
|
|
199
|
+
// `line` and `cursor` are declared readonly in the Node typings but
|
|
200
|
+
// are writable at runtime — the existing code already patches
|
|
201
|
+
// `_refreshLine` in the same way.
|
|
202
|
+
const mutableCli = /** @type {{ line: string, cursor: number }} */ (
|
|
203
|
+
/** @type {unknown} */ (cli)
|
|
204
|
+
);
|
|
205
|
+
mutableCli.line = before + delta + after;
|
|
206
|
+
mutableCli.cursor = insertAt + delta.length;
|
|
207
|
+
voice.transcriptLength += delta.length;
|
|
208
|
+
// @ts-expect-error - internal property
|
|
209
|
+
cli._refreshLine?.();
|
|
210
|
+
},
|
|
211
|
+
onError: (err) => {
|
|
212
|
+
voice = null;
|
|
213
|
+
cli.setPrompt(
|
|
214
|
+
getCliPrompt(
|
|
215
|
+
state.subagentName,
|
|
216
|
+
styleText("red", `Voice input error: ${err.message}`),
|
|
217
|
+
),
|
|
218
|
+
);
|
|
219
|
+
cli.prompt(true);
|
|
220
|
+
},
|
|
221
|
+
onClose: () => {
|
|
222
|
+
if (!voice) return;
|
|
223
|
+
voice = null;
|
|
224
|
+
cli.setPrompt(currentCliPrompt);
|
|
225
|
+
// @ts-expect-error - internal property
|
|
226
|
+
cli._refreshLine?.();
|
|
227
|
+
},
|
|
228
|
+
},
|
|
229
|
+
});
|
|
230
|
+
voice = { session, startCursor, transcriptLength: 0 };
|
|
231
|
+
cli.setPrompt(
|
|
232
|
+
getCliPrompt(
|
|
233
|
+
state.subagentName,
|
|
234
|
+
styleText(["red", "bold"], `● REC (${voiceToggle.label} to stop)`),
|
|
235
|
+
),
|
|
236
|
+
);
|
|
237
|
+
// @ts-expect-error - internal property
|
|
238
|
+
cli._refreshLine?.();
|
|
239
|
+
};
|
|
240
|
+
|
|
139
241
|
const handleCtrlC = () => {
|
|
242
|
+
// Stop voice recording first if active.
|
|
243
|
+
if (voice) {
|
|
244
|
+
stopVoiceSession();
|
|
245
|
+
return;
|
|
246
|
+
}
|
|
247
|
+
|
|
140
248
|
// Agent turn: pause auto-approve; do not clear input.
|
|
141
249
|
if (!state.turn) {
|
|
142
250
|
agentCommands.pauseAutoApprove();
|
|
@@ -192,14 +300,20 @@ export function startInteractiveSession({
|
|
|
192
300
|
};
|
|
193
301
|
|
|
194
302
|
// Pre-readline pipeline:
|
|
195
|
-
// stdin -> interrupt (Ctrl-C / Ctrl-D) -> paste (bracketed paste) -> readline
|
|
303
|
+
// stdin -> interrupt (Ctrl-C / Ctrl-D / voice toggle) -> mute (voice recording) -> paste (bracketed paste) -> readline
|
|
196
304
|
const interrupt = createInterruptTransform({
|
|
197
305
|
onCtrlC: handleCtrlC,
|
|
198
306
|
onCtrlD: handleCtrlD,
|
|
307
|
+
onVoiceToggle: handleVoiceToggle,
|
|
308
|
+
voiceToggleByte: voiceToggle.byte,
|
|
199
309
|
});
|
|
310
|
+
// While a voice session is recording, swallow all stdin bytes other than
|
|
311
|
+
// Ctrl-C / Ctrl-D / the voice toggle key so transcript insertion stays
|
|
312
|
+
// consistent.
|
|
313
|
+
const mute = createMuteTransform({ isMuted: () => voice !== null });
|
|
200
314
|
const paste = createPasteHandler();
|
|
201
315
|
|
|
202
|
-
process.stdin.pipe(interrupt).pipe(paste.transform);
|
|
316
|
+
process.stdin.pipe(interrupt).pipe(mute).pipe(paste.transform);
|
|
203
317
|
|
|
204
318
|
// Enable bracketed paste mode
|
|
205
319
|
if (process.stdout.isTTY) {
|
|
@@ -1,19 +1,31 @@
|
|
|
1
1
|
import { Transform } from "node:stream";
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
|
-
* Create a Transform that intercepts Ctrl-C (0x03)
|
|
5
|
-
*
|
|
6
|
-
*
|
|
4
|
+
* Create a Transform that intercepts Ctrl-C (0x03), Ctrl-D (0x04), and an
|
|
5
|
+
* optional "voice toggle" byte (default Ctrl-O, 0x0f). When one of those
|
|
6
|
+
* bytes is seen anywhere in a chunk, the corresponding callback is invoked
|
|
7
|
+
* and the entire chunk is dropped so that downstream consumers (e.g.
|
|
7
8
|
* readline) never observe it. All other input flows through unchanged.
|
|
8
9
|
*
|
|
9
|
-
*
|
|
10
|
+
* Priority when multiple handled bytes appear in the same chunk:
|
|
11
|
+
* Ctrl-C > Ctrl-D > voice toggle.
|
|
10
12
|
*
|
|
11
13
|
* @param {object} handlers
|
|
12
14
|
* @param {() => void} handlers.onCtrlC - Called when Ctrl-C is detected
|
|
13
15
|
* @param {() => void} handlers.onCtrlD - Called when Ctrl-D is detected
|
|
16
|
+
* @param {() => void} [handlers.onVoiceToggle]
|
|
17
|
+
* Called when the voice toggle byte is detected.
|
|
18
|
+
* @param {number} [handlers.voiceToggleByte]
|
|
19
|
+
* Byte value for the voice toggle key. Defaults to 0x0f (Ctrl-O).
|
|
14
20
|
* @returns {Transform}
|
|
15
21
|
*/
|
|
16
|
-
export function createInterruptTransform({
|
|
22
|
+
export function createInterruptTransform({
|
|
23
|
+
onCtrlC,
|
|
24
|
+
onCtrlD,
|
|
25
|
+
onVoiceToggle,
|
|
26
|
+
voiceToggleByte = 0x0f,
|
|
27
|
+
}) {
|
|
28
|
+
const voiceToggleChar = String.fromCharCode(voiceToggleByte);
|
|
17
29
|
return new Transform({
|
|
18
30
|
transform(chunk, _encoding, callback) {
|
|
19
31
|
const data = chunk.toString("utf8");
|
|
@@ -27,6 +39,11 @@ export function createInterruptTransform({ onCtrlC, onCtrlD }) {
|
|
|
27
39
|
callback();
|
|
28
40
|
return;
|
|
29
41
|
}
|
|
42
|
+
if (onVoiceToggle && data.includes(voiceToggleChar)) {
|
|
43
|
+
onVoiceToggle();
|
|
44
|
+
callback();
|
|
45
|
+
return;
|
|
46
|
+
}
|
|
30
47
|
this.push(chunk);
|
|
31
48
|
callback();
|
|
32
49
|
},
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import { Transform } from "node:stream";
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Create a Transform that swallows all chunks while `isMuted()` returns true,
|
|
5
|
+
* and passes them through unchanged while it returns false.
|
|
6
|
+
*
|
|
7
|
+
* Intended to sit between `createInterruptTransform` and the paste handler so
|
|
8
|
+
* that callers can fully silence regular stdin input during special modes
|
|
9
|
+
* (e.g. while a voice input session is recording) without coupling that
|
|
10
|
+
* concern to the interrupt-detection logic.
|
|
11
|
+
*
|
|
12
|
+
* @param {object} options
|
|
13
|
+
* @param {() => boolean} options.isMuted
|
|
14
|
+
* Called for each incoming chunk; when true the chunk is dropped.
|
|
15
|
+
* @returns {Transform}
|
|
16
|
+
*/
|
|
17
|
+
export function createMuteTransform({ isMuted }) {
|
|
18
|
+
return new Transform({
|
|
19
|
+
transform(chunk, _encoding, callback) {
|
|
20
|
+
if (!isMuted()) {
|
|
21
|
+
this.push(chunk);
|
|
22
|
+
}
|
|
23
|
+
callback();
|
|
24
|
+
},
|
|
25
|
+
});
|
|
26
|
+
}
|
package/src/config.d.ts
CHANGED
|
@@ -4,6 +4,7 @@ import { AskURLToolOptions } from "./tools/askURL.mjs";
|
|
|
4
4
|
import { AskWebToolOptions } from "./tools/askWeb.mjs";
|
|
5
5
|
import { ExecCommandSanboxConfig } from "./tools/execCommand";
|
|
6
6
|
import { ClaudeCodePluginRepo } from "./claudeCodePlugin.mjs";
|
|
7
|
+
import { VoiceInputConfig } from "./voiceInput.mjs";
|
|
7
8
|
|
|
8
9
|
export type AppConfig = {
|
|
9
10
|
model?: string;
|
|
@@ -21,6 +22,7 @@ export type AppConfig = {
|
|
|
21
22
|
};
|
|
22
23
|
mcpServers?: Record<string, MCPServerConfig>;
|
|
23
24
|
notifyCmd?: string;
|
|
25
|
+
voiceInput?: VoiceInputConfig;
|
|
24
26
|
claudeCodePlugins?: ClaudeCodePluginRepo[];
|
|
25
27
|
};
|
|
26
28
|
|
package/src/config.mjs
CHANGED
|
@@ -98,6 +98,9 @@ export async function loadAppConfig(options = {}) {
|
|
|
98
98
|
...(merged.claudeCodePlugins ?? []),
|
|
99
99
|
...(config.claudeCodePlugins ?? []),
|
|
100
100
|
],
|
|
101
|
+
voiceInput: config.voiceInput
|
|
102
|
+
? { ...(merged.voiceInput ?? {}), ...config.voiceInput }
|
|
103
|
+
: merged.voiceInput,
|
|
101
104
|
};
|
|
102
105
|
}
|
|
103
106
|
|
package/src/main.mjs
CHANGED
|
@@ -257,6 +257,7 @@ if (cliArgs.subcommand.type === "install-claude-code-plugins") {
|
|
|
257
257
|
...sessionOptions,
|
|
258
258
|
notifyCmd: appConfig.notifyCmd || AGENT_NOTIFY_CMD_DEFAULT,
|
|
259
259
|
claudeCodePlugins: resolvePluginPaths(appConfig.claudeCodePlugins ?? []),
|
|
260
|
+
voiceInput: appConfig.voiceInput,
|
|
260
261
|
});
|
|
261
262
|
}
|
|
262
263
|
})().catch((err) => {
|
|
@@ -0,0 +1,671 @@
|
|
|
1
|
+
import { spawn, spawnSync } from "node:child_process";
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* @typedef {VoiceInputOpenAIConfig | VoiceInputGeminiConfig} VoiceInputConfig
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
/**
|
|
8
|
+
* @typedef {Object} VoiceInputOpenAIConfig
|
|
9
|
+
* @property {"openai"} provider
|
|
10
|
+
* @property {string} apiKey
|
|
11
|
+
* @property {string} [model] - Defaults to "gpt-4o-transcribe".
|
|
12
|
+
* @property {string} [language] - ISO-639-1 code (e.g. "ja", "en"). Improves accuracy and latency when set.
|
|
13
|
+
* @property {string} [baseURL]
|
|
14
|
+
* @property {VoiceRecorderConfig} [recorder]
|
|
15
|
+
* @property {string} [toggleKey] - "ctrl-<char>". Defaults to "ctrl-o".
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
/**
|
|
19
|
+
* @typedef {Object} VoiceInputGeminiConfig
|
|
20
|
+
* @property {"gemini"} provider
|
|
21
|
+
* @property {string} apiKey
|
|
22
|
+
* @property {string} [model] - Defaults to "gemini-3.1-flash-live-preview".
|
|
23
|
+
* @property {string} [language] - ISO-639-1 code (e.g. "ja", "en"). Passed to the model as a system instruction since Gemini Live has no native language hint for input transcription.
|
|
24
|
+
* @property {string} [baseURL]
|
|
25
|
+
* @property {VoiceRecorderConfig} [recorder]
|
|
26
|
+
* @property {string} [toggleKey]
|
|
27
|
+
*/
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* @typedef {Object} VoiceRecorderConfig
|
|
31
|
+
* @property {string} command
|
|
32
|
+
* @property {string[]} args
|
|
33
|
+
* Must write raw 16-bit little-endian mono PCM to stdout at the sample
|
|
34
|
+
* rate required by the chosen provider (24 kHz for OpenAI, 16 kHz for
|
|
35
|
+
* Gemini).
|
|
36
|
+
*/
|
|
37
|
+
|
|
38
|
+
/**
|
|
39
|
+
* @typedef {Object} VoiceSessionCallbacks
|
|
40
|
+
* @property {(text: string) => void} onTranscript
|
|
41
|
+
* @property {(error: Error) => void} onError
|
|
42
|
+
* @property {() => void} [onClose]
|
|
43
|
+
*/
|
|
44
|
+
|
|
45
|
+
/**
|
|
46
|
+
* @typedef {Object} VoiceSession
|
|
47
|
+
* @property {() => Promise<void>} stop
|
|
48
|
+
*/
|
|
49
|
+
|
|
50
|
+
const DEBUG = process.env.PLAIN_VOICE_DEBUG === "1";
|
|
51
|
+
|
|
52
|
+
// Bytes reserved for other terminal/readline uses — cannot be used as a voice toggle.
|
|
53
|
+
// 0x03 = Ctrl-C (SIGINT)
|
|
54
|
+
// 0x04 = Ctrl-D (EOF / readline exit)
|
|
55
|
+
// 0x09 = Ctrl-I (Tab)
|
|
56
|
+
// 0x0a = Ctrl-J (LF / Enter)
|
|
57
|
+
// 0x0d = Ctrl-M (CR / Enter)
|
|
58
|
+
// 0x11 = Ctrl-Q (XON: resume terminal output)
|
|
59
|
+
// 0x13 = Ctrl-S (XOFF: suspend terminal output)
|
|
60
|
+
const RESERVED_TERMINAL_BYTES = new Set([
|
|
61
|
+
0x03, 0x04, 0x09, 0x0a, 0x0d, 0x11, 0x13,
|
|
62
|
+
]);
|
|
63
|
+
|
|
64
|
+
/**
|
|
65
|
+
* @typedef {Object} VoiceToggleKey
|
|
66
|
+
* @property {number} byte
|
|
67
|
+
* @property {string} label
|
|
68
|
+
*/
|
|
69
|
+
|
|
70
|
+
/**
|
|
71
|
+
* Parse a "ctrl-<char>" binding into the raw byte the terminal sends in
|
|
72
|
+
* raw mode. Only Ctrl-<char> is supported because it is the only family
|
|
73
|
+
* the pre-readline pipeline can recognize without a full key decoder.
|
|
74
|
+
*
|
|
75
|
+
* @param {string | undefined} spec
|
|
76
|
+
* @returns {VoiceToggleKey}
|
|
77
|
+
*/
|
|
78
|
+
export function parseVoiceToggleKey(spec) {
|
|
79
|
+
const raw = (spec ?? "ctrl-o").trim().toLowerCase();
|
|
80
|
+
|
|
81
|
+
const match = /^ctrl-(.)$/.exec(raw);
|
|
82
|
+
if (!match) {
|
|
83
|
+
throw new Error(
|
|
84
|
+
`Invalid voiceInput.toggleKey "${spec}". Expected "ctrl-<char>".`,
|
|
85
|
+
);
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
const ch = match[1];
|
|
89
|
+
const code = ch.charCodeAt(0);
|
|
90
|
+
|
|
91
|
+
// Subtracting a fixed offset from the character's ASCII code yields the
|
|
92
|
+
// control byte (0x01–0x1f) the terminal sends for that Ctrl combination.
|
|
93
|
+
let byte;
|
|
94
|
+
if (code >= 0x61 && code <= 0x7a) {
|
|
95
|
+
// a–z (0x61–0x7a): subtract 0x60 → 0x01 (Ctrl-A) – 0x1a (Ctrl-Z)
|
|
96
|
+
byte = code - 0x60;
|
|
97
|
+
} else if (code >= 0x5b && code <= 0x5f) {
|
|
98
|
+
// [ \ ] ^ _ (0x5b–0x5f): subtract 0x40 → 0x1b (Ctrl-[) – 0x1f (Ctrl-_)
|
|
99
|
+
byte = code - 0x40;
|
|
100
|
+
} else {
|
|
101
|
+
throw new Error(
|
|
102
|
+
`Unsupported voiceInput.toggleKey "${spec}". Use ctrl-<letter> or ctrl-<[ \\ ] ^ _>.`,
|
|
103
|
+
);
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
if (RESERVED_TERMINAL_BYTES.has(byte)) {
|
|
107
|
+
throw new Error(
|
|
108
|
+
`voiceInput.toggleKey "${spec}" conflicts with a reserved terminal/readline key.`,
|
|
109
|
+
);
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
return { byte, label: `Ctrl-${ch.toUpperCase()}` };
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
/**
|
|
116
|
+
* @param {number} sampleRate
|
|
117
|
+
* @returns {VoiceRecorderConfig[]}
|
|
118
|
+
*/
|
|
119
|
+
export function getRecorderCandidates(sampleRate) {
|
|
120
|
+
const rate = String(sampleRate);
|
|
121
|
+
const isMac = process.platform === "darwin";
|
|
122
|
+
/** @type {VoiceRecorderConfig[]} */
|
|
123
|
+
const candidates = [];
|
|
124
|
+
|
|
125
|
+
if (!isMac) {
|
|
126
|
+
candidates.push({
|
|
127
|
+
command: "arecord",
|
|
128
|
+
args: ["-q", "-f", "S16_LE", "-c", "1", "-r", rate, "-t", "raw"],
|
|
129
|
+
});
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
candidates.push({
|
|
133
|
+
command: "sox",
|
|
134
|
+
args: [
|
|
135
|
+
"-q",
|
|
136
|
+
"-d",
|
|
137
|
+
"-b",
|
|
138
|
+
"16",
|
|
139
|
+
"-c",
|
|
140
|
+
"1",
|
|
141
|
+
"-r",
|
|
142
|
+
rate,
|
|
143
|
+
"-e",
|
|
144
|
+
"signed-integer",
|
|
145
|
+
"-t",
|
|
146
|
+
"raw",
|
|
147
|
+
"-",
|
|
148
|
+
],
|
|
149
|
+
});
|
|
150
|
+
|
|
151
|
+
const ffmpegInput = isMac
|
|
152
|
+
? ["-f", "avfoundation", "-i", ":0"]
|
|
153
|
+
: ["-f", "alsa", "-i", "default"];
|
|
154
|
+
candidates.push({
|
|
155
|
+
command: "ffmpeg",
|
|
156
|
+
args: [
|
|
157
|
+
"-hide_banner",
|
|
158
|
+
"-loglevel",
|
|
159
|
+
"error",
|
|
160
|
+
...ffmpegInput,
|
|
161
|
+
"-ac",
|
|
162
|
+
"1",
|
|
163
|
+
"-ar",
|
|
164
|
+
rate,
|
|
165
|
+
"-f",
|
|
166
|
+
"s16le",
|
|
167
|
+
"-",
|
|
168
|
+
],
|
|
169
|
+
});
|
|
170
|
+
|
|
171
|
+
return candidates;
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
/**
|
|
175
|
+
* @param {VoiceRecorderConfig[]} candidates
|
|
176
|
+
* @returns {VoiceRecorderConfig | null}
|
|
177
|
+
*/
|
|
178
|
+
export function detectRecorder(candidates) {
|
|
179
|
+
return candidates.find((c) => isCommandAvailable(c.command)) ?? null;
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
/**
|
|
183
|
+
* @param {string} command
|
|
184
|
+
*/
|
|
185
|
+
function isCommandAvailable(command) {
|
|
186
|
+
if (process.platform === "win32") {
|
|
187
|
+
const result = spawnSync("where", [command], { stdio: "ignore" });
|
|
188
|
+
return result.status === 0;
|
|
189
|
+
}
|
|
190
|
+
const result = spawnSync("sh", ["-c", `command -v ${command}`], {
|
|
191
|
+
stdio: "ignore",
|
|
192
|
+
});
|
|
193
|
+
return result.status === 0;
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
/**
|
|
197
|
+
* Start a voice input session. Spawns a recorder, opens a WebSocket to the
|
|
198
|
+
* configured provider, and streams transcript deltas via `onTranscript`.
|
|
199
|
+
*
|
|
200
|
+
* @param {object} options
|
|
201
|
+
* @param {VoiceInputConfig} options.config
|
|
202
|
+
* @param {VoiceSessionCallbacks} options.callbacks
|
|
203
|
+
* @returns {VoiceSession}
|
|
204
|
+
*/
|
|
205
|
+
export function startVoiceSession({ config, callbacks }) {
|
|
206
|
+
/**
|
|
207
|
+
* Report an error asynchronously and return an already-terminated session.
|
|
208
|
+
* @param {Error} error
|
|
209
|
+
* @returns {VoiceSession}
|
|
210
|
+
*/
|
|
211
|
+
function failAsync(error) {
|
|
212
|
+
queueMicrotask(() => {
|
|
213
|
+
callbacks.onError(error);
|
|
214
|
+
callbacks.onClose?.();
|
|
215
|
+
});
|
|
216
|
+
return { stop: async () => {} };
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
/** @type {VoiceDriver} */
|
|
220
|
+
let driver;
|
|
221
|
+
try {
|
|
222
|
+
driver = createDriver(config);
|
|
223
|
+
} catch (err) {
|
|
224
|
+
return failAsync(err instanceof Error ? err : new Error(String(err)));
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
const recorder =
|
|
228
|
+
config.recorder ?? detectRecorder(getRecorderCandidates(driver.sampleRate));
|
|
229
|
+
if (!recorder) {
|
|
230
|
+
return failAsync(
|
|
231
|
+
new Error(
|
|
232
|
+
"No voice recorder found. Install arecord, sox, or ffmpeg (or set `voiceInput.recorder`).",
|
|
233
|
+
),
|
|
234
|
+
);
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
if (!isCommandAvailable(recorder.command)) {
|
|
238
|
+
return failAsync(
|
|
239
|
+
new Error(
|
|
240
|
+
`Voice recorder command "${recorder.command}" not found on PATH.`,
|
|
241
|
+
),
|
|
242
|
+
);
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
let stopped = false;
|
|
246
|
+
let closeEmitted = false;
|
|
247
|
+
let ready = false;
|
|
248
|
+
/** @type {Buffer[]} */
|
|
249
|
+
const pendingAudio = [];
|
|
250
|
+
const normalizer = createCJKSpaceNormalizer();
|
|
251
|
+
|
|
252
|
+
const emitClose = () => {
|
|
253
|
+
if (closeEmitted) return;
|
|
254
|
+
closeEmitted = true;
|
|
255
|
+
callbacks.onClose?.();
|
|
256
|
+
};
|
|
257
|
+
|
|
258
|
+
const ws = driver.connect();
|
|
259
|
+
ws.binaryType = "arraybuffer";
|
|
260
|
+
|
|
261
|
+
const child = spawn(recorder.command, recorder.args, {
|
|
262
|
+
stdio: ["ignore", "pipe", "pipe"],
|
|
263
|
+
});
|
|
264
|
+
|
|
265
|
+
/** @type {string[]} */
|
|
266
|
+
const recorderStderr = [];
|
|
267
|
+
child.stderr.on("data", (chunk) => {
|
|
268
|
+
recorderStderr.push(chunk.toString("utf8"));
|
|
269
|
+
});
|
|
270
|
+
|
|
271
|
+
child.on("error", (err) => {
|
|
272
|
+
if (stopped) return;
|
|
273
|
+
const suffix =
|
|
274
|
+
/** @type {NodeJS.ErrnoException} */ (err).code === "ENOENT"
|
|
275
|
+
? ` (command "${recorder.command}" not found)`
|
|
276
|
+
: "";
|
|
277
|
+
callbacks.onError(
|
|
278
|
+
new Error(`Recorder failed to start${suffix}: ${err.message}`),
|
|
279
|
+
);
|
|
280
|
+
stop();
|
|
281
|
+
});
|
|
282
|
+
|
|
283
|
+
child.on("exit", (code, signal) => {
|
|
284
|
+
if (stopped) return;
|
|
285
|
+
if (code !== 0 && signal === null) {
|
|
286
|
+
const stderrText = recorderStderr.join("").trim();
|
|
287
|
+
callbacks.onError(
|
|
288
|
+
new Error(
|
|
289
|
+
`Recorder "${recorder.command}" exited with code ${code}${
|
|
290
|
+
stderrText ? `: ${stderrText}` : ""
|
|
291
|
+
}`,
|
|
292
|
+
),
|
|
293
|
+
);
|
|
294
|
+
}
|
|
295
|
+
stop();
|
|
296
|
+
});
|
|
297
|
+
|
|
298
|
+
child.stdout.on("data", (chunk) => {
|
|
299
|
+
if (stopped) return;
|
|
300
|
+
if (ready && ws.readyState === WebSocket.OPEN) {
|
|
301
|
+
sendAudio(chunk);
|
|
302
|
+
} else {
|
|
303
|
+
pendingAudio.push(chunk);
|
|
304
|
+
}
|
|
305
|
+
});
|
|
306
|
+
|
|
307
|
+
ws.addEventListener("open", () => {
|
|
308
|
+
try {
|
|
309
|
+
ws.send(JSON.stringify(driver.buildSetup()));
|
|
310
|
+
} catch (err) {
|
|
311
|
+
callbacks.onError(
|
|
312
|
+
new Error(
|
|
313
|
+
`Failed to send setup message: ${err instanceof Error ? err.message : String(err)}`,
|
|
314
|
+
),
|
|
315
|
+
);
|
|
316
|
+
stop();
|
|
317
|
+
}
|
|
318
|
+
});
|
|
319
|
+
|
|
320
|
+
ws.addEventListener("message", (event) => {
|
|
321
|
+
if (stopped) return;
|
|
322
|
+
let message;
|
|
323
|
+
let raw = "";
|
|
324
|
+
try {
|
|
325
|
+
raw =
|
|
326
|
+
typeof event.data === "string"
|
|
327
|
+
? event.data
|
|
328
|
+
: Buffer.from(/** @type {ArrayBuffer} */ (event.data)).toString(
|
|
329
|
+
"utf8",
|
|
330
|
+
);
|
|
331
|
+
message = JSON.parse(raw);
|
|
332
|
+
} catch (err) {
|
|
333
|
+
callbacks.onError(
|
|
334
|
+
new Error(
|
|
335
|
+
`Failed to parse server message: ${err instanceof Error ? err.message : String(err)}`,
|
|
336
|
+
),
|
|
337
|
+
);
|
|
338
|
+
return;
|
|
339
|
+
}
|
|
340
|
+
if (!isObject(message)) return;
|
|
341
|
+
if (DEBUG) {
|
|
342
|
+
process.stderr.write(`[voiceInput] <- ${raw.slice(0, 800)}\n`);
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
if (message.type === "error" && isObject(message.error)) {
|
|
346
|
+
const detail =
|
|
347
|
+
typeof message.error.message === "string"
|
|
348
|
+
? message.error.message
|
|
349
|
+
: JSON.stringify(message.error);
|
|
350
|
+
callbacks.onError(new Error(`${driver.label} error: ${detail}`));
|
|
351
|
+
return;
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
if (!ready && driver.isReady(message)) {
|
|
355
|
+
ready = true;
|
|
356
|
+
for (const chunk of pendingAudio.splice(0)) {
|
|
357
|
+
if (ws.readyState === WebSocket.OPEN) sendAudio(chunk);
|
|
358
|
+
}
|
|
359
|
+
return;
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
const text = driver.parseTranscript(message);
|
|
363
|
+
if (text !== null) {
|
|
364
|
+
const normalized = normalizer.push(text);
|
|
365
|
+
if (normalized.length > 0) {
|
|
366
|
+
callbacks.onTranscript(normalized);
|
|
367
|
+
}
|
|
368
|
+
}
|
|
369
|
+
});
|
|
370
|
+
|
|
371
|
+
ws.addEventListener("error", (event) => {
|
|
372
|
+
if (stopped) return;
|
|
373
|
+
const message =
|
|
374
|
+
/** @type {{ message?: string }} */ (event).message ?? "WebSocket error";
|
|
375
|
+
callbacks.onError(new Error(`${driver.label} WebSocket error: ${message}`));
|
|
376
|
+
stop();
|
|
377
|
+
});
|
|
378
|
+
|
|
379
|
+
ws.addEventListener("close", (event) => {
|
|
380
|
+
if (!stopped && event.code !== 1000 && event.code !== 1005) {
|
|
381
|
+
const reason = event.reason ? `: ${event.reason}` : "";
|
|
382
|
+
callbacks.onError(
|
|
383
|
+
new Error(
|
|
384
|
+
`${driver.label} WebSocket closed (code ${event.code}${reason})`,
|
|
385
|
+
),
|
|
386
|
+
);
|
|
387
|
+
}
|
|
388
|
+
stopped = true;
|
|
389
|
+
try {
|
|
390
|
+
child.kill("SIGTERM");
|
|
391
|
+
} catch {
|
|
392
|
+
// ignore
|
|
393
|
+
}
|
|
394
|
+
emitClose();
|
|
395
|
+
});
|
|
396
|
+
|
|
397
|
+
/**
 * Wrap one chunk of recorder output in the driver's audio message and send
 * it over the WebSocket as JSON text.
 *
 * @param {Buffer} chunk
 */
function sendAudio(chunk) {
  const encoded = chunk.toString("base64");
  const message = driver.buildAudioMessage(encoded);
  try {
    const frame = JSON.stringify(message);
    ws.send(frame);
  } catch {
    // connection may have just closed
  }
}
|
|
408
|
+
|
|
409
|
+
if (DEBUG) {
|
|
410
|
+
process.stderr.write(
|
|
411
|
+
`[voiceInput] driver=${driver.label} recorder=${recorder.command} ${recorder.args.join(" ")}\n`,
|
|
412
|
+
);
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
/**
 * Shut the session down. The first call sets `stopped`, sends SIGTERM to the
 * recorder process, closes the WebSocket with code 1000 if it is open or
 * still connecting, and then calls `emitClose()`. Later calls return
 * immediately because `stopped` is already set.
 *
 * @returns {Promise<void>}
 */
async function stop() {
  if (stopped) return;
  stopped = true;

  try {
    child.kill("SIGTERM");
  } catch {
    // ignore
  }

  const closableStates = [WebSocket.OPEN, WebSocket.CONNECTING];
  if (closableStates.includes(ws.readyState)) {
    try {
      ws.close(1000, "client stop");
    } catch {
      // ignore
    }
  }

  emitClose();
}
|
|
438
|
+
|
|
439
|
+
return { stop };
|
|
440
|
+
}
|
|
441
|
+
|
|
442
|
+
/**
|
|
443
|
+
* @typedef {Object} VoiceDriver
|
|
444
|
+
* @property {string} label
|
|
445
|
+
* @property {number} sampleRate
|
|
446
|
+
* @property {() => WebSocket} connect
|
|
447
|
+
* @property {() => object} buildSetup
|
|
448
|
+
* @property {(message: Record<string, unknown>) => boolean} isReady
|
|
449
|
+
* @property {(base64: string) => object} buildAudioMessage
|
|
450
|
+
* @property {(message: Record<string, unknown>) => string | null} parseTranscript
|
|
451
|
+
*/
|
|
452
|
+
|
|
453
|
+
/**
 * Pick the provider-specific driver factory based on `config.provider`.
 * Throws an Error for any provider other than "openai" or "gemini".
 *
 * @param {VoiceInputConfig} config
 * @returns {VoiceDriver}
 */
function createDriver(config) {
  switch (config.provider) {
    case "openai":
      return createOpenAIDriver(config);
    case "gemini":
      return createGeminiDriver(config);
    default: {
      const { provider } = /** @type {{provider: string}} */ (config);
      throw new Error(`Unsupported voiceInput.provider: ${provider}`);
    }
  }
}
|
|
468
|
+
|
|
469
|
+
const OPENAI_DEFAULT_MODEL = "gpt-4o-transcribe";
const OPENAI_DEFAULT_WS = "wss://api.openai.com/v1/realtime";
const OPENAI_SAMPLE_RATE = 24000;

/**
 * Create the voice driver for the OpenAI Realtime transcription endpoint.
 *
 * `config.model` and `config.baseURL` fall back to the module defaults when
 * they are null/undefined. `config.language`, when set, is forwarded in the
 * transcription settings of the setup message.
 *
 * @param {VoiceInputOpenAIConfig} config
 * @returns {VoiceDriver}
 */
function createOpenAIDriver(config) {
  const model = config.model ?? OPENAI_DEFAULT_MODEL;
  const base = config.baseURL ?? OPENAI_DEFAULT_WS;
  return {
    label: "OpenAI Realtime",
    sampleRate: OPENAI_SAMPLE_RATE,
    connect() {
      // Node's global WebSocket (undici) accepts a non-standard `headers`
      // option. The built-in typings only declare the standards-compliant
      // constructor, so cast through `WebSocket`-as-constructor.
      const Ctor =
        /** @type {new (url: string, opts?: unknown) => WebSocket} */ (
          /** @type {unknown} */ (WebSocket)
        );
      // Merge the query parameter through URL instead of appending
      // "?intent=…" so a baseURL that already carries a query string
      // (e.g. a proxy URL) still produces a well-formed address.
      const url = new URL(base);
      url.searchParams.set("intent", "transcription");
      return new Ctor(url.href, {
        headers: {
          Authorization: `Bearer ${config.apiKey}`,
          "OpenAI-Beta": "realtime=v1",
        },
      });
    },
    buildSetup() {
      /** @type {{ model: string, language?: string }} */
      const transcription = { model };
      if (config.language) transcription.language = config.language;
      // The `?intent=transcription` endpoint uses the flat transcription-session
      // schema, not the nested `session.audio.input.*` realtime schema.
      return {
        type: "transcription_session.update",
        session: {
          input_audio_format: "pcm16",
          input_audio_transcription: transcription,
          turn_detection: { type: "server_vad" },
        },
      };
    },
    isReady(message) {
      // Either the initial "created" event or the "updated" acknowledgement
      // is treated as the signal that audio may be sent.
      return (
        message.type === "transcription_session.created" ||
        message.type === "transcription_session.updated"
      );
    },
    buildAudioMessage(base64) {
      return { type: "input_audio_buffer.append", audio: base64 };
    },
    parseTranscript(message) {
      // Only non-empty incremental delta events yield text; every other
      // message maps to null.
      if (
        message.type === "conversation.item.input_audio_transcription.delta" &&
        typeof message.delta === "string" &&
        message.delta.length > 0
      ) {
        return message.delta;
      }
      return null;
    },
  };
}
|
|
534
|
+
|
|
535
|
+
const GEMINI_DEFAULT_MODEL = "gemini-3.1-flash-live-preview";
const GEMINI_DEFAULT_WS =
  "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent";
const GEMINI_SAMPLE_RATE = 16000;

/**
 * Create the voice driver for the Gemini Live WebSocket API.
 *
 * `config.model` and `config.baseURL` fall back to the module defaults when
 * they are null/undefined. The API key is sent as the `key` query parameter.
 *
 * @param {VoiceInputGeminiConfig} config
 * @returns {VoiceDriver}
 */
function createGeminiDriver(config) {
  const model = config.model ?? GEMINI_DEFAULT_MODEL;
  const base = config.baseURL ?? GEMINI_DEFAULT_WS;
  return {
    label: "Gemini Live",
    sampleRate: GEMINI_SAMPLE_RATE,
    connect() {
      // Merge the key through URL instead of appending "?key=…" so a baseURL
      // that already carries a query string still yields a well-formed
      // address; searchParams also takes care of escaping the key.
      const url = new URL(base);
      url.searchParams.set("key", config.apiKey);
      return new WebSocket(url.href);
    },
    buildSetup() {
      // Gemini Live was designed for voice agents, not pure STT.
      // Force maxOutputTokens: 1 and disable thinking on 2.5 models
      // to minimise wasted audio output.

      /** @type {Record<string, unknown>} */
      const generationConfig = {
        // https://ai.google.dev/gemini-api/docs/live-api/capabilities#response-modalities
        // > The native audio models only support `AUDIO` response modality.
        responseModalities: ["AUDIO"],
        maxOutputTokens: 1,
      };
      if (model.includes("2.5")) {
        generationConfig.thinkingConfig = { thinkingBudget: 0 };
      }
      /** @type {Record<string, unknown>} */
      const setup = {
        model: `models/${model}`,
        generationConfig,
        inputAudioTranscription: {},
      };
      if (config.language) {
        // The language is passed as a system-instruction hint in this setup
        // message; no dedicated language field is set here.
        setup.systemInstruction = {
          parts: [{ text: `The user is speaking in ${config.language}.` }],
        };
      }
      return { setup };
    },
    isReady(message) {
      // Any message carrying a `setupComplete` key counts as ready.
      return "setupComplete" in message;
    },
    buildAudioMessage(base64) {
      return {
        realtimeInput: {
          audio: {
            data: base64,
            mimeType: `audio/pcm;rate=${GEMINI_SAMPLE_RATE}`,
          },
        },
      };
    },
    parseTranscript(message) {
      // Text is read from serverContent.inputTranscription.text; anything
      // else (missing objects, non-string or empty text) maps to null.
      const serverContent = message.serverContent;
      if (!isObject(serverContent)) return null;
      const t = serverContent.inputTranscription;
      if (isObject(t) && typeof t.text === "string" && t.text.length > 0) {
        return t.text;
      }
      return null;
    },
  };
}
|
|
605
|
+
|
|
606
|
+
/**
 * Streaming filter that removes whitespace found between two CJK characters.
 * Some providers return Japanese transcripts with morpheme-separating spaces
 * ("そう 、 声 で"); spaces between scripts, as in "Windows を使う", are kept.
 *
 * Spaces are held back until the next non-space character arrives, so the
 * decision also works when the space and its neighbours come in separate
 * `push` calls. `flush` returns any spaces still held and resets the state.
 *
 * @returns {{ push: (text: string) => string, flush: () => string }}
 */
export function createCJKSpaceNormalizer() {
  const SPACE_CHARS = [" ", "\t", "\u3000"];
  let lastEmitted = "";
  let heldSpaces = "";

  /**
   * @param {string} text
   * @returns {string}
   */
  function push(text) {
    const pieces = [];
    for (const ch of text) {
      if (SPACE_CHARS.includes(ch)) {
        heldSpaces += ch;
        continue;
      }
      if (heldSpaces !== "") {
        // Keep the held spaces unless both neighbours are CJK.
        if (!isCJKChar(lastEmitted) || !isCJKChar(ch)) {
          pieces.push(heldSpaces);
        }
        heldSpaces = "";
      }
      pieces.push(ch);
      lastEmitted = ch;
    }
    return pieces.join("");
  }

  /**
   * @returns {string}
   */
  function flush() {
    const leftover = heldSpaces;
    heldSpaces = "";
    lastEmitted = "";
    return leftover;
  }

  return { push, flush };
}
|
|
646
|
+
|
|
647
|
+
/**
 * Report whether the first code point of `ch` lies in one of the inclusive
 * code-point ranges this module treats as CJK. An empty string yields false.
 *
 * @param {string} ch
 * @returns {boolean}
 */
function isCJKChar(ch) {
  /** @type {ReadonlyArray<readonly [number, number]>} */
  const ranges = [
    [0x3000, 0x33ff],
    [0x3400, 0x4dbf],
    [0x4e00, 0x9fff],
    [0xac00, 0xd7af],
    [0xf900, 0xfaff],
    [0xff00, 0xffef],
    [0x20000, 0x2ffff],
  ];
  const codePoint = ch.codePointAt(0);
  if (codePoint === undefined) {
    return false;
  }
  return ranges.some(([low, high]) => low <= codePoint && codePoint <= high);
}
|
|
664
|
+
|
|
665
|
+
/**
 * Type guard: true for any non-null value whose `typeof` is "object"
 * (arrays included).
 *
 * @param {unknown} value
 * @returns {value is Record<string, unknown>}
 */
function isObject(value) {
  if (value === null) {
    return false;
  }
  return typeof value === "object";
}
|