@iinm/plain-agent 1.7.18 → 1.7.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +57 -83
- package/config/config.predefined.json +15 -15
- package/package.json +1 -3
- package/src/agentLoop.mjs +3 -1
- package/src/cliArgs.mjs +31 -1
- package/src/cliBatch.mjs +22 -0
- package/src/cliCost.mjs +309 -0
- package/src/cliFormatter.mjs +1 -1
- package/src/cliInteractive.mjs +29 -1
- package/src/config.d.ts +2 -2
- package/src/config.mjs +1 -1
- package/src/costTracker.mjs +58 -19
- package/src/env.mjs +9 -6
- package/src/main.mjs +17 -6
- package/src/model.d.ts +1 -1
- package/src/tools/patchFile.mjs +11 -12
- package/src/usageStore.mjs +167 -0
- package/src/utils/notify.mjs +3 -2
- package/src/voiceInput.mjs +24 -634
- package/src/voiceInputGemini.mjs +105 -0
- package/src/voiceInputOpenAI.mjs +104 -0
- package/src/voiceInputSession.mjs +543 -0
- package/src/voiceToggleKey.mjs +62 -0
- package/bin/plain-notify-terminal-bell +0 -3
package/src/voiceInput.mjs
CHANGED
|
@@ -1,201 +1,45 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { startGeminiVoiceSession } from "./voiceInputGemini.mjs";
|
|
2
|
+
import { startOpenAIVoiceSession } from "./voiceInputOpenAI.mjs";
|
|
3
|
+
import { failVoiceSessionAsync } from "./voiceInputSession.mjs";
|
|
2
4
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
* @property {"openai"} provider
|
|
10
|
-
* @property {string} apiKey
|
|
11
|
-
* @property {string} [model] - Defaults to "gpt-4o-transcribe".
|
|
12
|
-
* @property {string} [language] - ISO-639-1 code (e.g. "ja", "en"). Improves accuracy and latency when set.
|
|
13
|
-
* @property {string} [baseURL]
|
|
14
|
-
* @property {VoiceRecorderConfig} [recorder]
|
|
15
|
-
* @property {string} [toggleKey] - "ctrl-<char>". Defaults to "ctrl-o".
|
|
16
|
-
*/
|
|
17
|
-
|
|
18
|
-
/**
|
|
19
|
-
* @typedef {Object} VoiceInputGeminiConfig
|
|
20
|
-
* @property {"gemini"} provider
|
|
21
|
-
* @property {string} apiKey
|
|
22
|
-
* @property {string} [model] - Defaults to "gemini-3.1-flash-live-preview".
|
|
23
|
-
* @property {string} [language] - ISO-639-1 code (e.g. "ja", "en"). Passed to the model as a system instruction since Gemini Live has no native language hint for input transcription.
|
|
24
|
-
* @property {string} [baseURL]
|
|
25
|
-
* @property {VoiceRecorderConfig} [recorder]
|
|
26
|
-
* @property {string} [toggleKey]
|
|
27
|
-
*/
|
|
28
|
-
|
|
29
|
-
/**
|
|
30
|
-
* @typedef {Object} VoiceRecorderConfig
|
|
31
|
-
* @property {string} command
|
|
32
|
-
* @property {string[]} args
|
|
33
|
-
* Must write raw 16-bit little-endian mono PCM to stdout at the sample
|
|
34
|
-
* rate required by the chosen provider (24 kHz for OpenAI, 16 kHz for
|
|
35
|
-
* Gemini).
|
|
36
|
-
*/
|
|
5
|
+
export {
|
|
6
|
+
createCJKSpaceNormalizer,
|
|
7
|
+
detectRecorder,
|
|
8
|
+
getRecorderCandidates,
|
|
9
|
+
} from "./voiceInputSession.mjs";
|
|
10
|
+
export { parseVoiceToggleKey } from "./voiceToggleKey.mjs";
|
|
37
11
|
|
|
38
12
|
/**
|
|
39
|
-
* @typedef {
|
|
40
|
-
* @property {(text: string) => void} onTranscript
|
|
41
|
-
* @property {(error: Error) => void} onError
|
|
42
|
-
* @property {() => void} [onClose]
|
|
13
|
+
* @typedef {import("./voiceInputSession.mjs").VoiceRecorderConfig} VoiceRecorderConfig
|
|
43
14
|
*/
|
|
44
15
|
|
|
45
16
|
/**
|
|
46
|
-
* @typedef {
|
|
47
|
-
* @property {() => Promise<void>} stop
|
|
17
|
+
* @typedef {import("./voiceInputSession.mjs").VoiceSessionCallbacks} VoiceSessionCallbacks
|
|
48
18
|
*/
|
|
49
19
|
|
|
50
|
-
const DEBUG = process.env.PLAIN_VOICE_DEBUG === "1";
|
|
51
|
-
|
|
52
|
-
// Bytes reserved for other terminal/readline uses — cannot be used as a voice toggle.
|
|
53
|
-
// 0x03 = Ctrl-C (SIGINT)
|
|
54
|
-
// 0x04 = Ctrl-D (EOF / readline exit)
|
|
55
|
-
// 0x09 = Ctrl-I (Tab)
|
|
56
|
-
// 0x0a = Ctrl-J (LF / Enter)
|
|
57
|
-
// 0x0d = Ctrl-M (CR / Enter)
|
|
58
|
-
// 0x11 = Ctrl-Q (XON: resume terminal output)
|
|
59
|
-
// 0x13 = Ctrl-S (XOFF: suspend terminal output)
|
|
60
|
-
const RESERVED_TERMINAL_BYTES = new Set([
|
|
61
|
-
0x03, 0x04, 0x09, 0x0a, 0x0d, 0x11, 0x13,
|
|
62
|
-
]);
|
|
63
|
-
|
|
64
20
|
/**
|
|
65
|
-
* @typedef {
|
|
66
|
-
* @property {number} byte
|
|
67
|
-
* @property {string} label
|
|
21
|
+
* @typedef {import("./voiceInputSession.mjs").VoiceSession} VoiceSession
|
|
68
22
|
*/
|
|
69
23
|
|
|
70
24
|
/**
|
|
71
|
-
*
|
|
72
|
-
* raw mode. Only Ctrl-<char> is supported because it is the only family
|
|
73
|
-
* the pre-readline pipeline can recognize without a full key decoder.
|
|
74
|
-
*
|
|
75
|
-
* @param {string | undefined} spec
|
|
76
|
-
* @returns {VoiceToggleKey}
|
|
25
|
+
* @typedef {import("./voiceToggleKey.mjs").VoiceToggleKey} VoiceToggleKey
|
|
77
26
|
*/
|
|
78
|
-
export function parseVoiceToggleKey(spec) {
|
|
79
|
-
const raw = (spec ?? "ctrl-o").trim().toLowerCase();
|
|
80
|
-
|
|
81
|
-
const match = /^ctrl-(.)$/.exec(raw);
|
|
82
|
-
if (!match) {
|
|
83
|
-
throw new Error(
|
|
84
|
-
`Invalid voiceInput.toggleKey "${spec}". Expected "ctrl-<char>".`,
|
|
85
|
-
);
|
|
86
|
-
}
|
|
87
|
-
|
|
88
|
-
const ch = match[1];
|
|
89
|
-
const code = ch.charCodeAt(0);
|
|
90
|
-
|
|
91
|
-
// Subtracting a fixed offset from the character's ASCII code yields the
|
|
92
|
-
// control byte (0x01–0x1f) the terminal sends for that Ctrl combination.
|
|
93
|
-
let byte;
|
|
94
|
-
if (code >= 0x61 && code <= 0x7a) {
|
|
95
|
-
// a–z (0x61–0x7a): subtract 0x60 → 0x01 (Ctrl-A) – 0x1a (Ctrl-Z)
|
|
96
|
-
byte = code - 0x60;
|
|
97
|
-
} else if (code >= 0x5b && code <= 0x5f) {
|
|
98
|
-
// [ \ ] ^ _ (0x5b–0x5f): subtract 0x40 → 0x1b (Ctrl-[) – 0x1f (Ctrl-_)
|
|
99
|
-
byte = code - 0x40;
|
|
100
|
-
} else {
|
|
101
|
-
throw new Error(
|
|
102
|
-
`Unsupported voiceInput.toggleKey "${spec}". Use ctrl-<letter> or ctrl-<[ \\ ] ^ _>.`,
|
|
103
|
-
);
|
|
104
|
-
}
|
|
105
|
-
|
|
106
|
-
if (RESERVED_TERMINAL_BYTES.has(byte)) {
|
|
107
|
-
throw new Error(
|
|
108
|
-
`voiceInput.toggleKey "${spec}" conflicts with a reserved terminal/readline key.`,
|
|
109
|
-
);
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
return { byte, label: `Ctrl-${ch.toUpperCase()}` };
|
|
113
|
-
}
|
|
114
27
|
|
|
115
28
|
/**
|
|
116
|
-
* @
|
|
117
|
-
* @returns {VoiceRecorderConfig[]}
|
|
29
|
+
* @typedef {import("./voiceInputOpenAI.mjs").VoiceInputOpenAIConfig} VoiceInputOpenAIConfig
|
|
118
30
|
*/
|
|
119
|
-
export function getRecorderCandidates(sampleRate) {
|
|
120
|
-
const rate = String(sampleRate);
|
|
121
|
-
const isMac = process.platform === "darwin";
|
|
122
|
-
/** @type {VoiceRecorderConfig[]} */
|
|
123
|
-
const candidates = [];
|
|
124
|
-
|
|
125
|
-
if (!isMac) {
|
|
126
|
-
candidates.push({
|
|
127
|
-
command: "arecord",
|
|
128
|
-
args: ["-q", "-f", "S16_LE", "-c", "1", "-r", rate, "-t", "raw"],
|
|
129
|
-
});
|
|
130
|
-
}
|
|
131
|
-
|
|
132
|
-
candidates.push({
|
|
133
|
-
command: "sox",
|
|
134
|
-
args: [
|
|
135
|
-
"-q",
|
|
136
|
-
"-d",
|
|
137
|
-
"-b",
|
|
138
|
-
"16",
|
|
139
|
-
"-c",
|
|
140
|
-
"1",
|
|
141
|
-
"-r",
|
|
142
|
-
rate,
|
|
143
|
-
"-e",
|
|
144
|
-
"signed-integer",
|
|
145
|
-
"-t",
|
|
146
|
-
"raw",
|
|
147
|
-
"-",
|
|
148
|
-
],
|
|
149
|
-
});
|
|
150
|
-
|
|
151
|
-
const ffmpegInput = isMac
|
|
152
|
-
? ["-f", "avfoundation", "-i", ":0"]
|
|
153
|
-
: ["-f", "alsa", "-i", "default"];
|
|
154
|
-
candidates.push({
|
|
155
|
-
command: "ffmpeg",
|
|
156
|
-
args: [
|
|
157
|
-
"-hide_banner",
|
|
158
|
-
"-loglevel",
|
|
159
|
-
"error",
|
|
160
|
-
...ffmpegInput,
|
|
161
|
-
"-ac",
|
|
162
|
-
"1",
|
|
163
|
-
"-ar",
|
|
164
|
-
rate,
|
|
165
|
-
"-f",
|
|
166
|
-
"s16le",
|
|
167
|
-
"-",
|
|
168
|
-
],
|
|
169
|
-
});
|
|
170
|
-
|
|
171
|
-
return candidates;
|
|
172
|
-
}
|
|
173
31
|
|
|
174
32
|
/**
|
|
175
|
-
* @
|
|
176
|
-
* @returns {VoiceRecorderConfig | null}
|
|
33
|
+
* @typedef {import("./voiceInputGemini.mjs").VoiceInputGeminiConfig} VoiceInputGeminiConfig
|
|
177
34
|
*/
|
|
178
|
-
export function detectRecorder(candidates) {
|
|
179
|
-
return candidates.find((c) => isCommandAvailable(c.command)) ?? null;
|
|
180
|
-
}
|
|
181
35
|
|
|
182
36
|
/**
|
|
183
|
-
* @
|
|
37
|
+
* @typedef {VoiceInputOpenAIConfig | VoiceInputGeminiConfig} VoiceInputConfig
|
|
184
38
|
*/
|
|
185
|
-
function isCommandAvailable(command) {
|
|
186
|
-
if (process.platform === "win32") {
|
|
187
|
-
const result = spawnSync("where", [command], { stdio: "ignore" });
|
|
188
|
-
return result.status === 0;
|
|
189
|
-
}
|
|
190
|
-
const result = spawnSync("sh", ["-c", `command -v ${command}`], {
|
|
191
|
-
stdio: "ignore",
|
|
192
|
-
});
|
|
193
|
-
return result.status === 0;
|
|
194
|
-
}
|
|
195
39
|
|
|
196
40
|
/**
|
|
197
|
-
* Start a voice input session.
|
|
198
|
-
*
|
|
41
|
+
* Start a voice input session. Dispatches to the provider-specific
|
|
42
|
+
* implementation based on `config.provider`.
|
|
199
43
|
*
|
|
200
44
|
* @param {object} options
|
|
201
45
|
* @param {VoiceInputConfig} options.config
|
|
@@ -203,469 +47,15 @@ function isCommandAvailable(command) {
|
|
|
203
47
|
* @returns {VoiceSession}
|
|
204
48
|
*/
|
|
205
49
|
export function startVoiceSession({ config, callbacks }) {
|
|
206
|
-
/**
|
|
207
|
-
* Report an error asynchronously and return an already-terminated session.
|
|
208
|
-
* @param {Error} error
|
|
209
|
-
* @returns {VoiceSession}
|
|
210
|
-
*/
|
|
211
|
-
function failAsync(error) {
|
|
212
|
-
queueMicrotask(() => {
|
|
213
|
-
callbacks.onError(error);
|
|
214
|
-
callbacks.onClose?.();
|
|
215
|
-
});
|
|
216
|
-
return { stop: async () => {} };
|
|
217
|
-
}
|
|
218
|
-
|
|
219
|
-
/** @type {VoiceDriver} */
|
|
220
|
-
let driver;
|
|
221
|
-
try {
|
|
222
|
-
driver = createDriver(config);
|
|
223
|
-
} catch (err) {
|
|
224
|
-
return failAsync(err instanceof Error ? err : new Error(String(err)));
|
|
225
|
-
}
|
|
226
|
-
|
|
227
|
-
const recorder =
|
|
228
|
-
config.recorder ?? detectRecorder(getRecorderCandidates(driver.sampleRate));
|
|
229
|
-
if (!recorder) {
|
|
230
|
-
return failAsync(
|
|
231
|
-
new Error(
|
|
232
|
-
"No voice recorder found. Install arecord, sox, or ffmpeg (or set `voiceInput.recorder`).",
|
|
233
|
-
),
|
|
234
|
-
);
|
|
235
|
-
}
|
|
236
|
-
|
|
237
|
-
if (!isCommandAvailable(recorder.command)) {
|
|
238
|
-
return failAsync(
|
|
239
|
-
new Error(
|
|
240
|
-
`Voice recorder command "${recorder.command}" not found on PATH.`,
|
|
241
|
-
),
|
|
242
|
-
);
|
|
243
|
-
}
|
|
244
|
-
|
|
245
|
-
let stopped = false;
|
|
246
|
-
let closeEmitted = false;
|
|
247
|
-
let ready = false;
|
|
248
|
-
/** @type {Buffer[]} */
|
|
249
|
-
const pendingAudio = [];
|
|
250
|
-
const normalizer = createCJKSpaceNormalizer();
|
|
251
|
-
|
|
252
|
-
const emitClose = () => {
|
|
253
|
-
if (closeEmitted) return;
|
|
254
|
-
closeEmitted = true;
|
|
255
|
-
callbacks.onClose?.();
|
|
256
|
-
};
|
|
257
|
-
|
|
258
|
-
const ws = driver.connect();
|
|
259
|
-
ws.binaryType = "arraybuffer";
|
|
260
|
-
|
|
261
|
-
const child = spawn(recorder.command, recorder.args, {
|
|
262
|
-
stdio: ["ignore", "pipe", "pipe"],
|
|
263
|
-
});
|
|
264
|
-
|
|
265
|
-
/** @type {string[]} */
|
|
266
|
-
const recorderStderr = [];
|
|
267
|
-
child.stderr.on("data", (chunk) => {
|
|
268
|
-
recorderStderr.push(chunk.toString("utf8"));
|
|
269
|
-
});
|
|
270
|
-
|
|
271
|
-
child.on("error", (err) => {
|
|
272
|
-
if (stopped) return;
|
|
273
|
-
const suffix =
|
|
274
|
-
/** @type {NodeJS.ErrnoException} */ (err).code === "ENOENT"
|
|
275
|
-
? ` (command "${recorder.command}" not found)`
|
|
276
|
-
: "";
|
|
277
|
-
callbacks.onError(
|
|
278
|
-
new Error(`Recorder failed to start${suffix}: ${err.message}`),
|
|
279
|
-
);
|
|
280
|
-
stop();
|
|
281
|
-
});
|
|
282
|
-
|
|
283
|
-
child.on("exit", (code, signal) => {
|
|
284
|
-
if (stopped) return;
|
|
285
|
-
if (code !== 0 && signal === null) {
|
|
286
|
-
const stderrText = recorderStderr.join("").trim();
|
|
287
|
-
callbacks.onError(
|
|
288
|
-
new Error(
|
|
289
|
-
`Recorder "${recorder.command}" exited with code ${code}${
|
|
290
|
-
stderrText ? `: ${stderrText}` : ""
|
|
291
|
-
}`,
|
|
292
|
-
),
|
|
293
|
-
);
|
|
294
|
-
}
|
|
295
|
-
stop();
|
|
296
|
-
});
|
|
297
|
-
|
|
298
|
-
child.stdout.on("data", (chunk) => {
|
|
299
|
-
if (stopped) return;
|
|
300
|
-
if (ready && ws.readyState === WebSocket.OPEN) {
|
|
301
|
-
sendAudio(chunk);
|
|
302
|
-
} else {
|
|
303
|
-
pendingAudio.push(chunk);
|
|
304
|
-
}
|
|
305
|
-
});
|
|
306
|
-
|
|
307
|
-
ws.addEventListener("open", () => {
|
|
308
|
-
try {
|
|
309
|
-
ws.send(JSON.stringify(driver.buildSetup()));
|
|
310
|
-
} catch (err) {
|
|
311
|
-
callbacks.onError(
|
|
312
|
-
new Error(
|
|
313
|
-
`Failed to send setup message: ${err instanceof Error ? err.message : String(err)}`,
|
|
314
|
-
),
|
|
315
|
-
);
|
|
316
|
-
stop();
|
|
317
|
-
}
|
|
318
|
-
});
|
|
319
|
-
|
|
320
|
-
ws.addEventListener("message", (event) => {
|
|
321
|
-
if (stopped) return;
|
|
322
|
-
let message;
|
|
323
|
-
let raw = "";
|
|
324
|
-
try {
|
|
325
|
-
raw =
|
|
326
|
-
typeof event.data === "string"
|
|
327
|
-
? event.data
|
|
328
|
-
: Buffer.from(/** @type {ArrayBuffer} */ (event.data)).toString(
|
|
329
|
-
"utf8",
|
|
330
|
-
);
|
|
331
|
-
message = JSON.parse(raw);
|
|
332
|
-
} catch (err) {
|
|
333
|
-
callbacks.onError(
|
|
334
|
-
new Error(
|
|
335
|
-
`Failed to parse server message: ${err instanceof Error ? err.message : String(err)}`,
|
|
336
|
-
),
|
|
337
|
-
);
|
|
338
|
-
return;
|
|
339
|
-
}
|
|
340
|
-
if (!isObject(message)) return;
|
|
341
|
-
if (DEBUG) {
|
|
342
|
-
process.stderr.write(`[voiceInput] <- ${raw.slice(0, 800)}\n`);
|
|
343
|
-
}
|
|
344
|
-
|
|
345
|
-
if (message.type === "error" && isObject(message.error)) {
|
|
346
|
-
const detail =
|
|
347
|
-
typeof message.error.message === "string"
|
|
348
|
-
? message.error.message
|
|
349
|
-
: JSON.stringify(message.error);
|
|
350
|
-
callbacks.onError(new Error(`${driver.label} error: ${detail}`));
|
|
351
|
-
return;
|
|
352
|
-
}
|
|
353
|
-
|
|
354
|
-
if (!ready && driver.isReady(message)) {
|
|
355
|
-
ready = true;
|
|
356
|
-
for (const chunk of pendingAudio.splice(0)) {
|
|
357
|
-
if (ws.readyState === WebSocket.OPEN) sendAudio(chunk);
|
|
358
|
-
}
|
|
359
|
-
return;
|
|
360
|
-
}
|
|
361
|
-
|
|
362
|
-
const text = driver.parseTranscript(message);
|
|
363
|
-
if (text !== null) {
|
|
364
|
-
const normalized = normalizer.push(text);
|
|
365
|
-
if (normalized.length > 0) {
|
|
366
|
-
callbacks.onTranscript(normalized);
|
|
367
|
-
}
|
|
368
|
-
}
|
|
369
|
-
});
|
|
370
|
-
|
|
371
|
-
ws.addEventListener("error", (event) => {
|
|
372
|
-
if (stopped) return;
|
|
373
|
-
const message =
|
|
374
|
-
/** @type {{ message?: string }} */ (event).message ?? "WebSocket error";
|
|
375
|
-
callbacks.onError(new Error(`${driver.label} WebSocket error: ${message}`));
|
|
376
|
-
stop();
|
|
377
|
-
});
|
|
378
|
-
|
|
379
|
-
ws.addEventListener("close", (event) => {
|
|
380
|
-
if (!stopped && event.code !== 1000 && event.code !== 1005) {
|
|
381
|
-
const reason = event.reason ? `: ${event.reason}` : "";
|
|
382
|
-
callbacks.onError(
|
|
383
|
-
new Error(
|
|
384
|
-
`${driver.label} WebSocket closed (code ${event.code}${reason})`,
|
|
385
|
-
),
|
|
386
|
-
);
|
|
387
|
-
}
|
|
388
|
-
stopped = true;
|
|
389
|
-
try {
|
|
390
|
-
child.kill("SIGTERM");
|
|
391
|
-
} catch {
|
|
392
|
-
// ignore
|
|
393
|
-
}
|
|
394
|
-
emitClose();
|
|
395
|
-
});
|
|
396
|
-
|
|
397
|
-
/**
|
|
398
|
-
* @param {Buffer} chunk
|
|
399
|
-
*/
|
|
400
|
-
function sendAudio(chunk) {
|
|
401
|
-
const payload = driver.buildAudioMessage(chunk.toString("base64"));
|
|
402
|
-
try {
|
|
403
|
-
ws.send(JSON.stringify(payload));
|
|
404
|
-
} catch {
|
|
405
|
-
// connection may have just closed
|
|
406
|
-
}
|
|
407
|
-
}
|
|
408
|
-
|
|
409
|
-
if (DEBUG) {
|
|
410
|
-
process.stderr.write(
|
|
411
|
-
`[voiceInput] driver=${driver.label} recorder=${recorder.command} ${recorder.args.join(" ")}\n`,
|
|
412
|
-
);
|
|
413
|
-
}
|
|
414
|
-
|
|
415
|
-
/**
|
|
416
|
-
* @returns {Promise<void>}
|
|
417
|
-
*/
|
|
418
|
-
async function stop() {
|
|
419
|
-
if (stopped) return;
|
|
420
|
-
stopped = true;
|
|
421
|
-
try {
|
|
422
|
-
child.kill("SIGTERM");
|
|
423
|
-
} catch {
|
|
424
|
-
// ignore
|
|
425
|
-
}
|
|
426
|
-
if (
|
|
427
|
-
ws.readyState === WebSocket.OPEN ||
|
|
428
|
-
ws.readyState === WebSocket.CONNECTING
|
|
429
|
-
) {
|
|
430
|
-
try {
|
|
431
|
-
ws.close(1000, "client stop");
|
|
432
|
-
} catch {
|
|
433
|
-
// ignore
|
|
434
|
-
}
|
|
435
|
-
}
|
|
436
|
-
emitClose();
|
|
437
|
-
}
|
|
438
|
-
|
|
439
|
-
return { stop };
|
|
440
|
-
}
|
|
441
|
-
|
|
442
|
-
/**
|
|
443
|
-
* @typedef {Object} VoiceDriver
|
|
444
|
-
* @property {string} label
|
|
445
|
-
* @property {number} sampleRate
|
|
446
|
-
* @property {() => WebSocket} connect
|
|
447
|
-
* @property {() => object} buildSetup
|
|
448
|
-
* @property {(message: Record<string, unknown>) => boolean} isReady
|
|
449
|
-
* @property {(base64: string) => object} buildAudioMessage
|
|
450
|
-
* @property {(message: Record<string, unknown>) => string | null} parseTranscript
|
|
451
|
-
*/
|
|
452
|
-
|
|
453
|
-
/**
|
|
454
|
-
* @param {VoiceInputConfig} config
|
|
455
|
-
* @returns {VoiceDriver}
|
|
456
|
-
*/
|
|
457
|
-
function createDriver(config) {
|
|
458
50
|
if (config.provider === "openai") {
|
|
459
|
-
return
|
|
51
|
+
return startOpenAIVoiceSession({ config, callbacks });
|
|
460
52
|
}
|
|
461
53
|
if (config.provider === "gemini") {
|
|
462
|
-
return
|
|
54
|
+
return startGeminiVoiceSession({ config, callbacks });
|
|
463
55
|
}
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
}
|
|
468
|
-
|
|
469
|
-
const OPENAI_DEFAULT_MODEL = "gpt-4o-transcribe";
|
|
470
|
-
const OPENAI_DEFAULT_WS = "wss://api.openai.com/v1/realtime";
|
|
471
|
-
const OPENAI_SAMPLE_RATE = 24000;
|
|
472
|
-
|
|
473
|
-
/**
|
|
474
|
-
* @param {VoiceInputOpenAIConfig} config
|
|
475
|
-
* @returns {VoiceDriver}
|
|
476
|
-
*/
|
|
477
|
-
function createOpenAIDriver(config) {
|
|
478
|
-
const model = config.model ?? OPENAI_DEFAULT_MODEL;
|
|
479
|
-
const base = config.baseURL ?? OPENAI_DEFAULT_WS;
|
|
480
|
-
return {
|
|
481
|
-
label: "OpenAI Realtime",
|
|
482
|
-
sampleRate: OPENAI_SAMPLE_RATE,
|
|
483
|
-
connect() {
|
|
484
|
-
// Node's global WebSocket (undici) accepts a non-standard `headers`
|
|
485
|
-
// option. The built-in typings only declare the standards-compliant
|
|
486
|
-
// constructor, so cast through `WebSocket`-as-constructor.
|
|
487
|
-
const Ctor =
|
|
488
|
-
/** @type {new (url: string, opts?: unknown) => WebSocket} */ (
|
|
489
|
-
/** @type {unknown} */ (WebSocket)
|
|
490
|
-
);
|
|
491
|
-
return new Ctor(`${base}?intent=transcription`, {
|
|
492
|
-
headers: {
|
|
493
|
-
Authorization: `Bearer ${config.apiKey}`,
|
|
494
|
-
"OpenAI-Beta": "realtime=v1",
|
|
495
|
-
},
|
|
496
|
-
});
|
|
497
|
-
},
|
|
498
|
-
buildSetup() {
|
|
499
|
-
/** @type {{ model: string, language?: string }} */
|
|
500
|
-
const transcription = { model };
|
|
501
|
-
if (config.language) transcription.language = config.language;
|
|
502
|
-
// The `?intent=transcription` endpoint uses the flat transcription-session
|
|
503
|
-
// schema, not the nested `session.audio.input.*` realtime schema.
|
|
504
|
-
return {
|
|
505
|
-
type: "transcription_session.update",
|
|
506
|
-
session: {
|
|
507
|
-
input_audio_format: "pcm16",
|
|
508
|
-
input_audio_transcription: transcription,
|
|
509
|
-
turn_detection: { type: "server_vad" },
|
|
510
|
-
},
|
|
511
|
-
};
|
|
512
|
-
},
|
|
513
|
-
isReady(message) {
|
|
514
|
-
return (
|
|
515
|
-
message.type === "transcription_session.created" ||
|
|
516
|
-
message.type === "transcription_session.updated"
|
|
517
|
-
);
|
|
518
|
-
},
|
|
519
|
-
buildAudioMessage(base64) {
|
|
520
|
-
return { type: "input_audio_buffer.append", audio: base64 };
|
|
521
|
-
},
|
|
522
|
-
parseTranscript(message) {
|
|
523
|
-
if (
|
|
524
|
-
message.type === "conversation.item.input_audio_transcription.delta" &&
|
|
525
|
-
typeof message.delta === "string" &&
|
|
526
|
-
message.delta.length > 0
|
|
527
|
-
) {
|
|
528
|
-
return message.delta;
|
|
529
|
-
}
|
|
530
|
-
return null;
|
|
531
|
-
},
|
|
532
|
-
};
|
|
533
|
-
}
|
|
534
|
-
|
|
535
|
-
const GEMINI_DEFAULT_MODEL = "gemini-3.1-flash-live-preview";
|
|
536
|
-
const GEMINI_DEFAULT_WS =
|
|
537
|
-
"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent";
|
|
538
|
-
const GEMINI_SAMPLE_RATE = 16000;
|
|
539
|
-
|
|
540
|
-
/**
|
|
541
|
-
* @param {VoiceInputGeminiConfig} config
|
|
542
|
-
* @returns {VoiceDriver}
|
|
543
|
-
*/
|
|
544
|
-
function createGeminiDriver(config) {
|
|
545
|
-
const model = config.model ?? GEMINI_DEFAULT_MODEL;
|
|
546
|
-
const base = config.baseURL ?? GEMINI_DEFAULT_WS;
|
|
547
|
-
return {
|
|
548
|
-
label: "Gemini Live",
|
|
549
|
-
sampleRate: GEMINI_SAMPLE_RATE,
|
|
550
|
-
connect() {
|
|
551
|
-
return new WebSocket(`${base}?key=${encodeURIComponent(config.apiKey)}`);
|
|
552
|
-
},
|
|
553
|
-
buildSetup() {
|
|
554
|
-
// Gemini Live was designed for voice agents, not pure STT.
|
|
555
|
-
// Force maxOutputTokens: 1 and disable thinking on 2.5 models
|
|
556
|
-
// to minimise wasted audio output.
|
|
557
|
-
|
|
558
|
-
/** @type {Record<string, unknown>} */
|
|
559
|
-
const generationConfig = {
|
|
560
|
-
// https://ai.google.dev/gemini-api/docs/live-api/capabilities#response-modalities
|
|
561
|
-
// > The native audio models only support `AUDIO response modality.
|
|
562
|
-
responseModalities: ["AUDIO"],
|
|
563
|
-
maxOutputTokens: 1,
|
|
564
|
-
};
|
|
565
|
-
if (model.includes("2.5")) {
|
|
566
|
-
generationConfig.thinkingConfig = { thinkingBudget: 0 };
|
|
567
|
-
}
|
|
568
|
-
/** @type {Record<string, unknown>} */
|
|
569
|
-
const setup = {
|
|
570
|
-
model: `models/${model}`,
|
|
571
|
-
generationConfig,
|
|
572
|
-
inputAudioTranscription: {},
|
|
573
|
-
};
|
|
574
|
-
if (config.language) {
|
|
575
|
-
setup.systemInstruction = {
|
|
576
|
-
parts: [{ text: `The user is speaking in ${config.language}.` }],
|
|
577
|
-
};
|
|
578
|
-
}
|
|
579
|
-
return { setup };
|
|
580
|
-
},
|
|
581
|
-
isReady(message) {
|
|
582
|
-
return "setupComplete" in message;
|
|
583
|
-
},
|
|
584
|
-
buildAudioMessage(base64) {
|
|
585
|
-
return {
|
|
586
|
-
realtimeInput: {
|
|
587
|
-
audio: {
|
|
588
|
-
data: base64,
|
|
589
|
-
mimeType: `audio/pcm;rate=${GEMINI_SAMPLE_RATE}`,
|
|
590
|
-
},
|
|
591
|
-
},
|
|
592
|
-
};
|
|
593
|
-
},
|
|
594
|
-
parseTranscript(message) {
|
|
595
|
-
const serverContent = message.serverContent;
|
|
596
|
-
if (!isObject(serverContent)) return null;
|
|
597
|
-
const t = serverContent.inputTranscription;
|
|
598
|
-
if (isObject(t) && typeof t.text === "string" && t.text.length > 0) {
|
|
599
|
-
return t.text;
|
|
600
|
-
}
|
|
601
|
-
return null;
|
|
602
|
-
},
|
|
603
|
-
};
|
|
604
|
-
}
|
|
605
|
-
|
|
606
|
-
/**
|
|
607
|
-
* Drop whitespace sitting between two CJK characters. Some providers return
|
|
608
|
-
* Japanese transcripts with morpheme-separating spaces ("そう 、 声 で");
|
|
609
|
-
* mixed strings like "Windows を使う" keep their inter-script spaces.
|
|
610
|
-
*
|
|
611
|
-
* @returns {{ push: (text: string) => string, flush: () => string }}
|
|
612
|
-
*/
|
|
613
|
-
export function createCJKSpaceNormalizer() {
|
|
614
|
-
let prevChar = "";
|
|
615
|
-
let pendingSpaces = "";
|
|
616
|
-
const isSpace = (/** @type {string} */ c) =>
|
|
617
|
-
c === " " || c === "\t" || c === "\u3000";
|
|
618
|
-
|
|
619
|
-
return {
|
|
620
|
-
push(text) {
|
|
621
|
-
let out = "";
|
|
622
|
-
for (const ch of text) {
|
|
623
|
-
if (isSpace(ch)) {
|
|
624
|
-
pendingSpaces += ch;
|
|
625
|
-
continue;
|
|
626
|
-
}
|
|
627
|
-
if (pendingSpaces.length > 0) {
|
|
628
|
-
if (!(isCJKChar(prevChar) && isCJKChar(ch))) {
|
|
629
|
-
out += pendingSpaces;
|
|
630
|
-
}
|
|
631
|
-
pendingSpaces = "";
|
|
632
|
-
}
|
|
633
|
-
out += ch;
|
|
634
|
-
prevChar = ch;
|
|
635
|
-
}
|
|
636
|
-
return out;
|
|
637
|
-
},
|
|
638
|
-
flush() {
|
|
639
|
-
const out = pendingSpaces;
|
|
640
|
-
pendingSpaces = "";
|
|
641
|
-
prevChar = "";
|
|
642
|
-
return out;
|
|
643
|
-
},
|
|
644
|
-
};
|
|
645
|
-
}
|
|
646
|
-
|
|
647
|
-
/**
|
|
648
|
-
* @param {string} ch
|
|
649
|
-
* @returns {boolean}
|
|
650
|
-
*/
|
|
651
|
-
function isCJKChar(ch) {
|
|
652
|
-
const code = ch.codePointAt(0);
|
|
653
|
-
if (code === undefined) return false;
|
|
654
|
-
return (
|
|
655
|
-
(code >= 0x3000 && code <= 0x33ff) ||
|
|
656
|
-
(code >= 0x3400 && code <= 0x4dbf) ||
|
|
657
|
-
(code >= 0x4e00 && code <= 0x9fff) ||
|
|
658
|
-
(code >= 0xac00 && code <= 0xd7af) ||
|
|
659
|
-
(code >= 0xf900 && code <= 0xfaff) ||
|
|
660
|
-
(code >= 0xff00 && code <= 0xffef) ||
|
|
661
|
-
(code >= 0x20000 && code <= 0x2ffff)
|
|
56
|
+
const provider = /** @type {{ provider: string }} */ (config).provider;
|
|
57
|
+
return failVoiceSessionAsync(
|
|
58
|
+
callbacks,
|
|
59
|
+
new Error(`Unsupported voiceInput.provider: ${provider}`),
|
|
662
60
|
);
|
|
663
61
|
}
|
|
664
|
-
|
|
665
|
-
/**
|
|
666
|
-
* @param {unknown} value
|
|
667
|
-
* @returns {value is Record<string, unknown>}
|
|
668
|
-
*/
|
|
669
|
-
function isObject(value) {
|
|
670
|
-
return typeof value === "object" && value !== null;
|
|
671
|
-
}
|