@iinm/plain-agent 1.7.18 → 1.7.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,257 @@
1
+ import {
2
+ createCJKSpaceNormalizer,
3
+ detectRecorder,
4
+ failVoiceSessionAsync,
5
+ getRecorderCandidates,
6
+ isCommandAvailable,
7
+ isObjectLike,
8
+ startRecorder,
9
+ VOICE_DEBUG,
10
+ } from "./voiceInputSession.mjs";
11
+
12
+ /**
13
+ * @import { VoiceRecorderConfig, VoiceSession, VoiceSessionCallbacks } from "./voiceInputSession.mjs"
14
+ */
15
+
16
+ /**
17
+ * @typedef {Object} VoiceInputGeminiConfig
18
+ * @property {"gemini"} provider
19
+ * @property {string} apiKey
20
+ * @property {string} [model] - Defaults to "gemini-3.1-flash-live-preview".
21
+ * @property {string} [language] - ISO-639-1 code (e.g. "ja", "en"). Passed to the model as a system instruction since Gemini Live has no native language hint for input transcription.
22
+ * @property {string} [baseURL]
23
+ * @property {VoiceRecorderConfig} [recorder]
24
+ * @property {string} [toggleKey]
25
+ */
26
+
27
/** Model used when `config.model` is not provided. */
const GEMINI_DEFAULT_MODEL = "gemini-3.1-flash-live-preview";
/** WebSocket endpoint used when `config.baseURL` is not provided. */
const GEMINI_DEFAULT_WS =
  "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent";
/** Sample rate requested from the recorder and advertised in the audio mimeType. */
const GEMINI_SAMPLE_RATE = 16000;
/** Human-readable driver name used in error and debug messages. */
const GEMINI_LABEL = "Gemini Live";

/**
 * Start a voice input session backed by the Gemini Live BidiGenerateContent
 * WebSocket. Spawns a recorder, streams PCM as base64 JSON messages, and
 * forwards transcript deltas via `onTranscript`.
 *
 * Gemini Live was designed for voice agents, not pure STT, so the setup
 * message forces `maxOutputTokens: 1` and disables thinking on 2.5 models
 * to minimise wasted audio output.
 *
 * Every start-up failure (no recorder, recorder command missing, WebSocket
 * could not be constructed) is reported through `failVoiceSessionAsync`
 * instead of being thrown, so callers only need to handle `onError`.
 *
 * @param {object} options
 * @param {VoiceInputGeminiConfig} options.config
 * @param {VoiceSessionCallbacks} options.callbacks
 * @returns {VoiceSession}
 */
export function startGeminiVoiceSession({ config, callbacks }) {
  const recorder =
    config.recorder ??
    detectRecorder(getRecorderCandidates(GEMINI_SAMPLE_RATE));
  if (!recorder) {
    return failVoiceSessionAsync(
      callbacks,
      new Error(
        "No voice recorder found. Install arecord, sox, or ffmpeg (or set `voiceInput.recorder`).",
      ),
    );
  }

  if (!isCommandAvailable(recorder.command)) {
    return failVoiceSessionAsync(
      callbacks,
      new Error(
        `Voice recorder command "${recorder.command}" not found on PATH.`,
      ),
    );
  }

  const model = config.model ?? GEMINI_DEFAULT_MODEL;
  const base = config.baseURL ?? GEMINI_DEFAULT_WS;
  // A custom baseURL may already carry a query string; in that case the key
  // has to be appended with `&`, otherwise the URL would contain two `?`.
  const separator = base.includes("?") ? "&" : "?";
  const url = `${base}${separator}key=${encodeURIComponent(config.apiKey)}`;

  let stopped = false;
  let closeEmitted = false;
  // Becomes true once the server acknowledges the setup message.
  let ready = false;
  /** @type {Buffer[]} Audio captured before the server is ready. */
  const pendingAudio = [];
  const normalizer = createCJKSpaceNormalizer();

  // Guarantees `onClose` fires at most once, whichever path ends the session.
  const emitClose = () => {
    if (closeEmitted) return;
    closeEmitted = true;
    callbacks.onClose?.();
  };

  // The constructor throws synchronously when the runtime has no global
  // WebSocket or when the URL is malformed. Report that like the other
  // start-up failures, before any recorder process has been spawned.
  /** @type {WebSocket} */
  let ws;
  try {
    ws = new WebSocket(url);
  } catch (err) {
    return failVoiceSessionAsync(
      callbacks,
      new Error(
        `Failed to open ${GEMINI_LABEL} WebSocket: ${err instanceof Error ? err.message : String(err)}`,
      ),
    );
  }
  ws.binaryType = "arraybuffer";

  const rec = startRecorder({
    recorder,
    onAudio(chunk) {
      if (stopped) return;
      if (ready && ws.readyState === WebSocket.OPEN) {
        sendAudio(chunk);
      } else {
        // Hold audio until the setup handshake completes.
        pendingAudio.push(chunk);
      }
    },
    onError(err) {
      if (!stopped) callbacks.onError(err);
      stop();
    },
    onExit() {
      stop();
    },
  });

  /**
   * Send one PCM chunk to the server as a base64 `realtimeInput` message.
   *
   * @param {Buffer} chunk
   */
  function sendAudio(chunk) {
    const payload = {
      realtimeInput: {
        audio: {
          data: chunk.toString("base64"),
          mimeType: `audio/pcm;rate=${GEMINI_SAMPLE_RATE}`,
        },
      },
    };
    try {
      ws.send(JSON.stringify(payload));
    } catch {
      // connection may have just closed
    }
  }

  ws.addEventListener("open", () => {
    /** @type {Record<string, unknown>} */
    const generationConfig = {
      // https://ai.google.dev/gemini-api/docs/live-api/capabilities#response-modalities
      // > The native audio models only support `AUDIO` response modality.
      responseModalities: ["AUDIO"],
      maxOutputTokens: 1,
    };
    if (model.includes("2.5")) {
      generationConfig.thinkingConfig = { thinkingBudget: 0 };
    }
    /** @type {Record<string, unknown>} */
    const setup = {
      model: `models/${model}`,
      generationConfig,
      inputAudioTranscription: {},
    };
    if (config.language) {
      setup.systemInstruction = {
        parts: [{ text: `The user is speaking in ${config.language}.` }],
      };
    }
    try {
      ws.send(JSON.stringify({ setup }));
    } catch (err) {
      callbacks.onError(
        new Error(
          `Failed to send setup message: ${err instanceof Error ? err.message : String(err)}`,
        ),
      );
      stop();
    }
  });

  ws.addEventListener("message", (event) => {
    if (stopped) return;
    let raw = "";
    let message;
    try {
      // With binaryType "arraybuffer", non-text frames arrive as ArrayBuffer.
      raw =
        typeof event.data === "string"
          ? event.data
          : Buffer.from(/** @type {ArrayBuffer} */ (event.data)).toString(
              "utf8",
            );
      message = JSON.parse(raw);
    } catch (err) {
      callbacks.onError(
        new Error(
          `Failed to parse server message: ${err instanceof Error ? err.message : String(err)}`,
        ),
      );
      return;
    }
    if (!isObjectLike(message)) return;
    if (VOICE_DEBUG) {
      process.stderr.write(`[voiceInput] <- ${raw.slice(0, 800)}\n`);
    }

    if (!ready && "setupComplete" in message) {
      ready = true;
      // Flush everything recorded while the handshake was in flight.
      for (const chunk of pendingAudio.splice(0)) {
        if (ws.readyState === WebSocket.OPEN) sendAudio(chunk);
      }
      return;
    }

    const serverContent = message.serverContent;
    if (!isObjectLike(serverContent)) return;
    const transcription = serverContent.inputTranscription;
    if (
      isObjectLike(transcription) &&
      typeof transcription.text === "string" &&
      transcription.text.length > 0
    ) {
      const normalized = normalizer.push(transcription.text);
      if (normalized.length > 0) {
        callbacks.onTranscript(normalized);
      }
    }
  });

  ws.addEventListener("error", (event) => {
    if (stopped) return;
    const message =
      /** @type {{ message?: string }} */ (event).message ?? "WebSocket error";
    callbacks.onError(new Error(`${GEMINI_LABEL} WebSocket error: ${message}`));
    stop();
  });

  ws.addEventListener("close", (event) => {
    // 1000 (normal) and 1005 (no status) are not reported as errors.
    if (!stopped && event.code !== 1000 && event.code !== 1005) {
      const reason = event.reason ? `: ${event.reason}` : "";
      callbacks.onError(
        new Error(
          `${GEMINI_LABEL} WebSocket closed (code ${event.code}${reason})`,
        ),
      );
    }
    stopped = true;
    // Unsent audio is useless once the socket is gone; release it.
    pendingAudio.length = 0;
    rec.stop();
    emitClose();
  });

  if (VOICE_DEBUG) {
    process.stderr.write(
      `[voiceInput] driver=${GEMINI_LABEL} recorder=${recorder.command} ${recorder.args.join(" ")}\n`,
    );
  }

  /**
   * Stop the recorder, close the socket if it is still open or connecting,
   * and emit `onClose`. Calls after the first one are no-ops.
   *
   * @returns {Promise<void>}
   */
  async function stop() {
    if (stopped) return;
    stopped = true;
    // Drop audio that was buffered but never sent so it can be collected.
    pendingAudio.length = 0;
    rec.stop();
    if (
      ws.readyState === WebSocket.OPEN ||
      ws.readyState === WebSocket.CONNECTING
    ) {
      try {
        ws.close(1000, "client stop");
      } catch {
        // ignore
      }
    }
    emitClose();
  }

  return { stop };
}
@@ -0,0 +1,261 @@
1
+ import {
2
+ createCJKSpaceNormalizer,
3
+ detectRecorder,
4
+ failVoiceSessionAsync,
5
+ getRecorderCandidates,
6
+ isCommandAvailable,
7
+ isObjectLike,
8
+ startRecorder,
9
+ VOICE_DEBUG,
10
+ } from "./voiceInputSession.mjs";
11
+
12
+ /**
13
+ * @import { VoiceRecorderConfig, VoiceSession, VoiceSessionCallbacks } from "./voiceInputSession.mjs"
14
+ */
15
+
16
+ /**
17
+ * @typedef {Object} VoiceInputOpenAIConfig
18
+ * @property {"openai"} provider
19
+ * @property {string} apiKey
20
+ * @property {string} [model] - Defaults to "gpt-4o-transcribe".
21
+ * @property {string} [language] - ISO-639-1 code (e.g. "ja", "en"). Improves accuracy and latency when set.
22
+ * @property {string} [baseURL]
23
+ * @property {VoiceRecorderConfig} [recorder]
24
+ * @property {string} [toggleKey] - "ctrl-<char>". Defaults to "ctrl-o".
25
+ */
26
+
27
/** Model used when `config.model` is not provided. */
const OPENAI_DEFAULT_MODEL = "gpt-4o-transcribe";
/** WebSocket endpoint used when `config.baseURL` is not provided. */
const OPENAI_DEFAULT_WS = "wss://api.openai.com/v1/realtime";
/** Sample rate requested from the recorder. */
const OPENAI_SAMPLE_RATE = 24000;
/** Human-readable driver name used in error and debug messages. */
const OPENAI_LABEL = "OpenAI Realtime";

/**
 * Start a voice input session backed by the OpenAI Realtime transcription
 * WebSocket. Spawns a recorder, streams PCM as base64 JSON messages, and
 * forwards transcript deltas via `onTranscript`.
 *
 * Every start-up failure (no recorder, recorder command missing, WebSocket
 * could not be constructed) is reported through `failVoiceSessionAsync`
 * instead of being thrown, so callers only need to handle `onError`.
 *
 * @param {object} options
 * @param {VoiceInputOpenAIConfig} options.config
 * @param {VoiceSessionCallbacks} options.callbacks
 * @returns {VoiceSession}
 */
export function startOpenAIVoiceSession({ config, callbacks }) {
  const recorder =
    config.recorder ??
    detectRecorder(getRecorderCandidates(OPENAI_SAMPLE_RATE));
  if (!recorder) {
    return failVoiceSessionAsync(
      callbacks,
      new Error(
        "No voice recorder found. Install arecord, sox, or ffmpeg (or set `voiceInput.recorder`).",
      ),
    );
  }

  if (!isCommandAvailable(recorder.command)) {
    return failVoiceSessionAsync(
      callbacks,
      new Error(
        `Voice recorder command "${recorder.command}" not found on PATH.`,
      ),
    );
  }

  const model = config.model ?? OPENAI_DEFAULT_MODEL;
  const base = config.baseURL ?? OPENAI_DEFAULT_WS;
  // A custom baseURL may already carry a query string; in that case the
  // intent has to be appended with `&`, otherwise the URL would contain two `?`.
  const separator = base.includes("?") ? "&" : "?";
  const url = `${base}${separator}intent=transcription`;

  let stopped = false;
  let closeEmitted = false;
  // Becomes true once the server reports the transcription session.
  let ready = false;
  /** @type {Buffer[]} Audio captured before the server is ready. */
  const pendingAudio = [];
  const normalizer = createCJKSpaceNormalizer();

  // Guarantees `onClose` fires at most once, whichever path ends the session.
  const emitClose = () => {
    if (closeEmitted) return;
    closeEmitted = true;
    callbacks.onClose?.();
  };

  // The constructor throws synchronously when the runtime has no global
  // WebSocket or when the URL is malformed. Report that like the other
  // start-up failures, before any recorder process has been spawned.
  /** @type {WebSocket} */
  let ws;
  try {
    // Node's global WebSocket (undici) accepts a non-standard `headers`
    // option. The built-in typings only declare the standards-compliant
    // constructor, so cast through `WebSocket`-as-constructor.
    const Ctor = /** @type {new (url: string, opts?: unknown) => WebSocket} */ (
      /** @type {unknown} */ (WebSocket)
    );
    ws = new Ctor(url, {
      headers: {
        Authorization: `Bearer ${config.apiKey}`,
        "OpenAI-Beta": "realtime=v1",
      },
    });
  } catch (err) {
    return failVoiceSessionAsync(
      callbacks,
      new Error(
        `Failed to open ${OPENAI_LABEL} WebSocket: ${err instanceof Error ? err.message : String(err)}`,
      ),
    );
  }
  ws.binaryType = "arraybuffer";

  const rec = startRecorder({
    recorder,
    onAudio(chunk) {
      if (stopped) return;
      if (ready && ws.readyState === WebSocket.OPEN) {
        sendAudio(chunk);
      } else {
        // Hold audio until the session handshake completes.
        pendingAudio.push(chunk);
      }
    },
    onError(err) {
      if (!stopped) callbacks.onError(err);
      stop();
    },
    onExit() {
      stop();
    },
  });

  /**
   * Send one PCM chunk to the server as a base64
   * `input_audio_buffer.append` message.
   *
   * @param {Buffer} chunk
   */
  function sendAudio(chunk) {
    const payload = {
      type: "input_audio_buffer.append",
      audio: chunk.toString("base64"),
    };
    try {
      ws.send(JSON.stringify(payload));
    } catch {
      // connection may have just closed
    }
  }

  ws.addEventListener("open", () => {
    /** @type {{ model: string, language?: string }} */
    const transcription = { model };
    if (config.language) transcription.language = config.language;
    // The `?intent=transcription` endpoint uses the flat transcription-session
    // schema, not the nested `session.audio.input.*` realtime schema.
    const setup = {
      type: "transcription_session.update",
      session: {
        input_audio_format: "pcm16",
        input_audio_transcription: transcription,
        turn_detection: { type: "server_vad" },
      },
    };
    try {
      ws.send(JSON.stringify(setup));
    } catch (err) {
      callbacks.onError(
        new Error(
          `Failed to send setup message: ${err instanceof Error ? err.message : String(err)}`,
        ),
      );
      stop();
    }
  });

  ws.addEventListener("message", (event) => {
    if (stopped) return;
    let raw = "";
    let message;
    try {
      // With binaryType "arraybuffer", non-text frames arrive as ArrayBuffer.
      raw =
        typeof event.data === "string"
          ? event.data
          : Buffer.from(/** @type {ArrayBuffer} */ (event.data)).toString(
              "utf8",
            );
      message = JSON.parse(raw);
    } catch (err) {
      callbacks.onError(
        new Error(
          `Failed to parse server message: ${err instanceof Error ? err.message : String(err)}`,
        ),
      );
      return;
    }
    if (!isObjectLike(message)) return;
    if (VOICE_DEBUG) {
      process.stderr.write(`[voiceInput] <- ${raw.slice(0, 800)}\n`);
    }

    // Server-reported errors are surfaced but do not end the session here.
    if (message.type === "error" && isObjectLike(message.error)) {
      const detail =
        typeof message.error.message === "string"
          ? message.error.message
          : JSON.stringify(message.error);
      callbacks.onError(new Error(`${OPENAI_LABEL} error: ${detail}`));
      return;
    }

    if (
      !ready &&
      (message.type === "transcription_session.created" ||
        message.type === "transcription_session.updated")
    ) {
      ready = true;
      // Flush everything recorded while the handshake was in flight.
      for (const chunk of pendingAudio.splice(0)) {
        if (ws.readyState === WebSocket.OPEN) sendAudio(chunk);
      }
      return;
    }

    if (
      message.type === "conversation.item.input_audio_transcription.delta" &&
      typeof message.delta === "string" &&
      message.delta.length > 0
    ) {
      const normalized = normalizer.push(message.delta);
      if (normalized.length > 0) {
        callbacks.onTranscript(normalized);
      }
    }
  });

  ws.addEventListener("error", (event) => {
    if (stopped) return;
    const message =
      /** @type {{ message?: string }} */ (event).message ?? "WebSocket error";
    callbacks.onError(new Error(`${OPENAI_LABEL} WebSocket error: ${message}`));
    stop();
  });

  ws.addEventListener("close", (event) => {
    // 1000 (normal) and 1005 (no status) are not reported as errors.
    if (!stopped && event.code !== 1000 && event.code !== 1005) {
      const reason = event.reason ? `: ${event.reason}` : "";
      callbacks.onError(
        new Error(
          `${OPENAI_LABEL} WebSocket closed (code ${event.code}${reason})`,
        ),
      );
    }
    stopped = true;
    // Unsent audio is useless once the socket is gone; release it.
    pendingAudio.length = 0;
    rec.stop();
    emitClose();
  });

  if (VOICE_DEBUG) {
    process.stderr.write(
      `[voiceInput] driver=${OPENAI_LABEL} recorder=${recorder.command} ${recorder.args.join(" ")}\n`,
    );
  }

  /**
   * Stop the recorder, close the socket if it is still open or connecting,
   * and emit `onClose`. Calls after the first one are no-ops.
   *
   * @returns {Promise<void>}
   */
  async function stop() {
    if (stopped) return;
    stopped = true;
    // Drop audio that was buffered but never sent so it can be collected.
    pendingAudio.length = 0;
    rec.stop();
    if (
      ws.readyState === WebSocket.OPEN ||
      ws.readyState === WebSocket.CONNECTING
    ) {
      try {
        ws.close(1000, "client stop");
      } catch {
        // ignore
      }
    }
    emitClose();
  }

  return { stop };
}
+ }