@iinm/plain-agent 1.7.19 → 1.7.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,16 +1,10 @@
1
1
  import {
2
- createCJKSpaceNormalizer,
3
- detectRecorder,
4
- failVoiceSessionAsync,
5
- getRecorderCandidates,
6
- isCommandAvailable,
7
2
  isObjectLike,
8
- startRecorder,
9
- VOICE_DEBUG,
3
+ startWebSocketVoiceSession,
10
4
  } from "./voiceInputSession.mjs";
11
5
 
12
6
  /**
13
- * @import { VoiceRecorderConfig, VoiceSession, VoiceSessionCallbacks } from "./voiceInputSession.mjs"
7
+ * @import { VoiceProviderHooks, VoiceRecorderConfig, VoiceSession, VoiceSessionCallbacks } from "./voiceInputSession.mjs"
14
8
  */
15
9
 
16
10
  /**
@@ -45,213 +39,67 @@ const GEMINI_LABEL = "Gemini Live";
45
39
  * @returns {VoiceSession}
46
40
  */
47
41
  export function startGeminiVoiceSession({ config, callbacks }) {
48
- const recorder =
49
- config.recorder ??
50
- detectRecorder(getRecorderCandidates(GEMINI_SAMPLE_RATE));
51
- if (!recorder) {
52
- return failVoiceSessionAsync(
53
- callbacks,
54
- new Error(
55
- "No voice recorder found. Install arecord, sox, or ffmpeg (or set `voiceInput.recorder`).",
56
- ),
57
- );
58
- }
59
-
60
- if (!isCommandAvailable(recorder.command)) {
61
- return failVoiceSessionAsync(
62
- callbacks,
63
- new Error(
64
- `Voice recorder command "${recorder.command}" not found on PATH.`,
65
- ),
66
- );
67
- }
68
-
69
- const model = config.model ?? GEMINI_DEFAULT_MODEL;
70
- const base = config.baseURL ?? GEMINI_DEFAULT_WS;
71
-
72
- let stopped = false;
73
- let closeEmitted = false;
74
- let ready = false;
75
- /** @type {Buffer[]} */
76
- const pendingAudio = [];
77
- const normalizer = createCJKSpaceNormalizer();
78
-
79
- const emitClose = () => {
80
- if (closeEmitted) return;
81
- closeEmitted = true;
82
- callbacks.onClose?.();
83
- };
84
-
85
- const ws = new WebSocket(`${base}?key=${encodeURIComponent(config.apiKey)}`);
86
- ws.binaryType = "arraybuffer";
87
-
88
- const rec = startRecorder({
89
- recorder,
90
- onAudio(chunk) {
91
- if (stopped) return;
92
- if (ready && ws.readyState === WebSocket.OPEN) {
93
- sendAudio(chunk);
94
- } else {
95
- pendingAudio.push(chunk);
42
+ /** @type {VoiceProviderHooks<VoiceInputGeminiConfig>} */
43
+ const hooks = {
44
+ label: GEMINI_LABEL,
45
+ sampleRate: GEMINI_SAMPLE_RATE,
46
+ buildWsUrl(config) {
47
+ const base = config.baseURL ?? GEMINI_DEFAULT_WS;
48
+ return `${base}?key=${encodeURIComponent(config.apiKey)}`;
49
+ },
50
+ buildSetupMessage(config) {
51
+ const model = config.model ?? GEMINI_DEFAULT_MODEL;
52
+ /** @type {Record<string, unknown>} */
53
+ const generationConfig = {
54
+ // https://ai.google.dev/gemini-api/docs/live-api/capabilities#response-modalities
55
+ // > The native audio models only support `AUDIO` response modality.
56
+ responseModalities: ["AUDIO"],
57
+ maxOutputTokens: 1,
58
+ };
59
+ if (model.includes("2.5")) {
60
+ generationConfig.thinkingConfig = { thinkingBudget: 0 };
61
+ }
62
+ /** @type {Record<string, unknown>} */
63
+ const setup = {
64
+ model: `models/${model}`,
65
+ generationConfig,
66
+ inputAudioTranscription: {},
67
+ };
68
+ if (config.language) {
69
+ setup.systemInstruction = {
70
+ parts: [{ text: `The user is speaking in ${config.language}.` }],
71
+ };
96
72
  }
73
+ return { setup };
97
74
  },
98
- onError(err) {
99
- if (!stopped) callbacks.onError(err);
100
- stop();
75
+ isReadyMessage(message) {
76
+ return isObjectLike(message) && "setupComplete" in message;
101
77
  },
102
- onExit() {
103
- stop();
78
+ extractTranscript(message) {
79
+ if (!isObjectLike(message)) return undefined;
80
+ const serverContent = message.serverContent;
81
+ if (!isObjectLike(serverContent)) return undefined;
82
+ const transcription = serverContent.inputTranscription;
83
+ if (
84
+ isObjectLike(transcription) &&
85
+ typeof transcription.text === "string" &&
86
+ transcription.text.length > 0
87
+ ) {
88
+ return transcription.text;
89
+ }
90
+ return undefined;
104
91
  },
105
- });
106
-
107
- /**
108
- * @param {Buffer} chunk
109
- */
110
- function sendAudio(chunk) {
111
- const payload = {
112
- realtimeInput: {
113
- audio: {
114
- data: chunk.toString("base64"),
115
- mimeType: `audio/pcm;rate=${GEMINI_SAMPLE_RATE}`,
92
+ buildAudioPayload(chunk, sampleRate) {
93
+ return {
94
+ realtimeInput: {
95
+ audio: {
96
+ data: chunk.toString("base64"),
97
+ mimeType: `audio/pcm;rate=${sampleRate}`,
98
+ },
116
99
  },
117
- },
118
- };
119
- try {
120
- ws.send(JSON.stringify(payload));
121
- } catch {
122
- // connection may have just closed
123
- }
124
- }
125
-
126
- ws.addEventListener("open", () => {
127
- /** @type {Record<string, unknown>} */
128
- const generationConfig = {
129
- // https://ai.google.dev/gemini-api/docs/live-api/capabilities#response-modalities
130
- // > The native audio models only support `AUDIO` response modality.
131
- responseModalities: ["AUDIO"],
132
- maxOutputTokens: 1,
133
- };
134
- if (model.includes("2.5")) {
135
- generationConfig.thinkingConfig = { thinkingBudget: 0 };
136
- }
137
- /** @type {Record<string, unknown>} */
138
- const setup = {
139
- model: `models/${model}`,
140
- generationConfig,
141
- inputAudioTranscription: {},
142
- };
143
- if (config.language) {
144
- setup.systemInstruction = {
145
- parts: [{ text: `The user is speaking in ${config.language}.` }],
146
100
  };
147
- }
148
- try {
149
- ws.send(JSON.stringify({ setup }));
150
- } catch (err) {
151
- callbacks.onError(
152
- new Error(
153
- `Failed to send setup message: ${err instanceof Error ? err.message : String(err)}`,
154
- ),
155
- );
156
- stop();
157
- }
158
- });
159
-
160
- ws.addEventListener("message", (event) => {
161
- if (stopped) return;
162
- let raw = "";
163
- let message;
164
- try {
165
- raw =
166
- typeof event.data === "string"
167
- ? event.data
168
- : Buffer.from(/** @type {ArrayBuffer} */ (event.data)).toString(
169
- "utf8",
170
- );
171
- message = JSON.parse(raw);
172
- } catch (err) {
173
- callbacks.onError(
174
- new Error(
175
- `Failed to parse server message: ${err instanceof Error ? err.message : String(err)}`,
176
- ),
177
- );
178
- return;
179
- }
180
- if (!isObjectLike(message)) return;
181
- if (VOICE_DEBUG) {
182
- process.stderr.write(`[voiceInput] <- ${raw.slice(0, 800)}\n`);
183
- }
184
-
185
- if (!ready && "setupComplete" in message) {
186
- ready = true;
187
- for (const chunk of pendingAudio.splice(0)) {
188
- if (ws.readyState === WebSocket.OPEN) sendAudio(chunk);
189
- }
190
- return;
191
- }
192
-
193
- const serverContent = message.serverContent;
194
- if (!isObjectLike(serverContent)) return;
195
- const transcription = serverContent.inputTranscription;
196
- if (
197
- isObjectLike(transcription) &&
198
- typeof transcription.text === "string" &&
199
- transcription.text.length > 0
200
- ) {
201
- const normalized = normalizer.push(transcription.text);
202
- if (normalized.length > 0) {
203
- callbacks.onTranscript(normalized);
204
- }
205
- }
206
- });
207
-
208
- ws.addEventListener("error", (event) => {
209
- if (stopped) return;
210
- const message =
211
- /** @type {{ message?: string }} */ (event).message ?? "WebSocket error";
212
- callbacks.onError(new Error(`${GEMINI_LABEL} WebSocket error: ${message}`));
213
- stop();
214
- });
215
-
216
- ws.addEventListener("close", (event) => {
217
- if (!stopped && event.code !== 1000 && event.code !== 1005) {
218
- const reason = event.reason ? `: ${event.reason}` : "";
219
- callbacks.onError(
220
- new Error(
221
- `${GEMINI_LABEL} WebSocket closed (code ${event.code}${reason})`,
222
- ),
223
- );
224
- }
225
- stopped = true;
226
- rec.stop();
227
- emitClose();
228
- });
229
-
230
- if (VOICE_DEBUG) {
231
- process.stderr.write(
232
- `[voiceInput] driver=${GEMINI_LABEL} recorder=${recorder.command} ${recorder.args.join(" ")}\n`,
233
- );
234
- }
235
-
236
- /**
237
- * @returns {Promise<void>}
238
- */
239
- async function stop() {
240
- if (stopped) return;
241
- stopped = true;
242
- rec.stop();
243
- if (
244
- ws.readyState === WebSocket.OPEN ||
245
- ws.readyState === WebSocket.CONNECTING
246
- ) {
247
- try {
248
- ws.close(1000, "client stop");
249
- } catch {
250
- // ignore
251
- }
252
- }
253
- emitClose();
254
- }
101
+ },
102
+ };
255
103
 
256
- return { stop };
104
+ return startWebSocketVoiceSession({ hooks, config, callbacks });
257
105
  }
@@ -1,16 +1,10 @@
1
1
  import {
2
- createCJKSpaceNormalizer,
3
- detectRecorder,
4
- failVoiceSessionAsync,
5
- getRecorderCandidates,
6
- isCommandAvailable,
7
2
  isObjectLike,
8
- startRecorder,
9
- VOICE_DEBUG,
3
+ startWebSocketVoiceSession,
10
4
  } from "./voiceInputSession.mjs";
11
5
 
12
6
  /**
13
- * @import { VoiceRecorderConfig, VoiceSession, VoiceSessionCallbacks } from "./voiceInputSession.mjs"
7
+ * @import { VoiceProviderHooks, VoiceRecorderConfig, VoiceSession, VoiceSessionCallbacks } from "./voiceInputSession.mjs"
14
8
  */
15
9
 
16
10
  /**
@@ -40,222 +34,71 @@ const OPENAI_LABEL = "OpenAI Realtime";
40
34
  * @returns {VoiceSession}
41
35
  */
42
36
  export function startOpenAIVoiceSession({ config, callbacks }) {
43
- const recorder =
44
- config.recorder ??
45
- detectRecorder(getRecorderCandidates(OPENAI_SAMPLE_RATE));
46
- if (!recorder) {
47
- return failVoiceSessionAsync(
48
- callbacks,
49
- new Error(
50
- "No voice recorder found. Install arecord, sox, or ffmpeg (or set `voiceInput.recorder`).",
51
- ),
52
- );
53
- }
54
-
55
- if (!isCommandAvailable(recorder.command)) {
56
- return failVoiceSessionAsync(
57
- callbacks,
58
- new Error(
59
- `Voice recorder command "${recorder.command}" not found on PATH.`,
60
- ),
61
- );
62
- }
63
-
64
- const model = config.model ?? OPENAI_DEFAULT_MODEL;
65
- const base = config.baseURL ?? OPENAI_DEFAULT_WS;
66
-
67
- let stopped = false;
68
- let closeEmitted = false;
69
- let ready = false;
70
- /** @type {Buffer[]} */
71
- const pendingAudio = [];
72
- const normalizer = createCJKSpaceNormalizer();
73
-
74
- const emitClose = () => {
75
- if (closeEmitted) return;
76
- closeEmitted = true;
77
- callbacks.onClose?.();
78
- };
79
-
80
- // Node's global WebSocket (undici) accepts a non-standard `headers`
81
- // option. The built-in typings only declare the standards-compliant
82
- // constructor, so cast through `WebSocket`-as-constructor.
83
- const Ctor = /** @type {new (url: string, opts?: unknown) => WebSocket} */ (
84
- /** @type {unknown} */ (WebSocket)
85
- );
86
- const ws = new Ctor(`${base}?intent=transcription`, {
87
- headers: {
88
- Authorization: `Bearer ${config.apiKey}`,
89
- "OpenAI-Beta": "realtime=v1",
90
- },
91
- });
92
- ws.binaryType = "arraybuffer";
93
-
94
- const rec = startRecorder({
95
- recorder,
96
- onAudio(chunk) {
97
- if (stopped) return;
98
- if (ready && ws.readyState === WebSocket.OPEN) {
99
- sendAudio(chunk);
100
- } else {
101
- pendingAudio.push(chunk);
102
- }
37
+ /** @type {VoiceProviderHooks<VoiceInputOpenAIConfig>} */
38
+ const hooks = {
39
+ label: OPENAI_LABEL,
40
+ sampleRate: OPENAI_SAMPLE_RATE,
41
+ buildWsUrl(config) {
42
+ const base = config.baseURL ?? OPENAI_DEFAULT_WS;
43
+ return `${base}?intent=transcription`;
103
44
  },
104
- onError(err) {
105
- if (!stopped) callbacks.onError(err);
106
- stop();
45
+ buildWsOptions(config) {
46
+ return {
47
+ headers: {
48
+ Authorization: `Bearer ${config.apiKey}`,
49
+ "OpenAI-Beta": "realtime=v1",
50
+ },
51
+ };
107
52
  },
108
- onExit() {
109
- stop();
53
+ buildSetupMessage(config) {
54
+ const model = config.model ?? OPENAI_DEFAULT_MODEL;
55
+ /** @type {{ model: string, language?: string }} */
56
+ const transcription = { model };
57
+ if (config.language) transcription.language = config.language;
58
+ // The `?intent=transcription` endpoint uses the flat transcription-session
59
+ // schema, not the nested `session.audio.input.*` realtime schema.
60
+ return {
61
+ type: "transcription_session.update",
62
+ session: {
63
+ input_audio_format: "pcm16",
64
+ input_audio_transcription: transcription,
65
+ turn_detection: { type: "server_vad" },
66
+ },
67
+ };
110
68
  },
111
- });
112
-
113
- /**
114
- * @param {Buffer} chunk
115
- */
116
- function sendAudio(chunk) {
117
- const payload = {
118
- type: "input_audio_buffer.append",
119
- audio: chunk.toString("base64"),
120
- };
121
- try {
122
- ws.send(JSON.stringify(payload));
123
- } catch {
124
- // connection may have just closed
125
- }
126
- }
127
-
128
- ws.addEventListener("open", () => {
129
- /** @type {{ model: string, language?: string }} */
130
- const transcription = { model };
131
- if (config.language) transcription.language = config.language;
132
- // The `?intent=transcription` endpoint uses the flat transcription-session
133
- // schema, not the nested `session.audio.input.*` realtime schema.
134
- const setup = {
135
- type: "transcription_session.update",
136
- session: {
137
- input_audio_format: "pcm16",
138
- input_audio_transcription: transcription,
139
- turn_detection: { type: "server_vad" },
140
- },
141
- };
142
- try {
143
- ws.send(JSON.stringify(setup));
144
- } catch (err) {
145
- callbacks.onError(
146
- new Error(
147
- `Failed to send setup message: ${err instanceof Error ? err.message : String(err)}`,
148
- ),
149
- );
150
- stop();
151
- }
152
- });
153
-
154
- ws.addEventListener("message", (event) => {
155
- if (stopped) return;
156
- let raw = "";
157
- let message;
158
- try {
159
- raw =
160
- typeof event.data === "string"
161
- ? event.data
162
- : Buffer.from(/** @type {ArrayBuffer} */ (event.data)).toString(
163
- "utf8",
164
- );
165
- message = JSON.parse(raw);
166
- } catch (err) {
167
- callbacks.onError(
168
- new Error(
169
- `Failed to parse server message: ${err instanceof Error ? err.message : String(err)}`,
170
- ),
171
- );
172
- return;
173
- }
174
- if (!isObjectLike(message)) return;
175
- if (VOICE_DEBUG) {
176
- process.stderr.write(`[voiceInput] <- ${raw.slice(0, 800)}\n`);
177
- }
178
-
179
- if (message.type === "error" && isObjectLike(message.error)) {
180
- const detail =
181
- typeof message.error.message === "string"
182
- ? message.error.message
183
- : JSON.stringify(message.error);
184
- callbacks.onError(new Error(`${OPENAI_LABEL} error: ${detail}`));
185
- return;
186
- }
187
-
188
- if (
189
- !ready &&
190
- (message.type === "transcription_session.created" ||
191
- message.type === "transcription_session.updated")
192
- ) {
193
- ready = true;
194
- for (const chunk of pendingAudio.splice(0)) {
195
- if (ws.readyState === WebSocket.OPEN) sendAudio(chunk);
196
- }
197
- return;
198
- }
199
-
200
- if (
201
- message.type === "conversation.item.input_audio_transcription.delta" &&
202
- typeof message.delta === "string" &&
203
- message.delta.length > 0
204
- ) {
205
- const normalized = normalizer.push(message.delta);
206
- if (normalized.length > 0) {
207
- callbacks.onTranscript(normalized);
208
- }
209
- }
210
- });
211
-
212
- ws.addEventListener("error", (event) => {
213
- if (stopped) return;
214
- const message =
215
- /** @type {{ message?: string }} */ (event).message ?? "WebSocket error";
216
- callbacks.onError(new Error(`${OPENAI_LABEL} WebSocket error: ${message}`));
217
- stop();
218
- });
219
-
220
- ws.addEventListener("close", (event) => {
221
- if (!stopped && event.code !== 1000 && event.code !== 1005) {
222
- const reason = event.reason ? `: ${event.reason}` : "";
223
- callbacks.onError(
224
- new Error(
225
- `${OPENAI_LABEL} WebSocket closed (code ${event.code}${reason})`,
226
- ),
69
+ isReadyMessage(message) {
70
+ return (
71
+ isObjectLike(message) &&
72
+ (message.type === "transcription_session.created" ||
73
+ message.type === "transcription_session.updated")
227
74
  );
228
- }
229
- stopped = true;
230
- rec.stop();
231
- emitClose();
232
- });
233
-
234
- if (VOICE_DEBUG) {
235
- process.stderr.write(
236
- `[voiceInput] driver=${OPENAI_LABEL} recorder=${recorder.command} ${recorder.args.join(" ")}\n`,
237
- );
238
- }
239
-
240
- /**
241
- * @returns {Promise<void>}
242
- */
243
- async function stop() {
244
- if (stopped) return;
245
- stopped = true;
246
- rec.stop();
247
- if (
248
- ws.readyState === WebSocket.OPEN ||
249
- ws.readyState === WebSocket.CONNECTING
250
- ) {
251
- try {
252
- ws.close(1000, "client stop");
253
- } catch {
254
- // ignore
75
+ },
76
+ extractError(message) {
77
+ if (!isObjectLike(message) || message.type !== "error") return undefined;
78
+ const error = message.error;
79
+ if (!isObjectLike(error)) return undefined;
80
+ return typeof error.message === "string"
81
+ ? error.message
82
+ : JSON.stringify(error);
83
+ },
84
+ extractTranscript(message) {
85
+ if (
86
+ isObjectLike(message) &&
87
+ message.type === "conversation.item.input_audio_transcription.delta" &&
88
+ typeof message.delta === "string" &&
89
+ message.delta.length > 0
90
+ ) {
91
+ return message.delta;
255
92
  }
256
- }
257
- emitClose();
258
- }
93
+ return undefined;
94
+ },
95
+ buildAudioPayload(chunk, _sampleRate) {
96
+ return {
97
+ type: "input_audio_buffer.append",
98
+ audio: chunk.toString("base64"),
99
+ };
100
+ },
101
+ };
259
102
 
260
- return { stop };
103
+ return startWebSocketVoiceSession({ hooks, config, callbacks });
261
104
  }