@iinm/plain-agent 1.7.19 → 1.7.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +53 -90
- package/config/config.predefined.json +15 -15
- package/package.json +1 -3
- package/src/agentLoop.mjs +3 -1
- package/src/cliCost.mjs +67 -32
- package/src/cliFormatter.mjs +1 -1
- package/src/cliInteractive.mjs +1 -1
- package/src/config.d.ts +2 -2
- package/src/config.mjs +1 -1
- package/src/costTracker.mjs +58 -19
- package/src/env.mjs +0 -6
- package/src/main.mjs +2 -6
- package/src/model.d.ts +1 -1
- package/src/tools/patchFile.mjs +11 -12
- package/src/utils/notify.mjs +3 -2
- package/src/voiceInputGemini.mjs +58 -210
- package/src/voiceInputOpenAI.mjs +63 -220
- package/src/voiceInputSession.mjs +295 -2
- package/bin/plain-notify-terminal-bell +0 -3
package/src/voiceInputOpenAI.mjs
CHANGED
|
@@ -1,16 +1,10 @@
|
|
|
1
1
|
import {
|
|
2
|
-
createCJKSpaceNormalizer,
|
|
3
|
-
detectRecorder,
|
|
4
|
-
failVoiceSessionAsync,
|
|
5
|
-
getRecorderCandidates,
|
|
6
|
-
isCommandAvailable,
|
|
7
2
|
isObjectLike,
|
|
8
|
-
|
|
9
|
-
VOICE_DEBUG,
|
|
3
|
+
startWebSocketVoiceSession,
|
|
10
4
|
} from "./voiceInputSession.mjs";
|
|
11
5
|
|
|
12
6
|
/**
|
|
13
|
-
* @import { VoiceRecorderConfig, VoiceSession, VoiceSessionCallbacks } from "./voiceInputSession.mjs"
|
|
7
|
+
* @import { VoiceProviderHooks, VoiceRecorderConfig, VoiceSession, VoiceSessionCallbacks } from "./voiceInputSession.mjs"
|
|
14
8
|
*/
|
|
15
9
|
|
|
16
10
|
/**
|
|
@@ -40,222 +34,71 @@ const OPENAI_LABEL = "OpenAI Realtime";
|
|
|
40
34
|
* @returns {VoiceSession}
|
|
41
35
|
*/
|
|
42
36
|
export function startOpenAIVoiceSession({ config, callbacks }) {
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
"No voice recorder found. Install arecord, sox, or ffmpeg (or set `voiceInput.recorder`).",
|
|
51
|
-
),
|
|
52
|
-
);
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
if (!isCommandAvailable(recorder.command)) {
|
|
56
|
-
return failVoiceSessionAsync(
|
|
57
|
-
callbacks,
|
|
58
|
-
new Error(
|
|
59
|
-
`Voice recorder command "${recorder.command}" not found on PATH.`,
|
|
60
|
-
),
|
|
61
|
-
);
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
const model = config.model ?? OPENAI_DEFAULT_MODEL;
|
|
65
|
-
const base = config.baseURL ?? OPENAI_DEFAULT_WS;
|
|
66
|
-
|
|
67
|
-
let stopped = false;
|
|
68
|
-
let closeEmitted = false;
|
|
69
|
-
let ready = false;
|
|
70
|
-
/** @type {Buffer[]} */
|
|
71
|
-
const pendingAudio = [];
|
|
72
|
-
const normalizer = createCJKSpaceNormalizer();
|
|
73
|
-
|
|
74
|
-
const emitClose = () => {
|
|
75
|
-
if (closeEmitted) return;
|
|
76
|
-
closeEmitted = true;
|
|
77
|
-
callbacks.onClose?.();
|
|
78
|
-
};
|
|
79
|
-
|
|
80
|
-
// Node's global WebSocket (undici) accepts a non-standard `headers`
|
|
81
|
-
// option. The built-in typings only declare the standards-compliant
|
|
82
|
-
// constructor, so cast through `WebSocket`-as-constructor.
|
|
83
|
-
const Ctor = /** @type {new (url: string, opts?: unknown) => WebSocket} */ (
|
|
84
|
-
/** @type {unknown} */ (WebSocket)
|
|
85
|
-
);
|
|
86
|
-
const ws = new Ctor(`${base}?intent=transcription`, {
|
|
87
|
-
headers: {
|
|
88
|
-
Authorization: `Bearer ${config.apiKey}`,
|
|
89
|
-
"OpenAI-Beta": "realtime=v1",
|
|
90
|
-
},
|
|
91
|
-
});
|
|
92
|
-
ws.binaryType = "arraybuffer";
|
|
93
|
-
|
|
94
|
-
const rec = startRecorder({
|
|
95
|
-
recorder,
|
|
96
|
-
onAudio(chunk) {
|
|
97
|
-
if (stopped) return;
|
|
98
|
-
if (ready && ws.readyState === WebSocket.OPEN) {
|
|
99
|
-
sendAudio(chunk);
|
|
100
|
-
} else {
|
|
101
|
-
pendingAudio.push(chunk);
|
|
102
|
-
}
|
|
37
|
+
/** @type {VoiceProviderHooks<VoiceInputOpenAIConfig>} */
|
|
38
|
+
const hooks = {
|
|
39
|
+
label: OPENAI_LABEL,
|
|
40
|
+
sampleRate: OPENAI_SAMPLE_RATE,
|
|
41
|
+
buildWsUrl(config) {
|
|
42
|
+
const base = config.baseURL ?? OPENAI_DEFAULT_WS;
|
|
43
|
+
return `${base}?intent=transcription`;
|
|
103
44
|
},
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
45
|
+
buildWsOptions(config) {
|
|
46
|
+
return {
|
|
47
|
+
headers: {
|
|
48
|
+
Authorization: `Bearer ${config.apiKey}`,
|
|
49
|
+
"OpenAI-Beta": "realtime=v1",
|
|
50
|
+
},
|
|
51
|
+
};
|
|
107
52
|
},
|
|
108
|
-
|
|
109
|
-
|
|
53
|
+
buildSetupMessage(config) {
|
|
54
|
+
const model = config.model ?? OPENAI_DEFAULT_MODEL;
|
|
55
|
+
/** @type {{ model: string, language?: string }} */
|
|
56
|
+
const transcription = { model };
|
|
57
|
+
if (config.language) transcription.language = config.language;
|
|
58
|
+
// The `?intent=transcription` endpoint uses the flat transcription-session
|
|
59
|
+
// schema, not the nested `session.audio.input.*` realtime schema.
|
|
60
|
+
return {
|
|
61
|
+
type: "transcription_session.update",
|
|
62
|
+
session: {
|
|
63
|
+
input_audio_format: "pcm16",
|
|
64
|
+
input_audio_transcription: transcription,
|
|
65
|
+
turn_detection: { type: "server_vad" },
|
|
66
|
+
},
|
|
67
|
+
};
|
|
110
68
|
},
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
function sendAudio(chunk) {
|
|
117
|
-
const payload = {
|
|
118
|
-
type: "input_audio_buffer.append",
|
|
119
|
-
audio: chunk.toString("base64"),
|
|
120
|
-
};
|
|
121
|
-
try {
|
|
122
|
-
ws.send(JSON.stringify(payload));
|
|
123
|
-
} catch {
|
|
124
|
-
// connection may have just closed
|
|
125
|
-
}
|
|
126
|
-
}
|
|
127
|
-
|
|
128
|
-
ws.addEventListener("open", () => {
|
|
129
|
-
/** @type {{ model: string, language?: string }} */
|
|
130
|
-
const transcription = { model };
|
|
131
|
-
if (config.language) transcription.language = config.language;
|
|
132
|
-
// The `?intent=transcription` endpoint uses the flat transcription-session
|
|
133
|
-
// schema, not the nested `session.audio.input.*` realtime schema.
|
|
134
|
-
const setup = {
|
|
135
|
-
type: "transcription_session.update",
|
|
136
|
-
session: {
|
|
137
|
-
input_audio_format: "pcm16",
|
|
138
|
-
input_audio_transcription: transcription,
|
|
139
|
-
turn_detection: { type: "server_vad" },
|
|
140
|
-
},
|
|
141
|
-
};
|
|
142
|
-
try {
|
|
143
|
-
ws.send(JSON.stringify(setup));
|
|
144
|
-
} catch (err) {
|
|
145
|
-
callbacks.onError(
|
|
146
|
-
new Error(
|
|
147
|
-
`Failed to send setup message: ${err instanceof Error ? err.message : String(err)}`,
|
|
148
|
-
),
|
|
149
|
-
);
|
|
150
|
-
stop();
|
|
151
|
-
}
|
|
152
|
-
});
|
|
153
|
-
|
|
154
|
-
ws.addEventListener("message", (event) => {
|
|
155
|
-
if (stopped) return;
|
|
156
|
-
let raw = "";
|
|
157
|
-
let message;
|
|
158
|
-
try {
|
|
159
|
-
raw =
|
|
160
|
-
typeof event.data === "string"
|
|
161
|
-
? event.data
|
|
162
|
-
: Buffer.from(/** @type {ArrayBuffer} */ (event.data)).toString(
|
|
163
|
-
"utf8",
|
|
164
|
-
);
|
|
165
|
-
message = JSON.parse(raw);
|
|
166
|
-
} catch (err) {
|
|
167
|
-
callbacks.onError(
|
|
168
|
-
new Error(
|
|
169
|
-
`Failed to parse server message: ${err instanceof Error ? err.message : String(err)}`,
|
|
170
|
-
),
|
|
171
|
-
);
|
|
172
|
-
return;
|
|
173
|
-
}
|
|
174
|
-
if (!isObjectLike(message)) return;
|
|
175
|
-
if (VOICE_DEBUG) {
|
|
176
|
-
process.stderr.write(`[voiceInput] <- ${raw.slice(0, 800)}\n`);
|
|
177
|
-
}
|
|
178
|
-
|
|
179
|
-
if (message.type === "error" && isObjectLike(message.error)) {
|
|
180
|
-
const detail =
|
|
181
|
-
typeof message.error.message === "string"
|
|
182
|
-
? message.error.message
|
|
183
|
-
: JSON.stringify(message.error);
|
|
184
|
-
callbacks.onError(new Error(`${OPENAI_LABEL} error: ${detail}`));
|
|
185
|
-
return;
|
|
186
|
-
}
|
|
187
|
-
|
|
188
|
-
if (
|
|
189
|
-
!ready &&
|
|
190
|
-
(message.type === "transcription_session.created" ||
|
|
191
|
-
message.type === "transcription_session.updated")
|
|
192
|
-
) {
|
|
193
|
-
ready = true;
|
|
194
|
-
for (const chunk of pendingAudio.splice(0)) {
|
|
195
|
-
if (ws.readyState === WebSocket.OPEN) sendAudio(chunk);
|
|
196
|
-
}
|
|
197
|
-
return;
|
|
198
|
-
}
|
|
199
|
-
|
|
200
|
-
if (
|
|
201
|
-
message.type === "conversation.item.input_audio_transcription.delta" &&
|
|
202
|
-
typeof message.delta === "string" &&
|
|
203
|
-
message.delta.length > 0
|
|
204
|
-
) {
|
|
205
|
-
const normalized = normalizer.push(message.delta);
|
|
206
|
-
if (normalized.length > 0) {
|
|
207
|
-
callbacks.onTranscript(normalized);
|
|
208
|
-
}
|
|
209
|
-
}
|
|
210
|
-
});
|
|
211
|
-
|
|
212
|
-
ws.addEventListener("error", (event) => {
|
|
213
|
-
if (stopped) return;
|
|
214
|
-
const message =
|
|
215
|
-
/** @type {{ message?: string }} */ (event).message ?? "WebSocket error";
|
|
216
|
-
callbacks.onError(new Error(`${OPENAI_LABEL} WebSocket error: ${message}`));
|
|
217
|
-
stop();
|
|
218
|
-
});
|
|
219
|
-
|
|
220
|
-
ws.addEventListener("close", (event) => {
|
|
221
|
-
if (!stopped && event.code !== 1000 && event.code !== 1005) {
|
|
222
|
-
const reason = event.reason ? `: ${event.reason}` : "";
|
|
223
|
-
callbacks.onError(
|
|
224
|
-
new Error(
|
|
225
|
-
`${OPENAI_LABEL} WebSocket closed (code ${event.code}${reason})`,
|
|
226
|
-
),
|
|
69
|
+
isReadyMessage(message) {
|
|
70
|
+
return (
|
|
71
|
+
isObjectLike(message) &&
|
|
72
|
+
(message.type === "transcription_session.created" ||
|
|
73
|
+
message.type === "transcription_session.updated")
|
|
227
74
|
);
|
|
228
|
-
}
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
)
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
stopped = true;
|
|
246
|
-
rec.stop();
|
|
247
|
-
if (
|
|
248
|
-
ws.readyState === WebSocket.OPEN ||
|
|
249
|
-
ws.readyState === WebSocket.CONNECTING
|
|
250
|
-
) {
|
|
251
|
-
try {
|
|
252
|
-
ws.close(1000, "client stop");
|
|
253
|
-
} catch {
|
|
254
|
-
// ignore
|
|
75
|
+
},
|
|
76
|
+
extractError(message) {
|
|
77
|
+
if (!isObjectLike(message) || message.type !== "error") return undefined;
|
|
78
|
+
const error = message.error;
|
|
79
|
+
if (!isObjectLike(error)) return undefined;
|
|
80
|
+
return typeof error.message === "string"
|
|
81
|
+
? error.message
|
|
82
|
+
: JSON.stringify(error);
|
|
83
|
+
},
|
|
84
|
+
extractTranscript(message) {
|
|
85
|
+
if (
|
|
86
|
+
isObjectLike(message) &&
|
|
87
|
+
message.type === "conversation.item.input_audio_transcription.delta" &&
|
|
88
|
+
typeof message.delta === "string" &&
|
|
89
|
+
message.delta.length > 0
|
|
90
|
+
) {
|
|
91
|
+
return message.delta;
|
|
255
92
|
}
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
93
|
+
return undefined;
|
|
94
|
+
},
|
|
95
|
+
buildAudioPayload(chunk, _sampleRate) {
|
|
96
|
+
return {
|
|
97
|
+
type: "input_audio_buffer.append",
|
|
98
|
+
audio: chunk.toString("base64"),
|
|
99
|
+
};
|
|
100
|
+
},
|
|
101
|
+
};
|
|
259
102
|
|
|
260
|
-
return {
|
|
103
|
+
return startWebSocketVoiceSession({ hooks, config, callbacks });
|
|
261
104
|
}
|
|
@@ -170,6 +170,10 @@ export function startRecorder({ recorder, onAudio, onError, onExit }) {
|
|
|
170
170
|
/**
|
|
171
171
|
* Report an error asynchronously and return an already-terminated session.
|
|
172
172
|
*
|
|
173
|
+
* Calls `onError` followed by `onClose` in a microtask, ensuring the caller
|
|
174
|
+
* receives a valid {@link VoiceSession} synchronously while still notifying
|
|
175
|
+
* the consumer of the failure.
|
|
176
|
+
*
|
|
173
177
|
* @param {VoiceSessionCallbacks} callbacks
|
|
174
178
|
* @param {Error} error
|
|
175
179
|
* @returns {VoiceSession}
|
|
@@ -182,6 +186,281 @@ export function failVoiceSessionAsync(callbacks, error) {
|
|
|
182
186
|
return { stop: async () => {} };
|
|
183
187
|
}
|
|
184
188
|
|
|
189
|
+
/**
|
|
190
|
+
* Provider-specific hook contract for {@link startWebSocketVoiceSession}.
|
|
191
|
+
*
|
|
192
|
+
* Each hook is called at a specific point in the session lifecycle:
|
|
193
|
+
*
|
|
194
|
+
* 1. **Construction** – `buildWsUrl` (and optionally `buildWsOptions`) are
|
|
195
|
+
* invoked immediately to create the WebSocket.
|
|
196
|
+
* 2. **Open** – `buildSetupMessage` is sent as the first JSON message once the
|
|
197
|
+
* WebSocket opens.
|
|
198
|
+
* 3. **Ready** – `isReadyMessage` is tested on every incoming message until it
|
|
199
|
+
* returns `true`. At that point the session transitions to *ready* and any
|
|
200
|
+
* buffered audio chunks are flushed.
|
|
201
|
+
* 4. **Streaming** – `buildAudioPayload` is called for every recorder chunk
|
|
202
|
+
* while the WebSocket is open and ready.
|
|
203
|
+
* 5. **Error extraction** – `extractError` is checked on every message before
|
|
204
|
+
* transcript extraction. If it returns a string, the session reports an
|
|
205
|
+
* error and drops the message.
|
|
206
|
+
* 6. **Transcription** – `extractTranscript` is called on every message after
|
|
207
|
+
* the session is ready. Non-empty results are pushed through the CJK
|
|
208
|
+
* space normalizer and then forwarded to `onTranscript`.
|
|
209
|
+
*
|
|
210
|
+
* @template TConfig
|
|
211
|
+
* @typedef {Object} VoiceProviderHooks
|
|
212
|
+
* @property {string} label - Human-readable provider name (used in logs and
|
|
213
|
+
* error messages).
|
|
214
|
+
* @property {number} sampleRate - PCM sample rate expected by the provider
|
|
215
|
+
* (e.g. 16000 for Gemini, 24000 for OpenAI). Passed to the recorder and
|
|
216
|
+
* `buildAudioPayload`.
|
|
217
|
+
* @property {(config: TConfig) => string} buildWsUrl - Returns the full
|
|
218
|
+
* WebSocket URL, including any query parameters.
|
|
219
|
+
* @property {(config: TConfig) => { headers?: Record<string, string> }} [buildWsOptions]
|
|
220
|
+
* - Returns optional per-provider WebSocket constructor options. Node's
|
|
221
|
+
* global WebSocket (undici) accepts a non-standard `headers` option that
|
|
222
|
+
* is not declared in the standard typings.
|
|
223
|
+
* @property {(config: TConfig) => object} buildSetupMessage - Returns the
|
|
224
|
+
* first JSON message sent immediately after the WebSocket opens.
|
|
225
|
+
* @property {(message: unknown) => boolean} isReadyMessage - Returns `true`
|
|
226
|
+
* when the given server message signals that the provider is ready to
|
|
227
|
+
* receive audio.
|
|
228
|
+
* @property {(message: unknown) => string | undefined} extractTranscript -
|
|
229
|
+
* Extracts a transcript delta from a server message. Return `undefined`
|
|
230
|
+
* when the message carries no transcript.
|
|
231
|
+
* @property {(message: unknown) => string | undefined} [extractError] -
|
|
232
|
+
* Extracts an error description from a server message. Return `undefined`
|
|
233
|
+
* when the message carries no error.
|
|
234
|
+
* @property {(chunk: Buffer, sampleRate: number) => object} buildAudioPayload -
|
|
235
|
+
* Wraps a raw PCM chunk into the provider-specific JSON payload. The
|
|
236
|
+
* `sampleRate` argument is the same value as `hooks.sampleRate`.
|
|
237
|
+
*/
|
|
238
|
+
|
|
239
|
+
/**
|
|
240
|
+
* Shared WebSocket voice session implementation used by both Gemini and
|
|
241
|
+
* OpenAI drivers.
|
|
242
|
+
*
|
|
243
|
+
* Responsibilities of this function:
|
|
244
|
+
* - Detect and start a suitable system audio recorder.
|
|
245
|
+
* - Establish the provider WebSocket connection.
|
|
246
|
+
* - Manage the lifecycle (setup → ready → streaming → close).
|
|
247
|
+
* - Buffer audio chunks while the connection is not yet ready.
|
|
248
|
+
* - Apply CJK space normalization to transcript text.
|
|
249
|
+
*
|
|
250
|
+
* Responsibilities of the caller (the driver):
|
|
251
|
+
* - Provide a {@link VoiceProviderHooks} object that knows the provider's
|
|
252
|
+
* wire protocol (URLs, headers, message schemas).
|
|
253
|
+
* - Supply `config` and `callbacks` from the user's call site.
|
|
254
|
+
*
|
|
255
|
+
* @template TConfig
|
|
256
|
+
* @param {object} options
|
|
257
|
+
* @param {VoiceProviderHooks<TConfig>} options.hooks
|
|
258
|
+
* @param {TConfig & { recorder?: VoiceRecorderConfig }} options.config
|
|
259
|
+
* @param {VoiceSessionCallbacks} options.callbacks
|
|
260
|
+
* @returns {VoiceSession}
|
|
261
|
+
*/
|
|
262
|
+
export function startWebSocketVoiceSession({ hooks, config, callbacks }) {
|
|
263
|
+
const recorder =
|
|
264
|
+
config.recorder ?? detectRecorder(getRecorderCandidates(hooks.sampleRate));
|
|
265
|
+
if (!recorder) {
|
|
266
|
+
return failVoiceSessionAsync(
|
|
267
|
+
callbacks,
|
|
268
|
+
new Error(
|
|
269
|
+
"No voice recorder found. Install arecord, sox, or ffmpeg (or set `voiceInput.recorder`).",
|
|
270
|
+
),
|
|
271
|
+
);
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
if (!isCommandAvailable(recorder.command)) {
|
|
275
|
+
return failVoiceSessionAsync(
|
|
276
|
+
callbacks,
|
|
277
|
+
new Error(
|
|
278
|
+
`Voice recorder command "${recorder.command}" not found on PATH.`,
|
|
279
|
+
),
|
|
280
|
+
);
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
let stopped = false;
|
|
284
|
+
let closeEmitted = false;
|
|
285
|
+
let ready = false;
|
|
286
|
+
/** @type {Buffer[]} */
|
|
287
|
+
const pendingAudio = [];
|
|
288
|
+
const normalizer = createCJKSpaceNormalizer();
|
|
289
|
+
|
|
290
|
+
function emitClose() {
|
|
291
|
+
if (closeEmitted) return;
|
|
292
|
+
closeEmitted = true;
|
|
293
|
+
callbacks.onClose?.();
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
const wsUrl = hooks.buildWsUrl(config);
|
|
297
|
+
const wsOptions = hooks.buildWsOptions?.(config);
|
|
298
|
+
|
|
299
|
+
// Node's global WebSocket (undici) accepts a non-standard `headers`
|
|
300
|
+
// option. The built-in typings only declare the standards-compliant
|
|
301
|
+
// constructor, so cast through `WebSocket`-as-constructor.
|
|
302
|
+
const Ctor = /** @type {new (url: string, opts?: unknown) => WebSocket} */ (
|
|
303
|
+
/** @type {unknown} */ (WebSocket)
|
|
304
|
+
);
|
|
305
|
+
const ws = new Ctor(wsUrl, wsOptions);
|
|
306
|
+
ws.binaryType = "arraybuffer";
|
|
307
|
+
|
|
308
|
+
const rec = startRecorder({
|
|
309
|
+
recorder,
|
|
310
|
+
onAudio(chunk) {
|
|
311
|
+
if (stopped) return;
|
|
312
|
+
if (ready && ws.readyState === WebSocket.OPEN) {
|
|
313
|
+
sendAudio(chunk);
|
|
314
|
+
} else {
|
|
315
|
+
pendingAudio.push(chunk);
|
|
316
|
+
}
|
|
317
|
+
},
|
|
318
|
+
onError(err) {
|
|
319
|
+
if (!stopped) callbacks.onError(err);
|
|
320
|
+
stop();
|
|
321
|
+
},
|
|
322
|
+
onExit() {
|
|
323
|
+
stop();
|
|
324
|
+
},
|
|
325
|
+
});
|
|
326
|
+
|
|
327
|
+
/**
|
|
328
|
+
* @param {Buffer} chunk
|
|
329
|
+
*/
|
|
330
|
+
function sendAudio(chunk) {
|
|
331
|
+
const payload = hooks.buildAudioPayload(chunk, hooks.sampleRate);
|
|
332
|
+
try {
|
|
333
|
+
ws.send(JSON.stringify(payload));
|
|
334
|
+
} catch (err) {
|
|
335
|
+
if (VOICE_DEBUG) {
|
|
336
|
+
process.stderr.write(
|
|
337
|
+
`[voiceInput] sendAudio dropped: ${formatError(err)}\n`,
|
|
338
|
+
);
|
|
339
|
+
}
|
|
340
|
+
}
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
ws.addEventListener("open", () => {
|
|
344
|
+
const setup = hooks.buildSetupMessage(config);
|
|
345
|
+
try {
|
|
346
|
+
ws.send(JSON.stringify(setup));
|
|
347
|
+
} catch (err) {
|
|
348
|
+
callbacks.onError(
|
|
349
|
+
new Error(`Failed to send setup message: ${formatError(err)}`),
|
|
350
|
+
);
|
|
351
|
+
stop();
|
|
352
|
+
}
|
|
353
|
+
});
|
|
354
|
+
|
|
355
|
+
ws.addEventListener("message", (event) => {
|
|
356
|
+
if (stopped) return;
|
|
357
|
+
let raw = "";
|
|
358
|
+
let message;
|
|
359
|
+
try {
|
|
360
|
+
raw =
|
|
361
|
+
typeof event.data === "string"
|
|
362
|
+
? event.data
|
|
363
|
+
: Buffer.from(/** @type {ArrayBuffer} */ (event.data)).toString(
|
|
364
|
+
"utf8",
|
|
365
|
+
);
|
|
366
|
+
message = JSON.parse(raw);
|
|
367
|
+
} catch (err) {
|
|
368
|
+
callbacks.onError(
|
|
369
|
+
new Error(`Failed to parse server message: ${formatError(err)}`),
|
|
370
|
+
);
|
|
371
|
+
return;
|
|
372
|
+
}
|
|
373
|
+
if (!isObjectLike(message)) return;
|
|
374
|
+
if (VOICE_DEBUG) {
|
|
375
|
+
process.stderr.write(`[voiceInput] <- ${raw.slice(0, 800)}\n`);
|
|
376
|
+
}
|
|
377
|
+
|
|
378
|
+
const errorText = hooks.extractError?.(message);
|
|
379
|
+
if (errorText) {
|
|
380
|
+
callbacks.onError(new Error(`${hooks.label} error: ${errorText}`));
|
|
381
|
+
return;
|
|
382
|
+
}
|
|
383
|
+
|
|
384
|
+
if (!ready && hooks.isReadyMessage(message)) {
|
|
385
|
+
ready = true;
|
|
386
|
+
for (const chunk of pendingAudio.splice(0)) {
|
|
387
|
+
if (ws.readyState === WebSocket.OPEN) sendAudio(chunk);
|
|
388
|
+
}
|
|
389
|
+
return;
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
const transcript = hooks.extractTranscript(message);
|
|
393
|
+
if (transcript && transcript.length > 0) {
|
|
394
|
+
const normalized = normalizer.push(transcript);
|
|
395
|
+
if (normalized.length > 0) {
|
|
396
|
+
callbacks.onTranscript(normalized);
|
|
397
|
+
}
|
|
398
|
+
}
|
|
399
|
+
});
|
|
400
|
+
|
|
401
|
+
ws.addEventListener("error", (event) => {
|
|
402
|
+
if (stopped) return;
|
|
403
|
+
const message =
|
|
404
|
+
/** @type {{ message?: string }} */ (event).message ?? "WebSocket error";
|
|
405
|
+
callbacks.onError(new Error(`${hooks.label} WebSocket error: ${message}`));
|
|
406
|
+
stop();
|
|
407
|
+
});
|
|
408
|
+
|
|
409
|
+
ws.addEventListener("close", (event) => {
|
|
410
|
+
if (!stopped && event.code !== 1000 && event.code !== 1005) {
|
|
411
|
+
const reason = event.reason ? `: ${event.reason}` : "";
|
|
412
|
+
callbacks.onError(
|
|
413
|
+
new Error(
|
|
414
|
+
`${hooks.label} WebSocket closed (code ${event.code}${reason})`,
|
|
415
|
+
),
|
|
416
|
+
);
|
|
417
|
+
}
|
|
418
|
+
stopped = true;
|
|
419
|
+
rec.stop();
|
|
420
|
+
emitClose();
|
|
421
|
+
});
|
|
422
|
+
|
|
423
|
+
if (VOICE_DEBUG) {
|
|
424
|
+
process.stderr.write(
|
|
425
|
+
`[voiceInput] driver=${hooks.label} recorder=${recorder.command} ${recorder.args.join(" ")}\n`,
|
|
426
|
+
);
|
|
427
|
+
}
|
|
428
|
+
|
|
429
|
+
/**
|
|
430
|
+
* Stops the recorder and closes the WebSocket.
|
|
431
|
+
*
|
|
432
|
+
* **Note on asynchronicity:** This function is `async` only to satisfy the
|
|
433
|
+
* {@link VoiceSession} interface. It is called without `await` from event
|
|
434
|
+
* listeners (recorder exit, WebSocket error/close). Callers must not rely
|
|
435
|
+
* on the returned promise because unhandled rejections would crash the
|
|
436
|
+
* process. If the function is ever changed to perform real async work,
|
|
437
|
+
* every call site must wrap it with `.catch(() => {})`.
|
|
438
|
+
*/
|
|
439
|
+
async function stop() {
|
|
440
|
+
if (stopped) return;
|
|
441
|
+
stopped = true;
|
|
442
|
+
rec.stop();
|
|
443
|
+
pendingAudio.length = 0;
|
|
444
|
+
if (
|
|
445
|
+
ws.readyState === WebSocket.OPEN ||
|
|
446
|
+
ws.readyState === WebSocket.CONNECTING
|
|
447
|
+
) {
|
|
448
|
+
try {
|
|
449
|
+
ws.close(1000, "client stop");
|
|
450
|
+
} catch (err) {
|
|
451
|
+
if (VOICE_DEBUG) {
|
|
452
|
+
process.stderr.write(
|
|
453
|
+
`[voiceInput] ws.close failed: ${formatError(err)}\n`,
|
|
454
|
+
);
|
|
455
|
+
}
|
|
456
|
+
}
|
|
457
|
+
}
|
|
458
|
+
emitClose();
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
return { stop };
|
|
462
|
+
}
|
|
463
|
+
|
|
185
464
|
/**
|
|
186
465
|
* Drop whitespace sitting between two CJK characters. Some providers return
|
|
187
466
|
* Japanese transcripts with morpheme-separating spaces ("そう 、 声 で");
|
|
@@ -192,8 +471,14 @@ export function failVoiceSessionAsync(callbacks, error) {
|
|
|
192
471
|
export function createCJKSpaceNormalizer() {
|
|
193
472
|
let prevChar = "";
|
|
194
473
|
let pendingSpaces = "";
|
|
195
|
-
|
|
196
|
-
|
|
474
|
+
|
|
475
|
+
/**
|
|
476
|
+
* @param {string} c
|
|
477
|
+
* @returns {boolean}
|
|
478
|
+
*/
|
|
479
|
+
function isSpace(c) {
|
|
480
|
+
return c === " " || c === "\t" || c === "\u3000";
|
|
481
|
+
}
|
|
197
482
|
|
|
198
483
|
return {
|
|
199
484
|
push(text) {
|
|
@@ -248,3 +533,11 @@ function isCJKChar(ch) {
|
|
|
248
533
|
export function isObjectLike(value) {
|
|
249
534
|
return typeof value === "object" && value !== null;
|
|
250
535
|
}
|
|
536
|
+
|
|
537
|
+
/**
|
|
538
|
+
* @param {unknown} err
|
|
539
|
+
* @returns {string}
|
|
540
|
+
*/
|
|
541
|
+
function formatError(err) {
|
|
542
|
+
return err instanceof Error ? err.message : String(err);
|
|
543
|
+
}
|