@iinm/plain-agent 1.7.19 → 1.7.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +82 -118
- package/config/agents.predefined/sandbox-configurator.md +16 -18
- package/config/config.predefined.json +15 -15
- package/config/prompts.predefined/shortcuts/configure.md +60 -0
- package/package.json +1 -3
- package/src/agentLoop.mjs +3 -1
- package/src/cliCost.mjs +67 -32
- package/src/cliFormatter.mjs +1 -1
- package/src/cliInteractive.mjs +1 -1
- package/src/config.d.ts +2 -2
- package/src/config.mjs +1 -1
- package/src/costTracker.mjs +58 -19
- package/src/env.mjs +0 -6
- package/src/main.mjs +2 -6
- package/src/model.d.ts +1 -1
- package/src/tools/patchFile.mjs +11 -12
- package/src/utils/notify.mjs +3 -2
- package/src/voiceInputGemini.mjs +58 -210
- package/src/voiceInputOpenAI.mjs +63 -220
- package/src/voiceInputSession.mjs +295 -2
- package/bin/plain-notify-terminal-bell +0 -3
package/src/voiceInputGemini.mjs
CHANGED
|
@@ -1,16 +1,10 @@
|
|
|
1
1
|
import {
|
|
2
|
-
createCJKSpaceNormalizer,
|
|
3
|
-
detectRecorder,
|
|
4
|
-
failVoiceSessionAsync,
|
|
5
|
-
getRecorderCandidates,
|
|
6
|
-
isCommandAvailable,
|
|
7
2
|
isObjectLike,
|
|
8
|
-
|
|
9
|
-
VOICE_DEBUG,
|
|
3
|
+
startWebSocketVoiceSession,
|
|
10
4
|
} from "./voiceInputSession.mjs";
|
|
11
5
|
|
|
12
6
|
/**
|
|
13
|
-
* @import { VoiceRecorderConfig, VoiceSession, VoiceSessionCallbacks } from "./voiceInputSession.mjs"
|
|
7
|
+
* @import { VoiceProviderHooks, VoiceRecorderConfig, VoiceSession, VoiceSessionCallbacks } from "./voiceInputSession.mjs"
|
|
14
8
|
*/
|
|
15
9
|
|
|
16
10
|
/**
|
|
@@ -45,213 +39,67 @@ const GEMINI_LABEL = "Gemini Live";
|
|
|
45
39
|
* @returns {VoiceSession}
|
|
46
40
|
*/
|
|
47
41
|
export function startGeminiVoiceSession({ config, callbacks }) {
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
)
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
const emitClose = () => {
|
|
80
|
-
if (closeEmitted) return;
|
|
81
|
-
closeEmitted = true;
|
|
82
|
-
callbacks.onClose?.();
|
|
83
|
-
};
|
|
84
|
-
|
|
85
|
-
const ws = new WebSocket(`${base}?key=${encodeURIComponent(config.apiKey)}`);
|
|
86
|
-
ws.binaryType = "arraybuffer";
|
|
87
|
-
|
|
88
|
-
const rec = startRecorder({
|
|
89
|
-
recorder,
|
|
90
|
-
onAudio(chunk) {
|
|
91
|
-
if (stopped) return;
|
|
92
|
-
if (ready && ws.readyState === WebSocket.OPEN) {
|
|
93
|
-
sendAudio(chunk);
|
|
94
|
-
} else {
|
|
95
|
-
pendingAudio.push(chunk);
|
|
42
|
+
/** @type {VoiceProviderHooks<VoiceInputGeminiConfig>} */
|
|
43
|
+
const hooks = {
|
|
44
|
+
label: GEMINI_LABEL,
|
|
45
|
+
sampleRate: GEMINI_SAMPLE_RATE,
|
|
46
|
+
buildWsUrl(config) {
|
|
47
|
+
const base = config.baseURL ?? GEMINI_DEFAULT_WS;
|
|
48
|
+
return `${base}?key=${encodeURIComponent(config.apiKey)}`;
|
|
49
|
+
},
|
|
50
|
+
buildSetupMessage(config) {
|
|
51
|
+
const model = config.model ?? GEMINI_DEFAULT_MODEL;
|
|
52
|
+
/** @type {Record<string, unknown>} */
|
|
53
|
+
const generationConfig = {
|
|
54
|
+
// https://ai.google.dev/gemini-api/docs/live-api/capabilities#response-modalities
|
|
55
|
+
// > The native audio models only support `AUDIO` response modality.
|
|
56
|
+
responseModalities: ["AUDIO"],
|
|
57
|
+
maxOutputTokens: 1,
|
|
58
|
+
};
|
|
59
|
+
if (model.includes("2.5")) {
|
|
60
|
+
generationConfig.thinkingConfig = { thinkingBudget: 0 };
|
|
61
|
+
}
|
|
62
|
+
/** @type {Record<string, unknown>} */
|
|
63
|
+
const setup = {
|
|
64
|
+
model: `models/${model}`,
|
|
65
|
+
generationConfig,
|
|
66
|
+
inputAudioTranscription: {},
|
|
67
|
+
};
|
|
68
|
+
if (config.language) {
|
|
69
|
+
setup.systemInstruction = {
|
|
70
|
+
parts: [{ text: `The user is speaking in ${config.language}.` }],
|
|
71
|
+
};
|
|
96
72
|
}
|
|
73
|
+
return { setup };
|
|
97
74
|
},
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
stop();
|
|
75
|
+
isReadyMessage(message) {
|
|
76
|
+
return isObjectLike(message) && "setupComplete" in message;
|
|
101
77
|
},
|
|
102
|
-
|
|
103
|
-
|
|
78
|
+
extractTranscript(message) {
|
|
79
|
+
if (!isObjectLike(message)) return undefined;
|
|
80
|
+
const serverContent = message.serverContent;
|
|
81
|
+
if (!isObjectLike(serverContent)) return undefined;
|
|
82
|
+
const transcription = serverContent.inputTranscription;
|
|
83
|
+
if (
|
|
84
|
+
isObjectLike(transcription) &&
|
|
85
|
+
typeof transcription.text === "string" &&
|
|
86
|
+
transcription.text.length > 0
|
|
87
|
+
) {
|
|
88
|
+
return transcription.text;
|
|
89
|
+
}
|
|
90
|
+
return undefined;
|
|
104
91
|
},
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
realtimeInput: {
|
|
113
|
-
audio: {
|
|
114
|
-
data: chunk.toString("base64"),
|
|
115
|
-
mimeType: `audio/pcm;rate=${GEMINI_SAMPLE_RATE}`,
|
|
92
|
+
buildAudioPayload(chunk, sampleRate) {
|
|
93
|
+
return {
|
|
94
|
+
realtimeInput: {
|
|
95
|
+
audio: {
|
|
96
|
+
data: chunk.toString("base64"),
|
|
97
|
+
mimeType: `audio/pcm;rate=${sampleRate}`,
|
|
98
|
+
},
|
|
116
99
|
},
|
|
117
|
-
},
|
|
118
|
-
};
|
|
119
|
-
try {
|
|
120
|
-
ws.send(JSON.stringify(payload));
|
|
121
|
-
} catch {
|
|
122
|
-
// connection may have just closed
|
|
123
|
-
}
|
|
124
|
-
}
|
|
125
|
-
|
|
126
|
-
ws.addEventListener("open", () => {
|
|
127
|
-
/** @type {Record<string, unknown>} */
|
|
128
|
-
const generationConfig = {
|
|
129
|
-
// https://ai.google.dev/gemini-api/docs/live-api/capabilities#response-modalities
|
|
130
|
-
// > The native audio models only support `AUDIO` response modality.
|
|
131
|
-
responseModalities: ["AUDIO"],
|
|
132
|
-
maxOutputTokens: 1,
|
|
133
|
-
};
|
|
134
|
-
if (model.includes("2.5")) {
|
|
135
|
-
generationConfig.thinkingConfig = { thinkingBudget: 0 };
|
|
136
|
-
}
|
|
137
|
-
/** @type {Record<string, unknown>} */
|
|
138
|
-
const setup = {
|
|
139
|
-
model: `models/${model}`,
|
|
140
|
-
generationConfig,
|
|
141
|
-
inputAudioTranscription: {},
|
|
142
|
-
};
|
|
143
|
-
if (config.language) {
|
|
144
|
-
setup.systemInstruction = {
|
|
145
|
-
parts: [{ text: `The user is speaking in ${config.language}.` }],
|
|
146
100
|
};
|
|
147
|
-
}
|
|
148
|
-
|
|
149
|
-
ws.send(JSON.stringify({ setup }));
|
|
150
|
-
} catch (err) {
|
|
151
|
-
callbacks.onError(
|
|
152
|
-
new Error(
|
|
153
|
-
`Failed to send setup message: ${err instanceof Error ? err.message : String(err)}`,
|
|
154
|
-
),
|
|
155
|
-
);
|
|
156
|
-
stop();
|
|
157
|
-
}
|
|
158
|
-
});
|
|
159
|
-
|
|
160
|
-
ws.addEventListener("message", (event) => {
|
|
161
|
-
if (stopped) return;
|
|
162
|
-
let raw = "";
|
|
163
|
-
let message;
|
|
164
|
-
try {
|
|
165
|
-
raw =
|
|
166
|
-
typeof event.data === "string"
|
|
167
|
-
? event.data
|
|
168
|
-
: Buffer.from(/** @type {ArrayBuffer} */ (event.data)).toString(
|
|
169
|
-
"utf8",
|
|
170
|
-
);
|
|
171
|
-
message = JSON.parse(raw);
|
|
172
|
-
} catch (err) {
|
|
173
|
-
callbacks.onError(
|
|
174
|
-
new Error(
|
|
175
|
-
`Failed to parse server message: ${err instanceof Error ? err.message : String(err)}`,
|
|
176
|
-
),
|
|
177
|
-
);
|
|
178
|
-
return;
|
|
179
|
-
}
|
|
180
|
-
if (!isObjectLike(message)) return;
|
|
181
|
-
if (VOICE_DEBUG) {
|
|
182
|
-
process.stderr.write(`[voiceInput] <- ${raw.slice(0, 800)}\n`);
|
|
183
|
-
}
|
|
184
|
-
|
|
185
|
-
if (!ready && "setupComplete" in message) {
|
|
186
|
-
ready = true;
|
|
187
|
-
for (const chunk of pendingAudio.splice(0)) {
|
|
188
|
-
if (ws.readyState === WebSocket.OPEN) sendAudio(chunk);
|
|
189
|
-
}
|
|
190
|
-
return;
|
|
191
|
-
}
|
|
192
|
-
|
|
193
|
-
const serverContent = message.serverContent;
|
|
194
|
-
if (!isObjectLike(serverContent)) return;
|
|
195
|
-
const transcription = serverContent.inputTranscription;
|
|
196
|
-
if (
|
|
197
|
-
isObjectLike(transcription) &&
|
|
198
|
-
typeof transcription.text === "string" &&
|
|
199
|
-
transcription.text.length > 0
|
|
200
|
-
) {
|
|
201
|
-
const normalized = normalizer.push(transcription.text);
|
|
202
|
-
if (normalized.length > 0) {
|
|
203
|
-
callbacks.onTranscript(normalized);
|
|
204
|
-
}
|
|
205
|
-
}
|
|
206
|
-
});
|
|
207
|
-
|
|
208
|
-
ws.addEventListener("error", (event) => {
|
|
209
|
-
if (stopped) return;
|
|
210
|
-
const message =
|
|
211
|
-
/** @type {{ message?: string }} */ (event).message ?? "WebSocket error";
|
|
212
|
-
callbacks.onError(new Error(`${GEMINI_LABEL} WebSocket error: ${message}`));
|
|
213
|
-
stop();
|
|
214
|
-
});
|
|
215
|
-
|
|
216
|
-
ws.addEventListener("close", (event) => {
|
|
217
|
-
if (!stopped && event.code !== 1000 && event.code !== 1005) {
|
|
218
|
-
const reason = event.reason ? `: ${event.reason}` : "";
|
|
219
|
-
callbacks.onError(
|
|
220
|
-
new Error(
|
|
221
|
-
`${GEMINI_LABEL} WebSocket closed (code ${event.code}${reason})`,
|
|
222
|
-
),
|
|
223
|
-
);
|
|
224
|
-
}
|
|
225
|
-
stopped = true;
|
|
226
|
-
rec.stop();
|
|
227
|
-
emitClose();
|
|
228
|
-
});
|
|
229
|
-
|
|
230
|
-
if (VOICE_DEBUG) {
|
|
231
|
-
process.stderr.write(
|
|
232
|
-
`[voiceInput] driver=${GEMINI_LABEL} recorder=${recorder.command} ${recorder.args.join(" ")}\n`,
|
|
233
|
-
);
|
|
234
|
-
}
|
|
235
|
-
|
|
236
|
-
/**
|
|
237
|
-
* @returns {Promise<void>}
|
|
238
|
-
*/
|
|
239
|
-
async function stop() {
|
|
240
|
-
if (stopped) return;
|
|
241
|
-
stopped = true;
|
|
242
|
-
rec.stop();
|
|
243
|
-
if (
|
|
244
|
-
ws.readyState === WebSocket.OPEN ||
|
|
245
|
-
ws.readyState === WebSocket.CONNECTING
|
|
246
|
-
) {
|
|
247
|
-
try {
|
|
248
|
-
ws.close(1000, "client stop");
|
|
249
|
-
} catch {
|
|
250
|
-
// ignore
|
|
251
|
-
}
|
|
252
|
-
}
|
|
253
|
-
emitClose();
|
|
254
|
-
}
|
|
101
|
+
},
|
|
102
|
+
};
|
|
255
103
|
|
|
256
|
-
return {
|
|
104
|
+
return startWebSocketVoiceSession({ hooks, config, callbacks });
|
|
257
105
|
}
|
package/src/voiceInputOpenAI.mjs
CHANGED
|
@@ -1,16 +1,10 @@
|
|
|
1
1
|
import {
|
|
2
|
-
createCJKSpaceNormalizer,
|
|
3
|
-
detectRecorder,
|
|
4
|
-
failVoiceSessionAsync,
|
|
5
|
-
getRecorderCandidates,
|
|
6
|
-
isCommandAvailable,
|
|
7
2
|
isObjectLike,
|
|
8
|
-
|
|
9
|
-
VOICE_DEBUG,
|
|
3
|
+
startWebSocketVoiceSession,
|
|
10
4
|
} from "./voiceInputSession.mjs";
|
|
11
5
|
|
|
12
6
|
/**
|
|
13
|
-
* @import { VoiceRecorderConfig, VoiceSession, VoiceSessionCallbacks } from "./voiceInputSession.mjs"
|
|
7
|
+
* @import { VoiceProviderHooks, VoiceRecorderConfig, VoiceSession, VoiceSessionCallbacks } from "./voiceInputSession.mjs"
|
|
14
8
|
*/
|
|
15
9
|
|
|
16
10
|
/**
|
|
@@ -40,222 +34,71 @@ const OPENAI_LABEL = "OpenAI Realtime";
|
|
|
40
34
|
* @returns {VoiceSession}
|
|
41
35
|
*/
|
|
42
36
|
export function startOpenAIVoiceSession({ config, callbacks }) {
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
"No voice recorder found. Install arecord, sox, or ffmpeg (or set `voiceInput.recorder`).",
|
|
51
|
-
),
|
|
52
|
-
);
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
if (!isCommandAvailable(recorder.command)) {
|
|
56
|
-
return failVoiceSessionAsync(
|
|
57
|
-
callbacks,
|
|
58
|
-
new Error(
|
|
59
|
-
`Voice recorder command "${recorder.command}" not found on PATH.`,
|
|
60
|
-
),
|
|
61
|
-
);
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
const model = config.model ?? OPENAI_DEFAULT_MODEL;
|
|
65
|
-
const base = config.baseURL ?? OPENAI_DEFAULT_WS;
|
|
66
|
-
|
|
67
|
-
let stopped = false;
|
|
68
|
-
let closeEmitted = false;
|
|
69
|
-
let ready = false;
|
|
70
|
-
/** @type {Buffer[]} */
|
|
71
|
-
const pendingAudio = [];
|
|
72
|
-
const normalizer = createCJKSpaceNormalizer();
|
|
73
|
-
|
|
74
|
-
const emitClose = () => {
|
|
75
|
-
if (closeEmitted) return;
|
|
76
|
-
closeEmitted = true;
|
|
77
|
-
callbacks.onClose?.();
|
|
78
|
-
};
|
|
79
|
-
|
|
80
|
-
// Node's global WebSocket (undici) accepts a non-standard `headers`
|
|
81
|
-
// option. The built-in typings only declare the standards-compliant
|
|
82
|
-
// constructor, so cast through `WebSocket`-as-constructor.
|
|
83
|
-
const Ctor = /** @type {new (url: string, opts?: unknown) => WebSocket} */ (
|
|
84
|
-
/** @type {unknown} */ (WebSocket)
|
|
85
|
-
);
|
|
86
|
-
const ws = new Ctor(`${base}?intent=transcription`, {
|
|
87
|
-
headers: {
|
|
88
|
-
Authorization: `Bearer ${config.apiKey}`,
|
|
89
|
-
"OpenAI-Beta": "realtime=v1",
|
|
90
|
-
},
|
|
91
|
-
});
|
|
92
|
-
ws.binaryType = "arraybuffer";
|
|
93
|
-
|
|
94
|
-
const rec = startRecorder({
|
|
95
|
-
recorder,
|
|
96
|
-
onAudio(chunk) {
|
|
97
|
-
if (stopped) return;
|
|
98
|
-
if (ready && ws.readyState === WebSocket.OPEN) {
|
|
99
|
-
sendAudio(chunk);
|
|
100
|
-
} else {
|
|
101
|
-
pendingAudio.push(chunk);
|
|
102
|
-
}
|
|
37
|
+
/** @type {VoiceProviderHooks<VoiceInputOpenAIConfig>} */
|
|
38
|
+
const hooks = {
|
|
39
|
+
label: OPENAI_LABEL,
|
|
40
|
+
sampleRate: OPENAI_SAMPLE_RATE,
|
|
41
|
+
buildWsUrl(config) {
|
|
42
|
+
const base = config.baseURL ?? OPENAI_DEFAULT_WS;
|
|
43
|
+
return `${base}?intent=transcription`;
|
|
103
44
|
},
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
45
|
+
buildWsOptions(config) {
|
|
46
|
+
return {
|
|
47
|
+
headers: {
|
|
48
|
+
Authorization: `Bearer ${config.apiKey}`,
|
|
49
|
+
"OpenAI-Beta": "realtime=v1",
|
|
50
|
+
},
|
|
51
|
+
};
|
|
107
52
|
},
|
|
108
|
-
|
|
109
|
-
|
|
53
|
+
buildSetupMessage(config) {
|
|
54
|
+
const model = config.model ?? OPENAI_DEFAULT_MODEL;
|
|
55
|
+
/** @type {{ model: string, language?: string }} */
|
|
56
|
+
const transcription = { model };
|
|
57
|
+
if (config.language) transcription.language = config.language;
|
|
58
|
+
// The `?intent=transcription` endpoint uses the flat transcription-session
|
|
59
|
+
// schema, not the nested `session.audio.input.*` realtime schema.
|
|
60
|
+
return {
|
|
61
|
+
type: "transcription_session.update",
|
|
62
|
+
session: {
|
|
63
|
+
input_audio_format: "pcm16",
|
|
64
|
+
input_audio_transcription: transcription,
|
|
65
|
+
turn_detection: { type: "server_vad" },
|
|
66
|
+
},
|
|
67
|
+
};
|
|
110
68
|
},
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
function sendAudio(chunk) {
|
|
117
|
-
const payload = {
|
|
118
|
-
type: "input_audio_buffer.append",
|
|
119
|
-
audio: chunk.toString("base64"),
|
|
120
|
-
};
|
|
121
|
-
try {
|
|
122
|
-
ws.send(JSON.stringify(payload));
|
|
123
|
-
} catch {
|
|
124
|
-
// connection may have just closed
|
|
125
|
-
}
|
|
126
|
-
}
|
|
127
|
-
|
|
128
|
-
ws.addEventListener("open", () => {
|
|
129
|
-
/** @type {{ model: string, language?: string }} */
|
|
130
|
-
const transcription = { model };
|
|
131
|
-
if (config.language) transcription.language = config.language;
|
|
132
|
-
// The `?intent=transcription` endpoint uses the flat transcription-session
|
|
133
|
-
// schema, not the nested `session.audio.input.*` realtime schema.
|
|
134
|
-
const setup = {
|
|
135
|
-
type: "transcription_session.update",
|
|
136
|
-
session: {
|
|
137
|
-
input_audio_format: "pcm16",
|
|
138
|
-
input_audio_transcription: transcription,
|
|
139
|
-
turn_detection: { type: "server_vad" },
|
|
140
|
-
},
|
|
141
|
-
};
|
|
142
|
-
try {
|
|
143
|
-
ws.send(JSON.stringify(setup));
|
|
144
|
-
} catch (err) {
|
|
145
|
-
callbacks.onError(
|
|
146
|
-
new Error(
|
|
147
|
-
`Failed to send setup message: ${err instanceof Error ? err.message : String(err)}`,
|
|
148
|
-
),
|
|
149
|
-
);
|
|
150
|
-
stop();
|
|
151
|
-
}
|
|
152
|
-
});
|
|
153
|
-
|
|
154
|
-
ws.addEventListener("message", (event) => {
|
|
155
|
-
if (stopped) return;
|
|
156
|
-
let raw = "";
|
|
157
|
-
let message;
|
|
158
|
-
try {
|
|
159
|
-
raw =
|
|
160
|
-
typeof event.data === "string"
|
|
161
|
-
? event.data
|
|
162
|
-
: Buffer.from(/** @type {ArrayBuffer} */ (event.data)).toString(
|
|
163
|
-
"utf8",
|
|
164
|
-
);
|
|
165
|
-
message = JSON.parse(raw);
|
|
166
|
-
} catch (err) {
|
|
167
|
-
callbacks.onError(
|
|
168
|
-
new Error(
|
|
169
|
-
`Failed to parse server message: ${err instanceof Error ? err.message : String(err)}`,
|
|
170
|
-
),
|
|
171
|
-
);
|
|
172
|
-
return;
|
|
173
|
-
}
|
|
174
|
-
if (!isObjectLike(message)) return;
|
|
175
|
-
if (VOICE_DEBUG) {
|
|
176
|
-
process.stderr.write(`[voiceInput] <- ${raw.slice(0, 800)}\n`);
|
|
177
|
-
}
|
|
178
|
-
|
|
179
|
-
if (message.type === "error" && isObjectLike(message.error)) {
|
|
180
|
-
const detail =
|
|
181
|
-
typeof message.error.message === "string"
|
|
182
|
-
? message.error.message
|
|
183
|
-
: JSON.stringify(message.error);
|
|
184
|
-
callbacks.onError(new Error(`${OPENAI_LABEL} error: ${detail}`));
|
|
185
|
-
return;
|
|
186
|
-
}
|
|
187
|
-
|
|
188
|
-
if (
|
|
189
|
-
!ready &&
|
|
190
|
-
(message.type === "transcription_session.created" ||
|
|
191
|
-
message.type === "transcription_session.updated")
|
|
192
|
-
) {
|
|
193
|
-
ready = true;
|
|
194
|
-
for (const chunk of pendingAudio.splice(0)) {
|
|
195
|
-
if (ws.readyState === WebSocket.OPEN) sendAudio(chunk);
|
|
196
|
-
}
|
|
197
|
-
return;
|
|
198
|
-
}
|
|
199
|
-
|
|
200
|
-
if (
|
|
201
|
-
message.type === "conversation.item.input_audio_transcription.delta" &&
|
|
202
|
-
typeof message.delta === "string" &&
|
|
203
|
-
message.delta.length > 0
|
|
204
|
-
) {
|
|
205
|
-
const normalized = normalizer.push(message.delta);
|
|
206
|
-
if (normalized.length > 0) {
|
|
207
|
-
callbacks.onTranscript(normalized);
|
|
208
|
-
}
|
|
209
|
-
}
|
|
210
|
-
});
|
|
211
|
-
|
|
212
|
-
ws.addEventListener("error", (event) => {
|
|
213
|
-
if (stopped) return;
|
|
214
|
-
const message =
|
|
215
|
-
/** @type {{ message?: string }} */ (event).message ?? "WebSocket error";
|
|
216
|
-
callbacks.onError(new Error(`${OPENAI_LABEL} WebSocket error: ${message}`));
|
|
217
|
-
stop();
|
|
218
|
-
});
|
|
219
|
-
|
|
220
|
-
ws.addEventListener("close", (event) => {
|
|
221
|
-
if (!stopped && event.code !== 1000 && event.code !== 1005) {
|
|
222
|
-
const reason = event.reason ? `: ${event.reason}` : "";
|
|
223
|
-
callbacks.onError(
|
|
224
|
-
new Error(
|
|
225
|
-
`${OPENAI_LABEL} WebSocket closed (code ${event.code}${reason})`,
|
|
226
|
-
),
|
|
69
|
+
isReadyMessage(message) {
|
|
70
|
+
return (
|
|
71
|
+
isObjectLike(message) &&
|
|
72
|
+
(message.type === "transcription_session.created" ||
|
|
73
|
+
message.type === "transcription_session.updated")
|
|
227
74
|
);
|
|
228
|
-
}
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
)
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
stopped = true;
|
|
246
|
-
rec.stop();
|
|
247
|
-
if (
|
|
248
|
-
ws.readyState === WebSocket.OPEN ||
|
|
249
|
-
ws.readyState === WebSocket.CONNECTING
|
|
250
|
-
) {
|
|
251
|
-
try {
|
|
252
|
-
ws.close(1000, "client stop");
|
|
253
|
-
} catch {
|
|
254
|
-
// ignore
|
|
75
|
+
},
|
|
76
|
+
extractError(message) {
|
|
77
|
+
if (!isObjectLike(message) || message.type !== "error") return undefined;
|
|
78
|
+
const error = message.error;
|
|
79
|
+
if (!isObjectLike(error)) return undefined;
|
|
80
|
+
return typeof error.message === "string"
|
|
81
|
+
? error.message
|
|
82
|
+
: JSON.stringify(error);
|
|
83
|
+
},
|
|
84
|
+
extractTranscript(message) {
|
|
85
|
+
if (
|
|
86
|
+
isObjectLike(message) &&
|
|
87
|
+
message.type === "conversation.item.input_audio_transcription.delta" &&
|
|
88
|
+
typeof message.delta === "string" &&
|
|
89
|
+
message.delta.length > 0
|
|
90
|
+
) {
|
|
91
|
+
return message.delta;
|
|
255
92
|
}
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
93
|
+
return undefined;
|
|
94
|
+
},
|
|
95
|
+
buildAudioPayload(chunk, _sampleRate) {
|
|
96
|
+
return {
|
|
97
|
+
type: "input_audio_buffer.append",
|
|
98
|
+
audio: chunk.toString("base64"),
|
|
99
|
+
};
|
|
100
|
+
},
|
|
101
|
+
};
|
|
259
102
|
|
|
260
|
-
return {
|
|
103
|
+
return startWebSocketVoiceSession({ hooks, config, callbacks });
|
|
261
104
|
}
|