agent-voice 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ask-5J4JCHM4.js +307 -0
- package/dist/{ask-OIE6HL2H.js → ask-F6CPRZ22.js} +44 -18
- package/dist/{auth-KET5DNSE.js → auth-4VUEFCFK.js} +1 -1
- package/dist/chunk-3YEHGYHI.js +115 -0
- package/dist/chunk-NHLAAFR3.js +276 -0
- package/dist/chunk-YU5FF2L7.js +12 -0
- package/dist/chunk-ZNUQXGGO.js +145 -0
- package/dist/cli.js +403 -24
- package/dist/daemon-client-6GF277XU.js +94 -0
- package/dist/daemon-lifecycle-BNXENMXI.js +25 -0
- package/dist/daemon.js +473 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +84 -21
- package/dist/say-6EJTKNJJ.js +195 -0
- package/dist/{say-ZVF6EX52.js → say-OEQQFOCC.js} +35 -3
- package/package.json +4 -3
- package/dist/chunk-RGYWLATZ.js +0 -61
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
import {
|
|
2
|
+
createRealtimeSession
|
|
3
|
+
} from "./chunk-3YEHGYHI.js";
|
|
4
|
+
import {
|
|
5
|
+
DEFAULT_VOICE,
|
|
6
|
+
SAMPLE_RATE
|
|
7
|
+
} from "./chunk-YU5FF2L7.js";
|
|
8
|
+
|
|
9
|
+
// src/ask.ts
|
|
10
|
+
import { createRequire } from "module";
|
|
11
|
+
var require2 = createRequire(import.meta.url);
|
|
12
|
+
/**
 * Root-mean-square amplitude of a buffer of signed 16-bit little-endian
 * samples.
 *
 * @param {Buffer} pcm16 - Raw sample bytes; a trailing odd byte is ignored.
 * @returns {number} The RMS of the samples, or 0 when the buffer holds no
 *   complete sample.
 */
function pcm16Rms(pcm16) {
  const sampleCount = Math.floor(pcm16.length / 2);
  if (sampleCount === 0) {
    return 0;
  }
  const endOffset = sampleCount * 2;
  let energy = 0;
  // Walk the buffer by byte offset, two bytes per sample.
  for (let offset = 0; offset < endOffset; offset += 2) {
    const sample = pcm16.readInt16LE(offset);
    energy += sample * sample;
  }
  return Math.sqrt(energy / sampleCount);
}
|
|
22
|
+
/**
 * Read an environment variable as a base-10 integer.
 *
 * @param {string} name - Environment variable to look up in `process.env`.
 * @param {number} fallback - Value returned when the variable is unset or
 *   does not start with an integer.
 * @returns {number} The parsed integer, or `fallback`.
 */
function readEnvInt(name, fallback) {
  const text = process.env[name];
  if (text === undefined || text === null) {
    return fallback;
  }
  const value = Number.parseInt(text, 10);
  if (!Number.isFinite(value)) {
    return fallback;
  }
  return value;
}
|
|
28
|
+
/**
 * Send `message` to a realtime voice session, play the assistant audio through
 * an echo-cancelling audio engine, stream the processed microphone capture
 * back to the session, and resolve with the transcript of the reply.
 *
 * Tunables are read once per call from the environment:
 * AGENT_VOICE_AEC_STREAM_DELAY_MS (30), AGENT_VOICE_MIN_SPEECH_RMS (220),
 * AGENT_VOICE_MIN_SPEECH_RMS_RELAX_AFTER_MS (500),
 * AGENT_VOICE_MIN_SPEECH_RMS_RELAXED (120),
 * AGENT_VOICE_SPEECH_EVIDENCE_PREROLL_MS (200),
 * AGENT_VOICE_SPEECH_EVIDENCE_POSTROLL_MS (1500),
 * AGENT_VOICE_ECHO_GUARD_MS (1500). AGENT_VOICE_DEBUG_ASK_EVENTS=1 enables
 * event logging on stderr.
 *
 * @param {string} message - Text handed to `session.sendMessage`.
 * @param {object} [options]
 * @param {string} [options.voice=DEFAULT_VOICE] - Voice passed to the session.
 * @param {number} [options.timeout=30] - Seconds to wait for speech after the
 *   initial response finished, and for a transcript after speech started.
 * @param {boolean} [options.ack=false] - When false, resolve as soon as an
 *   accepted transcript arrives; when true, resolve when the session reports
 *   `onDone`.
 * @param {object} [options.auth] - Passed through to the session factory.
 * @param {Function} [options.createSession] - Session factory; defaults to
 *   `createRealtimeSession`.
 * @param {Function} [options.createAudioEngine] - Engine factory; when omitted
 *   the `agent-voice-audio` module is loaded and its `AudioEngine` is used.
 * @param {Function} [options.onTrace] - Receives `{ atMs, event, detail }`.
 * @param {Function} [options.onAudioFrameSent] - Called with each processed
 *   capture frame right before it is sent to the session.
 * @param {Function} [options.onAssistantAudio] - Called with each assistant
 *   audio chunk before it is played.
 * @param {Function} [options.onMicAudio] - Called with each raw capture frame.
 * @returns {Promise<string>} The accepted transcript (the last one stored when
 *   `ack` is true, which may be the empty string).
 * @throws {Error} Rejects when no assistant audio arrives within 10s of
 *   sending the message, when no speech or no transcript arrives within
 *   `timeout` seconds, when reading capture frames fails, when the session
 *   reports an error, or when creating/connecting the session fails.
 */
async function ask(message, options = {}) {
  const {
    voice = DEFAULT_VOICE,
    timeout = 30,
    ack = false,
    auth,
    createSession,
    createAudioEngine,
    onTrace,
    onAudioFrameSent,
    onAssistantAudio,
    onMicAudio
  } = options;
  // Load the audio module only when no engine factory was injected, so callers
  // that supply their own engine do not need the module to be installed.
  const buildEngine = createAudioEngine ?? ((engineOptions) => {
    const { AudioEngine } = require2("agent-voice-audio");
    return new AudioEngine(engineOptions);
  });
  // Configuration is read once per call instead of once per captured frame.
  const streamDelayMs = readEnvInt("AGENT_VOICE_AEC_STREAM_DELAY_MS", 30);
  const configuredMinSpeechRms = readEnvInt("AGENT_VOICE_MIN_SPEECH_RMS", 220);
  const relaxAfterMs = readEnvInt(
    "AGENT_VOICE_MIN_SPEECH_RMS_RELAX_AFTER_MS",
    500
  );
  const relaxedMinSpeechRms = readEnvInt(
    "AGENT_VOICE_MIN_SPEECH_RMS_RELAXED",
    120
  );
  const evidencePreRollMs = readEnvInt(
    "AGENT_VOICE_SPEECH_EVIDENCE_PREROLL_MS",
    200
  );
  const evidencePostRollMs = readEnvInt(
    "AGENT_VOICE_SPEECH_EVIDENCE_POSTROLL_MS",
    1500
  );
  const echoGuardMs = readEnvInt("AGENT_VOICE_ECHO_GUARD_MS", 1500);
  const engine = buildEngine({
    sampleRate: SAMPLE_RATE,
    channels: 1,
    enableAec: true,
    streamDelayMs
  });
  engine.start();
  const debug = process.env.AGENT_VOICE_DEBUG_ASK_EVENTS === "1";
  const startMs = Date.now();
  function logEvent(event, detail) {
    if (!debug) return;
    const elapsed = Date.now() - startMs;
    const suffix = detail ? ` ${detail}` : "";
    process.stderr.write(`[ask ${elapsed}ms] ${event}${suffix}\n`);
  }
  function trace(event, detail) {
    onTrace?.({ atMs: Date.now() - startMs, event, detail });
  }
  function describeError(err) {
    return err instanceof Error ? err.message : String(err);
  }
  logEvent("start");
  trace("start");
  return new Promise((resolve, reject) => {
    let session = null;
    let transcript = "";
    let timeoutTimer = null;
    let responseStartTimer = null;
    let transcriptTimer = null;
    let capturePollTimer = null;
    let speechDetected = false;
    let speechStartedAtMs = 0;
    let initialResponseDone = false;
    let heardAssistantAudio = false;
    let lastAssistantAudioAt = 0;
    let nearEndEvidenceSeen = false;
    let nearEndEvidenceAtMs = 0;
    let nearEndEvidenceConfirmed = false;
    let cleaned = false;
    let settled = false;
    // Run one teardown step; a failure is traced but must not prevent the
    // remaining steps from running or the promise from settling.
    function runCleanupStep(step, fn) {
      try {
        fn();
      } catch (err) {
        trace("cleanup:error", { step, error: describeError(err) });
      }
    }
    async function cleanup() {
      if (cleaned) return;
      cleaned = true;
      logEvent("cleanup:start");
      trace("cleanup:start");
      if (timeoutTimer) clearTimeout(timeoutTimer);
      if (responseStartTimer) clearTimeout(responseStartTimer);
      if (transcriptTimer) clearTimeout(transcriptTimer);
      if (capturePollTimer) clearInterval(capturePollTimer);
      runCleanupStep("engine.stop", () => engine.stop());
      runCleanupStep("engine.close", () => engine.close());
      // `session` is still null when the session factory itself threw.
      runCleanupStep("session.close", () => session?.close());
      logEvent("cleanup:done");
      trace("cleanup:done");
    }
    function resolveOnce(value) {
      if (settled) return;
      settled = true;
      const settle = () => resolve(value);
      // Settle even if cleanup fails, so the caller never hangs.
      cleanup().then(settle, settle);
    }
    function rejectOnce(error) {
      if (settled) return;
      settled = true;
      const settle = () => reject(error);
      cleanup().then(settle, settle);
    }
    capturePollTimer = setInterval(() => {
      if (settled) return;
      let rawFrames = [];
      let processedFrames = [];
      try {
        rawFrames = engine.readRawCapture(64);
        processedFrames = engine.readProcessedCapture(64);
      } catch (err) {
        const error = describeError(err);
        // Trace before rejecting so the event precedes the cleanup traces.
        trace("audio:capture_read_error", { error });
        rejectOnce(new Error(`audio engine capture read failed: ${error}`));
        return;
      }
      for (const frame of rawFrames) onMicAudio?.(frame);
      // Nothing is streamed to the session until assistant audio was heard.
      if (!heardAssistantAudio) return;
      for (const frame of processedFrames) {
        const rms = pcm16Rms(frame);
        // The threshold drops to the relaxed value once speech has been
        // ongoing for at least `relaxAfterMs`.
        const minSpeechRms = speechDetected && speechStartedAtMs > 0 && Date.now() - speechStartedAtMs >= relaxAfterMs ? relaxedMinSpeechRms : configuredMinSpeechRms;
        if (rms >= minSpeechRms) {
          nearEndEvidenceSeen = true;
          nearEndEvidenceAtMs = Date.now();
          if (!nearEndEvidenceConfirmed && speechStartedAtMs > 0) {
            // Evidence counts only inside the window around speech start.
            if (nearEndEvidenceAtMs >= speechStartedAtMs - evidencePreRollMs && nearEndEvidenceAtMs <= speechStartedAtMs + evidencePostRollMs) {
              nearEndEvidenceConfirmed = true;
            }
          }
          trace("audio:near_end_evidence", { rms, minSpeechRms });
        }
        onAudioFrameSent?.(frame);
        session.sendAudio(frame);
      }
      if (processedFrames.length > 0) {
        trace("audio:sent_capture", { frames: processedFrames.length });
      }
    }, 10);
    const sessionOptions = {
      voice,
      mode: "default",
      ack,
      auth,
      onAudioDelta(pcm16) {
        logEvent("realtime:audio_delta", `bytes=${pcm16.length}`);
        trace("realtime:audio_delta", { bytes: pcm16.length });
        // The engine is closed once the call settled; drop late audio.
        if (settled) return;
        heardAssistantAudio = true;
        lastAssistantAudioAt = Date.now();
        onAssistantAudio?.(pcm16);
        engine.play(pcm16);
      },
      onTranscript(text) {
        const sinceAssistantMs = Date.now() - lastAssistantAudioAt;
        // Transcripts arriving right after assistant audio are ignored.
        if (heardAssistantAudio && sinceAssistantMs < echoGuardMs) {
          logEvent(
            "realtime:transcript_ignored_echo_guard",
            `since_assistant_ms=${sinceAssistantMs} text="${text}"`
          );
          trace("realtime:transcript_ignored_echo_guard", {
            sinceAssistantMs,
            text
          });
          return;
        }
        logEvent("realtime:transcript", `text="${text}"`);
        trace("realtime:transcript", { text });
        // Speech reported by the session without loud-enough local capture
        // near its start is not accepted as an answer.
        if (speechDetected && !nearEndEvidenceConfirmed) {
          trace("realtime:transcript_ignored_no_near_end_evidence", {
            text,
            speechStartedAtMs,
            nearEndEvidenceSeen,
            nearEndEvidenceAtMs
          });
          return;
        }
        if (transcriptTimer) {
          clearTimeout(transcriptTimer);
          transcriptTimer = null;
        }
        transcript = text;
        // NOTE(review): with `ack` enabled no timer guards the wait for
        // `onDone` after this point - confirm that is intended.
        if (!ack) resolveOnce(transcript);
      },
      onSpeechStarted() {
        logEvent("realtime:speech_started");
        trace("realtime:speech_started");
        // Do not arm new timers after the call settled.
        if (settled) return;
        speechDetected = true;
        speechStartedAtMs = Date.now();
        if (nearEndEvidenceSeen && !nearEndEvidenceConfirmed && nearEndEvidenceAtMs >= speechStartedAtMs - evidencePreRollMs) {
          nearEndEvidenceConfirmed = true;
        }
        if (timeoutTimer) {
          clearTimeout(timeoutTimer);
          timeoutTimer = null;
        }
        if (transcriptTimer) clearTimeout(transcriptTimer);
        transcriptTimer = setTimeout(() => {
          logEvent("timeout:no_transcript_after_speech");
          trace("timeout:no_transcript_after_speech", {
            timeoutSeconds: timeout
          });
          rejectOnce(
            new Error(
              `No transcript received within ${timeout}s after speech started`
            )
          );
        }, timeout * 1e3);
        if (!initialResponseDone && heardAssistantAudio) {
          try {
            // NOTE(review): an empty buffer presumably interrupts pending
            // playback when the user talks over the assistant - confirm
            // against the audio engine.
            engine.play(Buffer.alloc(0));
          } catch {
            // Deliberately best-effort: a failure here must not abort the ask.
          }
        }
      },
      onInitialResponseDone() {
        logEvent("realtime:initial_response_done");
        trace("realtime:initial_response_done");
        // Do not arm new timers after the call settled.
        if (settled) return;
        initialResponseDone = true;
        timeoutTimer = setTimeout(() => {
          if (!speechDetected) {
            logEvent("timeout:no_speech");
            trace("timeout:no_speech", { timeoutSeconds: timeout });
            rejectOnce(
              new Error(`No speech detected within ${timeout}s timeout`)
            );
          }
        }, timeout * 1e3);
      },
      onDone() {
        logEvent("realtime:done");
        trace("realtime:done");
        if (ack) resolveOnce(transcript);
      },
      onError(error) {
        logEvent("realtime:error", error);
        trace("realtime:error", { error });
        rejectOnce(new Error(error));
      }
    };
    let connecting;
    try {
      session = (createSession ?? createRealtimeSession)(sessionOptions);
      connecting = session.connect();
    } catch (err) {
      // A synchronous failure must still stop the engine and the poll timer;
      // the original error value is passed through unchanged.
      logEvent("realtime:create_error", describeError(err));
      trace("realtime:create_error", { error: describeError(err) });
      rejectOnce(err);
      return;
    }
    connecting.then(
      () => {
        logEvent("realtime:connected");
        trace("realtime:connected");
        // The call may already have failed while the connection was pending.
        if (settled) return;
        logEvent("realtime:send_message");
        trace("realtime:send_message");
        try {
          session.sendMessage(message);
        } catch (err) {
          rejectOnce(err instanceof Error ? err : new Error(String(err)));
          return;
        }
        if (settled) return;
        responseStartTimer = setTimeout(() => {
          if (!heardAssistantAudio) {
            logEvent("timeout:no_assistant_audio");
            trace("timeout:no_assistant_audio");
            rejectOnce(
              new Error("No assistant audio received after sending message")
            );
          }
        }, 1e4);
      },
      (err) => {
        logEvent("realtime:connect_error", describeError(err));
        trace("realtime:connect_error", { error: describeError(err) });
        rejectOnce(err instanceof Error ? err : new Error(String(err)));
      }
    );
  });
}
|
|
305
|
+
export {
|
|
306
|
+
ask
|
|
307
|
+
};
|
|
@@ -75,6 +75,7 @@ async function ask(message, options = {}) {
|
|
|
75
75
|
let lastAssistantAudioAt = 0;
|
|
76
76
|
let nearEndEvidenceSeen = false;
|
|
77
77
|
let nearEndEvidenceAtMs = 0;
|
|
78
|
+
let nearEndEvidenceConfirmed = false;
|
|
78
79
|
let cleaned = false;
|
|
79
80
|
let settled = false;
|
|
80
81
|
async function cleanup() {
|
|
@@ -127,10 +128,35 @@ async function ask(message, options = {}) {
|
|
|
127
128
|
if (!heardAssistantAudio) return;
|
|
128
129
|
for (const frame of processedFrames) {
|
|
129
130
|
const rms = pcm16Rms(frame);
|
|
130
|
-
const
|
|
131
|
+
const configuredMinSpeechRms = readEnvInt(
|
|
132
|
+
"AGENT_VOICE_MIN_SPEECH_RMS",
|
|
133
|
+
220
|
|
134
|
+
);
|
|
135
|
+
const relaxAfterMs = readEnvInt(
|
|
136
|
+
"AGENT_VOICE_MIN_SPEECH_RMS_RELAX_AFTER_MS",
|
|
137
|
+
500
|
|
138
|
+
);
|
|
139
|
+
const relaxedMinSpeechRms = readEnvInt(
|
|
140
|
+
"AGENT_VOICE_MIN_SPEECH_RMS_RELAXED",
|
|
141
|
+
120
|
|
142
|
+
);
|
|
143
|
+
const minSpeechRms = speechDetected && speechStartedAtMs > 0 && Date.now() - speechStartedAtMs >= relaxAfterMs ? relaxedMinSpeechRms : configuredMinSpeechRms;
|
|
131
144
|
if (rms >= minSpeechRms) {
|
|
132
145
|
nearEndEvidenceSeen = true;
|
|
133
146
|
nearEndEvidenceAtMs = Date.now();
|
|
147
|
+
if (!nearEndEvidenceConfirmed && speechStartedAtMs > 0) {
|
|
148
|
+
const evidencePreRollMs = readEnvInt(
|
|
149
|
+
"AGENT_VOICE_SPEECH_EVIDENCE_PREROLL_MS",
|
|
150
|
+
200
|
|
151
|
+
);
|
|
152
|
+
const evidencePostRollMs = readEnvInt(
|
|
153
|
+
"AGENT_VOICE_SPEECH_EVIDENCE_POSTROLL_MS",
|
|
154
|
+
1500
|
|
155
|
+
);
|
|
156
|
+
if (nearEndEvidenceAtMs >= speechStartedAtMs - evidencePreRollMs && nearEndEvidenceAtMs <= speechStartedAtMs + evidencePostRollMs) {
|
|
157
|
+
nearEndEvidenceConfirmed = true;
|
|
158
|
+
}
|
|
159
|
+
}
|
|
134
160
|
trace("audio:near_end_evidence", { rms, minSpeechRms });
|
|
135
161
|
}
|
|
136
162
|
onAudioFrameSent?.(frame);
|
|
@@ -169,23 +195,14 @@ async function ask(message, options = {}) {
|
|
|
169
195
|
}
|
|
170
196
|
logEvent("realtime:transcript", `text="${text}"`);
|
|
171
197
|
trace("realtime:transcript", { text });
|
|
172
|
-
if (speechDetected) {
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
text,
|
|
181
|
-
speechStartedAtMs,
|
|
182
|
-
nearEndEvidenceSeen,
|
|
183
|
-
nearEndEvidenceAtMs,
|
|
184
|
-
evidenceAgeMs,
|
|
185
|
-
evidenceWindowMs
|
|
186
|
-
});
|
|
187
|
-
return;
|
|
188
|
-
}
|
|
198
|
+
if (speechDetected && !nearEndEvidenceConfirmed) {
|
|
199
|
+
trace("realtime:transcript_ignored_no_near_end_evidence", {
|
|
200
|
+
text,
|
|
201
|
+
speechStartedAtMs,
|
|
202
|
+
nearEndEvidenceSeen,
|
|
203
|
+
nearEndEvidenceAtMs
|
|
204
|
+
});
|
|
205
|
+
return;
|
|
189
206
|
}
|
|
190
207
|
if (transcriptTimer) {
|
|
191
208
|
clearTimeout(transcriptTimer);
|
|
@@ -199,6 +216,15 @@ async function ask(message, options = {}) {
|
|
|
199
216
|
trace("realtime:speech_started");
|
|
200
217
|
speechDetected = true;
|
|
201
218
|
speechStartedAtMs = Date.now();
|
|
219
|
+
if (nearEndEvidenceSeen && !nearEndEvidenceConfirmed) {
|
|
220
|
+
const evidencePreRollMs = readEnvInt(
|
|
221
|
+
"AGENT_VOICE_SPEECH_EVIDENCE_PREROLL_MS",
|
|
222
|
+
200
|
|
223
|
+
);
|
|
224
|
+
if (nearEndEvidenceAtMs >= speechStartedAtMs - evidencePreRollMs) {
|
|
225
|
+
nearEndEvidenceConfirmed = true;
|
|
226
|
+
}
|
|
227
|
+
}
|
|
202
228
|
if (timeoutTimer) {
|
|
203
229
|
clearTimeout(timeoutTimer);
|
|
204
230
|
timeoutTimer = null;
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
// src/realtime.ts
|
|
2
|
+
import { OpenAIRealtimeWS } from "openai/beta/realtime/ws";
|
|
3
|
+
var SYSTEM_INSTRUCTIONS = `
|
|
4
|
+
# Role
|
|
5
|
+
Voice relay between an AI agent and a human.
|
|
6
|
+
|
|
7
|
+
# Instructions
|
|
8
|
+
- When given a text message, read it aloud EXACTLY as written. Do not add, remove, or rephrase anything.
|
|
9
|
+
- After the human responds, acknowledge briefly \u2014 a few words only. Vary your phrasing.
|
|
10
|
+
- NEVER repeat back what the user said verbatim.
|
|
11
|
+
- NEVER ask follow-up questions.
|
|
12
|
+
- Keep every response under one sentence.
|
|
13
|
+
|
|
14
|
+
# Tone
|
|
15
|
+
- Calm, neutral, concise.
|
|
16
|
+
`.trim();
|
|
17
|
+
/**
 * Build a thin wrapper around an `OpenAIRealtimeWS` connection that relays
 * text and audio to the realtime model and forwards its events to callbacks.
 *
 * @param {object} options
 * @param {string} options.mode - `"say"` disables turn detection; any other
 *   value enables semantic VAD.
 * @param {boolean} [options.ack] - Forwarded as `create_response` of the turn
 *   detection settings.
 * @param {string} options.voice - Voice sent in the session configuration.
 * @param {{apiKey: string, baseUrl?: string}} [options.auth] - When given, a
 *   client descriptor built from it is passed to `OpenAIRealtimeWS`.
 * @param {Function} options.onAudioDelta - Receives each audio chunk as a
 *   Buffer decoded from base64.
 * @param {Function} [options.onAudioDone] - Called on `response.audio.done`.
 * @param {Function} options.onTranscript - Receives the completed input
 *   transcript text.
 * @param {Function} options.onSpeechStarted - Called on
 *   `input_audio_buffer.speech_started`.
 * @param {Function} options.onInitialResponseDone - Called on the first
 *   `response.done`.
 * @param {Function} options.onDone - Called on the second `response.done`.
 * @param {Function} options.onError - Receives an error message string.
 * @returns {{connect: Function, sendMessage: Function, sendAudio: Function, close: Function}}
 */
function createRealtimeSession(options) {
  let rt;
  let responseCount = 0;
  function configureSession() {
    // `null` (not `undefined`) is required to switch turn detection off: an
    // undefined property is dropped during JSON serialization, which would
    // leave the server-side default turn detection active.
    const turnDetection = options.mode === "say" ? null : {
      type: "semantic_vad",
      eagerness: "medium",
      create_response: options.ack,
      interrupt_response: true
    };
    rt.send({
      type: "session.update",
      session: {
        instructions: SYSTEM_INSTRUCTIONS,
        voice: options.voice,
        input_audio_format: "pcm16",
        output_audio_format: "pcm16",
        input_audio_transcription: { model: "gpt-4o-transcribe" },
        turn_detection: turnDetection
      }
    });
  }
  function bindEvents() {
    rt.on("response.audio.delta", (event) => {
      const pcm16 = Buffer.from(event.delta, "base64");
      options.onAudioDelta(pcm16);
    });
    rt.on("response.audio.done", () => {
      options.onAudioDone?.();
    });
    rt.on("conversation.item.input_audio_transcription.completed", (event) => {
      options.onTranscript(event.transcript);
    });
    rt.on("input_audio_buffer.speech_started", () => {
      options.onSpeechStarted();
    });
    rt.on("response.done", () => {
      // First completed response is the spoken message, second one the
      // follow-up response; later ones are ignored.
      responseCount++;
      if (responseCount === 1) {
        options.onInitialResponseDone();
      } else if (responseCount === 2) {
        options.onDone();
      }
    });
    rt.on("error", (event) => {
      // Prefer the nested error message, then fall back to the message on the
      // error object itself before giving up with a generic text.
      options.onError(
        event.error?.message ?? event.message ?? "Unknown realtime error"
      );
    });
  }
  return {
    /**
     * Open the websocket; resolves once the socket is open and the session
     * has been configured, rejects on a socket error.
     */
    connect() {
      return new Promise((resolve, reject) => {
        const client = options.auth ? {
          apiKey: options.auth.apiKey,
          baseURL: options.auth.baseUrl ?? "https://api.openai.com/v1"
        } : void 0;
        rt = new OpenAIRealtimeWS({ model: "gpt-4o-realtime-preview" }, client);
        rt.socket.on("open", () => {
          configureSession();
          bindEvents();
          resolve();
        });
        rt.socket.on("error", (err) => {
          reject(new Error(`WebSocket connection failed: ${err.message}`));
        });
      });
    },
    /** Queue `text` as a user message and request a response for it. */
    sendMessage(text) {
      rt.send({
        type: "conversation.item.create",
        item: {
          type: "message",
          role: "user",
          content: [
            {
              type: "input_text",
              text: `Read this aloud exactly as written, word for word. Do not add, remove, or change anything:\n\n${text}`
            }
          ]
        }
      });
      rt.send({ type: "response.create" });
    },
    /** Append a Buffer of audio, base64-encoded, to the input audio buffer. */
    sendAudio(pcm16) {
      rt.send({
        type: "input_audio_buffer.append",
        audio: pcm16.toString("base64")
      });
    },
    /** Close the connection if one was opened. */
    close() {
      rt?.close();
    }
  };
}
|
|
112
|
+
|
|
113
|
+
export {
|
|
114
|
+
createRealtimeSession
|
|
115
|
+
};
|