@absolutejs/voice 0.0.22-beta.127 → 0.0.22-beta.129
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +53 -0
- package/dist/angular/index.js +26 -0
- package/dist/client/actions.d.ts +54 -0
- package/dist/client/htmxBootstrap.js +26 -0
- package/dist/client/index.js +26 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +605 -29
- package/dist/openaiRealtime.d.ts +27 -0
- package/dist/react/index.js +26 -0
- package/dist/svelte/index.js +26 -0
- package/dist/telephony/twilio.d.ts +3 -2
- package/dist/testing/index.js +113 -21
- package/dist/types.d.ts +26 -3
- package/dist/vue/index.js +26 -0
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -3413,6 +3413,12 @@ var DEFAULT_FORMAT = {
|
|
|
3413
3413
|
encoding: "pcm_s16le",
|
|
3414
3414
|
sampleRateHz: 16000
|
|
3415
3415
|
};
|
|
3416
|
+
// Input audio format handed to a realtime adapter when the caller does not
// supply `realtimeInputFormat`: raw signed 16-bit little-endian PCM, one
// channel, 24000 Hz.
var DEFAULT_REALTIME_FORMAT = {
  channels: 1,
  container: "raw",
  encoding: "pcm_s16le",
  sampleRateHz: 24000
};
|
|
3416
3422
|
/**
 * Normalizes an arbitrary thrown value into an `Error`.
 * Existing `Error` instances are returned untouched (same identity); anything
 * else is stringified and wrapped.
 */
var toError = (value) => {
  if (value instanceof Error) {
    return value;
  }
  return new Error(String(value));
};
|
|
3417
3423
|
var createEmptyCurrentTurn = () => ({
|
|
3418
3424
|
finalText: "",
|
|
@@ -3690,6 +3696,18 @@ var createVoiceSession = (options) => {
|
|
|
3690
3696
|
type: "call_lifecycle"
|
|
3691
3697
|
});
|
|
3692
3698
|
};
|
|
3699
|
+
/**
 * Sends a "replay" message describing the given session: its call, status,
 * scenario, every turn, the in-progress partial text, and the non-empty
 * assistant texts collected from those turns.
 *
 * @param session Session snapshot to replay.
 */
const sendReplay = async (session) => {
  const { call, scenarioId, status, turns } = session;
  // Only turns that actually carry assistant text contribute an entry.
  const assistantTexts = turns
    .filter((turn) => Boolean(turn.assistantText))
    .map((turn) => turn.assistantText);
  const message = {
    assistantTexts,
    call,
    partial: session.currentTurn.partialText,
    scenarioId,
    sessionId: options.id,
    status,
    turns,
    type: "replay"
  };
  await send(message);
};
|
|
3693
3711
|
const runHandoff = async (input) => {
|
|
3694
3712
|
const queuedDelivery = options.handoff?.deliveryQueue ? createVoiceHandoffDeliveryRecord({
|
|
3695
3713
|
action: input.action,
|
|
@@ -3793,6 +3811,23 @@ var createVoiceSession = (options) => {
|
|
|
3793
3811
|
});
|
|
3794
3812
|
}
|
|
3795
3813
|
};
|
|
3814
|
+
/**
 * Forwards one chunk of assistant audio to the client as an "audio" message
 * and, when a TTS turn is active, records the "assistant_audio_received"
 * latency stage for that turn.
 *
 * @param chunk Audio bytes as a Uint8Array, an ArrayBuffer, or another view
 *   exposing `buffer` / `byteOffset` / `byteLength`.
 * @param input `{ format, receivedAt }` describing the chunk.
 */
const sendAssistantAudio = async (chunk, input) => {
  // Capture the turn once: `activeTTSTurnId` is shared mutable state and may
  // change while `send` is awaited. The latency stage must be attributed to
  // the same turn the audio frame was tagged with.
  const turnId = activeTTSTurnId;
  // Always copy the bytes so later mutation of the source buffer cannot
  // affect what gets encoded.
  let normalizedChunk;
  if (chunk instanceof Uint8Array) {
    normalizedChunk = new Uint8Array(chunk);
  } else if (chunk instanceof ArrayBuffer) {
    normalizedChunk = new Uint8Array(chunk.slice(0));
  } else {
    normalizedChunk = new Uint8Array(chunk.buffer.slice(chunk.byteOffset, chunk.byteOffset + chunk.byteLength));
  }
  await send({
    chunkBase64: encodeBase64(normalizedChunk),
    format: input.format,
    receivedAt: input.receivedAt,
    turnId,
    type: "audio"
  });
  if (turnId) {
    await appendTurnLatencyStage({
      at: input.receivedAt,
      stage: "assistant_audio_received",
      turnId
    });
  }
};
|
|
3796
3831
|
const scheduleTurnCommit = (delayMs, reason, reset = true) => {
|
|
3797
3832
|
if (!reset && silenceTimer) {
|
|
3798
3833
|
return;
|
|
@@ -4494,8 +4529,12 @@ var createVoiceSession = (options) => {
|
|
|
4494
4529
|
if (sttSession) {
|
|
4495
4530
|
return sttSession;
|
|
4496
4531
|
}
|
|
4497
|
-
const
|
|
4498
|
-
|
|
4532
|
+
const inputAdapter = options.realtime ?? options.stt;
|
|
4533
|
+
if (!inputAdapter) {
|
|
4534
|
+
throw new Error("Voice session requires either an stt or realtime adapter.");
|
|
4535
|
+
}
|
|
4536
|
+
const openedSession = await inputAdapter.open({
|
|
4537
|
+
format: options.realtime ? options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT : DEFAULT_FORMAT,
|
|
4499
4538
|
languageStrategy: options.languageStrategy,
|
|
4500
4539
|
lexicon,
|
|
4501
4540
|
phraseHints,
|
|
@@ -4530,6 +4569,16 @@ var createVoiceSession = (options) => {
|
|
|
4530
4569
|
openedSession.on("close", (event) => {
|
|
4531
4570
|
runAdapterEvent("adapter.close", () => handleClose(event));
|
|
4532
4571
|
});
|
|
4572
|
+
if (options.realtime) {
|
|
4573
|
+
openedSession.on("audio", ({ chunk, format, receivedAt }) => {
|
|
4574
|
+
runAdapterEvent("adapter.audio", async () => {
|
|
4575
|
+
await sendAssistantAudio(chunk, {
|
|
4576
|
+
format,
|
|
4577
|
+
receivedAt
|
|
4578
|
+
});
|
|
4579
|
+
});
|
|
4580
|
+
});
|
|
4581
|
+
}
|
|
4533
4582
|
return openedSession;
|
|
4534
4583
|
};
|
|
4535
4584
|
const ensureTTSSession = async () => {
|
|
@@ -4554,21 +4603,10 @@ var createVoiceSession = (options) => {
|
|
|
4554
4603
|
if (ttsSession !== openedSession) {
|
|
4555
4604
|
return;
|
|
4556
4605
|
}
|
|
4557
|
-
|
|
4558
|
-
await send({
|
|
4559
|
-
chunkBase64: encodeBase64(normalizedChunk),
|
|
4606
|
+
await sendAssistantAudio(chunk, {
|
|
4560
4607
|
format,
|
|
4561
|
-
receivedAt
|
|
4562
|
-
turnId: activeTTSTurnId,
|
|
4563
|
-
type: "audio"
|
|
4608
|
+
receivedAt
|
|
4564
4609
|
});
|
|
4565
|
-
if (activeTTSTurnId) {
|
|
4566
|
-
await appendTurnLatencyStage({
|
|
4567
|
-
at: receivedAt,
|
|
4568
|
-
stage: "assistant_audio_received",
|
|
4569
|
-
turnId: activeTTSTurnId
|
|
4570
|
-
});
|
|
4571
|
-
}
|
|
4572
4610
|
});
|
|
4573
4611
|
});
|
|
4574
4612
|
openedSession.on("error", (event) => {
|
|
@@ -4647,7 +4685,8 @@ var createVoiceSession = (options) => {
|
|
|
4647
4685
|
await appendTrace({
|
|
4648
4686
|
payload: {
|
|
4649
4687
|
text: output.assistantText,
|
|
4650
|
-
ttsConfigured: Boolean(options.tts)
|
|
4688
|
+
ttsConfigured: Boolean(options.tts),
|
|
4689
|
+
realtimeConfigured: Boolean(options.realtime)
|
|
4651
4690
|
},
|
|
4652
4691
|
session,
|
|
4653
4692
|
turnId: turn.id,
|
|
@@ -4679,9 +4718,35 @@ var createVoiceSession = (options) => {
|
|
|
4679
4718
|
turnId: turn.id,
|
|
4680
4719
|
type: "turn.assistant"
|
|
4681
4720
|
});
|
|
4721
|
+
} else if (options.realtime) {
|
|
4722
|
+
const activeRealtimeSession = await ensureAdapter();
|
|
4723
|
+
const realtimeStartedAt = Date.now();
|
|
4724
|
+
activeTTSTurnId = turn.id;
|
|
4725
|
+
await appendTurnLatencyStage({
|
|
4726
|
+
at: realtimeStartedAt,
|
|
4727
|
+
session,
|
|
4728
|
+
stage: "tts_send_started",
|
|
4729
|
+
turnId: turn.id
|
|
4730
|
+
});
|
|
4731
|
+
await activeRealtimeSession.send(output.assistantText);
|
|
4732
|
+
await appendTurnLatencyStage({
|
|
4733
|
+
session,
|
|
4734
|
+
stage: "tts_send_completed",
|
|
4735
|
+
turnId: turn.id
|
|
4736
|
+
});
|
|
4737
|
+
await appendTrace({
|
|
4738
|
+
payload: {
|
|
4739
|
+
elapsedMs: Date.now() - realtimeStartedAt,
|
|
4740
|
+
mode: "realtime",
|
|
4741
|
+
status: "sent"
|
|
4742
|
+
},
|
|
4743
|
+
session,
|
|
4744
|
+
turnId: turn.id,
|
|
4745
|
+
type: "turn.assistant"
|
|
4746
|
+
});
|
|
4682
4747
|
}
|
|
4683
4748
|
} catch (error) {
|
|
4684
|
-
logger.warn("voice
|
|
4749
|
+
logger.warn("voice assistant audio send failed", {
|
|
4685
4750
|
error: toError(error).message,
|
|
4686
4751
|
sessionId: options.id,
|
|
4687
4752
|
turnId: turn.id
|
|
@@ -4689,7 +4754,7 @@ var createVoiceSession = (options) => {
|
|
|
4689
4754
|
await appendTrace({
|
|
4690
4755
|
payload: {
|
|
4691
4756
|
error: toError(error).message,
|
|
4692
|
-
status: "tts-send-failed"
|
|
4757
|
+
status: options.realtime ? "realtime-send-failed" : "tts-send-failed"
|
|
4693
4758
|
},
|
|
4694
4759
|
session,
|
|
4695
4760
|
turnId: turn.id,
|
|
@@ -4894,7 +4959,7 @@ var createVoiceSession = (options) => {
|
|
|
4894
4959
|
turn,
|
|
4895
4960
|
type: "turn"
|
|
4896
4961
|
});
|
|
4897
|
-
if (options.sttLifecycle === "turn-scoped") {
|
|
4962
|
+
if (options.stt && options.sttLifecycle === "turn-scoped") {
|
|
4898
4963
|
await closeAdapter("turn-commit");
|
|
4899
4964
|
}
|
|
4900
4965
|
await completeTurn(updatedSession, turn);
|
|
@@ -4957,6 +5022,7 @@ var createVoiceSession = (options) => {
|
|
|
4957
5022
|
scenarioId: session.scenarioId,
|
|
4958
5023
|
type: "session"
|
|
4959
5024
|
});
|
|
5025
|
+
await sendReplay(session);
|
|
4960
5026
|
if (shouldFireOnSession) {
|
|
4961
5027
|
await options.route.onCallStart?.({
|
|
4962
5028
|
api,
|
|
@@ -5307,6 +5373,9 @@ var resolveLexicon = async (config, input) => {
|
|
|
5307
5373
|
return normalizeLexicon(config.lexicon);
|
|
5308
5374
|
};
|
|
5309
5375
|
var voice = (config) => {
|
|
5376
|
+
if (!config.stt && !config.realtime) {
|
|
5377
|
+
throw new Error("voice requires either an stt or realtime adapter.");
|
|
5378
|
+
}
|
|
5310
5379
|
const runtime = {
|
|
5311
5380
|
activeSessions: new Map,
|
|
5312
5381
|
logger: resolveLogger(config.logger),
|
|
@@ -5381,6 +5450,8 @@ var voice = (config) => {
|
|
|
5381
5450
|
socket: createSocketAdapter(ws),
|
|
5382
5451
|
store: config.session,
|
|
5383
5452
|
trace: config.trace,
|
|
5453
|
+
realtime: config.realtime,
|
|
5454
|
+
realtimeInputFormat: config.realtimeInputFormat,
|
|
5384
5455
|
stt: config.stt,
|
|
5385
5456
|
sttFallback: sessionOptions.sttFallback,
|
|
5386
5457
|
sttLifecycle: sessionOptions.sttLifecycle,
|
|
@@ -17088,13 +17159,517 @@ var createGeminiVoiceAssistantModel = (options) => {
|
|
|
17088
17159
|
}
|
|
17089
17160
|
};
|
|
17090
17161
|
};
|
|
17091
|
-
// src/
|
|
17162
|
+
// src/openaiRealtime.ts
|
|
17163
|
+
// Milliseconds without new input audio before buffered audio is committed.
var DEFAULT_AUTO_COMMIT_SILENCE_MS = 450;
// WebSocket endpoint used when the adapter options carry no `baseUrl`.
var DEFAULT_BASE_URL = "wss://api.openai.com/v1/realtime";
// Model used when the adapter options carry no `model`.
var DEFAULT_MODEL = "gpt-realtime";
// Input transcription model used when none is configured.
var DEFAULT_TRANSCRIPTION_MODEL = "gpt-4o-mini-transcribe";
// Output voice used when none is configured.
var DEFAULT_VOICE = "marin";
// Audio format attached to audio events: raw signed 16-bit little-endian
// PCM, one channel, 24000 Hz.
var OPENAI_PCM24_FORMAT = {
  channels: 1,
  container: "raw",
  encoding: "pcm_s16le",
  sampleRateHz: 24000
};
|
|
17174
|
+
/**
 * Builds a fresh registry holding one empty handler `Set` per adapter event
 * name (audio, close, endOfTurn, error, final, partial).
 */
var createListenerMap = () => {
  const eventNames = ["audio", "close", "endOfTurn", "error", "final", "partial"];
  return Object.fromEntries(eventNames.map((eventName) => [eventName, new Set()]));
};
|
|
17182
|
+
/**
 * Invokes every handler registered for `event`, one at a time and in
 * registration order, awaiting each before calling the next.
 *
 * @param listeners Registry mapping event names to iterable handler sets.
 * @param event Event name to dispatch.
 * @param payload Value passed to each handler.
 */
var emit = async (listeners, event, payload) => {
  const handlers = listeners[event];
  for (const handler of handlers) {
    await handler(payload);
  }
};
|
|
17187
|
+
/**
 * Returns a shallow copy of `value` without the own enumerable properties
 * whose value is exactly `undefined`. `null` and other falsy values are kept.
 */
var compact = (value) => {
  const definedEntries = Object.entries(value).filter((pair) => pair[1] !== undefined);
  return Object.fromEntries(definedEntries);
};
|
|
17188
|
+
/**
 * Extracts a human-readable message from an arbitrary error-like value.
 *
 * Resolution order: a non-blank string; a non-blank `Error#message`; the first
 * non-blank string found under `message`, `reason`, `description` or `detail`;
 * a nested `error` property (followed recursively); the JSON serialization of
 * the object; and finally a generic fallback.
 *
 * @param error Any value (string, Error, event, vendor payload, ...).
 * @returns A message string; always a string, never `undefined`.
 */
var resolveErrorMessage = (error) => {
  const fallback = "OpenAI realtime error";
  // Objects already visited while following nested `error` properties. Guards
  // against a self-referential chain recursing until the stack overflows.
  const visited = new WeakSet();
  const resolve = (candidateError) => {
    if (typeof candidateError === "string" && candidateError.trim()) {
      return candidateError;
    }
    if (candidateError instanceof Error && candidateError.message.trim()) {
      return candidateError.message;
    }
    if (!candidateError || typeof candidateError !== "object" || visited.has(candidateError)) {
      return fallback;
    }
    visited.add(candidateError);
    for (const key of ["message", "reason", "description", "detail"]) {
      const candidate = candidateError[key];
      if (typeof candidate === "string" && candidate.trim()) {
        return candidate;
      }
    }
    if ("error" in candidateError) {
      return resolve(candidateError.error);
    }
    try {
      const serialized = JSON.stringify(candidateError);
      // `JSON.stringify` may yield `undefined` (custom `toJSON`) or "{}" for
      // objects without enumerable data; neither is a useful message.
      if (typeof serialized === "string" && serialized !== "{}") {
        return serialized;
      }
    } catch {
      // Not serializable (circular structure, BigInt, ...): use the fallback.
    }
    return fallback;
  };
  return resolve(error);
};
|
|
17212
|
+
/**
 * Returns a `Uint8Array` view over the bytes of `value` without copying.
 * An `ArrayBuffer` is viewed in full; any other value is treated as a view
 * and its `buffer` / `byteOffset` / `byteLength` window is preserved.
 */
var toUint8Array2 = (value) => {
  if (value instanceof ArrayBuffer) {
    return new Uint8Array(value);
  }
  const { buffer, byteOffset, byteLength } = value;
  return new Uint8Array(buffer, byteOffset, byteLength);
};
|
|
17213
|
+
/**
 * Encodes the bytes of an `ArrayBuffer` or buffer view as a base64 string.
 */
var toBase643 = (value) => {
  const bytes = toUint8Array2(value);
  return Buffer.from(bytes).toString("base64");
};
|
|
17214
|
+
/**
 * Builds a final transcript record for plain text, tagged with vendor
 * "openai" and a freshly generated `openai-realtime-text-<uuid>` id.
 */
var textTranscript = (text) => {
  const id = `openai-realtime-text-${crypto.randomUUID()}`;
  return {
    id,
    isFinal: true,
    text,
    vendor: "openai"
  };
};
|
|
17220
|
+
/**
 * Builds a transcript record for an input-audio item, tagged with vendor
 * "openai".
 *
 * @param itemId Used verbatim as the transcript id.
 * @param text Transcript text so far (partial) or in full (final).
 * @param isFinal Whether this transcript is final.
 */
var audioTranscript = (itemId, text, isFinal) => {
  const transcript = {
    id: itemId,
    isFinal,
    text,
    vendor: "openai"
  };
  return transcript;
};
|
|
17226
|
+
/**
 * Throws unless `format` is raw, pcm_s16le, 24000 Hz, single channel.
 *
 * @throws {Error} When any of the four properties differs.
 */
var assertPCM24Mono = (format) => {
  const isSupported =
    format.container === "raw" &&
    format.encoding === "pcm_s16le" &&
    format.sampleRateHz === 24000 &&
    format.channels === 1;
  if (isSupported) {
    return;
  }
  throw new Error("OpenAI Realtime requires raw pcm_s16le audio at 24kHz mono.");
};
|
|
17231
|
+
/**
 * Picks the language to request for input transcription.
 *
 * A non-blank `options.inputTranscriptionLanguage` wins (trimmed). Otherwise
 * the primary language of a "fixed" language strategy is used (trimmed) when
 * it is non-empty. In every other case the result is `undefined`.
 */
var resolveTranscriptionLanguage = (options, openOptions) => {
  const configured = options.inputTranscriptionLanguage?.trim();
  if (configured) {
    return configured;
  }
  const strategy = openOptions.languageStrategy;
  if (strategy?.mode !== "fixed") {
    return undefined;
  }
  const primary = strategy.primaryLanguage.trim();
  return primary.length > 0 ? primary : undefined;
};
|
|
17241
|
+
/**
 * Builds a transcription prompt sentence listing every phrase hint and its
 * aliases, de-duplicated in first-seen order.
 *
 * @param options Object whose optional `phraseHints` is a list of
 *   `{ text, aliases? }` entries.
 * @returns The prompt sentence, or `undefined` when there is nothing to list.
 */
var phraseHintPrompt = (options) => {
  const terms = (options.phraseHints ?? []).flatMap((hint) => [
    hint.text,
    ...(hint.aliases ?? [])
  ]);
  // Drop missing/blank terms so they cannot appear as empty list items, then
  // de-duplicate with a Set (linear time, insertion order preserved).
  const unique = [...new Set(terms.filter((term) => typeof term === "string" && term.trim().length > 0))];
  return unique.length ? `Prioritize accurate recovery of these phrases when heard: ${unique.join(", ")}.` : undefined;
};
|
|
17249
|
+
/**
 * Builds a transcription prompt sentence from the pronunciation lexicon.
 *
 * Each entry becomes its available parts (text, pronunciation, aliases,
 * language) joined with " - "; entries with no parts are skipped and the
 * remaining descriptions are joined with "; ".
 *
 * @param options Object whose optional `lexicon` is a list of entries.
 * @returns The prompt sentence, or `undefined` when no entry has content.
 */
var lexiconPrompt = (options) => {
  const describeEntry = (entry) => {
    const parts = [];
    if (entry.text) {
      parts.push(entry.text);
    }
    if (entry.pronunciation) {
      parts.push(`pronounced ${entry.pronunciation}`);
    }
    if (entry.aliases?.length) {
      parts.push(`may also sound like ${entry.aliases.join(", ")}`);
    }
    if (entry.language) {
      parts.push(`language ${entry.language}`);
    }
    return parts.join(" - ");
  };
  const descriptions = (options.lexicon ?? [])
    .map(describeEntry)
    .filter((description) => description.length > 0);
  if (descriptions.length === 0) {
    return undefined;
  }
  return `Use this pronunciation lexicon when transcribing: ${descriptions.join("; ")}.`;
};
|
|
17261
|
+
/**
 * Merges the phrase-hint and lexicon prompts derived from `openOptions` into
 * the adapter's `inputTranscriptionPrompt`.
 *
 * When neither prompt exists the original `options` object is returned as-is.
 * Otherwise a shallow copy is returned whose `inputTranscriptionPrompt` is the
 * non-blank sections (existing prompt, phrase hints, lexicon) separated by a
 * blank line.
 */
var withOpenPrompts = (options, openOptions) => {
  const hintSection = phraseHintPrompt(openOptions);
  const lexiconSection = lexiconPrompt(openOptions);
  if (hintSection || lexiconSection) {
    const sections = [options.inputTranscriptionPrompt, hintSection, lexiconSection]
      .filter((section) => Boolean(section?.trim()));
    return {
      ...options,
      inputTranscriptionPrompt: sections.join("\n\n")
    };
  }
  return options;
};
|
|
17278
|
+
/**
 * Builds the `session.update` client event that configures the realtime
 * session: 24000 Hz `audio/pcm` input with optional noise reduction and
 * transcription settings, `turn_detection` set to null, and (in "audio"
 * response mode only) a 24000 Hz `audio/pcm` output with voice and speed.
 * Unset optional settings are omitted.
 *
 * @param options Adapter configuration.
 * @param openOptions Per-session open options (used to resolve the language).
 * @returns The event object, carrying a unique `event_id`.
 */
var sessionUpdateEvent = (options, openOptions) => {
  const responseMode = options.responseMode ?? "audio";
  const language = resolveTranscriptionLanguage(options, openOptions);
  const pcm24 = () => ({
    rate: 24000,
    type: "audio/pcm"
  });
  // An explicit `null` model is forwarded as `transcription: null`.
  let transcription = null;
  if (options.inputTranscriptionModel !== null) {
    transcription = compact({
      language,
      model: options.inputTranscriptionModel ?? DEFAULT_TRANSCRIPTION_MODEL,
      prompt: options.inputTranscriptionPrompt
    });
  }
  const input = compact({
    format: pcm24(),
    noise_reduction: options.noiseReduction ? { type: options.noiseReduction } : undefined,
    transcription,
    turn_detection: null
  });
  let output;
  if (responseMode === "audio") {
    output = compact({
      format: pcm24(),
      speed: options.speed,
      voice: options.voice ?? DEFAULT_VOICE
    });
  }
  const session = compact({
    audio: {
      input,
      output
    },
    instructions: options.instructions,
    max_output_tokens: options.maxOutputTokens,
    output_modalities: [responseMode],
    temperature: options.temperature,
    type: "realtime"
  });
  return {
    event_id: `session-update-${crypto.randomUUID()}`,
    session,
    type: "session.update"
  };
};
|
|
17317
|
+
/**
 * Builds the `response.create` client event asking the model to respond in
 * the configured mode ("audio" unless `options.responseMode` says otherwise).
 * Audio output settings (24000 Hz `audio/pcm`, voice) are included only in
 * "audio" mode; unset optional settings are omitted.
 */
var responseCreateEvent = (options) => {
  const responseMode = options.responseMode ?? "audio";
  let audio;
  if (responseMode === "audio") {
    audio = {
      output: compact({
        format: {
          rate: 24000,
          type: "audio/pcm"
        },
        voice: options.voice ?? DEFAULT_VOICE
      })
    };
  }
  const response = compact({
    audio,
    conversation: "auto",
    max_output_tokens: options.maxOutputTokens,
    output_modalities: [responseMode]
  });
  return {
    response,
    type: "response.create"
  };
};
|
|
17337
|
+
/**
 * Creates a realtime adapter that talks to the OpenAI Realtime API over a
 * WebSocket.
 *
 * Options read here: `baseUrl`, `webSocket` (constructor, defaults to
 * `globalThis.WebSocket`), `model`, `apiKey`, `autoCommitSilenceMs` and
 * `emitResponseTranscripts`; the remaining options are consumed by the
 * session/response event builders.
 *
 * `open(openOptions)` requires raw pcm_s16le 24 kHz mono input (throws
 * otherwise) and returns a session exposing:
 * - `on(event, handler)`: subscribe; returns an unsubscribe function.
 * - `send(input)`: waits until the session is ready, then sends either text
 *   (string) or audio bytes. Rejects if the session failed to become ready;
 *   resolves without sending once the session is closed.
 * - `close(reason)`: closes the socket and emits a single "close" event.
 *
 * Events emitted from socket callbacks are dispatched without awaiting the
 * handlers; handlers are expected to deal with their own failures.
 */
var createOpenAIRealtimeAdapter = (options) => {
  const baseUrl = options.baseUrl ?? DEFAULT_BASE_URL;
  const Socket = options.webSocket ?? globalThis.WebSocket;
  return {
    kind: "realtime",
    open: (openOptions) => {
      assertPCM24Mono(openOptions.format);
      const runtimeConfig = withOpenPrompts(options, openOptions);
      const model = runtimeConfig.model ?? DEFAULT_MODEL;
      const listeners = createListenerMap();
      const socket = new Socket(`${baseUrl.replace(/\/$/, "")}?model=${encodeURIComponent(model)}`, {
        headers: {
          Authorization: `Bearer ${runtimeConfig.apiKey}`
        }
      });
      const primaryUpdate = sessionUpdateEvent(runtimeConfig, openOptions);
      // Serialized events queued while the socket is still connecting.
      const pendingMessages = [];
      // Accumulated input transcription text per conversation item id.
      const partials = new Map();
      // Item ids whose final transcription was already emitted.
      const finals = new Set();
      const autoCommitSilenceMs = runtimeConfig.autoCommitSilenceMs ?? DEFAULT_AUTO_COMMIT_SILENCE_MS;
      let audioCommitTimer;
      let closeEmitted = false;
      let closed = false;
      let pendingAudio = false;
      let ready = false;
      let readyTimeout;
      let socketOpen = false;
      let resolveReady;
      let rejectReady;
      const readyPromise = new Promise((resolve2, reject) => {
        resolveReady = resolve2;
        rejectReady = reject;
      });
      // Readiness failures are also reported through "error"/"close" events.
      // This no-op handler only keeps a rejection that nobody is awaiting yet
      // from surfacing as an unhandled rejection; `send` still observes it.
      readyPromise.catch(() => {});
      const clearReadyTimeout = () => {
        if (readyTimeout) {
          clearTimeout(readyTimeout);
          readyTimeout = undefined;
        }
      };
      const clearAudioCommitTimer = () => {
        if (audioCommitTimer) {
          clearTimeout(audioCommitTimer);
          audioCommitTimer = undefined;
        }
      };
      const markReady = () => {
        if (ready || closed) {
          return;
        }
        ready = true;
        clearReadyTimeout();
        resolveReady();
      };
      const failReady = (error) => {
        if (ready || closed) {
          return;
        }
        clearReadyTimeout();
        rejectReady(error);
      };
      // Settles the ready promise when the session is closed locally before
      // it became ready, so `send` calls parked on it wake up, see `closed`
      // and return instead of hanging forever. No-op once already settled.
      const releaseReadyWaiters = () => {
        clearReadyTimeout();
        resolveReady();
      };
      const sendRaw = (payload) => {
        const serialized = JSON.stringify(payload);
        if (!socketOpen) {
          pendingMessages.push(serialized);
          return;
        }
        socket.send(serialized);
      };
      const flush = () => {
        for (const message of pendingMessages.splice(0)) {
          socket.send(message);
        }
      };
      // Fire-and-forget dispatch used from socket callbacks and timers.
      const notify = (event, payload) => {
        void emit(listeners, event, payload);
      };
      const emitClose = async (code, reason, recoverable = false) => {
        if (closeEmitted) {
          return;
        }
        closeEmitted = true;
        await emit(listeners, "close", {
          code,
          reason,
          recoverable,
          type: "close"
        });
      };
      // Commits buffered input audio and requests a response for it.
      const commitPendingAudio = () => {
        if (closed || !pendingAudio) {
          return;
        }
        pendingAudio = false;
        sendRaw({ type: "input_audio_buffer.commit" });
        sendRaw(responseCreateEvent(runtimeConfig));
      };
      // (Re)starts the silence timer that commits audio once input pauses.
      const resetAudioTimer = () => {
        if (audioCommitTimer) {
          clearTimeout(audioCommitTimer);
        }
        audioCommitTimer = setTimeout(commitPendingAudio, autoCommitSilenceMs);
      };
      const handleMessage = (event) => {
        const payload = JSON.parse(String(event.data));
        const shouldEmitResponseTranscripts = runtimeConfig.emitResponseTranscripts === true;
        switch (payload.type) {
          case "session.created":
          case "session.updated":
            markReady();
            return;
          case "conversation.item.input_audio_transcription.delta": {
            const itemId = typeof payload.item_id === "string" ? payload.item_id : undefined;
            const delta = typeof payload.delta === "string" ? payload.delta : undefined;
            if (!itemId || !delta) {
              return;
            }
            const text = `${partials.get(itemId) ?? ""}${delta}`;
            partials.set(itemId, text);
            notify("partial", {
              receivedAt: Date.now(),
              transcript: audioTranscript(itemId, text, false),
              type: "partial"
            });
            return;
          }
          case "conversation.item.input_audio_transcription.completed": {
            const itemId = typeof payload.item_id === "string" ? payload.item_id : undefined;
            const transcript = typeof payload.transcript === "string" ? payload.transcript : undefined;
            if (!itemId || !transcript || finals.has(itemId)) {
              return;
            }
            finals.add(itemId);
            partials.set(itemId, transcript);
            notify("final", {
              receivedAt: Date.now(),
              transcript: audioTranscript(itemId, transcript, true),
              type: "final"
            });
            notify("endOfTurn", {
              receivedAt: Date.now(),
              reason: "vendor",
              type: "endOfTurn"
            });
            return;
          }
          case "conversation.item.input_audio_transcription.failed": {
            const error = payload.error && typeof payload.error === "object" ? payload.error : undefined;
            notify("error", {
              code: error?.code,
              error: new Error(resolveErrorMessage(error ?? payload)),
              recoverable: true,
              type: "error"
            });
            return;
          }
          case "response.audio.delta":
          case "response.output_audio.delta": {
            const delta = typeof payload.delta === "string" ? payload.delta : undefined;
            if (!delta) {
              return;
            }
            notify("audio", {
              chunk: Buffer.from(delta, "base64"),
              format: OPENAI_PCM24_FORMAT,
              receivedAt: Date.now(),
              type: "audio"
            });
            return;
          }
          case "response.audio_transcript.delta":
          case "response.output_audio_transcript.delta":
          case "response.output_text.delta": {
            if (!shouldEmitResponseTranscripts) {
              return;
            }
            const delta = typeof payload.delta === "string" ? payload.delta : undefined;
            if (!delta) {
              return;
            }
            notify("partial", {
              receivedAt: Date.now(),
              transcript: textTranscript(delta),
              type: "partial"
            });
            return;
          }
          case "response.audio_transcript.done":
          case "response.output_audio_transcript.done":
          case "response.output_text.done": {
            if (!shouldEmitResponseTranscripts) {
              return;
            }
            // Audio-transcript events carry `transcript`; the text event
            // carries its full content in `text`. Accept either so text-mode
            // responses are not silently dropped.
            let transcript;
            if (typeof payload.transcript === "string") {
              transcript = payload.transcript;
            } else if (typeof payload.text === "string") {
              transcript = payload.text;
            }
            if (!transcript) {
              return;
            }
            notify("final", {
              receivedAt: Date.now(),
              transcript: textTranscript(transcript),
              type: "final"
            });
            notify("endOfTurn", {
              receivedAt: Date.now(),
              reason: "vendor",
              type: "endOfTurn"
            });
            return;
          }
          case "error": {
            const error = payload.error && typeof payload.error === "object" ? payload.error : {};
            const message = resolveErrorMessage(error);
            notify("error", {
              code: error.code,
              error: new Error(message),
              recoverable: true,
              type: "error"
            });
            // An error answering our initial session.update means the
            // session can never become ready.
            if (!ready && error.event_id === primaryUpdate.event_id) {
              failReady(new Error(message));
            }
            return;
          }
          default:
            return;
        }
      };
      socket.addEventListener("open", () => {
        socketOpen = true;
        sendRaw(primaryUpdate);
        flush();
        readyTimeout = setTimeout(() => {
          failReady(new Error("OpenAI realtime session did not become ready."));
        }, 8000);
      }, { once: true });
      socket.addEventListener("message", (event) => {
        try {
          handleMessage(event);
        } catch (error) {
          notify("error", {
            error: new Error(resolveErrorMessage(error)),
            recoverable: true,
            type: "error"
          });
        }
      });
      socket.addEventListener("error", (event) => {
        const error = new Error(resolveErrorMessage(event));
        failReady(error);
        notify("error", {
          error,
          recoverable: false,
          type: "error"
        });
      });
      socket.addEventListener("close", (event) => {
        socketOpen = false;
        clearReadyTimeout();
        if (!ready) {
          failReady(new Error("OpenAI realtime session closed before ready."));
        }
        // The socket never reopens: stop accepting input and release
        // anything still queued, otherwise later `send` calls would pile up
        // in `pendingMessages` forever.
        closed = true;
        pendingAudio = false;
        clearAudioCommitTimer();
        pendingMessages.length = 0;
        void emitClose(event.code, event.reason || undefined, event.code !== 1000);
      });
      const abort = () => {
        if (closed) {
          return;
        }
        closed = true;
        clearAudioCommitTimer();
        releaseReadyWaiters();
        socket.close(1000, "aborted");
      };
      if (openOptions.signal) {
        if (openOptions.signal.aborted) {
          abort();
        } else {
          openOptions.signal.addEventListener("abort", abort, { once: true });
        }
      }
      return {
        close: async (reason) => {
          if (closed) {
            return;
          }
          closed = true;
          clearAudioCommitTimer();
          // Audio not yet committed is discarded: the socket is closed right
          // away, so a response to it could not be delivered anyway.
          pendingAudio = false;
          releaseReadyWaiters();
          socket.close(1000, reason);
          await emitClose(1000, reason, false);
        },
        on: (event, handler) => {
          listeners[event].add(handler);
          return () => {
            listeners[event].delete(handler);
          };
        },
        send: async (input) => {
          await readyPromise;
          if (closed) {
            return;
          }
          if (typeof input === "string") {
            const text = input.trim();
            if (!text) {
              return;
            }
            await emit(listeners, "final", {
              receivedAt: Date.now(),
              transcript: textTranscript(text),
              type: "final"
            });
            await emit(listeners, "endOfTurn", {
              receivedAt: Date.now(),
              reason: "manual",
              type: "endOfTurn"
            });
            sendRaw({
              item: {
                content: [{ text, type: "input_text" }],
                role: "user",
                type: "message"
              },
              type: "conversation.item.create"
            });
            sendRaw(responseCreateEvent(runtimeConfig));
            return;
          }
          sendRaw({
            audio: toBase643(input),
            type: "input_audio_buffer.append"
          });
          pendingAudio = true;
          resetAudioTimer();
        }
      };
    }
  };
};
|
|
17666
|
+
// src/openaiTTS.ts
|
|
17667
|
+
// Audio format attached to the TTS "audio" events: raw signed 16-bit
// little-endian PCM, one channel, 24000 Hz.
var OPENAI_PCM24_FORMAT2 = {
  channels: 1,
  container: "raw",
  encoding: "pcm_s16le",
  sampleRateHz: 24000
};
|
|
17098
17673
|
var resolveInstructions = async (instructions, input) => {
|
|
17099
17674
|
if (typeof instructions === "function") {
|
|
17100
17675
|
return instructions(input);
|
|
@@ -17102,7 +17677,7 @@ var resolveInstructions = async (instructions, input) => {
|
|
|
17102
17677
|
return instructions;
|
|
17103
17678
|
};
|
|
17104
17679
|
var createTTSHTTPError = (response) => new Error(`OpenAI voice TTS failed: HTTP ${response.status}`);
|
|
17105
|
-
var
|
|
17680
|
+
var emit2 = async (listeners, event, payload) => {
|
|
17106
17681
|
for (const handler of listeners[event]) {
|
|
17107
17682
|
await Promise.resolve(handler(payload));
|
|
17108
17683
|
}
|
|
@@ -17132,7 +17707,7 @@ var createOpenAIVoiceTTS = (options) => {
|
|
|
17132
17707
|
closed = true;
|
|
17133
17708
|
abortController.abort();
|
|
17134
17709
|
openOptions.signal?.removeEventListener("abort", signalAbort);
|
|
17135
|
-
await
|
|
17710
|
+
await emit2(listeners, "close", {
|
|
17136
17711
|
reason,
|
|
17137
17712
|
type: "close"
|
|
17138
17713
|
});
|
|
@@ -17175,9 +17750,9 @@ var createOpenAIVoiceTTS = (options) => {
|
|
|
17175
17750
|
if (!response.body) {
|
|
17176
17751
|
const chunk = new Uint8Array(await response.arrayBuffer());
|
|
17177
17752
|
if (!closed && chunk.byteLength > 0) {
|
|
17178
|
-
await
|
|
17753
|
+
await emit2(listeners, "audio", {
|
|
17179
17754
|
chunk,
|
|
17180
|
-
format:
|
|
17755
|
+
format: OPENAI_PCM24_FORMAT2,
|
|
17181
17756
|
receivedAt: Date.now(),
|
|
17182
17757
|
type: "audio"
|
|
17183
17758
|
});
|
|
@@ -17192,9 +17767,9 @@ var createOpenAIVoiceTTS = (options) => {
|
|
|
17192
17767
|
break;
|
|
17193
17768
|
}
|
|
17194
17769
|
if (value.byteLength > 0) {
|
|
17195
|
-
await
|
|
17770
|
+
await emit2(listeners, "audio", {
|
|
17196
17771
|
chunk: new Uint8Array(value),
|
|
17197
|
-
format:
|
|
17772
|
+
format: OPENAI_PCM24_FORMAT2,
|
|
17198
17773
|
receivedAt: Date.now(),
|
|
17199
17774
|
type: "audio"
|
|
17200
17775
|
});
|
|
@@ -17208,7 +17783,7 @@ var createOpenAIVoiceTTS = (options) => {
|
|
|
17208
17783
|
return;
|
|
17209
17784
|
}
|
|
17210
17785
|
const normalizedError = error instanceof Error ? error : new Error(String(error));
|
|
17211
|
-
await
|
|
17786
|
+
await emit2(listeners, "error", {
|
|
17212
17787
|
error: normalizedError,
|
|
17213
17788
|
recoverable: true,
|
|
17214
17789
|
type: "error"
|
|
@@ -19778,11 +20353,11 @@ var createResolver = (options) => {
|
|
|
19778
20353
|
selectedProvider: preferred
|
|
19779
20354
|
};
|
|
19780
20355
|
};
|
|
19781
|
-
const
|
|
20356
|
+
const emit3 = async (event, input) => {
|
|
19782
20357
|
await options.onProviderEvent?.(event, input);
|
|
19783
20358
|
};
|
|
19784
20359
|
return {
|
|
19785
|
-
emit:
|
|
20360
|
+
emit: emit3,
|
|
19786
20361
|
getSuppressionRemainingMs,
|
|
19787
20362
|
providerIds,
|
|
19788
20363
|
recordError,
|
|
@@ -22301,6 +22876,7 @@ export {
|
|
|
22301
22876
|
createPhraseHintCorrectionHandler,
|
|
22302
22877
|
createOpenAIVoiceTTS,
|
|
22303
22878
|
createOpenAIVoiceAssistantModel,
|
|
22879
|
+
createOpenAIRealtimeAdapter,
|
|
22304
22880
|
createMemoryVoiceTelephonyWebhookIdempotencyStore,
|
|
22305
22881
|
createJSONVoiceAssistantModel,
|
|
22306
22882
|
createId,
|