@absolutejs/voice 0.0.22-beta.127 → 0.0.22-beta.129

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -3413,6 +3413,12 @@ var DEFAULT_FORMAT = {
3413
3413
  encoding: "pcm_s16le",
3414
3414
  sampleRateHz: 16000
3415
3415
  };
3416
// Input format used when a realtime adapter is configured and no explicit
// `realtimeInputFormat` is supplied: raw, mono, 16-bit little-endian PCM at 24 kHz.
var DEFAULT_REALTIME_FORMAT = {
  channels: 1,
  container: "raw",
  encoding: "pcm_s16le",
  sampleRateHz: 24_000
};
3416
3422
  var toError = (value) => value instanceof Error ? value : new Error(String(value));
3417
3423
  var createEmptyCurrentTurn = () => ({
3418
3424
  finalText: "",
@@ -3690,6 +3696,18 @@ var createVoiceSession = (options) => {
3690
3696
  type: "call_lifecycle"
3691
3697
  });
3692
3698
  };
3699
/**
 * Sends a "replay" message describing the current state of `session`:
 * its turns, the non-empty assistant texts of those turns, the in-progress
 * partial text, call info, scenario id and status.
 *
 * @param session Session snapshot to replay.
 * @returns Promise that settles once `send` has completed.
 */
const sendReplay = async (session) => {
  const { turns } = session;
  // Keep only turns that actually produced assistant text.
  const assistantTexts = turns.map((turn) => turn.assistantText).filter((text) => Boolean(text));
  const replayMessage = {
    assistantTexts,
    call: session.call,
    partial: session.currentTurn.partialText,
    scenarioId: session.scenarioId,
    sessionId: options.id,
    status: session.status,
    turns,
    type: "replay"
  };
  await send(replayMessage);
};
3693
3711
  const runHandoff = async (input) => {
3694
3712
  const queuedDelivery = options.handoff?.deliveryQueue ? createVoiceHandoffDeliveryRecord({
3695
3713
  action: input.action,
@@ -3793,6 +3811,23 @@ var createVoiceSession = (options) => {
3793
3811
  });
3794
3812
  }
3795
3813
  };
3814
/**
 * Forwards one chunk of assistant audio as a base64 "audio" message and,
 * when a TTS turn is active, records the "assistant_audio_received" latency stage.
 *
 * @param chunk Audio bytes as a Uint8Array, an ArrayBuffer, or another buffer view.
 * @param input Object carrying the chunk's `format` and `receivedAt` timestamp.
 */
const sendAssistantAudio = async (chunk, input) => {
  // Always work on a private copy of the bytes, never on the caller's buffer.
  let normalizedChunk;
  if (chunk instanceof Uint8Array) {
    normalizedChunk = new Uint8Array(chunk);
  } else if (chunk instanceof ArrayBuffer) {
    normalizedChunk = new Uint8Array(chunk.slice(0));
  } else {
    normalizedChunk = new Uint8Array(chunk.buffer.slice(chunk.byteOffset, chunk.byteOffset + chunk.byteLength));
  }
  const audioMessage = {
    chunkBase64: encodeBase64(normalizedChunk),
    format: input.format,
    receivedAt: input.receivedAt,
    turnId: activeTTSTurnId,
    type: "audio"
  };
  await send(audioMessage);
  if (!activeTTSTurnId) {
    return;
  }
  await appendTurnLatencyStage({
    at: input.receivedAt,
    stage: "assistant_audio_received",
    turnId: activeTTSTurnId
  });
};
3796
3831
  const scheduleTurnCommit = (delayMs, reason, reset = true) => {
3797
3832
  if (!reset && silenceTimer) {
3798
3833
  return;
@@ -4494,8 +4529,12 @@ var createVoiceSession = (options) => {
4494
4529
  if (sttSession) {
4495
4530
  return sttSession;
4496
4531
  }
4497
- const openedSession = await options.stt.open({
4498
- format: DEFAULT_FORMAT,
4532
+ const inputAdapter = options.realtime ?? options.stt;
4533
+ if (!inputAdapter) {
4534
+ throw new Error("Voice session requires either an stt or realtime adapter.");
4535
+ }
4536
+ const openedSession = await inputAdapter.open({
4537
+ format: options.realtime ? options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT : DEFAULT_FORMAT,
4499
4538
  languageStrategy: options.languageStrategy,
4500
4539
  lexicon,
4501
4540
  phraseHints,
@@ -4530,6 +4569,16 @@ var createVoiceSession = (options) => {
4530
4569
  openedSession.on("close", (event) => {
4531
4570
  runAdapterEvent("adapter.close", () => handleClose(event));
4532
4571
  });
4572
+ if (options.realtime) {
4573
+ openedSession.on("audio", ({ chunk, format, receivedAt }) => {
4574
+ runAdapterEvent("adapter.audio", async () => {
4575
+ await sendAssistantAudio(chunk, {
4576
+ format,
4577
+ receivedAt
4578
+ });
4579
+ });
4580
+ });
4581
+ }
4533
4582
  return openedSession;
4534
4583
  };
4535
4584
  const ensureTTSSession = async () => {
@@ -4554,21 +4603,10 @@ var createVoiceSession = (options) => {
4554
4603
  if (ttsSession !== openedSession) {
4555
4604
  return;
4556
4605
  }
4557
- const normalizedChunk = chunk instanceof Uint8Array ? new Uint8Array(chunk) : chunk instanceof ArrayBuffer ? new Uint8Array(chunk.slice(0)) : new Uint8Array(chunk.buffer.slice(chunk.byteOffset, chunk.byteOffset + chunk.byteLength));
4558
- await send({
4559
- chunkBase64: encodeBase64(normalizedChunk),
4606
+ await sendAssistantAudio(chunk, {
4560
4607
  format,
4561
- receivedAt,
4562
- turnId: activeTTSTurnId,
4563
- type: "audio"
4608
+ receivedAt
4564
4609
  });
4565
- if (activeTTSTurnId) {
4566
- await appendTurnLatencyStage({
4567
- at: receivedAt,
4568
- stage: "assistant_audio_received",
4569
- turnId: activeTTSTurnId
4570
- });
4571
- }
4572
4610
  });
4573
4611
  });
4574
4612
  openedSession.on("error", (event) => {
@@ -4647,7 +4685,8 @@ var createVoiceSession = (options) => {
4647
4685
  await appendTrace({
4648
4686
  payload: {
4649
4687
  text: output.assistantText,
4650
- ttsConfigured: Boolean(options.tts)
4688
+ ttsConfigured: Boolean(options.tts),
4689
+ realtimeConfigured: Boolean(options.realtime)
4651
4690
  },
4652
4691
  session,
4653
4692
  turnId: turn.id,
@@ -4679,9 +4718,35 @@ var createVoiceSession = (options) => {
4679
4718
  turnId: turn.id,
4680
4719
  type: "turn.assistant"
4681
4720
  });
4721
+ } else if (options.realtime) {
4722
+ const activeRealtimeSession = await ensureAdapter();
4723
+ const realtimeStartedAt = Date.now();
4724
+ activeTTSTurnId = turn.id;
4725
+ await appendTurnLatencyStage({
4726
+ at: realtimeStartedAt,
4727
+ session,
4728
+ stage: "tts_send_started",
4729
+ turnId: turn.id
4730
+ });
4731
+ await activeRealtimeSession.send(output.assistantText);
4732
+ await appendTurnLatencyStage({
4733
+ session,
4734
+ stage: "tts_send_completed",
4735
+ turnId: turn.id
4736
+ });
4737
+ await appendTrace({
4738
+ payload: {
4739
+ elapsedMs: Date.now() - realtimeStartedAt,
4740
+ mode: "realtime",
4741
+ status: "sent"
4742
+ },
4743
+ session,
4744
+ turnId: turn.id,
4745
+ type: "turn.assistant"
4746
+ });
4682
4747
  }
4683
4748
  } catch (error) {
4684
- logger.warn("voice tts send failed", {
4749
+ logger.warn("voice assistant audio send failed", {
4685
4750
  error: toError(error).message,
4686
4751
  sessionId: options.id,
4687
4752
  turnId: turn.id
@@ -4689,7 +4754,7 @@ var createVoiceSession = (options) => {
4689
4754
  await appendTrace({
4690
4755
  payload: {
4691
4756
  error: toError(error).message,
4692
- status: "tts-send-failed"
4757
+ status: options.realtime ? "realtime-send-failed" : "tts-send-failed"
4693
4758
  },
4694
4759
  session,
4695
4760
  turnId: turn.id,
@@ -4894,7 +4959,7 @@ var createVoiceSession = (options) => {
4894
4959
  turn,
4895
4960
  type: "turn"
4896
4961
  });
4897
- if (options.sttLifecycle === "turn-scoped") {
4962
+ if (options.stt && options.sttLifecycle === "turn-scoped") {
4898
4963
  await closeAdapter("turn-commit");
4899
4964
  }
4900
4965
  await completeTurn(updatedSession, turn);
@@ -4957,6 +5022,7 @@ var createVoiceSession = (options) => {
4957
5022
  scenarioId: session.scenarioId,
4958
5023
  type: "session"
4959
5024
  });
5025
+ await sendReplay(session);
4960
5026
  if (shouldFireOnSession) {
4961
5027
  await options.route.onCallStart?.({
4962
5028
  api,
@@ -5307,6 +5373,9 @@ var resolveLexicon = async (config, input) => {
5307
5373
  return normalizeLexicon(config.lexicon);
5308
5374
  };
5309
5375
  var voice = (config) => {
5376
+ if (!config.stt && !config.realtime) {
5377
+ throw new Error("voice requires either an stt or realtime adapter.");
5378
+ }
5310
5379
  const runtime = {
5311
5380
  activeSessions: new Map,
5312
5381
  logger: resolveLogger(config.logger),
@@ -5381,6 +5450,8 @@ var voice = (config) => {
5381
5450
  socket: createSocketAdapter(ws),
5382
5451
  store: config.session,
5383
5452
  trace: config.trace,
5453
+ realtime: config.realtime,
5454
+ realtimeInputFormat: config.realtimeInputFormat,
5384
5455
  stt: config.stt,
5385
5456
  sttFallback: sessionOptions.sttFallback,
5386
5457
  sttLifecycle: sessionOptions.sttLifecycle,
@@ -17088,13 +17159,517 @@ var createGeminiVoiceAssistantModel = (options) => {
17088
17159
  }
17089
17160
  };
17090
17161
  };
17091
- // src/openaiTTS.ts
17162
+ // src/openaiRealtime.ts
17163
// Milliseconds without new input audio before the buffered audio is committed.
var DEFAULT_AUTO_COMMIT_SILENCE_MS = 450;
// WebSocket endpoint used when no `baseUrl` option is given.
var DEFAULT_BASE_URL = "wss://api.openai.com/v1/realtime";
// Realtime model used when no `model` option is given.
var DEFAULT_MODEL = "gpt-realtime";
// Input transcription model used when `inputTranscriptionModel` is left undefined.
var DEFAULT_TRANSCRIPTION_MODEL = "gpt-4o-mini-transcribe";
// Output voice used when no `voice` option is given.
var DEFAULT_VOICE = "marin";
// Format attached to audio chunks emitted by the adapter: raw mono pcm_s16le at 24 kHz.
var OPENAI_PCM24_FORMAT = {
  channels: 1,
  container: "raw",
  encoding: "pcm_s16le",
  sampleRateHz: 24_000
};
17174
/**
 * Builds a fresh listener registry: one empty Set per adapter event name.
 *
 * @returns Object keyed by event name ("audio", "close", "endOfTurn",
 *          "error", "final", "partial"), each holding its own empty Set.
 */
var createListenerMap = () => {
  const eventNames = ["audio", "close", "endOfTurn", "error", "final", "partial"];
  return Object.fromEntries(eventNames.map((eventName) => [eventName, new Set()]));
};
17182
/**
 * Invokes every listener registered for `event`, one after another,
 * awaiting each before calling the next. A throwing or rejecting listener
 * aborts the remaining ones and rejects the returned promise.
 *
 * @param listeners Registry mapping event names to Sets of handlers.
 * @param event Name of the event to deliver.
 * @param payload Value passed to each handler.
 */
var emit = async (listeners, event, payload) => {
  // Walk the live Set iterator by hand; handlers added mid-delivery are still visited.
  const handlers = listeners[event][Symbol.iterator]();
  for (let step = handlers.next(); !step.done; step = handlers.next()) {
    await step.value(payload);
  }
};
17187
/**
 * Returns a shallow copy of `value` without the properties whose value is
 * exactly `undefined` (null and other falsy values are kept).
 *
 * @param value Plain object to copy.
 * @returns New object containing only the defined entries, in original order.
 */
var compact = (value) => {
  const kept = [];
  for (const pair of Object.entries(value)) {
    if (pair[1] !== undefined) {
      kept.push(pair);
    }
  }
  return Object.fromEntries(kept);
};
17188
/**
 * Extracts a human-readable message from an arbitrary error-like value.
 *
 * Resolution order for each value examined:
 *   1. a non-blank string is returned as-is;
 *   2. an Error with a non-blank message returns that message;
 *   3. an object returns its first non-blank string among
 *      `message`, `reason`, `description`, `detail`;
 *   4. an object with an `error` property is resolved through that property;
 *   5. any other object is JSON-serialized;
 *   6. everything else yields the generic fallback.
 *
 * The `error` chain is walked iteratively and each object is visited at most
 * once, so a self-referencing chain (e.g. `value.error === value`) ends with
 * the fallback instead of overflowing the stack.
 *
 * @param error Value to describe (string, Error, event, payload, ...).
 * @returns A message string; never undefined.
 */
var resolveErrorMessage = (error) => {
  const fallback = "OpenAI realtime error";
  const messageKeys = ["message", "reason", "description", "detail"];
  const visited = new Set();
  let current = error;
  for (;;) {
    if (typeof current === "string" && current.trim()) {
      return current;
    }
    if (current instanceof Error && current.message.trim()) {
      return current.message;
    }
    if (!current || typeof current !== "object") {
      return fallback;
    }
    if (visited.has(current)) {
      // Cyclic `error` chain: nothing new left to inspect.
      return fallback;
    }
    visited.add(current);
    for (const key of messageKeys) {
      const candidate = current[key];
      if (typeof candidate === "string" && candidate.trim()) {
        return candidate;
      }
    }
    if ("error" in current) {
      current = current.error;
      continue;
    }
    try {
      // JSON.stringify yields undefined when toJSON() returns undefined.
      return JSON.stringify(current) ?? fallback;
    } catch {
      // Unserializable (cyclic structure, BigInt, ...): use the generic message.
      return fallback;
    }
  }
};
17212
/**
 * Wraps binary input in a Uint8Array without copying the underlying bytes.
 *
 * @param value An ArrayBuffer or a buffer view exposing `buffer`, `byteOffset`, `byteLength`.
 * @returns Uint8Array covering exactly the bytes of `value`.
 */
var toUint8Array2 = (value) => {
  if (value instanceof ArrayBuffer) {
    return new Uint8Array(value);
  }
  const { buffer, byteOffset, byteLength } = value;
  return new Uint8Array(buffer, byteOffset, byteLength);
};
17213
/**
 * Encodes binary input as a base64 string.
 *
 * @param value An ArrayBuffer or a buffer view exposing `buffer`, `byteOffset`, `byteLength`.
 * @returns Base64 text for exactly the bytes covered by `value`.
 */
var toBase643 = (value) => {
  // View the input as bytes first so that only the view's own window is encoded.
  const bytes = value instanceof ArrayBuffer
    ? new Uint8Array(value)
    : new Uint8Array(value.buffer, value.byteOffset, value.byteLength);
  return Buffer.from(bytes).toString("base64");
};
17214
/**
 * Builds a final transcript record for a piece of text, with a freshly
 * generated id of the form `openai-realtime-text-<uuid>`.
 *
 * @param text Transcript text.
 * @returns Transcript object marked final and attributed to vendor "openai".
 */
var textTranscript = (text) => {
  const id = `openai-realtime-text-${crypto.randomUUID()}`;
  return { id, isFinal: true, text, vendor: "openai" };
};
17220
/**
 * Builds a transcript record for transcribed input audio.
 *
 * @param itemId Conversation item id, reused as the transcript id.
 * @param text Transcript text so far.
 * @param isFinal Whether this transcript is final.
 * @returns Transcript object attributed to vendor "openai".
 */
var audioTranscript = (itemId, text, isFinal) => {
  const transcript = { id: itemId, isFinal, text, vendor: "openai" };
  return transcript;
};
17226
/**
 * Verifies that `format` is raw, mono, pcm_s16le audio sampled at 24 kHz.
 *
 * @param format Audio format with `container`, `encoding`, `sampleRateHz`, `channels`.
 * @throws {Error} When any of the four fields differs from the required value.
 */
var assertPCM24Mono = (format) => {
  const isSupported = format.container === "raw"
    && format.encoding === "pcm_s16le"
    && format.sampleRateHz === 24000
    && format.channels === 1;
  if (isSupported) {
    return;
  }
  throw new Error("OpenAI Realtime requires raw pcm_s16le audio at 24kHz mono.");
};
17231
/**
 * Picks the language hint for input transcription.
 *
 * An explicit, non-blank `options.inputTranscriptionLanguage` wins. Otherwise
 * the primary language of a "fixed" language strategy is used, if non-blank.
 *
 * @param options Adapter options (reads `inputTranscriptionLanguage`).
 * @param openOptions Open options (reads `languageStrategy`).
 * @returns Trimmed language string, or undefined when none applies.
 */
var resolveTranscriptionLanguage = (options, openOptions) => {
  const explicitLanguage = options.inputTranscriptionLanguage?.trim();
  if (explicitLanguage) {
    return explicitLanguage;
  }
  const strategy = openOptions.languageStrategy;
  if (strategy?.mode !== "fixed") {
    return undefined;
  }
  const primaryLanguage = strategy.primaryLanguage.trim();
  return primaryLanguage.length > 0 ? primaryLanguage : undefined;
};
17241
/**
 * Builds the transcription prompt sentence listing phrase hints.
 *
 * Every hint contributes its `text` followed by its `aliases`. Duplicates are
 * removed while keeping first-occurrence order. De-duplication uses a Set
 * (linear time) rather than an `indexOf` scan per term.
 *
 * @param options Open options (reads the optional `phraseHints` array of
 *                `{ text, aliases? }` entries).
 * @returns The prompt sentence, or undefined when there are no terms.
 */
var phraseHintPrompt = (options) => {
  const uniqueTerms = new Set();
  for (const hint of options.phraseHints ?? []) {
    uniqueTerms.add(hint.text);
    for (const alias of hint.aliases ?? []) {
      uniqueTerms.add(alias);
    }
  }
  if (uniqueTerms.size === 0) {
    return undefined;
  }
  return `Prioritize accurate recovery of these phrases when heard: ${[...uniqueTerms].join(", ")}.`;
};
17249
/**
 * Builds the transcription prompt sentence describing the pronunciation lexicon.
 *
 * Each lexicon entry becomes its truthy parts (text, pronunciation, aliases,
 * language) joined by " - "; entries with no parts are skipped, and the
 * remaining entries are joined by "; ".
 *
 * @param options Open options (reads the optional `lexicon` array).
 * @returns The prompt sentence, or undefined when no entry produced any text.
 */
var lexiconPrompt = (options) => {
  const described = [];
  for (const entry of options.lexicon ?? []) {
    const parts = [];
    if (entry.text) {
      parts.push(entry.text);
    }
    if (entry.pronunciation) {
      parts.push(`pronounced ${entry.pronunciation}`);
    }
    if (entry.aliases?.length) {
      parts.push(`may also sound like ${entry.aliases.join(", ")}`);
    }
    if (entry.language) {
      parts.push(`language ${entry.language}`);
    }
    if (parts.length > 0) {
      described.push(parts.join(" - "));
    }
  }
  if (described.length === 0) {
    return undefined;
  }
  return `Use this pronunciation lexicon when transcribing: ${described.join("; ")}.`;
};
17261
/**
 * Merges phrase-hint and lexicon guidance into the transcription prompt.
 *
 * When neither prompt is produced, `options` is returned untouched (same
 * object). Otherwise a shallow copy is returned whose
 * `inputTranscriptionPrompt` is the non-blank sections — existing prompt,
 * phrase hints, lexicon — separated by a blank line.
 *
 * @param options Adapter options.
 * @param openOptions Open options supplying `phraseHints` and `lexicon`.
 * @returns Adapter options, possibly with an extended transcription prompt.
 */
var withOpenPrompts = (options, openOptions) => {
  const hintSection = phraseHintPrompt(openOptions);
  const lexiconSection = lexiconPrompt(openOptions);
  if (hintSection || lexiconSection) {
    const sections = [];
    for (const section of [options.inputTranscriptionPrompt, hintSection, lexiconSection]) {
      if (section?.trim()) {
        sections.push(section);
      }
    }
    return { ...options, inputTranscriptionPrompt: sections.join("\n\n") };
  }
  return options;
};
17278
/**
 * Builds the `session.update` client event that configures the realtime session.
 *
 * Input audio is declared as 24 kHz `audio/pcm` with turn detection set to
 * null. Transcription is null when `inputTranscriptionModel` is explicitly
 * null, otherwise it carries the language, model and prompt. The audio
 * output section is only present when the response mode is "audio".
 * Properties whose value is undefined are removed by `compact`.
 *
 * @param options Adapter options.
 * @param openOptions Open options, used to resolve the transcription language.
 * @returns Event object with a unique `event_id`, the `session` config and type "session.update".
 */
var sessionUpdateEvent = (options, openOptions) => {
  const responseMode = options.responseMode ?? "audio";
  const language = resolveTranscriptionLanguage(options, openOptions);
  const pcm24 = () => ({ rate: 24000, type: "audio/pcm" });

  let transcription = null;
  if (options.inputTranscriptionModel !== null) {
    transcription = compact({
      language,
      model: options.inputTranscriptionModel ?? DEFAULT_TRANSCRIPTION_MODEL,
      prompt: options.inputTranscriptionPrompt
    });
  }

  const input = compact({
    format: pcm24(),
    noise_reduction: options.noiseReduction ? { type: options.noiseReduction } : undefined,
    transcription,
    turn_detection: null
  });

  let output;
  if (responseMode === "audio") {
    output = compact({
      format: pcm24(),
      speed: options.speed,
      voice: options.voice ?? DEFAULT_VOICE
    });
  }

  const session = compact({
    audio: { input, output },
    instructions: options.instructions,
    max_output_tokens: options.maxOutputTokens,
    output_modalities: [responseMode],
    temperature: options.temperature,
    type: "realtime"
  });

  return {
    event_id: `session-update-${crypto.randomUUID()}`,
    session,
    type: "session.update"
  };
};
17317
/**
 * Builds the `response.create` client event asking the model to respond.
 *
 * The audio output section (24 kHz `audio/pcm` plus voice) is included only
 * when the response mode is "audio"; undefined properties are removed by `compact`.
 *
 * @param options Adapter options (reads `responseMode`, `voice`, `maxOutputTokens`).
 * @returns Event object of type "response.create".
 */
var responseCreateEvent = (options) => {
  const responseMode = options.responseMode ?? "audio";

  let audio;
  if (responseMode === "audio") {
    const output = compact({
      format: { rate: 24000, type: "audio/pcm" },
      voice: options.voice ?? DEFAULT_VOICE
    });
    audio = { output };
  }

  const response = compact({
    audio,
    conversation: "auto",
    max_output_tokens: options.maxOutputTokens,
    output_modalities: [responseMode]
  });

  return { response, type: "response.create" };
};
17337
/**
 * Creates a realtime adapter that talks to the OpenAI Realtime WebSocket API.
 *
 * Options read directly here: `baseUrl`, `webSocket` (constructor; defaults to
 * `globalThis.WebSocket`), `apiKey`, `model`, `autoCommitSilenceMs` and
 * `emitResponseTranscripts`. The remaining options are consumed by the
 * session/response event builders.
 *
 * @param options Adapter configuration.
 * @returns Adapter object `{ kind: "realtime", open }`. `open(openOptions)`
 *          requires `openOptions.format` to be raw pcm_s16le 24 kHz mono,
 *          honours `openOptions.signal`, and returns a session exposing
 *          `close(reason)`, `on(event, handler)` and `send(input)`.
 * @throws {Error} From `open` when the format is unsupported or when no
 *         WebSocket implementation is available.
 */
var createOpenAIRealtimeAdapter = (options) => {
  const baseUrl = options.baseUrl ?? DEFAULT_BASE_URL;
  const Socket = options.webSocket ?? globalThis.WebSocket;
  return {
    kind: "realtime",
    open: (openOptions) => {
      assertPCM24Mono(openOptions.format);
      if (typeof Socket !== "function") {
        // Fail with an actionable message instead of "Socket is not a constructor".
        throw new Error("OpenAI Realtime requires a WebSocket implementation; pass one via options.webSocket.");
      }
      const runtimeConfig = withOpenPrompts(options, openOptions);
      const model = runtimeConfig.model ?? DEFAULT_MODEL;
      const listeners = createListenerMap();
      const socket = new Socket(`${baseUrl.replace(/\/$/, "")}?model=${encodeURIComponent(model)}`, {
        headers: {
          Authorization: `Bearer ${runtimeConfig.apiKey}`
        }
      });
      const primaryUpdate = sessionUpdateEvent(runtimeConfig, openOptions);
      const pendingMessages = [];
      const partials = new Map();
      const finals = new Set();
      const autoCommitSilenceMs = runtimeConfig.autoCommitSilenceMs ?? DEFAULT_AUTO_COMMIT_SILENCE_MS;
      const shouldEmitResponseTranscripts = runtimeConfig.emitResponseTranscripts === true;
      let audioCommitTimer;
      let closeEmitted = false;
      let closed = false;
      let pendingAudio = false;
      let ready = false;
      let readySettled = false;
      let readyTimeout;
      let socketOpen = false;
      let socketClosed = false;
      let resolveReady;
      let rejectReady;
      const readyPromise = new Promise((resolve2, reject) => {
        resolveReady = resolve2;
        rejectReady = reject;
      });
      // send() awaits readyPromise and surfaces its rejection to the caller.
      // This extra handler only stops a failure that happens before anyone
      // called send() from being reported as an unhandled rejection.
      readyPromise.catch(() => {});

      const toListenerError = (value) => value instanceof Error ? value : new Error(resolveErrorMessage(value));

      // Fire-and-forget delivery for the synchronous socket callbacks.
      // A failing listener is reported through the "error" event rather than
      // becoming an unhandled rejection.
      const dispatch = (event, payload) => {
        emit(listeners, event, payload).catch((failure) => {
          if (event === "error") {
            // The error listeners themselves failed; there is no channel left to report on.
            return;
          }
          emit(listeners, "error", {
            error: toListenerError(failure),
            recoverable: true,
            type: "error"
          }).catch(() => {
            // Same as above: the error channel itself is broken.
          });
        });
      };

      const clearReadyTimeout = () => {
        if (readyTimeout) {
          clearTimeout(readyTimeout);
          readyTimeout = undefined;
        }
      };
      const clearAudioTimer = () => {
        if (audioCommitTimer) {
          clearTimeout(audioCommitTimer);
          audioCommitTimer = undefined;
        }
      };
      const markReady = () => {
        if (readySettled || closed) {
          return;
        }
        ready = true;
        readySettled = true;
        clearReadyTimeout();
        resolveReady();
      };
      const failReady = (error) => {
        if (readySettled) {
          return;
        }
        readySettled = true;
        clearReadyTimeout();
        rejectReady(error);
      };
      // Called when the session is closed locally (close() or abort) before it
      // became ready: wakes any send() still waiting so it can observe `closed`
      // and return, instead of waiting forever on a promise nobody will settle.
      const releaseReady = () => {
        clearReadyTimeout();
        if (readySettled) {
          return;
        }
        readySettled = true;
        resolveReady();
      };
      const sendRaw = (payload) => {
        if (socketClosed) {
          // The socket is gone for good; queuing would only grow memory.
          return;
        }
        const serialized = JSON.stringify(payload);
        if (!socketOpen) {
          pendingMessages.push(serialized);
          return;
        }
        socket.send(serialized);
      };
      const flush = () => {
        for (const message of pendingMessages.splice(0)) {
          socket.send(message);
        }
      };
      const emitClose = async (code, reason, recoverable = false) => {
        if (closeEmitted) {
          return;
        }
        closeEmitted = true;
        await emit(listeners, "close", {
          code,
          reason,
          recoverable,
          type: "close"
        });
      };
      // Commits buffered input audio and asks for a response.
      const commitAudio = () => {
        if (closed || !pendingAudio) {
          return;
        }
        pendingAudio = false;
        sendRaw({ type: "input_audio_buffer.commit" });
        sendRaw(responseCreateEvent(runtimeConfig));
      };
      const resetAudioTimer = () => {
        clearAudioTimer();
        audioCommitTimer = setTimeout(() => {
          audioCommitTimer = undefined;
          try {
            commitAudio();
          } catch (failure) {
            dispatch("error", {
              error: toListenerError(failure),
              recoverable: true,
              type: "error"
            });
          }
        }, autoCommitSilenceMs);
      };

      // Handles one parsed server event.
      const handleServerEvent = (payload) => {
        switch (payload.type) {
          case "session.created":
          case "session.updated":
            markReady();
            return;
          case "conversation.item.input_audio_transcription.delta": {
            const itemId = typeof payload.item_id === "string" ? payload.item_id : undefined;
            const delta = typeof payload.delta === "string" ? payload.delta : undefined;
            if (!itemId || !delta) {
              return;
            }
            // Deltas are accumulated per item so listeners always get the text so far.
            const text = `${partials.get(itemId) ?? ""}${delta}`;
            partials.set(itemId, text);
            dispatch("partial", {
              receivedAt: Date.now(),
              transcript: audioTranscript(itemId, text, false),
              type: "partial"
            });
            return;
          }
          case "conversation.item.input_audio_transcription.completed": {
            const itemId = typeof payload.item_id === "string" ? payload.item_id : undefined;
            const transcript = typeof payload.transcript === "string" ? payload.transcript : undefined;
            if (!itemId || !transcript || finals.has(itemId)) {
              return;
            }
            finals.add(itemId);
            partials.set(itemId, transcript);
            dispatch("final", {
              receivedAt: Date.now(),
              transcript: audioTranscript(itemId, transcript, true),
              type: "final"
            });
            dispatch("endOfTurn", {
              receivedAt: Date.now(),
              reason: "vendor",
              type: "endOfTurn"
            });
            return;
          }
          case "conversation.item.input_audio_transcription.failed": {
            const error = payload.error && typeof payload.error === "object" ? payload.error : undefined;
            dispatch("error", {
              code: error?.code,
              error: new Error(resolveErrorMessage(error ?? payload)),
              recoverable: true,
              type: "error"
            });
            return;
          }
          case "response.audio.delta":
          case "response.output_audio.delta": {
            const delta = typeof payload.delta === "string" ? payload.delta : undefined;
            if (!delta) {
              return;
            }
            dispatch("audio", {
              chunk: Buffer.from(delta, "base64"),
              format: OPENAI_PCM24_FORMAT,
              receivedAt: Date.now(),
              type: "audio"
            });
            return;
          }
          case "response.audio_transcript.delta":
          case "response.output_audio_transcript.delta":
          case "response.output_text.delta": {
            if (!shouldEmitResponseTranscripts) {
              return;
            }
            const delta = typeof payload.delta === "string" ? payload.delta : undefined;
            if (!delta) {
              return;
            }
            dispatch("partial", {
              receivedAt: Date.now(),
              transcript: textTranscript(delta),
              type: "partial"
            });
            return;
          }
          case "response.audio_transcript.done":
          case "response.output_audio_transcript.done":
          case "response.output_text.done": {
            if (!shouldEmitResponseTranscripts) {
              return;
            }
            // Audio-transcript events carry the result in `transcript`;
            // the text-output event carries it in `text`.
            const transcript = typeof payload.transcript === "string"
              ? payload.transcript
              : typeof payload.text === "string" ? payload.text : undefined;
            if (!transcript) {
              return;
            }
            dispatch("final", {
              receivedAt: Date.now(),
              transcript: textTranscript(transcript),
              type: "final"
            });
            dispatch("endOfTurn", {
              receivedAt: Date.now(),
              reason: "vendor",
              type: "endOfTurn"
            });
            return;
          }
          case "error": {
            const error = payload.error && typeof payload.error === "object" ? payload.error : {};
            const message = resolveErrorMessage(error);
            dispatch("error", {
              code: error.code,
              error: new Error(message),
              recoverable: true,
              type: "error"
            });
            // Only a rejection of our own session.update makes the session unusable.
            if (!ready && error.event_id === primaryUpdate.event_id) {
              failReady(new Error(message));
            }
            return;
          }
          default:
            return;
        }
      };

      socket.addEventListener("open", () => {
        socketOpen = true;
        sendRaw(primaryUpdate);
        flush();
        readyTimeout = setTimeout(() => {
          failReady(new Error("OpenAI realtime session did not become ready."));
        }, 8000);
      }, { once: true });
      socket.addEventListener("message", (event) => {
        try {
          handleServerEvent(JSON.parse(String(event.data)));
        } catch (error) {
          dispatch("error", {
            error: new Error(resolveErrorMessage(error)),
            recoverable: true,
            type: "error"
          });
        }
      });
      socket.addEventListener("error", (event) => {
        const error = new Error(resolveErrorMessage(event));
        failReady(error);
        dispatch("error", {
          error,
          recoverable: false,
          type: "error"
        });
      });
      socket.addEventListener("close", (event) => {
        socketOpen = false;
        socketClosed = true;
        pendingMessages.length = 0;
        clearAudioTimer();
        // No-op when readiness was already settled (ready, failed, or released).
        failReady(new Error("OpenAI realtime session closed before ready."));
        emitClose(event.code, event.reason || undefined, event.code !== 1000).catch((failure) => {
          dispatch("error", {
            error: toListenerError(failure),
            recoverable: true,
            type: "error"
          });
        });
      });

      const handleAbort = () => {
        if (closed) {
          return;
        }
        closed = true;
        clearAudioTimer();
        releaseReady();
        socket.close(1000, "aborted");
      };
      if (openOptions.signal) {
        if (openOptions.signal.aborted) {
          handleAbort();
        } else {
          openOptions.signal.addEventListener("abort", handleAbort, { once: true });
        }
      }

      return {
        close: async (reason) => {
          if (closed) {
            return;
          }
          closed = true;
          clearAudioTimer();
          openOptions.signal?.removeEventListener("abort", handleAbort);
          // NOTE(review): audio appended but not yet committed is discarded here.
          // The previous code called commitAudio() at this point, but only after
          // `closed` had been set, so that call always returned early. Confirm
          // whether a final commit before closing is actually wanted.
          pendingAudio = false;
          releaseReady();
          socket.close(1000, reason);
          await emitClose(1000, reason, false);
        },
        on: (event, handler) => {
          listeners[event].add(handler);
          return () => {
            listeners[event].delete(handler);
          };
        },
        send: async (input) => {
          await readyPromise;
          if (closed) {
            return;
          }
          if (typeof input === "string") {
            const text = input.trim();
            if (!text) {
              return;
            }
            await emit(listeners, "final", {
              receivedAt: Date.now(),
              transcript: textTranscript(text),
              type: "final"
            });
            await emit(listeners, "endOfTurn", {
              receivedAt: Date.now(),
              reason: "manual",
              type: "endOfTurn"
            });
            sendRaw({
              item: {
                content: [{ text, type: "input_text" }],
                role: "user",
                type: "message"
              },
              type: "conversation.item.create"
            });
            sendRaw(responseCreateEvent(runtimeConfig));
            return;
          }
          sendRaw({
            audio: toBase643(input),
            type: "input_audio_buffer.append"
          });
          pendingAudio = true;
          // Restart the silence window; the buffer is committed once no more audio arrives.
          resetAudioTimer();
        }
      };
    }
  };
};
17666
+ // src/openaiTTS.ts
17667
// Format attached to audio chunks emitted by the OpenAI TTS adapter:
// raw, mono, 16-bit little-endian PCM at 24 kHz.
var OPENAI_PCM24_FORMAT2 = {
  channels: 1,
  container: "raw",
  encoding: "pcm_s16le",
  sampleRateHz: 24_000
};
17098
17673
  var resolveInstructions = async (instructions, input) => {
17099
17674
  if (typeof instructions === "function") {
17100
17675
  return instructions(input);
@@ -17102,7 +17677,7 @@ var resolveInstructions = async (instructions, input) => {
17102
17677
  return instructions;
17103
17678
  };
17104
17679
  var createTTSHTTPError = (response) => new Error(`OpenAI voice TTS failed: HTTP ${response.status}`);
17105
- var emit = async (listeners, event, payload) => {
17680
+ var emit2 = async (listeners, event, payload) => {
17106
17681
  for (const handler of listeners[event]) {
17107
17682
  await Promise.resolve(handler(payload));
17108
17683
  }
@@ -17132,7 +17707,7 @@ var createOpenAIVoiceTTS = (options) => {
17132
17707
  closed = true;
17133
17708
  abortController.abort();
17134
17709
  openOptions.signal?.removeEventListener("abort", signalAbort);
17135
- await emit(listeners, "close", {
17710
+ await emit2(listeners, "close", {
17136
17711
  reason,
17137
17712
  type: "close"
17138
17713
  });
@@ -17175,9 +17750,9 @@ var createOpenAIVoiceTTS = (options) => {
17175
17750
  if (!response.body) {
17176
17751
  const chunk = new Uint8Array(await response.arrayBuffer());
17177
17752
  if (!closed && chunk.byteLength > 0) {
17178
- await emit(listeners, "audio", {
17753
+ await emit2(listeners, "audio", {
17179
17754
  chunk,
17180
- format: OPENAI_PCM24_FORMAT,
17755
+ format: OPENAI_PCM24_FORMAT2,
17181
17756
  receivedAt: Date.now(),
17182
17757
  type: "audio"
17183
17758
  });
@@ -17192,9 +17767,9 @@ var createOpenAIVoiceTTS = (options) => {
17192
17767
  break;
17193
17768
  }
17194
17769
  if (value.byteLength > 0) {
17195
- await emit(listeners, "audio", {
17770
+ await emit2(listeners, "audio", {
17196
17771
  chunk: new Uint8Array(value),
17197
- format: OPENAI_PCM24_FORMAT,
17772
+ format: OPENAI_PCM24_FORMAT2,
17198
17773
  receivedAt: Date.now(),
17199
17774
  type: "audio"
17200
17775
  });
@@ -17208,7 +17783,7 @@ var createOpenAIVoiceTTS = (options) => {
17208
17783
  return;
17209
17784
  }
17210
17785
  const normalizedError = error instanceof Error ? error : new Error(String(error));
17211
- await emit(listeners, "error", {
17786
+ await emit2(listeners, "error", {
17212
17787
  error: normalizedError,
17213
17788
  recoverable: true,
17214
17789
  type: "error"
@@ -19778,11 +20353,11 @@ var createResolver = (options) => {
19778
20353
  selectedProvider: preferred
19779
20354
  };
19780
20355
  };
19781
- const emit2 = async (event, input) => {
20356
+ const emit3 = async (event, input) => {
19782
20357
  await options.onProviderEvent?.(event, input);
19783
20358
  };
19784
20359
  return {
19785
- emit: emit2,
20360
+ emit: emit3,
19786
20361
  getSuppressionRemainingMs,
19787
20362
  providerIds,
19788
20363
  recordError,
@@ -22301,6 +22876,7 @@ export {
22301
22876
  createPhraseHintCorrectionHandler,
22302
22877
  createOpenAIVoiceTTS,
22303
22878
  createOpenAIVoiceAssistantModel,
22879
+ createOpenAIRealtimeAdapter,
22304
22880
  createMemoryVoiceTelephonyWebhookIdempotencyStore,
22305
22881
  createJSONVoiceAssistantModel,
22306
22882
  createId,