@absolutejs/voice 0.0.22-beta.127 → 0.0.22-beta.128

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1324,6 +1324,59 @@ app.use(
1324
1324
 
1325
1325
  Client state now exposes `assistantAudio` on the stream/controller helpers, so apps can buffer or play synthesized chunks without inventing a second transport.
1326
1326
 
1327
+ ## OpenAI Realtime
1328
+
1329
+ Use `createOpenAIRealtimeAdapter(...)` when you want a direct OpenAI Realtime speech-to-speech path for live smoke tests, duplex benchmarks, or custom realtime orchestration. It implements the same `RealtimeAdapter` contract used by the benchmark harness: OpenAI `response.output_audio.delta` chunks are forwarded as AbsoluteJS voice audio events, and the adapter still emits normalized transcript, error, and close events.
1330
+
1331
+ ```ts
1332
+ import { createOpenAIRealtimeAdapter } from '@absolutejs/voice';
1333
+ import { runTTSAdapterFixture } from '@absolutejs/voice/testing';
1334
+
1335
+ const realtime = createOpenAIRealtimeAdapter({
1336
+ apiKey: process.env.OPENAI_API_KEY!,
1337
+ instructions: 'Answer in one concise sentence.',
1338
+ model: 'gpt-realtime',
1339
+ voice: 'marin'
1340
+ });
1341
+
1342
+ app.use(
1343
+ voice({
1344
+ path: '/voice',
1345
+ realtime,
1346
+ realtimeInputFormat: {
1347
+ channels: 1,
1348
+ container: 'raw',
1349
+ encoding: 'pcm_s16le',
1350
+ sampleRateHz: 24000
1351
+ },
1352
+ session,
1353
+ onTurn: async ({ turn }) => ({
1354
+ assistantText: `You said: ${turn.text}`
1355
+ }),
1356
+ onComplete: async () => {}
1357
+ })
1358
+ );
1359
+
1360
+ const report = await runTTSAdapterFixture(
1361
+ realtime,
1362
+ {
1363
+ id: 'openai-realtime-smoke',
1364
+ text: 'Say exactly: AbsoluteJS realtime is online.',
1365
+ title: 'OpenAI Realtime smoke'
1366
+ },
1367
+ {
1368
+ realtimeFormat: {
1369
+ channels: 1,
1370
+ container: 'raw',
1371
+ encoding: 'pcm_s16le',
1372
+ sampleRateHz: 24000
1373
+ }
1374
+ }
1375
+ );
1376
+ ```
1377
+
1378
+ For server-to-server use, the adapter opens a WebSocket to OpenAI, sends `session.update`, streams text or base64-encoded PCM input, and emits raw 24 kHz mono `pcm_s16le` assistant audio. Input must also be raw 24 kHz mono `pcm_s16le`, because that is the OpenAI Realtime PCM format; the adapter throws when it is opened with any other format. The main `voice(...)` route can now run either in cascaded mode (`stt` plus optional `tts`) or in direct realtime mode (`realtime`). Browser demos should make sure the captured PCM format matches `realtimeInputFormat`, or resample before sending audio.
1379
+
1327
1380
  If you want a minimal browser playback path, use the client audio player:
1328
1381
 
1329
1382
  ```ts
package/dist/index.d.ts CHANGED
@@ -31,6 +31,7 @@ export { createVoicePhoneAgent } from './phoneAgent';
31
31
  export { createStoredVoiceCallReviewArtifact, createStoredVoiceExternalObjectMap, createStoredVoiceIntegrationEvent, createStoredVoiceOpsTask, createVoiceFileExternalObjectMapStore, createVoiceFileAssistantMemoryStore, createVoiceFileAuditEventStore, createVoiceFileAuditSinkDeliveryStore, createVoiceFileCampaignStore, createVoiceFileIntegrationEventStore, createVoiceFileReviewStore, createVoiceFileRuntimeStorage, createVoiceFileSessionStore, createVoiceFileTaskStore, createVoiceFileTraceSinkDeliveryStore, createVoiceFileTraceEventStore } from './fileStore';
32
32
  export { createVoiceAssistantMemoryHandle, createVoiceAssistantMemoryRecord, createVoiceMemoryAssistantMemoryStore, resolveVoiceAssistantMemoryNamespace } from './assistantMemory';
33
33
  export { createAnthropicVoiceAssistantModel, createGeminiVoiceAssistantModel, createJSONVoiceAssistantModel, createOpenAIVoiceAssistantModel, resolveVoiceProviderRoutingPolicyPreset, createVoiceProviderRouter } from './modelAdapters';
34
+ export { createOpenAIRealtimeAdapter } from './openaiRealtime';
34
35
  export { createOpenAIVoiceTTS } from './openaiTTS';
35
36
  export { createVoiceProviderHealthHTMLHandler, createVoiceProviderHealthJSONHandler, createVoiceProviderHealthRoutes, renderVoiceProviderHealthHTML, summarizeVoiceProviderHealth } from './providerHealth';
36
37
  export { createVoiceProviderCapabilityHTMLHandler, createVoiceProviderCapabilityJSONHandler, createVoiceProviderCapabilityRoutes, renderVoiceProviderCapabilityHTML, summarizeVoiceProviderCapabilities } from './providerCapabilities';
@@ -81,6 +82,7 @@ export type { VoiceWorkflowContract, VoiceWorkflowContractDefinition, VoiceWorkf
81
82
  export type { VoiceSessionListHTMLHandlerOptions, VoiceSessionListItem, VoiceSessionListOptions, VoiceSessionListRoutesOptions, VoiceSessionListStatus, VoiceSessionReplay, VoiceSessionReplayHTMLHandlerOptions, VoiceSessionReplayOptions, VoiceSessionReplayRoutesOptions, VoiceSessionReplayTurn } from './sessionReplay';
82
83
  export type { AnthropicVoiceAssistantModelOptions, GeminiVoiceAssistantModelOptions, OpenAIVoiceAssistantModelOptions, VoiceProviderRouterEvent, VoiceProviderRouterFallbackMode, VoiceProviderRouterHealthOptions, VoiceProviderRouterOptions, VoiceProviderRouterPolicy, VoiceProviderRouterPolicyPreset, VoiceProviderRouterPolicyWeights, VoiceProviderRouterProviderHealth, VoiceProviderRouterProviderProfile, VoiceProviderRouterStrategy, VoiceJSONAssistantModelHandler, VoiceJSONAssistantModelOptions } from './modelAdapters';
83
84
  export type { OpenAIVoiceTTSOptions, OpenAIVoiceTTSVoice } from './openaiTTS';
85
+ export type { OpenAIRealtimeAdapterOptions, OpenAIRealtimeModel, OpenAIRealtimeNoiseReduction, OpenAIRealtimeResponseMode, OpenAIRealtimeTranscriptionModel, OpenAIRealtimeVoice } from './openaiRealtime';
84
86
  export type { VoiceProviderHealthStatus, VoiceProviderHealthSummary, VoiceProviderHealthSummaryOptions } from './providerHealth';
85
87
  export type { VoiceProviderCapabilityDefinition, VoiceProviderCapabilityHandlerOptions, VoiceProviderCapabilityHTMLHandlerOptions, VoiceProviderCapabilityKind, VoiceProviderCapabilityOptions, VoiceProviderCapabilityReport, VoiceProviderCapabilityRoutesOptions, VoiceProviderCapabilitySummary } from './providerCapabilities';
86
88
  export type { VoiceProviderRoutingContractDefinition, VoiceProviderRoutingContractIssue, VoiceProviderRoutingContractReport, VoiceProviderRoutingContractRunOptions, VoiceProviderRoutingExpectation, VoiceProviderRoutingStatus } from './providerRoutingContract';
package/dist/index.js CHANGED
@@ -3413,6 +3413,12 @@ var DEFAULT_FORMAT = {
3413
3413
  encoding: "pcm_s16le",
3414
3414
  sampleRateHz: 16000
3415
3415
  };
3416
+ var DEFAULT_REALTIME_FORMAT = {
3417
+ channels: 1,
3418
+ container: "raw",
3419
+ encoding: "pcm_s16le",
3420
+ sampleRateHz: 24000
3421
+ };
3416
3422
  var toError = (value) => value instanceof Error ? value : new Error(String(value));
3417
3423
  var createEmptyCurrentTurn = () => ({
3418
3424
  finalText: "",
@@ -3793,6 +3799,23 @@ var createVoiceSession = (options) => {
3793
3799
  });
3794
3800
  }
3795
3801
  };
3802
+ const sendAssistantAudio = async (chunk, input) => {
3803
+ const normalizedChunk = chunk instanceof Uint8Array ? new Uint8Array(chunk) : chunk instanceof ArrayBuffer ? new Uint8Array(chunk.slice(0)) : new Uint8Array(chunk.buffer.slice(chunk.byteOffset, chunk.byteOffset + chunk.byteLength));
3804
+ await send({
3805
+ chunkBase64: encodeBase64(normalizedChunk),
3806
+ format: input.format,
3807
+ receivedAt: input.receivedAt,
3808
+ turnId: activeTTSTurnId,
3809
+ type: "audio"
3810
+ });
3811
+ if (activeTTSTurnId) {
3812
+ await appendTurnLatencyStage({
3813
+ at: input.receivedAt,
3814
+ stage: "assistant_audio_received",
3815
+ turnId: activeTTSTurnId
3816
+ });
3817
+ }
3818
+ };
3796
3819
  const scheduleTurnCommit = (delayMs, reason, reset = true) => {
3797
3820
  if (!reset && silenceTimer) {
3798
3821
  return;
@@ -4494,8 +4517,12 @@ var createVoiceSession = (options) => {
4494
4517
  if (sttSession) {
4495
4518
  return sttSession;
4496
4519
  }
4497
- const openedSession = await options.stt.open({
4498
- format: DEFAULT_FORMAT,
4520
+ const inputAdapter = options.realtime ?? options.stt;
4521
+ if (!inputAdapter) {
4522
+ throw new Error("Voice session requires either an stt or realtime adapter.");
4523
+ }
4524
+ const openedSession = await inputAdapter.open({
4525
+ format: options.realtime ? options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT : DEFAULT_FORMAT,
4499
4526
  languageStrategy: options.languageStrategy,
4500
4527
  lexicon,
4501
4528
  phraseHints,
@@ -4530,6 +4557,16 @@ var createVoiceSession = (options) => {
4530
4557
  openedSession.on("close", (event) => {
4531
4558
  runAdapterEvent("adapter.close", () => handleClose(event));
4532
4559
  });
4560
+ if (options.realtime) {
4561
+ openedSession.on("audio", ({ chunk, format, receivedAt }) => {
4562
+ runAdapterEvent("adapter.audio", async () => {
4563
+ await sendAssistantAudio(chunk, {
4564
+ format,
4565
+ receivedAt
4566
+ });
4567
+ });
4568
+ });
4569
+ }
4533
4570
  return openedSession;
4534
4571
  };
4535
4572
  const ensureTTSSession = async () => {
@@ -4554,21 +4591,10 @@ var createVoiceSession = (options) => {
4554
4591
  if (ttsSession !== openedSession) {
4555
4592
  return;
4556
4593
  }
4557
- const normalizedChunk = chunk instanceof Uint8Array ? new Uint8Array(chunk) : chunk instanceof ArrayBuffer ? new Uint8Array(chunk.slice(0)) : new Uint8Array(chunk.buffer.slice(chunk.byteOffset, chunk.byteOffset + chunk.byteLength));
4558
- await send({
4559
- chunkBase64: encodeBase64(normalizedChunk),
4594
+ await sendAssistantAudio(chunk, {
4560
4595
  format,
4561
- receivedAt,
4562
- turnId: activeTTSTurnId,
4563
- type: "audio"
4596
+ receivedAt
4564
4597
  });
4565
- if (activeTTSTurnId) {
4566
- await appendTurnLatencyStage({
4567
- at: receivedAt,
4568
- stage: "assistant_audio_received",
4569
- turnId: activeTTSTurnId
4570
- });
4571
- }
4572
4598
  });
4573
4599
  });
4574
4600
  openedSession.on("error", (event) => {
@@ -4647,7 +4673,8 @@ var createVoiceSession = (options) => {
4647
4673
  await appendTrace({
4648
4674
  payload: {
4649
4675
  text: output.assistantText,
4650
- ttsConfigured: Boolean(options.tts)
4676
+ ttsConfigured: Boolean(options.tts),
4677
+ realtimeConfigured: Boolean(options.realtime)
4651
4678
  },
4652
4679
  session,
4653
4680
  turnId: turn.id,
@@ -4679,9 +4706,35 @@ var createVoiceSession = (options) => {
4679
4706
  turnId: turn.id,
4680
4707
  type: "turn.assistant"
4681
4708
  });
4709
+ } else if (options.realtime) {
4710
+ const activeRealtimeSession = await ensureAdapter();
4711
+ const realtimeStartedAt = Date.now();
4712
+ activeTTSTurnId = turn.id;
4713
+ await appendTurnLatencyStage({
4714
+ at: realtimeStartedAt,
4715
+ session,
4716
+ stage: "tts_send_started",
4717
+ turnId: turn.id
4718
+ });
4719
+ await activeRealtimeSession.send(output.assistantText);
4720
+ await appendTurnLatencyStage({
4721
+ session,
4722
+ stage: "tts_send_completed",
4723
+ turnId: turn.id
4724
+ });
4725
+ await appendTrace({
4726
+ payload: {
4727
+ elapsedMs: Date.now() - realtimeStartedAt,
4728
+ mode: "realtime",
4729
+ status: "sent"
4730
+ },
4731
+ session,
4732
+ turnId: turn.id,
4733
+ type: "turn.assistant"
4734
+ });
4682
4735
  }
4683
4736
  } catch (error) {
4684
- logger.warn("voice tts send failed", {
4737
+ logger.warn("voice assistant audio send failed", {
4685
4738
  error: toError(error).message,
4686
4739
  sessionId: options.id,
4687
4740
  turnId: turn.id
@@ -4689,7 +4742,7 @@ var createVoiceSession = (options) => {
4689
4742
  await appendTrace({
4690
4743
  payload: {
4691
4744
  error: toError(error).message,
4692
- status: "tts-send-failed"
4745
+ status: options.realtime ? "realtime-send-failed" : "tts-send-failed"
4693
4746
  },
4694
4747
  session,
4695
4748
  turnId: turn.id,
@@ -4894,7 +4947,7 @@ var createVoiceSession = (options) => {
4894
4947
  turn,
4895
4948
  type: "turn"
4896
4949
  });
4897
- if (options.sttLifecycle === "turn-scoped") {
4950
+ if (options.stt && options.sttLifecycle === "turn-scoped") {
4898
4951
  await closeAdapter("turn-commit");
4899
4952
  }
4900
4953
  await completeTurn(updatedSession, turn);
@@ -5307,6 +5360,9 @@ var resolveLexicon = async (config, input) => {
5307
5360
  return normalizeLexicon(config.lexicon);
5308
5361
  };
5309
5362
  var voice = (config) => {
5363
+ if (!config.stt && !config.realtime) {
5364
+ throw new Error("voice requires either an stt or realtime adapter.");
5365
+ }
5310
5366
  const runtime = {
5311
5367
  activeSessions: new Map,
5312
5368
  logger: resolveLogger(config.logger),
@@ -5381,6 +5437,8 @@ var voice = (config) => {
5381
5437
  socket: createSocketAdapter(ws),
5382
5438
  store: config.session,
5383
5439
  trace: config.trace,
5440
+ realtime: config.realtime,
5441
+ realtimeInputFormat: config.realtimeInputFormat,
5384
5442
  stt: config.stt,
5385
5443
  sttFallback: sessionOptions.sttFallback,
5386
5444
  sttLifecycle: sessionOptions.sttLifecycle,
@@ -17088,13 +17146,517 @@ var createGeminiVoiceAssistantModel = (options) => {
17088
17146
  }
17089
17147
  };
17090
17148
  };
17091
- // src/openaiTTS.ts
17149
+ // src/openaiRealtime.ts
17150
+ var DEFAULT_AUTO_COMMIT_SILENCE_MS = 450;
17151
+ var DEFAULT_BASE_URL = "wss://api.openai.com/v1/realtime";
17152
+ var DEFAULT_MODEL = "gpt-realtime";
17153
+ var DEFAULT_TRANSCRIPTION_MODEL = "gpt-4o-mini-transcribe";
17154
+ var DEFAULT_VOICE = "marin";
17092
17155
  var OPENAI_PCM24_FORMAT = {
17093
17156
  channels: 1,
17094
17157
  container: "raw",
17095
17158
  encoding: "pcm_s16le",
17096
17159
  sampleRateHz: 24000
17097
17160
  };
17161
+ var createListenerMap = () => ({
17162
+ audio: new Set,
17163
+ close: new Set,
17164
+ endOfTurn: new Set,
17165
+ error: new Set,
17166
+ final: new Set,
17167
+ partial: new Set
17168
+ });
17169
+ var emit = async (listeners, event, payload) => {
17170
+ for (const listener of listeners[event]) {
17171
+ await listener(payload);
17172
+ }
17173
+ };
17174
+ var compact = (value) => Object.fromEntries(Object.entries(value).filter(([, entry]) => entry !== undefined));
17175
+ var resolveErrorMessage = (error) => {
17176
+ if (typeof error === "string" && error.trim()) {
17177
+ return error;
17178
+ }
17179
+ if (error instanceof Error && error.message.trim()) {
17180
+ return error.message;
17181
+ }
17182
+ if (error && typeof error === "object") {
17183
+ const record = error;
17184
+ for (const key of ["message", "reason", "description", "detail"]) {
17185
+ const candidate = record[key];
17186
+ if (typeof candidate === "string" && candidate.trim()) {
17187
+ return candidate;
17188
+ }
17189
+ }
17190
+ if ("error" in record) {
17191
+ return resolveErrorMessage(record.error);
17192
+ }
17193
+ try {
17194
+ return JSON.stringify(error);
17195
+ } catch {}
17196
+ }
17197
+ return "OpenAI realtime error";
17198
+ };
17199
+ var toUint8Array2 = (value) => value instanceof ArrayBuffer ? new Uint8Array(value) : new Uint8Array(value.buffer, value.byteOffset, value.byteLength);
17200
+ var toBase643 = (value) => Buffer.from(toUint8Array2(value)).toString("base64");
17201
+ var textTranscript = (text) => ({
17202
+ id: `openai-realtime-text-${crypto.randomUUID()}`,
17203
+ isFinal: true,
17204
+ text,
17205
+ vendor: "openai"
17206
+ });
17207
+ var audioTranscript = (itemId, text, isFinal) => ({
17208
+ id: itemId,
17209
+ isFinal,
17210
+ text,
17211
+ vendor: "openai"
17212
+ });
17213
+ var assertPCM24Mono = (format) => {
17214
+ if (format.container !== "raw" || format.encoding !== "pcm_s16le" || format.sampleRateHz !== 24000 || format.channels !== 1) {
17215
+ throw new Error("OpenAI Realtime requires raw pcm_s16le audio at 24kHz mono.");
17216
+ }
17217
+ };
17218
+ var resolveTranscriptionLanguage = (options, openOptions) => {
17219
+ if (options.inputTranscriptionLanguage?.trim()) {
17220
+ return options.inputTranscriptionLanguage.trim();
17221
+ }
17222
+ if (openOptions.languageStrategy?.mode !== "fixed") {
17223
+ return;
17224
+ }
17225
+ const language = openOptions.languageStrategy.primaryLanguage.trim();
17226
+ return language.length > 0 ? language : undefined;
17227
+ };
17228
+ var phraseHintPrompt = (options) => {
17229
+ const terms = (options.phraseHints ?? []).flatMap((hint) => [
17230
+ hint.text,
17231
+ ...hint.aliases ?? []
17232
+ ]);
17233
+ const unique = terms.filter((value, index) => terms.indexOf(value) === index);
17234
+ return unique.length ? `Prioritize accurate recovery of these phrases when heard: ${unique.join(", ")}.` : undefined;
17235
+ };
17236
+ var lexiconPrompt = (options) => {
17237
+ const entries = (options.lexicon ?? []).flatMap((entry) => {
17238
+ const details = [
17239
+ entry.text,
17240
+ entry.pronunciation ? `pronounced ${entry.pronunciation}` : undefined,
17241
+ entry.aliases?.length ? `may also sound like ${entry.aliases.join(", ")}` : undefined,
17242
+ entry.language ? `language ${entry.language}` : undefined
17243
+ ].filter((value) => !!value);
17244
+ return details.length ? [details.join(" - ")] : [];
17245
+ });
17246
+ return entries.length ? `Use this pronunciation lexicon when transcribing: ${entries.join("; ")}.` : undefined;
17247
+ };
17248
+ var withOpenPrompts = (options, openOptions) => {
17249
+ const phraseHints = phraseHintPrompt(openOptions);
17250
+ const lexicon = lexiconPrompt(openOptions);
17251
+ if (!phraseHints && !lexicon) {
17252
+ return options;
17253
+ }
17254
+ return {
17255
+ ...options,
17256
+ inputTranscriptionPrompt: [
17257
+ options.inputTranscriptionPrompt,
17258
+ phraseHints,
17259
+ lexicon
17260
+ ].filter((value) => !!value?.trim()).join(`
17261
+
17262
+ `)
17263
+ };
17264
+ };
17265
+ var sessionUpdateEvent = (options, openOptions) => {
17266
+ const responseMode = options.responseMode ?? "audio";
17267
+ const language = resolveTranscriptionLanguage(options, openOptions);
17268
+ const transcription = options.inputTranscriptionModel === null ? null : compact({
17269
+ language,
17270
+ model: options.inputTranscriptionModel ?? DEFAULT_TRANSCRIPTION_MODEL,
17271
+ prompt: options.inputTranscriptionPrompt
17272
+ });
17273
+ return {
17274
+ event_id: `session-update-${crypto.randomUUID()}`,
17275
+ session: compact({
17276
+ audio: {
17277
+ input: compact({
17278
+ format: {
17279
+ rate: 24000,
17280
+ type: "audio/pcm"
17281
+ },
17282
+ noise_reduction: options.noiseReduction ? { type: options.noiseReduction } : undefined,
17283
+ transcription,
17284
+ turn_detection: null
17285
+ }),
17286
+ output: responseMode === "audio" ? compact({
17287
+ format: {
17288
+ rate: 24000,
17289
+ type: "audio/pcm"
17290
+ },
17291
+ speed: options.speed,
17292
+ voice: options.voice ?? DEFAULT_VOICE
17293
+ }) : undefined
17294
+ },
17295
+ instructions: options.instructions,
17296
+ max_output_tokens: options.maxOutputTokens,
17297
+ output_modalities: [responseMode],
17298
+ temperature: options.temperature,
17299
+ type: "realtime"
17300
+ }),
17301
+ type: "session.update"
17302
+ };
17303
+ };
17304
+ var responseCreateEvent = (options) => {
17305
+ const responseMode = options.responseMode ?? "audio";
17306
+ return {
17307
+ response: compact({
17308
+ audio: responseMode === "audio" ? {
17309
+ output: compact({
17310
+ format: {
17311
+ rate: 24000,
17312
+ type: "audio/pcm"
17313
+ },
17314
+ voice: options.voice ?? DEFAULT_VOICE
17315
+ })
17316
+ } : undefined,
17317
+ conversation: "auto",
17318
+ max_output_tokens: options.maxOutputTokens,
17319
+ output_modalities: [responseMode]
17320
+ }),
17321
+ type: "response.create"
17322
+ };
17323
+ };
17324
+ var createOpenAIRealtimeAdapter = (options) => {
17325
+ const baseUrl = options.baseUrl ?? DEFAULT_BASE_URL;
17326
+ const Socket = options.webSocket ?? globalThis.WebSocket;
17327
+ return {
17328
+ kind: "realtime",
17329
+ open: (openOptions) => {
17330
+ assertPCM24Mono(openOptions.format);
17331
+ const runtimeOptions = openOptions;
17332
+ const runtimeConfig = withOpenPrompts(options, runtimeOptions);
17333
+ const model = runtimeConfig.model ?? DEFAULT_MODEL;
17334
+ const listeners = createListenerMap();
17335
+ const socket = new Socket(`${baseUrl.replace(/\/$/, "")}?model=${encodeURIComponent(model)}`, {
17336
+ headers: {
17337
+ Authorization: `Bearer ${runtimeConfig.apiKey}`
17338
+ }
17339
+ });
17340
+ const primaryUpdate = sessionUpdateEvent(runtimeConfig, runtimeOptions);
17341
+ const pendingMessages = [];
17342
+ const partials = new Map;
17343
+ const finals = new Set;
17344
+ const autoCommitSilenceMs = runtimeConfig.autoCommitSilenceMs ?? DEFAULT_AUTO_COMMIT_SILENCE_MS;
17345
+ let audioCommitTimer;
17346
+ let closeEmitted = false;
17347
+ let closed = false;
17348
+ let pendingAudio = false;
17349
+ let ready = false;
17350
+ let readyTimeout;
17351
+ let socketOpen = false;
17352
+ let resolveReady;
17353
+ let rejectReady;
17354
+ const readyPromise = new Promise((resolve2, reject) => {
17355
+ resolveReady = resolve2;
17356
+ rejectReady = reject;
17357
+ });
17358
+ const clearReadyTimeout = () => {
17359
+ if (readyTimeout) {
17360
+ clearTimeout(readyTimeout);
17361
+ readyTimeout = undefined;
17362
+ }
17363
+ };
17364
+ const markReady = () => {
17365
+ if (ready || closed) {
17366
+ return;
17367
+ }
17368
+ ready = true;
17369
+ clearReadyTimeout();
17370
+ resolveReady();
17371
+ };
17372
+ const failReady = (error) => {
17373
+ if (ready || closed) {
17374
+ return;
17375
+ }
17376
+ clearReadyTimeout();
17377
+ rejectReady(error);
17378
+ };
17379
+ const sendRaw = (payload) => {
17380
+ const serialized = JSON.stringify(payload);
17381
+ if (!socketOpen) {
17382
+ pendingMessages.push(serialized);
17383
+ return;
17384
+ }
17385
+ socket.send(serialized);
17386
+ };
17387
+ const flush = () => {
17388
+ for (const message of pendingMessages.splice(0)) {
17389
+ socket.send(message);
17390
+ }
17391
+ };
17392
+ const emitClose = async (code, reason, recoverable = false) => {
17393
+ if (closeEmitted) {
17394
+ return;
17395
+ }
17396
+ closeEmitted = true;
17397
+ await emit(listeners, "close", {
17398
+ code,
17399
+ reason,
17400
+ recoverable,
17401
+ type: "close"
17402
+ });
17403
+ };
17404
+ const commitAudio = async () => {
17405
+ if (closed || !pendingAudio) {
17406
+ return;
17407
+ }
17408
+ pendingAudio = false;
17409
+ sendRaw({ type: "input_audio_buffer.commit" });
17410
+ sendRaw(responseCreateEvent(runtimeConfig));
17411
+ };
17412
+ const resetAudioTimer = () => {
17413
+ if (audioCommitTimer) {
17414
+ clearTimeout(audioCommitTimer);
17415
+ }
17416
+ audioCommitTimer = setTimeout(() => {
17417
+ commitAudio();
17418
+ }, autoCommitSilenceMs);
17419
+ };
17420
+ socket.addEventListener("open", () => {
17421
+ socketOpen = true;
17422
+ sendRaw(primaryUpdate);
17423
+ flush();
17424
+ readyTimeout = setTimeout(() => {
17425
+ failReady(new Error("OpenAI realtime session did not become ready."));
17426
+ }, 8000);
17427
+ }, { once: true });
17428
+ socket.addEventListener("message", (event) => {
17429
+ try {
17430
+ const payload = JSON.parse(String(event.data));
17431
+ const shouldEmitResponseTranscripts = runtimeConfig.emitResponseTranscripts === true;
17432
+ switch (payload.type) {
17433
+ case "session.created":
17434
+ case "session.updated":
17435
+ markReady();
17436
+ return;
17437
+ case "conversation.item.input_audio_transcription.delta": {
17438
+ const itemId = typeof payload.item_id === "string" ? payload.item_id : undefined;
17439
+ const delta = typeof payload.delta === "string" ? payload.delta : undefined;
17440
+ if (!itemId || !delta) {
17441
+ return;
17442
+ }
17443
+ const text = `${partials.get(itemId) ?? ""}${delta}`;
17444
+ partials.set(itemId, text);
17445
+ emit(listeners, "partial", {
17446
+ receivedAt: Date.now(),
17447
+ transcript: audioTranscript(itemId, text, false),
17448
+ type: "partial"
17449
+ });
17450
+ return;
17451
+ }
17452
+ case "conversation.item.input_audio_transcription.completed": {
17453
+ const itemId = typeof payload.item_id === "string" ? payload.item_id : undefined;
17454
+ const transcript = typeof payload.transcript === "string" ? payload.transcript : undefined;
17455
+ if (!itemId || !transcript || finals.has(itemId)) {
17456
+ return;
17457
+ }
17458
+ finals.add(itemId);
17459
+ partials.set(itemId, transcript);
17460
+ emit(listeners, "final", {
17461
+ receivedAt: Date.now(),
17462
+ transcript: audioTranscript(itemId, transcript, true),
17463
+ type: "final"
17464
+ });
17465
+ emit(listeners, "endOfTurn", {
17466
+ receivedAt: Date.now(),
17467
+ reason: "vendor",
17468
+ type: "endOfTurn"
17469
+ });
17470
+ return;
17471
+ }
17472
+ case "conversation.item.input_audio_transcription.failed": {
17473
+ const error = payload.error && typeof payload.error === "object" ? payload.error : undefined;
17474
+ emit(listeners, "error", {
17475
+ code: error?.code,
17476
+ error: new Error(resolveErrorMessage(error ?? payload)),
17477
+ recoverable: true,
17478
+ type: "error"
17479
+ });
17480
+ return;
17481
+ }
17482
+ case "response.audio.delta":
17483
+ case "response.output_audio.delta": {
17484
+ const delta = typeof payload.delta === "string" ? payload.delta : undefined;
17485
+ if (!delta) {
17486
+ return;
17487
+ }
17488
+ emit(listeners, "audio", {
17489
+ chunk: Buffer.from(delta, "base64"),
17490
+ format: OPENAI_PCM24_FORMAT,
17491
+ receivedAt: Date.now(),
17492
+ type: "audio"
17493
+ });
17494
+ return;
17495
+ }
17496
+ case "response.audio_transcript.delta":
17497
+ case "response.output_audio_transcript.delta":
17498
+ case "response.output_text.delta": {
17499
+ if (!shouldEmitResponseTranscripts) {
17500
+ return;
17501
+ }
17502
+ const delta = typeof payload.delta === "string" ? payload.delta : undefined;
17503
+ if (!delta) {
17504
+ return;
17505
+ }
17506
+ emit(listeners, "partial", {
17507
+ receivedAt: Date.now(),
17508
+ transcript: textTranscript(delta),
17509
+ type: "partial"
17510
+ });
17511
+ return;
17512
+ }
17513
+ case "response.audio_transcript.done":
17514
+ case "response.output_audio_transcript.done":
17515
+ case "response.output_text.done": {
17516
+ if (!shouldEmitResponseTranscripts) {
17517
+ return;
17518
+ }
17519
+ const transcript = typeof payload.transcript === "string" ? payload.transcript : undefined;
17520
+ if (!transcript) {
17521
+ return;
17522
+ }
17523
+ emit(listeners, "final", {
17524
+ receivedAt: Date.now(),
17525
+ transcript: textTranscript(transcript),
17526
+ type: "final"
17527
+ });
17528
+ emit(listeners, "endOfTurn", {
17529
+ receivedAt: Date.now(),
17530
+ reason: "vendor",
17531
+ type: "endOfTurn"
17532
+ });
17533
+ return;
17534
+ }
17535
+ case "error": {
17536
+ const error = payload.error && typeof payload.error === "object" ? payload.error : {};
17537
+ const message = resolveErrorMessage(error);
17538
+ emit(listeners, "error", {
17539
+ code: error.code,
17540
+ error: new Error(message),
17541
+ recoverable: true,
17542
+ type: "error"
17543
+ });
17544
+ if (!ready && error.event_id === primaryUpdate.event_id) {
17545
+ failReady(new Error(message));
17546
+ }
17547
+ return;
17548
+ }
17549
+ default:
17550
+ return;
17551
+ }
17552
+ } catch (error) {
17553
+ emit(listeners, "error", {
17554
+ error: new Error(resolveErrorMessage(error)),
17555
+ recoverable: true,
17556
+ type: "error"
17557
+ });
17558
+ }
17559
+ });
17560
+ socket.addEventListener("error", (event) => {
17561
+ const error = new Error(resolveErrorMessage(event));
17562
+ failReady(error);
17563
+ emit(listeners, "error", {
17564
+ error,
17565
+ recoverable: false,
17566
+ type: "error"
17567
+ });
17568
+ });
17569
+ socket.addEventListener("close", (event) => {
17570
+ socketOpen = false;
17571
+ clearReadyTimeout();
17572
+ if (!ready) {
17573
+ failReady(new Error("OpenAI realtime session closed before ready."));
17574
+ }
17575
+ emitClose(event.code, event.reason || undefined, event.code !== 1000);
17576
+ });
17577
+ if (openOptions.signal) {
17578
+ if (openOptions.signal.aborted) {
17579
+ closed = true;
17580
+ socket.close(1000, "aborted");
17581
+ } else {
17582
+ openOptions.signal.addEventListener("abort", () => {
17583
+ if (!closed) {
17584
+ closed = true;
17585
+ socket.close(1000, "aborted");
17586
+ }
17587
+ }, { once: true });
17588
+ }
17589
+ }
17590
+ return {
17591
+ close: async (reason) => {
17592
+ if (closed) {
17593
+ return;
17594
+ }
17595
+ closed = true;
17596
+ clearReadyTimeout();
17597
+ if (audioCommitTimer) {
17598
+ clearTimeout(audioCommitTimer);
17599
+ audioCommitTimer = undefined;
17600
+ }
17601
+ await commitAudio().catch(() => {});
17602
+ socket.close(1000, reason);
17603
+ await emitClose(1000, reason, false);
17604
+ },
17605
+ on: (event, handler) => {
17606
+ listeners[event].add(handler);
17607
+ return () => {
17608
+ listeners[event].delete(handler);
17609
+ };
17610
+ },
17611
+ send: async (input) => {
17612
+ await readyPromise;
17613
+ if (closed) {
17614
+ return;
17615
+ }
17616
+ if (typeof input === "string") {
17617
+ const text = input.trim();
17618
+ if (!text) {
17619
+ return;
17620
+ }
17621
+ await emit(listeners, "final", {
17622
+ receivedAt: Date.now(),
17623
+ transcript: textTranscript(text),
17624
+ type: "final"
17625
+ });
17626
+ await emit(listeners, "endOfTurn", {
17627
+ receivedAt: Date.now(),
17628
+ reason: "manual",
17629
+ type: "endOfTurn"
17630
+ });
17631
+ sendRaw({
17632
+ item: {
17633
+ content: [{ text, type: "input_text" }],
17634
+ role: "user",
17635
+ type: "message"
17636
+ },
17637
+ type: "conversation.item.create"
17638
+ });
17639
+ sendRaw(responseCreateEvent(runtimeConfig));
17640
+ return;
17641
+ }
17642
+ sendRaw({
17643
+ audio: toBase643(input),
17644
+ type: "input_audio_buffer.append"
17645
+ });
17646
+ pendingAudio = true;
17647
+ resetAudioTimer();
17648
+ }
17649
+ };
17650
+ }
17651
+ };
17652
+ };
17653
+ // src/openaiTTS.ts
17654
+ var OPENAI_PCM24_FORMAT2 = {
17655
+ channels: 1,
17656
+ container: "raw",
17657
+ encoding: "pcm_s16le",
17658
+ sampleRateHz: 24000
17659
+ };
17098
17660
  var resolveInstructions = async (instructions, input) => {
17099
17661
  if (typeof instructions === "function") {
17100
17662
  return instructions(input);
@@ -17102,7 +17664,7 @@ var resolveInstructions = async (instructions, input) => {
17102
17664
  return instructions;
17103
17665
  };
17104
17666
  var createTTSHTTPError = (response) => new Error(`OpenAI voice TTS failed: HTTP ${response.status}`);
17105
- var emit = async (listeners, event, payload) => {
17667
+ var emit2 = async (listeners, event, payload) => {
17106
17668
  for (const handler of listeners[event]) {
17107
17669
  await Promise.resolve(handler(payload));
17108
17670
  }
@@ -17132,7 +17694,7 @@ var createOpenAIVoiceTTS = (options) => {
17132
17694
  closed = true;
17133
17695
  abortController.abort();
17134
17696
  openOptions.signal?.removeEventListener("abort", signalAbort);
17135
- await emit(listeners, "close", {
17697
+ await emit2(listeners, "close", {
17136
17698
  reason,
17137
17699
  type: "close"
17138
17700
  });
@@ -17175,9 +17737,9 @@ var createOpenAIVoiceTTS = (options) => {
17175
17737
  if (!response.body) {
17176
17738
  const chunk = new Uint8Array(await response.arrayBuffer());
17177
17739
  if (!closed && chunk.byteLength > 0) {
17178
- await emit(listeners, "audio", {
17740
+ await emit2(listeners, "audio", {
17179
17741
  chunk,
17180
- format: OPENAI_PCM24_FORMAT,
17742
+ format: OPENAI_PCM24_FORMAT2,
17181
17743
  receivedAt: Date.now(),
17182
17744
  type: "audio"
17183
17745
  });
@@ -17192,9 +17754,9 @@ var createOpenAIVoiceTTS = (options) => {
17192
17754
  break;
17193
17755
  }
17194
17756
  if (value.byteLength > 0) {
17195
- await emit(listeners, "audio", {
17757
+ await emit2(listeners, "audio", {
17196
17758
  chunk: new Uint8Array(value),
17197
- format: OPENAI_PCM24_FORMAT,
17759
+ format: OPENAI_PCM24_FORMAT2,
17198
17760
  receivedAt: Date.now(),
17199
17761
  type: "audio"
17200
17762
  });
@@ -17208,7 +17770,7 @@ var createOpenAIVoiceTTS = (options) => {
17208
17770
  return;
17209
17771
  }
17210
17772
  const normalizedError = error instanceof Error ? error : new Error(String(error));
17211
- await emit(listeners, "error", {
17773
+ await emit2(listeners, "error", {
17212
17774
  error: normalizedError,
17213
17775
  recoverable: true,
17214
17776
  type: "error"
@@ -19778,11 +20340,11 @@ var createResolver = (options) => {
19778
20340
  selectedProvider: preferred
19779
20341
  };
19780
20342
  };
19781
- const emit2 = async (event, input) => {
20343
+ const emit3 = async (event, input) => {
19782
20344
  await options.onProviderEvent?.(event, input);
19783
20345
  };
19784
20346
  return {
19785
- emit: emit2,
20347
+ emit: emit3,
19786
20348
  getSuppressionRemainingMs,
19787
20349
  providerIds,
19788
20350
  recordError,
@@ -22301,6 +22863,7 @@ export {
22301
22863
  createPhraseHintCorrectionHandler,
22302
22864
  createOpenAIVoiceTTS,
22303
22865
  createOpenAIVoiceAssistantModel,
22866
+ createOpenAIRealtimeAdapter,
22304
22867
  createMemoryVoiceTelephonyWebhookIdempotencyStore,
22305
22868
  createJSONVoiceAssistantModel,
22306
22869
  createId,
@@ -0,0 +1,27 @@
1
+ import type { RealtimeAdapter } from './types';
2
+ export type OpenAIRealtimeModel = 'gpt-realtime' | 'gpt-realtime-mini' | 'gpt-4o-realtime-preview' | 'gpt-4o-mini-realtime-preview' | (string & {});
3
+ export type OpenAIRealtimeVoice = 'alloy' | 'ash' | 'ballad' | 'cedar' | 'coral' | 'echo' | 'marin' | 'sage' | 'shimmer' | 'verse' | {
4
+ id: string;
5
+ } | (string & {});
6
+ export type OpenAIRealtimeTranscriptionModel = 'gpt-4o-mini-transcribe' | 'gpt-4o-transcribe' | 'whisper-1' | (string & {});
7
+ export type OpenAIRealtimeNoiseReduction = 'near_field' | 'far_field';
8
+ export type OpenAIRealtimeResponseMode = 'audio' | 'text';
9
+ export type OpenAIRealtimeAdapterOptions = {
10
+ apiKey: string;
11
+ autoCommitSilenceMs?: number;
12
+ baseUrl?: string;
13
+ emitResponseTranscripts?: boolean;
14
+ inputTranscriptionLanguage?: string;
15
+ inputTranscriptionModel?: OpenAIRealtimeTranscriptionModel | null;
16
+ inputTranscriptionPrompt?: string;
17
+ instructions?: string;
18
+ maxOutputTokens?: number | 'inf';
19
+ model?: OpenAIRealtimeModel;
20
+ noiseReduction?: OpenAIRealtimeNoiseReduction;
21
+ responseMode?: OpenAIRealtimeResponseMode;
22
+ speed?: number;
23
+ temperature?: number;
24
+ voice?: OpenAIRealtimeVoice;
25
+ webSocket?: typeof WebSocket;
26
+ };
27
+ export declare const createOpenAIRealtimeAdapter: (options: OpenAIRealtimeAdapterOptions) => RealtimeAdapter;
@@ -2,7 +2,7 @@ import { Elysia } from 'elysia';
2
2
  import type { VoiceTelephonySetupStatus, VoiceTelephonySmokeCheck, VoiceTelephonySmokeReport } from './contract';
3
3
  import { type VoiceTelephonyOutcomePolicy, type VoiceTelephonyWebhookRoutesOptions } from '../telephonyOutcome';
4
4
  import { type VoiceCallReviewArtifact, type VoiceCallReviewConfig } from '../testing/review';
5
- import type { AudioFormat, VoiceLogger, VoicePluginConfig, VoiceSessionRecord, VoiceServerMessage } from '../types';
5
+ import type { AudioFormat, STTAdapter, VoiceLogger, VoicePluginConfig, VoiceSessionRecord, VoiceServerMessage } from '../types';
6
6
  type TwilioMediaPayload = {
7
7
  chunk?: string;
8
8
  payload: string;
@@ -78,7 +78,7 @@ export type TwilioMediaStreamSocket = {
78
78
  close: (code?: number, reason?: string) => void | Promise<void>;
79
79
  send: (data: string) => void | Promise<void>;
80
80
  };
81
- export type TwilioMediaStreamBridgeOptions<TContext = unknown, TSession extends VoiceSessionRecord = VoiceSessionRecord, TResult = unknown> = Omit<VoicePluginConfig<TContext, TSession, TResult>, 'htmx' | 'path'> & {
81
+ export type TwilioMediaStreamBridgeOptions<TContext = unknown, TSession extends VoiceSessionRecord = VoiceSessionRecord, TResult = unknown> = Omit<VoicePluginConfig<TContext, TSession, TResult>, 'htmx' | 'path' | 'stt'> & {
82
82
  clearOnInboundMedia?: boolean;
83
83
  context: TContext;
84
84
  logger?: VoiceLogger;
@@ -97,6 +97,7 @@ export type TwilioMediaStreamBridgeOptions<TContext = unknown, TSession extends
97
97
  };
98
98
  scenarioId?: string;
99
99
  sessionId?: string;
100
+ stt: STTAdapter;
100
101
  };
101
102
  export type TwilioMediaStreamBridge = {
102
103
  close: (reason?: string) => Promise<void>;
@@ -5033,6 +5033,12 @@ var DEFAULT_FORMAT = {
5033
5033
  encoding: "pcm_s16le",
5034
5034
  sampleRateHz: 16000
5035
5035
  };
5036
+ var DEFAULT_REALTIME_FORMAT = {
5037
+ channels: 1,
5038
+ container: "raw",
5039
+ encoding: "pcm_s16le",
5040
+ sampleRateHz: 24000
5041
+ };
5036
5042
  var toError = (value) => value instanceof Error ? value : new Error(String(value));
5037
5043
  var createEmptyCurrentTurn = () => ({
5038
5044
  finalText: "",
@@ -5413,6 +5419,23 @@ var createVoiceSession = (options) => {
5413
5419
  });
5414
5420
  }
5415
5421
  };
5422
+ const sendAssistantAudio = async (chunk, input) => {
5423
+ const normalizedChunk = chunk instanceof Uint8Array ? new Uint8Array(chunk) : chunk instanceof ArrayBuffer ? new Uint8Array(chunk.slice(0)) : new Uint8Array(chunk.buffer.slice(chunk.byteOffset, chunk.byteOffset + chunk.byteLength));
5424
+ await send({
5425
+ chunkBase64: encodeBase64(normalizedChunk),
5426
+ format: input.format,
5427
+ receivedAt: input.receivedAt,
5428
+ turnId: activeTTSTurnId,
5429
+ type: "audio"
5430
+ });
5431
+ if (activeTTSTurnId) {
5432
+ await appendTurnLatencyStage({
5433
+ at: input.receivedAt,
5434
+ stage: "assistant_audio_received",
5435
+ turnId: activeTTSTurnId
5436
+ });
5437
+ }
5438
+ };
5416
5439
  const scheduleTurnCommit = (delayMs, reason, reset = true) => {
5417
5440
  if (!reset && silenceTimer) {
5418
5441
  return;
@@ -6114,8 +6137,12 @@ var createVoiceSession = (options) => {
6114
6137
  if (sttSession) {
6115
6138
  return sttSession;
6116
6139
  }
6117
- const openedSession = await options.stt.open({
6118
- format: DEFAULT_FORMAT,
6140
+ const inputAdapter = options.realtime ?? options.stt;
6141
+ if (!inputAdapter) {
6142
+ throw new Error("Voice session requires either an stt or realtime adapter.");
6143
+ }
6144
+ const openedSession = await inputAdapter.open({
6145
+ format: options.realtime ? options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT : DEFAULT_FORMAT,
6119
6146
  languageStrategy: options.languageStrategy,
6120
6147
  lexicon,
6121
6148
  phraseHints,
@@ -6150,6 +6177,16 @@ var createVoiceSession = (options) => {
6150
6177
  openedSession.on("close", (event) => {
6151
6178
  runAdapterEvent("adapter.close", () => handleClose(event));
6152
6179
  });
6180
+ if (options.realtime) {
6181
+ openedSession.on("audio", ({ chunk, format, receivedAt }) => {
6182
+ runAdapterEvent("adapter.audio", async () => {
6183
+ await sendAssistantAudio(chunk, {
6184
+ format,
6185
+ receivedAt
6186
+ });
6187
+ });
6188
+ });
6189
+ }
6153
6190
  return openedSession;
6154
6191
  };
6155
6192
  const ensureTTSSession = async () => {
@@ -6174,21 +6211,10 @@ var createVoiceSession = (options) => {
6174
6211
  if (ttsSession !== openedSession) {
6175
6212
  return;
6176
6213
  }
6177
- const normalizedChunk = chunk instanceof Uint8Array ? new Uint8Array(chunk) : chunk instanceof ArrayBuffer ? new Uint8Array(chunk.slice(0)) : new Uint8Array(chunk.buffer.slice(chunk.byteOffset, chunk.byteOffset + chunk.byteLength));
6178
- await send({
6179
- chunkBase64: encodeBase64(normalizedChunk),
6214
+ await sendAssistantAudio(chunk, {
6180
6215
  format,
6181
- receivedAt,
6182
- turnId: activeTTSTurnId,
6183
- type: "audio"
6216
+ receivedAt
6184
6217
  });
6185
- if (activeTTSTurnId) {
6186
- await appendTurnLatencyStage({
6187
- at: receivedAt,
6188
- stage: "assistant_audio_received",
6189
- turnId: activeTTSTurnId
6190
- });
6191
- }
6192
6218
  });
6193
6219
  });
6194
6220
  openedSession.on("error", (event) => {
@@ -6267,7 +6293,8 @@ var createVoiceSession = (options) => {
6267
6293
  await appendTrace({
6268
6294
  payload: {
6269
6295
  text: output.assistantText,
6270
- ttsConfigured: Boolean(options.tts)
6296
+ ttsConfigured: Boolean(options.tts),
6297
+ realtimeConfigured: Boolean(options.realtime)
6271
6298
  },
6272
6299
  session,
6273
6300
  turnId: turn.id,
@@ -6299,9 +6326,35 @@ var createVoiceSession = (options) => {
6299
6326
  turnId: turn.id,
6300
6327
  type: "turn.assistant"
6301
6328
  });
6329
+ } else if (options.realtime) {
6330
+ const activeRealtimeSession = await ensureAdapter();
6331
+ const realtimeStartedAt = Date.now();
6332
+ activeTTSTurnId = turn.id;
6333
+ await appendTurnLatencyStage({
6334
+ at: realtimeStartedAt,
6335
+ session,
6336
+ stage: "tts_send_started",
6337
+ turnId: turn.id
6338
+ });
6339
+ await activeRealtimeSession.send(output.assistantText);
6340
+ await appendTurnLatencyStage({
6341
+ session,
6342
+ stage: "tts_send_completed",
6343
+ turnId: turn.id
6344
+ });
6345
+ await appendTrace({
6346
+ payload: {
6347
+ elapsedMs: Date.now() - realtimeStartedAt,
6348
+ mode: "realtime",
6349
+ status: "sent"
6350
+ },
6351
+ session,
6352
+ turnId: turn.id,
6353
+ type: "turn.assistant"
6354
+ });
6302
6355
  }
6303
6356
  } catch (error) {
6304
- logger.warn("voice tts send failed", {
6357
+ logger.warn("voice assistant audio send failed", {
6305
6358
  error: toError(error).message,
6306
6359
  sessionId: options.id,
6307
6360
  turnId: turn.id
@@ -6309,7 +6362,7 @@ var createVoiceSession = (options) => {
6309
6362
  await appendTrace({
6310
6363
  payload: {
6311
6364
  error: toError(error).message,
6312
- status: "tts-send-failed"
6365
+ status: options.realtime ? "realtime-send-failed" : "tts-send-failed"
6313
6366
  },
6314
6367
  session,
6315
6368
  turnId: turn.id,
@@ -6514,7 +6567,7 @@ var createVoiceSession = (options) => {
6514
6567
  turn,
6515
6568
  type: "turn"
6516
6569
  });
6517
- if (options.sttLifecycle === "turn-scoped") {
6570
+ if (options.stt && options.sttLifecycle === "turn-scoped") {
6518
6571
  await closeAdapter("turn-commit");
6519
6572
  }
6520
6573
  await completeTurn(updatedSession, turn);
@@ -9600,7 +9653,7 @@ var runVoiceTelephonyBenchmark = async (scenarios = getDefaultVoiceTelephonyBenc
9600
9653
  };
9601
9654
  };
9602
9655
  // src/testing/tts.ts
9603
- var DEFAULT_REALTIME_FORMAT = {
9656
+ var DEFAULT_REALTIME_FORMAT2 = {
9604
9657
  channels: 1,
9605
9658
  container: "raw",
9606
9659
  encoding: "pcm_s16le",
@@ -9659,7 +9712,7 @@ var runTTSAdapterFixture = async (adapter, fixture, options = {}) => {
9659
9712
  let audioDurationMs = 0;
9660
9713
  let audioChunkCount = 0;
9661
9714
  const session = adapter.kind === "realtime" ? await adapter.open({
9662
- format: options.realtimeFormat ?? DEFAULT_REALTIME_FORMAT,
9715
+ format: options.realtimeFormat ?? DEFAULT_REALTIME_FORMAT2,
9663
9716
  sessionId: `tts-benchmark:${fixture.id}`,
9664
9717
  ...openOptions ?? {}
9665
9718
  }) : await adapter.open({
package/dist/types.d.ts CHANGED
@@ -616,9 +616,11 @@ export type VoicePluginConfig<TContext = unknown, TSession extends VoiceSessionR
616
616
  lexicon?: VoiceLexiconEntry[] | VoiceLexiconResolver<TContext>;
617
617
  phraseHints?: VoicePhraseHint[] | VoicePhraseHintResolver<TContext>;
618
618
  preset?: VoiceRuntimePreset;
619
- stt: STTAdapter;
619
+ stt?: STTAdapter;
620
620
  sttFallback?: VoiceSTTFallbackConfig;
621
621
  sttLifecycle?: VoiceSTTLifecycle;
622
+ realtime?: RealtimeAdapter;
623
+ realtimeInputFormat?: AudioFormat;
622
624
  tts?: TTSAdapter;
623
625
  session: VoiceSessionStore<NoInfer<TSession>>;
624
626
  reconnect?: VoiceReconnectConfig;
@@ -635,7 +637,9 @@ export type CreateVoiceSessionOptions<TContext = unknown, TSession extends Voice
635
637
  id: string;
636
638
  context: TContext;
637
639
  socket: VoiceSocket;
638
- stt: STTAdapter;
640
+ stt?: STTAdapter;
641
+ realtime?: RealtimeAdapter;
642
+ realtimeInputFormat?: AudioFormat;
639
643
  tts?: TTSAdapter;
640
644
  languageStrategy?: VoiceLanguageStrategy;
641
645
  lexicon?: VoiceLexiconEntry[];
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@absolutejs/voice",
3
- "version": "0.0.22-beta.127",
3
+ "version": "0.0.22-beta.128",
4
4
  "description": "Voice primitives and Elysia plugin for AbsoluteJS",
5
5
  "repository": {
6
6
  "type": "git",