@absolutejs/voice 0.0.22-beta.127 → 0.0.22-beta.128
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +53 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +592 -29
- package/dist/openaiRealtime.d.ts +27 -0
- package/dist/telephony/twilio.d.ts +3 -2
- package/dist/testing/index.js +74 -21
- package/dist/types.d.ts +6 -2
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1324,6 +1324,59 @@ app.use(
|
|
|
1324
1324
|
|
|
1325
1325
|
Client state now exposes `assistantAudio` on the stream/controller helpers, so apps can buffer or play synthesized chunks without inventing a second transport.
|
|
1326
1326
|
|
|
1327
|
+
## OpenAI Realtime
|
|
1328
|
+
|
|
1329
|
+
Use `createOpenAIRealtimeAdapter(...)` when you want a direct OpenAI Realtime speech-to-speech output path for live smoke tests, duplex benchmarks, or custom realtime orchestration. It implements the same `RealtimeAdapter` contract used by the benchmark harness, so the provider can stream `response.output_audio.delta` audio chunks into AbsoluteJS voice events while still emitting normalized transcript, error, and close events.
|
|
1330
|
+
|
|
1331
|
+
```ts
|
|
1332
|
+
import { createOpenAIRealtimeAdapter } from '@absolutejs/voice';
|
|
1333
|
+
import { runTTSAdapterFixture } from '@absolutejs/voice/testing';
|
|
1334
|
+
|
|
1335
|
+
const realtime = createOpenAIRealtimeAdapter({
|
|
1336
|
+
apiKey: process.env.OPENAI_API_KEY!,
|
|
1337
|
+
instructions: 'Answer in one concise sentence.',
|
|
1338
|
+
model: 'gpt-realtime',
|
|
1339
|
+
voice: 'marin'
|
|
1340
|
+
});
|
|
1341
|
+
|
|
1342
|
+
app.use(
|
|
1343
|
+
voice({
|
|
1344
|
+
path: '/voice',
|
|
1345
|
+
realtime,
|
|
1346
|
+
realtimeInputFormat: {
|
|
1347
|
+
channels: 1,
|
|
1348
|
+
container: 'raw',
|
|
1349
|
+
encoding: 'pcm_s16le',
|
|
1350
|
+
sampleRateHz: 24000
|
|
1351
|
+
},
|
|
1352
|
+
session,
|
|
1353
|
+
onTurn: async ({ turn }) => ({
|
|
1354
|
+
assistantText: `You said: ${turn.text}`
|
|
1355
|
+
}),
|
|
1356
|
+
onComplete: async () => {}
|
|
1357
|
+
})
|
|
1358
|
+
);
|
|
1359
|
+
|
|
1360
|
+
const report = await runTTSAdapterFixture(
|
|
1361
|
+
realtime,
|
|
1362
|
+
{
|
|
1363
|
+
id: 'openai-realtime-smoke',
|
|
1364
|
+
text: 'Say exactly: AbsoluteJS realtime is online.',
|
|
1365
|
+
title: 'OpenAI Realtime smoke'
|
|
1366
|
+
},
|
|
1367
|
+
{
|
|
1368
|
+
realtimeFormat: {
|
|
1369
|
+
channels: 1,
|
|
1370
|
+
container: 'raw',
|
|
1371
|
+
encoding: 'pcm_s16le',
|
|
1372
|
+
sampleRateHz: 24000
|
|
1373
|
+
}
|
|
1374
|
+
}
|
|
1375
|
+
);
|
|
1376
|
+
```
|
|
1377
|
+
|
|
1378
|
+
For server-to-server use, the adapter opens a WebSocket to OpenAI, sends `session.update`, streams text or base64 PCM input, and emits raw 24kHz mono `pcm_s16le` assistant audio. It requires raw 24kHz mono PCM input because that is the OpenAI Realtime PCM format. The main `voice(...)` route can now run in cascaded mode with `stt` plus optional `tts`, or direct realtime mode with `realtime`. Browser demos should make sure the captured PCM format matches `realtimeInputFormat` or resample before sending audio.
|
|
1379
|
+
|
|
1327
1380
|
If you want a minimal browser playback path, use the client audio player:
|
|
1328
1381
|
|
|
1329
1382
|
```ts
|
package/dist/index.d.ts
CHANGED
|
@@ -31,6 +31,7 @@ export { createVoicePhoneAgent } from './phoneAgent';
|
|
|
31
31
|
export { createStoredVoiceCallReviewArtifact, createStoredVoiceExternalObjectMap, createStoredVoiceIntegrationEvent, createStoredVoiceOpsTask, createVoiceFileExternalObjectMapStore, createVoiceFileAssistantMemoryStore, createVoiceFileAuditEventStore, createVoiceFileAuditSinkDeliveryStore, createVoiceFileCampaignStore, createVoiceFileIntegrationEventStore, createVoiceFileReviewStore, createVoiceFileRuntimeStorage, createVoiceFileSessionStore, createVoiceFileTaskStore, createVoiceFileTraceSinkDeliveryStore, createVoiceFileTraceEventStore } from './fileStore';
|
|
32
32
|
export { createVoiceAssistantMemoryHandle, createVoiceAssistantMemoryRecord, createVoiceMemoryAssistantMemoryStore, resolveVoiceAssistantMemoryNamespace } from './assistantMemory';
|
|
33
33
|
export { createAnthropicVoiceAssistantModel, createGeminiVoiceAssistantModel, createJSONVoiceAssistantModel, createOpenAIVoiceAssistantModel, resolveVoiceProviderRoutingPolicyPreset, createVoiceProviderRouter } from './modelAdapters';
|
|
34
|
+
export { createOpenAIRealtimeAdapter } from './openaiRealtime';
|
|
34
35
|
export { createOpenAIVoiceTTS } from './openaiTTS';
|
|
35
36
|
export { createVoiceProviderHealthHTMLHandler, createVoiceProviderHealthJSONHandler, createVoiceProviderHealthRoutes, renderVoiceProviderHealthHTML, summarizeVoiceProviderHealth } from './providerHealth';
|
|
36
37
|
export { createVoiceProviderCapabilityHTMLHandler, createVoiceProviderCapabilityJSONHandler, createVoiceProviderCapabilityRoutes, renderVoiceProviderCapabilityHTML, summarizeVoiceProviderCapabilities } from './providerCapabilities';
|
|
@@ -81,6 +82,7 @@ export type { VoiceWorkflowContract, VoiceWorkflowContractDefinition, VoiceWorkf
|
|
|
81
82
|
export type { VoiceSessionListHTMLHandlerOptions, VoiceSessionListItem, VoiceSessionListOptions, VoiceSessionListRoutesOptions, VoiceSessionListStatus, VoiceSessionReplay, VoiceSessionReplayHTMLHandlerOptions, VoiceSessionReplayOptions, VoiceSessionReplayRoutesOptions, VoiceSessionReplayTurn } from './sessionReplay';
|
|
82
83
|
export type { AnthropicVoiceAssistantModelOptions, GeminiVoiceAssistantModelOptions, OpenAIVoiceAssistantModelOptions, VoiceProviderRouterEvent, VoiceProviderRouterFallbackMode, VoiceProviderRouterHealthOptions, VoiceProviderRouterOptions, VoiceProviderRouterPolicy, VoiceProviderRouterPolicyPreset, VoiceProviderRouterPolicyWeights, VoiceProviderRouterProviderHealth, VoiceProviderRouterProviderProfile, VoiceProviderRouterStrategy, VoiceJSONAssistantModelHandler, VoiceJSONAssistantModelOptions } from './modelAdapters';
|
|
83
84
|
export type { OpenAIVoiceTTSOptions, OpenAIVoiceTTSVoice } from './openaiTTS';
|
|
85
|
+
export type { OpenAIRealtimeAdapterOptions, OpenAIRealtimeModel, OpenAIRealtimeNoiseReduction, OpenAIRealtimeResponseMode, OpenAIRealtimeTranscriptionModel, OpenAIRealtimeVoice } from './openaiRealtime';
|
|
84
86
|
export type { VoiceProviderHealthStatus, VoiceProviderHealthSummary, VoiceProviderHealthSummaryOptions } from './providerHealth';
|
|
85
87
|
export type { VoiceProviderCapabilityDefinition, VoiceProviderCapabilityHandlerOptions, VoiceProviderCapabilityHTMLHandlerOptions, VoiceProviderCapabilityKind, VoiceProviderCapabilityOptions, VoiceProviderCapabilityReport, VoiceProviderCapabilityRoutesOptions, VoiceProviderCapabilitySummary } from './providerCapabilities';
|
|
86
88
|
export type { VoiceProviderRoutingContractDefinition, VoiceProviderRoutingContractIssue, VoiceProviderRoutingContractReport, VoiceProviderRoutingContractRunOptions, VoiceProviderRoutingExpectation, VoiceProviderRoutingStatus } from './providerRoutingContract';
|
package/dist/index.js
CHANGED
|
@@ -3413,6 +3413,12 @@ var DEFAULT_FORMAT = {
|
|
|
3413
3413
|
encoding: "pcm_s16le",
|
|
3414
3414
|
sampleRateHz: 16000
|
|
3415
3415
|
};
|
|
3416
|
+
// Input audio format handed to a realtime adapter's open() call when the
// session options do not provide `realtimeInputFormat`:
// raw, mono, signed 16-bit little-endian PCM at 24 kHz.
var DEFAULT_REALTIME_FORMAT = {
  channels: 1,
  container: "raw",
  encoding: "pcm_s16le",
  sampleRateHz: 24000
};
|
|
3416
3422
|
var toError = (value) => value instanceof Error ? value : new Error(String(value));
|
|
3417
3423
|
var createEmptyCurrentTurn = () => ({
|
|
3418
3424
|
finalText: "",
|
|
@@ -3793,6 +3799,23 @@ var createVoiceSession = (options) => {
|
|
|
3793
3799
|
});
|
|
3794
3800
|
}
|
|
3795
3801
|
};
|
|
3802
|
+
/**
 * Forwards one assistant audio chunk to the client as a base64 "audio"
 * message, then records the "assistant_audio_received" latency stage for the
 * turn that was active when the chunk arrived.
 *
 * @param chunk Audio bytes as a Uint8Array, an ArrayBuffer, or another
 *   ArrayBuffer view; the bytes are copied before encoding.
 * @param input Object carrying the chunk's `format` and `receivedAt` timestamp.
 */
const sendAssistantAudio = async (chunk, input) => {
  // Snapshot the turn id before the first await. activeTTSTurnId is reassigned
  // when a new turn starts, so re-reading it after send() resolves could tag
  // the latency stage with a different turn than the message itself.
  const turnId = activeTTSTurnId;
  let bytes;
  if (chunk instanceof Uint8Array) {
    bytes = new Uint8Array(chunk);
  } else if (chunk instanceof ArrayBuffer) {
    bytes = new Uint8Array(chunk.slice(0));
  } else {
    // Any other view: copy only the window the view covers.
    bytes = new Uint8Array(chunk.buffer.slice(chunk.byteOffset, chunk.byteOffset + chunk.byteLength));
  }
  await send({
    chunkBase64: encodeBase64(bytes),
    format: input.format,
    receivedAt: input.receivedAt,
    turnId,
    type: "audio"
  });
  if (turnId) {
    await appendTurnLatencyStage({
      at: input.receivedAt,
      stage: "assistant_audio_received",
      turnId
    });
  }
};
|
|
3796
3819
|
const scheduleTurnCommit = (delayMs, reason, reset = true) => {
|
|
3797
3820
|
if (!reset && silenceTimer) {
|
|
3798
3821
|
return;
|
|
@@ -4494,8 +4517,12 @@ var createVoiceSession = (options) => {
|
|
|
4494
4517
|
if (sttSession) {
|
|
4495
4518
|
return sttSession;
|
|
4496
4519
|
}
|
|
4497
|
-
const
|
|
4498
|
-
|
|
4520
|
+
const inputAdapter = options.realtime ?? options.stt;
|
|
4521
|
+
if (!inputAdapter) {
|
|
4522
|
+
throw new Error("Voice session requires either an stt or realtime adapter.");
|
|
4523
|
+
}
|
|
4524
|
+
const openedSession = await inputAdapter.open({
|
|
4525
|
+
format: options.realtime ? options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT : DEFAULT_FORMAT,
|
|
4499
4526
|
languageStrategy: options.languageStrategy,
|
|
4500
4527
|
lexicon,
|
|
4501
4528
|
phraseHints,
|
|
@@ -4530,6 +4557,16 @@ var createVoiceSession = (options) => {
|
|
|
4530
4557
|
openedSession.on("close", (event) => {
|
|
4531
4558
|
runAdapterEvent("adapter.close", () => handleClose(event));
|
|
4532
4559
|
});
|
|
4560
|
+
if (options.realtime) {
|
|
4561
|
+
openedSession.on("audio", ({ chunk, format, receivedAt }) => {
|
|
4562
|
+
runAdapterEvent("adapter.audio", async () => {
|
|
4563
|
+
await sendAssistantAudio(chunk, {
|
|
4564
|
+
format,
|
|
4565
|
+
receivedAt
|
|
4566
|
+
});
|
|
4567
|
+
});
|
|
4568
|
+
});
|
|
4569
|
+
}
|
|
4533
4570
|
return openedSession;
|
|
4534
4571
|
};
|
|
4535
4572
|
const ensureTTSSession = async () => {
|
|
@@ -4554,21 +4591,10 @@ var createVoiceSession = (options) => {
|
|
|
4554
4591
|
if (ttsSession !== openedSession) {
|
|
4555
4592
|
return;
|
|
4556
4593
|
}
|
|
4557
|
-
|
|
4558
|
-
await send({
|
|
4559
|
-
chunkBase64: encodeBase64(normalizedChunk),
|
|
4594
|
+
await sendAssistantAudio(chunk, {
|
|
4560
4595
|
format,
|
|
4561
|
-
receivedAt
|
|
4562
|
-
turnId: activeTTSTurnId,
|
|
4563
|
-
type: "audio"
|
|
4596
|
+
receivedAt
|
|
4564
4597
|
});
|
|
4565
|
-
if (activeTTSTurnId) {
|
|
4566
|
-
await appendTurnLatencyStage({
|
|
4567
|
-
at: receivedAt,
|
|
4568
|
-
stage: "assistant_audio_received",
|
|
4569
|
-
turnId: activeTTSTurnId
|
|
4570
|
-
});
|
|
4571
|
-
}
|
|
4572
4598
|
});
|
|
4573
4599
|
});
|
|
4574
4600
|
openedSession.on("error", (event) => {
|
|
@@ -4647,7 +4673,8 @@ var createVoiceSession = (options) => {
|
|
|
4647
4673
|
await appendTrace({
|
|
4648
4674
|
payload: {
|
|
4649
4675
|
text: output.assistantText,
|
|
4650
|
-
ttsConfigured: Boolean(options.tts)
|
|
4676
|
+
ttsConfigured: Boolean(options.tts),
|
|
4677
|
+
realtimeConfigured: Boolean(options.realtime)
|
|
4651
4678
|
},
|
|
4652
4679
|
session,
|
|
4653
4680
|
turnId: turn.id,
|
|
@@ -4679,9 +4706,35 @@ var createVoiceSession = (options) => {
|
|
|
4679
4706
|
turnId: turn.id,
|
|
4680
4707
|
type: "turn.assistant"
|
|
4681
4708
|
});
|
|
4709
|
+
} else if (options.realtime) {
|
|
4710
|
+
const activeRealtimeSession = await ensureAdapter();
|
|
4711
|
+
const realtimeStartedAt = Date.now();
|
|
4712
|
+
activeTTSTurnId = turn.id;
|
|
4713
|
+
await appendTurnLatencyStage({
|
|
4714
|
+
at: realtimeStartedAt,
|
|
4715
|
+
session,
|
|
4716
|
+
stage: "tts_send_started",
|
|
4717
|
+
turnId: turn.id
|
|
4718
|
+
});
|
|
4719
|
+
await activeRealtimeSession.send(output.assistantText);
|
|
4720
|
+
await appendTurnLatencyStage({
|
|
4721
|
+
session,
|
|
4722
|
+
stage: "tts_send_completed",
|
|
4723
|
+
turnId: turn.id
|
|
4724
|
+
});
|
|
4725
|
+
await appendTrace({
|
|
4726
|
+
payload: {
|
|
4727
|
+
elapsedMs: Date.now() - realtimeStartedAt,
|
|
4728
|
+
mode: "realtime",
|
|
4729
|
+
status: "sent"
|
|
4730
|
+
},
|
|
4731
|
+
session,
|
|
4732
|
+
turnId: turn.id,
|
|
4733
|
+
type: "turn.assistant"
|
|
4734
|
+
});
|
|
4682
4735
|
}
|
|
4683
4736
|
} catch (error) {
|
|
4684
|
-
logger.warn("voice
|
|
4737
|
+
logger.warn("voice assistant audio send failed", {
|
|
4685
4738
|
error: toError(error).message,
|
|
4686
4739
|
sessionId: options.id,
|
|
4687
4740
|
turnId: turn.id
|
|
@@ -4689,7 +4742,7 @@ var createVoiceSession = (options) => {
|
|
|
4689
4742
|
await appendTrace({
|
|
4690
4743
|
payload: {
|
|
4691
4744
|
error: toError(error).message,
|
|
4692
|
-
status: "tts-send-failed"
|
|
4745
|
+
status: options.realtime ? "realtime-send-failed" : "tts-send-failed"
|
|
4693
4746
|
},
|
|
4694
4747
|
session,
|
|
4695
4748
|
turnId: turn.id,
|
|
@@ -4894,7 +4947,7 @@ var createVoiceSession = (options) => {
|
|
|
4894
4947
|
turn,
|
|
4895
4948
|
type: "turn"
|
|
4896
4949
|
});
|
|
4897
|
-
if (options.sttLifecycle === "turn-scoped") {
|
|
4950
|
+
if (options.stt && options.sttLifecycle === "turn-scoped") {
|
|
4898
4951
|
await closeAdapter("turn-commit");
|
|
4899
4952
|
}
|
|
4900
4953
|
await completeTurn(updatedSession, turn);
|
|
@@ -5307,6 +5360,9 @@ var resolveLexicon = async (config, input) => {
|
|
|
5307
5360
|
return normalizeLexicon(config.lexicon);
|
|
5308
5361
|
};
|
|
5309
5362
|
var voice = (config) => {
|
|
5363
|
+
if (!config.stt && !config.realtime) {
|
|
5364
|
+
throw new Error("voice requires either an stt or realtime adapter.");
|
|
5365
|
+
}
|
|
5310
5366
|
const runtime = {
|
|
5311
5367
|
activeSessions: new Map,
|
|
5312
5368
|
logger: resolveLogger(config.logger),
|
|
@@ -5381,6 +5437,8 @@ var voice = (config) => {
|
|
|
5381
5437
|
socket: createSocketAdapter(ws),
|
|
5382
5438
|
store: config.session,
|
|
5383
5439
|
trace: config.trace,
|
|
5440
|
+
realtime: config.realtime,
|
|
5441
|
+
realtimeInputFormat: config.realtimeInputFormat,
|
|
5384
5442
|
stt: config.stt,
|
|
5385
5443
|
sttFallback: sessionOptions.sttFallback,
|
|
5386
5444
|
sttLifecycle: sessionOptions.sttLifecycle,
|
|
@@ -17088,13 +17146,517 @@ var createGeminiVoiceAssistantModel = (options) => {
|
|
|
17088
17146
|
}
|
|
17089
17147
|
};
|
|
17090
17148
|
};
|
|
17091
|
-
// src/
|
|
17149
|
+
// src/openaiRealtime.ts
|
|
17150
|
+
// Defaults for the OpenAI Realtime adapter; each is used only when the
// matching adapter option is not supplied.

// Delay (ms) for the adapter's audio auto-commit timer.
var DEFAULT_AUTO_COMMIT_SILENCE_MS = 450;
// WebSocket endpoint; the model is appended as a `?model=` query parameter.
var DEFAULT_BASE_URL = "wss://api.openai.com/v1/realtime";
// Realtime model requested when `model` is not set.
var DEFAULT_MODEL = "gpt-realtime";
// Input transcription model used when `inputTranscriptionModel` is undefined.
var DEFAULT_TRANSCRIPTION_MODEL = "gpt-4o-mini-transcribe";
// Output voice used when `voice` is not set.
var DEFAULT_VOICE = "marin";
// Format descriptor attached to assistant audio events emitted by the adapter:
// raw, mono, signed 16-bit little-endian PCM at 24 kHz.
var OPENAI_PCM24_FORMAT = {
  channels: 1,
  container: "raw",
  encoding: "pcm_s16le",
  sampleRateHz: 24000
};
|
|
17161
|
+
var createListenerMap = () => ({
|
|
17162
|
+
audio: new Set,
|
|
17163
|
+
close: new Set,
|
|
17164
|
+
endOfTurn: new Set,
|
|
17165
|
+
error: new Set,
|
|
17166
|
+
final: new Set,
|
|
17167
|
+
partial: new Set
|
|
17168
|
+
});
|
|
17169
|
+
var emit = async (listeners, event, payload) => {
|
|
17170
|
+
for (const listener of listeners[event]) {
|
|
17171
|
+
await listener(payload);
|
|
17172
|
+
}
|
|
17173
|
+
};
|
|
17174
|
+
/**
 * Returns a shallow copy of `value` without the properties whose value is
 * `undefined`. `null` and other falsy values are kept.
 */
var compact = (value) => {
  const definedPairs = Object.entries(value).filter((pair) => pair[1] !== undefined);
  return Object.fromEntries(definedPairs);
};
|
|
17175
|
+
var resolveErrorMessage = (error) => {
|
|
17176
|
+
if (typeof error === "string" && error.trim()) {
|
|
17177
|
+
return error;
|
|
17178
|
+
}
|
|
17179
|
+
if (error instanceof Error && error.message.trim()) {
|
|
17180
|
+
return error.message;
|
|
17181
|
+
}
|
|
17182
|
+
if (error && typeof error === "object") {
|
|
17183
|
+
const record = error;
|
|
17184
|
+
for (const key of ["message", "reason", "description", "detail"]) {
|
|
17185
|
+
const candidate = record[key];
|
|
17186
|
+
if (typeof candidate === "string" && candidate.trim()) {
|
|
17187
|
+
return candidate;
|
|
17188
|
+
}
|
|
17189
|
+
}
|
|
17190
|
+
if ("error" in record) {
|
|
17191
|
+
return resolveErrorMessage(record.error);
|
|
17192
|
+
}
|
|
17193
|
+
try {
|
|
17194
|
+
return JSON.stringify(error);
|
|
17195
|
+
} catch {}
|
|
17196
|
+
}
|
|
17197
|
+
return "OpenAI realtime error";
|
|
17198
|
+
};
|
|
17199
|
+
/**
 * Wraps an ArrayBuffer or ArrayBuffer view in a Uint8Array without copying.
 * For a view, the result covers exactly the view's byte window.
 */
var toUint8Array2 = (value) => {
  if (value instanceof ArrayBuffer) {
    return new Uint8Array(value);
  }
  const { buffer, byteOffset, byteLength } = value;
  return new Uint8Array(buffer, byteOffset, byteLength);
};
|
|
17200
|
+
/**
 * Base64-encodes the bytes of an ArrayBuffer or ArrayBuffer view.
 * For a view, only the view's byte window is encoded.
 */
var toBase643 = (value) => {
  const bytes = value instanceof ArrayBuffer
    ? Buffer.from(value)
    : Buffer.from(value.buffer, value.byteOffset, value.byteLength);
  return bytes.toString("base64");
};
|
|
17201
|
+
/**
 * Builds a final transcript record for text that has no vendor item id.
 * The id is generated as "openai-realtime-text-" plus a random UUID.
 */
var textTranscript = (text) => {
  const id = `openai-realtime-text-${crypto.randomUUID()}`;
  return { id, isFinal: true, text, vendor: "openai" };
};
|
|
17207
|
+
/**
 * Builds a transcript record for an input-audio item, using the vendor's
 * item id as the transcript id.
 *
 * @param itemId Item id reported by the vendor event.
 * @param text Transcript text.
 * @param isFinal Whether the transcript is final rather than partial.
 */
var audioTranscript = (itemId, text, isFinal) => {
  return { id: itemId, isFinal, text, vendor: "openai" };
};
|
|
17213
|
+
/**
 * Throws unless `format` describes raw, mono, pcm_s16le audio at 24000 Hz.
 *
 * @param format Audio format with `container`, `encoding`, `sampleRateHz`, `channels`.
 * @throws Error when any of the four fields differs from the required value.
 */
var assertPCM24Mono = (format) => {
  const isSupported = format.container === "raw"
    && format.encoding === "pcm_s16le"
    && format.sampleRateHz === 24000
    && format.channels === 1;
  if (!isSupported) {
    throw new Error("OpenAI Realtime requires raw pcm_s16le audio at 24kHz mono.");
  }
};
|
|
17218
|
+
/**
 * Picks the language to request for input transcription.
 *
 * A non-blank `options.inputTranscriptionLanguage` wins. Otherwise the
 * session's language strategy is consulted, and only a "fixed" strategy
 * contributes its (trimmed, non-empty) `primaryLanguage`.
 *
 * @returns The trimmed language string, or undefined when none applies.
 */
var resolveTranscriptionLanguage = (options, openOptions) => {
  const configured = options.inputTranscriptionLanguage?.trim();
  if (configured) {
    return configured;
  }
  const strategy = openOptions.languageStrategy;
  if (strategy?.mode === "fixed") {
    const primary = strategy.primaryLanguage.trim();
    return primary.length > 0 ? primary : undefined;
  }
  return undefined;
};
|
|
17228
|
+
/**
 * Turns the session's phrase hints into a single transcription prompt sentence.
 *
 * Each hint contributes its `text` followed by its `aliases`. Duplicates are
 * dropped, keeping the first occurrence, via an insertion-ordered Set rather
 * than a quadratic indexOf scan.
 *
 * @param options Open options; `phraseHints` is optional.
 * @returns The prompt sentence, or undefined when there are no terms.
 */
var phraseHintPrompt = (options) => {
  const terms = new Set();
  for (const hint of options.phraseHints ?? []) {
    terms.add(hint.text);
    for (const alias of hint.aliases ?? []) {
      terms.add(alias);
    }
  }
  return terms.size
    ? `Prioritize accurate recovery of these phrases when heard: ${[...terms].join(", ")}.`
    : undefined;
};
|
|
17236
|
+
/**
 * Turns the session's pronunciation lexicon into a single transcription
 * prompt sentence.
 *
 * Each entry is described by its text plus, when present, its pronunciation,
 * aliases and language, joined with " - ". Entries with nothing to describe
 * are skipped, and the descriptions are joined with "; ".
 *
 * @param options Open options; `lexicon` is optional.
 * @returns The prompt sentence, or undefined when no entry produced text.
 */
var lexiconPrompt = (options) => {
  const descriptions = [];
  for (const entry of options.lexicon ?? []) {
    const parts = [];
    if (entry.text) {
      parts.push(entry.text);
    }
    if (entry.pronunciation) {
      parts.push(`pronounced ${entry.pronunciation}`);
    }
    if (entry.aliases?.length) {
      parts.push(`may also sound like ${entry.aliases.join(", ")}`);
    }
    if (entry.language) {
      parts.push(`language ${entry.language}`);
    }
    if (parts.length) {
      descriptions.push(parts.join(" - "));
    }
  }
  if (!descriptions.length) {
    return undefined;
  }
  return `Use this pronunciation lexicon when transcribing: ${descriptions.join("; ")}.`;
};
|
|
17248
|
+
/**
 * Merges prompt text derived from the session's phrase hints and lexicon into
 * the adapter's `inputTranscriptionPrompt`.
 *
 * When neither source yields text, the original options object is returned
 * unchanged. Otherwise a shallow copy is returned whose prompt is the
 * configured prompt, the phrase-hint sentence and the lexicon sentence (blank
 * ones skipped), separated by a blank line.
 *
 * @param options Adapter options.
 * @param openOptions Per-session open options supplying phrase hints and lexicon.
 */
var withOpenPrompts = (options, openOptions) => {
  const hintSentence = phraseHintPrompt(openOptions);
  const lexiconSentence = lexiconPrompt(openOptions);
  if (hintSentence || lexiconSentence) {
    const sections = [options.inputTranscriptionPrompt, hintSentence, lexiconSentence]
      .filter((section) => !!section?.trim());
    return {
      ...options,
      inputTranscriptionPrompt: sections.join("\n\n")
    };
  }
  return options;
};
|
|
17265
|
+
/**
 * Builds the `session.update` client event sent when the socket opens.
 *
 * - Input and output audio formats are fixed to 24000 Hz "audio/pcm".
 * - `turn_detection` is always sent as null.
 * - `transcription` is null when `inputTranscriptionModel` is explicitly null;
 *   otherwise it carries the resolved language, the model (default
 *   DEFAULT_TRANSCRIPTION_MODEL) and the prompt, with undefined fields dropped.
 * - The audio output section is only included when the response mode is "audio".
 *
 * @param options Adapter options (already merged with open-time prompts by the caller).
 * @param openOptions Per-session open options, used to resolve the language.
 * @returns The event object, with a freshly generated `event_id`.
 */
var sessionUpdateEvent = (options, openOptions) => {
  const responseMode = options.responseMode ?? "audio";
  const language = resolveTranscriptionLanguage(options, openOptions);
  const transcription = options.inputTranscriptionModel === null
    ? null
    : compact({
        language,
        model: options.inputTranscriptionModel ?? DEFAULT_TRANSCRIPTION_MODEL,
        prompt: options.inputTranscriptionPrompt
      });
  const noiseReduction = options.noiseReduction ? { type: options.noiseReduction } : undefined;
  const inputAudio = compact({
    format: {
      rate: 24000,
      type: "audio/pcm"
    },
    noise_reduction: noiseReduction,
    transcription,
    turn_detection: null
  });
  let outputAudio;
  if (responseMode === "audio") {
    outputAudio = compact({
      format: {
        rate: 24000,
        type: "audio/pcm"
      },
      speed: options.speed,
      voice: options.voice ?? DEFAULT_VOICE
    });
  }
  const session = compact({
    audio: {
      input: inputAudio,
      output: outputAudio
    },
    instructions: options.instructions,
    max_output_tokens: options.maxOutputTokens,
    output_modalities: [responseMode],
    temperature: options.temperature,
    type: "realtime"
  });
  return {
    event_id: `session-update-${crypto.randomUUID()}`,
    session,
    type: "session.update"
  };
};
|
|
17304
|
+
/**
 * Builds the `response.create` client event that asks the model to respond.
 *
 * The audio output section (24000 Hz "audio/pcm" plus the voice, default
 * DEFAULT_VOICE) is only included when the response mode is "audio".
 * Undefined fields are dropped from `response`.
 *
 * @param options Adapter options (`responseMode`, `voice`, `maxOutputTokens`).
 */
var responseCreateEvent = (options) => {
  const responseMode = options.responseMode ?? "audio";
  let audio;
  if (responseMode === "audio") {
    audio = {
      output: compact({
        format: {
          rate: 24000,
          type: "audio/pcm"
        },
        voice: options.voice ?? DEFAULT_VOICE
      })
    };
  }
  const response = compact({
    audio,
    conversation: "auto",
    max_output_tokens: options.maxOutputTokens,
    output_modalities: [responseMode]
  });
  return {
    response,
    type: "response.create"
  };
};
|
|
17324
|
+
var createOpenAIRealtimeAdapter = (options) => {
|
|
17325
|
+
const baseUrl = options.baseUrl ?? DEFAULT_BASE_URL;
|
|
17326
|
+
const Socket = options.webSocket ?? globalThis.WebSocket;
|
|
17327
|
+
return {
|
|
17328
|
+
kind: "realtime",
|
|
17329
|
+
open: (openOptions) => {
|
|
17330
|
+
assertPCM24Mono(openOptions.format);
|
|
17331
|
+
const runtimeOptions = openOptions;
|
|
17332
|
+
const runtimeConfig = withOpenPrompts(options, runtimeOptions);
|
|
17333
|
+
const model = runtimeConfig.model ?? DEFAULT_MODEL;
|
|
17334
|
+
const listeners = createListenerMap();
|
|
17335
|
+
const socket = new Socket(`${baseUrl.replace(/\/$/, "")}?model=${encodeURIComponent(model)}`, {
|
|
17336
|
+
headers: {
|
|
17337
|
+
Authorization: `Bearer ${runtimeConfig.apiKey}`
|
|
17338
|
+
}
|
|
17339
|
+
});
|
|
17340
|
+
const primaryUpdate = sessionUpdateEvent(runtimeConfig, runtimeOptions);
|
|
17341
|
+
const pendingMessages = [];
|
|
17342
|
+
const partials = new Map;
|
|
17343
|
+
const finals = new Set;
|
|
17344
|
+
const autoCommitSilenceMs = runtimeConfig.autoCommitSilenceMs ?? DEFAULT_AUTO_COMMIT_SILENCE_MS;
|
|
17345
|
+
let audioCommitTimer;
|
|
17346
|
+
let closeEmitted = false;
|
|
17347
|
+
let closed = false;
|
|
17348
|
+
let pendingAudio = false;
|
|
17349
|
+
let ready = false;
|
|
17350
|
+
let readyTimeout;
|
|
17351
|
+
let socketOpen = false;
|
|
17352
|
+
let resolveReady;
|
|
17353
|
+
let rejectReady;
|
|
17354
|
+
const readyPromise = new Promise((resolve2, reject) => {
|
|
17355
|
+
resolveReady = resolve2;
|
|
17356
|
+
rejectReady = reject;
|
|
17357
|
+
});
|
|
17358
|
+
const clearReadyTimeout = () => {
|
|
17359
|
+
if (readyTimeout) {
|
|
17360
|
+
clearTimeout(readyTimeout);
|
|
17361
|
+
readyTimeout = undefined;
|
|
17362
|
+
}
|
|
17363
|
+
};
|
|
17364
|
+
const markReady = () => {
|
|
17365
|
+
if (ready || closed) {
|
|
17366
|
+
return;
|
|
17367
|
+
}
|
|
17368
|
+
ready = true;
|
|
17369
|
+
clearReadyTimeout();
|
|
17370
|
+
resolveReady();
|
|
17371
|
+
};
|
|
17372
|
+
const failReady = (error) => {
|
|
17373
|
+
if (ready || closed) {
|
|
17374
|
+
return;
|
|
17375
|
+
}
|
|
17376
|
+
clearReadyTimeout();
|
|
17377
|
+
rejectReady(error);
|
|
17378
|
+
};
|
|
17379
|
+
const sendRaw = (payload) => {
|
|
17380
|
+
const serialized = JSON.stringify(payload);
|
|
17381
|
+
if (!socketOpen) {
|
|
17382
|
+
pendingMessages.push(serialized);
|
|
17383
|
+
return;
|
|
17384
|
+
}
|
|
17385
|
+
socket.send(serialized);
|
|
17386
|
+
};
|
|
17387
|
+
const flush = () => {
|
|
17388
|
+
for (const message of pendingMessages.splice(0)) {
|
|
17389
|
+
socket.send(message);
|
|
17390
|
+
}
|
|
17391
|
+
};
|
|
17392
|
+
const emitClose = async (code, reason, recoverable = false) => {
|
|
17393
|
+
if (closeEmitted) {
|
|
17394
|
+
return;
|
|
17395
|
+
}
|
|
17396
|
+
closeEmitted = true;
|
|
17397
|
+
await emit(listeners, "close", {
|
|
17398
|
+
code,
|
|
17399
|
+
reason,
|
|
17400
|
+
recoverable,
|
|
17401
|
+
type: "close"
|
|
17402
|
+
});
|
|
17403
|
+
};
|
|
17404
|
+
const commitAudio = async () => {
|
|
17405
|
+
if (closed || !pendingAudio) {
|
|
17406
|
+
return;
|
|
17407
|
+
}
|
|
17408
|
+
pendingAudio = false;
|
|
17409
|
+
sendRaw({ type: "input_audio_buffer.commit" });
|
|
17410
|
+
sendRaw(responseCreateEvent(runtimeConfig));
|
|
17411
|
+
};
|
|
17412
|
+
const resetAudioTimer = () => {
|
|
17413
|
+
if (audioCommitTimer) {
|
|
17414
|
+
clearTimeout(audioCommitTimer);
|
|
17415
|
+
}
|
|
17416
|
+
audioCommitTimer = setTimeout(() => {
|
|
17417
|
+
commitAudio();
|
|
17418
|
+
}, autoCommitSilenceMs);
|
|
17419
|
+
};
|
|
17420
|
+
socket.addEventListener("open", () => {
|
|
17421
|
+
socketOpen = true;
|
|
17422
|
+
sendRaw(primaryUpdate);
|
|
17423
|
+
flush();
|
|
17424
|
+
readyTimeout = setTimeout(() => {
|
|
17425
|
+
failReady(new Error("OpenAI realtime session did not become ready."));
|
|
17426
|
+
}, 8000);
|
|
17427
|
+
}, { once: true });
|
|
17428
|
+
socket.addEventListener("message", (event) => {
|
|
17429
|
+
try {
|
|
17430
|
+
const payload = JSON.parse(String(event.data));
|
|
17431
|
+
const shouldEmitResponseTranscripts = runtimeConfig.emitResponseTranscripts === true;
|
|
17432
|
+
switch (payload.type) {
|
|
17433
|
+
case "session.created":
|
|
17434
|
+
case "session.updated":
|
|
17435
|
+
markReady();
|
|
17436
|
+
return;
|
|
17437
|
+
case "conversation.item.input_audio_transcription.delta": {
|
|
17438
|
+
const itemId = typeof payload.item_id === "string" ? payload.item_id : undefined;
|
|
17439
|
+
const delta = typeof payload.delta === "string" ? payload.delta : undefined;
|
|
17440
|
+
if (!itemId || !delta) {
|
|
17441
|
+
return;
|
|
17442
|
+
}
|
|
17443
|
+
const text = `${partials.get(itemId) ?? ""}${delta}`;
|
|
17444
|
+
partials.set(itemId, text);
|
|
17445
|
+
emit(listeners, "partial", {
|
|
17446
|
+
receivedAt: Date.now(),
|
|
17447
|
+
transcript: audioTranscript(itemId, text, false),
|
|
17448
|
+
type: "partial"
|
|
17449
|
+
});
|
|
17450
|
+
return;
|
|
17451
|
+
}
|
|
17452
|
+
case "conversation.item.input_audio_transcription.completed": {
|
|
17453
|
+
const itemId = typeof payload.item_id === "string" ? payload.item_id : undefined;
|
|
17454
|
+
const transcript = typeof payload.transcript === "string" ? payload.transcript : undefined;
|
|
17455
|
+
if (!itemId || !transcript || finals.has(itemId)) {
|
|
17456
|
+
return;
|
|
17457
|
+
}
|
|
17458
|
+
finals.add(itemId);
|
|
17459
|
+
partials.set(itemId, transcript);
|
|
17460
|
+
emit(listeners, "final", {
|
|
17461
|
+
receivedAt: Date.now(),
|
|
17462
|
+
transcript: audioTranscript(itemId, transcript, true),
|
|
17463
|
+
type: "final"
|
|
17464
|
+
});
|
|
17465
|
+
emit(listeners, "endOfTurn", {
|
|
17466
|
+
receivedAt: Date.now(),
|
|
17467
|
+
reason: "vendor",
|
|
17468
|
+
type: "endOfTurn"
|
|
17469
|
+
});
|
|
17470
|
+
return;
|
|
17471
|
+
}
|
|
17472
|
+
case "conversation.item.input_audio_transcription.failed": {
|
|
17473
|
+
const error = payload.error && typeof payload.error === "object" ? payload.error : undefined;
|
|
17474
|
+
emit(listeners, "error", {
|
|
17475
|
+
code: error?.code,
|
|
17476
|
+
error: new Error(resolveErrorMessage(error ?? payload)),
|
|
17477
|
+
recoverable: true,
|
|
17478
|
+
type: "error"
|
|
17479
|
+
});
|
|
17480
|
+
return;
|
|
17481
|
+
}
|
|
17482
|
+
case "response.audio.delta":
|
|
17483
|
+
case "response.output_audio.delta": {
|
|
17484
|
+
const delta = typeof payload.delta === "string" ? payload.delta : undefined;
|
|
17485
|
+
if (!delta) {
|
|
17486
|
+
return;
|
|
17487
|
+
}
|
|
17488
|
+
emit(listeners, "audio", {
|
|
17489
|
+
chunk: Buffer.from(delta, "base64"),
|
|
17490
|
+
format: OPENAI_PCM24_FORMAT,
|
|
17491
|
+
receivedAt: Date.now(),
|
|
17492
|
+
type: "audio"
|
|
17493
|
+
});
|
|
17494
|
+
return;
|
|
17495
|
+
}
|
|
17496
|
+
case "response.audio_transcript.delta":
|
|
17497
|
+
case "response.output_audio_transcript.delta":
|
|
17498
|
+
case "response.output_text.delta": {
|
|
17499
|
+
if (!shouldEmitResponseTranscripts) {
|
|
17500
|
+
return;
|
|
17501
|
+
}
|
|
17502
|
+
const delta = typeof payload.delta === "string" ? payload.delta : undefined;
|
|
17503
|
+
if (!delta) {
|
|
17504
|
+
return;
|
|
17505
|
+
}
|
|
17506
|
+
emit(listeners, "partial", {
|
|
17507
|
+
receivedAt: Date.now(),
|
|
17508
|
+
transcript: textTranscript(delta),
|
|
17509
|
+
type: "partial"
|
|
17510
|
+
});
|
|
17511
|
+
return;
|
|
17512
|
+
}
|
|
17513
|
+
case "response.audio_transcript.done":
|
|
17514
|
+
case "response.output_audio_transcript.done":
|
|
17515
|
+
case "response.output_text.done": {
|
|
17516
|
+
if (!shouldEmitResponseTranscripts) {
|
|
17517
|
+
return;
|
|
17518
|
+
}
|
|
17519
|
+
const transcript = typeof payload.transcript === "string" ? payload.transcript : undefined;
|
|
17520
|
+
if (!transcript) {
|
|
17521
|
+
return;
|
|
17522
|
+
}
|
|
17523
|
+
emit(listeners, "final", {
|
|
17524
|
+
receivedAt: Date.now(),
|
|
17525
|
+
transcript: textTranscript(transcript),
|
|
17526
|
+
type: "final"
|
|
17527
|
+
});
|
|
17528
|
+
emit(listeners, "endOfTurn", {
|
|
17529
|
+
receivedAt: Date.now(),
|
|
17530
|
+
reason: "vendor",
|
|
17531
|
+
type: "endOfTurn"
|
|
17532
|
+
});
|
|
17533
|
+
return;
|
|
17534
|
+
}
|
|
17535
|
+
case "error": {
|
|
17536
|
+
const error = payload.error && typeof payload.error === "object" ? payload.error : {};
|
|
17537
|
+
const message = resolveErrorMessage(error);
|
|
17538
|
+
emit(listeners, "error", {
|
|
17539
|
+
code: error.code,
|
|
17540
|
+
error: new Error(message),
|
|
17541
|
+
recoverable: true,
|
|
17542
|
+
type: "error"
|
|
17543
|
+
});
|
|
17544
|
+
if (!ready && error.event_id === primaryUpdate.event_id) {
|
|
17545
|
+
failReady(new Error(message));
|
|
17546
|
+
}
|
|
17547
|
+
return;
|
|
17548
|
+
}
|
|
17549
|
+
default:
|
|
17550
|
+
return;
|
|
17551
|
+
}
|
|
17552
|
+
} catch (error) {
|
|
17553
|
+
emit(listeners, "error", {
|
|
17554
|
+
error: new Error(resolveErrorMessage(error)),
|
|
17555
|
+
recoverable: true,
|
|
17556
|
+
type: "error"
|
|
17557
|
+
});
|
|
17558
|
+
}
|
|
17559
|
+
});
|
|
17560
|
+
socket.addEventListener("error", (event) => {
|
|
17561
|
+
const error = new Error(resolveErrorMessage(event));
|
|
17562
|
+
failReady(error);
|
|
17563
|
+
emit(listeners, "error", {
|
|
17564
|
+
error,
|
|
17565
|
+
recoverable: false,
|
|
17566
|
+
type: "error"
|
|
17567
|
+
});
|
|
17568
|
+
});
|
|
17569
|
+
socket.addEventListener("close", (event) => {
|
|
17570
|
+
socketOpen = false;
|
|
17571
|
+
clearReadyTimeout();
|
|
17572
|
+
if (!ready) {
|
|
17573
|
+
failReady(new Error("OpenAI realtime session closed before ready."));
|
|
17574
|
+
}
|
|
17575
|
+
emitClose(event.code, event.reason || undefined, event.code !== 1000);
|
|
17576
|
+
});
|
|
17577
|
+
if (openOptions.signal) {
|
|
17578
|
+
if (openOptions.signal.aborted) {
|
|
17579
|
+
closed = true;
|
|
17580
|
+
socket.close(1000, "aborted");
|
|
17581
|
+
} else {
|
|
17582
|
+
openOptions.signal.addEventListener("abort", () => {
|
|
17583
|
+
if (!closed) {
|
|
17584
|
+
closed = true;
|
|
17585
|
+
socket.close(1000, "aborted");
|
|
17586
|
+
}
|
|
17587
|
+
}, { once: true });
|
|
17588
|
+
}
|
|
17589
|
+
}
|
|
17590
|
+
return {
|
|
17591
|
+
close: async (reason) => {
|
|
17592
|
+
if (closed) {
|
|
17593
|
+
return;
|
|
17594
|
+
}
|
|
17595
|
+
closed = true;
|
|
17596
|
+
clearReadyTimeout();
|
|
17597
|
+
if (audioCommitTimer) {
|
|
17598
|
+
clearTimeout(audioCommitTimer);
|
|
17599
|
+
audioCommitTimer = undefined;
|
|
17600
|
+
}
|
|
17601
|
+
await commitAudio().catch(() => {});
|
|
17602
|
+
socket.close(1000, reason);
|
|
17603
|
+
await emitClose(1000, reason, false);
|
|
17604
|
+
},
|
|
17605
|
+
on: (event, handler) => {
|
|
17606
|
+
listeners[event].add(handler);
|
|
17607
|
+
return () => {
|
|
17608
|
+
listeners[event].delete(handler);
|
|
17609
|
+
};
|
|
17610
|
+
},
|
|
17611
|
+
send: async (input) => {
|
|
17612
|
+
await readyPromise;
|
|
17613
|
+
if (closed) {
|
|
17614
|
+
return;
|
|
17615
|
+
}
|
|
17616
|
+
if (typeof input === "string") {
|
|
17617
|
+
const text = input.trim();
|
|
17618
|
+
if (!text) {
|
|
17619
|
+
return;
|
|
17620
|
+
}
|
|
17621
|
+
await emit(listeners, "final", {
|
|
17622
|
+
receivedAt: Date.now(),
|
|
17623
|
+
transcript: textTranscript(text),
|
|
17624
|
+
type: "final"
|
|
17625
|
+
});
|
|
17626
|
+
await emit(listeners, "endOfTurn", {
|
|
17627
|
+
receivedAt: Date.now(),
|
|
17628
|
+
reason: "manual",
|
|
17629
|
+
type: "endOfTurn"
|
|
17630
|
+
});
|
|
17631
|
+
sendRaw({
|
|
17632
|
+
item: {
|
|
17633
|
+
content: [{ text, type: "input_text" }],
|
|
17634
|
+
role: "user",
|
|
17635
|
+
type: "message"
|
|
17636
|
+
},
|
|
17637
|
+
type: "conversation.item.create"
|
|
17638
|
+
});
|
|
17639
|
+
sendRaw(responseCreateEvent(runtimeConfig));
|
|
17640
|
+
return;
|
|
17641
|
+
}
|
|
17642
|
+
sendRaw({
|
|
17643
|
+
audio: toBase643(input),
|
|
17644
|
+
type: "input_audio_buffer.append"
|
|
17645
|
+
});
|
|
17646
|
+
pendingAudio = true;
|
|
17647
|
+
resetAudioTimer();
|
|
17648
|
+
}
|
|
17649
|
+
};
|
|
17650
|
+
}
|
|
17651
|
+
};
|
|
17652
|
+
};
|
|
17653
|
+
// src/openaiTTS.ts
|
|
17654
|
+
var OPENAI_PCM24_FORMAT2 = {
|
|
17655
|
+
channels: 1,
|
|
17656
|
+
container: "raw",
|
|
17657
|
+
encoding: "pcm_s16le",
|
|
17658
|
+
sampleRateHz: 24000
|
|
17659
|
+
};
|
|
17098
17660
|
var resolveInstructions = async (instructions, input) => {
|
|
17099
17661
|
if (typeof instructions === "function") {
|
|
17100
17662
|
return instructions(input);
|
|
@@ -17102,7 +17664,7 @@ var resolveInstructions = async (instructions, input) => {
|
|
|
17102
17664
|
return instructions;
|
|
17103
17665
|
};
|
|
17104
17666
|
var createTTSHTTPError = (response) => new Error(`OpenAI voice TTS failed: HTTP ${response.status}`);
|
|
17105
|
-
var
|
|
17667
|
+
var emit2 = async (listeners, event, payload) => {
|
|
17106
17668
|
for (const handler of listeners[event]) {
|
|
17107
17669
|
await Promise.resolve(handler(payload));
|
|
17108
17670
|
}
|
|
@@ -17132,7 +17694,7 @@ var createOpenAIVoiceTTS = (options) => {
|
|
|
17132
17694
|
closed = true;
|
|
17133
17695
|
abortController.abort();
|
|
17134
17696
|
openOptions.signal?.removeEventListener("abort", signalAbort);
|
|
17135
|
-
await
|
|
17697
|
+
await emit2(listeners, "close", {
|
|
17136
17698
|
reason,
|
|
17137
17699
|
type: "close"
|
|
17138
17700
|
});
|
|
@@ -17175,9 +17737,9 @@ var createOpenAIVoiceTTS = (options) => {
|
|
|
17175
17737
|
if (!response.body) {
|
|
17176
17738
|
const chunk = new Uint8Array(await response.arrayBuffer());
|
|
17177
17739
|
if (!closed && chunk.byteLength > 0) {
|
|
17178
|
-
await
|
|
17740
|
+
await emit2(listeners, "audio", {
|
|
17179
17741
|
chunk,
|
|
17180
|
-
format:
|
|
17742
|
+
format: OPENAI_PCM24_FORMAT2,
|
|
17181
17743
|
receivedAt: Date.now(),
|
|
17182
17744
|
type: "audio"
|
|
17183
17745
|
});
|
|
@@ -17192,9 +17754,9 @@ var createOpenAIVoiceTTS = (options) => {
|
|
|
17192
17754
|
break;
|
|
17193
17755
|
}
|
|
17194
17756
|
if (value.byteLength > 0) {
|
|
17195
|
-
await
|
|
17757
|
+
await emit2(listeners, "audio", {
|
|
17196
17758
|
chunk: new Uint8Array(value),
|
|
17197
|
-
format:
|
|
17759
|
+
format: OPENAI_PCM24_FORMAT2,
|
|
17198
17760
|
receivedAt: Date.now(),
|
|
17199
17761
|
type: "audio"
|
|
17200
17762
|
});
|
|
@@ -17208,7 +17770,7 @@ var createOpenAIVoiceTTS = (options) => {
|
|
|
17208
17770
|
return;
|
|
17209
17771
|
}
|
|
17210
17772
|
const normalizedError = error instanceof Error ? error : new Error(String(error));
|
|
17211
|
-
await
|
|
17773
|
+
await emit2(listeners, "error", {
|
|
17212
17774
|
error: normalizedError,
|
|
17213
17775
|
recoverable: true,
|
|
17214
17776
|
type: "error"
|
|
@@ -19778,11 +20340,11 @@ var createResolver = (options) => {
|
|
|
19778
20340
|
selectedProvider: preferred
|
|
19779
20341
|
};
|
|
19780
20342
|
};
|
|
19781
|
-
const
|
|
20343
|
+
const emit3 = async (event, input) => {
|
|
19782
20344
|
await options.onProviderEvent?.(event, input);
|
|
19783
20345
|
};
|
|
19784
20346
|
return {
|
|
19785
|
-
emit:
|
|
20347
|
+
emit: emit3,
|
|
19786
20348
|
getSuppressionRemainingMs,
|
|
19787
20349
|
providerIds,
|
|
19788
20350
|
recordError,
|
|
@@ -22301,6 +22863,7 @@ export {
|
|
|
22301
22863
|
createPhraseHintCorrectionHandler,
|
|
22302
22864
|
createOpenAIVoiceTTS,
|
|
22303
22865
|
createOpenAIVoiceAssistantModel,
|
|
22866
|
+
createOpenAIRealtimeAdapter,
|
|
22304
22867
|
createMemoryVoiceTelephonyWebhookIdempotencyStore,
|
|
22305
22868
|
createJSONVoiceAssistantModel,
|
|
22306
22869
|
createId,
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import type { RealtimeAdapter } from './types';
|
|
2
|
+
export type OpenAIRealtimeModel = 'gpt-realtime' | 'gpt-realtime-mini' | 'gpt-4o-realtime-preview' | 'gpt-4o-mini-realtime-preview' | (string & {});
|
|
3
|
+
export type OpenAIRealtimeVoice = 'alloy' | 'ash' | 'ballad' | 'cedar' | 'coral' | 'echo' | 'marin' | 'sage' | 'shimmer' | 'verse' | {
|
|
4
|
+
id: string;
|
|
5
|
+
} | (string & {});
|
|
6
|
+
export type OpenAIRealtimeTranscriptionModel = 'gpt-4o-mini-transcribe' | 'gpt-4o-transcribe' | 'whisper-1' | (string & {});
|
|
7
|
+
export type OpenAIRealtimeNoiseReduction = 'near_field' | 'far_field';
|
|
8
|
+
export type OpenAIRealtimeResponseMode = 'audio' | 'text';
|
|
9
|
+
export type OpenAIRealtimeAdapterOptions = {
|
|
10
|
+
apiKey: string;
|
|
11
|
+
autoCommitSilenceMs?: number;
|
|
12
|
+
baseUrl?: string;
|
|
13
|
+
emitResponseTranscripts?: boolean;
|
|
14
|
+
inputTranscriptionLanguage?: string;
|
|
15
|
+
inputTranscriptionModel?: OpenAIRealtimeTranscriptionModel | null;
|
|
16
|
+
inputTranscriptionPrompt?: string;
|
|
17
|
+
instructions?: string;
|
|
18
|
+
maxOutputTokens?: number | 'inf';
|
|
19
|
+
model?: OpenAIRealtimeModel;
|
|
20
|
+
noiseReduction?: OpenAIRealtimeNoiseReduction;
|
|
21
|
+
responseMode?: OpenAIRealtimeResponseMode;
|
|
22
|
+
speed?: number;
|
|
23
|
+
temperature?: number;
|
|
24
|
+
voice?: OpenAIRealtimeVoice;
|
|
25
|
+
webSocket?: typeof WebSocket;
|
|
26
|
+
};
|
|
27
|
+
export declare const createOpenAIRealtimeAdapter: (options: OpenAIRealtimeAdapterOptions) => RealtimeAdapter;
|
|
@@ -2,7 +2,7 @@ import { Elysia } from 'elysia';
|
|
|
2
2
|
import type { VoiceTelephonySetupStatus, VoiceTelephonySmokeCheck, VoiceTelephonySmokeReport } from './contract';
|
|
3
3
|
import { type VoiceTelephonyOutcomePolicy, type VoiceTelephonyWebhookRoutesOptions } from '../telephonyOutcome';
|
|
4
4
|
import { type VoiceCallReviewArtifact, type VoiceCallReviewConfig } from '../testing/review';
|
|
5
|
-
import type { AudioFormat, VoiceLogger, VoicePluginConfig, VoiceSessionRecord, VoiceServerMessage } from '../types';
|
|
5
|
+
import type { AudioFormat, STTAdapter, VoiceLogger, VoicePluginConfig, VoiceSessionRecord, VoiceServerMessage } from '../types';
|
|
6
6
|
type TwilioMediaPayload = {
|
|
7
7
|
chunk?: string;
|
|
8
8
|
payload: string;
|
|
@@ -78,7 +78,7 @@ export type TwilioMediaStreamSocket = {
|
|
|
78
78
|
close: (code?: number, reason?: string) => void | Promise<void>;
|
|
79
79
|
send: (data: string) => void | Promise<void>;
|
|
80
80
|
};
|
|
81
|
-
export type TwilioMediaStreamBridgeOptions<TContext = unknown, TSession extends VoiceSessionRecord = VoiceSessionRecord, TResult = unknown> = Omit<VoicePluginConfig<TContext, TSession, TResult>, 'htmx' | 'path'> & {
|
|
81
|
+
export type TwilioMediaStreamBridgeOptions<TContext = unknown, TSession extends VoiceSessionRecord = VoiceSessionRecord, TResult = unknown> = Omit<VoicePluginConfig<TContext, TSession, TResult>, 'htmx' | 'path' | 'stt'> & {
|
|
82
82
|
clearOnInboundMedia?: boolean;
|
|
83
83
|
context: TContext;
|
|
84
84
|
logger?: VoiceLogger;
|
|
@@ -97,6 +97,7 @@ export type TwilioMediaStreamBridgeOptions<TContext = unknown, TSession extends
|
|
|
97
97
|
};
|
|
98
98
|
scenarioId?: string;
|
|
99
99
|
sessionId?: string;
|
|
100
|
+
stt: STTAdapter;
|
|
100
101
|
};
|
|
101
102
|
export type TwilioMediaStreamBridge = {
|
|
102
103
|
close: (reason?: string) => Promise<void>;
|
package/dist/testing/index.js
CHANGED
|
@@ -5033,6 +5033,12 @@ var DEFAULT_FORMAT = {
|
|
|
5033
5033
|
encoding: "pcm_s16le",
|
|
5034
5034
|
sampleRateHz: 16000
|
|
5035
5035
|
};
|
|
5036
|
+
var DEFAULT_REALTIME_FORMAT = {
|
|
5037
|
+
channels: 1,
|
|
5038
|
+
container: "raw",
|
|
5039
|
+
encoding: "pcm_s16le",
|
|
5040
|
+
sampleRateHz: 24000
|
|
5041
|
+
};
|
|
5036
5042
|
var toError = (value) => value instanceof Error ? value : new Error(String(value));
|
|
5037
5043
|
var createEmptyCurrentTurn = () => ({
|
|
5038
5044
|
finalText: "",
|
|
@@ -5413,6 +5419,23 @@ var createVoiceSession = (options) => {
|
|
|
5413
5419
|
});
|
|
5414
5420
|
}
|
|
5415
5421
|
};
|
|
5422
|
+
const sendAssistantAudio = async (chunk, input) => {
|
|
5423
|
+
const normalizedChunk = chunk instanceof Uint8Array ? new Uint8Array(chunk) : chunk instanceof ArrayBuffer ? new Uint8Array(chunk.slice(0)) : new Uint8Array(chunk.buffer.slice(chunk.byteOffset, chunk.byteOffset + chunk.byteLength));
|
|
5424
|
+
await send({
|
|
5425
|
+
chunkBase64: encodeBase64(normalizedChunk),
|
|
5426
|
+
format: input.format,
|
|
5427
|
+
receivedAt: input.receivedAt,
|
|
5428
|
+
turnId: activeTTSTurnId,
|
|
5429
|
+
type: "audio"
|
|
5430
|
+
});
|
|
5431
|
+
if (activeTTSTurnId) {
|
|
5432
|
+
await appendTurnLatencyStage({
|
|
5433
|
+
at: input.receivedAt,
|
|
5434
|
+
stage: "assistant_audio_received",
|
|
5435
|
+
turnId: activeTTSTurnId
|
|
5436
|
+
});
|
|
5437
|
+
}
|
|
5438
|
+
};
|
|
5416
5439
|
const scheduleTurnCommit = (delayMs, reason, reset = true) => {
|
|
5417
5440
|
if (!reset && silenceTimer) {
|
|
5418
5441
|
return;
|
|
@@ -6114,8 +6137,12 @@ var createVoiceSession = (options) => {
|
|
|
6114
6137
|
if (sttSession) {
|
|
6115
6138
|
return sttSession;
|
|
6116
6139
|
}
|
|
6117
|
-
const
|
|
6118
|
-
|
|
6140
|
+
const inputAdapter = options.realtime ?? options.stt;
|
|
6141
|
+
if (!inputAdapter) {
|
|
6142
|
+
throw new Error("Voice session requires either an stt or realtime adapter.");
|
|
6143
|
+
}
|
|
6144
|
+
const openedSession = await inputAdapter.open({
|
|
6145
|
+
format: options.realtime ? options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT : DEFAULT_FORMAT,
|
|
6119
6146
|
languageStrategy: options.languageStrategy,
|
|
6120
6147
|
lexicon,
|
|
6121
6148
|
phraseHints,
|
|
@@ -6150,6 +6177,16 @@ var createVoiceSession = (options) => {
|
|
|
6150
6177
|
openedSession.on("close", (event) => {
|
|
6151
6178
|
runAdapterEvent("adapter.close", () => handleClose(event));
|
|
6152
6179
|
});
|
|
6180
|
+
if (options.realtime) {
|
|
6181
|
+
openedSession.on("audio", ({ chunk, format, receivedAt }) => {
|
|
6182
|
+
runAdapterEvent("adapter.audio", async () => {
|
|
6183
|
+
await sendAssistantAudio(chunk, {
|
|
6184
|
+
format,
|
|
6185
|
+
receivedAt
|
|
6186
|
+
});
|
|
6187
|
+
});
|
|
6188
|
+
});
|
|
6189
|
+
}
|
|
6153
6190
|
return openedSession;
|
|
6154
6191
|
};
|
|
6155
6192
|
const ensureTTSSession = async () => {
|
|
@@ -6174,21 +6211,10 @@ var createVoiceSession = (options) => {
|
|
|
6174
6211
|
if (ttsSession !== openedSession) {
|
|
6175
6212
|
return;
|
|
6176
6213
|
}
|
|
6177
|
-
|
|
6178
|
-
await send({
|
|
6179
|
-
chunkBase64: encodeBase64(normalizedChunk),
|
|
6214
|
+
await sendAssistantAudio(chunk, {
|
|
6180
6215
|
format,
|
|
6181
|
-
receivedAt
|
|
6182
|
-
turnId: activeTTSTurnId,
|
|
6183
|
-
type: "audio"
|
|
6216
|
+
receivedAt
|
|
6184
6217
|
});
|
|
6185
|
-
if (activeTTSTurnId) {
|
|
6186
|
-
await appendTurnLatencyStage({
|
|
6187
|
-
at: receivedAt,
|
|
6188
|
-
stage: "assistant_audio_received",
|
|
6189
|
-
turnId: activeTTSTurnId
|
|
6190
|
-
});
|
|
6191
|
-
}
|
|
6192
6218
|
});
|
|
6193
6219
|
});
|
|
6194
6220
|
openedSession.on("error", (event) => {
|
|
@@ -6267,7 +6293,8 @@ var createVoiceSession = (options) => {
|
|
|
6267
6293
|
await appendTrace({
|
|
6268
6294
|
payload: {
|
|
6269
6295
|
text: output.assistantText,
|
|
6270
|
-
ttsConfigured: Boolean(options.tts)
|
|
6296
|
+
ttsConfigured: Boolean(options.tts),
|
|
6297
|
+
realtimeConfigured: Boolean(options.realtime)
|
|
6271
6298
|
},
|
|
6272
6299
|
session,
|
|
6273
6300
|
turnId: turn.id,
|
|
@@ -6299,9 +6326,35 @@ var createVoiceSession = (options) => {
|
|
|
6299
6326
|
turnId: turn.id,
|
|
6300
6327
|
type: "turn.assistant"
|
|
6301
6328
|
});
|
|
6329
|
+
} else if (options.realtime) {
|
|
6330
|
+
const activeRealtimeSession = await ensureAdapter();
|
|
6331
|
+
const realtimeStartedAt = Date.now();
|
|
6332
|
+
activeTTSTurnId = turn.id;
|
|
6333
|
+
await appendTurnLatencyStage({
|
|
6334
|
+
at: realtimeStartedAt,
|
|
6335
|
+
session,
|
|
6336
|
+
stage: "tts_send_started",
|
|
6337
|
+
turnId: turn.id
|
|
6338
|
+
});
|
|
6339
|
+
await activeRealtimeSession.send(output.assistantText);
|
|
6340
|
+
await appendTurnLatencyStage({
|
|
6341
|
+
session,
|
|
6342
|
+
stage: "tts_send_completed",
|
|
6343
|
+
turnId: turn.id
|
|
6344
|
+
});
|
|
6345
|
+
await appendTrace({
|
|
6346
|
+
payload: {
|
|
6347
|
+
elapsedMs: Date.now() - realtimeStartedAt,
|
|
6348
|
+
mode: "realtime",
|
|
6349
|
+
status: "sent"
|
|
6350
|
+
},
|
|
6351
|
+
session,
|
|
6352
|
+
turnId: turn.id,
|
|
6353
|
+
type: "turn.assistant"
|
|
6354
|
+
});
|
|
6302
6355
|
}
|
|
6303
6356
|
} catch (error) {
|
|
6304
|
-
logger.warn("voice
|
|
6357
|
+
logger.warn("voice assistant audio send failed", {
|
|
6305
6358
|
error: toError(error).message,
|
|
6306
6359
|
sessionId: options.id,
|
|
6307
6360
|
turnId: turn.id
|
|
@@ -6309,7 +6362,7 @@ var createVoiceSession = (options) => {
|
|
|
6309
6362
|
await appendTrace({
|
|
6310
6363
|
payload: {
|
|
6311
6364
|
error: toError(error).message,
|
|
6312
|
-
status: "tts-send-failed"
|
|
6365
|
+
status: options.realtime ? "realtime-send-failed" : "tts-send-failed"
|
|
6313
6366
|
},
|
|
6314
6367
|
session,
|
|
6315
6368
|
turnId: turn.id,
|
|
@@ -6514,7 +6567,7 @@ var createVoiceSession = (options) => {
|
|
|
6514
6567
|
turn,
|
|
6515
6568
|
type: "turn"
|
|
6516
6569
|
});
|
|
6517
|
-
if (options.sttLifecycle === "turn-scoped") {
|
|
6570
|
+
if (options.stt && options.sttLifecycle === "turn-scoped") {
|
|
6518
6571
|
await closeAdapter("turn-commit");
|
|
6519
6572
|
}
|
|
6520
6573
|
await completeTurn(updatedSession, turn);
|
|
@@ -9600,7 +9653,7 @@ var runVoiceTelephonyBenchmark = async (scenarios = getDefaultVoiceTelephonyBenc
|
|
|
9600
9653
|
};
|
|
9601
9654
|
};
|
|
9602
9655
|
// src/testing/tts.ts
|
|
9603
|
-
var
|
|
9656
|
+
var DEFAULT_REALTIME_FORMAT2 = {
|
|
9604
9657
|
channels: 1,
|
|
9605
9658
|
container: "raw",
|
|
9606
9659
|
encoding: "pcm_s16le",
|
|
@@ -9659,7 +9712,7 @@ var runTTSAdapterFixture = async (adapter, fixture, options = {}) => {
|
|
|
9659
9712
|
let audioDurationMs = 0;
|
|
9660
9713
|
let audioChunkCount = 0;
|
|
9661
9714
|
const session = adapter.kind === "realtime" ? await adapter.open({
|
|
9662
|
-
format: options.realtimeFormat ??
|
|
9715
|
+
format: options.realtimeFormat ?? DEFAULT_REALTIME_FORMAT2,
|
|
9663
9716
|
sessionId: `tts-benchmark:${fixture.id}`,
|
|
9664
9717
|
...openOptions ?? {}
|
|
9665
9718
|
}) : await adapter.open({
|
package/dist/types.d.ts
CHANGED
|
@@ -616,9 +616,11 @@ export type VoicePluginConfig<TContext = unknown, TSession extends VoiceSessionR
|
|
|
616
616
|
lexicon?: VoiceLexiconEntry[] | VoiceLexiconResolver<TContext>;
|
|
617
617
|
phraseHints?: VoicePhraseHint[] | VoicePhraseHintResolver<TContext>;
|
|
618
618
|
preset?: VoiceRuntimePreset;
|
|
619
|
-
stt
|
|
619
|
+
stt?: STTAdapter;
|
|
620
620
|
sttFallback?: VoiceSTTFallbackConfig;
|
|
621
621
|
sttLifecycle?: VoiceSTTLifecycle;
|
|
622
|
+
realtime?: RealtimeAdapter;
|
|
623
|
+
realtimeInputFormat?: AudioFormat;
|
|
622
624
|
tts?: TTSAdapter;
|
|
623
625
|
session: VoiceSessionStore<NoInfer<TSession>>;
|
|
624
626
|
reconnect?: VoiceReconnectConfig;
|
|
@@ -635,7 +637,9 @@ export type CreateVoiceSessionOptions<TContext = unknown, TSession extends Voice
|
|
|
635
637
|
id: string;
|
|
636
638
|
context: TContext;
|
|
637
639
|
socket: VoiceSocket;
|
|
638
|
-
stt
|
|
640
|
+
stt?: STTAdapter;
|
|
641
|
+
realtime?: RealtimeAdapter;
|
|
642
|
+
realtimeInputFormat?: AudioFormat;
|
|
639
643
|
tts?: TTSAdapter;
|
|
640
644
|
languageStrategy?: VoiceLanguageStrategy;
|
|
641
645
|
lexicon?: VoiceLexiconEntry[];
|