@kognitivedev/backend-cloud 0.2.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +2 -0
- package/.turbo/turbo-test.log +14 -0
- package/CHANGELOG.md +11 -0
- package/README.md +88 -0
- package/dist/cloud-voice-parameters.d.ts +11 -0
- package/dist/cloud-voice-parameters.js +219 -0
- package/dist/cloud-voice-prompt-service.d.ts +24 -0
- package/dist/cloud-voice-prompt-service.js +382 -0
- package/dist/cloud-voice-runtime-service.d.ts +73 -0
- package/dist/cloud-voice-runtime-service.js +443 -0
- package/dist/cloud-voice.d.ts +36 -0
- package/dist/cloud-voice.js +683 -0
- package/dist/index.d.ts +10 -0
- package/dist/index.js +26 -0
- package/dist/phone-control.d.ts +50 -0
- package/dist/phone-control.js +97 -0
- package/dist/phone-runtime/audio-playout-tracker.d.ts +51 -0
- package/dist/phone-runtime/audio-playout-tracker.js +93 -0
- package/dist/phone-runtime/openai-twilio-realtime.d.ts +95 -0
- package/dist/phone-runtime/openai-twilio-realtime.js +1074 -0
- package/dist/tools.d.ts +2 -0
- package/dist/tools.js +216 -0
- package/dist/types.d.ts +468 -0
- package/dist/types.js +2 -0
- package/dist/utils.d.ts +3 -0
- package/dist/utils.js +14 -0
- package/package.json +47 -0
- package/src/__tests__/audio-playout-tracker.test.ts +46 -0
- package/src/__tests__/cloud-voice.test.ts +1006 -0
- package/src/__tests__/openai-twilio-realtime.test.ts +1193 -0
- package/src/__tests__/phone-control.test.ts +105 -0
- package/src/cloud-voice-parameters.ts +236 -0
- package/src/cloud-voice-prompt-service.ts +493 -0
- package/src/cloud-voice-runtime-service.ts +465 -0
- package/src/cloud-voice.ts +831 -0
- package/src/index.ts +10 -0
- package/src/phone-control.ts +156 -0
- package/src/phone-runtime/audio-playout-tracker.ts +132 -0
- package/src/phone-runtime/openai-twilio-realtime.ts +1250 -0
- package/src/tools.ts +227 -0
- package/src/types.ts +529 -0
- package/src/utils.ts +11 -0
- package/tsconfig.json +13 -0
|
@@ -0,0 +1,465 @@
|
|
|
1
|
+
import type {
|
|
2
|
+
CloudVoiceAgentConfig,
|
|
3
|
+
CloudVoiceChannel,
|
|
4
|
+
CloudVoiceConversationProfile,
|
|
5
|
+
CloudVoiceHumanizationConfig,
|
|
6
|
+
CloudVoicePipelineConfig,
|
|
7
|
+
CloudVoiceProvider,
|
|
8
|
+
CloudVoiceRuntimeCapabilities,
|
|
9
|
+
CloudVoiceSpeechConfig,
|
|
10
|
+
} from "./types";
|
|
11
|
+
import { getRecord, getString } from "./utils";
|
|
12
|
+
|
|
13
|
+
/**
 * Baseline humanization settings.
 *
 * The normalizers in this module fall back to these values whenever a
 * caller-supplied field is missing or is not one of the accepted literals.
 */
export const DEFAULT_CLOUD_VOICE_HUMANIZATION: Required<CloudVoiceHumanizationConfig> = {
  enabled: true,

  // Opening behaviour.
  openingMode: "auto",
  openingStyle: "brief",

  // Conversational texture.
  fillerStyle: "light",
  backchannelFrequency: "low",
  disfluency: "rare",
  toolLatencyFillerMs: 700,

  // Default conversation profile; also used as the per-field fallback by
  // normalizeCloudVoiceConversationProfile.
  conversationProfile: {
    personality: "warm",
    tone: "professional",
    pacing: "concise",
    unclearAudio: "ask_repeat",
    confirmation: "critical_fields",
    escalation: "when_blocked",
    numberReadback: true,
  },
};
|
|
31
|
+
|
|
32
|
+
/** Numeric version of the cloud voice config format (literal type `2`). */
export const CLOUD_VOICE_CONFIG_VERSION = 2 as const;

/** Identifier string for the prompt compiler revision. */
export const CLOUD_VOICE_PROMPT_COMPILER_VERSION = "cloud-voice-provider-native-v2";
|
|
34
|
+
|
|
35
|
+
/**
 * Static capability matrix, one entry per supported voice provider.
 *
 * Flags that are absent from an entry are simply not set for that provider;
 * consumers should treat a missing flag as "not supported".
 */
export const CLOUD_VOICE_PROVIDER_CAPABILITIES: Record<CloudVoiceProvider, CloudVoiceRuntimeCapabilities> = {
  "openai-realtime": {
    canUpdateInstructionsLive: true,
    supportsSessionResume: false,
    supportsToolCalling: true,
    supportsOutputAudioTranscripts: true,
    supportsServerVadConfig: true,
    supportsSemanticVad: true,
  },

  "gemini-live": {
    canUpdateInstructionsLive: false,
    supportsSessionResume: true,
    supportsToolCalling: true,
    supportsOutputAudioTranscripts: true,
    supportsServerVadConfig: true,
    supportsNativeAudioOptions: true,
    supportsAffectiveDialog: true,
    supportsProactiveAudio: true,
  },

  "kognitive-voice": {
    canUpdateInstructionsLive: true,
    supportsSessionResume: true,
    supportsToolCalling: true,
    supportsOutputAudioTranscripts: true,
    supportsServerVadConfig: false,
    supportsPipelineEotConfig: true,
    supportsCartesiaTtsControls: true,
  },

  "xai-realtime": {
    canUpdateInstructionsLive: true,
    supportsSessionResume: false,
    supportsToolCalling: true,
    supportsOutputAudioTranscripts: true,
    supportsServerVadConfig: true,
  },
};
|
|
71
|
+
|
|
72
|
+
/**
 * Fallback pipeline description (transport, STT, LLM, TTS and turn settings).
 *
 * normalizeCloudVoicePipelineConfig and compileCartesiaTtsOptions read their
 * per-field defaults from here.
 */
const DEFAULT_KOGNITIVE_PIPELINE: CloudVoicePipelineConfig = {
  transport: {
    type: "websocket",
    provider: "kognitive-websocket",
  },
  stt: {
    provider: "deepgram",
    model: "nova-3",
    language: "en",
  },
  llm: {
    provider: "openai",
    model: "gpt-4o-mini",
  },
  tts: {
    provider: "cartesia",
    model: "sonic-3",
    // Opaque voice identifier passed through to the TTS provider.
    voice: "a0e99841-438c-4a64-b679-ae501e7d6091",
  },
  turn: {
    interruptResponse: true,
    createResponse: true,
    prefixPaddingMs: 300,
    silenceDurationMs: 650,
  },
};
|
|
84
|
+
|
|
85
|
+
/**
 * Lookup from a language tag (bare code or language-REGION) to the English
 * name of that language. Built from a grouped list so each language name is
 * written once; key insertion order follows the list order.
 */
const SPEECH_LANGUAGE_INSTRUCTIONS: Record<string, string> = (() => {
  const tagsByLanguage: ReadonlyArray<readonly [string, readonly string[]]> = [
    ["English", ["en", "en-US", "en-GB"]],
    ["Turkish", ["tr", "tr-TR"]],
    ["German", ["de", "de-DE"]],
    ["French", ["fr", "fr-FR"]],
    ["Spanish", ["es", "es-ES"]],
    ["Italian", ["it", "it-IT"]],
    ["Portuguese", ["pt", "pt-PT", "pt-BR"]],
    ["Dutch", ["nl", "nl-NL"]],
    ["Swedish", ["sv", "sv-SE"]],
    ["Norwegian", ["no", "nb-NO"]],
    ["Danish", ["da", "da-DK"]],
    ["Finnish", ["fi", "fi-FI"]],
    ["Polish", ["pl", "pl-PL"]],
    ["Czech", ["cs", "cs-CZ"]],
    ["Romanian", ["ro", "ro-RO"]],
    ["Greek", ["el", "el-GR"]],
    ["Russian", ["ru", "ru-RU"]],
    ["Ukrainian", ["uk", "uk-UA"]],
  ];

  const table: Record<string, string> = {};
  for (const [languageName, tags] of tagsByLanguage) {
    for (const tag of tags) {
      table[tag] = languageName;
    }
  }
  return table;
})();
|
|
125
|
+
|
|
126
|
+
/**
 * Default language-REGION code for each bare two-letter language code.
 * Consulted by resolveCloudVoiceSpeechLanguageCode when the configured
 * language carries no region.
 */
const SPEECH_LANGUAGE_CODES: Record<string, string> = Object.fromEntries([
  ["en", "en-US"],
  ["tr", "tr-TR"],
  ["de", "de-DE"],
  ["fr", "fr-FR"],
  ["es", "es-ES"],
  ["it", "it-IT"],
  ["pt", "pt-PT"],
  ["nl", "nl-NL"],
  ["sv", "sv-SE"],
  ["no", "nb-NO"],
  ["da", "da-DK"],
  ["fi", "fi-FI"],
  ["pl", "pl-PL"],
  ["cs", "cs-CZ"],
  ["ro", "ro-RO"],
  ["el", "el-GR"],
  ["ru", "ru-RU"],
  ["uk", "uk-UA"],
]);
|
|
146
|
+
|
|
147
|
+
/**
 * Language-REGION code implied by a named accent. Keys are lowercase;
 * resolveCloudVoiceSpeechLanguageCode lowercases the accent before lookup.
 */
const SPEECH_ACCENT_LANGUAGE_CODES: Record<string, string> = {
  // English
  "neutral english": "en-US",
  "british english": "en-GB",
  "irish english": "en-IE",
  "scottish english": "en-GB",

  // Turkish
  "standard istanbul turkish": "tr-TR",
  "aegean turkish": "tr-TR",
  "anatolian turkish": "tr-TR",

  // German
  "standard german": "de-DE",
  "austrian german": "de-AT",
  "swiss german influenced standard german": "de-CH",

  // French
  "standard french from france": "fr-FR",
  "belgian french": "fr-BE",
  "swiss french": "fr-CH",

  // Spanish
  "castilian spanish": "es-ES",
  "andalusian spanish": "es-ES",
  "canarian spanish": "es-ES",
};
|
|
165
|
+
|
|
166
|
+
/**
 * Reports whether the channel is one of the telephony channels
 * ("phone", "sip" or "outbound"). An omitted channel is not telephony.
 */
function isPhoneChannel(channel?: CloudVoiceChannel) {
  switch (channel) {
    case "phone":
    case "sip":
    case "outbound":
      return true;
    default:
      return false;
  }
}
|
|
169
|
+
|
|
170
|
+
/**
 * Coerces an arbitrary value into a fully populated humanization config.
 *
 * Each enumerated field is kept only when it equals one of the accepted
 * literals; anything else falls back to DEFAULT_CLOUD_VOICE_HUMANIZATION.
 *
 * @param value Untrusted input; it is passed through getRecord before use.
 * @returns A config with every field present.
 */
export function normalizeCloudVoiceHumanizationConfig(value: unknown): Required<CloudVoiceHumanizationConfig> {
  const input = getRecord(value);
  const fallback = DEFAULT_CLOUD_VOICE_HUMANIZATION;

  // Returns `candidate` when it is one of `allowed`, otherwise `otherwise`.
  const pick = <T>(candidate: unknown, allowed: readonly T[], otherwise: T): T =>
    (allowed as readonly unknown[]).includes(candidate) ? (candidate as T) : otherwise;

  // Finite numbers are rounded and clamped at zero; everything else uses the default.
  const rawFillerMs = input.toolLatencyFillerMs;
  const toolLatencyFillerMs =
    typeof rawFillerMs === "number" && Number.isFinite(rawFillerMs)
      ? Math.max(0, Math.round(rawFillerMs))
      : fallback.toolLatencyFillerMs;

  return {
    enabled: typeof input.enabled === "boolean" ? input.enabled : fallback.enabled,
    openingMode: pick(input.openingMode, ["wait", "auto"], fallback.openingMode),
    openingStyle: pick(input.openingStyle, ["warm", "professional", "brief"], fallback.openingStyle),
    fillerStyle: pick(input.fillerStyle, ["off", "natural", "light"], fallback.fillerStyle),
    backchannelFrequency: pick(input.backchannelFrequency, ["off", "medium", "low"], fallback.backchannelFrequency),
    disfluency: pick(input.disfluency, ["off", "rare"], fallback.disfluency),
    toolLatencyFillerMs,
    // `profile` is accepted as an alternate key; the whole record is handed
    // over so legacy top-level fields can influence the profile defaults.
    conversationProfile: normalizeCloudVoiceConversationProfile(input.conversationProfile ?? input.profile, input),
  };
}
|
|
201
|
+
|
|
202
|
+
/**
 * Coerces an arbitrary value into a complete conversation profile.
 *
 * Fields that are not one of the accepted literals are derived from the
 * legacy humanization fields where a mapping exists, and otherwise from
 * DEFAULT_CLOUD_VOICE_HUMANIZATION.conversationProfile.
 *
 * @param value  Untrusted profile input.
 * @param legacy Record whose `openingStyle`, `fillerStyle` and
 *               `backchannelFrequency` seed the personality, tone and pacing
 *               fallbacks.
 */
export function normalizeCloudVoiceConversationProfile(value: unknown, legacy: Record<string, unknown> = {}): CloudVoiceConversationProfile {
  const profile = getRecord(value);
  const legacyOpening = getString(legacy.openingStyle, "");
  const legacyFiller = getString(legacy.fillerStyle, "");
  const legacyBackchannel = getString(legacy.backchannelFrequency, "");
  const base = DEFAULT_CLOUD_VOICE_HUMANIZATION.conversationProfile;

  // Returns `candidate` when it is one of `allowed`, otherwise `otherwise`.
  const pick = <T>(candidate: unknown, allowed: readonly T[], otherwise: T): T =>
    (allowed as readonly unknown[]).includes(candidate) ? (candidate as T) : otherwise;

  const prefersMeasuredPacing = legacyFiller === "natural" || legacyBackchannel === "medium";

  return {
    personality: pick(
      profile.personality,
      ["neutral", "warm", "expert", "concierge"],
      legacyOpening === "professional" ? "expert" : "warm",
    ),
    tone: pick(
      profile.tone,
      ["casual", "professional", "empathetic", "polished"],
      legacyOpening === "warm" ? "empathetic" : base.tone,
    ),
    pacing: pick(
      profile.pacing,
      ["concise", "measured", "deliberate", "energetic"],
      prefersMeasuredPacing ? "measured" : base.pacing,
    ),
    unclearAudio: pick(profile.unclearAudio, ["ask_repeat", "confirm_best_guess"], base.unclearAudio),
    confirmation: pick(profile.confirmation, ["critical_fields", "all_actions", "minimal"], base.confirmation),
    escalation: pick(profile.escalation, ["when_blocked", "on_request", "never"], base.escalation),
    numberReadback: typeof profile.numberReadback === "boolean" ? profile.numberReadback : base.numberReadback,
  };
}
|
|
237
|
+
|
|
238
|
+
/**
 * Looks up the capability flags for a provider.
 *
 * @returns The provider's entry, or the "openai-realtime" entry when the
 *          lookup yields null or undefined.
 */
export function getCloudVoiceProviderCapabilities(provider: CloudVoiceProvider): CloudVoiceRuntimeCapabilities {
  const capabilities = CLOUD_VOICE_PROVIDER_CAPABILITIES[provider];
  if (capabilities !== undefined && capabilities !== null) {
    return capabilities;
  }
  return CLOUD_VOICE_PROVIDER_CAPABILITIES["openai-realtime"];
}
|
|
241
|
+
|
|
242
|
+
/**
 * Extracts and cleans the `speech` section of an agent's metadata.
 *
 * The known string fields (language, accent, style, pace, emotion) are kept
 * only when they are non-blank strings, and are trimmed. Every other defined
 * key is passed through unchanged.
 *
 * @param config Object carrying the agent `metadata`.
 * @returns The cleaned speech config, or `undefined` when nothing remains.
 */
export function normalizeSpeechConfig(config: Pick<CloudVoiceAgentConfig, "metadata">): CloudVoiceSpeechConfig | undefined {
  const speech = getRecord(getRecord(config.metadata).speech);
  const normalized: CloudVoiceSpeechConfig = {};
  const knownKeys = ["language", "accent", "style", "pace", "emotion"] as const;

  for (const key of knownKeys) {
    const value = speech[key];
    if (typeof value === "string" && value.trim()) normalized[key] = value.trim();
  }

  for (const [key, value] of Object.entries(speech)) {
    if ((knownKeys as readonly string[]).includes(key) || value === undefined) continue;
    // An own "__proto__" key (possible after JSON.parse) must not be assigned:
    // doing so would invoke the Object.prototype setter and replace the
    // prototype of `normalized`, letting unvalidated inherited fields such as
    // a non-string `accent` leak through to callers.
    if (key === "__proto__") continue;
    normalized[key] = value;
  }

  return Object.keys(normalized).length > 0 ? normalized : undefined;
}
|
|
256
|
+
|
|
257
|
+
/**
 * Translates a language tag into the language name stored in
 * SPEECH_LANGUAGE_INSTRUCTIONS.
 *
 * Lookup is tried with the value as given and then with canonical casing
 * (lowercase language, uppercase region), so "EN", "en-us" and "EN-US" all
 * resolve. Only the table's own keys are consulted, so inputs such as
 * "constructor" can never return an inherited Object.prototype member.
 *
 * @param value Language tag or free-form language text.
 * @returns The mapped name, the input itself when there is no mapping, or
 *          `undefined` for an empty or missing value.
 */
export function toLanguageInstruction(value: string | undefined) {
  if (!value) return undefined;

  const [language = "", region, ...extraSubtags] = value.split("-");
  const canonical =
    region !== undefined && extraSubtags.length === 0
      ? `${language.toLowerCase()}-${region.toUpperCase()}`
      : value.toLowerCase();

  for (const candidate of [value, canonical]) {
    if (Object.prototype.hasOwnProperty.call(SPEECH_LANGUAGE_INSTRUCTIONS, candidate)) {
      return SPEECH_LANGUAGE_INSTRUCTIONS[candidate];
    }
  }
  return value;
}
|
|
260
|
+
|
|
261
|
+
/**
 * Resolves the language-REGION code for the agent's speech settings.
 *
 * Resolution order:
 *   1. a known accent (case-insensitive) from SPEECH_ACCENT_LANGUAGE_CODES;
 *   2. a language that already has the language-REGION shape, returned with
 *      canonical casing (e.g. "en-us" becomes "en-US");
 *   3. a bare language code mapped through SPEECH_LANGUAGE_CODES;
 *   4. the configured language unchanged.
 *
 * Table lookups only consider own keys, so values such as "constructor"
 * cannot resolve to an inherited Object.prototype member.
 *
 * @returns The resolved code, or `undefined` when no language is configured
 *          and the accent is unknown.
 */
export function resolveCloudVoiceSpeechLanguageCode(config: Pick<CloudVoiceAgentConfig, "metadata">) {
  const lookupOwn = (table: Record<string, string>, key: string): string | undefined =>
    Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;

  const speech = normalizeSpeechConfig(config);
  const accentCode = speech?.accent ? lookupOwn(SPEECH_ACCENT_LANGUAGE_CODES, speech.accent.toLowerCase()) : undefined;
  if (accentCode) return accentCode;

  const language = speech?.language;
  if (!language) return undefined;

  const regionTag = /^([a-z]{2,3})-([a-z]{2})$/i.exec(language);
  if (regionTag) {
    const [, languagePart = "", regionPart = ""] = regionTag;
    return `${languagePart.toLowerCase()}-${regionPart.toUpperCase()}`;
  }

  return lookupOwn(SPEECH_LANGUAGE_CODES, language) ?? lookupOwn(SPEECH_LANGUAGE_CODES, language.toLowerCase()) ?? language;
}
|
|
270
|
+
|
|
271
|
+
/**
 * Derives the transcription language from the speech settings: the part of
 * the configured language before the first "-", or the whole value when it
 * contains no "-".
 *
 * @returns The language, or `undefined` when none is configured.
 */
function resolveCloudVoiceTranscriptionLanguage(config: Pick<CloudVoiceAgentConfig, "metadata">) {
  const language = normalizeSpeechConfig(config)?.language;
  if (!language) return undefined;

  const dashIndex = language.indexOf("-");
  return dashIndex === -1 ? language : language.slice(0, dashIndex);
}
|
|
277
|
+
|
|
278
|
+
/**
 * Resolves the pipeline transport type and provider.
 *
 * The type comes from `pipeline.transport.type` when it is "webrtc" or
 * "websocket", otherwise from `config.transport`. The provider comes from
 * `pipeline.transport.provider` when it is "daily" or "kognitive-websocket";
 * otherwise "daily" is chosen for webrtc and "kognitive-websocket" for
 * anything else.
 */
function normalizePipelineTransport(config: CloudVoiceAgentConfig): CloudVoicePipelineConfig["transport"] {
  const { type: requestedType, provider: requestedProvider } = getRecord(config.pipeline?.transport);

  const type = requestedType === "webrtc" || requestedType === "websocket" ? requestedType : config.transport;

  if (requestedProvider === "daily" || requestedProvider === "kognitive-websocket") {
    return { type, provider: requestedProvider };
  }
  return { type, provider: type === "webrtc" ? "daily" : "kognitive-websocket" };
}
|
|
290
|
+
|
|
291
|
+
/**
 * Builds a complete pipeline config from the agent config, filling any gaps
 * from DEFAULT_KOGNITIVE_PIPELINE.
 *
 * - STT language: an explicit non-blank `pipeline.stt.language` wins,
 *   otherwise the language derived from the speech metadata; it is omitted
 *   when neither exists.
 * - LLM model: `pipeline.llm.model`, then `config.model`, then the default.
 * - TTS voice: `pipeline.tts.voice`, then `config.voice` unless it is
 *   "alloy" (presumably a placeholder from another provider; confirm),
 *   then the default voice.
 * - Turn settings: defaults overlaid with `pipeline.turn`. The result is
 *   always a fresh object, so callers can never mutate the shared default.
 *
 * @param config Agent configuration to normalize.
 * @returns A pipeline config with transport, stt, llm, tts and turn populated.
 */
export function normalizeCloudVoicePipelineConfig(config: CloudVoiceAgentConfig): CloudVoicePipelineConfig {
  const pipeline = getRecord(config.pipeline);
  const stt = getRecord(pipeline.stt);
  const llm = getRecord(pipeline.llm);
  const tts = getRecord(pipeline.tts);
  const ttsOptions = getRecord(tts.options);
  const turn = getRecord(pipeline.turn);
  const backgroundAudio = getRecord(pipeline.backgroundAudio);
  const sttLanguage = typeof stt.language === "string" && stt.language.trim()
    ? stt.language.trim()
    : resolveCloudVoiceTranscriptionLanguage(config);

  return {
    transport: normalizePipelineTransport(config),
    stt: {
      provider: getString(stt.provider, DEFAULT_KOGNITIVE_PIPELINE.stt.provider),
      model: getString(stt.model, DEFAULT_KOGNITIVE_PIPELINE.stt.model),
      ...(sttLanguage ? { language: sttLanguage } : {}),
    },
    llm: {
      provider: getString(llm.provider, DEFAULT_KOGNITIVE_PIPELINE.llm.provider),
      model: getString(llm.model, config.model || DEFAULT_KOGNITIVE_PIPELINE.llm.model),
    },
    tts: {
      provider: getString(tts.provider, DEFAULT_KOGNITIVE_PIPELINE.tts.provider),
      model: getString(tts.model, DEFAULT_KOGNITIVE_PIPELINE.tts.model),
      voice: getString(tts.voice, config.voice && config.voice !== "alloy" ? config.voice : DEFAULT_KOGNITIVE_PIPELINE.tts.voice),
      ...(Object.keys(ttsOptions).length > 0 ? { options: ttsOptions } : {}),
    },
    // Spreading an empty override record yields a copy of the defaults, so no
    // branch is needed and the module-level constant is never handed out.
    turn: { ...DEFAULT_KOGNITIVE_PIPELINE.turn, ...turn },
    ...(Object.keys(backgroundAudio).length > 0 ? { backgroundAudio } : {}),
  };
}
|
|
325
|
+
|
|
326
|
+
/**
 * Prepares the transcription settings for a session.
 *
 * @returns `null` when transcription is explicitly `null`; otherwise the
 *          transcription record, with `language` set from the speech
 *          metadata whenever a language can be derived from it.
 */
export function toPreparedTranscription(config: Pick<CloudVoiceAgentConfig, "metadata" | "transcription">) {
  if (config.transcription === null) {
    return null;
  }

  const base = getRecord(config.transcription);
  const language = resolveCloudVoiceTranscriptionLanguage(config);
  if (!language) {
    return base;
  }
  // The derived language replaces any `language` already on the record.
  return { ...base, language };
}
|
|
334
|
+
|
|
335
|
+
/**
 * Converts a turn-detection config into the snake_case shape.
 *
 * For each aliased option the snake_case key is preferred; the camelCase key
 * is used only when the snake_case one does not have the expected type.
 *
 * @returns `null` for a `null` input, `undefined` when no non-empty string
 *          `type` is present, otherwise the converted record.
 */
export function toOpenAITurnDetection(value: unknown) {
  if (value === null) return null;

  const source = getRecord(value);
  if (typeof source.type !== "string" || !source.type) return undefined;

  const wire: Record<string, unknown> = { type: source.type };

  // [snake_case key, camelCase alias, required typeof]
  const aliasedOptions = [
    ["create_response", "createResponse", "boolean"],
    ["interrupt_response", "interruptResponse", "boolean"],
    ["prefix_padding_ms", "prefixPaddingMs", "number"],
    ["silence_duration_ms", "silenceDurationMs", "number"],
  ] as const;

  for (const [snakeKey, camelKey, expectedType] of aliasedOptions) {
    const chosen = [source[snakeKey], source[camelKey]].find((candidate) => typeof candidate === expectedType);
    if (chosen !== undefined) wire[snakeKey] = chosen;
  }

  if (typeof source.threshold === "number") wire.threshold = source.threshold;
  if (typeof source.eagerness === "string") wire.eagerness = source.eagerness;

  return wire;
}
|
|
370
|
+
|
|
371
|
+
/**
 * Produces provider-specific turn-detection settings with defaults applied.
 *
 * - `null` input returns `null`.
 * - "openai-realtime" on a non-phone channel: type "off" returns `null`,
 *   "semantic_vad" (the default type) returns semantic-VAD settings, and any
 *   other type falls through to the generic shape at the end.
 * - "gemini-live": server-VAD settings with sensitivity fields.
 * - "kognitive-voice": "flux_eot" thresholds.
 * - Everything else: generic server-VAD style settings.
 */
export function normalizeCloudVoiceTurnDetection(
  provider: CloudVoiceProvider,
  channel: CloudVoiceChannel,
  value: unknown,
) {
  if (value === null) return null;

  const input = getRecord(value);
  const onPhone = isPhoneChannel(channel);
  const flagOr = (candidate: unknown, fallback: boolean): boolean =>
    typeof candidate === "boolean" ? candidate : fallback;
  const numberOr = (candidate: unknown, fallback: number): number =>
    typeof candidate === "number" ? candidate : fallback;

  if (provider === "openai-realtime" && !onPhone) {
    const requestedType = getString(input.type, "semantic_vad");
    if (requestedType === "off") return null;
    if (requestedType === "semantic_vad") {
      return {
        type: "semantic_vad",
        createResponse: flagOr(input.createResponse, true),
        interruptResponse: flagOr(input.interruptResponse, true),
        eagerness: getString(input.eagerness, "low") || "low",
      };
    }
    // Other types fall through to the generic shape below.
  }

  if (provider === "gemini-live") {
    return {
      type: "server_vad",
      createResponse: flagOr(input.createResponse, true),
      interruptResponse: flagOr(input.interruptResponse, true),
      prefixPaddingMs: numberOr(input.prefixPaddingMs, 120),
      silenceDurationMs: numberOr(input.silenceDurationMs, 700),
      startOfSpeechSensitivity: getString(input.startOfSpeechSensitivity, "START_SENSITIVITY_LOW"),
      endOfSpeechSensitivity: getString(input.endOfSpeechSensitivity, "END_SENSITIVITY_LOW"),
      disabled: flagOr(input.disabled, false),
    };
  }

  if (provider === "kognitive-voice") {
    return {
      type: "flux_eot",
      eager_eot_threshold: numberOr(input.eager_eot_threshold, 0.85),
      eot_threshold: numberOr(input.eot_threshold, 0.65),
      eot_timeout_ms: numberOr(input.eot_timeout_ms, 900),
    };
  }

  return {
    type: getString(input.type, "server_vad") || "server_vad",
    createResponse: flagOr(input.createResponse, true),
    interruptResponse: flagOr(input.interruptResponse, true),
    threshold: numberOr(input.threshold, 0.6),
    prefixPaddingMs: numberOr(input.prefixPaddingMs, 300),
    silenceDurationMs: numberOr(input.silenceDurationMs, 700),
  };
}
|
|
420
|
+
|
|
421
|
+
/**
 * Resolves the input noise-reduction setting.
 *
 * @returns `null` when the input is `null`; the caller's record when it has
 *          at least one key; `{ type: "near_field" }` for "openai-realtime"
 *          on a phone channel; otherwise `null`.
 */
export function normalizeCloudVoiceInputNoiseReduction(
  provider: CloudVoiceProvider,
  channel: CloudVoiceChannel,
  value: unknown,
) {
  if (value === null) return null;

  const explicit = getRecord(value);
  const hasExplicitSettings = Object.keys(explicit).length > 0;
  if (hasExplicitSettings) return explicit;

  const usePhoneDefault = provider === "openai-realtime" && isPhoneChannel(channel);
  return usePhoneDefault ? { type: "near_field" } : null;
}
|
|
436
|
+
|
|
437
|
+
/**
 * Assembles the option bag for the Cartesia TTS provider.
 *
 * Sources, in priority order:
 * - pipeline: `config.pipeline`, else `providerOptions.pipeline`;
 * - Cartesia overrides: `tts.options`, else `tts.providerOptions`, else
 *   `providerOptions.cartesia`;
 * - speed: an explicit numeric or "slow" | "normal" | "fast" override, else
 *   inferred from the speech `pace` text, else "normal".
 *
 * @returns A record with `modelId`, `voice` and `speed`, plus `emotion`,
 *          `pronunciationDictId` and `contextMode` when available.
 */
export function compileCartesiaTtsOptions(config: Pick<CloudVoiceAgentConfig, "voice" | "metadata" | "providerOptions" | "pipeline">) {
  const providerOptions = getRecord(config.providerOptions);
  const tts = getRecord(getRecord(config.pipeline ?? providerOptions.pipeline).tts);
  const cartesia = getRecord(tts.options ?? tts.providerOptions ?? providerOptions.cartesia);
  const speech = normalizeSpeechConfig(config);

  // "alloy" is never forwarded as the voice; presumably it is a placeholder
  // from another provider (confirm against callers).
  const fallbackVoice = config.voice && config.voice !== "alloy" ? config.voice : DEFAULT_KOGNITIVE_PIPELINE.tts.voice;
  const options: Record<string, unknown> = {
    modelId: getString(tts.model, getString(cartesia.modelId, "sonic-3")),
    voice: getString(tts.voice, fallbackVoice),
  };

  const pace = getString(speech?.pace, "").toLowerCase();
  const explicitSpeed = cartesia.speed;
  if (typeof explicitSpeed === "number" || explicitSpeed === "slow" || explicitSpeed === "normal" || explicitSpeed === "fast") {
    options.speed = explicitSpeed;
  } else if (pace.includes("slow")) {
    options.speed = "slow";
  } else if (pace.includes("fast") || pace.includes("energetic")) {
    options.speed = "fast";
  } else {
    options.speed = "normal";
  }

  const emotion = getString(speech?.emotion, "");
  if (emotion) {
    options.emotion = [emotion];
  }

  const dictId = cartesia.pronunciationDictId;
  if (typeof dictId === "string" && dictId.trim()) {
    options.pronunciationDictId = dictId.trim();
  }

  const contextMode = cartesia.contextMode;
  if (contextMode === "reset" || contextMode === "continue") {
    options.contextMode = contextMode;
  }

  return options;
}
|