@absolutejs/voice 0.0.22-beta.545 → 0.0.22-beta.547
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/agent.d.ts +2 -0
- package/dist/core/types.d.ts +7 -2
- package/dist/index.js +437 -230
- package/dist/testing/index.js +373 -250
- package/package.json +154 -154
package/dist/testing/index.js
CHANGED
|
@@ -4036,89 +4036,6 @@ var createVoiceProviderOrchestrationProfile = (options) => {
|
|
|
4036
4036
|
}
|
|
4037
4037
|
};
|
|
4038
4038
|
};
|
|
4039
|
-
var OUTPUT_SCHEMA = {
|
|
4040
|
-
additionalProperties: false,
|
|
4041
|
-
properties: {
|
|
4042
|
-
assistantText: {
|
|
4043
|
-
type: "string"
|
|
4044
|
-
},
|
|
4045
|
-
complete: {
|
|
4046
|
-
type: "boolean"
|
|
4047
|
-
},
|
|
4048
|
-
escalate: {
|
|
4049
|
-
additionalProperties: false,
|
|
4050
|
-
properties: {
|
|
4051
|
-
metadata: {
|
|
4052
|
-
additionalProperties: true,
|
|
4053
|
-
type: "object"
|
|
4054
|
-
},
|
|
4055
|
-
reason: {
|
|
4056
|
-
type: "string"
|
|
4057
|
-
}
|
|
4058
|
-
},
|
|
4059
|
-
required: ["reason"],
|
|
4060
|
-
type: "object"
|
|
4061
|
-
},
|
|
4062
|
-
noAnswer: {
|
|
4063
|
-
additionalProperties: false,
|
|
4064
|
-
properties: {
|
|
4065
|
-
metadata: {
|
|
4066
|
-
additionalProperties: true,
|
|
4067
|
-
type: "object"
|
|
4068
|
-
}
|
|
4069
|
-
},
|
|
4070
|
-
type: "object"
|
|
4071
|
-
},
|
|
4072
|
-
result: {
|
|
4073
|
-
additionalProperties: true,
|
|
4074
|
-
type: "object"
|
|
4075
|
-
},
|
|
4076
|
-
transfer: {
|
|
4077
|
-
additionalProperties: false,
|
|
4078
|
-
properties: {
|
|
4079
|
-
metadata: {
|
|
4080
|
-
additionalProperties: true,
|
|
4081
|
-
type: "object"
|
|
4082
|
-
},
|
|
4083
|
-
reason: {
|
|
4084
|
-
type: "string"
|
|
4085
|
-
},
|
|
4086
|
-
target: {
|
|
4087
|
-
type: "string"
|
|
4088
|
-
}
|
|
4089
|
-
},
|
|
4090
|
-
required: ["target"],
|
|
4091
|
-
type: "object"
|
|
4092
|
-
},
|
|
4093
|
-
voicemail: {
|
|
4094
|
-
additionalProperties: false,
|
|
4095
|
-
properties: {
|
|
4096
|
-
metadata: {
|
|
4097
|
-
additionalProperties: true,
|
|
4098
|
-
type: "object"
|
|
4099
|
-
}
|
|
4100
|
-
},
|
|
4101
|
-
type: "object"
|
|
4102
|
-
}
|
|
4103
|
-
},
|
|
4104
|
-
type: "object"
|
|
4105
|
-
};
|
|
4106
|
-
var ROUTE_RESULT_INSTRUCTION = "Return only a JSON object with assistantText, complete, transfer, escalate, voicemail, noAnswer, and result when you are not calling tools. Only set transfer, escalate, voicemail, or noAnswer when the user explicitly asks for that lifecycle outcome or a tool result says that exact outcome. Do not infer voicemail from generic words like voice, voice app, or voice integration.";
|
|
4107
|
-
var stripJSONCodeFence = (value) => {
|
|
4108
|
-
const trimmed = value.trim();
|
|
4109
|
-
const match = trimmed.match(/^```(?:json)?\s*([\s\S]*?)\s*```$/i);
|
|
4110
|
-
return match?.[1]?.trim() ?? value;
|
|
4111
|
-
};
|
|
4112
|
-
var parseJSON = (value) => {
|
|
4113
|
-
try {
|
|
4114
|
-
const parsed = JSON.parse(stripJSONCodeFence(value));
|
|
4115
|
-
return parsed && typeof parsed === "object" ? parsed : {};
|
|
4116
|
-
} catch {
|
|
4117
|
-
return {
|
|
4118
|
-
assistantText: value
|
|
4119
|
-
};
|
|
4120
|
-
}
|
|
4121
|
-
};
|
|
4122
4039
|
var parseJSONValue = (value) => {
|
|
4123
4040
|
try {
|
|
4124
4041
|
return JSON.parse(value);
|
|
@@ -4589,48 +4506,95 @@ var messageToGeminiContent = (message) => {
|
|
|
4589
4506
|
role: message.role === "assistant" ? "model" : "user"
|
|
4590
4507
|
};
|
|
4591
4508
|
};
|
|
4592
|
-
var
|
|
4593
|
-
|
|
4594
|
-
|
|
4509
|
+
var VOICE_SYSTEM_INSTRUCTIONS = "You are on a live phone call. Reply with natural, concise spoken sentences \u2014 no markdown, lists, headings, or emoji. To take an action (transfer the call, escalate, record voicemail/no-answer, or end the call), CALL the matching tool rather than describing it in words. Call the complete tool once the conversation's goal is met.";
|
|
4510
|
+
var parseToolArgs = (raw) => {
|
|
4511
|
+
if (!raw.trim()) {
|
|
4512
|
+
return {};
|
|
4595
4513
|
}
|
|
4596
|
-
|
|
4597
|
-
|
|
4598
|
-
|
|
4599
|
-
|
|
4600
|
-
}
|
|
4601
|
-
const record = item;
|
|
4602
|
-
const content = Array.isArray(record.content) ? record.content : [];
|
|
4603
|
-
for (const contentItem of content) {
|
|
4604
|
-
if (!contentItem || typeof contentItem !== "object") {
|
|
4605
|
-
continue;
|
|
4606
|
-
}
|
|
4607
|
-
const contentRecord = contentItem;
|
|
4608
|
-
if (typeof contentRecord.text === "string") {
|
|
4609
|
-
return contentRecord.text;
|
|
4610
|
-
}
|
|
4611
|
-
}
|
|
4514
|
+
try {
|
|
4515
|
+
const parsed = JSON.parse(raw);
|
|
4516
|
+
return parsed && typeof parsed === "object" ? parsed : {};
|
|
4517
|
+
} catch {
|
|
4518
|
+
return {};
|
|
4612
4519
|
}
|
|
4613
|
-
return "";
|
|
4614
4520
|
};
|
|
4615
|
-
var
|
|
4616
|
-
const
|
|
4617
|
-
|
|
4618
|
-
|
|
4619
|
-
|
|
4620
|
-
|
|
4521
|
+
var readServerSentEvents = async (response, onEvent) => {
|
|
4522
|
+
const reader = response.body?.getReader();
|
|
4523
|
+
if (!reader) {
|
|
4524
|
+
throw new Error("streaming response has no body");
|
|
4525
|
+
}
|
|
4526
|
+
const decoder = new TextDecoder;
|
|
4527
|
+
let buffer = "";
|
|
4528
|
+
const drain = (block) => {
|
|
4529
|
+
for (const line of block.split(`
|
|
4530
|
+
`)) {
|
|
4531
|
+
const trimmed = line.trimStart();
|
|
4532
|
+
if (!trimmed.startsWith("data:"))
|
|
4533
|
+
continue;
|
|
4534
|
+
const data = trimmed.slice("data:".length).trim();
|
|
4535
|
+
if (!data || data === "[DONE]")
|
|
4536
|
+
continue;
|
|
4537
|
+
try {
|
|
4538
|
+
onEvent(JSON.parse(data));
|
|
4539
|
+
} catch {}
|
|
4621
4540
|
}
|
|
4622
|
-
|
|
4623
|
-
|
|
4624
|
-
|
|
4541
|
+
};
|
|
4542
|
+
for (;; ) {
|
|
4543
|
+
const { done, value } = await reader.read();
|
|
4544
|
+
if (done)
|
|
4545
|
+
break;
|
|
4546
|
+
buffer += decoder.decode(value, { stream: true });
|
|
4547
|
+
let separator = buffer.indexOf(`
|
|
4548
|
+
|
|
4549
|
+
`);
|
|
4550
|
+
while (separator !== -1) {
|
|
4551
|
+
drain(buffer.slice(0, separator));
|
|
4552
|
+
buffer = buffer.slice(separator + 2);
|
|
4553
|
+
separator = buffer.indexOf(`
|
|
4554
|
+
|
|
4555
|
+
`);
|
|
4625
4556
|
}
|
|
4626
|
-
const args = typeof record.arguments === "string" ? parseJSON(record.arguments) : {};
|
|
4627
|
-
toolCalls.push({
|
|
4628
|
-
args,
|
|
4629
|
-
id: typeof record.call_id === "string" ? record.call_id : typeof record.id === "string" ? record.id : undefined,
|
|
4630
|
-
name: record.name
|
|
4631
|
-
});
|
|
4632
4557
|
}
|
|
4633
|
-
|
|
4558
|
+
if (buffer.trim())
|
|
4559
|
+
drain(buffer);
|
|
4560
|
+
};
|
|
4561
|
+
var finalizeToolCalls = (calls) => [...calls.values()].filter((call) => call.name).map((call) => ({
|
|
4562
|
+
args: parseToolArgs(call.args),
|
|
4563
|
+
id: call.id,
|
|
4564
|
+
name: call.name
|
|
4565
|
+
}));
|
|
4566
|
+
var consumeOpenAIResponsesStream = async (response, onTextDelta) => {
|
|
4567
|
+
let assistantText = "";
|
|
4568
|
+
let usage;
|
|
4569
|
+
const calls = new Map;
|
|
4570
|
+
await readServerSentEvents(response, (event) => {
|
|
4571
|
+
const type = typeof event.type === "string" ? event.type : "";
|
|
4572
|
+
const item = event.item;
|
|
4573
|
+
if (type === "response.output_text.delta" && typeof event.delta === "string") {
|
|
4574
|
+
assistantText += event.delta;
|
|
4575
|
+
onTextDelta?.(event.delta);
|
|
4576
|
+
} else if (type === "response.output_item.added" && item?.type === "function_call") {
|
|
4577
|
+
calls.set(String(item.id ?? item.call_id ?? ""), {
|
|
4578
|
+
args: typeof item.arguments === "string" ? item.arguments : "",
|
|
4579
|
+
id: typeof item.call_id === "string" ? item.call_id : item.id,
|
|
4580
|
+
name: typeof item.name === "string" ? item.name : ""
|
|
4581
|
+
});
|
|
4582
|
+
} else if (type === "response.function_call_arguments.delta" && typeof event.delta === "string") {
|
|
4583
|
+
const entry = calls.get(String(event.item_id ?? ""));
|
|
4584
|
+
if (entry)
|
|
4585
|
+
entry.args += event.delta;
|
|
4586
|
+
} else if (type === "response.output_item.done" && item?.type === "function_call" && typeof item.arguments === "string" && item.arguments) {
|
|
4587
|
+
const entry = calls.get(String(item.id ?? item.call_id ?? ""));
|
|
4588
|
+
if (entry)
|
|
4589
|
+
entry.args = item.arguments;
|
|
4590
|
+
} else if (type === "response.completed") {
|
|
4591
|
+
const completed = event.response;
|
|
4592
|
+
if (completed?.usage && typeof completed.usage === "object") {
|
|
4593
|
+
usage = completed.usage;
|
|
4594
|
+
}
|
|
4595
|
+
}
|
|
4596
|
+
});
|
|
4597
|
+
return { assistantText, toolCalls: finalizeToolCalls(calls), usage };
|
|
4634
4598
|
};
|
|
4635
4599
|
var createOpenAIVoiceAssistantModel = (options) => {
|
|
4636
4600
|
const fetchImpl = options.fetch ?? globalThis.fetch;
|
|
@@ -4641,23 +4605,13 @@ var createOpenAIVoiceAssistantModel = (options) => {
|
|
|
4641
4605
|
const response = await fetchImpl(`${baseUrl.replace(/\/$/, "")}/responses`, {
|
|
4642
4606
|
body: JSON.stringify({
|
|
4643
4607
|
input: messagesToOpenAIInput(input.messages),
|
|
4644
|
-
instructions: [
|
|
4645
|
-
input.system,
|
|
4646
|
-
"Return a JSON object with assistantText, complete, transfer, escalate, voicemail, noAnswer, and result when you are not calling tools."
|
|
4647
|
-
].filter(Boolean).join(`
|
|
4608
|
+
instructions: [input.system, VOICE_SYSTEM_INSTRUCTIONS].filter(Boolean).join(`
|
|
4648
4609
|
|
|
4649
4610
|
`),
|
|
4650
4611
|
max_output_tokens: options.maxOutputTokens,
|
|
4651
4612
|
model,
|
|
4613
|
+
stream: true,
|
|
4652
4614
|
temperature: options.temperature,
|
|
4653
|
-
text: {
|
|
4654
|
-
format: {
|
|
4655
|
-
name: "voice_route_result",
|
|
4656
|
-
schema: OUTPUT_SCHEMA,
|
|
4657
|
-
strict: false,
|
|
4658
|
-
type: "json_schema"
|
|
4659
|
-
}
|
|
4660
|
-
},
|
|
4661
4615
|
tool_choice: input.tools.length ? "auto" : "none",
|
|
4662
4616
|
tools: input.tools.map((tool) => ({
|
|
4663
4617
|
description: tool.description,
|
|
@@ -4671,6 +4625,7 @@ var createOpenAIVoiceAssistantModel = (options) => {
|
|
|
4671
4625
|
}))
|
|
4672
4626
|
}),
|
|
4673
4627
|
headers: {
|
|
4628
|
+
accept: "text/event-stream",
|
|
4674
4629
|
authorization: `Bearer ${options.apiKey}`,
|
|
4675
4630
|
"content-type": "application/json"
|
|
4676
4631
|
},
|
|
@@ -4679,43 +4634,52 @@ var createOpenAIVoiceAssistantModel = (options) => {
|
|
|
4679
4634
|
if (!response.ok) {
|
|
4680
4635
|
throw createHTTPError("OpenAI", response);
|
|
4681
4636
|
}
|
|
4682
|
-
const
|
|
4683
|
-
if (
|
|
4684
|
-
await options.onUsage?.(
|
|
4685
|
-
}
|
|
4686
|
-
const toolCalls = extractToolCalls(body);
|
|
4687
|
-
if (toolCalls.length) {
|
|
4688
|
-
return {
|
|
4689
|
-
toolCalls
|
|
4690
|
-
};
|
|
4637
|
+
const { assistantText, toolCalls, usage } = await consumeOpenAIResponsesStream(response, input.onTextDelta);
|
|
4638
|
+
if (usage) {
|
|
4639
|
+
await options.onUsage?.(usage);
|
|
4691
4640
|
}
|
|
4692
|
-
return
|
|
4641
|
+
return {
|
|
4642
|
+
...assistantText ? { assistantText } : {},
|
|
4643
|
+
...toolCalls.length ? { toolCalls } : {}
|
|
4644
|
+
};
|
|
4693
4645
|
}
|
|
4694
4646
|
};
|
|
4695
4647
|
};
|
|
4696
|
-
var
|
|
4697
|
-
|
|
4698
|
-
|
|
4699
|
-
|
|
4700
|
-
|
|
4701
|
-
|
|
4702
|
-
|
|
4703
|
-
|
|
4704
|
-
|
|
4705
|
-
|
|
4706
|
-
|
|
4707
|
-
|
|
4708
|
-
|
|
4709
|
-
|
|
4710
|
-
|
|
4648
|
+
var consumeAnthropicStream = async (response, onTextDelta) => {
|
|
4649
|
+
let assistantText = "";
|
|
4650
|
+
let usage;
|
|
4651
|
+
const calls = new Map;
|
|
4652
|
+
await readServerSentEvents(response, (event) => {
|
|
4653
|
+
const type = typeof event.type === "string" ? event.type : "";
|
|
4654
|
+
const delta = event.delta;
|
|
4655
|
+
if (type === "content_block_delta" && delta?.type === "text_delta") {
|
|
4656
|
+
if (typeof delta.text === "string") {
|
|
4657
|
+
assistantText += delta.text;
|
|
4658
|
+
onTextDelta?.(delta.text);
|
|
4659
|
+
}
|
|
4660
|
+
} else if (type === "content_block_delta" && delta?.type === "input_json_delta" && typeof delta.partial_json === "string") {
|
|
4661
|
+
const entry = calls.get(String(event.index ?? ""));
|
|
4662
|
+
if (entry)
|
|
4663
|
+
entry.args += delta.partial_json;
|
|
4664
|
+
} else if (type === "content_block_start") {
|
|
4665
|
+
const block = event.content_block;
|
|
4666
|
+
if (block?.type === "tool_use") {
|
|
4667
|
+
calls.set(String(event.index ?? ""), {
|
|
4668
|
+
args: "",
|
|
4669
|
+
id: typeof block.id === "string" ? block.id : undefined,
|
|
4670
|
+
name: typeof block.name === "string" ? block.name : ""
|
|
4671
|
+
});
|
|
4672
|
+
}
|
|
4673
|
+
} else if (type === "message_start") {
|
|
4674
|
+
const message = event.message;
|
|
4675
|
+
if (message?.usage && typeof message.usage === "object") {
|
|
4676
|
+
usage = message.usage;
|
|
4677
|
+
}
|
|
4678
|
+
} else if (type === "message_delta" && event.usage && typeof event.usage === "object") {
|
|
4679
|
+
usage = { ...usage, ...event.usage };
|
|
4711
4680
|
}
|
|
4712
|
-
|
|
4713
|
-
|
|
4714
|
-
id: typeof record.id === "string" ? record.id : undefined,
|
|
4715
|
-
name: record.name
|
|
4716
|
-
});
|
|
4717
|
-
}
|
|
4718
|
-
return toolCalls;
|
|
4681
|
+
});
|
|
4682
|
+
return { assistantText, toolCalls: finalizeToolCalls(calls), usage };
|
|
4719
4683
|
};
|
|
4720
4684
|
var createAnthropicVoiceAssistantModel = (options) => {
|
|
4721
4685
|
const fetchImpl = options.fetch ?? globalThis.fetch;
|
|
@@ -4728,7 +4692,8 @@ var createAnthropicVoiceAssistantModel = (options) => {
|
|
|
4728
4692
|
max_tokens: options.maxOutputTokens ?? 1024,
|
|
4729
4693
|
messages: input.messages.map(messageToAnthropicMessage).filter(Boolean),
|
|
4730
4694
|
model,
|
|
4731
|
-
|
|
4695
|
+
stream: true,
|
|
4696
|
+
system: [input.system, VOICE_SYSTEM_INSTRUCTIONS].filter(Boolean).join(`
|
|
4732
4697
|
|
|
4733
4698
|
`),
|
|
4734
4699
|
temperature: options.temperature,
|
|
@@ -4752,57 +4717,55 @@ var createAnthropicVoiceAssistantModel = (options) => {
|
|
|
4752
4717
|
if (!response.ok) {
|
|
4753
4718
|
throw createHTTPError("Anthropic", response);
|
|
4754
4719
|
}
|
|
4755
|
-
const
|
|
4756
|
-
if (
|
|
4757
|
-
await options.onUsage?.(
|
|
4758
|
-
}
|
|
4759
|
-
const toolCalls = extractAnthropicToolCalls(body);
|
|
4760
|
-
if (toolCalls.length) {
|
|
4761
|
-
return {
|
|
4762
|
-
assistantText: extractAnthropicText(body) || undefined,
|
|
4763
|
-
toolCalls
|
|
4764
|
-
};
|
|
4720
|
+
const { assistantText, toolCalls, usage } = await consumeAnthropicStream(response, input.onTextDelta);
|
|
4721
|
+
if (usage) {
|
|
4722
|
+
await options.onUsage?.(usage);
|
|
4765
4723
|
}
|
|
4766
|
-
return
|
|
4724
|
+
return {
|
|
4725
|
+
...assistantText ? { assistantText } : {},
|
|
4726
|
+
...toolCalls.length ? { toolCalls } : {}
|
|
4727
|
+
};
|
|
4767
4728
|
}
|
|
4768
4729
|
};
|
|
4769
4730
|
};
|
|
4770
|
-
var
|
|
4771
|
-
|
|
4772
|
-
|
|
4773
|
-
|
|
4774
|
-
|
|
4775
|
-
|
|
4776
|
-
|
|
4777
|
-
|
|
4778
|
-
|
|
4731
|
+
var handleGeminiPart = (part, collect) => {
|
|
4732
|
+
if (!part || typeof part !== "object")
|
|
4733
|
+
return "";
|
|
4734
|
+
const record = part;
|
|
4735
|
+
if (typeof record.text === "string" && record.text) {
|
|
4736
|
+
collect.onTextDelta?.(record.text);
|
|
4737
|
+
return record.text;
|
|
4738
|
+
}
|
|
4739
|
+
const { functionCall } = record;
|
|
4740
|
+
if (functionCall && typeof functionCall === "object") {
|
|
4741
|
+
const fn = functionCall;
|
|
4742
|
+
if (typeof fn.name === "string") {
|
|
4743
|
+
collect.toolCalls.push({
|
|
4744
|
+
args: fn.args && typeof fn.args === "object" ? fn.args : {},
|
|
4745
|
+
id: typeof fn.id === "string" ? fn.id : undefined,
|
|
4746
|
+
name: fn.name
|
|
4747
|
+
});
|
|
4748
|
+
}
|
|
4779
4749
|
}
|
|
4780
|
-
|
|
4781
|
-
return Array.isArray(parts) ? parts : [];
|
|
4750
|
+
return "";
|
|
4782
4751
|
};
|
|
4783
|
-
var
|
|
4784
|
-
|
|
4785
|
-
|
|
4752
|
+
var consumeGeminiStream = async (response, onTextDelta) => {
|
|
4753
|
+
let assistantText = "";
|
|
4754
|
+
let usage;
|
|
4786
4755
|
const toolCalls = [];
|
|
4787
|
-
|
|
4788
|
-
if (
|
|
4789
|
-
|
|
4756
|
+
await readServerSentEvents(response, (event) => {
|
|
4757
|
+
if (event.usageMetadata && typeof event.usageMetadata === "object") {
|
|
4758
|
+
usage = event.usageMetadata;
|
|
4790
4759
|
}
|
|
4791
|
-
const
|
|
4792
|
-
|
|
4793
|
-
|
|
4794
|
-
|
|
4795
|
-
const
|
|
4796
|
-
|
|
4797
|
-
continue;
|
|
4760
|
+
const candidates = Array.isArray(event.candidates) ? event.candidates : [];
|
|
4761
|
+
const first = candidates[0];
|
|
4762
|
+
const content = first?.content;
|
|
4763
|
+
const parts = Array.isArray(content?.parts) ? content.parts : [];
|
|
4764
|
+
for (const part of parts) {
|
|
4765
|
+
assistantText += handleGeminiPart(part, { onTextDelta, toolCalls });
|
|
4798
4766
|
}
|
|
4799
|
-
|
|
4800
|
-
|
|
4801
|
-
id: typeof record.id === "string" ? record.id : undefined,
|
|
4802
|
-
name: record.name
|
|
4803
|
-
});
|
|
4804
|
-
}
|
|
4805
|
-
return toolCalls;
|
|
4767
|
+
});
|
|
4768
|
+
return { assistantText, toolCalls, usage };
|
|
4806
4769
|
};
|
|
4807
4770
|
var createGeminiVoiceAssistantModel = (options) => {
|
|
4808
4771
|
const fetchImpl = options.fetch ?? globalThis.fetch;
|
|
@@ -4811,7 +4774,7 @@ var createGeminiVoiceAssistantModel = (options) => {
|
|
|
4811
4774
|
const maxRetries = Math.max(0, options.maxRetries ?? 2);
|
|
4812
4775
|
return {
|
|
4813
4776
|
generate: async (input) => {
|
|
4814
|
-
const endpoint = `${baseUrl.replace(/\/$/, "")}/models/${encodeURIComponent(model)}:
|
|
4777
|
+
const endpoint = `${baseUrl.replace(/\/$/, "")}/models/${encodeURIComponent(model)}:streamGenerateContent?alt=sse&key=${encodeURIComponent(options.apiKey)}`;
|
|
4815
4778
|
let response;
|
|
4816
4779
|
for (let attempt = 0;attempt <= maxRetries; attempt += 1) {
|
|
4817
4780
|
response = await fetchImpl(endpoint, {
|
|
@@ -4819,16 +4782,12 @@ var createGeminiVoiceAssistantModel = (options) => {
|
|
|
4819
4782
|
contents: input.messages.map(messageToGeminiContent).filter(Boolean),
|
|
4820
4783
|
generationConfig: {
|
|
4821
4784
|
maxOutputTokens: options.maxOutputTokens,
|
|
4822
|
-
...input.tools.length ? {} : {
|
|
4823
|
-
responseMimeType: "application/json",
|
|
4824
|
-
responseSchema: toGeminiSchema(OUTPUT_SCHEMA)
|
|
4825
|
-
},
|
|
4826
4785
|
temperature: options.temperature
|
|
4827
4786
|
},
|
|
4828
4787
|
systemInstruction: {
|
|
4829
4788
|
parts: [
|
|
4830
4789
|
{
|
|
4831
|
-
text: [input.system,
|
|
4790
|
+
text: [input.system, VOICE_SYSTEM_INSTRUCTIONS].filter(Boolean).join(`
|
|
4832
4791
|
|
|
4833
4792
|
`)
|
|
4834
4793
|
}
|
|
@@ -4864,18 +4823,14 @@ var createGeminiVoiceAssistantModel = (options) => {
|
|
|
4864
4823
|
if (!response.ok) {
|
|
4865
4824
|
throw createHTTPError("Gemini", response);
|
|
4866
4825
|
}
|
|
4867
|
-
const
|
|
4868
|
-
if (
|
|
4869
|
-
await options.onUsage?.(
|
|
4870
|
-
}
|
|
4871
|
-
const toolCalls = extractGeminiToolCalls(body);
|
|
4872
|
-
if (toolCalls.length) {
|
|
4873
|
-
return {
|
|
4874
|
-
assistantText: extractGeminiText(body) || undefined,
|
|
4875
|
-
toolCalls
|
|
4876
|
-
};
|
|
4826
|
+
const { assistantText, toolCalls, usage } = await consumeGeminiStream(response, input.onTextDelta);
|
|
4827
|
+
if (usage) {
|
|
4828
|
+
await options.onUsage?.(usage);
|
|
4877
4829
|
}
|
|
4878
|
-
return
|
|
4830
|
+
return {
|
|
4831
|
+
...assistantText ? { assistantText } : {},
|
|
4832
|
+
...toolCalls.length ? { toolCalls } : {}
|
|
4833
|
+
};
|
|
4879
4834
|
}
|
|
4880
4835
|
};
|
|
4881
4836
|
};
|
|
@@ -5523,6 +5478,19 @@ var countWords2 = (text) => text.trim().split(/\s+/).filter(Boolean).length;
|
|
|
5523
5478
|
var normalizeText2 = (text) => text.trim().replace(/\s+/g, " ");
|
|
5524
5479
|
var getAudioChunkDurationMs = (chunk) => chunk.byteLength / (DEFAULT_FORMAT.sampleRateHz * DEFAULT_FORMAT.channels * 2) * 1000;
|
|
5525
5480
|
var getBufferedAudioDurationMs = (chunks) => chunks.reduce((total, chunk) => total + getAudioChunkDurationMs(chunk), 0);
|
|
5481
|
+
var STREAM_SENTENCE_BOUNDARY = /[.!?\u2026]['")\]]*\s/;
|
|
5482
|
+
var MAX_TTS_CHUNK_CHARS = 220;
|
|
5483
|
+
var nextSpeakableBoundary = (buffer) => {
|
|
5484
|
+
const match = STREAM_SENTENCE_BOUNDARY.exec(buffer);
|
|
5485
|
+
return match ? match.index + match[0].length : -1;
|
|
5486
|
+
};
|
|
5487
|
+
var softCutBoundary = (buffer) => {
|
|
5488
|
+
if (buffer.length < MAX_TTS_CHUNK_CHARS)
|
|
5489
|
+
return -1;
|
|
5490
|
+
const window2 = buffer.slice(0, MAX_TTS_CHUNK_CHARS);
|
|
5491
|
+
const lastSpace = window2.lastIndexOf(" ");
|
|
5492
|
+
return lastSpace > 0 ? lastSpace + 1 : MAX_TTS_CHUNK_CHARS;
|
|
5493
|
+
};
|
|
5526
5494
|
var calculateMeanConfidence = (transcripts) => {
|
|
5527
5495
|
let sum = 0;
|
|
5528
5496
|
let total = 0;
|
|
@@ -6992,6 +6960,110 @@ var createVoiceSession = (options) => {
|
|
|
6992
6960
|
});
|
|
6993
6961
|
});
|
|
6994
6962
|
};
|
|
6963
|
+
const createTurnTTSStreamer = (turn, session) => {
|
|
6964
|
+
let buffer = "";
|
|
6965
|
+
let full = "";
|
|
6966
|
+
let charsSent = 0;
|
|
6967
|
+
let started = false;
|
|
6968
|
+
let streamed = false;
|
|
6969
|
+
let sendChain = Promise.resolve();
|
|
6970
|
+
let ttsSessionRequest = null;
|
|
6971
|
+
const ttsStartedAt = Date.now();
|
|
6972
|
+
const ensure = () => {
|
|
6973
|
+
if (!ttsSessionRequest) {
|
|
6974
|
+
ttsSessionRequest = ensureTTSSession().catch((error) => {
|
|
6975
|
+
logger.warn("voice assistant audio send failed", {
|
|
6976
|
+
error: toError(error).message,
|
|
6977
|
+
sessionId: options.id,
|
|
6978
|
+
turnId: turn.id
|
|
6979
|
+
});
|
|
6980
|
+
return null;
|
|
6981
|
+
});
|
|
6982
|
+
}
|
|
6983
|
+
return ttsSessionRequest;
|
|
6984
|
+
};
|
|
6985
|
+
const flush = (text) => {
|
|
6986
|
+
if (!text.trim())
|
|
6987
|
+
return;
|
|
6988
|
+
const previous = sendChain;
|
|
6989
|
+
sendChain = (async () => {
|
|
6990
|
+
await previous;
|
|
6991
|
+
if (started && activeTTSTurnId !== turn.id)
|
|
6992
|
+
return;
|
|
6993
|
+
const ttsSession2 = await ensure();
|
|
6994
|
+
if (!ttsSession2 || started && activeTTSTurnId !== turn.id)
|
|
6995
|
+
return;
|
|
6996
|
+
if (!started) {
|
|
6997
|
+
activeTTSTurnId = turn.id;
|
|
6998
|
+
await appendTurnLatencyStage({
|
|
6999
|
+
at: ttsStartedAt,
|
|
7000
|
+
session,
|
|
7001
|
+
stage: "tts_send_started",
|
|
7002
|
+
turnId: turn.id
|
|
7003
|
+
});
|
|
7004
|
+
started = true;
|
|
7005
|
+
}
|
|
7006
|
+
try {
|
|
7007
|
+
await ttsSession2.send(text);
|
|
7008
|
+
charsSent += text.length;
|
|
7009
|
+
} catch (error) {
|
|
7010
|
+
logger.warn("voice assistant audio send failed", {
|
|
7011
|
+
error: toError(error).message,
|
|
7012
|
+
sessionId: options.id,
|
|
7013
|
+
turnId: turn.id
|
|
7014
|
+
});
|
|
7015
|
+
}
|
|
7016
|
+
})();
|
|
7017
|
+
};
|
|
7018
|
+
return {
|
|
7019
|
+
finish: async () => {
|
|
7020
|
+
if (buffer.trim()) {
|
|
7021
|
+
flush(buffer);
|
|
7022
|
+
}
|
|
7023
|
+
buffer = "";
|
|
7024
|
+
await sendChain;
|
|
7025
|
+
if (started) {
|
|
7026
|
+
if (options.costAccountant) {
|
|
7027
|
+
options.costAccountant.recordTTS({ characters: charsSent });
|
|
7028
|
+
}
|
|
7029
|
+
await appendTurnLatencyStage({
|
|
7030
|
+
session,
|
|
7031
|
+
stage: "tts_send_completed",
|
|
7032
|
+
turnId: turn.id
|
|
7033
|
+
});
|
|
7034
|
+
await appendTrace({
|
|
7035
|
+
payload: {
|
|
7036
|
+
elapsedMs: Date.now() - ttsStartedAt,
|
|
7037
|
+
status: "sent",
|
|
7038
|
+
streamed: true
|
|
7039
|
+
},
|
|
7040
|
+
session,
|
|
7041
|
+
turnId: turn.id,
|
|
7042
|
+
type: "turn.assistant"
|
|
7043
|
+
});
|
|
7044
|
+
}
|
|
7045
|
+
return { fullText: full, streamed };
|
|
7046
|
+
},
|
|
7047
|
+
push: (delta) => {
|
|
7048
|
+
if (!delta)
|
|
7049
|
+
return;
|
|
7050
|
+
streamed = true;
|
|
7051
|
+
full += delta;
|
|
7052
|
+
buffer += delta;
|
|
7053
|
+
let boundary = nextSpeakableBoundary(buffer);
|
|
7054
|
+
while (boundary !== -1) {
|
|
7055
|
+
flush(buffer.slice(0, boundary));
|
|
7056
|
+
buffer = buffer.slice(boundary);
|
|
7057
|
+
boundary = nextSpeakableBoundary(buffer);
|
|
7058
|
+
}
|
|
7059
|
+
const cut = softCutBoundary(buffer);
|
|
7060
|
+
if (cut !== -1) {
|
|
7061
|
+
flush(buffer.slice(0, cut));
|
|
7062
|
+
buffer = buffer.slice(cut);
|
|
7063
|
+
}
|
|
7064
|
+
}
|
|
7065
|
+
};
|
|
7066
|
+
};
|
|
6995
7067
|
const completeTurn = async (session, turn) => {
|
|
6996
7068
|
const liveOpsControl = await options.liveOps?.getControl(options.id);
|
|
6997
7069
|
if (liveOpsControl?.assistantPaused || liveOpsControl?.operatorTakeover) {
|
|
@@ -7012,6 +7084,7 @@ var createVoiceSession = (options) => {
|
|
|
7012
7084
|
return;
|
|
7013
7085
|
}
|
|
7014
7086
|
const injectedInstruction = liveOpsControl?.injectedInstruction?.trim();
|
|
7087
|
+
const ttsStreamer = options.tts ? createTurnTTSStreamer(turn, session) : undefined;
|
|
7015
7088
|
const committedOutput = await options.route.onTurn({
|
|
7016
7089
|
api,
|
|
7017
7090
|
context: options.context,
|
|
@@ -7019,6 +7092,7 @@ var createVoiceSession = (options) => {
|
|
|
7019
7092
|
control: liveOpsControl,
|
|
7020
7093
|
injectedInstruction
|
|
7021
7094
|
} : undefined,
|
|
7095
|
+
onTextDelta: ttsStreamer?.push,
|
|
7022
7096
|
session,
|
|
7023
7097
|
turn
|
|
7024
7098
|
});
|
|
@@ -7038,7 +7112,28 @@ var createVoiceSession = (options) => {
|
|
|
7038
7112
|
setTurnResult(currentSession, turn.id, { citations: turnCitations });
|
|
7039
7113
|
});
|
|
7040
7114
|
}
|
|
7041
|
-
|
|
7115
|
+
const streamResult = ttsStreamer ? await ttsStreamer.finish() : undefined;
|
|
7116
|
+
if (streamResult?.streamed) {
|
|
7117
|
+
output.assistantText = streamResult.fullText || output.assistantText;
|
|
7118
|
+
if (output.assistantText) {
|
|
7119
|
+
const finalText = output.assistantText;
|
|
7120
|
+
await writeSession((currentSession) => {
|
|
7121
|
+
setTurnResult(currentSession, turn.id, { assistantText: finalText });
|
|
7122
|
+
});
|
|
7123
|
+
await send({ text: finalText, turnId: turn.id, type: "assistant" });
|
|
7124
|
+
await appendTrace({
|
|
7125
|
+
payload: {
|
|
7126
|
+
assistantMode: resolveVoiceAssistantMode(options),
|
|
7127
|
+
realtimeConfigured: Boolean(options.realtime),
|
|
7128
|
+
text: finalText,
|
|
7129
|
+
ttsConfigured: Boolean(options.tts)
|
|
7130
|
+
},
|
|
7131
|
+
session,
|
|
7132
|
+
turnId: turn.id,
|
|
7133
|
+
type: "turn.assistant"
|
|
7134
|
+
});
|
|
7135
|
+
}
|
|
7136
|
+
} else if (output?.assistantText) {
|
|
7042
7137
|
const assistantTextStartedAt = Date.now();
|
|
7043
7138
|
await writeSession((currentSession) => {
|
|
7044
7139
|
setTurnResult(currentSession, turn.id, {
|
|
@@ -7443,7 +7538,7 @@ var createVoiceSession = (options) => {
|
|
|
7443
7538
|
kickCallSilenceWatchdog();
|
|
7444
7539
|
startAmdEvaluationTimer();
|
|
7445
7540
|
if (shouldFireOnSession && options.greeting && session.turns.length === 0) {
|
|
7446
|
-
const greetingText = typeof options.greeting === "function" ? await options.greeting() : options.greeting;
|
|
7541
|
+
const greetingText = typeof options.greeting === "function" ? await options.greeting({ session }) : options.greeting;
|
|
7447
7542
|
const greetingTurnId = createId();
|
|
7448
7543
|
await send({
|
|
7449
7544
|
text: greetingText,
|
|
@@ -12743,8 +12838,23 @@ var createTwilioSocketAdapter = (socket, getState) => ({
|
|
|
12743
12838
|
if (!state.streamSid) {
|
|
12744
12839
|
return;
|
|
12745
12840
|
}
|
|
12841
|
+
const clearMessage = { event: "clear", streamSid: state.streamSid };
|
|
12746
12842
|
state.reviewRecorder?.recordTwilioOutbound({ event: "clear" });
|
|
12747
|
-
await
|
|
12843
|
+
await state.trace?.append({
|
|
12844
|
+
at: Date.now(),
|
|
12845
|
+
payload: {
|
|
12846
|
+
callSid: state.callSid ?? undefined,
|
|
12847
|
+
carrier: state.carrier,
|
|
12848
|
+
direction: "outbound",
|
|
12849
|
+
envelope: clearMessage,
|
|
12850
|
+
event: "clear",
|
|
12851
|
+
streamId: state.streamSid
|
|
12852
|
+
},
|
|
12853
|
+
scenarioId: state.scenarioId ?? undefined,
|
|
12854
|
+
sessionId: state.sessionId ?? state.streamSid,
|
|
12855
|
+
type: "client.telephony_media"
|
|
12856
|
+
});
|
|
12857
|
+
await Promise.resolve(socket.send(JSON.stringify(clearMessage)));
|
|
12748
12858
|
},
|
|
12749
12859
|
close: async (code, reason) => {
|
|
12750
12860
|
await Promise.resolve(socket.close(code, reason));
|
|
@@ -13266,7 +13376,7 @@ var createFakeSTTAdapter = (inputSpy, sttDelayMs) => ({
|
|
|
13266
13376
|
final: new Set,
|
|
13267
13377
|
partial: new Set
|
|
13268
13378
|
};
|
|
13269
|
-
let
|
|
13379
|
+
let sendCount = 0;
|
|
13270
13380
|
return {
|
|
13271
13381
|
close: async () => {
|
|
13272
13382
|
for (const handler of listeners.close) {
|
|
@@ -13281,31 +13391,44 @@ var createFakeSTTAdapter = (inputSpy, sttDelayMs) => ({
|
|
|
13281
13391
|
},
|
|
13282
13392
|
send: async (audio) => {
|
|
13283
13393
|
inputSpy.push(toUint8Array2(audio));
|
|
13284
|
-
|
|
13394
|
+
sendCount += 1;
|
|
13395
|
+
if (sendCount === 1) {
|
|
13396
|
+
if (sttDelayMs > 0) {
|
|
13397
|
+
await Bun.sleep(sttDelayMs);
|
|
13398
|
+
}
|
|
13399
|
+
const receivedAt = Date.now();
|
|
13400
|
+
for (const handler of listeners.final) {
|
|
13401
|
+
handler({
|
|
13402
|
+
receivedAt,
|
|
13403
|
+
transcript: {
|
|
13404
|
+
id: "telephony-benchmark-final",
|
|
13405
|
+
isFinal: true,
|
|
13406
|
+
text: "hello from twilio"
|
|
13407
|
+
},
|
|
13408
|
+
type: "final"
|
|
13409
|
+
});
|
|
13410
|
+
}
|
|
13411
|
+
for (const handler of listeners.endOfTurn) {
|
|
13412
|
+
handler({
|
|
13413
|
+
reason: "vendor",
|
|
13414
|
+
receivedAt,
|
|
13415
|
+
type: "endOfTurn"
|
|
13416
|
+
});
|
|
13417
|
+
}
|
|
13285
13418
|
return;
|
|
13286
13419
|
}
|
|
13287
|
-
|
|
13288
|
-
|
|
13289
|
-
|
|
13290
|
-
|
|
13291
|
-
|
|
13292
|
-
|
|
13293
|
-
|
|
13294
|
-
|
|
13295
|
-
|
|
13296
|
-
|
|
13297
|
-
|
|
13298
|
-
|
|
13299
|
-
},
|
|
13300
|
-
type: "final"
|
|
13301
|
-
});
|
|
13302
|
-
}
|
|
13303
|
-
for (const handler of listeners.endOfTurn) {
|
|
13304
|
-
handler({
|
|
13305
|
-
reason: "vendor",
|
|
13306
|
-
receivedAt,
|
|
13307
|
-
type: "endOfTurn"
|
|
13308
|
-
});
|
|
13420
|
+
if (sendCount === 2) {
|
|
13421
|
+
for (const handler of listeners.partial) {
|
|
13422
|
+
handler({
|
|
13423
|
+
receivedAt: Date.now(),
|
|
13424
|
+
transcript: {
|
|
13425
|
+
id: "telephony-benchmark-partial",
|
|
13426
|
+
isFinal: false,
|
|
13427
|
+
text: "actually wait"
|
|
13428
|
+
},
|
|
13429
|
+
type: "partial"
|
|
13430
|
+
});
|
|
13431
|
+
}
|
|
13309
13432
|
}
|
|
13310
13433
|
}
|
|
13311
13434
|
};
|