@absolutejs/voice 0.0.22-beta.545 → 0.0.22-beta.547

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4036,89 +4036,6 @@ var createVoiceProviderOrchestrationProfile = (options) => {
4036
4036
  }
4037
4037
  };
4038
4038
  };
4039
- var OUTPUT_SCHEMA = {
4040
- additionalProperties: false,
4041
- properties: {
4042
- assistantText: {
4043
- type: "string"
4044
- },
4045
- complete: {
4046
- type: "boolean"
4047
- },
4048
- escalate: {
4049
- additionalProperties: false,
4050
- properties: {
4051
- metadata: {
4052
- additionalProperties: true,
4053
- type: "object"
4054
- },
4055
- reason: {
4056
- type: "string"
4057
- }
4058
- },
4059
- required: ["reason"],
4060
- type: "object"
4061
- },
4062
- noAnswer: {
4063
- additionalProperties: false,
4064
- properties: {
4065
- metadata: {
4066
- additionalProperties: true,
4067
- type: "object"
4068
- }
4069
- },
4070
- type: "object"
4071
- },
4072
- result: {
4073
- additionalProperties: true,
4074
- type: "object"
4075
- },
4076
- transfer: {
4077
- additionalProperties: false,
4078
- properties: {
4079
- metadata: {
4080
- additionalProperties: true,
4081
- type: "object"
4082
- },
4083
- reason: {
4084
- type: "string"
4085
- },
4086
- target: {
4087
- type: "string"
4088
- }
4089
- },
4090
- required: ["target"],
4091
- type: "object"
4092
- },
4093
- voicemail: {
4094
- additionalProperties: false,
4095
- properties: {
4096
- metadata: {
4097
- additionalProperties: true,
4098
- type: "object"
4099
- }
4100
- },
4101
- type: "object"
4102
- }
4103
- },
4104
- type: "object"
4105
- };
4106
- var ROUTE_RESULT_INSTRUCTION = "Return only a JSON object with assistantText, complete, transfer, escalate, voicemail, noAnswer, and result when you are not calling tools. Only set transfer, escalate, voicemail, or noAnswer when the user explicitly asks for that lifecycle outcome or a tool result says that exact outcome. Do not infer voicemail from generic words like voice, voice app, or voice integration.";
4107
- var stripJSONCodeFence = (value) => {
4108
- const trimmed = value.trim();
4109
- const match = trimmed.match(/^```(?:json)?\s*([\s\S]*?)\s*```$/i);
4110
- return match?.[1]?.trim() ?? value;
4111
- };
4112
- var parseJSON = (value) => {
4113
- try {
4114
- const parsed = JSON.parse(stripJSONCodeFence(value));
4115
- return parsed && typeof parsed === "object" ? parsed : {};
4116
- } catch {
4117
- return {
4118
- assistantText: value
4119
- };
4120
- }
4121
- };
4122
4039
  var parseJSONValue = (value) => {
4123
4040
  try {
4124
4041
  return JSON.parse(value);
@@ -4589,48 +4506,95 @@ var messageToGeminiContent = (message) => {
4589
4506
  role: message.role === "assistant" ? "model" : "user"
4590
4507
  };
4591
4508
  };
4592
- var extractText = (response) => {
4593
- if (typeof response.output_text === "string") {
4594
- return response.output_text;
// VOICE_SYSTEM_INSTRUCTIONS: fixed prompt text that the OpenAI, Anthropic and Gemini request builders in this diff
// join after input.system (falsy parts are filtered out first). It replaces the removed ROUTE_RESULT_INSTRUCTION,
// which asked for a JSON object; this text asks for spoken sentences and tool calls instead.
4509
+ var VOICE_SYSTEM_INSTRUCTIONS = "You are on a live phone call. Reply with natural, concise spoken sentences \u2014 no markdown, lists, headings, or emoji. To take an action (transfer the call, escalate, record voicemail/no-answer, or end the call), CALL the matching tool rather than describing it in words. Call the complete tool once the conversation's goal is met.";
// parseToolArgs(raw): turns the accumulated JSON argument text of one tool call into a value usable as args.
// Returns an empty object when raw is blank after trimming, when JSON.parse throws, or when the parsed value
// is falsy or not of type object. Otherwise returns the parsed value unchanged.
// NOTE(review): raw.trim() assumes raw is a string; the visible caller (finalizeToolCalls) passes call.args.
// NOTE(review): a JSON array also passes the typeof check and is returned as-is - confirm callers accept that.
// Lines prefixed with - in this span are the body of the removed extractText helper, kept as shown by the diff.
4510
+ var parseToolArgs = (raw) => {
4511
+ if (!raw.trim()) {
4512
+ return {};
4595
4513
  }
4596
- const output = Array.isArray(response.output) ? response.output : [];
4597
- for (const item of output) {
4598
- if (!item || typeof item !== "object") {
4599
- continue;
4600
- }
4601
- const record = item;
4602
- const content = Array.isArray(record.content) ? record.content : [];
4603
- for (const contentItem of content) {
4604
- if (!contentItem || typeof contentItem !== "object") {
4605
- continue;
4606
- }
4607
- const contentRecord = contentItem;
4608
- if (typeof contentRecord.text === "string") {
4609
- return contentRecord.text;
4610
- }
4611
- }
4514
+ try {
4515
+ const parsed = JSON.parse(raw);
4516
+ return parsed && typeof parsed === "object" ? parsed : {};
4517
+ } catch {
4518
+ return {};
4612
4519
  }
4613
- return "";
4614
4520
  };
4615
- var extractToolCalls = (response) => {
4616
- const output = Array.isArray(response.output) ? response.output : [];
4617
- const toolCalls = [];
4618
- for (const item of output) {
4619
- if (!item || typeof item !== "object") {
4620
- continue;
// readServerSentEvents(response, onEvent): reads response.body through a stream reader, decodes it as text and
// calls onEvent with the JSON-parsed payload of every line that starts with data: (after leading whitespace).
// Throws Error when response.body yields no reader.
// The buffer is cut at each blank line (the separator is two characters long, see slice(separator + 2));
// whatever non-blank text is left when the reader reports done is drained one final time.
// Empty payloads and the [DONE] sentinel are skipped.
// NOTE(review): the empty catch wraps both JSON.parse and the onEvent call, so an exception thrown by the
// callback is silently dropped together with malformed JSON.
// NOTE(review): only the newline-newline separator is searched; a stream that delimits events with CR LF pairs
// would presumably stay buffered until the end of the body - confirm against the providers.
// NOTE(review): decoder.decode is only ever called with stream: true, so bytes of an incomplete multi-byte
// sequence at the very end of the body are never flushed.
// Lines prefixed with - in this span are removed extractToolCalls code, kept as shown by the diff.
4521
+ var readServerSentEvents = async (response, onEvent) => {
4522
+ const reader = response.body?.getReader();
4523
+ if (!reader) {
4524
+ throw new Error("streaming response has no body");
4525
+ }
4526
+ const decoder = new TextDecoder;
4527
+ let buffer = "";
4528
+ const drain = (block) => {
4529
+ for (const line of block.split(`
4530
+ `)) {
4531
+ const trimmed = line.trimStart();
4532
+ if (!trimmed.startsWith("data:"))
4533
+ continue;
4534
+ const data = trimmed.slice("data:".length).trim();
4535
+ if (!data || data === "[DONE]")
4536
+ continue;
4537
+ try {
4538
+ onEvent(JSON.parse(data));
4539
+ } catch {}
4621
4540
  }
4622
- const record = item;
4623
- if (record.type !== "function_call" || typeof record.name !== "string") {
4624
- continue;
4541
+ };
4542
+ for (;; ) {
4543
+ const { done, value } = await reader.read();
4544
+ if (done)
4545
+ break;
4546
+ buffer += decoder.decode(value, { stream: true });
4547
+ let separator = buffer.indexOf(`
4548
+
4549
+ `);
4550
+ while (separator !== -1) {
4551
+ drain(buffer.slice(0, separator));
4552
+ buffer = buffer.slice(separator + 2);
4553
+ separator = buffer.indexOf(`
4554
+
4555
+ `);
4625
4556
  }
4626
- const args = typeof record.arguments === "string" ? parseJSON(record.arguments) : {};
4627
- toolCalls.push({
4628
- args,
4629
- id: typeof record.call_id === "string" ? record.call_id : typeof record.id === "string" ? record.id : undefined,
4630
- name: record.name
4631
- });
4632
4557
  }
4633
- return toolCalls;
4558
+ if (buffer.trim())
4559
+ drain(buffer);
4560
+ };
// finalizeToolCalls(calls): converts the Map of in-progress tool calls into an array of { args, id, name }.
// Entries whose name is falsy are dropped; the accumulated args text is parsed with parseToolArgs.
// The result follows the iteration order of calls.values().
4561
+ var finalizeToolCalls = (calls) => [...calls.values()].filter((call) => call.name).map((call) => ({
4562
+ args: parseToolArgs(call.args),
4563
+ id: call.id,
4564
+ name: call.name
4565
+ }));
// consumeOpenAIResponsesStream(response, onTextDelta): folds the streamed events of one response into
// { assistantText, toolCalls, usage }. Handled event types:
//   * response.output_text.delta - string delta is appended to assistantText and passed to onTextDelta if given.
//   * response.output_item.added with a function_call item - registers a call keyed by item.id, falling back
//     to item.call_id; the reported id prefers call_id when it is a string.
//   * response.function_call_arguments.delta - string delta is appended to the call keyed by event.item_id.
//   * response.output_item.done with a function_call item - non-empty string arguments replace the accumulated text.
//   * response.completed - response.usage is captured when it is an object.
// Deltas or done events for an unknown key are ignored.
// NOTE(review): when an item carries neither id nor call_id the key is the empty string, so several such
// calls would overwrite each other - confirm the API always sends one of them.
// NOTE(review): event is dereferenced without a null check; a payload of JSON null would throw inside the
// callback, and readServerSentEvents discards callback exceptions.
4566
+ var consumeOpenAIResponsesStream = async (response, onTextDelta) => {
4567
+ let assistantText = "";
4568
+ let usage;
4569
+ const calls = new Map;
4570
+ await readServerSentEvents(response, (event) => {
4571
+ const type = typeof event.type === "string" ? event.type : "";
4572
+ const item = event.item;
4573
+ if (type === "response.output_text.delta" && typeof event.delta === "string") {
4574
+ assistantText += event.delta;
4575
+ onTextDelta?.(event.delta);
4576
+ } else if (type === "response.output_item.added" && item?.type === "function_call") {
4577
+ calls.set(String(item.id ?? item.call_id ?? ""), {
4578
+ args: typeof item.arguments === "string" ? item.arguments : "",
4579
+ id: typeof item.call_id === "string" ? item.call_id : item.id,
4580
+ name: typeof item.name === "string" ? item.name : ""
4581
+ });
4582
+ } else if (type === "response.function_call_arguments.delta" && typeof event.delta === "string") {
4583
+ const entry = calls.get(String(event.item_id ?? ""));
4584
+ if (entry)
4585
+ entry.args += event.delta;
4586
+ } else if (type === "response.output_item.done" && item?.type === "function_call" && typeof item.arguments === "string" && item.arguments) {
4587
+ const entry = calls.get(String(item.id ?? item.call_id ?? ""));
4588
+ if (entry)
4589
+ entry.args = item.arguments;
4590
+ } else if (type === "response.completed") {
4591
+ const completed = event.response;
4592
+ if (completed?.usage && typeof completed.usage === "object") {
4593
+ usage = completed.usage;
4594
+ }
4595
+ }
4596
+ });
4597
+ return { assistantText, toolCalls: finalizeToolCalls(calls), usage };
4634
4598
  };
4635
4599
  var createOpenAIVoiceAssistantModel = (options) => {
4636
4600
  const fetchImpl = options.fetch ?? globalThis.fetch;
@@ -4641,23 +4605,13 @@ var createOpenAIVoiceAssistantModel = (options) => {
4641
4605
  const response = await fetchImpl(`${baseUrl.replace(/\/$/, "")}/responses`, {
4642
4606
  body: JSON.stringify({
4643
4607
  input: messagesToOpenAIInput(input.messages),
4644
- instructions: [
4645
- input.system,
4646
- "Return a JSON object with assistantText, complete, transfer, escalate, voicemail, noAnswer, and result when you are not calling tools."
4647
- ].filter(Boolean).join(`
4608
+ instructions: [input.system, VOICE_SYSTEM_INSTRUCTIONS].filter(Boolean).join(`
4648
4609
 
4649
4610
  `),
4650
4611
  max_output_tokens: options.maxOutputTokens,
4651
4612
  model,
4613
+ stream: true,
4652
4614
  temperature: options.temperature,
4653
- text: {
4654
- format: {
4655
- name: "voice_route_result",
4656
- schema: OUTPUT_SCHEMA,
4657
- strict: false,
4658
- type: "json_schema"
4659
- }
4660
- },
4661
4615
  tool_choice: input.tools.length ? "auto" : "none",
4662
4616
  tools: input.tools.map((tool) => ({
4663
4617
  description: tool.description,
@@ -4671,6 +4625,7 @@ var createOpenAIVoiceAssistantModel = (options) => {
4671
4625
  }))
4672
4626
  }),
4673
4627
  headers: {
4628
+ accept: "text/event-stream",
4674
4629
  authorization: `Bearer ${options.apiKey}`,
4675
4630
  "content-type": "application/json"
4676
4631
  },
@@ -4679,43 +4634,52 @@ var createOpenAIVoiceAssistantModel = (options) => {
4679
4634
  if (!response.ok) {
4680
4635
  throw createHTTPError("OpenAI", response);
4681
4636
  }
4682
- const body = await response.json();
4683
- if (body.usage && typeof body.usage === "object") {
4684
- await options.onUsage?.(body.usage);
4685
- }
4686
- const toolCalls = extractToolCalls(body);
4687
- if (toolCalls.length) {
4688
- return {
4689
- toolCalls
4690
- };
4637
+ const { assistantText, toolCalls, usage } = await consumeOpenAIResponsesStream(response, input.onTextDelta);
4638
+ if (usage) {
4639
+ await options.onUsage?.(usage);
4691
4640
  }
4692
- return normalizeRouteOutput(parseJSON(extractText(body)));
4641
+ return {
4642
+ ...assistantText ? { assistantText } : {},
4643
+ ...toolCalls.length ? { toolCalls } : {}
4644
+ };
4693
4645
  }
4694
4646
  };
4695
4647
  };
4696
- var extractAnthropicText = (response) => {
4697
- const content = Array.isArray(response.content) ? response.content : [];
4698
- return content.map((item) => item && typeof item === "object" && item.type === "text" && typeof item.text === "string" ? item.text : "").filter(Boolean).join(`
4699
- `);
4700
- };
4701
- var extractAnthropicToolCalls = (response) => {
4702
- const content = Array.isArray(response.content) ? response.content : [];
4703
- const toolCalls = [];
4704
- for (const item of content) {
4705
- if (!item || typeof item !== "object") {
4706
- continue;
4707
- }
4708
- const record = item;
4709
- if (record.type !== "tool_use" || typeof record.name !== "string") {
4710
- continue;
// consumeAnthropicStream(response, onTextDelta): folds the streamed events of one message into
// { assistantText, toolCalls, usage }. Handled event types:
//   * content_block_delta with a text_delta - string text is appended to assistantText and passed to onTextDelta if given.
//   * content_block_delta with an input_json_delta - partial_json is appended to the call keyed by event.index.
//   * content_block_start with a tool_use block - registers a call keyed by event.index with empty args text.
//   * message_start - message.usage is captured when it is an object.
//   * message_delta - event.usage is shallow-merged over the usage captured so far.
// JSON deltas for an index with no registered call are ignored.
// NOTE(review): event is dereferenced without a null check; see the try/catch in readServerSentEvents.
// Lines prefixed with - in this span are removed extractAnthropicToolCalls code, kept as shown by the diff.
4648
+ var consumeAnthropicStream = async (response, onTextDelta) => {
4649
+ let assistantText = "";
4650
+ let usage;
4651
+ const calls = new Map;
4652
+ await readServerSentEvents(response, (event) => {
4653
+ const type = typeof event.type === "string" ? event.type : "";
4654
+ const delta = event.delta;
4655
+ if (type === "content_block_delta" && delta?.type === "text_delta") {
4656
+ if (typeof delta.text === "string") {
4657
+ assistantText += delta.text;
4658
+ onTextDelta?.(delta.text);
4659
+ }
4660
+ } else if (type === "content_block_delta" && delta?.type === "input_json_delta" && typeof delta.partial_json === "string") {
4661
+ const entry = calls.get(String(event.index ?? ""));
4662
+ if (entry)
4663
+ entry.args += delta.partial_json;
4664
+ } else if (type === "content_block_start") {
4665
+ const block = event.content_block;
4666
+ if (block?.type === "tool_use") {
4667
+ calls.set(String(event.index ?? ""), {
4668
+ args: "",
4669
+ id: typeof block.id === "string" ? block.id : undefined,
4670
+ name: typeof block.name === "string" ? block.name : ""
4671
+ });
4672
+ }
4673
+ } else if (type === "message_start") {
4674
+ const message = event.message;
4675
+ if (message?.usage && typeof message.usage === "object") {
4676
+ usage = message.usage;
4677
+ }
4678
+ } else if (type === "message_delta" && event.usage && typeof event.usage === "object") {
4679
+ usage = { ...usage, ...event.usage };
4711
4680
  }
4712
- toolCalls.push({
4713
- args: record.input && typeof record.input === "object" ? record.input : {},
4714
- id: typeof record.id === "string" ? record.id : undefined,
4715
- name: record.name
4716
- });
4717
- }
4718
- return toolCalls;
4681
+ });
4682
+ return { assistantText, toolCalls: finalizeToolCalls(calls), usage };
4719
4683
  };
4720
4684
  var createAnthropicVoiceAssistantModel = (options) => {
4721
4685
  const fetchImpl = options.fetch ?? globalThis.fetch;
@@ -4728,7 +4692,8 @@ var createAnthropicVoiceAssistantModel = (options) => {
4728
4692
  max_tokens: options.maxOutputTokens ?? 1024,
4729
4693
  messages: input.messages.map(messageToAnthropicMessage).filter(Boolean),
4730
4694
  model,
4731
- system: [input.system, ROUTE_RESULT_INSTRUCTION].filter(Boolean).join(`
4695
+ stream: true,
4696
+ system: [input.system, VOICE_SYSTEM_INSTRUCTIONS].filter(Boolean).join(`
4732
4697
 
4733
4698
  `),
4734
4699
  temperature: options.temperature,
@@ -4752,57 +4717,55 @@ var createAnthropicVoiceAssistantModel = (options) => {
4752
4717
  if (!response.ok) {
4753
4718
  throw createHTTPError("Anthropic", response);
4754
4719
  }
4755
- const body = await response.json();
4756
- if (body.usage && typeof body.usage === "object") {
4757
- await options.onUsage?.(body.usage);
4758
- }
4759
- const toolCalls = extractAnthropicToolCalls(body);
4760
- if (toolCalls.length) {
4761
- return {
4762
- assistantText: extractAnthropicText(body) || undefined,
4763
- toolCalls
4764
- };
4720
+ const { assistantText, toolCalls, usage } = await consumeAnthropicStream(response, input.onTextDelta);
4721
+ if (usage) {
4722
+ await options.onUsage?.(usage);
4765
4723
  }
4766
- return normalizeRouteOutput(parseJSON(extractAnthropicText(body)));
4724
+ return {
4725
+ ...assistantText ? { assistantText } : {},
4726
+ ...toolCalls.length ? { toolCalls } : {}
4727
+ };
4767
4728
  }
4768
4729
  };
4769
4730
  };
4770
- var extractGeminiCandidateParts = (response) => {
4771
- const candidates = Array.isArray(response.candidates) ? response.candidates : [];
4772
- const first = candidates[0];
4773
- if (!first || typeof first !== "object") {
4774
- return [];
4775
- }
4776
- const { content } = first;
4777
- if (!content || typeof content !== "object") {
4778
- return [];
// handleGeminiPart(part, collect): processes one entry of a candidate parts array.
// collect is { onTextDelta, toolCalls } as built by consumeGeminiStream.
// Returns the part text when text is a non-empty string (after passing it to collect.onTextDelta if given);
// returns an empty string for every other part, including falsy or non-object parts.
// A functionCall object with a string name is pushed onto collect.toolCalls; args that are not an object
// become an empty object and a non-string id becomes undefined.
// A part with non-empty text returns early, so a functionCall on that same part is not examined.
// Lines prefixed with - in this span are removed extractGeminiCandidateParts code, kept as shown by the diff.
4731
+ var handleGeminiPart = (part, collect) => {
4732
+ if (!part || typeof part !== "object")
4733
+ return "";
4734
+ const record = part;
4735
+ if (typeof record.text === "string" && record.text) {
4736
+ collect.onTextDelta?.(record.text);
4737
+ return record.text;
4738
+ }
4739
+ const { functionCall } = record;
4740
+ if (functionCall && typeof functionCall === "object") {
4741
+ const fn = functionCall;
4742
+ if (typeof fn.name === "string") {
4743
+ collect.toolCalls.push({
4744
+ args: fn.args && typeof fn.args === "object" ? fn.args : {},
4745
+ id: typeof fn.id === "string" ? fn.id : undefined,
4746
+ name: fn.name
4747
+ });
4748
+ }
4779
4749
  }
4780
- const { parts } = content;
4781
- return Array.isArray(parts) ? parts : [];
4750
+ return "";
4782
4751
  };
4783
- var extractGeminiText = (response) => extractGeminiCandidateParts(response).map((part) => part && typeof part === "object" && typeof part.text === "string" ? part.text : "").filter(Boolean).join(`
4784
- `);
4785
- var extractGeminiToolCalls = (response) => {
// consumeGeminiStream(response, onTextDelta): folds the streamed events of one generation into
// { assistantText, toolCalls, usage }.
// For every event: usageMetadata, when it is an object, overwrites usage; then each part of the first
// candidate is passed to handleGeminiPart and its returned text is appended to assistantText.
// Candidates after the first are not read. Tool calls are collected directly in the toolCalls array.
// NOTE(review): event is dereferenced without a null check; see the try/catch in readServerSentEvents.
// Lines prefixed with - in this span are removed extractGeminiToolCalls code, kept as shown by the diff.
4752
+ var consumeGeminiStream = async (response, onTextDelta) => {
4753
+ let assistantText = "";
4754
+ let usage;
4786
4755
  const toolCalls = [];
4787
- for (const part of extractGeminiCandidateParts(response)) {
4788
- if (!part || typeof part !== "object") {
4789
- continue;
4756
+ await readServerSentEvents(response, (event) => {
4757
+ if (event.usageMetadata && typeof event.usageMetadata === "object") {
4758
+ usage = event.usageMetadata;
4790
4759
  }
4791
- const { functionCall } = part;
4792
- if (!functionCall || typeof functionCall !== "object") {
4793
- continue;
4794
- }
4795
- const record = functionCall;
4796
- if (typeof record.name !== "string") {
4797
- continue;
4760
+ const candidates = Array.isArray(event.candidates) ? event.candidates : [];
4761
+ const first = candidates[0];
4762
+ const content = first?.content;
4763
+ const parts = Array.isArray(content?.parts) ? content.parts : [];
4764
+ for (const part of parts) {
4765
+ assistantText += handleGeminiPart(part, { onTextDelta, toolCalls });
4798
4766
  }
4799
- toolCalls.push({
4800
- args: record.args && typeof record.args === "object" ? record.args : {},
4801
- id: typeof record.id === "string" ? record.id : undefined,
4802
- name: record.name
4803
- });
4804
- }
4805
- return toolCalls;
4767
+ });
4768
+ return { assistantText, toolCalls, usage };
4806
4769
  };
4807
4770
  var createGeminiVoiceAssistantModel = (options) => {
4808
4771
  const fetchImpl = options.fetch ?? globalThis.fetch;
@@ -4811,7 +4774,7 @@ var createGeminiVoiceAssistantModel = (options) => {
4811
4774
  const maxRetries = Math.max(0, options.maxRetries ?? 2);
4812
4775
  return {
4813
4776
  generate: async (input) => {
4814
- const endpoint = `${baseUrl.replace(/\/$/, "")}/models/${encodeURIComponent(model)}:generateContent?key=${encodeURIComponent(options.apiKey)}`;
4777
+ const endpoint = `${baseUrl.replace(/\/$/, "")}/models/${encodeURIComponent(model)}:streamGenerateContent?alt=sse&key=${encodeURIComponent(options.apiKey)}`;
4815
4778
  let response;
4816
4779
  for (let attempt = 0;attempt <= maxRetries; attempt += 1) {
4817
4780
  response = await fetchImpl(endpoint, {
@@ -4819,16 +4782,12 @@ var createGeminiVoiceAssistantModel = (options) => {
4819
4782
  contents: input.messages.map(messageToGeminiContent).filter(Boolean),
4820
4783
  generationConfig: {
4821
4784
  maxOutputTokens: options.maxOutputTokens,
4822
- ...input.tools.length ? {} : {
4823
- responseMimeType: "application/json",
4824
- responseSchema: toGeminiSchema(OUTPUT_SCHEMA)
4825
- },
4826
4785
  temperature: options.temperature
4827
4786
  },
4828
4787
  systemInstruction: {
4829
4788
  parts: [
4830
4789
  {
4831
- text: [input.system, ROUTE_RESULT_INSTRUCTION].filter(Boolean).join(`
4790
+ text: [input.system, VOICE_SYSTEM_INSTRUCTIONS].filter(Boolean).join(`
4832
4791
 
4833
4792
  `)
4834
4793
  }
@@ -4864,18 +4823,14 @@ var createGeminiVoiceAssistantModel = (options) => {
4864
4823
  if (!response.ok) {
4865
4824
  throw createHTTPError("Gemini", response);
4866
4825
  }
4867
- const body = await response.json();
4868
- if (body.usageMetadata && typeof body.usageMetadata === "object") {
4869
- await options.onUsage?.(body.usageMetadata);
4870
- }
4871
- const toolCalls = extractGeminiToolCalls(body);
4872
- if (toolCalls.length) {
4873
- return {
4874
- assistantText: extractGeminiText(body) || undefined,
4875
- toolCalls
4876
- };
4826
+ const { assistantText, toolCalls, usage } = await consumeGeminiStream(response, input.onTextDelta);
4827
+ if (usage) {
4828
+ await options.onUsage?.(usage);
4877
4829
  }
4878
- return normalizeRouteOutput(parseJSON(extractGeminiText(body)));
4830
+ return {
4831
+ ...assistantText ? { assistantText } : {},
4832
+ ...toolCalls.length ? { toolCalls } : {}
4833
+ };
4879
4834
  }
4880
4835
  };
4881
4836
  };
@@ -5523,6 +5478,19 @@ var countWords2 = (text) => text.trim().split(/\s+/).filter(Boolean).length;
5523
5478
  var normalizeText2 = (text) => text.trim().replace(/\s+/g, " ");
5524
5479
  var getAudioChunkDurationMs = (chunk) => chunk.byteLength / (DEFAULT_FORMAT.sampleRateHz * DEFAULT_FORMAT.channels * 2) * 1000;
5525
5480
  var getBufferedAudioDurationMs = (chunks) => chunks.reduce((total, chunk) => total + getAudioChunkDurationMs(chunk), 0);
// STREAM_SENTENCE_BOUNDARY: matches a sentence terminator (full stop, exclamation mark, question mark or
// U+2026 ellipsis), then any number of closing quote, parenthesis or square-bracket characters, then one
// whitespace character. The pattern has no global flag, so exec always searches from the start.
// MAX_TTS_CHUNK_CHARS: buffer length from which softCutBoundary cuts text that has no sentence boundary.
5481
+ var STREAM_SENTENCE_BOUNDARY = /[.!?\u2026]['")\]]*\s/;
5482
+ var MAX_TTS_CHUNK_CHARS = 220;
// nextSpeakableBoundary(buffer): returns the index just past the first STREAM_SENTENCE_BOUNDARY match in
// buffer (the matched whitespace character is included in the cut), or -1 when there is no match.
// NOTE(review): the terminator must be followed by whitespace, so a sentence that ends exactly at the end
// of the buffer is not reported until more text arrives.
5483
+ var nextSpeakableBoundary = (buffer) => {
5484
+ const match = STREAM_SENTENCE_BOUNDARY.exec(buffer);
5485
+ return match ? match.index + match[0].length : -1;
5486
+ };
// softCutBoundary(buffer): fallback cut point for text that has grown long without a sentence boundary.
// Returns -1 while buffer is shorter than MAX_TTS_CHUNK_CHARS. Otherwise it looks at the first
// MAX_TTS_CHUNK_CHARS characters and returns the index just after the last space found there, or
// MAX_TTS_CHUNK_CHARS itself when the only space is at index 0 or there is none.
5487
+ var softCutBoundary = (buffer) => {
5488
+ if (buffer.length < MAX_TTS_CHUNK_CHARS)
5489
+ return -1;
5490
+ const window2 = buffer.slice(0, MAX_TTS_CHUNK_CHARS);
5491
+ const lastSpace = window2.lastIndexOf(" ");
5492
+ return lastSpace > 0 ? lastSpace + 1 : MAX_TTS_CHUNK_CHARS;
5493
+ };
5526
5494
  var calculateMeanConfidence = (transcripts) => {
5527
5495
  let sum = 0;
5528
5496
  let total = 0;
@@ -6992,6 +6960,110 @@ var createVoiceSession = (options) => {
6992
6960
  });
6993
6961
  });
6994
6962
  };
// createTurnTTSStreamer(turn, session): per-turn helper that forwards streamed assistant text to the TTS
// session in chunks. Returns { finish, push }.
//   push(delta)  - ignores falsy deltas; otherwise marks the turn as streamed, appends delta to the full text
//                  and to the pending buffer, flushes every complete sentence found by nextSpeakableBoundary,
//                  then applies softCutBoundary once to what is left.
//   finish()     - flushes the remaining buffer, awaits all queued sends and resolves to { fullText, streamed }.
//                  When at least one chunk was started it also records TTS characters on options.costAccountant
//                  (if present), appends the tts_send_completed latency stage and a turn.assistant trace.
// Failures of ensureTTSSession and of send are logged as warnings and do not reject.
// NOTE(review): ensureTTSSession, activeTTSTurnId, appendTurnLatencyStage, appendTrace, logger, toError and
// options come from the enclosing scope, which is only partly visible in this diff.
// NOTE(review): appendTurnLatencyStage is awaited outside the try block in flush; if it rejects, sendChain
// rejects, later chunks are skipped and finish() rethrows - confirm this is intended.
6963
+ const createTurnTTSStreamer = (turn, session) => {
6964
+ let buffer = "";
6965
+ let full = "";
6966
+ let charsSent = 0;
6967
+ let started = false;
6968
+ let streamed = false;
6969
+ let sendChain = Promise.resolve();
6970
+ let ttsSessionRequest = null;
6971
+ const ttsStartedAt = Date.now();
// ensure: requests the TTS session at most once per turn and caches the promise; a failed request
// resolves to null after logging.
6972
+ const ensure = () => {
6973
+ if (!ttsSessionRequest) {
6974
+ ttsSessionRequest = ensureTTSSession().catch((error) => {
6975
+ logger.warn("voice assistant audio send failed", {
6976
+ error: toError(error).message,
6977
+ sessionId: options.id,
6978
+ turnId: turn.id
6979
+ });
6980
+ return null;
6981
+ });
6982
+ }
6983
+ return ttsSessionRequest;
6984
+ };
// flush: queues one chunk behind the previous one so chunks are sent in order. Blank chunks are ignored.
// Once this turn has started, a chunk is dropped if activeTTSTurnId no longer equals turn.id.
// The first chunk that reaches a session sets activeTTSTurnId and records the tts_send_started stage,
// timestamped with ttsStartedAt (captured when the streamer was created).
// charsSent grows only after send resolved.
6985
+ const flush = (text) => {
6986
+ if (!text.trim())
6987
+ return;
6988
+ const previous = sendChain;
6989
+ sendChain = (async () => {
6990
+ await previous;
6991
+ if (started && activeTTSTurnId !== turn.id)
6992
+ return;
6993
+ const ttsSession2 = await ensure();
6994
+ if (!ttsSession2 || started && activeTTSTurnId !== turn.id)
6995
+ return;
6996
+ if (!started) {
6997
+ activeTTSTurnId = turn.id;
6998
+ await appendTurnLatencyStage({
6999
+ at: ttsStartedAt,
7000
+ session,
7001
+ stage: "tts_send_started",
7002
+ turnId: turn.id
7003
+ });
7004
+ started = true;
7005
+ }
7006
+ try {
7007
+ await ttsSession2.send(text);
7008
+ charsSent += text.length;
7009
+ } catch (error) {
7010
+ logger.warn("voice assistant audio send failed", {
7011
+ error: toError(error).message,
7012
+ sessionId: options.id,
7013
+ turnId: turn.id
7014
+ });
7015
+ }
7016
+ })();
7017
+ };
7018
+ return {
7019
+ finish: async () => {
7020
+ if (buffer.trim()) {
7021
+ flush(buffer);
7022
+ }
7023
+ buffer = "";
7024
+ await sendChain;
7025
+ if (started) {
7026
+ if (options.costAccountant) {
7027
+ options.costAccountant.recordTTS({ characters: charsSent });
7028
+ }
7029
+ await appendTurnLatencyStage({
7030
+ session,
7031
+ stage: "tts_send_completed",
7032
+ turnId: turn.id
7033
+ });
7034
+ await appendTrace({
7035
+ payload: {
7036
+ elapsedMs: Date.now() - ttsStartedAt,
7037
+ status: "sent",
7038
+ streamed: true
7039
+ },
7040
+ session,
7041
+ turnId: turn.id,
7042
+ type: "turn.assistant"
7043
+ });
7044
+ }
7045
+ return { fullText: full, streamed };
7046
+ },
7047
+ push: (delta) => {
7048
+ if (!delta)
7049
+ return;
7050
+ streamed = true;
7051
+ full += delta;
7052
+ buffer += delta;
7053
+ let boundary = nextSpeakableBoundary(buffer);
7054
+ while (boundary !== -1) {
7055
+ flush(buffer.slice(0, boundary));
7056
+ buffer = buffer.slice(boundary);
7057
+ boundary = nextSpeakableBoundary(buffer);
7058
+ }
7059
+ const cut = softCutBoundary(buffer);
7060
+ if (cut !== -1) {
7061
+ flush(buffer.slice(0, cut));
7062
+ buffer = buffer.slice(cut);
7063
+ }
7064
+ }
7065
+ };
7066
+ };
6995
7067
  const completeTurn = async (session, turn) => {
6996
7068
  const liveOpsControl = await options.liveOps?.getControl(options.id);
6997
7069
  if (liveOpsControl?.assistantPaused || liveOpsControl?.operatorTakeover) {
@@ -7012,6 +7084,7 @@ var createVoiceSession = (options) => {
7012
7084
  return;
7013
7085
  }
7014
7086
  const injectedInstruction = liveOpsControl?.injectedInstruction?.trim();
7087
+ const ttsStreamer = options.tts ? createTurnTTSStreamer(turn, session) : undefined;
7015
7088
  const committedOutput = await options.route.onTurn({
7016
7089
  api,
7017
7090
  context: options.context,
@@ -7019,6 +7092,7 @@ var createVoiceSession = (options) => {
7019
7092
  control: liveOpsControl,
7020
7093
  injectedInstruction
7021
7094
  } : undefined,
7095
+ onTextDelta: ttsStreamer?.push,
7022
7096
  session,
7023
7097
  turn
7024
7098
  });
@@ -7038,7 +7112,28 @@ var createVoiceSession = (options) => {
7038
7112
  setTurnResult(currentSession, turn.id, { citations: turnCitations });
7039
7113
  });
7040
7114
  }
7041
- if (output?.assistantText) {
7115
+ const streamResult = ttsStreamer ? await ttsStreamer.finish() : undefined;
7116
+ if (streamResult?.streamed) {
7117
+ output.assistantText = streamResult.fullText || output.assistantText;
7118
+ if (output.assistantText) {
7119
+ const finalText = output.assistantText;
7120
+ await writeSession((currentSession) => {
7121
+ setTurnResult(currentSession, turn.id, { assistantText: finalText });
7122
+ });
7123
+ await send({ text: finalText, turnId: turn.id, type: "assistant" });
7124
+ await appendTrace({
7125
+ payload: {
7126
+ assistantMode: resolveVoiceAssistantMode(options),
7127
+ realtimeConfigured: Boolean(options.realtime),
7128
+ text: finalText,
7129
+ ttsConfigured: Boolean(options.tts)
7130
+ },
7131
+ session,
7132
+ turnId: turn.id,
7133
+ type: "turn.assistant"
7134
+ });
7135
+ }
7136
+ } else if (output?.assistantText) {
7042
7137
  const assistantTextStartedAt = Date.now();
7043
7138
  await writeSession((currentSession) => {
7044
7139
  setTurnResult(currentSession, turn.id, {
@@ -7443,7 +7538,7 @@ var createVoiceSession = (options) => {
7443
7538
  kickCallSilenceWatchdog();
7444
7539
  startAmdEvaluationTimer();
7445
7540
  if (shouldFireOnSession && options.greeting && session.turns.length === 0) {
7446
- const greetingText = typeof options.greeting === "function" ? await options.greeting() : options.greeting;
7541
+ const greetingText = typeof options.greeting === "function" ? await options.greeting({ session }) : options.greeting;
7447
7542
  const greetingTurnId = createId();
7448
7543
  await send({
7449
7544
  text: greetingText,
@@ -12743,8 +12838,23 @@ var createTwilioSocketAdapter = (socket, getState) => ({
12743
12838
  if (!state.streamSid) {
12744
12839
  return;
12745
12840
  }
12841
+ const clearMessage = { event: "clear", streamSid: state.streamSid };
12746
12842
  state.reviewRecorder?.recordTwilioOutbound({ event: "clear" });
12747
- await Promise.resolve(socket.send(JSON.stringify({ event: "clear", streamSid: state.streamSid })));
12843
+ await state.trace?.append({
12844
+ at: Date.now(),
12845
+ payload: {
12846
+ callSid: state.callSid ?? undefined,
12847
+ carrier: state.carrier,
12848
+ direction: "outbound",
12849
+ envelope: clearMessage,
12850
+ event: "clear",
12851
+ streamId: state.streamSid
12852
+ },
12853
+ scenarioId: state.scenarioId ?? undefined,
12854
+ sessionId: state.sessionId ?? state.streamSid,
12855
+ type: "client.telephony_media"
12856
+ });
12857
+ await Promise.resolve(socket.send(JSON.stringify(clearMessage)));
12748
12858
  },
12749
12859
  close: async (code, reason) => {
12750
12860
  await Promise.resolve(socket.close(code, reason));
@@ -13266,7 +13376,7 @@ var createFakeSTTAdapter = (inputSpy, sttDelayMs) => ({
13266
13376
  final: new Set,
13267
13377
  partial: new Set
13268
13378
  };
13269
- let delivered = false;
13379
+ let sendCount = 0;
13270
13380
  return {
13271
13381
  close: async () => {
13272
13382
  for (const handler of listeners.close) {
@@ -13281,31 +13391,44 @@ var createFakeSTTAdapter = (inputSpy, sttDelayMs) => ({
13281
13391
  },
13282
13392
  send: async (audio) => {
13283
13393
  inputSpy.push(toUint8Array2(audio));
13284
- if (delivered) {
13394
+ sendCount += 1;
13395
+ if (sendCount === 1) {
13396
+ if (sttDelayMs > 0) {
13397
+ await Bun.sleep(sttDelayMs);
13398
+ }
13399
+ const receivedAt = Date.now();
13400
+ for (const handler of listeners.final) {
13401
+ handler({
13402
+ receivedAt,
13403
+ transcript: {
13404
+ id: "telephony-benchmark-final",
13405
+ isFinal: true,
13406
+ text: "hello from twilio"
13407
+ },
13408
+ type: "final"
13409
+ });
13410
+ }
13411
+ for (const handler of listeners.endOfTurn) {
13412
+ handler({
13413
+ reason: "vendor",
13414
+ receivedAt,
13415
+ type: "endOfTurn"
13416
+ });
13417
+ }
13285
13418
  return;
13286
13419
  }
13287
- delivered = true;
13288
- if (sttDelayMs > 0) {
13289
- await Bun.sleep(sttDelayMs);
13290
- }
13291
- const receivedAt = Date.now();
13292
- for (const handler of listeners.final) {
13293
- handler({
13294
- receivedAt,
13295
- transcript: {
13296
- id: "telephony-benchmark-final",
13297
- isFinal: true,
13298
- text: "hello from twilio"
13299
- },
13300
- type: "final"
13301
- });
13302
- }
13303
- for (const handler of listeners.endOfTurn) {
13304
- handler({
13305
- reason: "vendor",
13306
- receivedAt,
13307
- type: "endOfTurn"
13308
- });
13420
+ if (sendCount === 2) {
13421
+ for (const handler of listeners.partial) {
13422
+ handler({
13423
+ receivedAt: Date.now(),
13424
+ transcript: {
13425
+ id: "telephony-benchmark-partial",
13426
+ isFinal: false,
13427
+ text: "actually wait"
13428
+ },
13429
+ type: "partial"
13430
+ });
13431
+ }
13309
13432
  }
13310
13433
  }
13311
13434
  };