@absolutejs/voice 0.0.22-beta.544 → 0.0.22-beta.546

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -3661,6 +3661,19 @@ var countWords2 = (text) => text.trim().split(/\s+/).filter(Boolean).length;
3661
3661
  var normalizeText2 = (text) => text.trim().replace(/\s+/g, " ");
3662
3662
  var getAudioChunkDurationMs = (chunk) => chunk.byteLength / (DEFAULT_FORMAT.sampleRateHz * DEFAULT_FORMAT.channels * 2) * 1000;
3663
3663
  var getBufferedAudioDurationMs = (chunks) => chunks.reduce((total, chunk) => total + getAudioChunkDurationMs(chunk), 0);
3664
// A sentence terminator (. ! ? or the ellipsis character), followed by any run
// of closing quotes/brackets, followed by one whitespace character. The closing
// set accepts typographic quotes (\u2019, \u201D) as well as the straight ones,
// so a sentence ending in a curly quote is still recognised as finished.
// The trailing whitespace is required: a terminator at the very end of the
// buffer does not match, because more text may still be appended to it.
var STREAM_SENTENCE_BOUNDARY = /[.!?\u2026]['")\]\u2019\u201D]*\s/;
// Buffer length (in UTF-16 code units) at which softCutBoundary starts cutting.
var MAX_TTS_CHUNK_CHARS = 220;
/**
 * Finds the end of the first complete sentence in `buffer`.
 *
 * @param {string} buffer Text accumulated so far.
 * @returns {number} Index just past the first sentence boundary (terminator,
 *   closers and the following whitespace character), or -1 when none is found.
 */
var nextSpeakableBoundary = (buffer) => {
  // The pattern has no `g`/`y` flag, so exec() keeps no state between calls.
  const match = STREAM_SENTENCE_BOUNDARY.exec(buffer);
  return match ? match.index + match[0].length : -1;
};
3670
/**
 * Chooses where to cut a buffer that has grown to the chunk limit without
 * containing a sentence boundary.
 *
 * The cut lands just after the last space inside the first `maxChars` code
 * units. When there is no usable space, the buffer is cut at `maxChars`,
 * backing off by one code unit if that would split a UTF-16 surrogate pair.
 *
 * @param {string} buffer Text accumulated so far.
 * @param {number} [maxChars=MAX_TTS_CHUNK_CHARS] Positive chunk limit in UTF-16 code units.
 * @returns {number} Cut index (always > 0 for a positive limit), or -1 when
 *   the buffer is still shorter than the limit.
 */
var softCutBoundary = (buffer, maxChars = MAX_TTS_CHUNK_CHARS) => {
  if (buffer.length < maxChars) {
    return -1;
  }
  const window2 = buffer.slice(0, maxChars);
  const lastSpace = window2.lastIndexOf(" ");
  // A space at index 0 would produce a chunk that is only whitespace, so it
  // is treated the same as "no space found".
  if (lastSpace > 0) {
    return lastSpace + 1;
  }
  // Hard cut. If the last unit inside the window is a high surrogate, its low
  // half sits just outside the window; keep the pair together in the remainder.
  const lastUnit = buffer.charCodeAt(maxChars - 1);
  const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
  return endsOnHighSurrogate && maxChars > 1 ? maxChars - 1 : maxChars;
};
3664
3677
  var calculateMeanConfidence = (transcripts) => {
3665
3678
  let sum = 0;
3666
3679
  let total = 0;
@@ -4888,6 +4901,9 @@ var createVoiceSession = (options) => {
4888
4901
  };
4889
4902
  };
4890
4903
  const handlePartial = async (transcript) => {
4904
+ if (activeTTSTurnId !== undefined && transcript.text.trim()) {
4905
+ cancelActiveTTS("barge-in");
4906
+ }
4891
4907
  const session = await writeSession((session2) => {
4892
4908
  const nextPartialStartedAt = transcript.startedAtMs ?? session2.currentTurn.partialStartedAt;
4893
4909
  const nextPartialEndedAt = transcript.endedAtMs ?? session2.currentTurn.partialEndedAt;
@@ -5127,6 +5143,110 @@ var createVoiceSession = (options) => {
5127
5143
  });
5128
5144
  });
5129
5145
  };
5146
/**
 * Creates the streaming text-to-speech helper for one assistant turn.
 *
 * Text deltas handed to `push` are buffered and forwarded to the TTS session
 * in speakable chunks: whole sentences, or soft cuts once the buffer reaches
 * the chunk limit. `finish` flushes whatever is left, waits for every queued
 * send, and reports what was streamed.
 *
 * @param turn    Turn being answered; `turn.id` tags logs, traces and latency stages.
 * @param session Session handed through to `appendTurnLatencyStage` / `appendTrace`.
 * @returns `{ push(delta), finish() }`; `finish` resolves to `{ fullText, streamed }`.
 */
const createTurnTTSStreamer = (turn, session) => {
  let buffer = "";
  let full = "";
  let charsSent = 0;
  let started = false;
  let streamed = false;
  let sendChain = Promise.resolve();
  let ttsSessionRequest = null;
  const ttsStartedAt = Date.now();
  // The TTS session is requested at most once per turn. A failed request is
  // logged and resolves to null, which makes every later chunk a no-op.
  const ensure = () => {
    if (!ttsSessionRequest) {
      ttsSessionRequest = ensureTTSSession().catch((error) => {
        logger.warn("voice assistant audio send failed", {
          error: toError(error).message,
          sessionId: options.id,
          turnId: turn.id
        });
        return null;
      });
    }
    return ttsSessionRequest;
  };
  // Queues one chunk. Each send waits for the previous one, so chunks reach
  // the TTS session in the order they were flushed.
  const flush = (text) => {
    if (!text.trim()) {
      return;
    }
    const previous = sendChain;
    sendChain = (async () => {
      await previous;
      // Once this turn has started speaking, a different activeTTSTurnId means
      // it no longer owns TTS output, so the chunk is dropped.
      // NOTE(review): presumably cancelActiveTTS("barge-in") is what changes
      // activeTTSTurnId - confirm against its definition.
      if (started && activeTTSTurnId !== turn.id) {
        return;
      }
      const ttsSession2 = await ensure();
      if (!ttsSession2 || started && activeTTSTurnId !== turn.id) {
        return;
      }
      if (!started) {
        // First chunk of the turn: claim TTS output and record the latency
        // stage, timestamped with the moment the streamer was created.
        activeTTSTurnId = turn.id;
        await appendTurnLatencyStage({
          at: ttsStartedAt,
          session,
          stage: "tts_send_started",
          turnId: turn.id
        });
        started = true;
      }
      try {
        await ttsSession2.send(text);
        // Only text that was actually sent counts toward TTS cost.
        charsSent += text.length;
      } catch (error) {
        logger.warn("voice assistant audio send failed", {
          error: toError(error).message,
          sessionId: options.id,
          turnId: turn.id
        });
      }
    })();
  };
  return {
    finish: async () => {
      if (buffer.trim()) {
        flush(buffer);
      }
      buffer = "";
      await sendChain;
      if (started) {
        if (options.costAccountant) {
          options.costAccountant.recordTTS({ characters: charsSent });
        }
        await appendTurnLatencyStage({
          session,
          stage: "tts_send_completed",
          turnId: turn.id
        });
        await appendTrace({
          payload: {
            elapsedMs: Date.now() - ttsStartedAt,
            status: "sent",
            streamed: true
          },
          session,
          turnId: turn.id,
          type: "turn.assistant"
        });
      }
      return { fullText: full, streamed };
    },
    push: (delta) => {
      if (!delta) {
        return;
      }
      streamed = true;
      full += delta;
      buffer += delta;
      // Flush every complete sentence currently in the buffer.
      let boundary = nextSpeakableBoundary(buffer);
      while (boundary !== -1) {
        flush(buffer.slice(0, boundary));
        buffer = buffer.slice(boundary);
        boundary = nextSpeakableBoundary(buffer);
      }
      // Keep cutting until the remainder is below the chunk limit, so a single
      // long unpunctuated delta cannot leave an oversized chunk behind.
      // `cut > 0` also guarantees the buffer shrinks on every iteration.
      let cut = softCutBoundary(buffer);
      while (cut > 0) {
        flush(buffer.slice(0, cut));
        buffer = buffer.slice(cut);
        cut = softCutBoundary(buffer);
      }
    }
  };
};
5130
5250
  const completeTurn = async (session, turn) => {
5131
5251
  const liveOpsControl = await options.liveOps?.getControl(options.id);
5132
5252
  if (liveOpsControl?.assistantPaused || liveOpsControl?.operatorTakeover) {
@@ -5147,6 +5267,7 @@ var createVoiceSession = (options) => {
5147
5267
  return;
5148
5268
  }
5149
5269
  const injectedInstruction = liveOpsControl?.injectedInstruction?.trim();
5270
+ const ttsStreamer = options.tts ? createTurnTTSStreamer(turn, session) : undefined;
5150
5271
  const committedOutput = await options.route.onTurn({
5151
5272
  api,
5152
5273
  context: options.context,
@@ -5154,6 +5275,7 @@ var createVoiceSession = (options) => {
5154
5275
  control: liveOpsControl,
5155
5276
  injectedInstruction
5156
5277
  } : undefined,
5278
+ onTextDelta: ttsStreamer?.push,
5157
5279
  session,
5158
5280
  turn
5159
5281
  });
@@ -5173,7 +5295,28 @@ var createVoiceSession = (options) => {
5173
5295
  setTurnResult(currentSession, turn.id, { citations: turnCitations });
5174
5296
  });
5175
5297
  }
5176
- if (output?.assistantText) {
5298
+ const streamResult = ttsStreamer ? await ttsStreamer.finish() : undefined;
5299
+ if (streamResult?.streamed) {
5300
+ output.assistantText = streamResult.fullText || output.assistantText;
5301
+ if (output.assistantText) {
5302
+ const finalText = output.assistantText;
5303
+ await writeSession((currentSession) => {
5304
+ setTurnResult(currentSession, turn.id, { assistantText: finalText });
5305
+ });
5306
+ await send({ text: finalText, turnId: turn.id, type: "assistant" });
5307
+ await appendTrace({
5308
+ payload: {
5309
+ assistantMode: resolveVoiceAssistantMode(options),
5310
+ realtimeConfigured: Boolean(options.realtime),
5311
+ text: finalText,
5312
+ ttsConfigured: Boolean(options.tts)
5313
+ },
5314
+ session,
5315
+ turnId: turn.id,
5316
+ type: "turn.assistant"
5317
+ });
5318
+ }
5319
+ } else if (output?.assistantText) {
5177
5320
  const assistantTextStartedAt = Date.now();
5178
5321
  await writeSession((currentSession) => {
5179
5322
  setTurnResult(currentSession, turn.id, {
@@ -5661,9 +5804,6 @@ var createVoiceSession = (options) => {
5661
5804
  if (amdFirstAudioAt === undefined) {
5662
5805
  amdFirstAudioAt = Date.now();
5663
5806
  }
5664
- if (!speechDetected && activeTTSTurnId !== undefined) {
5665
- cancelActiveTTS("barge-in");
5666
- }
5667
5807
  speechDetected = true;
5668
5808
  clearSilenceTimer();
5669
5809
  kickCallSilenceWatchdog();
@@ -6825,6 +6965,100 @@ var appendVoiceAgentSquadHandoff = async (input) => {
6825
6965
  });
6826
6966
  return handoff;
6827
6967
  };
6968
// Built-in tool definitions through which the model requests a call lifecycle
// outcome (transfer, escalation, voicemail, no-answer, completion).
var LIFECYCLE_TOOLS = (() => {
  // Schema for tools that take no arguments. Built fresh per tool so that no
  // two entries share one mutable object.
  const noArguments = () => ({ additionalProperties: false, properties: {}, type: "object" });
  const transferCall = {
    description: "Transfer the call to a human agent or phone number. Say a short handoff line to the caller first, then call this.",
    name: "transfer_call",
    parameters: {
      additionalProperties: false,
      properties: {
        reason: { description: "Why you are transferring", type: "string" },
        target: {
          description: "Agent id or phone number to transfer to",
          type: "string"
        }
      },
      required: ["target"],
      type: "object"
    }
  };
  const escalate = {
    description: "Escalate to a supervisor or human when you cannot resolve the caller's request.",
    name: "escalate",
    parameters: {
      additionalProperties: false,
      properties: {
        reason: { description: "Why you are escalating", type: "string" }
      },
      required: ["reason"],
      type: "object"
    }
  };
  const leaveVoicemail = {
    description: "Record that the call reached voicemail or an answering machine.",
    name: "leave_voicemail",
    parameters: noArguments()
  };
  const markNoAnswer = {
    description: "Record that no one answered or the call could not proceed to a conversation.",
    name: "mark_no_answer",
    parameters: noArguments()
  };
  const complete = {
    description: "End the conversation once its goal is met. Optionally include a structured result.",
    name: "complete",
    parameters: {
      additionalProperties: true,
      properties: {
        result: { description: "Structured outcome of the call, if any" }
      },
      type: "object"
    }
  };
  return [transferCall, escalate, leaveVoicemail, markNoAnswer, complete];
})();
// Name lookup used to tell lifecycle tool calls apart from application tool calls.
var LIFECYCLE_TOOL_NAMES = new Set(LIFECYCLE_TOOLS.map(({ name }) => name));
7020
/**
 * Translates one lifecycle tool call into the matching field on `output`.
 * Tool names other than the five lifecycle names leave `output` untouched.
 *
 * @param output   Route output object; mutated in place.
 * @param toolCall Tool call with `name` and optional `args`.
 */
var applyLifecycleToolCall = (output, toolCall) => {
  // Anything that is not an object (missing, null, a raw string, a number) is
  // treated as "no arguments"; this keeps the `in` check below from throwing
  // on primitives.
  const rawArgs = toolCall.args;
  const args = rawArgs !== null && typeof rawArgs === "object" ? rawArgs : {};
  switch (toolCall.name) {
    case "transfer_call":
      output.transfer = {
        reason: typeof args.reason === "string" ? args.reason : undefined,
        // A missing or non-string target is recorded as the empty string.
        target: typeof args.target === "string" ? args.target : ""
      };
      break;
    case "escalate":
      output.escalate = {
        reason: typeof args.reason === "string" ? args.reason : "escalation requested"
      };
      break;
    case "leave_voicemail":
      output.voicemail = {};
      break;
    case "mark_no_answer":
      output.noAnswer = {};
      break;
    case "complete":
      output.complete = true;
      // `result` is copied only when the key is present, so an absent key
      // does not overwrite an existing output.result.
      if ("result" in args) {
        output.result = args.result;
      }
      break;
    default:
      break;
  }
};
7050
/**
 * Reports whether `output` carries any lifecycle outcome: complete, transfer,
 * escalate, voicemail or noAnswer.
 *
 * @param output Route output object to inspect.
 * @returns {boolean} True when at least one of those fields is truthy.
 */
var isLifecycleRequested = (output) => ["complete", "transfer", "escalate", "voicemail", "noAnswer"].some((field) => Boolean(output[field]));
7051
/**
 * Splits a model's tool calls into lifecycle calls and application calls.
 * Lifecycle calls are applied to `output` immediately; the rest are returned
 * in their original order.
 *
 * @param output    Route output object; mutated by lifecycle calls.
 * @param toolCalls Tool calls from the model; may be null or undefined.
 * @returns Array of the non-lifecycle tool calls.
 */
var partitionAppToolCalls = (output, toolCalls) => {
  const remaining = [];
  for (const call of toolCalls ?? []) {
    if (!LIFECYCLE_TOOL_NAMES.has(call.name)) {
      remaining.push(call);
      continue;
    }
    applyLifecycleToolCall(output, call);
  }
  return remaining;
};
6828
7062
  var createVoiceAgent = (options) => {
6829
7063
  const toolMap = new Map(options.tools?.map((tool) => [tool.name, tool]) ?? []);
6830
7064
  const maxToolRounds = Math.max(0, options.maxToolRounds ?? 2);
@@ -6848,9 +7082,10 @@ var createVoiceAgent = (options) => {
6848
7082
  agentId: options.id,
6849
7083
  context: input.context,
6850
7084
  messages,
7085
+ onTextDelta: input.onTextDelta,
6851
7086
  session: input.session,
6852
7087
  system,
6853
- tools: [...toolMap.values()].map((tool) => ({
7088
+ tools: [...LIFECYCLE_TOOLS, ...toolMap.values()].map((tool) => ({
6854
7089
  description: tool.description,
6855
7090
  name: tool.name,
6856
7091
  parameters: tool.parameters
@@ -6915,10 +7150,11 @@ var createVoiceAgent = (options) => {
6915
7150
  role: "assistant"
6916
7151
  });
6917
7152
  }
6918
- if (!output.toolCalls?.length || round === maxToolRounds) {
7153
+ const appToolCalls = partitionAppToolCalls(output, output.toolCalls);
7154
+ if (appToolCalls.length === 0 || isLifecycleRequested(output) || round === maxToolRounds) {
6919
7155
  break;
6920
7156
  }
6921
- for (const toolCall of output.toolCalls) {
7157
+ for (const toolCall of appToolCalls) {
6922
7158
  const tool = toolMap.get(toolCall.name);
6923
7159
  if (!tool) {
6924
7160
  const missingResult = {
@@ -7924,6 +8160,7 @@ var createVoiceAssistant = (options) => {
7924
8160
  }
7925
8161
  const runResult = await runner.run({
7926
8162
  ...input,
8163
+ onTextDelta: input.onTextDelta,
7927
8164
  system: liveOpsInstruction ? `Operator instruction for this turn: ${liveOpsInstruction}` : undefined
7928
8165
  }) ?? {};
7929
8166
  const result = runResult;
@@ -44164,89 +44401,6 @@ var createVoiceProviderOrchestrationProfile = (options) => {
44164
44401
  }
44165
44402
  };
44166
44403
  };
44167
- var OUTPUT_SCHEMA = {
44168
- additionalProperties: false,
44169
- properties: {
44170
- assistantText: {
44171
- type: "string"
44172
- },
44173
- complete: {
44174
- type: "boolean"
44175
- },
44176
- escalate: {
44177
- additionalProperties: false,
44178
- properties: {
44179
- metadata: {
44180
- additionalProperties: true,
44181
- type: "object"
44182
- },
44183
- reason: {
44184
- type: "string"
44185
- }
44186
- },
44187
- required: ["reason"],
44188
- type: "object"
44189
- },
44190
- noAnswer: {
44191
- additionalProperties: false,
44192
- properties: {
44193
- metadata: {
44194
- additionalProperties: true,
44195
- type: "object"
44196
- }
44197
- },
44198
- type: "object"
44199
- },
44200
- result: {
44201
- additionalProperties: true,
44202
- type: "object"
44203
- },
44204
- transfer: {
44205
- additionalProperties: false,
44206
- properties: {
44207
- metadata: {
44208
- additionalProperties: true,
44209
- type: "object"
44210
- },
44211
- reason: {
44212
- type: "string"
44213
- },
44214
- target: {
44215
- type: "string"
44216
- }
44217
- },
44218
- required: ["target"],
44219
- type: "object"
44220
- },
44221
- voicemail: {
44222
- additionalProperties: false,
44223
- properties: {
44224
- metadata: {
44225
- additionalProperties: true,
44226
- type: "object"
44227
- }
44228
- },
44229
- type: "object"
44230
- }
44231
- },
44232
- type: "object"
44233
- };
44234
- var ROUTE_RESULT_INSTRUCTION = "Return only a JSON object with assistantText, complete, transfer, escalate, voicemail, noAnswer, and result when you are not calling tools. Only set transfer, escalate, voicemail, or noAnswer when the user explicitly asks for that lifecycle outcome or a tool result says that exact outcome. Do not infer voicemail from generic words like voice, voice app, or voice integration.";
44235
- var stripJSONCodeFence = (value) => {
44236
- const trimmed = value.trim();
44237
- const match = trimmed.match(/^```(?:json)?\s*([\s\S]*?)\s*```$/i);
44238
- return match?.[1]?.trim() ?? value;
44239
- };
44240
- var parseJSON = (value) => {
44241
- try {
44242
- const parsed = JSON.parse(stripJSONCodeFence(value));
44243
- return parsed && typeof parsed === "object" ? parsed : {};
44244
- } catch {
44245
- return {
44246
- assistantText: value
44247
- };
44248
- }
44249
- };
44250
44404
  var parseJSONValue = (value) => {
44251
44405
  try {
44252
44406
  return JSON.parse(value);
@@ -44717,48 +44871,95 @@ var messageToGeminiContent = (message) => {
44717
44871
  role: message.role === "assistant" ? "model" : "user"
44718
44872
  };
44719
44873
  };
44720
- var extractText = (response) => {
44721
- if (typeof response.output_text === "string") {
44722
- return response.output_text;
44874
// System-prompt suffix that the provider adapters append to the caller's own
// system text.
var VOICE_SYSTEM_INSTRUCTIONS = "You are on a live phone call. Reply with natural, concise spoken sentences \u2014 no markdown, lists, headings, or emoji. To take an action (transfer the call, escalate, record voicemail/no-answer, or end the call), CALL the matching tool rather than describing it in words. Call the complete tool once the conversation's goal is met.";
/**
 * Parses the accumulated JSON argument text of a streamed tool call.
 *
 * @param {string} raw JSON text collected from the stream; may be empty.
 * @returns {object} The parsed argument object, or `{}` when the text is
 *   empty, is not a string, is not valid JSON, or does not decode to a
 *   plain (non-array) object.
 */
var parseToolArgs = (raw) => {
  if (typeof raw !== "string" || !raw.trim()) {
    return {};
  }
  try {
    const parsed = JSON.parse(raw);
    // Tool arguments are a key/value bag. Arrays also report
    // typeof "object", so they are rejected explicitly.
    return parsed && typeof parsed === "object" && !Array.isArray(parsed) ? parsed : {};
  } catch {
    // Truncated or malformed argument text is treated as "no arguments".
    return {};
  }
};
44743
- var extractToolCalls = (response) => {
44744
- const output = Array.isArray(response.output) ? response.output : [];
44745
- const toolCalls = [];
44746
- for (const item of output) {
44747
- if (!item || typeof item !== "object") {
44748
- continue;
44886
/**
 * Reads a server-sent-events response body and hands each JSON `data:`
 * payload to `onEvent` as soon as its event block is complete.
 *
 * Both LF and CRLF framing are accepted. Lines that are not `data:` lines,
 * empty payloads and the `[DONE]` sentinel are skipped.
 *
 * @param response Fetch-style response whose `body` exposes `getReader()`.
 * @param onEvent  Callback invoked with each parsed payload.
 * @throws {Error} When the response has no readable body.
 */
var readServerSentEvents = async (response, onEvent) => {
  const reader = response.body?.getReader();
  if (!reader) {
    throw new Error("streaming response has no body");
  }
  const decoder = new TextDecoder();
  let buffer = "";
  // Dispatches every `data:` line of one event block.
  const drain = (block) => {
    for (const line of block.split("\n")) {
      const trimmed = line.trimStart();
      if (!trimmed.startsWith("data:")) {
        continue;
      }
      const data = trimmed.slice("data:".length).trim();
      if (!data || data === "[DONE]") {
        continue;
      }
      try {
        onEvent(JSON.parse(data));
      } catch {
        // Best-effort: a malformed payload (or a failing handler) must not
        // abort the rest of the stream.
      }
    }
  };
  // Dispatches every complete (blank-line-terminated) block in the buffer and
  // leaves the unterminated tail for the next chunk.
  const drainCompleteEvents = () => {
    let separator = buffer.indexOf("\n\n");
    while (separator !== -1) {
      drain(buffer.slice(0, separator));
      buffer = buffer.slice(separator + 2);
      separator = buffer.indexOf("\n\n");
    }
  };
  for (;;) {
    const { done, value } = await reader.read();
    if (done) {
      break;
    }
    // Normalise CRLF on the whole buffer rather than per chunk, so a "\r\n"
    // pair split across two chunks is still collapsed once both halves arrive.
    buffer = (buffer + decoder.decode(value, { stream: true })).replace(/\r\n/g, "\n");
    drainCompleteEvents();
  }
  // Flush any bytes the streaming decoder is still holding.
  buffer = (buffer + decoder.decode()).replace(/\r\n/g, "\n");
  if (buffer.trim()) {
    drain(buffer);
  }
};
44926
/**
 * Converts the in-progress tool-call accumulator into finished tool calls.
 * Entries without a name are dropped; each remaining entry has its
 * accumulated argument text parsed.
 *
 * @param {Map} calls Accumulator whose values hold `name`, `id` and `args` text.
 * @returns Array of `{ args, id, name }`, in the map's insertion order.
 */
var finalizeToolCalls = (calls) => {
  const finished = [];
  for (const pending of calls.values()) {
    if (!pending.name) {
      continue;
    }
    finished.push({
      args: parseToolArgs(pending.args),
      id: pending.id,
      name: pending.name
    });
  }
  return finished;
};
44931
/**
 * Consumes an OpenAI Responses streaming body, collecting assistant text,
 * function calls and usage.
 *
 * @param response    Streaming HTTP response.
 * @param onTextDelta Optional callback invoked with each text delta as it arrives.
 * @returns `{ assistantText, toolCalls, usage }`; `usage` is undefined when
 *   no completed event carried one.
 */
var consumeOpenAIResponsesStream = async (response, onTextDelta) => {
  const calls = new Map();
  let assistantText = "";
  let usage;
  // Function calls are keyed by item id, falling back to call id.
  const keyOf = (item) => String(item.id ?? item.call_id ?? "");
  await readServerSentEvents(response, (event) => {
    const type = typeof event.type === "string" ? event.type : "";
    const { item } = event;
    switch (type) {
      case "response.output_text.delta": {
        if (typeof event.delta === "string") {
          assistantText += event.delta;
          onTextDelta?.(event.delta);
        }
        break;
      }
      case "response.output_item.added": {
        if (item?.type === "function_call") {
          calls.set(keyOf(item), {
            args: typeof item.arguments === "string" ? item.arguments : "",
            id: typeof item.call_id === "string" ? item.call_id : item.id,
            name: typeof item.name === "string" ? item.name : ""
          });
        }
        break;
      }
      case "response.function_call_arguments.delta": {
        if (typeof event.delta === "string") {
          const pending = calls.get(String(event.item_id ?? ""));
          if (pending) {
            pending.args += event.delta;
          }
        }
        break;
      }
      case "response.output_item.done": {
        // A non-empty final argument string replaces whatever the deltas built up.
        if (item?.type === "function_call" && typeof item.arguments === "string" && item.arguments) {
          const pending = calls.get(keyOf(item));
          if (pending) {
            pending.args = item.arguments;
          }
        }
        break;
      }
      case "response.completed": {
        const completed = event.response;
        if (completed?.usage && typeof completed.usage === "object") {
          usage = completed.usage;
        }
        break;
      }
      default:
        break;
    }
  });
  return { assistantText, toolCalls: finalizeToolCalls(calls), usage };
};
44763
44964
  var createOpenAIVoiceAssistantModel = (options) => {
44764
44965
  const fetchImpl = options.fetch ?? globalThis.fetch;
@@ -44769,23 +44970,13 @@ var createOpenAIVoiceAssistantModel = (options) => {
44769
44970
  const response = await fetchImpl(`${baseUrl.replace(/\/$/, "")}/responses`, {
44770
44971
  body: JSON.stringify({
44771
44972
  input: messagesToOpenAIInput(input.messages),
44772
- instructions: [
44773
- input.system,
44774
- "Return a JSON object with assistantText, complete, transfer, escalate, voicemail, noAnswer, and result when you are not calling tools."
44775
- ].filter(Boolean).join(`
44973
+ instructions: [input.system, VOICE_SYSTEM_INSTRUCTIONS].filter(Boolean).join(`
44776
44974
 
44777
44975
  `),
44778
44976
  max_output_tokens: options.maxOutputTokens,
44779
44977
  model,
44978
+ stream: true,
44780
44979
  temperature: options.temperature,
44781
- text: {
44782
- format: {
44783
- name: "voice_route_result",
44784
- schema: OUTPUT_SCHEMA,
44785
- strict: false,
44786
- type: "json_schema"
44787
- }
44788
- },
44789
44980
  tool_choice: input.tools.length ? "auto" : "none",
44790
44981
  tools: input.tools.map((tool) => ({
44791
44982
  description: tool.description,
@@ -44799,6 +44990,7 @@ var createOpenAIVoiceAssistantModel = (options) => {
44799
44990
  }))
44800
44991
  }),
44801
44992
  headers: {
44993
+ accept: "text/event-stream",
44802
44994
  authorization: `Bearer ${options.apiKey}`,
44803
44995
  "content-type": "application/json"
44804
44996
  },
@@ -44807,43 +44999,52 @@ var createOpenAIVoiceAssistantModel = (options) => {
44807
44999
  if (!response.ok) {
44808
45000
  throw createHTTPError("OpenAI", response);
44809
45001
  }
44810
- const body = await response.json();
44811
- if (body.usage && typeof body.usage === "object") {
44812
- await options.onUsage?.(body.usage);
44813
- }
44814
- const toolCalls = extractToolCalls(body);
44815
- if (toolCalls.length) {
44816
- return {
44817
- toolCalls
44818
- };
45002
+ const { assistantText, toolCalls, usage } = await consumeOpenAIResponsesStream(response, input.onTextDelta);
45003
+ if (usage) {
45004
+ await options.onUsage?.(usage);
44819
45005
  }
44820
- return normalizeRouteOutput(parseJSON(extractText(body)));
45006
+ return {
45007
+ ...assistantText ? { assistantText } : {},
45008
+ ...toolCalls.length ? { toolCalls } : {}
45009
+ };
44821
45010
  }
44822
45011
  };
44823
45012
  };
44824
- var extractAnthropicText = (response) => {
44825
- const content = Array.isArray(response.content) ? response.content : [];
44826
- return content.map((item) => item && typeof item === "object" && item.type === "text" && typeof item.text === "string" ? item.text : "").filter(Boolean).join(`
44827
- `);
44828
- };
44829
- var extractAnthropicToolCalls = (response) => {
44830
- const content = Array.isArray(response.content) ? response.content : [];
44831
- const toolCalls = [];
44832
- for (const item of content) {
44833
- if (!item || typeof item !== "object") {
44834
- continue;
44835
- }
44836
- const record = item;
44837
- if (record.type !== "tool_use" || typeof record.name !== "string") {
44838
- continue;
45013
/**
 * Consumes an Anthropic streaming body, collecting assistant text, tool-use
 * calls and usage.
 *
 * @param response    Streaming HTTP response.
 * @param onTextDelta Optional callback invoked with each text delta as it arrives.
 * @returns `{ assistantText, toolCalls, usage }`; `usage` is undefined when
 *   the stream carried none.
 */
var consumeAnthropicStream = async (response, onTextDelta) => {
  const calls = new Map();
  let assistantText = "";
  let usage;
  await readServerSentEvents(response, (event) => {
    const type = typeof event.type === "string" ? event.type : "";
    const { delta } = event;
    // Tool-use blocks are keyed by the content-block index on the event.
    const blockKey = () => String(event.index ?? "");
    switch (type) {
      case "content_block_delta": {
        if (delta?.type === "text_delta") {
          if (typeof delta.text === "string") {
            assistantText += delta.text;
            onTextDelta?.(delta.text);
          }
        } else if (delta?.type === "input_json_delta" && typeof delta.partial_json === "string") {
          const pending = calls.get(blockKey());
          if (pending) {
            pending.args += delta.partial_json;
          }
        }
        break;
      }
      case "content_block_start": {
        const block = event.content_block;
        if (block?.type === "tool_use") {
          calls.set(blockKey(), {
            args: "",
            id: typeof block.id === "string" ? block.id : undefined,
            name: typeof block.name === "string" ? block.name : ""
          });
        }
        break;
      }
      case "message_start": {
        const message = event.message;
        if (message?.usage && typeof message.usage === "object") {
          usage = message.usage;
        }
        break;
      }
      case "message_delta": {
        // Later usage fields are merged over the ones already recorded.
        if (event.usage && typeof event.usage === "object") {
          usage = { ...usage, ...event.usage };
        }
        break;
      }
      default:
        break;
    }
  });
  return { assistantText, toolCalls: finalizeToolCalls(calls), usage };
};
44848
45049
  var createAnthropicVoiceAssistantModel = (options) => {
44849
45050
  const fetchImpl = options.fetch ?? globalThis.fetch;
@@ -44856,7 +45057,8 @@ var createAnthropicVoiceAssistantModel = (options) => {
44856
45057
  max_tokens: options.maxOutputTokens ?? 1024,
44857
45058
  messages: input.messages.map(messageToAnthropicMessage).filter(Boolean),
44858
45059
  model,
44859
- system: [input.system, ROUTE_RESULT_INSTRUCTION].filter(Boolean).join(`
45060
+ stream: true,
45061
+ system: [input.system, VOICE_SYSTEM_INSTRUCTIONS].filter(Boolean).join(`
44860
45062
 
44861
45063
  `),
44862
45064
  temperature: options.temperature,
@@ -44880,57 +45082,55 @@ var createAnthropicVoiceAssistantModel = (options) => {
44880
45082
  if (!response.ok) {
44881
45083
  throw createHTTPError("Anthropic", response);
44882
45084
  }
44883
- const body = await response.json();
44884
- if (body.usage && typeof body.usage === "object") {
44885
- await options.onUsage?.(body.usage);
44886
- }
44887
- const toolCalls = extractAnthropicToolCalls(body);
44888
- if (toolCalls.length) {
44889
- return {
44890
- assistantText: extractAnthropicText(body) || undefined,
44891
- toolCalls
44892
- };
45085
+ const { assistantText, toolCalls, usage } = await consumeAnthropicStream(response, input.onTextDelta);
45086
+ if (usage) {
45087
+ await options.onUsage?.(usage);
44893
45088
  }
44894
- return normalizeRouteOutput(parseJSON(extractAnthropicText(body)));
45089
+ return {
45090
+ ...assistantText ? { assistantText } : {},
45091
+ ...toolCalls.length ? { toolCalls } : {}
45092
+ };
44895
45093
  }
44896
45094
  };
44897
45095
  };
44898
- var extractGeminiCandidateParts = (response) => {
44899
- const candidates = Array.isArray(response.candidates) ? response.candidates : [];
44900
- const first = candidates[0];
44901
- if (!first || typeof first !== "object") {
44902
- return [];
45096
/**
 * Processes one Gemini content part.
 *
 * A non-empty text part is reported through `collect.onTextDelta` (when set)
 * and returned. A function-call part with a string name is appended to
 * `collect.toolCalls`. Anything else is ignored.
 *
 * @param part    Candidate content part; may be anything.
 * @param collect `{ onTextDelta?, toolCalls }` sink for this stream.
 * @returns {string} The part's text, or "" when it carried no text.
 */
var handleGeminiPart = (part, collect) => {
  const isObject = (value) => Boolean(value) && typeof value === "object";
  if (!isObject(part)) {
    return "";
  }
  const { text, functionCall } = part;
  if (typeof text === "string" && text) {
    collect.onTextDelta?.(text);
    return text;
  }
  if (isObject(functionCall) && typeof functionCall.name === "string") {
    const { args, id, name } = functionCall;
    collect.toolCalls.push({
      args: isObject(args) ? args : {},
      id: typeof id === "string" ? id : undefined,
      name
    });
  }
  return "";
};
44911
- var extractGeminiText = (response) => extractGeminiCandidateParts(response).map((part) => part && typeof part === "object" && typeof part.text === "string" ? part.text : "").filter(Boolean).join(`
44912
- `);
44913
- var extractGeminiToolCalls = (response) => {
45117
/**
 * Consumes a Gemini streaming body, collecting assistant text, function
 * calls and usage metadata. Only the first candidate of each event is read.
 *
 * @param response    Streaming HTTP response.
 * @param onTextDelta Optional callback invoked with each text part as it arrives.
 * @returns `{ assistantText, toolCalls, usage }`; `usage` is the most recent
 *   `usageMetadata` seen, or undefined.
 */
var consumeGeminiStream = async (response, onTextDelta) => {
  const toolCalls = [];
  const collect = { onTextDelta, toolCalls };
  let assistantText = "";
  let usage;
  await readServerSentEvents(response, (event) => {
    const { usageMetadata, candidates } = event;
    if (usageMetadata && typeof usageMetadata === "object") {
      usage = usageMetadata;
    }
    const [first] = Array.isArray(candidates) ? candidates : [];
    const parts = first?.content?.parts;
    if (!Array.isArray(parts)) {
      return;
    }
    for (const part of parts) {
      assistantText += handleGeminiPart(part, collect);
    }
  });
  return { assistantText, toolCalls, usage };
};
44935
45135
  var createGeminiVoiceAssistantModel = (options) => {
44936
45136
  const fetchImpl = options.fetch ?? globalThis.fetch;
@@ -44939,7 +45139,7 @@ var createGeminiVoiceAssistantModel = (options) => {
44939
45139
  const maxRetries = Math.max(0, options.maxRetries ?? 2);
44940
45140
  return {
44941
45141
  generate: async (input) => {
44942
- const endpoint = `${baseUrl.replace(/\/$/, "")}/models/${encodeURIComponent(model)}:generateContent?key=${encodeURIComponent(options.apiKey)}`;
45142
+ const endpoint = `${baseUrl.replace(/\/$/, "")}/models/${encodeURIComponent(model)}:streamGenerateContent?alt=sse&key=${encodeURIComponent(options.apiKey)}`;
44943
45143
  let response;
44944
45144
  for (let attempt = 0;attempt <= maxRetries; attempt += 1) {
44945
45145
  response = await fetchImpl(endpoint, {
@@ -44947,16 +45147,12 @@ var createGeminiVoiceAssistantModel = (options) => {
44947
45147
  contents: input.messages.map(messageToGeminiContent).filter(Boolean),
44948
45148
  generationConfig: {
44949
45149
  maxOutputTokens: options.maxOutputTokens,
44950
- ...input.tools.length ? {} : {
44951
- responseMimeType: "application/json",
44952
- responseSchema: toGeminiSchema(OUTPUT_SCHEMA)
44953
- },
44954
45150
  temperature: options.temperature
44955
45151
  },
44956
45152
  systemInstruction: {
44957
45153
  parts: [
44958
45154
  {
44959
- text: [input.system, ROUTE_RESULT_INSTRUCTION].filter(Boolean).join(`
45155
+ text: [input.system, VOICE_SYSTEM_INSTRUCTIONS].filter(Boolean).join(`
44960
45156
 
44961
45157
  `)
44962
45158
  }
@@ -44992,18 +45188,14 @@ var createGeminiVoiceAssistantModel = (options) => {
44992
45188
  if (!response.ok) {
44993
45189
  throw createHTTPError("Gemini", response);
44994
45190
  }
44995
- const body = await response.json();
44996
- if (body.usageMetadata && typeof body.usageMetadata === "object") {
44997
- await options.onUsage?.(body.usageMetadata);
45191
+ const { assistantText, toolCalls, usage } = await consumeGeminiStream(response, input.onTextDelta);
45192
+ if (usage) {
45193
+ await options.onUsage?.(usage);
44998
45194
  }
44999
- const toolCalls = extractGeminiToolCalls(body);
45000
- if (toolCalls.length) {
45001
- return {
45002
- assistantText: extractGeminiText(body) || undefined,
45003
- toolCalls
45004
- };
45005
- }
45006
- return normalizeRouteOutput(parseJSON(extractGeminiText(body)));
45195
+ return {
45196
+ ...assistantText ? { assistantText } : {},
45197
+ ...toolCalls.length ? { toolCalls } : {}
45198
+ };
45007
45199
  }
45008
45200
  };
45009
45201
  };
@@ -48413,14 +48605,14 @@ var DEFAULT_VOICE_PROMPT_INJECTION_RULES = [
48413
48605
  severity: "low"
48414
48606
  }
48415
48607
  ];
48416
- var extractText2 = (input) => typeof input === "string" ? input : input.text;
48608
/**
 * Returns the text to evaluate: the input itself when it is a string,
 * otherwise its `text` property.
 */
var extractText = (input) => {
  if (typeof input === "string") {
    return input;
  }
  return input.text;
};
48417
48609
  var createVoicePromptInjectionGuard = (options = {}) => {
48418
48610
  const rules = options.rules ?? DEFAULT_VOICE_PROMPT_INJECTION_RULES;
48419
48611
  const replacement = options.sanitizedReplacement ?? "[REDACTED:INJECTION]";
48420
48612
  return {
48421
48613
  rules,
48422
48614
  evaluate: (input) => {
48423
- const text = extractText2(input);
48615
+ const text = extractText(input);
48424
48616
  const matches = [];
48425
48617
  for (const rule of rules) {
48426
48618
  rule.pattern.lastIndex = 0;