@absolutejs/voice 0.0.22-beta.545 → 0.0.22-beta.547

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -3661,6 +3661,19 @@ var countWords2 = (text) => text.trim().split(/\s+/).filter(Boolean).length;
3661
3661
  var normalizeText2 = (text) => text.trim().replace(/\s+/g, " ");
3662
3662
  var getAudioChunkDurationMs = (chunk) => chunk.byteLength / (DEFAULT_FORMAT.sampleRateHz * DEFAULT_FORMAT.channels * 2) * 1000;
3663
3663
  var getBufferedAudioDurationMs = (chunks) => chunks.reduce((total, chunk) => total + getAudioChunkDurationMs(chunk), 0);
3664
// Sentence terminator (. ! ? or U+2026 ellipsis), optionally followed by
// closing quotes/brackets, then exactly one whitespace character. Because the
// whitespace is required, a terminator at the very end of the buffer is not a
// boundary until more text arrives.
var STREAM_SENTENCE_BOUNDARY = /[.!?\u2026]['")\]]*\s/;
// Maximum number of UTF-16 code units buffered before a chunk is cut even
// though no sentence boundary has been seen.
var MAX_TTS_CHUNK_CHARS = 220;
/**
 * Locates the end of the first complete sentence in `buffer`.
 *
 * @param {string} buffer - Text accumulated so far.
 * @returns {number} Index just past the boundary's trailing whitespace, or -1
 *   when the buffer holds no complete sentence yet.
 */
var nextSpeakableBoundary = (buffer) => {
  const match = STREAM_SENTENCE_BOUNDARY.exec(buffer);
  return match ? match.index + match[0].length : -1;
};
/**
 * Picks a cut point for a buffer that has reached MAX_TTS_CHUNK_CHARS without
 * a sentence boundary.
 *
 * @param {string} buffer - Text accumulated so far.
 * @returns {number} -1 while the buffer is shorter than the limit; otherwise
 *   the index just past the last space inside the first MAX_TTS_CHUNK_CHARS
 *   units, or a hard cut at the limit when that window has no usable space.
 */
var softCutBoundary = (buffer) => {
  if (buffer.length < MAX_TTS_CHUNK_CHARS)
    return -1;
  const window2 = buffer.slice(0, MAX_TTS_CHUNK_CHARS);
  const lastSpace = window2.lastIndexOf(" ");
  if (lastSpace > 0)
    return lastSpace + 1;
  // Hard cut. If the last unit inside the window is a high surrogate, cutting
  // at the limit would separate it from its low surrogate and emit a lone
  // surrogate; back off one unit so the pair stays together in the next chunk.
  const lastUnit = buffer.charCodeAt(MAX_TTS_CHUNK_CHARS - 1);
  const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
  return endsOnHighSurrogate ? MAX_TTS_CHUNK_CHARS - 1 : MAX_TTS_CHUNK_CHARS;
};
3664
3677
  var calculateMeanConfidence = (transcripts) => {
3665
3678
  let sum = 0;
3666
3679
  let total = 0;
@@ -5130,6 +5143,110 @@ var createVoiceSession = (options) => {
5130
5143
  });
5131
5144
  });
5132
5145
  };
5146
+ const createTurnTTSStreamer = (turn, session) => {
5147
+ let buffer = "";
5148
+ let full = "";
5149
+ let charsSent = 0;
5150
+ let started = false;
5151
+ let streamed = false;
5152
+ let sendChain = Promise.resolve();
5153
+ let ttsSessionRequest = null;
5154
+ const ttsStartedAt = Date.now();
5155
+ const ensure = () => {
5156
+ if (!ttsSessionRequest) {
5157
+ ttsSessionRequest = ensureTTSSession().catch((error) => {
5158
+ logger.warn("voice assistant audio send failed", {
5159
+ error: toError(error).message,
5160
+ sessionId: options.id,
5161
+ turnId: turn.id
5162
+ });
5163
+ return null;
5164
+ });
5165
+ }
5166
+ return ttsSessionRequest;
5167
+ };
5168
+ const flush = (text) => {
5169
+ if (!text.trim())
5170
+ return;
5171
+ const previous = sendChain;
5172
+ sendChain = (async () => {
5173
+ await previous;
5174
+ if (started && activeTTSTurnId !== turn.id)
5175
+ return;
5176
+ const ttsSession2 = await ensure();
5177
+ if (!ttsSession2 || started && activeTTSTurnId !== turn.id)
5178
+ return;
5179
+ if (!started) {
5180
+ activeTTSTurnId = turn.id;
5181
+ await appendTurnLatencyStage({
5182
+ at: ttsStartedAt,
5183
+ session,
5184
+ stage: "tts_send_started",
5185
+ turnId: turn.id
5186
+ });
5187
+ started = true;
5188
+ }
5189
+ try {
5190
+ await ttsSession2.send(text);
5191
+ charsSent += text.length;
5192
+ } catch (error) {
5193
+ logger.warn("voice assistant audio send failed", {
5194
+ error: toError(error).message,
5195
+ sessionId: options.id,
5196
+ turnId: turn.id
5197
+ });
5198
+ }
5199
+ })();
5200
+ };
5201
+ return {
5202
+ finish: async () => {
5203
+ if (buffer.trim()) {
5204
+ flush(buffer);
5205
+ }
5206
+ buffer = "";
5207
+ await sendChain;
5208
+ if (started) {
5209
+ if (options.costAccountant) {
5210
+ options.costAccountant.recordTTS({ characters: charsSent });
5211
+ }
5212
+ await appendTurnLatencyStage({
5213
+ session,
5214
+ stage: "tts_send_completed",
5215
+ turnId: turn.id
5216
+ });
5217
+ await appendTrace({
5218
+ payload: {
5219
+ elapsedMs: Date.now() - ttsStartedAt,
5220
+ status: "sent",
5221
+ streamed: true
5222
+ },
5223
+ session,
5224
+ turnId: turn.id,
5225
+ type: "turn.assistant"
5226
+ });
5227
+ }
5228
+ return { fullText: full, streamed };
5229
+ },
5230
+ push: (delta) => {
5231
+ if (!delta)
5232
+ return;
5233
+ streamed = true;
5234
+ full += delta;
5235
+ buffer += delta;
5236
+ let boundary = nextSpeakableBoundary(buffer);
5237
+ while (boundary !== -1) {
5238
+ flush(buffer.slice(0, boundary));
5239
+ buffer = buffer.slice(boundary);
5240
+ boundary = nextSpeakableBoundary(buffer);
5241
+ }
5242
+ const cut = softCutBoundary(buffer);
5243
+ if (cut !== -1) {
5244
+ flush(buffer.slice(0, cut));
5245
+ buffer = buffer.slice(cut);
5246
+ }
5247
+ }
5248
+ };
5249
+ };
5133
5250
  const completeTurn = async (session, turn) => {
5134
5251
  const liveOpsControl = await options.liveOps?.getControl(options.id);
5135
5252
  if (liveOpsControl?.assistantPaused || liveOpsControl?.operatorTakeover) {
@@ -5150,6 +5267,7 @@ var createVoiceSession = (options) => {
5150
5267
  return;
5151
5268
  }
5152
5269
  const injectedInstruction = liveOpsControl?.injectedInstruction?.trim();
5270
+ const ttsStreamer = options.tts ? createTurnTTSStreamer(turn, session) : undefined;
5153
5271
  const committedOutput = await options.route.onTurn({
5154
5272
  api,
5155
5273
  context: options.context,
@@ -5157,6 +5275,7 @@ var createVoiceSession = (options) => {
5157
5275
  control: liveOpsControl,
5158
5276
  injectedInstruction
5159
5277
  } : undefined,
5278
+ onTextDelta: ttsStreamer?.push,
5160
5279
  session,
5161
5280
  turn
5162
5281
  });
@@ -5176,7 +5295,28 @@ var createVoiceSession = (options) => {
5176
5295
  setTurnResult(currentSession, turn.id, { citations: turnCitations });
5177
5296
  });
5178
5297
  }
5179
- if (output?.assistantText) {
5298
+ const streamResult = ttsStreamer ? await ttsStreamer.finish() : undefined;
5299
+ if (streamResult?.streamed) {
5300
+ output.assistantText = streamResult.fullText || output.assistantText;
5301
+ if (output.assistantText) {
5302
+ const finalText = output.assistantText;
5303
+ await writeSession((currentSession) => {
5304
+ setTurnResult(currentSession, turn.id, { assistantText: finalText });
5305
+ });
5306
+ await send({ text: finalText, turnId: turn.id, type: "assistant" });
5307
+ await appendTrace({
5308
+ payload: {
5309
+ assistantMode: resolveVoiceAssistantMode(options),
5310
+ realtimeConfigured: Boolean(options.realtime),
5311
+ text: finalText,
5312
+ ttsConfigured: Boolean(options.tts)
5313
+ },
5314
+ session,
5315
+ turnId: turn.id,
5316
+ type: "turn.assistant"
5317
+ });
5318
+ }
5319
+ } else if (output?.assistantText) {
5180
5320
  const assistantTextStartedAt = Date.now();
5181
5321
  await writeSession((currentSession) => {
5182
5322
  setTurnResult(currentSession, turn.id, {
@@ -5581,7 +5721,7 @@ var createVoiceSession = (options) => {
5581
5721
  kickCallSilenceWatchdog();
5582
5722
  startAmdEvaluationTimer();
5583
5723
  if (shouldFireOnSession && options.greeting && session.turns.length === 0) {
5584
- const greetingText = typeof options.greeting === "function" ? await options.greeting() : options.greeting;
5724
+ const greetingText = typeof options.greeting === "function" ? await options.greeting({ session }) : options.greeting;
5585
5725
  const greetingTurnId = createId();
5586
5726
  await send({
5587
5727
  text: greetingText,
@@ -6825,6 +6965,100 @@ var appendVoiceAgentSquadHandoff = async (input) => {
6825
6965
  });
6826
6966
  return handoff;
6827
6967
  };
6968
// Built-in tool definitions offered to the model alongside application tools.
// Calls to these names are turned into lifecycle fields on the route output
// instead of being dispatched to an application tool handler.
var LIFECYCLE_TOOLS = [
  {
    description: "Transfer the call to a human agent or phone number. Say a short handoff line to the caller first, then call this.",
    name: "transfer_call",
    parameters: {
      additionalProperties: false,
      properties: {
        reason: { description: "Why you are transferring", type: "string" },
        target: {
          description: "Agent id or phone number to transfer to",
          type: "string"
        }
      },
      required: ["target"],
      type: "object"
    }
  },
  {
    description: "Escalate to a supervisor or human when you cannot resolve the caller's request.",
    name: "escalate",
    parameters: {
      additionalProperties: false,
      properties: {
        reason: { description: "Why you are escalating", type: "string" }
      },
      required: ["reason"],
      type: "object"
    }
  },
  {
    description: "Record that the call reached voicemail or an answering machine.",
    name: "leave_voicemail",
    parameters: { additionalProperties: false, properties: {}, type: "object" }
  },
  {
    description: "Record that no one answered or the call could not proceed to a conversation.",
    name: "mark_no_answer",
    parameters: { additionalProperties: false, properties: {}, type: "object" }
  },
  {
    description: "End the conversation once its goal is met. Optionally include a structured result.",
    name: "complete",
    parameters: {
      additionalProperties: true,
      properties: {
        result: { description: "Structured outcome of the call, if any" }
      },
      type: "object"
    }
  }
];
// Name lookup used to tell lifecycle tool calls apart from application ones.
var LIFECYCLE_TOOL_NAMES = new Set(LIFECYCLE_TOOLS.map((tool) => tool.name));
/**
 * Applies one lifecycle tool call to `output` by setting the matching field.
 * Mutates `output`; unknown tool names are ignored.
 *
 * @param {object} output - Route output to update in place.
 * @param {{ name: string, args?: unknown }} toolCall - Tool call from the model.
 */
var applyLifecycleToolCall = (output, toolCall) => {
  // Arguments come from the model, so anything that is not an object (missing,
  // a string, a number) is treated as "no arguments". The `in` check below
  // would otherwise throw a TypeError on a primitive.
  const rawArgs = toolCall.args;
  const args = rawArgs !== null && typeof rawArgs === "object" ? rawArgs : {};
  switch (toolCall.name) {
    case "transfer_call":
      output.transfer = {
        reason: typeof args.reason === "string" ? args.reason : undefined,
        // A missing or non-string target is recorded as an empty string.
        target: typeof args.target === "string" ? args.target : ""
      };
      break;
    case "escalate":
      output.escalate = {
        reason: typeof args.reason === "string" ? args.reason : "escalation requested"
      };
      break;
    case "leave_voicemail":
      output.voicemail = {};
      break;
    case "mark_no_answer":
      output.noAnswer = {};
      break;
    case "complete":
      output.complete = true;
      if ("result" in args) {
        output.result = args.result;
      }
      break;
    default:
      break;
  }
};
/**
 * Reports whether `output` carries any lifecycle outcome.
 *
 * @param {object} output - Route output to inspect.
 * @returns {boolean} True when complete, transfer, escalate, voicemail or
 *   noAnswer is truthy.
 */
var isLifecycleRequested = (output) => Boolean(output.complete) || Boolean(output.transfer) || Boolean(output.escalate) || Boolean(output.voicemail) || Boolean(output.noAnswer);
/**
 * Splits tool calls into lifecycle calls, which are applied to `output`
 * immediately, and application calls, which are returned in their original
 * order.
 *
 * @param {object} output - Route output mutated by lifecycle calls.
 * @param {Array<{ name: string, args?: unknown }>|null|undefined} toolCalls
 * @returns {Array<{ name: string, args?: unknown }>} The non-lifecycle calls.
 */
var partitionAppToolCalls = (output, toolCalls) => {
  const appToolCalls = [];
  for (const toolCall of toolCalls ?? []) {
    if (LIFECYCLE_TOOL_NAMES.has(toolCall.name)) {
      applyLifecycleToolCall(output, toolCall);
    } else {
      appToolCalls.push(toolCall);
    }
  }
  return appToolCalls;
};
6828
7062
  var createVoiceAgent = (options) => {
6829
7063
  const toolMap = new Map(options.tools?.map((tool) => [tool.name, tool]) ?? []);
6830
7064
  const maxToolRounds = Math.max(0, options.maxToolRounds ?? 2);
@@ -6848,9 +7082,10 @@ var createVoiceAgent = (options) => {
6848
7082
  agentId: options.id,
6849
7083
  context: input.context,
6850
7084
  messages,
7085
+ onTextDelta: input.onTextDelta,
6851
7086
  session: input.session,
6852
7087
  system,
6853
- tools: [...toolMap.values()].map((tool) => ({
7088
+ tools: [...LIFECYCLE_TOOLS, ...toolMap.values()].map((tool) => ({
6854
7089
  description: tool.description,
6855
7090
  name: tool.name,
6856
7091
  parameters: tool.parameters
@@ -6915,10 +7150,11 @@ var createVoiceAgent = (options) => {
6915
7150
  role: "assistant"
6916
7151
  });
6917
7152
  }
6918
- if (!output.toolCalls?.length || round === maxToolRounds) {
7153
+ const appToolCalls = partitionAppToolCalls(output, output.toolCalls);
7154
+ if (appToolCalls.length === 0 || isLifecycleRequested(output) || round === maxToolRounds) {
6919
7155
  break;
6920
7156
  }
6921
- for (const toolCall of output.toolCalls) {
7157
+ for (const toolCall of appToolCalls) {
6922
7158
  const tool = toolMap.get(toolCall.name);
6923
7159
  if (!tool) {
6924
7160
  const missingResult = {
@@ -7924,6 +8160,7 @@ var createVoiceAssistant = (options) => {
7924
8160
  }
7925
8161
  const runResult = await runner.run({
7926
8162
  ...input,
8163
+ onTextDelta: input.onTextDelta,
7927
8164
  system: liveOpsInstruction ? `Operator instruction for this turn: ${liveOpsInstruction}` : undefined
7928
8165
  }) ?? {};
7929
8166
  const result = runResult;
@@ -24065,8 +24302,23 @@ var createTwilioSocketAdapter = (socket, getState) => ({
24065
24302
  if (!state.streamSid) {
24066
24303
  return;
24067
24304
  }
24305
+ const clearMessage = { event: "clear", streamSid: state.streamSid };
24068
24306
  state.reviewRecorder?.recordTwilioOutbound({ event: "clear" });
24069
- await Promise.resolve(socket.send(JSON.stringify({ event: "clear", streamSid: state.streamSid })));
24307
+ await state.trace?.append({
24308
+ at: Date.now(),
24309
+ payload: {
24310
+ callSid: state.callSid ?? undefined,
24311
+ carrier: state.carrier,
24312
+ direction: "outbound",
24313
+ envelope: clearMessage,
24314
+ event: "clear",
24315
+ streamId: state.streamSid
24316
+ },
24317
+ scenarioId: state.scenarioId ?? undefined,
24318
+ sessionId: state.sessionId ?? state.streamSid,
24319
+ type: "client.telephony_media"
24320
+ });
24321
+ await Promise.resolve(socket.send(JSON.stringify(clearMessage)));
24070
24322
  },
24071
24323
  close: async (code, reason) => {
24072
24324
  await Promise.resolve(socket.close(code, reason));
@@ -44164,89 +44416,6 @@ var createVoiceProviderOrchestrationProfile = (options) => {
44164
44416
  }
44165
44417
  };
44166
44418
  };
44167
- var OUTPUT_SCHEMA = {
44168
- additionalProperties: false,
44169
- properties: {
44170
- assistantText: {
44171
- type: "string"
44172
- },
44173
- complete: {
44174
- type: "boolean"
44175
- },
44176
- escalate: {
44177
- additionalProperties: false,
44178
- properties: {
44179
- metadata: {
44180
- additionalProperties: true,
44181
- type: "object"
44182
- },
44183
- reason: {
44184
- type: "string"
44185
- }
44186
- },
44187
- required: ["reason"],
44188
- type: "object"
44189
- },
44190
- noAnswer: {
44191
- additionalProperties: false,
44192
- properties: {
44193
- metadata: {
44194
- additionalProperties: true,
44195
- type: "object"
44196
- }
44197
- },
44198
- type: "object"
44199
- },
44200
- result: {
44201
- additionalProperties: true,
44202
- type: "object"
44203
- },
44204
- transfer: {
44205
- additionalProperties: false,
44206
- properties: {
44207
- metadata: {
44208
- additionalProperties: true,
44209
- type: "object"
44210
- },
44211
- reason: {
44212
- type: "string"
44213
- },
44214
- target: {
44215
- type: "string"
44216
- }
44217
- },
44218
- required: ["target"],
44219
- type: "object"
44220
- },
44221
- voicemail: {
44222
- additionalProperties: false,
44223
- properties: {
44224
- metadata: {
44225
- additionalProperties: true,
44226
- type: "object"
44227
- }
44228
- },
44229
- type: "object"
44230
- }
44231
- },
44232
- type: "object"
44233
- };
44234
- var ROUTE_RESULT_INSTRUCTION = "Return only a JSON object with assistantText, complete, transfer, escalate, voicemail, noAnswer, and result when you are not calling tools. Only set transfer, escalate, voicemail, or noAnswer when the user explicitly asks for that lifecycle outcome or a tool result says that exact outcome. Do not infer voicemail from generic words like voice, voice app, or voice integration.";
44235
- var stripJSONCodeFence = (value) => {
44236
- const trimmed = value.trim();
44237
- const match = trimmed.match(/^```(?:json)?\s*([\s\S]*?)\s*```$/i);
44238
- return match?.[1]?.trim() ?? value;
44239
- };
44240
- var parseJSON = (value) => {
44241
- try {
44242
- const parsed = JSON.parse(stripJSONCodeFence(value));
44243
- return parsed && typeof parsed === "object" ? parsed : {};
44244
- } catch {
44245
- return {
44246
- assistantText: value
44247
- };
44248
- }
44249
- };
44250
44419
  var parseJSONValue = (value) => {
44251
44420
  try {
44252
44421
  return JSON.parse(value);
@@ -44717,48 +44886,95 @@ var messageToGeminiContent = (message) => {
44717
44886
  role: message.role === "assistant" ? "model" : "user"
44718
44887
  };
44719
44888
  };
44720
- var extractText = (response) => {
44721
- if (typeof response.output_text === "string") {
44722
- return response.output_text;
44889
+ var VOICE_SYSTEM_INSTRUCTIONS = "You are on a live phone call. Reply with natural, concise spoken sentences \u2014 no markdown, lists, headings, or emoji. To take an action (transfer the call, escalate, record voicemail/no-answer, or end the call), CALL the matching tool rather than describing it in words. Call the complete tool once the conversation's goal is met.";
44890
/**
 * Parses the accumulated JSON argument text of a streamed tool call.
 *
 * @param {string} raw - Concatenated argument fragments.
 * @returns {object} The parsed arguments, or an empty object when `raw` is
 *   empty, not a string, not valid JSON, or valid JSON that is not a plain
 *   object (null, primitives and arrays).
 */
var parseToolArgs = (raw) => {
  if (typeof raw !== "string" || !raw.trim()) {
    return {};
  }
  try {
    const parsed = JSON.parse(raw);
    // Arrays also report typeof "object" but are not a valid argument record.
    const isRecord = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
    return isRecord ? parsed : {};
  } catch {
    // Truncated or malformed argument text is deliberately treated as "no
    // arguments" rather than failing the whole turn.
    return {};
  }
};
44743
- var extractToolCalls = (response) => {
44744
- const output = Array.isArray(response.output) ? response.output : [];
44745
- const toolCalls = [];
44746
- for (const item of output) {
44747
- if (!item || typeof item !== "object") {
44748
- continue;
44901
/**
 * Reads a server-sent-events response body and invokes `onEvent` with the
 * parsed JSON payload of every `data:` line, as soon as the event block that
 * contains it is complete.
 *
 * Event blocks are separated by a blank line. LF, CRLF and bare CR line
 * endings are all accepted, so a CRLF-delimited stream is delivered
 * incrementally instead of being buffered until the stream ends.
 *
 * @param response - Object exposing `body.getReader()` (a fetch Response).
 * @param {(event: any) => void} onEvent - Called once per parsed payload.
 * @returns {Promise<void>} Resolves when the stream is exhausted.
 * @throws {Error} When the response has no readable body.
 */
var readServerSentEvents = async (response, onEvent) => {
  const reader = response.body?.getReader();
  if (!reader) {
    throw new Error("streaming response has no body");
  }
  const decoder = new TextDecoder();
  let buffer = "";
  const drain = (block) => {
    for (const line of block.split("\n")) {
      const trimmed = line.trimStart();
      if (!trimmed.startsWith("data:"))
        continue;
      const data = trimmed.slice("data:".length).trim();
      if (!data || data === "[DONE]")
        continue;
      try {
        onEvent(JSON.parse(data));
      } catch {
        // Best effort: a payload that is not valid JSON, or a handler that
        // throws, is skipped so one bad event does not abort the stream.
      }
    }
  };
  const drainCompleteBlocks = () => {
    let separator = buffer.indexOf("\n\n");
    while (separator !== -1) {
      drain(buffer.slice(0, separator));
      buffer = buffer.slice(separator + 2);
      separator = buffer.indexOf("\n\n");
    }
  };
  for (;;) {
    const { done, value } = await reader.read();
    if (done)
      break;
    buffer += decoder.decode(value, { stream: true });
    // Normalise CRLF and bare CR to LF. A CR at the very end of the buffer is
    // left alone because its LF may arrive in the next chunk.
    buffer = buffer.replace(/\r\n|\r(?!$)/g, "\n");
    drainCompleteBlocks();
  }
  // Flush bytes still held by the decoder and normalise any trailing CR.
  buffer = (buffer + decoder.decode()).replace(/\r\n?/g, "\n");
  drainCompleteBlocks();
  if (buffer.trim())
    drain(buffer);
};
44941
/**
 * Converts the in-progress tool-call records collected during streaming into
 * final tool calls.
 *
 * @param {Map<string, { args: string, id: unknown, name: string }>} calls
 * @returns {Array<{ args: object, id: unknown, name: string }>} One entry per
 *   record that has a non-empty name, in insertion order, with the argument
 *   text parsed by parseToolArgs.
 */
var finalizeToolCalls = (calls) => {
  const finalized = [];
  for (const pending of calls.values()) {
    // Records that never received a name cannot be dispatched.
    if (!pending.name) {
      continue;
    }
    finalized.push({
      args: parseToolArgs(pending.args),
      id: pending.id,
      name: pending.name
    });
  }
  return finalized;
};
44946
/**
 * Consumes an OpenAI Responses API event stream.
 *
 * Text deltas are appended to the assistant text and forwarded to
 * `onTextDelta` as they arrive. Function-call items are collected per item id
 * and their argument fragments concatenated. Usage is taken from the terminal
 * response event.
 *
 * @param response - Streaming fetch response.
 * @param {((delta: string) => void)|undefined} onTextDelta - Optional callback
 *   invoked with every text delta.
 * @returns {Promise<{ assistantText: string, toolCalls: Array<object>, usage: object|undefined }>}
 */
var consumeOpenAIResponsesStream = async (response, onTextDelta) => {
  let assistantText = "";
  let usage;
  const calls = new Map();
  await readServerSentEvents(response, (event) => {
    const type = typeof event.type === "string" ? event.type : "";
    const item = event.item;
    if (type === "response.output_text.delta" && typeof event.delta === "string") {
      assistantText += event.delta;
      onTextDelta?.(event.delta);
    } else if (type === "response.output_item.added" && item?.type === "function_call") {
      calls.set(String(item.id ?? item.call_id ?? ""), {
        args: typeof item.arguments === "string" ? item.arguments : "",
        id: typeof item.call_id === "string" ? item.call_id : item.id,
        name: typeof item.name === "string" ? item.name : ""
      });
    } else if (type === "response.function_call_arguments.delta" && typeof event.delta === "string") {
      const entry = calls.get(String(event.item_id ?? ""));
      if (entry)
        entry.args += event.delta;
    } else if (type === "response.output_item.done" && item?.type === "function_call" && typeof item.arguments === "string" && item.arguments) {
      // The completed item carries the full argument text; prefer it over the
      // concatenated fragments.
      const entry = calls.get(String(item.id ?? item.call_id ?? ""));
      if (entry)
        entry.args = item.arguments;
    } else if (type === "response.completed" || type === "response.incomplete" || type === "response.failed") {
      // A response that stops early (e.g. on the output-token limit) ends with
      // `response.incomplete` rather than `response.completed`; read usage from
      // whichever terminal event arrives so those tokens are still reported.
      const finished = event.response;
      if (finished?.usage && typeof finished.usage === "object") {
        usage = finished.usage;
      }
    }
  });
  return { assistantText, toolCalls: finalizeToolCalls(calls), usage };
};
44763
44979
  var createOpenAIVoiceAssistantModel = (options) => {
44764
44980
  const fetchImpl = options.fetch ?? globalThis.fetch;
@@ -44769,23 +44985,13 @@ var createOpenAIVoiceAssistantModel = (options) => {
44769
44985
  const response = await fetchImpl(`${baseUrl.replace(/\/$/, "")}/responses`, {
44770
44986
  body: JSON.stringify({
44771
44987
  input: messagesToOpenAIInput(input.messages),
44772
- instructions: [
44773
- input.system,
44774
- "Return a JSON object with assistantText, complete, transfer, escalate, voicemail, noAnswer, and result when you are not calling tools."
44775
- ].filter(Boolean).join(`
44988
+ instructions: [input.system, VOICE_SYSTEM_INSTRUCTIONS].filter(Boolean).join(`
44776
44989
 
44777
44990
  `),
44778
44991
  max_output_tokens: options.maxOutputTokens,
44779
44992
  model,
44993
+ stream: true,
44780
44994
  temperature: options.temperature,
44781
- text: {
44782
- format: {
44783
- name: "voice_route_result",
44784
- schema: OUTPUT_SCHEMA,
44785
- strict: false,
44786
- type: "json_schema"
44787
- }
44788
- },
44789
44995
  tool_choice: input.tools.length ? "auto" : "none",
44790
44996
  tools: input.tools.map((tool) => ({
44791
44997
  description: tool.description,
@@ -44799,6 +45005,7 @@ var createOpenAIVoiceAssistantModel = (options) => {
44799
45005
  }))
44800
45006
  }),
44801
45007
  headers: {
45008
+ accept: "text/event-stream",
44802
45009
  authorization: `Bearer ${options.apiKey}`,
44803
45010
  "content-type": "application/json"
44804
45011
  },
@@ -44807,43 +45014,52 @@ var createOpenAIVoiceAssistantModel = (options) => {
44807
45014
  if (!response.ok) {
44808
45015
  throw createHTTPError("OpenAI", response);
44809
45016
  }
44810
- const body = await response.json();
44811
- if (body.usage && typeof body.usage === "object") {
44812
- await options.onUsage?.(body.usage);
44813
- }
44814
- const toolCalls = extractToolCalls(body);
44815
- if (toolCalls.length) {
44816
- return {
44817
- toolCalls
44818
- };
45017
+ const { assistantText, toolCalls, usage } = await consumeOpenAIResponsesStream(response, input.onTextDelta);
45018
+ if (usage) {
45019
+ await options.onUsage?.(usage);
44819
45020
  }
44820
- return normalizeRouteOutput(parseJSON(extractText(body)));
45021
+ return {
45022
+ ...assistantText ? { assistantText } : {},
45023
+ ...toolCalls.length ? { toolCalls } : {}
45024
+ };
44821
45025
  }
44822
45026
  };
44823
45027
  };
44824
- var extractAnthropicText = (response) => {
44825
- const content = Array.isArray(response.content) ? response.content : [];
44826
- return content.map((item) => item && typeof item === "object" && item.type === "text" && typeof item.text === "string" ? item.text : "").filter(Boolean).join(`
44827
- `);
44828
- };
44829
- var extractAnthropicToolCalls = (response) => {
44830
- const content = Array.isArray(response.content) ? response.content : [];
44831
- const toolCalls = [];
44832
- for (const item of content) {
44833
- if (!item || typeof item !== "object") {
44834
- continue;
44835
- }
44836
- const record = item;
44837
- if (record.type !== "tool_use" || typeof record.name !== "string") {
44838
- continue;
45028
/**
 * Consumes an Anthropic Messages event stream.
 *
 * Text deltas are accumulated and forwarded to `onTextDelta`. Tool-use blocks
 * are tracked by content-block index and their JSON fragments concatenated.
 * Usage starts from `message_start` and is merged with later `message_delta`
 * usage.
 *
 * @param response - Streaming fetch response.
 * @param {((delta: string) => void)|undefined} onTextDelta - Optional callback
 *   invoked with every text delta.
 * @returns {Promise<{ assistantText: string, toolCalls: Array<object>, usage: object|undefined }>}
 */
var consumeAnthropicStream = async (response, onTextDelta) => {
  const pendingCalls = new Map();
  let spokenText = "";
  let usage;
  // Handles both kinds of content_block_delta: spoken text and tool-call JSON.
  const onBlockDelta = (event) => {
    const delta = event.delta;
    if (delta?.type === "text_delta") {
      if (typeof delta.text === "string") {
        spokenText += delta.text;
        onTextDelta?.(delta.text);
      }
      return;
    }
    if (delta?.type === "input_json_delta" && typeof delta.partial_json === "string") {
      const pending = pendingCalls.get(String(event.index ?? ""));
      if (pending) {
        pending.args += delta.partial_json;
      }
    }
  };
  // Registers a tool_use block under its content-block index.
  const onBlockStart = (event) => {
    const block = event.content_block;
    if (block?.type !== "tool_use") {
      return;
    }
    pendingCalls.set(String(event.index ?? ""), {
      args: "",
      id: typeof block.id === "string" ? block.id : undefined,
      name: typeof block.name === "string" ? block.name : ""
    });
  };
  await readServerSentEvents(response, (event) => {
    switch (typeof event.type === "string" ? event.type : "") {
      case "content_block_delta":
        onBlockDelta(event);
        break;
      case "content_block_start":
        onBlockStart(event);
        break;
      case "message_start": {
        const message = event.message;
        if (message?.usage && typeof message.usage === "object") {
          usage = message.usage;
        }
        break;
      }
      case "message_delta":
        if (event.usage && typeof event.usage === "object") {
          usage = { ...usage, ...event.usage };
        }
        break;
      default:
        break;
    }
  });
  return { assistantText: spokenText, toolCalls: finalizeToolCalls(pendingCalls), usage };
};
44848
45064
  var createAnthropicVoiceAssistantModel = (options) => {
44849
45065
  const fetchImpl = options.fetch ?? globalThis.fetch;
@@ -44856,7 +45072,8 @@ var createAnthropicVoiceAssistantModel = (options) => {
44856
45072
  max_tokens: options.maxOutputTokens ?? 1024,
44857
45073
  messages: input.messages.map(messageToAnthropicMessage).filter(Boolean),
44858
45074
  model,
44859
- system: [input.system, ROUTE_RESULT_INSTRUCTION].filter(Boolean).join(`
45075
+ stream: true,
45076
+ system: [input.system, VOICE_SYSTEM_INSTRUCTIONS].filter(Boolean).join(`
44860
45077
 
44861
45078
  `),
44862
45079
  temperature: options.temperature,
@@ -44880,57 +45097,55 @@ var createAnthropicVoiceAssistantModel = (options) => {
44880
45097
  if (!response.ok) {
44881
45098
  throw createHTTPError("Anthropic", response);
44882
45099
  }
44883
- const body = await response.json();
44884
- if (body.usage && typeof body.usage === "object") {
44885
- await options.onUsage?.(body.usage);
44886
- }
44887
- const toolCalls = extractAnthropicToolCalls(body);
44888
- if (toolCalls.length) {
44889
- return {
44890
- assistantText: extractAnthropicText(body) || undefined,
44891
- toolCalls
44892
- };
45100
+ const { assistantText, toolCalls, usage } = await consumeAnthropicStream(response, input.onTextDelta);
45101
+ if (usage) {
45102
+ await options.onUsage?.(usage);
44893
45103
  }
44894
- return normalizeRouteOutput(parseJSON(extractAnthropicText(body)));
45104
+ return {
45105
+ ...assistantText ? { assistantText } : {},
45106
+ ...toolCalls.length ? { toolCalls } : {}
45107
+ };
44895
45108
  }
44896
45109
  };
44897
45110
  };
44898
- var extractGeminiCandidateParts = (response) => {
44899
- const candidates = Array.isArray(response.candidates) ? response.candidates : [];
44900
- const first = candidates[0];
44901
- if (!first || typeof first !== "object") {
44902
- return [];
45111
/**
 * Processes one part of a Gemini candidate.
 *
 * A non-empty text part is forwarded to `collect.onTextDelta` and returned. A
 * function-call part with a string name is appended to `collect.toolCalls`.
 * A part that has non-empty text is handled as text only.
 *
 * @param part - Candidate part from the response; non-objects are ignored.
 * @param {{ onTextDelta?: (delta: string) => void, toolCalls: Array<object> }} collect
 * @returns {string} The speakable text contributed by this part, or "".
 */
var handleGeminiPart = (part, collect) => {
  if (!part || typeof part !== "object")
    return "";
  const record = part;
  // Parts flagged `thought: true` carry the model's reasoning summary, not the
  // reply, so they must never be spoken to the caller or stored as the answer.
  if (record.thought === true)
    return "";
  if (typeof record.text === "string" && record.text) {
    collect.onTextDelta?.(record.text);
    return record.text;
  }
  const { functionCall } = record;
  if (functionCall && typeof functionCall === "object") {
    const fn = functionCall;
    if (typeof fn.name === "string") {
      collect.toolCalls.push({
        args: fn.args && typeof fn.args === "object" ? fn.args : {},
        id: typeof fn.id === "string" ? fn.id : undefined,
        name: fn.name
      });
    }
  }
  return "";
};
44911
- var extractGeminiText = (response) => extractGeminiCandidateParts(response).map((part) => part && typeof part === "object" && typeof part.text === "string" ? part.text : "").filter(Boolean).join(`
44912
- `);
44913
- var extractGeminiToolCalls = (response) => {
45132
/**
 * Consumes a Gemini streaming response delivered as server-sent events.
 *
 * For every event, the parts of the first candidate are passed to
 * handleGeminiPart, which forwards text to `onTextDelta` and collects function
 * calls. The most recent `usageMetadata` object seen is returned as usage.
 *
 * @param response - Streaming fetch response.
 * @param {((delta: string) => void)|undefined} onTextDelta - Optional callback
 *   invoked with every text part.
 * @returns {Promise<{ assistantText: string, toolCalls: Array<object>, usage: object|undefined }>}
 */
var consumeGeminiStream = async (response, onTextDelta) => {
  const toolCalls = [];
  let assistantText = "";
  let usage;
  await readServerSentEvents(response, (event) => {
    const { usageMetadata, candidates } = event;
    if (usageMetadata && typeof usageMetadata === "object") {
      usage = usageMetadata;
    }
    // Only the first candidate is read.
    const firstCandidate = Array.isArray(candidates) ? candidates[0] : undefined;
    const parts = firstCandidate?.content?.parts;
    if (!Array.isArray(parts)) {
      return;
    }
    for (const part of parts) {
      assistantText += handleGeminiPart(part, { onTextDelta, toolCalls });
    }
  });
  return { assistantText, toolCalls, usage };
};
44935
45150
  var createGeminiVoiceAssistantModel = (options) => {
44936
45151
  const fetchImpl = options.fetch ?? globalThis.fetch;
@@ -44939,7 +45154,7 @@ var createGeminiVoiceAssistantModel = (options) => {
44939
45154
  const maxRetries = Math.max(0, options.maxRetries ?? 2);
44940
45155
  return {
44941
45156
  generate: async (input) => {
44942
- const endpoint = `${baseUrl.replace(/\/$/, "")}/models/${encodeURIComponent(model)}:generateContent?key=${encodeURIComponent(options.apiKey)}`;
45157
+ const endpoint = `${baseUrl.replace(/\/$/, "")}/models/${encodeURIComponent(model)}:streamGenerateContent?alt=sse&key=${encodeURIComponent(options.apiKey)}`;
44943
45158
  let response;
44944
45159
  for (let attempt = 0;attempt <= maxRetries; attempt += 1) {
44945
45160
  response = await fetchImpl(endpoint, {
@@ -44947,16 +45162,12 @@ var createGeminiVoiceAssistantModel = (options) => {
44947
45162
  contents: input.messages.map(messageToGeminiContent).filter(Boolean),
44948
45163
  generationConfig: {
44949
45164
  maxOutputTokens: options.maxOutputTokens,
44950
- ...input.tools.length ? {} : {
44951
- responseMimeType: "application/json",
44952
- responseSchema: toGeminiSchema(OUTPUT_SCHEMA)
44953
- },
44954
45165
  temperature: options.temperature
44955
45166
  },
44956
45167
  systemInstruction: {
44957
45168
  parts: [
44958
45169
  {
44959
- text: [input.system, ROUTE_RESULT_INSTRUCTION].filter(Boolean).join(`
45170
+ text: [input.system, VOICE_SYSTEM_INSTRUCTIONS].filter(Boolean).join(`
44960
45171
 
44961
45172
  `)
44962
45173
  }
@@ -44992,18 +45203,14 @@ var createGeminiVoiceAssistantModel = (options) => {
44992
45203
  if (!response.ok) {
44993
45204
  throw createHTTPError("Gemini", response);
44994
45205
  }
44995
- const body = await response.json();
44996
- if (body.usageMetadata && typeof body.usageMetadata === "object") {
44997
- await options.onUsage?.(body.usageMetadata);
45206
+ const { assistantText, toolCalls, usage } = await consumeGeminiStream(response, input.onTextDelta);
45207
+ if (usage) {
45208
+ await options.onUsage?.(usage);
44998
45209
  }
44999
- const toolCalls = extractGeminiToolCalls(body);
45000
- if (toolCalls.length) {
45001
- return {
45002
- assistantText: extractGeminiText(body) || undefined,
45003
- toolCalls
45004
- };
45005
- }
45006
- return normalizeRouteOutput(parseJSON(extractGeminiText(body)));
45210
+ return {
45211
+ ...assistantText ? { assistantText } : {},
45212
+ ...toolCalls.length ? { toolCalls } : {}
45213
+ };
45007
45214
  }
45008
45215
  };
45009
45216
  };
@@ -48413,14 +48620,14 @@ var DEFAULT_VOICE_PROMPT_INJECTION_RULES = [
48413
48620
  severity: "low"
48414
48621
  }
48415
48622
  ];
48416
- var extractText2 = (input) => typeof input === "string" ? input : input.text;
48623
+ var extractText = (input) => typeof input === "string" ? input : input.text;
48417
48624
  var createVoicePromptInjectionGuard = (options = {}) => {
48418
48625
  const rules = options.rules ?? DEFAULT_VOICE_PROMPT_INJECTION_RULES;
48419
48626
  const replacement = options.sanitizedReplacement ?? "[REDACTED:INJECTION]";
48420
48627
  return {
48421
48628
  rules,
48422
48629
  evaluate: (input) => {
48423
- const text = extractText2(input);
48630
+ const text = extractText(input);
48424
48631
  const matches = [];
48425
48632
  for (const rule of rules) {
48426
48633
  rule.pattern.lastIndex = 0;