@alexkroman1/aai 1.8.0 → 1.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
 
2
- > @alexkroman1/aai@1.8.0 build /home/runner/work/agent/agent/packages/aai
2
+ > @alexkroman1/aai@1.8.2 build /home/runner/work/agent/agent/packages/aai
3
3
  > tsdown && tsc -p tsconfig.build.json
4
4
 
5
5
  ℹ tsdown v0.21.7 powered by rolldown v1.0.0-rc.12
@@ -8,7 +8,7 @@
8
8
  ℹ target: node22
9
9
  ℹ tsconfig: tsconfig.json
10
10
  ℹ Build start
11
- ℹ dist/host/runtime-barrel.js 108.22 kB │ gzip: 29.84 kB
11
+ ℹ dist/host/runtime-barrel.js 110.46 kB │ gzip: 30.38 kB
12
12
  ℹ dist/sdk/protocol.js  5.70 kB │ gzip: 1.92 kB
13
13
  ℹ dist/index.js  2.88 kB │ gzip: 1.24 kB
14
14
  ℹ dist/sdk/manifest-barrel.js  0.36 kB │ gzip: 0.20 kB
@@ -18,7 +18,7 @@
18
18
  ℹ dist/sdk/providers/tts-barrel.js  0.25 kB │ gzip: 0.16 kB
19
19
  ℹ dist/sdk/providers/vector-barrel.js  0.22 kB │ gzip: 0.15 kB
20
20
  ℹ dist/sdk/providers/s2s-barrel.js  0.15 kB │ gzip: 0.12 kB
21
- ℹ dist/_internal-types-CfOAbK6V.js  5.45 kB │ gzip: 1.87 kB
21
+ ℹ dist/_internal-types-8v1qAa4A.js  6.04 kB │ gzip: 2.15 kB
22
22
  ℹ dist/types-DOWVZhb9.js  5.39 kB │ gzip: 2.27 kB
23
23
  ℹ dist/soniox-BQdL0mB5.js  2.03 kB │ gzip: 0.54 kB
24
24
  ℹ dist/constants-y68COEGj.js  1.70 kB │ gzip: 0.76 kB
@@ -28,5 +28,5 @@
28
28
  ℹ dist/s3-BtCMvCod.js  0.76 kB │ gzip: 0.29 kB
29
29
  ℹ dist/pinecone-CeJ69aRs.js  0.48 kB │ gzip: 0.24 kB
30
30
  ℹ dist/openai-realtime-cjPAHMMx.js  0.27 kB │ gzip: 0.19 kB
31
- ℹ 20 files, total: 138.16 kB
32
- ✔ Build complete in 48ms
31
+ ℹ 20 files, total: 140.98 kB
32
+ ✔ Build complete in 42ms
package/CHANGELOG.md CHANGED
@@ -1,5 +1,24 @@
1
1
  # @alexkroman1/aai
2
2
 
3
+ ## 1.8.2
4
+
5
+ ### Patch Changes
6
+
7
+ - bb06b4e: Fix S2S tool calls arriving with empty args. Strip the $schema keyword from Zod-generated JSON Schema for tool parameters — some S2S providers ship the dialect URI to the underlying model and emit tool calls with empty args even when required params are listed. Also accept both 'arguments' and 'args' field names on the wire. Pipeline transport now surfaces tool-result stream parts as tool_call_done so the client UI flips pending → done.
8
+
9
+ ## 1.8.1
10
+
11
+ ### Patch Changes
12
+
13
+ - ba8effb: Make OpenAI Realtime usable end-to-end on gpt-realtime-2:
14
+
15
+ - Accept GA-renamed audio/transcript server events (`response.output_audio.{delta,done}`, `response.output_audio_transcript.{delta,done}`) alongside the legacy `response.audio.*` names so audio and transcript reach the client.
16
+ - Trigger the agent's `greeting` on connect by sending a one-shot `response.create` with quoted instructions, and honor `skipGreeting` so resumed sessions don't replay it.
17
+ - Coalesce `response.create` across multiple `sendToolResult` calls in the same tick. Multi-tool turns previously sent one `response.create` per tool, the second of which OpenAI rejected as `conversation_already_has_active_response`, stranding the turn so the model never received the tool results.
18
+ - Log unhandled event types and the full payload of `error` events to make silently rejected `session.update` fields visible.
19
+
20
+ - f4cc5ef: Migrate OpenAI Realtime transport to GA API schema (gpt-realtime-2). Drop OpenAI-Beta: realtime=v1 connect header and update session.update to session.type=realtime, output_modalities, and nested audio.input/audio.output with audio/pcm format.
21
+
3
22
  ## 1.8.0
4
23
 
5
24
  ### Minor Changes
@@ -128,13 +128,25 @@ const ToolSchemaSchema = z.object({
128
128
  parameters: z.record(z.string(), z.unknown())
129
129
  });
130
130
  const EMPTY_PARAMS = z.object({});
131
+ /**
132
+ * Convert a Zod schema to the JSON Schema shape that S2S providers expect.
133
+ * Strips the `$schema` keyword: `z.toJSONSchema` (Zod v4) tags output with
134
+ * the JSON Schema 2020-12 dialect URI, and some Realtime/S2S providers
135
+ * either reject the field outright or ship it through to the underlying
136
+ * model with a malformed function spec — observed empirically as tool
137
+ * calls that arrive with `args: {}` even when required params are listed.
138
+ */
139
+ function toToolJsonSchema(zodSchema) {
140
+ const { $schema: _omit, ...rest } = z.toJSONSchema(zodSchema);
141
+ return rest;
142
+ }
131
143
  function agentToolsToSchemas(tools) {
132
144
  return Object.entries(tools).map(([name, def]) => ({
133
145
  type: "function",
134
146
  name,
135
147
  description: def.description,
136
- parameters: z.toJSONSchema(def.parameters ?? EMPTY_PARAMS)
148
+ parameters: toToolJsonSchema(def.parameters ?? EMPTY_PARAMS)
137
149
  }));
138
150
  }
139
151
  //#endregion
140
- export { toAgentConfig as a, makeSttError as c, agentToolsToSchemas as i, makeTtsError as l, EMPTY_PARAMS as n, ProviderDescriptorSchema as o, ToolSchemaSchema as r, assertProviderTriple as s, AgentConfigSchema as t };
152
+ export { toAgentConfig as a, assertProviderTriple as c, agentToolsToSchemas as i, makeSttError as l, EMPTY_PARAMS as n, toToolJsonSchema as o, ToolSchemaSchema as r, ProviderDescriptorSchema as s, AgentConfigSchema as t, makeTtsError as u };
@@ -1,8 +1,8 @@
1
1
  import { r as DEFAULT_SYSTEM_PROMPT } from "../types-DOWVZhb9.js";
2
- import { _ as TOOL_EXECUTION_TIMEOUT_MS, a as DEFAULT_SHUTDOWN_TIMEOUT_MS, c as FETCH_TIMEOUT_MS, d as MAX_PAGE_CHARS, g as RUN_CODE_TIMEOUT_MS, h as PIPELINE_FLUSH_TIMEOUT_MS, l as MAX_HTML_BYTES, m as MAX_WS_PAYLOAD_BYTES, o as DEFAULT_STT_SAMPLE_RATE, p as MAX_VALUE_SIZE, s as DEFAULT_TTS_SAMPLE_RATE, t as AGENT_CSP } from "../constants-y68COEGj.js";
2
+ import { _ as TOOL_EXECUTION_TIMEOUT_MS, a as DEFAULT_SHUTDOWN_TIMEOUT_MS, c as FETCH_TIMEOUT_MS, d as MAX_PAGE_CHARS, f as MAX_TOOL_RESULT_CHARS, g as RUN_CODE_TIMEOUT_MS, h as PIPELINE_FLUSH_TIMEOUT_MS, l as MAX_HTML_BYTES, m as MAX_WS_PAYLOAD_BYTES, o as DEFAULT_STT_SAMPLE_RATE, p as MAX_VALUE_SIZE, s as DEFAULT_TTS_SAMPLE_RATE, t as AGENT_CSP } from "../constants-y68COEGj.js";
3
3
  import { i as toolError, n as errorDetail, r as errorMessage, t as parseWsUpgradeParams } from "../ws-upgrade-CG8-by1n.js";
4
4
  import { ClientMessageSchema, VectorRequestSchema, buildReadyConfig, lenientParse } from "../sdk/protocol.js";
5
- import { a as toAgentConfig, c as makeSttError, i as agentToolsToSchemas, l as makeTtsError, n as EMPTY_PARAMS, s as assertProviderTriple } from "../_internal-types-CfOAbK6V.js";
5
+ import { a as toAgentConfig, c as assertProviderTriple, i as agentToolsToSchemas, l as makeSttError, n as EMPTY_PARAMS, o as toToolJsonSchema, u as makeTtsError } from "../_internal-types-8v1qAa4A.js";
6
6
  import { a as MISTRAL_KIND, d as ANTHROPIC_KIND, l as GOOGLE_KIND, r as OPENAI_KIND, s as GROQ_KIND } from "../xai-BDI61Y2M.js";
7
7
  import { a as DEEPGRAM_KIND, r as ELEVENLABS_KIND, s as ASSEMBLYAI_KIND, t as SONIOX_KIND } from "../soniox-BQdL0mB5.js";
8
8
  import { a as CARTESIA_KIND, n as RIME_KIND } from "../rime-58p9mDR8.js";
@@ -311,7 +311,7 @@ function resolveAllBuiltins(names, opts) {
311
311
  type: "function",
312
312
  name: toolName,
313
313
  description: def.description,
314
- parameters: z.toJSONSchema(def.parameters ?? EMPTY_PARAMS)
314
+ parameters: toToolJsonSchema(def.parameters ?? EMPTY_PARAMS)
315
315
  });
316
316
  if (def.guidance) guidance.push(def.guidance);
317
317
  }
@@ -1681,6 +1681,7 @@ function createOpenaiRealtimeTransport(opts) {
1681
1681
  const agentTranscriptBuffers = /* @__PURE__ */ new Map();
1682
1682
  const toolBuffers = /* @__PURE__ */ new Map();
1683
1683
  let currentResponseId = null;
1684
+ let responseCreateQueued = false;
1684
1685
  function send(payload) {
1685
1686
  if (!ws || ws.readyState !== 1) {
1686
1687
  log.debug("OpenAI Realtime send dropped: socket not open", { type: payload.type });
@@ -1688,17 +1689,39 @@ function createOpenaiRealtimeTransport(opts) {
1688
1689
  }
1689
1690
  ws.send(JSON.stringify(payload));
1690
1691
  }
1692
+ function sendGreeting() {
1693
+ if (opts.skipGreeting) return;
1694
+ const greeting = opts.sessionConfig.greeting;
1695
+ if (!greeting) return;
1696
+ send({
1697
+ type: "response.create",
1698
+ response: { instructions: `Say exactly: ${JSON.stringify(greeting)}` }
1699
+ });
1700
+ }
1691
1701
  function sendSessionUpdate() {
1692
1702
  send({
1693
1703
  type: "session.update",
1694
1704
  session: {
1695
- modalities: ["audio", "text"],
1696
- voice,
1705
+ type: "realtime",
1706
+ output_modalities: ["audio"],
1697
1707
  instructions: opts.sessionConfig.systemPrompt,
1698
- input_audio_format: "pcm16",
1699
- output_audio_format: "pcm16",
1700
- input_audio_transcription: { model: "whisper-1" },
1701
- turn_detection: { type: "server_vad" },
1708
+ audio: {
1709
+ input: {
1710
+ format: {
1711
+ type: "audio/pcm",
1712
+ rate: 24e3
1713
+ },
1714
+ turn_detection: { type: "server_vad" },
1715
+ transcription: { model: "whisper-1" }
1716
+ },
1717
+ output: {
1718
+ format: {
1719
+ type: "audio/pcm",
1720
+ rate: 24e3
1721
+ },
1722
+ voice
1723
+ }
1724
+ },
1702
1725
  tools: opts.toolSchemas,
1703
1726
  tool_choice: opts.toolChoice
1704
1727
  }
@@ -1708,15 +1731,13 @@ function createOpenaiRealtimeTransport(opts) {
1708
1731
  const url = `${baseUrl}?model=${encodeURIComponent(model)}`;
1709
1732
  log.info("OpenAI Realtime connecting", { url });
1710
1733
  return new Promise((resolve, reject) => {
1711
- const sock = createWs(url, { headers: {
1712
- Authorization: `Bearer ${opts.apiKey}`,
1713
- "OpenAI-Beta": "realtime=v1"
1714
- } });
1734
+ const sock = createWs(url, { headers: { Authorization: `Bearer ${opts.apiKey}` } });
1715
1735
  ws = sock;
1716
1736
  let opened = false;
1717
1737
  sock.addEventListener("open", () => {
1718
1738
  opened = true;
1719
1739
  sendSessionUpdate();
1740
+ sendGreeting();
1720
1741
  resolve();
1721
1742
  });
1722
1743
  sock.addEventListener("message", (ev) => handleMessage(ev.data));
@@ -1773,11 +1794,17 @@ function createOpenaiRealtimeTransport(opts) {
1773
1794
  function handleErrorEvent(obj) {
1774
1795
  const err = obj.error;
1775
1796
  const message = typeof err?.message === "string" ? err.message : "OpenAI Realtime error";
1797
+ log.warn("OpenAI Realtime error event", { error: obj.error });
1776
1798
  clearTurnBuffers();
1777
1799
  opts.callbacks.onError("internal", message);
1778
1800
  }
1779
1801
  function handleOutputItemAdded(obj) {
1780
1802
  const item = obj.item;
1803
+ log.info("OpenAI Realtime output_item.added", {
1804
+ itemType: item?.type,
1805
+ name: item?.name,
1806
+ callId: item?.call_id
1807
+ });
1781
1808
  if (item?.type !== "function_call" || !item.id) return;
1782
1809
  toolBuffers.set(item.id, {
1783
1810
  callId: item.call_id ?? "",
@@ -1810,7 +1837,13 @@ function createOpenaiRealtimeTransport(opts) {
1810
1837
  toolBuffers.delete(id);
1811
1838
  const callId = asString(obj.call_id) || (buf?.callId ?? "");
1812
1839
  const name = asString(obj.name) || (buf?.name ?? "");
1813
- const args = parseToolArgs(asString(obj.arguments) || (buf?.argsBuffer ?? ""), name, callId);
1840
+ const argsStr = asString(obj.arguments) || (buf?.argsBuffer ?? "");
1841
+ log.info("OpenAI Realtime tool call", {
1842
+ name,
1843
+ callId,
1844
+ args: argsStr
1845
+ });
1846
+ const args = parseToolArgs(argsStr, name, callId);
1814
1847
  opts.callbacks.onToolCall(callId, name, args);
1815
1848
  }
1816
1849
  function handleMessage(data) {
@@ -1824,9 +1857,11 @@ function createOpenaiRealtimeTransport(opts) {
1824
1857
  if (typeof raw !== "object" || raw === null) return;
1825
1858
  const obj = raw;
1826
1859
  switch (obj.type) {
1860
+ case "response.output_audio.delta":
1827
1861
  case "response.audio.delta":
1828
1862
  handleAudioDelta(obj);
1829
1863
  return;
1864
+ case "response.output_audio.done":
1830
1865
  case "response.audio.done":
1831
1866
  opts.callbacks.onAudioDone();
1832
1867
  return;
@@ -1842,9 +1877,11 @@ function createOpenaiRealtimeTransport(opts) {
1842
1877
  case "response.created":
1843
1878
  handleResponseCreated(obj);
1844
1879
  return;
1880
+ case "response.output_audio_transcript.delta":
1845
1881
  case "response.audio_transcript.delta":
1846
1882
  handleAgentTranscriptDelta(obj);
1847
1883
  return;
1884
+ case "response.output_audio_transcript.done":
1848
1885
  case "response.audio_transcript.done":
1849
1886
  handleAgentTranscriptDone(obj);
1850
1887
  return;
@@ -1863,7 +1900,9 @@ function createOpenaiRealtimeTransport(opts) {
1863
1900
  case "error":
1864
1901
  handleErrorEvent(obj);
1865
1902
  return;
1866
- default: return;
1903
+ default:
1904
+ log.debug("OpenAI Realtime: unhandled event", { type: obj.type });
1905
+ return;
1867
1906
  }
1868
1907
  }
1869
1908
  function handleClose(code, reason) {
@@ -1893,6 +1932,11 @@ function createOpenaiRealtimeTransport(opts) {
1893
1932
  ws.send(`{"type":"input_audio_buffer.append","audio":"${uint8ToBase64(bytes)}"}`);
1894
1933
  },
1895
1934
  sendToolResult(callId, result) {
1935
+ log.info("OpenAI Realtime sendToolResult", {
1936
+ callId,
1937
+ resultLen: result.length,
1938
+ preview: result.slice(0, 200)
1939
+ });
1896
1940
  send({
1897
1941
  type: "conversation.item.create",
1898
1942
  item: {
@@ -1901,7 +1945,13 @@ function createOpenaiRealtimeTransport(opts) {
1901
1945
  output: result
1902
1946
  }
1903
1947
  });
1904
- send({ type: "response.create" });
1948
+ if (!responseCreateQueued) {
1949
+ responseCreateQueued = true;
1950
+ queueMicrotask(() => {
1951
+ responseCreateQueued = false;
1952
+ send({ type: "response.create" });
1953
+ });
1954
+ }
1905
1955
  },
1906
1956
  cancelReply() {
1907
1957
  if (currentResponseId === null) return;
@@ -2091,6 +2141,14 @@ function createPipelineTransport(opts) {
2091
2141
  onDelta(out);
2092
2142
  ttsSession?.sendText(out);
2093
2143
  }
2144
+ function emitToolResult(part) {
2145
+ const callId = part.toolCallId ?? "";
2146
+ if (!callId) return;
2147
+ const raw = part.output ?? part.result ?? "";
2148
+ const str = typeof raw === "string" ? raw : JSON.stringify(raw);
2149
+ const truncated = str.length > 4e3 ? str.slice(0, MAX_TOOL_RESULT_CHARS) : str;
2150
+ callbacks.onToolCallDone?.(callId, truncated);
2151
+ }
2094
2152
  return function handlePart(part) {
2095
2153
  switch (part.type) {
2096
2154
  case "text-delta":
@@ -2104,6 +2162,9 @@ function createPipelineTransport(opts) {
2104
2162
  callbacks.onToolCall(part.toolCallId ?? "", part.toolName ?? "", input);
2105
2163
  return;
2106
2164
  }
2165
+ case "tool-result":
2166
+ emitToolResult(part);
2167
+ return;
2107
2168
  case "error": {
2108
2169
  const msg = errorMessage(part.error);
2109
2170
  log.error("LLM stream error", {
@@ -2346,8 +2407,14 @@ const S2sMessageSchema = z.discriminatedUnion("type", [
2346
2407
  type: z.literal("tool.call"),
2347
2408
  call_id: z.string(),
2348
2409
  name: z.string(),
2349
- args: z.record(z.string(), z.unknown()).optional().default({})
2350
- }),
2410
+ arguments: z.record(z.string(), z.unknown()).optional(),
2411
+ args: z.record(z.string(), z.unknown()).optional()
2412
+ }).transform((m) => ({
2413
+ type: m.type,
2414
+ call_id: m.call_id,
2415
+ name: m.name,
2416
+ args: m.arguments ?? m.args ?? {}
2417
+ })),
2351
2418
  z.object({
2352
2419
  type: z.literal("reply.done"),
2353
2420
  status: z.string().optional()
@@ -2495,6 +2562,20 @@ function connectS2s(opts) {
2495
2562
  if (type === "reply.audio" || type === "input.audio" || type === "reply.done" || type === "session.error") return;
2496
2563
  log.info(`S2S << ${type}`);
2497
2564
  }
2565
+ function handleObject(obj, raw) {
2566
+ logIncoming(obj.type);
2567
+ if (obj.type === "tool.call") log.info("S2S << tool.call payload", { payload: JSON.stringify(obj) });
2568
+ if (obj.type === "reply.audio" && typeof obj.data === "string") {
2569
+ callbacks.onAudio(base64ToUint8(obj.data));
2570
+ return;
2571
+ }
2572
+ const parsed = parseS2sMessage(obj);
2573
+ if (!parsed) {
2574
+ log.warn(`S2S << unrecognised message type: ${obj.type ?? JSON.stringify(raw).slice(0, 200)}`);
2575
+ return;
2576
+ }
2577
+ dispatchS2sMessage(callbacks, parsed, dispatchState, dispatchCtx);
2578
+ }
2498
2579
  ws.addEventListener("message", (ev) => {
2499
2580
  let raw;
2500
2581
  try {
@@ -2507,18 +2588,7 @@ function connectS2s(opts) {
2507
2588
  log.warn("S2S << non-object JSON message", { type: typeof raw });
2508
2589
  return;
2509
2590
  }
2510
- const obj = raw;
2511
- logIncoming(obj.type);
2512
- if (obj.type === "reply.audio" && typeof obj.data === "string") {
2513
- callbacks.onAudio(base64ToUint8(obj.data));
2514
- return;
2515
- }
2516
- const parsed = parseS2sMessage(obj);
2517
- if (!parsed) {
2518
- log.warn(`S2S << unrecognised message type: ${obj.type ?? JSON.stringify(raw).slice(0, 200)}`);
2519
- return;
2520
- }
2521
- dispatchS2sMessage(callbacks, parsed, dispatchState, dispatchCtx);
2591
+ handleObject(raw, raw);
2522
2592
  });
2523
2593
  ws.addEventListener("close", (ev) => {
2524
2594
  const code = ev.code ?? 0;
@@ -3085,6 +3155,7 @@ function createRuntime(opts) {
3085
3155
  callbacks,
3086
3156
  sid: sessionOpts.id,
3087
3157
  agent: sessionOpts.agent,
3158
+ skipGreeting: sessionOpts.skipGreeting ?? false,
3088
3159
  ...createOpenaiRealtimeWebSocket ? { createWebSocket: createOpenaiRealtimeWebSocket } : {},
3089
3160
  logger
3090
3161
  });
@@ -3149,6 +3220,11 @@ function createRuntime(opts) {
3149
3220
  toolName: name,
3150
3221
  args
3151
3222
  }) : (id, name, args) => bindCore().onToolCall(id, name, args),
3223
+ ...isPipeline ? { onToolCallDone: (id, result) => sessionOpts.client.event({
3224
+ type: "tool_call_done",
3225
+ toolCallId: id,
3226
+ result
3227
+ }) } : {},
3152
3228
  onError: (code, message) => bindCore().onError(code, message),
3153
3229
  onSpeechStarted: () => bindCore().onSpeechStarted(),
3154
3230
  onSpeechStopped: () => bindCore().onSpeechStopped()
@@ -37,6 +37,8 @@ export type OpenaiRealtimeTransportOptions = {
37
37
  callbacks: TransportCallbacks;
38
38
  sid: string;
39
39
  agent: string;
40
+ /** Skip the initial greeting (used for session resume). */
41
+ skipGreeting?: boolean;
40
42
  createWebSocket?: CreateOpenaiRealtimeWebSocket;
41
43
  logger?: Logger;
42
44
  };
@@ -13,6 +13,13 @@ export type TransportCallbacks = {
13
13
  onUserTranscript(text: string): void;
14
14
  onAgentTranscript(text: string, interrupted: boolean): void;
15
15
  onToolCall(callId: string, name: string, args: Record<string, unknown>): void;
16
+ /**
17
+ * Tool execution finished. Pipeline mode invokes this from the
18
+ * `tool-result` stream part so the client UI can mark the call done.
19
+ * S2S transports leave this unset — SessionCore.onToolCall emits the
20
+ * `tool_call_done` event itself after dispatching the tool.
21
+ */
22
+ onToolCallDone?(callId: string, result: string): void;
16
23
  onError(code: SessionErrorCode, message: string): void;
17
24
  onSpeechStarted(): void;
18
25
  onSpeechStopped(): void;
@@ -85,4 +85,13 @@ export type ToolSchema = {
85
85
  parameters: JSONSchema7;
86
86
  };
87
87
  export declare const EMPTY_PARAMS: z.ZodObject<{}, z.core.$strip>;
88
+ /**
89
+ * Convert a Zod schema to the JSON Schema shape that S2S providers expect.
90
+ * Strips the `$schema` keyword: `z.toJSONSchema` (Zod v4) tags output with
91
+ * the JSON Schema 2020-12 dialect URI, and some Realtime/S2S providers
92
+ * either reject the field outright or ship it through to the underlying
93
+ * model with a malformed function spec — observed empirically as tool
94
+ * calls that arrive with `args: {}` even when required params are listed.
95
+ */
96
+ export declare function toToolJsonSchema(zodSchema: z.ZodTypeAny): JSONSchema7;
88
97
  export declare function agentToolsToSchemas(tools: Readonly<Record<string, ToolDef>>): ToolSchema[];
@@ -1,2 +1,2 @@
1
- import { a as toAgentConfig, i as agentToolsToSchemas, n as EMPTY_PARAMS, o as ProviderDescriptorSchema, r as ToolSchemaSchema, s as assertProviderTriple, t as AgentConfigSchema } from "../_internal-types-CfOAbK6V.js";
1
+ import { a as toAgentConfig, c as assertProviderTriple, i as agentToolsToSchemas, n as EMPTY_PARAMS, r as ToolSchemaSchema, s as ProviderDescriptorSchema, t as AgentConfigSchema } from "../_internal-types-8v1qAa4A.js";
2
2
  export { AgentConfigSchema, EMPTY_PARAMS, ProviderDescriptorSchema, ToolSchemaSchema, agentToolsToSchemas, assertProviderTriple, toAgentConfig };
@@ -9,7 +9,7 @@
9
9
 
10
10
  import { convert } from "html-to-text";
11
11
  import { z } from "zod";
12
- import { EMPTY_PARAMS, type ToolSchema } from "../sdk/_internal-types.ts";
12
+ import { EMPTY_PARAMS, type ToolSchema, toToolJsonSchema } from "../sdk/_internal-types.ts";
13
13
  import { FETCH_TIMEOUT_MS, MAX_HTML_BYTES, MAX_PAGE_CHARS } from "../sdk/constants.ts";
14
14
  import type { ToolDef } from "../sdk/types.ts";
15
15
  import { createRunCode } from "./_run-code.ts";
@@ -242,7 +242,7 @@ export function resolveAllBuiltins(
242
242
  type: "function",
243
243
  name: toolName,
244
244
  description: def.description,
245
- parameters: z.toJSONSchema(def.parameters ?? EMPTY_PARAMS) as ToolSchema["parameters"],
245
+ parameters: toToolJsonSchema(def.parameters ?? EMPTY_PARAMS) as ToolSchema["parameters"],
246
246
  });
247
247
  if (def.guidance) guidance.push(def.guidance);
248
248
  }
package/host/runtime.ts CHANGED
@@ -432,6 +432,7 @@ export function createRuntime(opts: RuntimeOptions): Runtime {
432
432
  callbacks,
433
433
  sid: sessionOpts.id,
434
434
  agent: sessionOpts.agent,
435
+ skipGreeting: sessionOpts.skipGreeting ?? false,
435
436
  ...(createOpenaiRealtimeWebSocket ? { createWebSocket: createOpenaiRealtimeWebSocket } : {}),
436
437
  logger,
437
438
  });
@@ -512,6 +513,15 @@ export function createRuntime(opts: RuntimeOptions): Runtime {
512
513
  ? (id, name, args) =>
513
514
  sessionOpts.client.event({ type: "tool_call", toolCallId: id, toolName: name, args })
514
515
  : (id, name, args) => bindCore().onToolCall(id, name, args),
516
+ // Pipeline: emit `tool_call_done` when streamText surfaces the
517
+ // `tool-result` part so the UI can flip status from pending → done.
518
+ // S2S transports never set this; SessionCore.onToolCall emits done itself.
519
+ ...(isPipeline
520
+ ? {
521
+ onToolCallDone: (id: string, result: string) =>
522
+ sessionOpts.client.event({ type: "tool_call_done", toolCallId: id, result }),
523
+ }
524
+ : {}),
515
525
  onError: (code, message) => bindCore().onError(code, message),
516
526
  onSpeechStarted: () => bindCore().onSpeechStarted(),
517
527
  onSpeechStopped: () => bindCore().onSpeechStopped(),
package/host/s2s.ts CHANGED
@@ -55,12 +55,23 @@ const S2sMessageSchema = z.discriminatedUnion("type", [
55
55
  item_id: z.string().optional().default(""),
56
56
  interrupted: z.boolean().optional().default(false),
57
57
  }),
58
- z.object({
59
- type: z.literal("tool.call"),
60
- call_id: z.string(),
61
- name: z.string(),
62
- args: z.record(z.string(), z.unknown()).optional().default({}),
63
- }),
58
+ // AssemblyAI's S2S protocol delivers tool args under `arguments`; older
59
+ // implementations and our internal tests use `args`. Accept either, with
60
+ // `arguments` taking precedence so the live wire format wins.
61
+ z
62
+ .object({
63
+ type: z.literal("tool.call"),
64
+ call_id: z.string(),
65
+ name: z.string(),
66
+ arguments: z.record(z.string(), z.unknown()).optional(),
67
+ args: z.record(z.string(), z.unknown()).optional(),
68
+ })
69
+ .transform((m) => ({
70
+ type: m.type,
71
+ call_id: m.call_id,
72
+ name: m.name,
73
+ args: m.arguments ?? m.args ?? {},
74
+ })),
64
75
  z.object({ type: z.literal("reply.done"), status: z.string().optional() }),
65
76
  z.object({ type: z.literal("session.error"), code: z.string(), message: z.string() }),
66
77
  z.object({ type: z.literal("error"), message: z.string() }),
@@ -303,27 +314,20 @@ export function connectS2s(opts: ConnectS2sOptions): Promise<S2sHandle> {
303
314
  log.info(`S2S << ${type}`);
304
315
  }
305
316
 
306
- ws.addEventListener("message", (ev) => {
307
- let raw: unknown;
308
- try {
309
- raw = JSON.parse(String(ev.data));
310
- } catch {
311
- log.warn("S2S << invalid JSON", { data: String(ev.data).slice(0, 200) });
312
- return;
313
- }
314
-
315
- if (typeof raw !== "object" || raw === null || Array.isArray(raw)) {
316
- log.warn("S2S << non-object JSON message", { type: typeof raw });
317
- return;
318
- }
319
- const obj = raw as Record<string, unknown>;
317
+ function handleObject(obj: Record<string, unknown>, raw: unknown): void {
320
318
  logIncoming(obj.type);
321
-
319
+ // Log the full tool.call payload so we can diagnose provider-side
320
+ // empty-args problems — the underlying LLM emitting a function call
321
+ // without populating its required parameters. Without the full
322
+ // payload we cannot tell apart "field-name mismatch" from
323
+ // "model emitted no args."
324
+ if (obj.type === "tool.call") {
325
+ log.info("S2S << tool.call payload", { payload: JSON.stringify(obj) });
326
+ }
322
327
  if (obj.type === "reply.audio" && typeof obj.data === "string") {
323
328
  callbacks.onAudio(base64ToUint8(obj.data));
324
329
  return;
325
330
  }
326
-
327
331
  const parsed = parseS2sMessage(obj);
328
332
  if (!parsed) {
329
333
  log.warn(
@@ -332,6 +336,21 @@ export function connectS2s(opts: ConnectS2sOptions): Promise<S2sHandle> {
332
336
  return;
333
337
  }
334
338
  dispatchS2sMessage(callbacks, parsed, dispatchState, dispatchCtx);
339
+ }
340
+
341
+ ws.addEventListener("message", (ev) => {
342
+ let raw: unknown;
343
+ try {
344
+ raw = JSON.parse(String(ev.data));
345
+ } catch {
346
+ log.warn("S2S << invalid JSON", { data: String(ev.data).slice(0, 200) });
347
+ return;
348
+ }
349
+ if (typeof raw !== "object" || raw === null || Array.isArray(raw)) {
350
+ log.warn("S2S << non-object JSON message", { type: typeof raw });
351
+ return;
352
+ }
353
+ handleObject(raw as Record<string, unknown>, raw);
335
354
  });
336
355
 
337
356
  ws.addEventListener("close", (ev) => {
@@ -83,7 +83,6 @@ describe("openai-realtime-transport: connect and session.update", () => {
83
83
  options: { model: "gpt-realtime", voice: "cedar" },
84
84
  sessionConfig: {
85
85
  systemPrompt: "Be terse.",
86
- greeting: "Hi.",
87
86
  tools: [],
88
87
  },
89
88
  toolSchemas: [
@@ -109,10 +108,7 @@ describe("openai-realtime-transport: connect and session.update", () => {
109
108
  expect(createWs).toHaveBeenCalledWith(
110
109
  "wss://api.openai.com/v1/realtime?model=gpt-realtime",
111
110
  expect.objectContaining({
112
- headers: expect.objectContaining({
113
- Authorization: "Bearer sk-test",
114
- "OpenAI-Beta": "realtime=v1",
115
- }),
111
+ headers: { Authorization: "Bearer sk-test" },
116
112
  }),
117
113
  );
118
114
 
@@ -121,13 +117,14 @@ describe("openai-realtime-transport: connect and session.update", () => {
121
117
  if (first === undefined) throw new Error("expected one send");
122
118
  const msg = JSON.parse(first);
123
119
  expect(msg.type).toBe("session.update");
124
- expect(msg.session.voice).toBe("cedar");
120
+ expect(msg.session.type).toBe("realtime");
121
+ expect(msg.session.output_modalities).toEqual(["audio"]);
125
122
  expect(msg.session.instructions).toBe("Be terse.");
126
- expect(msg.session.input_audio_format).toBe("pcm16");
127
- expect(msg.session.output_audio_format).toBe("pcm16");
128
- expect(msg.session.modalities).toEqual(["audio", "text"]);
129
- expect(msg.session.input_audio_transcription).toEqual({ model: "whisper-1" });
130
- expect(msg.session.turn_detection.type).toBe("server_vad");
123
+ expect(msg.session.audio.input.format).toEqual({ type: "audio/pcm", rate: 24_000 });
124
+ expect(msg.session.audio.input.turn_detection.type).toBe("server_vad");
125
+ expect(msg.session.audio.input.transcription).toEqual({ model: "whisper-1" });
126
+ expect(msg.session.audio.output.format).toEqual({ type: "audio/pcm", rate: 24_000 });
127
+ expect(msg.session.audio.output.voice).toBe("cedar");
131
128
  expect(msg.session.tools).toEqual([
132
129
  expect.objectContaining({ type: "function", name: "lookup" }),
133
130
  ]);
@@ -135,6 +132,57 @@ describe("openai-realtime-transport: connect and session.update", () => {
135
132
  });
136
133
  });
137
134
 
135
+ describe("greeting", () => {
136
+ function makeWithGreeting(args: { greeting?: string; skipGreeting?: boolean }) {
137
+ const fake = makeFakeWs();
138
+ const transport = createOpenaiRealtimeTransport({
139
+ apiKey: "sk",
140
+ options: {},
141
+ sessionConfig: {
142
+ systemPrompt: "",
143
+ ...(args.greeting !== undefined ? { greeting: args.greeting } : {}),
144
+ },
145
+ toolSchemas: [],
146
+ toolChoice: "auto",
147
+ callbacks: noopCallbacks(),
148
+ sid: "s",
149
+ agent: "a",
150
+ ...(args.skipGreeting !== undefined ? { skipGreeting: args.skipGreeting } : {}),
151
+ createWebSocket: () => fake,
152
+ logger: silentLogger,
153
+ });
154
+ const ready = transport.start();
155
+ fake.fire("open");
156
+ return { fake, ready };
157
+ }
158
+
159
+ test("sends response.create with quoted greeting after session.update", async () => {
160
+ const { fake, ready } = makeWithGreeting({ greeting: 'Hello, "friend".' });
161
+ await ready;
162
+ expect(fake.sent.length).toBe(2);
163
+ expect(JSON.parse(fake.sent[0] ?? "{}").type).toBe("session.update");
164
+ const greetingMsg = JSON.parse(fake.sent[1] ?? "{}");
165
+ expect(greetingMsg.type).toBe("response.create");
166
+ // JSON.stringify quotes the greeting and escapes any embedded quotes —
167
+ // protects against prompt-injection by closing the instruction string.
168
+ expect(greetingMsg.response.instructions).toBe('Say exactly: "Hello, \\"friend\\"."');
169
+ });
170
+
171
+ test("no greeting send when greeting is undefined", async () => {
172
+ const { fake, ready } = makeWithGreeting({});
173
+ await ready;
174
+ expect(fake.sent.length).toBe(1);
175
+ expect(JSON.parse(fake.sent[0] ?? "{}").type).toBe("session.update");
176
+ });
177
+
178
+ test("skipGreeting suppresses the greeting send", async () => {
179
+ const { fake, ready } = makeWithGreeting({ greeting: "Hi.", skipGreeting: true });
180
+ await ready;
181
+ expect(fake.sent.length).toBe(1);
182
+ expect(JSON.parse(fake.sent[0] ?? "{}").type).toBe("session.update");
183
+ });
184
+ });
185
+
138
186
  describe("audio in/out", () => {
139
187
  test("sendUserAudio sends input_audio_buffer.append with base64 payload", async () => {
140
188
  const { fake, transport, ready } = startedTransport();
@@ -150,21 +198,25 @@ describe("audio in/out", () => {
150
198
  expect(Buffer.from(msg.audio, "base64")).toEqual(Buffer.from([1, 2, 3, 4]));
151
199
  });
152
200
 
153
- test("response.audio.delta calls onAudioChunk with decoded bytes", async () => {
201
+ test.each([
202
+ ["response.audio.delta"],
203
+ ["response.output_audio.delta"],
204
+ ])("%s calls onAudioChunk with decoded bytes", async (type) => {
154
205
  const { fake, cbs, ready } = startedTransport();
155
206
  await ready;
156
207
  const audio = Buffer.from([5, 6, 7, 8]).toString("base64");
157
- fake.fire("message", {
158
- data: JSON.stringify({ type: "response.audio.delta", delta: audio }),
159
- });
208
+ fake.fire("message", { data: JSON.stringify({ type, delta: audio }) });
160
209
  expect(cbs.onAudioChunk).toHaveBeenCalledTimes(1);
161
210
  expect(cbs.onAudioChunk).toHaveBeenCalledWith(new Uint8Array([5, 6, 7, 8]));
162
211
  });
163
212
 
164
- test("response.audio.done calls onAudioDone", async () => {
213
+ test.each([
214
+ ["response.audio.done"],
215
+ ["response.output_audio.done"],
216
+ ])("%s calls onAudioDone", async (type) => {
165
217
  const { fake, cbs, ready } = startedTransport();
166
218
  await ready;
167
- fake.fire("message", { data: JSON.stringify({ type: "response.audio.done" }) });
219
+ fake.fire("message", { data: JSON.stringify({ type }) });
168
220
  expect(cbs.onAudioDone).toHaveBeenCalledTimes(1);
169
221
  });
170
222
  });
@@ -206,27 +258,22 @@ describe("VAD, user transcript, reply lifecycle, agent transcript", () => {
206
258
  expect(cbs.onReplyDone).toHaveBeenCalledTimes(1);
207
259
  });
208
260
 
209
- test("agent transcript: deltas accumulated, emitted on done", async () => {
261
+ test.each([
262
+ ["response.audio_transcript", "legacy"],
263
+ ["response.output_audio_transcript", "GA"],
264
+ ])("agent transcript (%s): deltas accumulated, emitted on done", async (prefix) => {
210
265
  const { fake, cbs, ready } = startedTransport();
211
266
  await ready;
212
267
  const item_id = "item_x";
213
268
  fake.fire("message", {
214
- data: JSON.stringify({
215
- type: "response.audio_transcript.delta",
216
- item_id,
217
- delta: "Hi ",
218
- }),
269
+ data: JSON.stringify({ type: `${prefix}.delta`, item_id, delta: "Hi " }),
219
270
  });
220
271
  fake.fire("message", {
221
- data: JSON.stringify({
222
- type: "response.audio_transcript.delta",
223
- item_id,
224
- delta: "there.",
225
- }),
272
+ data: JSON.stringify({ type: `${prefix}.delta`, item_id, delta: "there." }),
226
273
  });
227
274
  expect(cbs.onAgentTranscript).not.toHaveBeenCalled();
228
275
  fake.fire("message", {
229
- data: JSON.stringify({ type: "response.audio_transcript.done", item_id }),
276
+ data: JSON.stringify({ type: `${prefix}.done`, item_id }),
230
277
  });
231
278
  expect(cbs.onAgentTranscript).toHaveBeenCalledWith("Hi there.", false);
232
279
  });
@@ -308,15 +355,36 @@ describe("tool calls", () => {
308
355
  await ready;
309
356
  fake.sent.length = 0; // drop session.update
310
357
  transport.sendToolResult("call_1", '{"ok":true}');
311
- expect(fake.sent.length).toBe(2);
358
+ // function_call_output is sent immediately; response.create is queued.
359
+ expect(fake.sent.length).toBe(1);
312
360
  const m1 = JSON.parse(fake.sent[0] ?? "{}");
313
361
  expect(m1.type).toBe("conversation.item.create");
314
362
  expect(m1.item.type).toBe("function_call_output");
315
363
  expect(m1.item.call_id).toBe("call_1");
316
364
  expect(m1.item.output).toBe('{"ok":true}');
365
+ await new Promise((r) => queueMicrotask(() => r(undefined)));
366
+ expect(fake.sent.length).toBe(2);
317
367
  const m2 = JSON.parse(fake.sent[1] ?? "{}");
318
368
  expect(m2.type).toBe("response.create");
319
369
  });
370
+
371
+ test("multiple sendToolResult calls coalesce into a single response.create", async () => {
372
+ const { fake, transport, ready } = startedTransport();
373
+ await ready;
374
+ fake.sent.length = 0;
375
+ // Synchronous burst — session-core flushes pending tool results in a loop.
376
+ transport.sendToolResult("call_1", '{"a":1}');
377
+ transport.sendToolResult("call_2", '{"b":2}');
378
+ transport.sendToolResult("call_3", '{"c":3}');
379
+ // Three function_call_outputs sent immediately, no response.create yet.
380
+ expect(fake.sent.length).toBe(3);
381
+ expect(fake.sent.every((s) => JSON.parse(s).type === "conversation.item.create")).toBe(true);
382
+ await new Promise((r) => queueMicrotask(() => r(undefined)));
383
+ // After the microtask, exactly one response.create — second one would be
384
+ // rejected as `conversation_already_has_active_response`.
385
+ expect(fake.sent.length).toBe(4);
386
+ expect(JSON.parse(fake.sent[3] ?? "{}").type).toBe("response.create");
387
+ });
320
388
  });
321
389
 
322
390
  describe("cancel, error, close", () => {
@@ -49,6 +49,8 @@ export type OpenaiRealtimeTransportOptions = {
49
49
  callbacks: TransportCallbacks;
50
50
  sid: string;
51
51
  agent: string;
52
+ /** Skip the initial greeting (used for session resume). */
53
+ skipGreeting?: boolean;
52
54
  createWebSocket?: CreateOpenaiRealtimeWebSocket;
53
55
  logger?: Logger;
54
56
  };
@@ -66,6 +68,7 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
66
68
  type ToolBuffer = { callId: string; name: string; argsBuffer: string };
67
69
  const toolBuffers = new Map<string, ToolBuffer>();
68
70
  let currentResponseId: string | null = null;
71
+ let responseCreateQueued = false;
69
72
 
70
73
  function send(payload: Record<string, unknown>): void {
71
74
  if (!ws || ws.readyState !== WS_OPEN) {
@@ -75,17 +78,37 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
75
78
  ws.send(JSON.stringify(payload));
76
79
  }
77
80
 
81
+ function sendGreeting(): void {
82
+ if (opts.skipGreeting) return;
83
+ const greeting = opts.sessionConfig.greeting;
84
+ if (!greeting) return;
85
+ // OpenAI Realtime has no native greeting field — trigger it as a one-shot
86
+ // response with custom instructions that override the system prompt for
87
+ // this turn only. Audio + transcript ride the normal response.* events.
88
+ send({
89
+ type: "response.create",
90
+ response: { instructions: `Say exactly: ${JSON.stringify(greeting)}` },
91
+ });
92
+ }
93
+
78
94
  function sendSessionUpdate(): void {
79
95
  send({
80
96
  type: "session.update",
81
97
  session: {
82
- modalities: ["audio", "text"],
83
- voice,
98
+ type: "realtime",
99
+ output_modalities: ["audio"],
84
100
  instructions: opts.sessionConfig.systemPrompt,
85
- input_audio_format: "pcm16",
86
- output_audio_format: "pcm16",
87
- input_audio_transcription: { model: "whisper-1" },
88
- turn_detection: { type: "server_vad" },
101
+ audio: {
102
+ input: {
103
+ format: { type: "audio/pcm", rate: 24_000 },
104
+ turn_detection: { type: "server_vad" },
105
+ transcription: { model: "whisper-1" },
106
+ },
107
+ output: {
108
+ format: { type: "audio/pcm", rate: 24_000 },
109
+ voice,
110
+ },
111
+ },
89
112
  tools: opts.toolSchemas,
90
113
  tool_choice: opts.toolChoice,
91
114
  },
@@ -99,7 +122,6 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
99
122
  const sock = createWs(url, {
100
123
  headers: {
101
124
  Authorization: `Bearer ${opts.apiKey}`,
102
- "OpenAI-Beta": "realtime=v1",
103
125
  },
104
126
  });
105
127
  ws = sock;
@@ -108,6 +130,7 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
108
130
  sock.addEventListener("open", () => {
109
131
  opened = true;
110
132
  sendSessionUpdate();
133
+ sendGreeting();
111
134
  resolve();
112
135
  });
113
136
  sock.addEventListener("message", (ev) => handleMessage(ev.data));
@@ -177,6 +200,7 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
177
200
  function handleErrorEvent(obj: Record<string, unknown>): void {
178
201
  const err = obj.error as { message?: unknown } | undefined;
179
202
  const message = typeof err?.message === "string" ? err.message : "OpenAI Realtime error";
203
+ log.warn("OpenAI Realtime error event", { error: obj.error });
180
204
  clearTurnBuffers();
181
205
  opts.callbacks.onError("internal", message);
182
206
  }
@@ -185,6 +209,11 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
185
209
  const item = obj.item as
186
210
  | { id?: string; type?: string; name?: string; call_id?: string }
187
211
  | undefined;
212
+ log.info("OpenAI Realtime output_item.added", {
213
+ itemType: item?.type,
214
+ name: item?.name,
215
+ callId: item?.call_id,
216
+ });
188
217
  if (item?.type !== "function_call" || !item.id) return;
189
218
  toolBuffers.set(item.id, {
190
219
  callId: item.call_id ?? "",
@@ -220,6 +249,7 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
220
249
  const callId = asString(obj.call_id) || (buf?.callId ?? "");
221
250
  const name = asString(obj.name) || (buf?.name ?? "");
222
251
  const argsStr = asString(obj.arguments) || (buf?.argsBuffer ?? "");
252
+ log.info("OpenAI Realtime tool call", { name, callId, args: argsStr });
223
253
  const args = parseToolArgs(argsStr, name, callId);
224
254
  opts.callbacks.onToolCall(callId, name, args);
225
255
  }
@@ -235,9 +265,14 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
235
265
  if (typeof raw !== "object" || raw === null) return;
236
266
  const obj = raw as Record<string, unknown>;
237
267
  switch (obj.type) {
268
+ // GA renamed audio output events to `response.output_audio.*` and
269
+ // transcript events to `response.output_audio_transcript.*`. The legacy
270
+ // (beta) names are kept as aliases so older snapshots still work.
271
+ case "response.output_audio.delta":
238
272
  case "response.audio.delta":
239
273
  handleAudioDelta(obj);
240
274
  return;
275
+ case "response.output_audio.done":
241
276
  case "response.audio.done":
242
277
  opts.callbacks.onAudioDone();
243
278
  return;
@@ -253,9 +288,11 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
253
288
  case "response.created":
254
289
  handleResponseCreated(obj);
255
290
  return;
291
+ case "response.output_audio_transcript.delta":
256
292
  case "response.audio_transcript.delta":
257
293
  handleAgentTranscriptDelta(obj);
258
294
  return;
295
+ case "response.output_audio_transcript.done":
259
296
  case "response.audio_transcript.done":
260
297
  handleAgentTranscriptDone(obj);
261
298
  return;
@@ -275,6 +312,7 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
275
312
  handleErrorEvent(obj);
276
313
  return;
277
314
  default:
315
+ log.debug("OpenAI Realtime: unhandled event", { type: obj.type });
278
316
  return;
279
317
  }
280
318
  }
@@ -302,11 +340,25 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
302
340
  ws.send(`{"type":"input_audio_buffer.append","audio":"${uint8ToBase64(bytes)}"}`);
303
341
  },
304
342
  sendToolResult(callId, result) {
343
+ log.info("OpenAI Realtime sendToolResult", {
344
+ callId,
345
+ resultLen: result.length,
346
+ preview: result.slice(0, 200),
347
+ });
305
348
  send({
306
349
  type: "conversation.item.create",
307
350
  item: { type: "function_call_output", call_id: callId, output: result },
308
351
  });
309
- send({ type: "response.create" });
352
+ // Multiple tool results from one turn arrive synchronously; coalesce them
353
+ // into a single response.create per tick. OpenAI rejects a second
354
+ // response.create while one is in flight, which strands the turn.
355
+ if (!responseCreateQueued) {
356
+ responseCreateQueued = true;
357
+ queueMicrotask(() => {
358
+ responseCreateQueued = false;
359
+ send({ type: "response.create" });
360
+ });
361
+ }
310
362
  },
311
363
  cancelReply() {
312
364
  if (currentResponseId === null) return;
@@ -14,6 +14,7 @@ import {
14
14
  DEFAULT_MAX_HISTORY,
15
15
  DEFAULT_STT_SAMPLE_RATE,
16
16
  DEFAULT_TTS_SAMPLE_RATE,
17
+ MAX_TOOL_RESULT_CHARS,
17
18
  PIPELINE_FLUSH_TIMEOUT_MS,
18
19
  } from "../../sdk/constants.ts";
19
20
  import type { SessionErrorCode } from "../../sdk/protocol.ts";
@@ -235,6 +236,25 @@ export function createPipelineTransport(opts: PipelineTransportOptions): Transpo
235
236
  ttsSession?.sendText(out);
236
237
  }
237
238
 
239
+ function emitToolResult(part: {
240
+ readonly toolCallId?: string;
241
+ readonly output?: unknown;
242
+ }): void {
243
+ // Inline execution finished — surface completion so the client UI can
244
+ // flip the tool-call from "pending" to "done". Schema requires a
245
+ // string result capped at MAX_TOOL_RESULT_CHARS.
246
+ const callId = part.toolCallId ?? "";
247
+ if (!callId) return;
248
+ const raw =
249
+ (part as { output?: unknown; result?: unknown }).output ??
250
+ (part as { result?: unknown }).result ??
251
+ "";
252
+ const str = typeof raw === "string" ? raw : JSON.stringify(raw);
253
+ const truncated =
254
+ str.length > MAX_TOOL_RESULT_CHARS ? str.slice(0, MAX_TOOL_RESULT_CHARS) : str;
255
+ callbacks.onToolCallDone?.(callId, truncated);
256
+ }
257
+
238
258
  return function handlePart(part: {
239
259
  readonly type: string;
240
260
  readonly text?: string;
@@ -257,6 +277,9 @@ export function createPipelineTransport(opts: PipelineTransportOptions): Transpo
257
277
  callbacks.onToolCall(part.toolCallId ?? "", part.toolName ?? "", input);
258
278
  return;
259
279
  }
280
+ case "tool-result":
281
+ emitToolResult(part);
282
+ return;
260
283
  case "error": {
261
284
  const msg = errorMessage(part.error);
262
285
  log.error("LLM stream error", { message: msg, sid: opts.sid });
@@ -17,6 +17,13 @@ export type TransportCallbacks = {
17
17
  onUserTranscript(text: string): void;
18
18
  onAgentTranscript(text: string, interrupted: boolean): void;
19
19
  onToolCall(callId: string, name: string, args: Record<string, unknown>): void;
20
+ /**
21
+ * Tool execution finished. Pipeline mode invokes this from the
22
+ * `tool-result` stream part so the client UI can mark the call done.
23
+ * S2S transports leave this unset — SessionCore.onToolCall emits the
24
+ * `tool_call_done` event itself after dispatching the tool.
25
+ */
26
+ onToolCallDone?(callId: string, result: string): void;
20
27
  onError(code: SessionErrorCode, message: string): void;
21
28
  onSpeechStarted(): void;
22
29
  onSpeechStopped(): void;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@alexkroman1/aai",
3
- "version": "1.8.0",
3
+ "version": "1.8.2",
4
4
  "type": "module",
5
5
  "exports": {
6
6
  ".": {
@@ -118,11 +118,24 @@ export type ToolSchema = {
118
118
 
119
119
  export const EMPTY_PARAMS = z.object({});
120
120
 
121
+ /**
122
+ * Convert a Zod schema to the JSON Schema shape that S2S providers expect.
123
+ * Strips the `$schema` keyword: `z.toJSONSchema` (Zod v4) tags output with
124
+ * the JSON Schema 2020-12 dialect URI, and some Realtime/S2S providers
125
+ * either reject the field outright or ship it through to the underlying
126
+ * model with a malformed function spec — observed empirically as tool
127
+ * calls that arrive with `args: {}` even when required params are listed.
128
+ */
129
+ export function toToolJsonSchema(zodSchema: z.ZodTypeAny): JSONSchema7 {
130
+ const { $schema: _omit, ...rest } = z.toJSONSchema(zodSchema) as Record<string, unknown>;
131
+ return rest as JSONSchema7;
132
+ }
133
+
121
134
  export function agentToolsToSchemas(tools: Readonly<Record<string, ToolDef>>): ToolSchema[] {
122
135
  return Object.entries(tools).map(([name, def]) => ({
123
136
  type: "function",
124
137
  name,
125
138
  description: def.description,
126
- parameters: z.toJSONSchema(def.parameters ?? EMPTY_PARAMS) as JSONSchema7,
139
+ parameters: toToolJsonSchema(def.parameters ?? EMPTY_PARAMS),
127
140
  }));
128
141
  }