@alexkroman1/aai 1.8.0 → 1.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +5 -5
- package/CHANGELOG.md +19 -0
- package/dist/{_internal-types-CfOAbK6V.js → _internal-types-8v1qAa4A.js} +14 -2
- package/dist/host/runtime-barrel.js +106 -30
- package/dist/host/transports/openai-realtime-transport.d.ts +2 -0
- package/dist/host/transports/types.d.ts +7 -0
- package/dist/sdk/_internal-types.d.ts +9 -0
- package/dist/sdk/manifest-barrel.js +1 -1
- package/host/builtin-tools.ts +2 -2
- package/host/runtime.ts +10 -0
- package/host/s2s.ts +41 -22
- package/host/transports/openai-realtime-transport.test.ts +98 -30
- package/host/transports/openai-realtime-transport.ts +60 -8
- package/host/transports/pipeline-transport.ts +23 -0
- package/host/transports/types.ts +7 -0
- package/package.json +1 -1
- package/sdk/_internal-types.ts +14 -1
package/.turbo/turbo-build.log
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
|
|
2
|
-
> @alexkroman1/aai@1.8.
|
|
2
|
+
> @alexkroman1/aai@1.8.2 build /home/runner/work/agent/agent/packages/aai
|
|
3
3
|
> tsdown && tsc -p tsconfig.build.json
|
|
4
4
|
|
|
5
5
|
[34mℹ[39m [34mtsdown v0.21.7[39m powered by [38;2;255;126;23mrolldown v1.0.0-rc.12[39m
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
[34mℹ[39m target: [34mnode22[39m
|
|
9
9
|
[34mℹ[39m tsconfig: [34mtsconfig.json[39m
|
|
10
10
|
[34mℹ[39m Build start
|
|
11
|
-
[34mℹ[39m [2mdist/[22m[1mhost/runtime-barrel.js[22m [
|
|
11
|
+
[34mℹ[39m [2mdist/[22m[1mhost/runtime-barrel.js[22m [2m110.46 kB[22m [2m│ gzip: 30.38 kB[22m
|
|
12
12
|
[34mℹ[39m [2mdist/[22m[1msdk/protocol.js[22m [2m 5.70 kB[22m [2m│ gzip: 1.92 kB[22m
|
|
13
13
|
[34mℹ[39m [2mdist/[22m[1mindex.js[22m [2m 2.88 kB[22m [2m│ gzip: 1.24 kB[22m
|
|
14
14
|
[34mℹ[39m [2mdist/[22m[1msdk/manifest-barrel.js[22m [2m 0.36 kB[22m [2m│ gzip: 0.20 kB[22m
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
[34mℹ[39m [2mdist/[22m[1msdk/providers/tts-barrel.js[22m [2m 0.25 kB[22m [2m│ gzip: 0.16 kB[22m
|
|
19
19
|
[34mℹ[39m [2mdist/[22m[1msdk/providers/vector-barrel.js[22m [2m 0.22 kB[22m [2m│ gzip: 0.15 kB[22m
|
|
20
20
|
[34mℹ[39m [2mdist/[22m[1msdk/providers/s2s-barrel.js[22m [2m 0.15 kB[22m [2m│ gzip: 0.12 kB[22m
|
|
21
|
-
[34mℹ[39m [2mdist/[22m_internal-types-
|
|
21
|
+
[34mℹ[39m [2mdist/[22m_internal-types-8v1qAa4A.js [2m 6.04 kB[22m [2m│ gzip: 2.15 kB[22m
|
|
22
22
|
[34mℹ[39m [2mdist/[22mtypes-DOWVZhb9.js [2m 5.39 kB[22m [2m│ gzip: 2.27 kB[22m
|
|
23
23
|
[34mℹ[39m [2mdist/[22msoniox-BQdL0mB5.js [2m 2.03 kB[22m [2m│ gzip: 0.54 kB[22m
|
|
24
24
|
[34mℹ[39m [2mdist/[22mconstants-y68COEGj.js [2m 1.70 kB[22m [2m│ gzip: 0.76 kB[22m
|
|
@@ -28,5 +28,5 @@
|
|
|
28
28
|
[34mℹ[39m [2mdist/[22ms3-BtCMvCod.js [2m 0.76 kB[22m [2m│ gzip: 0.29 kB[22m
|
|
29
29
|
[34mℹ[39m [2mdist/[22mpinecone-CeJ69aRs.js [2m 0.48 kB[22m [2m│ gzip: 0.24 kB[22m
|
|
30
30
|
[34mℹ[39m [2mdist/[22mopenai-realtime-cjPAHMMx.js [2m 0.27 kB[22m [2m│ gzip: 0.19 kB[22m
|
|
31
|
-
[34mℹ[39m 20 files, total:
|
|
32
|
-
[32m✔[39m Build complete in [
|
|
31
|
+
[34mℹ[39m 20 files, total: 140.98 kB
|
|
32
|
+
[32m✔[39m Build complete in [32m42ms[39m
|
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,24 @@
|
|
|
1
1
|
# @alexkroman1/aai
|
|
2
2
|
|
|
3
|
+
## 1.8.2
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- bb06b4e: Fix S2S tool calls arriving with empty args. Strip the $schema keyword from Zod-generated JSON Schema for tool parameters — some S2S providers ship the dialect URI to the underlying model and emit tool calls with empty args even when required params are listed. Also accept both 'arguments' and 'args' field names on the wire. Pipeline transport now surfaces tool-result stream parts as tool_call_done so the client UI flips pending → done.
|
|
8
|
+
|
|
9
|
+
## 1.8.1
|
|
10
|
+
|
|
11
|
+
### Patch Changes
|
|
12
|
+
|
|
13
|
+
- ba8effb: Make OpenAI Realtime usable end-to-end on gpt-realtime-2:
|
|
14
|
+
|
|
15
|
+
- Accept GA-renamed audio/transcript server events (`response.output_audio.{delta,done}`, `response.output_audio_transcript.{delta,done}`) alongside the legacy `response.audio.*` names so audio and transcript reach the client.
|
|
16
|
+
- Trigger the agent's `greeting` on connect by sending a one-shot `response.create` with quoted instructions, and honor `skipGreeting` so resumed sessions don't replay it.
|
|
17
|
+
- Coalesce `response.create` across multiple `sendToolResult` calls in the same tick. Multi-tool turns previously sent one `response.create` per tool, the second of which OpenAI rejected as `conversation_already_has_active_response`, stranding the turn so the model never received the tool results.
|
|
18
|
+
- Log unhandled event types and the full payload of `error` events to make silently rejected `session.update` fields visible.
|
|
19
|
+
|
|
20
|
+
- f4cc5ef: Migrate OpenAI Realtime transport to GA API schema (gpt-realtime-2). Drop OpenAI-Beta: realtime=v1 connect header and update session.update to session.type=realtime, output_modalities, and nested audio.input/audio.output with audio/pcm format.
|
|
21
|
+
|
|
3
22
|
## 1.8.0
|
|
4
23
|
|
|
5
24
|
### Minor Changes
|
|
@@ -128,13 +128,25 @@ const ToolSchemaSchema = z.object({
|
|
|
128
128
|
parameters: z.record(z.string(), z.unknown())
|
|
129
129
|
});
|
|
130
130
|
const EMPTY_PARAMS = z.object({});
|
|
131
|
+
/**
|
|
132
|
+
* Convert a Zod schema to the JSON Schema shape that S2S providers expect.
|
|
133
|
+
* Strips the `$schema` keyword: `z.toJSONSchema` (Zod v4) tags output with
|
|
134
|
+
* the JSON Schema 2020-12 dialect URI, and some Realtime/S2S providers
|
|
135
|
+
* either reject the field outright or ship it through to the underlying
|
|
136
|
+
* model with a malformed function spec — observed empirically as tool
|
|
137
|
+
* calls that arrive with `args: {}` even when required params are listed.
|
|
138
|
+
*/
|
|
139
|
+
function toToolJsonSchema(zodSchema) {
|
|
140
|
+
const { $schema: _omit, ...rest } = z.toJSONSchema(zodSchema);
|
|
141
|
+
return rest;
|
|
142
|
+
}
|
|
131
143
|
function agentToolsToSchemas(tools) {
|
|
132
144
|
return Object.entries(tools).map(([name, def]) => ({
|
|
133
145
|
type: "function",
|
|
134
146
|
name,
|
|
135
147
|
description: def.description,
|
|
136
|
-
parameters:
|
|
148
|
+
parameters: toToolJsonSchema(def.parameters ?? EMPTY_PARAMS)
|
|
137
149
|
}));
|
|
138
150
|
}
|
|
139
151
|
//#endregion
|
|
140
|
-
export { toAgentConfig as a,
|
|
152
|
+
export { toAgentConfig as a, assertProviderTriple as c, agentToolsToSchemas as i, makeSttError as l, EMPTY_PARAMS as n, toToolJsonSchema as o, ToolSchemaSchema as r, ProviderDescriptorSchema as s, AgentConfigSchema as t, makeTtsError as u };
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { r as DEFAULT_SYSTEM_PROMPT } from "../types-DOWVZhb9.js";
|
|
2
|
-
import { _ as TOOL_EXECUTION_TIMEOUT_MS, a as DEFAULT_SHUTDOWN_TIMEOUT_MS, c as FETCH_TIMEOUT_MS, d as MAX_PAGE_CHARS, g as RUN_CODE_TIMEOUT_MS, h as PIPELINE_FLUSH_TIMEOUT_MS, l as MAX_HTML_BYTES, m as MAX_WS_PAYLOAD_BYTES, o as DEFAULT_STT_SAMPLE_RATE, p as MAX_VALUE_SIZE, s as DEFAULT_TTS_SAMPLE_RATE, t as AGENT_CSP } from "../constants-y68COEGj.js";
|
|
2
|
+
import { _ as TOOL_EXECUTION_TIMEOUT_MS, a as DEFAULT_SHUTDOWN_TIMEOUT_MS, c as FETCH_TIMEOUT_MS, d as MAX_PAGE_CHARS, f as MAX_TOOL_RESULT_CHARS, g as RUN_CODE_TIMEOUT_MS, h as PIPELINE_FLUSH_TIMEOUT_MS, l as MAX_HTML_BYTES, m as MAX_WS_PAYLOAD_BYTES, o as DEFAULT_STT_SAMPLE_RATE, p as MAX_VALUE_SIZE, s as DEFAULT_TTS_SAMPLE_RATE, t as AGENT_CSP } from "../constants-y68COEGj.js";
|
|
3
3
|
import { i as toolError, n as errorDetail, r as errorMessage, t as parseWsUpgradeParams } from "../ws-upgrade-CG8-by1n.js";
|
|
4
4
|
import { ClientMessageSchema, VectorRequestSchema, buildReadyConfig, lenientParse } from "../sdk/protocol.js";
|
|
5
|
-
import { a as toAgentConfig, c as
|
|
5
|
+
import { a as toAgentConfig, c as assertProviderTriple, i as agentToolsToSchemas, l as makeSttError, n as EMPTY_PARAMS, o as toToolJsonSchema, u as makeTtsError } from "../_internal-types-8v1qAa4A.js";
|
|
6
6
|
import { a as MISTRAL_KIND, d as ANTHROPIC_KIND, l as GOOGLE_KIND, r as OPENAI_KIND, s as GROQ_KIND } from "../xai-BDI61Y2M.js";
|
|
7
7
|
import { a as DEEPGRAM_KIND, r as ELEVENLABS_KIND, s as ASSEMBLYAI_KIND, t as SONIOX_KIND } from "../soniox-BQdL0mB5.js";
|
|
8
8
|
import { a as CARTESIA_KIND, n as RIME_KIND } from "../rime-58p9mDR8.js";
|
|
@@ -311,7 +311,7 @@ function resolveAllBuiltins(names, opts) {
|
|
|
311
311
|
type: "function",
|
|
312
312
|
name: toolName,
|
|
313
313
|
description: def.description,
|
|
314
|
-
parameters:
|
|
314
|
+
parameters: toToolJsonSchema(def.parameters ?? EMPTY_PARAMS)
|
|
315
315
|
});
|
|
316
316
|
if (def.guidance) guidance.push(def.guidance);
|
|
317
317
|
}
|
|
@@ -1681,6 +1681,7 @@ function createOpenaiRealtimeTransport(opts) {
|
|
|
1681
1681
|
const agentTranscriptBuffers = /* @__PURE__ */ new Map();
|
|
1682
1682
|
const toolBuffers = /* @__PURE__ */ new Map();
|
|
1683
1683
|
let currentResponseId = null;
|
|
1684
|
+
let responseCreateQueued = false;
|
|
1684
1685
|
function send(payload) {
|
|
1685
1686
|
if (!ws || ws.readyState !== 1) {
|
|
1686
1687
|
log.debug("OpenAI Realtime send dropped: socket not open", { type: payload.type });
|
|
@@ -1688,17 +1689,39 @@ function createOpenaiRealtimeTransport(opts) {
|
|
|
1688
1689
|
}
|
|
1689
1690
|
ws.send(JSON.stringify(payload));
|
|
1690
1691
|
}
|
|
1692
|
+
function sendGreeting() {
|
|
1693
|
+
if (opts.skipGreeting) return;
|
|
1694
|
+
const greeting = opts.sessionConfig.greeting;
|
|
1695
|
+
if (!greeting) return;
|
|
1696
|
+
send({
|
|
1697
|
+
type: "response.create",
|
|
1698
|
+
response: { instructions: `Say exactly: ${JSON.stringify(greeting)}` }
|
|
1699
|
+
});
|
|
1700
|
+
}
|
|
1691
1701
|
function sendSessionUpdate() {
|
|
1692
1702
|
send({
|
|
1693
1703
|
type: "session.update",
|
|
1694
1704
|
session: {
|
|
1695
|
-
|
|
1696
|
-
|
|
1705
|
+
type: "realtime",
|
|
1706
|
+
output_modalities: ["audio"],
|
|
1697
1707
|
instructions: opts.sessionConfig.systemPrompt,
|
|
1698
|
-
|
|
1699
|
-
|
|
1700
|
-
|
|
1701
|
-
|
|
1708
|
+
audio: {
|
|
1709
|
+
input: {
|
|
1710
|
+
format: {
|
|
1711
|
+
type: "audio/pcm",
|
|
1712
|
+
rate: 24e3
|
|
1713
|
+
},
|
|
1714
|
+
turn_detection: { type: "server_vad" },
|
|
1715
|
+
transcription: { model: "whisper-1" }
|
|
1716
|
+
},
|
|
1717
|
+
output: {
|
|
1718
|
+
format: {
|
|
1719
|
+
type: "audio/pcm",
|
|
1720
|
+
rate: 24e3
|
|
1721
|
+
},
|
|
1722
|
+
voice
|
|
1723
|
+
}
|
|
1724
|
+
},
|
|
1702
1725
|
tools: opts.toolSchemas,
|
|
1703
1726
|
tool_choice: opts.toolChoice
|
|
1704
1727
|
}
|
|
@@ -1708,15 +1731,13 @@ function createOpenaiRealtimeTransport(opts) {
|
|
|
1708
1731
|
const url = `${baseUrl}?model=${encodeURIComponent(model)}`;
|
|
1709
1732
|
log.info("OpenAI Realtime connecting", { url });
|
|
1710
1733
|
return new Promise((resolve, reject) => {
|
|
1711
|
-
const sock = createWs(url, { headers: {
|
|
1712
|
-
Authorization: `Bearer ${opts.apiKey}`,
|
|
1713
|
-
"OpenAI-Beta": "realtime=v1"
|
|
1714
|
-
} });
|
|
1734
|
+
const sock = createWs(url, { headers: { Authorization: `Bearer ${opts.apiKey}` } });
|
|
1715
1735
|
ws = sock;
|
|
1716
1736
|
let opened = false;
|
|
1717
1737
|
sock.addEventListener("open", () => {
|
|
1718
1738
|
opened = true;
|
|
1719
1739
|
sendSessionUpdate();
|
|
1740
|
+
sendGreeting();
|
|
1720
1741
|
resolve();
|
|
1721
1742
|
});
|
|
1722
1743
|
sock.addEventListener("message", (ev) => handleMessage(ev.data));
|
|
@@ -1773,11 +1794,17 @@ function createOpenaiRealtimeTransport(opts) {
|
|
|
1773
1794
|
function handleErrorEvent(obj) {
|
|
1774
1795
|
const err = obj.error;
|
|
1775
1796
|
const message = typeof err?.message === "string" ? err.message : "OpenAI Realtime error";
|
|
1797
|
+
log.warn("OpenAI Realtime error event", { error: obj.error });
|
|
1776
1798
|
clearTurnBuffers();
|
|
1777
1799
|
opts.callbacks.onError("internal", message);
|
|
1778
1800
|
}
|
|
1779
1801
|
function handleOutputItemAdded(obj) {
|
|
1780
1802
|
const item = obj.item;
|
|
1803
|
+
log.info("OpenAI Realtime output_item.added", {
|
|
1804
|
+
itemType: item?.type,
|
|
1805
|
+
name: item?.name,
|
|
1806
|
+
callId: item?.call_id
|
|
1807
|
+
});
|
|
1781
1808
|
if (item?.type !== "function_call" || !item.id) return;
|
|
1782
1809
|
toolBuffers.set(item.id, {
|
|
1783
1810
|
callId: item.call_id ?? "",
|
|
@@ -1810,7 +1837,13 @@ function createOpenaiRealtimeTransport(opts) {
|
|
|
1810
1837
|
toolBuffers.delete(id);
|
|
1811
1838
|
const callId = asString(obj.call_id) || (buf?.callId ?? "");
|
|
1812
1839
|
const name = asString(obj.name) || (buf?.name ?? "");
|
|
1813
|
-
const
|
|
1840
|
+
const argsStr = asString(obj.arguments) || (buf?.argsBuffer ?? "");
|
|
1841
|
+
log.info("OpenAI Realtime tool call", {
|
|
1842
|
+
name,
|
|
1843
|
+
callId,
|
|
1844
|
+
args: argsStr
|
|
1845
|
+
});
|
|
1846
|
+
const args = parseToolArgs(argsStr, name, callId);
|
|
1814
1847
|
opts.callbacks.onToolCall(callId, name, args);
|
|
1815
1848
|
}
|
|
1816
1849
|
function handleMessage(data) {
|
|
@@ -1824,9 +1857,11 @@ function createOpenaiRealtimeTransport(opts) {
|
|
|
1824
1857
|
if (typeof raw !== "object" || raw === null) return;
|
|
1825
1858
|
const obj = raw;
|
|
1826
1859
|
switch (obj.type) {
|
|
1860
|
+
case "response.output_audio.delta":
|
|
1827
1861
|
case "response.audio.delta":
|
|
1828
1862
|
handleAudioDelta(obj);
|
|
1829
1863
|
return;
|
|
1864
|
+
case "response.output_audio.done":
|
|
1830
1865
|
case "response.audio.done":
|
|
1831
1866
|
opts.callbacks.onAudioDone();
|
|
1832
1867
|
return;
|
|
@@ -1842,9 +1877,11 @@ function createOpenaiRealtimeTransport(opts) {
|
|
|
1842
1877
|
case "response.created":
|
|
1843
1878
|
handleResponseCreated(obj);
|
|
1844
1879
|
return;
|
|
1880
|
+
case "response.output_audio_transcript.delta":
|
|
1845
1881
|
case "response.audio_transcript.delta":
|
|
1846
1882
|
handleAgentTranscriptDelta(obj);
|
|
1847
1883
|
return;
|
|
1884
|
+
case "response.output_audio_transcript.done":
|
|
1848
1885
|
case "response.audio_transcript.done":
|
|
1849
1886
|
handleAgentTranscriptDone(obj);
|
|
1850
1887
|
return;
|
|
@@ -1863,7 +1900,9 @@ function createOpenaiRealtimeTransport(opts) {
|
|
|
1863
1900
|
case "error":
|
|
1864
1901
|
handleErrorEvent(obj);
|
|
1865
1902
|
return;
|
|
1866
|
-
default:
|
|
1903
|
+
default:
|
|
1904
|
+
log.debug("OpenAI Realtime: unhandled event", { type: obj.type });
|
|
1905
|
+
return;
|
|
1867
1906
|
}
|
|
1868
1907
|
}
|
|
1869
1908
|
function handleClose(code, reason) {
|
|
@@ -1893,6 +1932,11 @@ function createOpenaiRealtimeTransport(opts) {
|
|
|
1893
1932
|
ws.send(`{"type":"input_audio_buffer.append","audio":"${uint8ToBase64(bytes)}"}`);
|
|
1894
1933
|
},
|
|
1895
1934
|
sendToolResult(callId, result) {
|
|
1935
|
+
log.info("OpenAI Realtime sendToolResult", {
|
|
1936
|
+
callId,
|
|
1937
|
+
resultLen: result.length,
|
|
1938
|
+
preview: result.slice(0, 200)
|
|
1939
|
+
});
|
|
1896
1940
|
send({
|
|
1897
1941
|
type: "conversation.item.create",
|
|
1898
1942
|
item: {
|
|
@@ -1901,7 +1945,13 @@ function createOpenaiRealtimeTransport(opts) {
|
|
|
1901
1945
|
output: result
|
|
1902
1946
|
}
|
|
1903
1947
|
});
|
|
1904
|
-
|
|
1948
|
+
if (!responseCreateQueued) {
|
|
1949
|
+
responseCreateQueued = true;
|
|
1950
|
+
queueMicrotask(() => {
|
|
1951
|
+
responseCreateQueued = false;
|
|
1952
|
+
send({ type: "response.create" });
|
|
1953
|
+
});
|
|
1954
|
+
}
|
|
1905
1955
|
},
|
|
1906
1956
|
cancelReply() {
|
|
1907
1957
|
if (currentResponseId === null) return;
|
|
@@ -2091,6 +2141,14 @@ function createPipelineTransport(opts) {
|
|
|
2091
2141
|
onDelta(out);
|
|
2092
2142
|
ttsSession?.sendText(out);
|
|
2093
2143
|
}
|
|
2144
|
+
function emitToolResult(part) {
|
|
2145
|
+
const callId = part.toolCallId ?? "";
|
|
2146
|
+
if (!callId) return;
|
|
2147
|
+
const raw = part.output ?? part.result ?? "";
|
|
2148
|
+
const str = typeof raw === "string" ? raw : JSON.stringify(raw);
|
|
2149
|
+
const truncated = str.length > 4e3 ? str.slice(0, MAX_TOOL_RESULT_CHARS) : str;
|
|
2150
|
+
callbacks.onToolCallDone?.(callId, truncated);
|
|
2151
|
+
}
|
|
2094
2152
|
return function handlePart(part) {
|
|
2095
2153
|
switch (part.type) {
|
|
2096
2154
|
case "text-delta":
|
|
@@ -2104,6 +2162,9 @@ function createPipelineTransport(opts) {
|
|
|
2104
2162
|
callbacks.onToolCall(part.toolCallId ?? "", part.toolName ?? "", input);
|
|
2105
2163
|
return;
|
|
2106
2164
|
}
|
|
2165
|
+
case "tool-result":
|
|
2166
|
+
emitToolResult(part);
|
|
2167
|
+
return;
|
|
2107
2168
|
case "error": {
|
|
2108
2169
|
const msg = errorMessage(part.error);
|
|
2109
2170
|
log.error("LLM stream error", {
|
|
@@ -2346,8 +2407,14 @@ const S2sMessageSchema = z.discriminatedUnion("type", [
|
|
|
2346
2407
|
type: z.literal("tool.call"),
|
|
2347
2408
|
call_id: z.string(),
|
|
2348
2409
|
name: z.string(),
|
|
2349
|
-
|
|
2350
|
-
|
|
2410
|
+
arguments: z.record(z.string(), z.unknown()).optional(),
|
|
2411
|
+
args: z.record(z.string(), z.unknown()).optional()
|
|
2412
|
+
}).transform((m) => ({
|
|
2413
|
+
type: m.type,
|
|
2414
|
+
call_id: m.call_id,
|
|
2415
|
+
name: m.name,
|
|
2416
|
+
args: m.arguments ?? m.args ?? {}
|
|
2417
|
+
})),
|
|
2351
2418
|
z.object({
|
|
2352
2419
|
type: z.literal("reply.done"),
|
|
2353
2420
|
status: z.string().optional()
|
|
@@ -2495,6 +2562,20 @@ function connectS2s(opts) {
|
|
|
2495
2562
|
if (type === "reply.audio" || type === "input.audio" || type === "reply.done" || type === "session.error") return;
|
|
2496
2563
|
log.info(`S2S << ${type}`);
|
|
2497
2564
|
}
|
|
2565
|
+
function handleObject(obj, raw) {
|
|
2566
|
+
logIncoming(obj.type);
|
|
2567
|
+
if (obj.type === "tool.call") log.info("S2S << tool.call payload", { payload: JSON.stringify(obj) });
|
|
2568
|
+
if (obj.type === "reply.audio" && typeof obj.data === "string") {
|
|
2569
|
+
callbacks.onAudio(base64ToUint8(obj.data));
|
|
2570
|
+
return;
|
|
2571
|
+
}
|
|
2572
|
+
const parsed = parseS2sMessage(obj);
|
|
2573
|
+
if (!parsed) {
|
|
2574
|
+
log.warn(`S2S << unrecognised message type: ${obj.type ?? JSON.stringify(raw).slice(0, 200)}`);
|
|
2575
|
+
return;
|
|
2576
|
+
}
|
|
2577
|
+
dispatchS2sMessage(callbacks, parsed, dispatchState, dispatchCtx);
|
|
2578
|
+
}
|
|
2498
2579
|
ws.addEventListener("message", (ev) => {
|
|
2499
2580
|
let raw;
|
|
2500
2581
|
try {
|
|
@@ -2507,18 +2588,7 @@ function connectS2s(opts) {
|
|
|
2507
2588
|
log.warn("S2S << non-object JSON message", { type: typeof raw });
|
|
2508
2589
|
return;
|
|
2509
2590
|
}
|
|
2510
|
-
|
|
2511
|
-
logIncoming(obj.type);
|
|
2512
|
-
if (obj.type === "reply.audio" && typeof obj.data === "string") {
|
|
2513
|
-
callbacks.onAudio(base64ToUint8(obj.data));
|
|
2514
|
-
return;
|
|
2515
|
-
}
|
|
2516
|
-
const parsed = parseS2sMessage(obj);
|
|
2517
|
-
if (!parsed) {
|
|
2518
|
-
log.warn(`S2S << unrecognised message type: ${obj.type ?? JSON.stringify(raw).slice(0, 200)}`);
|
|
2519
|
-
return;
|
|
2520
|
-
}
|
|
2521
|
-
dispatchS2sMessage(callbacks, parsed, dispatchState, dispatchCtx);
|
|
2591
|
+
handleObject(raw, raw);
|
|
2522
2592
|
});
|
|
2523
2593
|
ws.addEventListener("close", (ev) => {
|
|
2524
2594
|
const code = ev.code ?? 0;
|
|
@@ -3085,6 +3155,7 @@ function createRuntime(opts) {
|
|
|
3085
3155
|
callbacks,
|
|
3086
3156
|
sid: sessionOpts.id,
|
|
3087
3157
|
agent: sessionOpts.agent,
|
|
3158
|
+
skipGreeting: sessionOpts.skipGreeting ?? false,
|
|
3088
3159
|
...createOpenaiRealtimeWebSocket ? { createWebSocket: createOpenaiRealtimeWebSocket } : {},
|
|
3089
3160
|
logger
|
|
3090
3161
|
});
|
|
@@ -3149,6 +3220,11 @@ function createRuntime(opts) {
|
|
|
3149
3220
|
toolName: name,
|
|
3150
3221
|
args
|
|
3151
3222
|
}) : (id, name, args) => bindCore().onToolCall(id, name, args),
|
|
3223
|
+
...isPipeline ? { onToolCallDone: (id, result) => sessionOpts.client.event({
|
|
3224
|
+
type: "tool_call_done",
|
|
3225
|
+
toolCallId: id,
|
|
3226
|
+
result
|
|
3227
|
+
}) } : {},
|
|
3152
3228
|
onError: (code, message) => bindCore().onError(code, message),
|
|
3153
3229
|
onSpeechStarted: () => bindCore().onSpeechStarted(),
|
|
3154
3230
|
onSpeechStopped: () => bindCore().onSpeechStopped()
|
|
@@ -37,6 +37,8 @@ export type OpenaiRealtimeTransportOptions = {
|
|
|
37
37
|
callbacks: TransportCallbacks;
|
|
38
38
|
sid: string;
|
|
39
39
|
agent: string;
|
|
40
|
+
/** Skip the initial greeting (used for session resume). */
|
|
41
|
+
skipGreeting?: boolean;
|
|
40
42
|
createWebSocket?: CreateOpenaiRealtimeWebSocket;
|
|
41
43
|
logger?: Logger;
|
|
42
44
|
};
|
|
@@ -13,6 +13,13 @@ export type TransportCallbacks = {
|
|
|
13
13
|
onUserTranscript(text: string): void;
|
|
14
14
|
onAgentTranscript(text: string, interrupted: boolean): void;
|
|
15
15
|
onToolCall(callId: string, name: string, args: Record<string, unknown>): void;
|
|
16
|
+
/**
|
|
17
|
+
* Tool execution finished. Pipeline mode invokes this from the
|
|
18
|
+
* `tool-result` stream part so the client UI can mark the call done.
|
|
19
|
+
* S2S transports leave this unset — SessionCore.onToolCall emits the
|
|
20
|
+
* `tool_call_done` event itself after dispatching the tool.
|
|
21
|
+
*/
|
|
22
|
+
onToolCallDone?(callId: string, result: string): void;
|
|
16
23
|
onError(code: SessionErrorCode, message: string): void;
|
|
17
24
|
onSpeechStarted(): void;
|
|
18
25
|
onSpeechStopped(): void;
|
|
@@ -85,4 +85,13 @@ export type ToolSchema = {
|
|
|
85
85
|
parameters: JSONSchema7;
|
|
86
86
|
};
|
|
87
87
|
export declare const EMPTY_PARAMS: z.ZodObject<{}, z.core.$strip>;
|
|
88
|
+
/**
|
|
89
|
+
* Convert a Zod schema to the JSON Schema shape that S2S providers expect.
|
|
90
|
+
* Strips the `$schema` keyword: `z.toJSONSchema` (Zod v4) tags output with
|
|
91
|
+
* the JSON Schema 2020-12 dialect URI, and some Realtime/S2S providers
|
|
92
|
+
* either reject the field outright or ship it through to the underlying
|
|
93
|
+
* model with a malformed function spec — observed empirically as tool
|
|
94
|
+
* calls that arrive with `args: {}` even when required params are listed.
|
|
95
|
+
*/
|
|
96
|
+
export declare function toToolJsonSchema(zodSchema: z.ZodTypeAny): JSONSchema7;
|
|
88
97
|
export declare function agentToolsToSchemas(tools: Readonly<Record<string, ToolDef>>): ToolSchema[];
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { a as toAgentConfig,
|
|
1
|
+
import { a as toAgentConfig, c as assertProviderTriple, i as agentToolsToSchemas, n as EMPTY_PARAMS, r as ToolSchemaSchema, s as ProviderDescriptorSchema, t as AgentConfigSchema } from "../_internal-types-8v1qAa4A.js";
|
|
2
2
|
export { AgentConfigSchema, EMPTY_PARAMS, ProviderDescriptorSchema, ToolSchemaSchema, agentToolsToSchemas, assertProviderTriple, toAgentConfig };
|
package/host/builtin-tools.ts
CHANGED
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
|
|
10
10
|
import { convert } from "html-to-text";
|
|
11
11
|
import { z } from "zod";
|
|
12
|
-
import { EMPTY_PARAMS, type ToolSchema } from "../sdk/_internal-types.ts";
|
|
12
|
+
import { EMPTY_PARAMS, type ToolSchema, toToolJsonSchema } from "../sdk/_internal-types.ts";
|
|
13
13
|
import { FETCH_TIMEOUT_MS, MAX_HTML_BYTES, MAX_PAGE_CHARS } from "../sdk/constants.ts";
|
|
14
14
|
import type { ToolDef } from "../sdk/types.ts";
|
|
15
15
|
import { createRunCode } from "./_run-code.ts";
|
|
@@ -242,7 +242,7 @@ export function resolveAllBuiltins(
|
|
|
242
242
|
type: "function",
|
|
243
243
|
name: toolName,
|
|
244
244
|
description: def.description,
|
|
245
|
-
parameters:
|
|
245
|
+
parameters: toToolJsonSchema(def.parameters ?? EMPTY_PARAMS) as ToolSchema["parameters"],
|
|
246
246
|
});
|
|
247
247
|
if (def.guidance) guidance.push(def.guidance);
|
|
248
248
|
}
|
package/host/runtime.ts
CHANGED
|
@@ -432,6 +432,7 @@ export function createRuntime(opts: RuntimeOptions): Runtime {
|
|
|
432
432
|
callbacks,
|
|
433
433
|
sid: sessionOpts.id,
|
|
434
434
|
agent: sessionOpts.agent,
|
|
435
|
+
skipGreeting: sessionOpts.skipGreeting ?? false,
|
|
435
436
|
...(createOpenaiRealtimeWebSocket ? { createWebSocket: createOpenaiRealtimeWebSocket } : {}),
|
|
436
437
|
logger,
|
|
437
438
|
});
|
|
@@ -512,6 +513,15 @@ export function createRuntime(opts: RuntimeOptions): Runtime {
|
|
|
512
513
|
? (id, name, args) =>
|
|
513
514
|
sessionOpts.client.event({ type: "tool_call", toolCallId: id, toolName: name, args })
|
|
514
515
|
: (id, name, args) => bindCore().onToolCall(id, name, args),
|
|
516
|
+
// Pipeline: emit `tool_call_done` when streamText surfaces the
|
|
517
|
+
// `tool-result` part so the UI can flip status from pending → done.
|
|
518
|
+
// S2S transports never set this; SessionCore.onToolCall emits done itself.
|
|
519
|
+
...(isPipeline
|
|
520
|
+
? {
|
|
521
|
+
onToolCallDone: (id: string, result: string) =>
|
|
522
|
+
sessionOpts.client.event({ type: "tool_call_done", toolCallId: id, result }),
|
|
523
|
+
}
|
|
524
|
+
: {}),
|
|
515
525
|
onError: (code, message) => bindCore().onError(code, message),
|
|
516
526
|
onSpeechStarted: () => bindCore().onSpeechStarted(),
|
|
517
527
|
onSpeechStopped: () => bindCore().onSpeechStopped(),
|
package/host/s2s.ts
CHANGED
|
@@ -55,12 +55,23 @@ const S2sMessageSchema = z.discriminatedUnion("type", [
|
|
|
55
55
|
item_id: z.string().optional().default(""),
|
|
56
56
|
interrupted: z.boolean().optional().default(false),
|
|
57
57
|
}),
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
58
|
+
// AssemblyAI's S2S protocol delivers tool args under `arguments`; older
|
|
59
|
+
// implementations and our internal tests use `args`. Accept either, with
|
|
60
|
+
// `arguments` taking precedence so the live wire format wins.
|
|
61
|
+
z
|
|
62
|
+
.object({
|
|
63
|
+
type: z.literal("tool.call"),
|
|
64
|
+
call_id: z.string(),
|
|
65
|
+
name: z.string(),
|
|
66
|
+
arguments: z.record(z.string(), z.unknown()).optional(),
|
|
67
|
+
args: z.record(z.string(), z.unknown()).optional(),
|
|
68
|
+
})
|
|
69
|
+
.transform((m) => ({
|
|
70
|
+
type: m.type,
|
|
71
|
+
call_id: m.call_id,
|
|
72
|
+
name: m.name,
|
|
73
|
+
args: m.arguments ?? m.args ?? {},
|
|
74
|
+
})),
|
|
64
75
|
z.object({ type: z.literal("reply.done"), status: z.string().optional() }),
|
|
65
76
|
z.object({ type: z.literal("session.error"), code: z.string(), message: z.string() }),
|
|
66
77
|
z.object({ type: z.literal("error"), message: z.string() }),
|
|
@@ -303,27 +314,20 @@ export function connectS2s(opts: ConnectS2sOptions): Promise<S2sHandle> {
|
|
|
303
314
|
log.info(`S2S << ${type}`);
|
|
304
315
|
}
|
|
305
316
|
|
|
306
|
-
|
|
307
|
-
let raw: unknown;
|
|
308
|
-
try {
|
|
309
|
-
raw = JSON.parse(String(ev.data));
|
|
310
|
-
} catch {
|
|
311
|
-
log.warn("S2S << invalid JSON", { data: String(ev.data).slice(0, 200) });
|
|
312
|
-
return;
|
|
313
|
-
}
|
|
314
|
-
|
|
315
|
-
if (typeof raw !== "object" || raw === null || Array.isArray(raw)) {
|
|
316
|
-
log.warn("S2S << non-object JSON message", { type: typeof raw });
|
|
317
|
-
return;
|
|
318
|
-
}
|
|
319
|
-
const obj = raw as Record<string, unknown>;
|
|
317
|
+
function handleObject(obj: Record<string, unknown>, raw: unknown): void {
|
|
320
318
|
logIncoming(obj.type);
|
|
321
|
-
|
|
319
|
+
// Log the full tool.call payload so we can diagnose provider-side
|
|
320
|
+
// empty-args problems — the underlying LLM emitting a function call
|
|
321
|
+
// without populating its required parameters. Without the full
|
|
322
|
+
// payload we cannot tell apart "field-name mismatch" from
|
|
323
|
+
// "model emitted no args."
|
|
324
|
+
if (obj.type === "tool.call") {
|
|
325
|
+
log.info("S2S << tool.call payload", { payload: JSON.stringify(obj) });
|
|
326
|
+
}
|
|
322
327
|
if (obj.type === "reply.audio" && typeof obj.data === "string") {
|
|
323
328
|
callbacks.onAudio(base64ToUint8(obj.data));
|
|
324
329
|
return;
|
|
325
330
|
}
|
|
326
|
-
|
|
327
331
|
const parsed = parseS2sMessage(obj);
|
|
328
332
|
if (!parsed) {
|
|
329
333
|
log.warn(
|
|
@@ -332,6 +336,21 @@ export function connectS2s(opts: ConnectS2sOptions): Promise<S2sHandle> {
|
|
|
332
336
|
return;
|
|
333
337
|
}
|
|
334
338
|
dispatchS2sMessage(callbacks, parsed, dispatchState, dispatchCtx);
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
ws.addEventListener("message", (ev) => {
|
|
342
|
+
let raw: unknown;
|
|
343
|
+
try {
|
|
344
|
+
raw = JSON.parse(String(ev.data));
|
|
345
|
+
} catch {
|
|
346
|
+
log.warn("S2S << invalid JSON", { data: String(ev.data).slice(0, 200) });
|
|
347
|
+
return;
|
|
348
|
+
}
|
|
349
|
+
if (typeof raw !== "object" || raw === null || Array.isArray(raw)) {
|
|
350
|
+
log.warn("S2S << non-object JSON message", { type: typeof raw });
|
|
351
|
+
return;
|
|
352
|
+
}
|
|
353
|
+
handleObject(raw as Record<string, unknown>, raw);
|
|
335
354
|
});
|
|
336
355
|
|
|
337
356
|
ws.addEventListener("close", (ev) => {
|
|
@@ -83,7 +83,6 @@ describe("openai-realtime-transport: connect and session.update", () => {
|
|
|
83
83
|
options: { model: "gpt-realtime", voice: "cedar" },
|
|
84
84
|
sessionConfig: {
|
|
85
85
|
systemPrompt: "Be terse.",
|
|
86
|
-
greeting: "Hi.",
|
|
87
86
|
tools: [],
|
|
88
87
|
},
|
|
89
88
|
toolSchemas: [
|
|
@@ -109,10 +108,7 @@ describe("openai-realtime-transport: connect and session.update", () => {
|
|
|
109
108
|
expect(createWs).toHaveBeenCalledWith(
|
|
110
109
|
"wss://api.openai.com/v1/realtime?model=gpt-realtime",
|
|
111
110
|
expect.objectContaining({
|
|
112
|
-
headers:
|
|
113
|
-
Authorization: "Bearer sk-test",
|
|
114
|
-
"OpenAI-Beta": "realtime=v1",
|
|
115
|
-
}),
|
|
111
|
+
headers: { Authorization: "Bearer sk-test" },
|
|
116
112
|
}),
|
|
117
113
|
);
|
|
118
114
|
|
|
@@ -121,13 +117,14 @@ describe("openai-realtime-transport: connect and session.update", () => {
|
|
|
121
117
|
if (first === undefined) throw new Error("expected one send");
|
|
122
118
|
const msg = JSON.parse(first);
|
|
123
119
|
expect(msg.type).toBe("session.update");
|
|
124
|
-
expect(msg.session.
|
|
120
|
+
expect(msg.session.type).toBe("realtime");
|
|
121
|
+
expect(msg.session.output_modalities).toEqual(["audio"]);
|
|
125
122
|
expect(msg.session.instructions).toBe("Be terse.");
|
|
126
|
-
expect(msg.session.
|
|
127
|
-
expect(msg.session.
|
|
128
|
-
expect(msg.session.
|
|
129
|
-
expect(msg.session.
|
|
130
|
-
expect(msg.session.
|
|
123
|
+
expect(msg.session.audio.input.format).toEqual({ type: "audio/pcm", rate: 24_000 });
|
|
124
|
+
expect(msg.session.audio.input.turn_detection.type).toBe("server_vad");
|
|
125
|
+
expect(msg.session.audio.input.transcription).toEqual({ model: "whisper-1" });
|
|
126
|
+
expect(msg.session.audio.output.format).toEqual({ type: "audio/pcm", rate: 24_000 });
|
|
127
|
+
expect(msg.session.audio.output.voice).toBe("cedar");
|
|
131
128
|
expect(msg.session.tools).toEqual([
|
|
132
129
|
expect.objectContaining({ type: "function", name: "lookup" }),
|
|
133
130
|
]);
|
|
@@ -135,6 +132,57 @@ describe("openai-realtime-transport: connect and session.update", () => {
|
|
|
135
132
|
});
|
|
136
133
|
});
|
|
137
134
|
|
|
135
|
+
describe("greeting", () => {
|
|
136
|
+
function makeWithGreeting(args: { greeting?: string; skipGreeting?: boolean }) {
|
|
137
|
+
const fake = makeFakeWs();
|
|
138
|
+
const transport = createOpenaiRealtimeTransport({
|
|
139
|
+
apiKey: "sk",
|
|
140
|
+
options: {},
|
|
141
|
+
sessionConfig: {
|
|
142
|
+
systemPrompt: "",
|
|
143
|
+
...(args.greeting !== undefined ? { greeting: args.greeting } : {}),
|
|
144
|
+
},
|
|
145
|
+
toolSchemas: [],
|
|
146
|
+
toolChoice: "auto",
|
|
147
|
+
callbacks: noopCallbacks(),
|
|
148
|
+
sid: "s",
|
|
149
|
+
agent: "a",
|
|
150
|
+
...(args.skipGreeting !== undefined ? { skipGreeting: args.skipGreeting } : {}),
|
|
151
|
+
createWebSocket: () => fake,
|
|
152
|
+
logger: silentLogger,
|
|
153
|
+
});
|
|
154
|
+
const ready = transport.start();
|
|
155
|
+
fake.fire("open");
|
|
156
|
+
return { fake, ready };
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
test("sends response.create with quoted greeting after session.update", async () => {
|
|
160
|
+
const { fake, ready } = makeWithGreeting({ greeting: 'Hello, "friend".' });
|
|
161
|
+
await ready;
|
|
162
|
+
expect(fake.sent.length).toBe(2);
|
|
163
|
+
expect(JSON.parse(fake.sent[0] ?? "{}").type).toBe("session.update");
|
|
164
|
+
const greetingMsg = JSON.parse(fake.sent[1] ?? "{}");
|
|
165
|
+
expect(greetingMsg.type).toBe("response.create");
|
|
166
|
+
// JSON.stringify quotes the greeting and escapes any embedded quotes —
|
|
167
|
+
// protects against prompt-injection by closing the instruction string.
|
|
168
|
+
expect(greetingMsg.response.instructions).toBe('Say exactly: "Hello, \\"friend\\"."');
|
|
169
|
+
});
|
|
170
|
+
|
|
171
|
+
test("no greeting send when greeting is undefined", async () => {
|
|
172
|
+
const { fake, ready } = makeWithGreeting({});
|
|
173
|
+
await ready;
|
|
174
|
+
expect(fake.sent.length).toBe(1);
|
|
175
|
+
expect(JSON.parse(fake.sent[0] ?? "{}").type).toBe("session.update");
|
|
176
|
+
});
|
|
177
|
+
|
|
178
|
+
test("skipGreeting suppresses the greeting send", async () => {
|
|
179
|
+
const { fake, ready } = makeWithGreeting({ greeting: "Hi.", skipGreeting: true });
|
|
180
|
+
await ready;
|
|
181
|
+
expect(fake.sent.length).toBe(1);
|
|
182
|
+
expect(JSON.parse(fake.sent[0] ?? "{}").type).toBe("session.update");
|
|
183
|
+
});
|
|
184
|
+
});
|
|
185
|
+
|
|
138
186
|
describe("audio in/out", () => {
|
|
139
187
|
test("sendUserAudio sends input_audio_buffer.append with base64 payload", async () => {
|
|
140
188
|
const { fake, transport, ready } = startedTransport();
|
|
@@ -150,21 +198,25 @@ describe("audio in/out", () => {
|
|
|
150
198
|
expect(Buffer.from(msg.audio, "base64")).toEqual(Buffer.from([1, 2, 3, 4]));
|
|
151
199
|
});
|
|
152
200
|
|
|
153
|
-
test
|
|
201
|
+
test.each([
|
|
202
|
+
["response.audio.delta"],
|
|
203
|
+
["response.output_audio.delta"],
|
|
204
|
+
])("%s calls onAudioChunk with decoded bytes", async (type) => {
|
|
154
205
|
const { fake, cbs, ready } = startedTransport();
|
|
155
206
|
await ready;
|
|
156
207
|
const audio = Buffer.from([5, 6, 7, 8]).toString("base64");
|
|
157
|
-
fake.fire("message", {
|
|
158
|
-
data: JSON.stringify({ type: "response.audio.delta", delta: audio }),
|
|
159
|
-
});
|
|
208
|
+
fake.fire("message", { data: JSON.stringify({ type, delta: audio }) });
|
|
160
209
|
expect(cbs.onAudioChunk).toHaveBeenCalledTimes(1);
|
|
161
210
|
expect(cbs.onAudioChunk).toHaveBeenCalledWith(new Uint8Array([5, 6, 7, 8]));
|
|
162
211
|
});
|
|
163
212
|
|
|
164
|
-
test
|
|
213
|
+
test.each([
|
|
214
|
+
["response.audio.done"],
|
|
215
|
+
["response.output_audio.done"],
|
|
216
|
+
])("%s calls onAudioDone", async (type) => {
|
|
165
217
|
const { fake, cbs, ready } = startedTransport();
|
|
166
218
|
await ready;
|
|
167
|
-
fake.fire("message", { data: JSON.stringify({ type
|
|
219
|
+
fake.fire("message", { data: JSON.stringify({ type }) });
|
|
168
220
|
expect(cbs.onAudioDone).toHaveBeenCalledTimes(1);
|
|
169
221
|
});
|
|
170
222
|
});
|
|
@@ -206,27 +258,22 @@ describe("VAD, user transcript, reply lifecycle, agent transcript", () => {
|
|
|
206
258
|
expect(cbs.onReplyDone).toHaveBeenCalledTimes(1);
|
|
207
259
|
});
|
|
208
260
|
|
|
209
|
-
test(
|
|
261
|
+
test.each([
|
|
262
|
+
["response.audio_transcript", "legacy"],
|
|
263
|
+
["response.output_audio_transcript", "GA"],
|
|
264
|
+
])("agent transcript (%s): deltas accumulated, emitted on done", async (prefix) => {
|
|
210
265
|
const { fake, cbs, ready } = startedTransport();
|
|
211
266
|
await ready;
|
|
212
267
|
const item_id = "item_x";
|
|
213
268
|
fake.fire("message", {
|
|
214
|
-
data: JSON.stringify({
|
|
215
|
-
type: "response.audio_transcript.delta",
|
|
216
|
-
item_id,
|
|
217
|
-
delta: "Hi ",
|
|
218
|
-
}),
|
|
269
|
+
data: JSON.stringify({ type: `${prefix}.delta`, item_id, delta: "Hi " }),
|
|
219
270
|
});
|
|
220
271
|
fake.fire("message", {
|
|
221
|
-
data: JSON.stringify({
|
|
222
|
-
type: "response.audio_transcript.delta",
|
|
223
|
-
item_id,
|
|
224
|
-
delta: "there.",
|
|
225
|
-
}),
|
|
272
|
+
data: JSON.stringify({ type: `${prefix}.delta`, item_id, delta: "there." }),
|
|
226
273
|
});
|
|
227
274
|
expect(cbs.onAgentTranscript).not.toHaveBeenCalled();
|
|
228
275
|
fake.fire("message", {
|
|
229
|
-
data: JSON.stringify({ type:
|
|
276
|
+
data: JSON.stringify({ type: `${prefix}.done`, item_id }),
|
|
230
277
|
});
|
|
231
278
|
expect(cbs.onAgentTranscript).toHaveBeenCalledWith("Hi there.", false);
|
|
232
279
|
});
|
|
@@ -308,15 +355,36 @@ describe("tool calls", () => {
|
|
|
308
355
|
await ready;
|
|
309
356
|
fake.sent.length = 0; // drop session.update
|
|
310
357
|
transport.sendToolResult("call_1", '{"ok":true}');
|
|
311
|
-
|
|
358
|
+
// function_call_output is sent immediately; response.create is queued.
|
|
359
|
+
expect(fake.sent.length).toBe(1);
|
|
312
360
|
const m1 = JSON.parse(fake.sent[0] ?? "{}");
|
|
313
361
|
expect(m1.type).toBe("conversation.item.create");
|
|
314
362
|
expect(m1.item.type).toBe("function_call_output");
|
|
315
363
|
expect(m1.item.call_id).toBe("call_1");
|
|
316
364
|
expect(m1.item.output).toBe('{"ok":true}');
|
|
365
|
+
await new Promise((r) => queueMicrotask(() => r(undefined)));
|
|
366
|
+
expect(fake.sent.length).toBe(2);
|
|
317
367
|
const m2 = JSON.parse(fake.sent[1] ?? "{}");
|
|
318
368
|
expect(m2.type).toBe("response.create");
|
|
319
369
|
});
|
|
370
|
+
|
|
371
|
+
test("multiple sendToolResult calls coalesce into a single response.create", async () => {
|
|
372
|
+
const { fake, transport, ready } = startedTransport();
|
|
373
|
+
await ready;
|
|
374
|
+
fake.sent.length = 0;
|
|
375
|
+
// Synchronous burst — session-core flushes pending tool results in a loop.
|
|
376
|
+
transport.sendToolResult("call_1", '{"a":1}');
|
|
377
|
+
transport.sendToolResult("call_2", '{"b":2}');
|
|
378
|
+
transport.sendToolResult("call_3", '{"c":3}');
|
|
379
|
+
// Three function_call_outputs sent immediately, no response.create yet.
|
|
380
|
+
expect(fake.sent.length).toBe(3);
|
|
381
|
+
expect(fake.sent.every((s) => JSON.parse(s).type === "conversation.item.create")).toBe(true);
|
|
382
|
+
await new Promise((r) => queueMicrotask(() => r(undefined)));
|
|
383
|
+
// After the microtask, exactly one response.create — second one would be
|
|
384
|
+
// rejected as `conversation_already_has_active_response`.
|
|
385
|
+
expect(fake.sent.length).toBe(4);
|
|
386
|
+
expect(JSON.parse(fake.sent[3] ?? "{}").type).toBe("response.create");
|
|
387
|
+
});
|
|
320
388
|
});
|
|
321
389
|
|
|
322
390
|
describe("cancel, error, close", () => {
|
|
@@ -49,6 +49,8 @@ export type OpenaiRealtimeTransportOptions = {
|
|
|
49
49
|
callbacks: TransportCallbacks;
|
|
50
50
|
sid: string;
|
|
51
51
|
agent: string;
|
|
52
|
+
/** Skip the initial greeting (used for session resume). */
|
|
53
|
+
skipGreeting?: boolean;
|
|
52
54
|
createWebSocket?: CreateOpenaiRealtimeWebSocket;
|
|
53
55
|
logger?: Logger;
|
|
54
56
|
};
|
|
@@ -66,6 +68,7 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
|
|
|
66
68
|
type ToolBuffer = { callId: string; name: string; argsBuffer: string };
|
|
67
69
|
const toolBuffers = new Map<string, ToolBuffer>();
|
|
68
70
|
let currentResponseId: string | null = null;
|
|
71
|
+
let responseCreateQueued = false;
|
|
69
72
|
|
|
70
73
|
function send(payload: Record<string, unknown>): void {
|
|
71
74
|
if (!ws || ws.readyState !== WS_OPEN) {
|
|
@@ -75,17 +78,37 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
|
|
|
75
78
|
ws.send(JSON.stringify(payload));
|
|
76
79
|
}
|
|
77
80
|
|
|
81
|
+
function sendGreeting(): void {
|
|
82
|
+
if (opts.skipGreeting) return;
|
|
83
|
+
const greeting = opts.sessionConfig.greeting;
|
|
84
|
+
if (!greeting) return;
|
|
85
|
+
// OpenAI Realtime has no native greeting field — trigger it as a one-shot
|
|
86
|
+
// response with custom instructions that override the system prompt for
|
|
87
|
+
// this turn only. Audio + transcript ride the normal response.* events.
|
|
88
|
+
send({
|
|
89
|
+
type: "response.create",
|
|
90
|
+
response: { instructions: `Say exactly: ${JSON.stringify(greeting)}` },
|
|
91
|
+
});
|
|
92
|
+
}
|
|
93
|
+
|
|
78
94
|
function sendSessionUpdate(): void {
|
|
79
95
|
send({
|
|
80
96
|
type: "session.update",
|
|
81
97
|
session: {
|
|
82
|
-
|
|
83
|
-
|
|
98
|
+
type: "realtime",
|
|
99
|
+
output_modalities: ["audio"],
|
|
84
100
|
instructions: opts.sessionConfig.systemPrompt,
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
101
|
+
audio: {
|
|
102
|
+
input: {
|
|
103
|
+
format: { type: "audio/pcm", rate: 24_000 },
|
|
104
|
+
turn_detection: { type: "server_vad" },
|
|
105
|
+
transcription: { model: "whisper-1" },
|
|
106
|
+
},
|
|
107
|
+
output: {
|
|
108
|
+
format: { type: "audio/pcm", rate: 24_000 },
|
|
109
|
+
voice,
|
|
110
|
+
},
|
|
111
|
+
},
|
|
89
112
|
tools: opts.toolSchemas,
|
|
90
113
|
tool_choice: opts.toolChoice,
|
|
91
114
|
},
|
|
@@ -99,7 +122,6 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
|
|
|
99
122
|
const sock = createWs(url, {
|
|
100
123
|
headers: {
|
|
101
124
|
Authorization: `Bearer ${opts.apiKey}`,
|
|
102
|
-
"OpenAI-Beta": "realtime=v1",
|
|
103
125
|
},
|
|
104
126
|
});
|
|
105
127
|
ws = sock;
|
|
@@ -108,6 +130,7 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
|
|
|
108
130
|
sock.addEventListener("open", () => {
|
|
109
131
|
opened = true;
|
|
110
132
|
sendSessionUpdate();
|
|
133
|
+
sendGreeting();
|
|
111
134
|
resolve();
|
|
112
135
|
});
|
|
113
136
|
sock.addEventListener("message", (ev) => handleMessage(ev.data));
|
|
@@ -177,6 +200,7 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
|
|
|
177
200
|
function handleErrorEvent(obj: Record<string, unknown>): void {
|
|
178
201
|
const err = obj.error as { message?: unknown } | undefined;
|
|
179
202
|
const message = typeof err?.message === "string" ? err.message : "OpenAI Realtime error";
|
|
203
|
+
log.warn("OpenAI Realtime error event", { error: obj.error });
|
|
180
204
|
clearTurnBuffers();
|
|
181
205
|
opts.callbacks.onError("internal", message);
|
|
182
206
|
}
|
|
@@ -185,6 +209,11 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
|
|
|
185
209
|
const item = obj.item as
|
|
186
210
|
| { id?: string; type?: string; name?: string; call_id?: string }
|
|
187
211
|
| undefined;
|
|
212
|
+
log.info("OpenAI Realtime output_item.added", {
|
|
213
|
+
itemType: item?.type,
|
|
214
|
+
name: item?.name,
|
|
215
|
+
callId: item?.call_id,
|
|
216
|
+
});
|
|
188
217
|
if (item?.type !== "function_call" || !item.id) return;
|
|
189
218
|
toolBuffers.set(item.id, {
|
|
190
219
|
callId: item.call_id ?? "",
|
|
@@ -220,6 +249,7 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
|
|
|
220
249
|
const callId = asString(obj.call_id) || (buf?.callId ?? "");
|
|
221
250
|
const name = asString(obj.name) || (buf?.name ?? "");
|
|
222
251
|
const argsStr = asString(obj.arguments) || (buf?.argsBuffer ?? "");
|
|
252
|
+
log.info("OpenAI Realtime tool call", { name, callId, args: argsStr });
|
|
223
253
|
const args = parseToolArgs(argsStr, name, callId);
|
|
224
254
|
opts.callbacks.onToolCall(callId, name, args);
|
|
225
255
|
}
|
|
@@ -235,9 +265,14 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
|
|
|
235
265
|
if (typeof raw !== "object" || raw === null) return;
|
|
236
266
|
const obj = raw as Record<string, unknown>;
|
|
237
267
|
switch (obj.type) {
|
|
268
|
+
// GA renamed audio output events to `response.output_audio.*` and
|
|
269
|
+
// transcript events to `response.output_audio_transcript.*`. The legacy
|
|
270
|
+
// (beta) names are kept as aliases so older snapshots still work.
|
|
271
|
+
case "response.output_audio.delta":
|
|
238
272
|
case "response.audio.delta":
|
|
239
273
|
handleAudioDelta(obj);
|
|
240
274
|
return;
|
|
275
|
+
case "response.output_audio.done":
|
|
241
276
|
case "response.audio.done":
|
|
242
277
|
opts.callbacks.onAudioDone();
|
|
243
278
|
return;
|
|
@@ -253,9 +288,11 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
|
|
|
253
288
|
case "response.created":
|
|
254
289
|
handleResponseCreated(obj);
|
|
255
290
|
return;
|
|
291
|
+
case "response.output_audio_transcript.delta":
|
|
256
292
|
case "response.audio_transcript.delta":
|
|
257
293
|
handleAgentTranscriptDelta(obj);
|
|
258
294
|
return;
|
|
295
|
+
case "response.output_audio_transcript.done":
|
|
259
296
|
case "response.audio_transcript.done":
|
|
260
297
|
handleAgentTranscriptDone(obj);
|
|
261
298
|
return;
|
|
@@ -275,6 +312,7 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
|
|
|
275
312
|
handleErrorEvent(obj);
|
|
276
313
|
return;
|
|
277
314
|
default:
|
|
315
|
+
log.debug("OpenAI Realtime: unhandled event", { type: obj.type });
|
|
278
316
|
return;
|
|
279
317
|
}
|
|
280
318
|
}
|
|
@@ -302,11 +340,25 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
|
|
|
302
340
|
ws.send(`{"type":"input_audio_buffer.append","audio":"${uint8ToBase64(bytes)}"}`);
|
|
303
341
|
},
|
|
304
342
|
sendToolResult(callId, result) {
|
|
343
|
+
log.info("OpenAI Realtime sendToolResult", {
|
|
344
|
+
callId,
|
|
345
|
+
resultLen: result.length,
|
|
346
|
+
preview: result.slice(0, 200),
|
|
347
|
+
});
|
|
305
348
|
send({
|
|
306
349
|
type: "conversation.item.create",
|
|
307
350
|
item: { type: "function_call_output", call_id: callId, output: result },
|
|
308
351
|
});
|
|
309
|
-
|
|
352
|
+
// Multiple tool results from one turn arrive synchronously; coalesce them
|
|
353
|
+
// into a single response.create per tick. OpenAI rejects a second
|
|
354
|
+
// response.create while one is in flight, which strands the turn.
|
|
355
|
+
if (!responseCreateQueued) {
|
|
356
|
+
responseCreateQueued = true;
|
|
357
|
+
queueMicrotask(() => {
|
|
358
|
+
responseCreateQueued = false;
|
|
359
|
+
send({ type: "response.create" });
|
|
360
|
+
});
|
|
361
|
+
}
|
|
310
362
|
},
|
|
311
363
|
cancelReply() {
|
|
312
364
|
if (currentResponseId === null) return;
|
|
@@ -14,6 +14,7 @@ import {
|
|
|
14
14
|
DEFAULT_MAX_HISTORY,
|
|
15
15
|
DEFAULT_STT_SAMPLE_RATE,
|
|
16
16
|
DEFAULT_TTS_SAMPLE_RATE,
|
|
17
|
+
MAX_TOOL_RESULT_CHARS,
|
|
17
18
|
PIPELINE_FLUSH_TIMEOUT_MS,
|
|
18
19
|
} from "../../sdk/constants.ts";
|
|
19
20
|
import type { SessionErrorCode } from "../../sdk/protocol.ts";
|
|
@@ -235,6 +236,25 @@ export function createPipelineTransport(opts: PipelineTransportOptions): Transpo
|
|
|
235
236
|
ttsSession?.sendText(out);
|
|
236
237
|
}
|
|
237
238
|
|
|
239
|
+
function emitToolResult(part: {
|
|
240
|
+
readonly toolCallId?: string;
|
|
241
|
+
readonly output?: unknown;
|
|
242
|
+
}): void {
|
|
243
|
+
// Inline execution finished — surface completion so the client UI can
|
|
244
|
+
// flip the tool-call from "pending" to "done". Schema requires a
|
|
245
|
+
// string result capped at MAX_TOOL_RESULT_CHARS.
|
|
246
|
+
const callId = part.toolCallId ?? "";
|
|
247
|
+
if (!callId) return;
|
|
248
|
+
const raw =
|
|
249
|
+
(part as { output?: unknown; result?: unknown }).output ??
|
|
250
|
+
(part as { result?: unknown }).result ??
|
|
251
|
+
"";
|
|
252
|
+
const str = typeof raw === "string" ? raw : JSON.stringify(raw);
|
|
253
|
+
const truncated =
|
|
254
|
+
str.length > MAX_TOOL_RESULT_CHARS ? str.slice(0, MAX_TOOL_RESULT_CHARS) : str;
|
|
255
|
+
callbacks.onToolCallDone?.(callId, truncated);
|
|
256
|
+
}
|
|
257
|
+
|
|
238
258
|
return function handlePart(part: {
|
|
239
259
|
readonly type: string;
|
|
240
260
|
readonly text?: string;
|
|
@@ -257,6 +277,9 @@ export function createPipelineTransport(opts: PipelineTransportOptions): Transpo
|
|
|
257
277
|
callbacks.onToolCall(part.toolCallId ?? "", part.toolName ?? "", input);
|
|
258
278
|
return;
|
|
259
279
|
}
|
|
280
|
+
case "tool-result":
|
|
281
|
+
emitToolResult(part);
|
|
282
|
+
return;
|
|
260
283
|
case "error": {
|
|
261
284
|
const msg = errorMessage(part.error);
|
|
262
285
|
log.error("LLM stream error", { message: msg, sid: opts.sid });
|
package/host/transports/types.ts
CHANGED
|
@@ -17,6 +17,13 @@ export type TransportCallbacks = {
|
|
|
17
17
|
onUserTranscript(text: string): void;
|
|
18
18
|
onAgentTranscript(text: string, interrupted: boolean): void;
|
|
19
19
|
onToolCall(callId: string, name: string, args: Record<string, unknown>): void;
|
|
20
|
+
/**
|
|
21
|
+
* Tool execution finished. Pipeline mode invokes this from the
|
|
22
|
+
* `tool-result` stream part so the client UI can mark the call done.
|
|
23
|
+
* S2S transports leave this unset — SessionCore.onToolCall emits the
|
|
24
|
+
* `tool_call_done` event itself after dispatching the tool.
|
|
25
|
+
*/
|
|
26
|
+
onToolCallDone?(callId: string, result: string): void;
|
|
20
27
|
onError(code: SessionErrorCode, message: string): void;
|
|
21
28
|
onSpeechStarted(): void;
|
|
22
29
|
onSpeechStopped(): void;
|
package/package.json
CHANGED
package/sdk/_internal-types.ts
CHANGED
|
@@ -118,11 +118,24 @@ export type ToolSchema = {
|
|
|
118
118
|
|
|
119
119
|
export const EMPTY_PARAMS = z.object({});
|
|
120
120
|
|
|
121
|
+
/**
|
|
122
|
+
* Convert a Zod schema to the JSON Schema shape that S2S providers expect.
|
|
123
|
+
* Strips the `$schema` keyword: `z.toJSONSchema` (Zod v4) tags output with
|
|
124
|
+
* the JSON Schema 2020-12 dialect URI, and some Realtime/S2S providers
|
|
125
|
+
* either reject the field outright or ship it through to the underlying
|
|
126
|
+
* model with a malformed function spec — observed empirically as tool
|
|
127
|
+
* calls that arrive with `args: {}` even when required params are listed.
|
|
128
|
+
*/
|
|
129
|
+
export function toToolJsonSchema(zodSchema: z.ZodTypeAny): JSONSchema7 {
|
|
130
|
+
const { $schema: _omit, ...rest } = z.toJSONSchema(zodSchema) as Record<string, unknown>;
|
|
131
|
+
return rest as JSONSchema7;
|
|
132
|
+
}
|
|
133
|
+
|
|
121
134
|
export function agentToolsToSchemas(tools: Readonly<Record<string, ToolDef>>): ToolSchema[] {
|
|
122
135
|
return Object.entries(tools).map(([name, def]) => ({
|
|
123
136
|
type: "function",
|
|
124
137
|
name,
|
|
125
138
|
description: def.description,
|
|
126
|
-
parameters:
|
|
139
|
+
parameters: toToolJsonSchema(def.parameters ?? EMPTY_PARAMS),
|
|
127
140
|
}));
|
|
128
141
|
}
|