@alexkroman1/aai 1.8.0 → 1.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +4 -4
- package/CHANGELOG.md +13 -0
- package/dist/host/runtime-barrel.js +64 -13
- package/dist/host/transports/openai-realtime-transport.d.ts +2 -0
- package/host/runtime.ts +1 -0
- package/host/transports/openai-realtime-transport.test.ts +98 -30
- package/host/transports/openai-realtime-transport.ts +60 -8
- package/package.json +1 -1
package/.turbo/turbo-build.log
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
|
|
2
|
-
> @alexkroman1/aai@1.8.0 build /home/runner/work/agent/agent/packages/aai
|
|
2
|
+
> @alexkroman1/aai@1.8.1 build /home/runner/work/agent/agent/packages/aai
|
|
3
3
|
> tsdown && tsc -p tsconfig.build.json
|
|
4
4
|
|
|
5
5
|
[34mℹ[39m [34mtsdown v0.21.7[39m powered by [38;2;255;126;23mrolldown v1.0.0-rc.12[39m
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
[34mℹ[39m target: [34mnode22[39m
|
|
9
9
|
[34mℹ[39m tsconfig: [34mtsconfig.json[39m
|
|
10
10
|
[34mℹ[39m Build start
|
|
11
|
-
[34mℹ[39m [2mdist/[22m[1mhost/runtime-barrel.js[22m [
|
|
11
|
+
[34mℹ[39m [2mdist/[22m[1mhost/runtime-barrel.js[22m [2m109.52 kB[22m [2m│ gzip: 30.12 kB[22m
|
|
12
12
|
[34mℹ[39m [2mdist/[22m[1msdk/protocol.js[22m [2m 5.70 kB[22m [2m│ gzip: 1.92 kB[22m
|
|
13
13
|
[34mℹ[39m [2mdist/[22m[1mindex.js[22m [2m 2.88 kB[22m [2m│ gzip: 1.24 kB[22m
|
|
14
14
|
[34mℹ[39m [2mdist/[22m[1msdk/manifest-barrel.js[22m [2m 0.36 kB[22m [2m│ gzip: 0.20 kB[22m
|
|
@@ -28,5 +28,5 @@
|
|
|
28
28
|
[34mℹ[39m [2mdist/[22ms3-BtCMvCod.js [2m 0.76 kB[22m [2m│ gzip: 0.29 kB[22m
|
|
29
29
|
[34mℹ[39m [2mdist/[22mpinecone-CeJ69aRs.js [2m 0.48 kB[22m [2m│ gzip: 0.24 kB[22m
|
|
30
30
|
[34mℹ[39m [2mdist/[22mopenai-realtime-cjPAHMMx.js [2m 0.27 kB[22m [2m│ gzip: 0.19 kB[22m
|
|
31
|
-
[34mℹ[39m 20 files, total:
|
|
32
|
-
[32m✔[39m Build complete in [
|
|
31
|
+
[34mℹ[39m 20 files, total: 139.45 kB
|
|
32
|
+
[32m✔[39m Build complete in [32m62ms[39m
|
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,18 @@
|
|
|
1
1
|
# @alexkroman1/aai
|
|
2
2
|
|
|
3
|
+
## 1.8.1
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- ba8effb: Make OpenAI Realtime usable end-to-end on gpt-realtime-2:
|
|
8
|
+
|
|
9
|
+
- Accept GA-renamed audio/transcript server events (`response.output_audio.{delta,done}`, `response.output_audio_transcript.{delta,done}`) alongside the legacy `response.audio.*` names so audio and transcript reach the client.
|
|
10
|
+
- Trigger the agent's `greeting` on connect by sending a one-shot `response.create` with quoted instructions, and honor `skipGreeting` so resumed sessions don't replay it.
|
|
11
|
+
- Coalesce `response.create` across multiple `sendToolResult` calls in the same tick. Multi-tool turns previously sent one `response.create` per tool, the second of which OpenAI rejected as `conversation_already_has_active_response`, stranding the turn so the model never received the tool results.
|
|
12
|
+
- Log unhandled event types and the full payload of `error` events to make silently rejected `session.update` fields visible.
|
|
13
|
+
|
|
14
|
+
- f4cc5ef: Migrate OpenAI Realtime transport to GA API schema (gpt-realtime-2). Drop OpenAI-Beta: realtime=v1 connect header and update session.update to session.type=realtime, output_modalities, and nested audio.input/audio.output with audio/pcm format.
|
|
15
|
+
|
|
3
16
|
## 1.8.0
|
|
4
17
|
|
|
5
18
|
### Minor Changes
|
|
package/dist/host/runtime-barrel.js
CHANGED
|
@@ -1681,6 +1681,7 @@ function createOpenaiRealtimeTransport(opts) {
|
|
|
1681
1681
|
const agentTranscriptBuffers = /* @__PURE__ */ new Map();
|
|
1682
1682
|
const toolBuffers = /* @__PURE__ */ new Map();
|
|
1683
1683
|
let currentResponseId = null;
|
|
1684
|
+
let responseCreateQueued = false;
|
|
1684
1685
|
function send(payload) {
|
|
1685
1686
|
if (!ws || ws.readyState !== 1) {
|
|
1686
1687
|
log.debug("OpenAI Realtime send dropped: socket not open", { type: payload.type });
|
|
@@ -1688,17 +1689,39 @@ function createOpenaiRealtimeTransport(opts) {
|
|
|
1688
1689
|
}
|
|
1689
1690
|
ws.send(JSON.stringify(payload));
|
|
1690
1691
|
}
|
|
1692
|
+
function sendGreeting() {
|
|
1693
|
+
if (opts.skipGreeting) return;
|
|
1694
|
+
const greeting = opts.sessionConfig.greeting;
|
|
1695
|
+
if (!greeting) return;
|
|
1696
|
+
send({
|
|
1697
|
+
type: "response.create",
|
|
1698
|
+
response: { instructions: `Say exactly: ${JSON.stringify(greeting)}` }
|
|
1699
|
+
});
|
|
1700
|
+
}
|
|
1691
1701
|
function sendSessionUpdate() {
|
|
1692
1702
|
send({
|
|
1693
1703
|
type: "session.update",
|
|
1694
1704
|
session: {
|
|
1695
|
-
|
|
1696
|
-
|
|
1705
|
+
type: "realtime",
|
|
1706
|
+
output_modalities: ["audio"],
|
|
1697
1707
|
instructions: opts.sessionConfig.systemPrompt,
|
|
1698
|
-
|
|
1699
|
-
|
|
1700
|
-
|
|
1701
|
-
|
|
1708
|
+
audio: {
|
|
1709
|
+
input: {
|
|
1710
|
+
format: {
|
|
1711
|
+
type: "audio/pcm",
|
|
1712
|
+
rate: 24e3
|
|
1713
|
+
},
|
|
1714
|
+
turn_detection: { type: "server_vad" },
|
|
1715
|
+
transcription: { model: "whisper-1" }
|
|
1716
|
+
},
|
|
1717
|
+
output: {
|
|
1718
|
+
format: {
|
|
1719
|
+
type: "audio/pcm",
|
|
1720
|
+
rate: 24e3
|
|
1721
|
+
},
|
|
1722
|
+
voice
|
|
1723
|
+
}
|
|
1724
|
+
},
|
|
1702
1725
|
tools: opts.toolSchemas,
|
|
1703
1726
|
tool_choice: opts.toolChoice
|
|
1704
1727
|
}
|
|
@@ -1708,15 +1731,13 @@ function createOpenaiRealtimeTransport(opts) {
|
|
|
1708
1731
|
const url = `${baseUrl}?model=${encodeURIComponent(model)}`;
|
|
1709
1732
|
log.info("OpenAI Realtime connecting", { url });
|
|
1710
1733
|
return new Promise((resolve, reject) => {
|
|
1711
|
-
const sock = createWs(url, { headers: {
|
|
1712
|
-
Authorization: `Bearer ${opts.apiKey}`,
|
|
1713
|
-
"OpenAI-Beta": "realtime=v1"
|
|
1714
|
-
} });
|
|
1734
|
+
const sock = createWs(url, { headers: { Authorization: `Bearer ${opts.apiKey}` } });
|
|
1715
1735
|
ws = sock;
|
|
1716
1736
|
let opened = false;
|
|
1717
1737
|
sock.addEventListener("open", () => {
|
|
1718
1738
|
opened = true;
|
|
1719
1739
|
sendSessionUpdate();
|
|
1740
|
+
sendGreeting();
|
|
1720
1741
|
resolve();
|
|
1721
1742
|
});
|
|
1722
1743
|
sock.addEventListener("message", (ev) => handleMessage(ev.data));
|
|
@@ -1773,11 +1794,17 @@ function createOpenaiRealtimeTransport(opts) {
|
|
|
1773
1794
|
function handleErrorEvent(obj) {
|
|
1774
1795
|
const err = obj.error;
|
|
1775
1796
|
const message = typeof err?.message === "string" ? err.message : "OpenAI Realtime error";
|
|
1797
|
+
log.warn("OpenAI Realtime error event", { error: obj.error });
|
|
1776
1798
|
clearTurnBuffers();
|
|
1777
1799
|
opts.callbacks.onError("internal", message);
|
|
1778
1800
|
}
|
|
1779
1801
|
function handleOutputItemAdded(obj) {
|
|
1780
1802
|
const item = obj.item;
|
|
1803
|
+
log.info("OpenAI Realtime output_item.added", {
|
|
1804
|
+
itemType: item?.type,
|
|
1805
|
+
name: item?.name,
|
|
1806
|
+
callId: item?.call_id
|
|
1807
|
+
});
|
|
1781
1808
|
if (item?.type !== "function_call" || !item.id) return;
|
|
1782
1809
|
toolBuffers.set(item.id, {
|
|
1783
1810
|
callId: item.call_id ?? "",
|
|
@@ -1810,7 +1837,13 @@ function createOpenaiRealtimeTransport(opts) {
|
|
|
1810
1837
|
toolBuffers.delete(id);
|
|
1811
1838
|
const callId = asString(obj.call_id) || (buf?.callId ?? "");
|
|
1812
1839
|
const name = asString(obj.name) || (buf?.name ?? "");
|
|
1813
|
-
const args = parseToolArgs(asString(obj.arguments) || (buf?.argsBuffer ?? ""), name, callId);
|
|
1840
|
+
const argsStr = asString(obj.arguments) || (buf?.argsBuffer ?? "");
|
|
1841
|
+
log.info("OpenAI Realtime tool call", {
|
|
1842
|
+
name,
|
|
1843
|
+
callId,
|
|
1844
|
+
args: argsStr
|
|
1845
|
+
});
|
|
1846
|
+
const args = parseToolArgs(argsStr, name, callId);
|
|
1814
1847
|
opts.callbacks.onToolCall(callId, name, args);
|
|
1815
1848
|
}
|
|
1816
1849
|
function handleMessage(data) {
|
|
@@ -1824,9 +1857,11 @@ function createOpenaiRealtimeTransport(opts) {
|
|
|
1824
1857
|
if (typeof raw !== "object" || raw === null) return;
|
|
1825
1858
|
const obj = raw;
|
|
1826
1859
|
switch (obj.type) {
|
|
1860
|
+
case "response.output_audio.delta":
|
|
1827
1861
|
case "response.audio.delta":
|
|
1828
1862
|
handleAudioDelta(obj);
|
|
1829
1863
|
return;
|
|
1864
|
+
case "response.output_audio.done":
|
|
1830
1865
|
case "response.audio.done":
|
|
1831
1866
|
opts.callbacks.onAudioDone();
|
|
1832
1867
|
return;
|
|
@@ -1842,9 +1877,11 @@ function createOpenaiRealtimeTransport(opts) {
|
|
|
1842
1877
|
case "response.created":
|
|
1843
1878
|
handleResponseCreated(obj);
|
|
1844
1879
|
return;
|
|
1880
|
+
case "response.output_audio_transcript.delta":
|
|
1845
1881
|
case "response.audio_transcript.delta":
|
|
1846
1882
|
handleAgentTranscriptDelta(obj);
|
|
1847
1883
|
return;
|
|
1884
|
+
case "response.output_audio_transcript.done":
|
|
1848
1885
|
case "response.audio_transcript.done":
|
|
1849
1886
|
handleAgentTranscriptDone(obj);
|
|
1850
1887
|
return;
|
|
@@ -1863,7 +1900,9 @@ function createOpenaiRealtimeTransport(opts) {
|
|
|
1863
1900
|
case "error":
|
|
1864
1901
|
handleErrorEvent(obj);
|
|
1865
1902
|
return;
|
|
1866
|
-
default:
|
|
1903
|
+
default:
|
|
1904
|
+
log.debug("OpenAI Realtime: unhandled event", { type: obj.type });
|
|
1905
|
+
return;
|
|
1867
1906
|
}
|
|
1868
1907
|
}
|
|
1869
1908
|
function handleClose(code, reason) {
|
|
@@ -1893,6 +1932,11 @@ function createOpenaiRealtimeTransport(opts) {
|
|
|
1893
1932
|
ws.send(`{"type":"input_audio_buffer.append","audio":"${uint8ToBase64(bytes)}"}`);
|
|
1894
1933
|
},
|
|
1895
1934
|
sendToolResult(callId, result) {
|
|
1935
|
+
log.info("OpenAI Realtime sendToolResult", {
|
|
1936
|
+
callId,
|
|
1937
|
+
resultLen: result.length,
|
|
1938
|
+
preview: result.slice(0, 200)
|
|
1939
|
+
});
|
|
1896
1940
|
send({
|
|
1897
1941
|
type: "conversation.item.create",
|
|
1898
1942
|
item: {
|
|
@@ -1901,7 +1945,13 @@ function createOpenaiRealtimeTransport(opts) {
|
|
|
1901
1945
|
output: result
|
|
1902
1946
|
}
|
|
1903
1947
|
});
|
|
1904
|
-
send({ type: "response.create" });
|
|
1948
|
+
if (!responseCreateQueued) {
|
|
1949
|
+
responseCreateQueued = true;
|
|
1950
|
+
queueMicrotask(() => {
|
|
1951
|
+
responseCreateQueued = false;
|
|
1952
|
+
send({ type: "response.create" });
|
|
1953
|
+
});
|
|
1954
|
+
}
|
|
1905
1955
|
},
|
|
1906
1956
|
cancelReply() {
|
|
1907
1957
|
if (currentResponseId === null) return;
|
|
@@ -3085,6 +3135,7 @@ function createRuntime(opts) {
|
|
|
3085
3135
|
callbacks,
|
|
3086
3136
|
sid: sessionOpts.id,
|
|
3087
3137
|
agent: sessionOpts.agent,
|
|
3138
|
+
skipGreeting: sessionOpts.skipGreeting ?? false,
|
|
3088
3139
|
...createOpenaiRealtimeWebSocket ? { createWebSocket: createOpenaiRealtimeWebSocket } : {},
|
|
3089
3140
|
logger
|
|
3090
3141
|
});
|
|
package/dist/host/transports/openai-realtime-transport.d.ts
CHANGED
|
@@ -37,6 +37,8 @@ export type OpenaiRealtimeTransportOptions = {
|
|
|
37
37
|
callbacks: TransportCallbacks;
|
|
38
38
|
sid: string;
|
|
39
39
|
agent: string;
|
|
40
|
+
/** Skip the initial greeting (used for session resume). */
|
|
41
|
+
skipGreeting?: boolean;
|
|
40
42
|
createWebSocket?: CreateOpenaiRealtimeWebSocket;
|
|
41
43
|
logger?: Logger;
|
|
42
44
|
};
|
package/host/runtime.ts
CHANGED
|
@@ -432,6 +432,7 @@ export function createRuntime(opts: RuntimeOptions): Runtime {
|
|
|
432
432
|
callbacks,
|
|
433
433
|
sid: sessionOpts.id,
|
|
434
434
|
agent: sessionOpts.agent,
|
|
435
|
+
skipGreeting: sessionOpts.skipGreeting ?? false,
|
|
435
436
|
...(createOpenaiRealtimeWebSocket ? { createWebSocket: createOpenaiRealtimeWebSocket } : {}),
|
|
436
437
|
logger,
|
|
437
438
|
});
|
|
package/host/transports/openai-realtime-transport.test.ts
CHANGED
|
@@ -83,7 +83,6 @@ describe("openai-realtime-transport: connect and session.update", () => {
|
|
|
83
83
|
options: { model: "gpt-realtime", voice: "cedar" },
|
|
84
84
|
sessionConfig: {
|
|
85
85
|
systemPrompt: "Be terse.",
|
|
86
|
-
greeting: "Hi.",
|
|
87
86
|
tools: [],
|
|
88
87
|
},
|
|
89
88
|
toolSchemas: [
|
|
@@ -109,10 +108,7 @@ describe("openai-realtime-transport: connect and session.update", () => {
|
|
|
109
108
|
expect(createWs).toHaveBeenCalledWith(
|
|
110
109
|
"wss://api.openai.com/v1/realtime?model=gpt-realtime",
|
|
111
110
|
expect.objectContaining({
|
|
112
|
-
headers: expect.objectContaining({
|
|
113
|
-
Authorization: "Bearer sk-test",
|
|
114
|
-
"OpenAI-Beta": "realtime=v1",
|
|
115
|
-
}),
|
|
111
|
+
headers: { Authorization: "Bearer sk-test" },
|
|
116
112
|
}),
|
|
117
113
|
);
|
|
118
114
|
|
|
@@ -121,13 +117,14 @@ describe("openai-realtime-transport: connect and session.update", () => {
|
|
|
121
117
|
if (first === undefined) throw new Error("expected one send");
|
|
122
118
|
const msg = JSON.parse(first);
|
|
123
119
|
expect(msg.type).toBe("session.update");
|
|
124
|
-
expect(msg.session.
|
|
120
|
+
expect(msg.session.type).toBe("realtime");
|
|
121
|
+
expect(msg.session.output_modalities).toEqual(["audio"]);
|
|
125
122
|
expect(msg.session.instructions).toBe("Be terse.");
|
|
126
|
-
expect(msg.session.
|
|
127
|
-
expect(msg.session.
|
|
128
|
-
expect(msg.session.
|
|
129
|
-
expect(msg.session.
|
|
130
|
-
expect(msg.session.
|
|
123
|
+
expect(msg.session.audio.input.format).toEqual({ type: "audio/pcm", rate: 24_000 });
|
|
124
|
+
expect(msg.session.audio.input.turn_detection.type).toBe("server_vad");
|
|
125
|
+
expect(msg.session.audio.input.transcription).toEqual({ model: "whisper-1" });
|
|
126
|
+
expect(msg.session.audio.output.format).toEqual({ type: "audio/pcm", rate: 24_000 });
|
|
127
|
+
expect(msg.session.audio.output.voice).toBe("cedar");
|
|
131
128
|
expect(msg.session.tools).toEqual([
|
|
132
129
|
expect.objectContaining({ type: "function", name: "lookup" }),
|
|
133
130
|
]);
|
|
@@ -135,6 +132,57 @@ describe("openai-realtime-transport: connect and session.update", () => {
|
|
|
135
132
|
});
|
|
136
133
|
});
|
|
137
134
|
|
|
135
|
+
describe("greeting", () => {
|
|
136
|
+
function makeWithGreeting(args: { greeting?: string; skipGreeting?: boolean }) {
|
|
137
|
+
const fake = makeFakeWs();
|
|
138
|
+
const transport = createOpenaiRealtimeTransport({
|
|
139
|
+
apiKey: "sk",
|
|
140
|
+
options: {},
|
|
141
|
+
sessionConfig: {
|
|
142
|
+
systemPrompt: "",
|
|
143
|
+
...(args.greeting !== undefined ? { greeting: args.greeting } : {}),
|
|
144
|
+
},
|
|
145
|
+
toolSchemas: [],
|
|
146
|
+
toolChoice: "auto",
|
|
147
|
+
callbacks: noopCallbacks(),
|
|
148
|
+
sid: "s",
|
|
149
|
+
agent: "a",
|
|
150
|
+
...(args.skipGreeting !== undefined ? { skipGreeting: args.skipGreeting } : {}),
|
|
151
|
+
createWebSocket: () => fake,
|
|
152
|
+
logger: silentLogger,
|
|
153
|
+
});
|
|
154
|
+
const ready = transport.start();
|
|
155
|
+
fake.fire("open");
|
|
156
|
+
return { fake, ready };
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
test("sends response.create with quoted greeting after session.update", async () => {
|
|
160
|
+
const { fake, ready } = makeWithGreeting({ greeting: 'Hello, "friend".' });
|
|
161
|
+
await ready;
|
|
162
|
+
expect(fake.sent.length).toBe(2);
|
|
163
|
+
expect(JSON.parse(fake.sent[0] ?? "{}").type).toBe("session.update");
|
|
164
|
+
const greetingMsg = JSON.parse(fake.sent[1] ?? "{}");
|
|
165
|
+
expect(greetingMsg.type).toBe("response.create");
|
|
166
|
+
// JSON.stringify quotes the greeting and escapes any embedded quotes —
|
|
167
|
+
// protects against prompt-injection by closing the instruction string.
|
|
168
|
+
expect(greetingMsg.response.instructions).toBe('Say exactly: "Hello, \\"friend\\"."');
|
|
169
|
+
});
|
|
170
|
+
|
|
171
|
+
test("no greeting send when greeting is undefined", async () => {
|
|
172
|
+
const { fake, ready } = makeWithGreeting({});
|
|
173
|
+
await ready;
|
|
174
|
+
expect(fake.sent.length).toBe(1);
|
|
175
|
+
expect(JSON.parse(fake.sent[0] ?? "{}").type).toBe("session.update");
|
|
176
|
+
});
|
|
177
|
+
|
|
178
|
+
test("skipGreeting suppresses the greeting send", async () => {
|
|
179
|
+
const { fake, ready } = makeWithGreeting({ greeting: "Hi.", skipGreeting: true });
|
|
180
|
+
await ready;
|
|
181
|
+
expect(fake.sent.length).toBe(1);
|
|
182
|
+
expect(JSON.parse(fake.sent[0] ?? "{}").type).toBe("session.update");
|
|
183
|
+
});
|
|
184
|
+
});
|
|
185
|
+
|
|
138
186
|
describe("audio in/out", () => {
|
|
139
187
|
test("sendUserAudio sends input_audio_buffer.append with base64 payload", async () => {
|
|
140
188
|
const { fake, transport, ready } = startedTransport();
|
|
@@ -150,21 +198,25 @@ describe("audio in/out", () => {
|
|
|
150
198
|
expect(Buffer.from(msg.audio, "base64")).toEqual(Buffer.from([1, 2, 3, 4]));
|
|
151
199
|
});
|
|
152
200
|
|
|
153
|
-
test
|
|
201
|
+
test.each([
|
|
202
|
+
["response.audio.delta"],
|
|
203
|
+
["response.output_audio.delta"],
|
|
204
|
+
])("%s calls onAudioChunk with decoded bytes", async (type) => {
|
|
154
205
|
const { fake, cbs, ready } = startedTransport();
|
|
155
206
|
await ready;
|
|
156
207
|
const audio = Buffer.from([5, 6, 7, 8]).toString("base64");
|
|
157
|
-
fake.fire("message", {
|
|
158
|
-
data: JSON.stringify({ type: "response.audio.delta", delta: audio }),
|
|
159
|
-
});
|
|
208
|
+
fake.fire("message", { data: JSON.stringify({ type, delta: audio }) });
|
|
160
209
|
expect(cbs.onAudioChunk).toHaveBeenCalledTimes(1);
|
|
161
210
|
expect(cbs.onAudioChunk).toHaveBeenCalledWith(new Uint8Array([5, 6, 7, 8]));
|
|
162
211
|
});
|
|
163
212
|
|
|
164
|
-
test
|
|
213
|
+
test.each([
|
|
214
|
+
["response.audio.done"],
|
|
215
|
+
["response.output_audio.done"],
|
|
216
|
+
])("%s calls onAudioDone", async (type) => {
|
|
165
217
|
const { fake, cbs, ready } = startedTransport();
|
|
166
218
|
await ready;
|
|
167
|
-
fake.fire("message", { data: JSON.stringify({ type: "response.audio.done" }) });
|
|
219
|
+
fake.fire("message", { data: JSON.stringify({ type }) });
|
|
168
220
|
expect(cbs.onAudioDone).toHaveBeenCalledTimes(1);
|
|
169
221
|
});
|
|
170
222
|
});
|
|
@@ -206,27 +258,22 @@ describe("VAD, user transcript, reply lifecycle, agent transcript", () => {
|
|
|
206
258
|
expect(cbs.onReplyDone).toHaveBeenCalledTimes(1);
|
|
207
259
|
});
|
|
208
260
|
|
|
209
|
-
test(
|
|
261
|
+
test.each([
|
|
262
|
+
["response.audio_transcript", "legacy"],
|
|
263
|
+
["response.output_audio_transcript", "GA"],
|
|
264
|
+
])("agent transcript (%s): deltas accumulated, emitted on done", async (prefix) => {
|
|
210
265
|
const { fake, cbs, ready } = startedTransport();
|
|
211
266
|
await ready;
|
|
212
267
|
const item_id = "item_x";
|
|
213
268
|
fake.fire("message", {
|
|
214
|
-
data: JSON.stringify({
|
|
215
|
-
type: "response.audio_transcript.delta",
|
|
216
|
-
item_id,
|
|
217
|
-
delta: "Hi ",
|
|
218
|
-
}),
|
|
269
|
+
data: JSON.stringify({ type: `${prefix}.delta`, item_id, delta: "Hi " }),
|
|
219
270
|
});
|
|
220
271
|
fake.fire("message", {
|
|
221
|
-
data: JSON.stringify({
|
|
222
|
-
type: "response.audio_transcript.delta",
|
|
223
|
-
item_id,
|
|
224
|
-
delta: "there.",
|
|
225
|
-
}),
|
|
272
|
+
data: JSON.stringify({ type: `${prefix}.delta`, item_id, delta: "there." }),
|
|
226
273
|
});
|
|
227
274
|
expect(cbs.onAgentTranscript).not.toHaveBeenCalled();
|
|
228
275
|
fake.fire("message", {
|
|
229
|
-
data: JSON.stringify({ type: "response.audio_transcript.done", item_id }),
|
|
276
|
+
data: JSON.stringify({ type: `${prefix}.done`, item_id }),
|
|
230
277
|
});
|
|
231
278
|
expect(cbs.onAgentTranscript).toHaveBeenCalledWith("Hi there.", false);
|
|
232
279
|
});
|
|
@@ -308,15 +355,36 @@ describe("tool calls", () => {
|
|
|
308
355
|
await ready;
|
|
309
356
|
fake.sent.length = 0; // drop session.update
|
|
310
357
|
transport.sendToolResult("call_1", '{"ok":true}');
|
|
311
|
-
|
|
358
|
+
// function_call_output is sent immediately; response.create is queued.
|
|
359
|
+
expect(fake.sent.length).toBe(1);
|
|
312
360
|
const m1 = JSON.parse(fake.sent[0] ?? "{}");
|
|
313
361
|
expect(m1.type).toBe("conversation.item.create");
|
|
314
362
|
expect(m1.item.type).toBe("function_call_output");
|
|
315
363
|
expect(m1.item.call_id).toBe("call_1");
|
|
316
364
|
expect(m1.item.output).toBe('{"ok":true}');
|
|
365
|
+
await new Promise((r) => queueMicrotask(() => r(undefined)));
|
|
366
|
+
expect(fake.sent.length).toBe(2);
|
|
317
367
|
const m2 = JSON.parse(fake.sent[1] ?? "{}");
|
|
318
368
|
expect(m2.type).toBe("response.create");
|
|
319
369
|
});
|
|
370
|
+
|
|
371
|
+
test("multiple sendToolResult calls coalesce into a single response.create", async () => {
|
|
372
|
+
const { fake, transport, ready } = startedTransport();
|
|
373
|
+
await ready;
|
|
374
|
+
fake.sent.length = 0;
|
|
375
|
+
// Synchronous burst — session-core flushes pending tool results in a loop.
|
|
376
|
+
transport.sendToolResult("call_1", '{"a":1}');
|
|
377
|
+
transport.sendToolResult("call_2", '{"b":2}');
|
|
378
|
+
transport.sendToolResult("call_3", '{"c":3}');
|
|
379
|
+
// Three function_call_outputs sent immediately, no response.create yet.
|
|
380
|
+
expect(fake.sent.length).toBe(3);
|
|
381
|
+
expect(fake.sent.every((s) => JSON.parse(s).type === "conversation.item.create")).toBe(true);
|
|
382
|
+
await new Promise((r) => queueMicrotask(() => r(undefined)));
|
|
383
|
+
// After the microtask, exactly one response.create — second one would be
|
|
384
|
+
// rejected as `conversation_already_has_active_response`.
|
|
385
|
+
expect(fake.sent.length).toBe(4);
|
|
386
|
+
expect(JSON.parse(fake.sent[3] ?? "{}").type).toBe("response.create");
|
|
387
|
+
});
|
|
320
388
|
});
|
|
321
389
|
|
|
322
390
|
describe("cancel, error, close", () => {
|
|
package/host/transports/openai-realtime-transport.ts
CHANGED
|
@@ -49,6 +49,8 @@ export type OpenaiRealtimeTransportOptions = {
|
|
|
49
49
|
callbacks: TransportCallbacks;
|
|
50
50
|
sid: string;
|
|
51
51
|
agent: string;
|
|
52
|
+
/** Skip the initial greeting (used for session resume). */
|
|
53
|
+
skipGreeting?: boolean;
|
|
52
54
|
createWebSocket?: CreateOpenaiRealtimeWebSocket;
|
|
53
55
|
logger?: Logger;
|
|
54
56
|
};
|
|
@@ -66,6 +68,7 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
|
|
|
66
68
|
type ToolBuffer = { callId: string; name: string; argsBuffer: string };
|
|
67
69
|
const toolBuffers = new Map<string, ToolBuffer>();
|
|
68
70
|
let currentResponseId: string | null = null;
|
|
71
|
+
let responseCreateQueued = false;
|
|
69
72
|
|
|
70
73
|
function send(payload: Record<string, unknown>): void {
|
|
71
74
|
if (!ws || ws.readyState !== WS_OPEN) {
|
|
@@ -75,17 +78,37 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
|
|
|
75
78
|
ws.send(JSON.stringify(payload));
|
|
76
79
|
}
|
|
77
80
|
|
|
81
|
+
function sendGreeting(): void {
|
|
82
|
+
if (opts.skipGreeting) return;
|
|
83
|
+
const greeting = opts.sessionConfig.greeting;
|
|
84
|
+
if (!greeting) return;
|
|
85
|
+
// OpenAI Realtime has no native greeting field — trigger it as a one-shot
|
|
86
|
+
// response with custom instructions that override the system prompt for
|
|
87
|
+
// this turn only. Audio + transcript ride the normal response.* events.
|
|
88
|
+
send({
|
|
89
|
+
type: "response.create",
|
|
90
|
+
response: { instructions: `Say exactly: ${JSON.stringify(greeting)}` },
|
|
91
|
+
});
|
|
92
|
+
}
|
|
93
|
+
|
|
78
94
|
function sendSessionUpdate(): void {
|
|
79
95
|
send({
|
|
80
96
|
type: "session.update",
|
|
81
97
|
session: {
|
|
82
|
-
|
|
83
|
-
|
|
98
|
+
type: "realtime",
|
|
99
|
+
output_modalities: ["audio"],
|
|
84
100
|
instructions: opts.sessionConfig.systemPrompt,
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
101
|
+
audio: {
|
|
102
|
+
input: {
|
|
103
|
+
format: { type: "audio/pcm", rate: 24_000 },
|
|
104
|
+
turn_detection: { type: "server_vad" },
|
|
105
|
+
transcription: { model: "whisper-1" },
|
|
106
|
+
},
|
|
107
|
+
output: {
|
|
108
|
+
format: { type: "audio/pcm", rate: 24_000 },
|
|
109
|
+
voice,
|
|
110
|
+
},
|
|
111
|
+
},
|
|
89
112
|
tools: opts.toolSchemas,
|
|
90
113
|
tool_choice: opts.toolChoice,
|
|
91
114
|
},
|
|
@@ -99,7 +122,6 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
|
|
|
99
122
|
const sock = createWs(url, {
|
|
100
123
|
headers: {
|
|
101
124
|
Authorization: `Bearer ${opts.apiKey}`,
|
|
102
|
-
"OpenAI-Beta": "realtime=v1",
|
|
103
125
|
},
|
|
104
126
|
});
|
|
105
127
|
ws = sock;
|
|
@@ -108,6 +130,7 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
|
|
|
108
130
|
sock.addEventListener("open", () => {
|
|
109
131
|
opened = true;
|
|
110
132
|
sendSessionUpdate();
|
|
133
|
+
sendGreeting();
|
|
111
134
|
resolve();
|
|
112
135
|
});
|
|
113
136
|
sock.addEventListener("message", (ev) => handleMessage(ev.data));
|
|
@@ -177,6 +200,7 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
|
|
|
177
200
|
function handleErrorEvent(obj: Record<string, unknown>): void {
|
|
178
201
|
const err = obj.error as { message?: unknown } | undefined;
|
|
179
202
|
const message = typeof err?.message === "string" ? err.message : "OpenAI Realtime error";
|
|
203
|
+
log.warn("OpenAI Realtime error event", { error: obj.error });
|
|
180
204
|
clearTurnBuffers();
|
|
181
205
|
opts.callbacks.onError("internal", message);
|
|
182
206
|
}
|
|
@@ -185,6 +209,11 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
|
|
|
185
209
|
const item = obj.item as
|
|
186
210
|
| { id?: string; type?: string; name?: string; call_id?: string }
|
|
187
211
|
| undefined;
|
|
212
|
+
log.info("OpenAI Realtime output_item.added", {
|
|
213
|
+
itemType: item?.type,
|
|
214
|
+
name: item?.name,
|
|
215
|
+
callId: item?.call_id,
|
|
216
|
+
});
|
|
188
217
|
if (item?.type !== "function_call" || !item.id) return;
|
|
189
218
|
toolBuffers.set(item.id, {
|
|
190
219
|
callId: item.call_id ?? "",
|
|
@@ -220,6 +249,7 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
|
|
|
220
249
|
const callId = asString(obj.call_id) || (buf?.callId ?? "");
|
|
221
250
|
const name = asString(obj.name) || (buf?.name ?? "");
|
|
222
251
|
const argsStr = asString(obj.arguments) || (buf?.argsBuffer ?? "");
|
|
252
|
+
log.info("OpenAI Realtime tool call", { name, callId, args: argsStr });
|
|
223
253
|
const args = parseToolArgs(argsStr, name, callId);
|
|
224
254
|
opts.callbacks.onToolCall(callId, name, args);
|
|
225
255
|
}
|
|
@@ -235,9 +265,14 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
|
|
|
235
265
|
if (typeof raw !== "object" || raw === null) return;
|
|
236
266
|
const obj = raw as Record<string, unknown>;
|
|
237
267
|
switch (obj.type) {
|
|
268
|
+
// GA renamed audio output events to `response.output_audio.*` and
|
|
269
|
+
// transcript events to `response.output_audio_transcript.*`. The legacy
|
|
270
|
+
// (beta) names are kept as aliases so older snapshots still work.
|
|
271
|
+
case "response.output_audio.delta":
|
|
238
272
|
case "response.audio.delta":
|
|
239
273
|
handleAudioDelta(obj);
|
|
240
274
|
return;
|
|
275
|
+
case "response.output_audio.done":
|
|
241
276
|
case "response.audio.done":
|
|
242
277
|
opts.callbacks.onAudioDone();
|
|
243
278
|
return;
|
|
@@ -253,9 +288,11 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
|
|
|
253
288
|
case "response.created":
|
|
254
289
|
handleResponseCreated(obj);
|
|
255
290
|
return;
|
|
291
|
+
case "response.output_audio_transcript.delta":
|
|
256
292
|
case "response.audio_transcript.delta":
|
|
257
293
|
handleAgentTranscriptDelta(obj);
|
|
258
294
|
return;
|
|
295
|
+
case "response.output_audio_transcript.done":
|
|
259
296
|
case "response.audio_transcript.done":
|
|
260
297
|
handleAgentTranscriptDone(obj);
|
|
261
298
|
return;
|
|
@@ -275,6 +312,7 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
|
|
|
275
312
|
handleErrorEvent(obj);
|
|
276
313
|
return;
|
|
277
314
|
default:
|
|
315
|
+
log.debug("OpenAI Realtime: unhandled event", { type: obj.type });
|
|
278
316
|
return;
|
|
279
317
|
}
|
|
280
318
|
}
|
|
@@ -302,11 +340,25 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
|
|
|
302
340
|
ws.send(`{"type":"input_audio_buffer.append","audio":"${uint8ToBase64(bytes)}"}`);
|
|
303
341
|
},
|
|
304
342
|
sendToolResult(callId, result) {
|
|
343
|
+
log.info("OpenAI Realtime sendToolResult", {
|
|
344
|
+
callId,
|
|
345
|
+
resultLen: result.length,
|
|
346
|
+
preview: result.slice(0, 200),
|
|
347
|
+
});
|
|
305
348
|
send({
|
|
306
349
|
type: "conversation.item.create",
|
|
307
350
|
item: { type: "function_call_output", call_id: callId, output: result },
|
|
308
351
|
});
|
|
309
|
-
send({ type: "response.create" });
|
|
352
|
+
// Multiple tool results from one turn arrive synchronously; coalesce them
|
|
353
|
+
// into a single response.create per tick. OpenAI rejects a second
|
|
354
|
+
// response.create while one is in flight, which strands the turn.
|
|
355
|
+
if (!responseCreateQueued) {
|
|
356
|
+
responseCreateQueued = true;
|
|
357
|
+
queueMicrotask(() => {
|
|
358
|
+
responseCreateQueued = false;
|
|
359
|
+
send({ type: "response.create" });
|
|
360
|
+
});
|
|
361
|
+
}
|
|
310
362
|
},
|
|
311
363
|
cancelReply() {
|
|
312
364
|
if (currentResponseId === null) return;
|