@alexkroman1/aai 1.8.0 → 1.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
 
2
- > @alexkroman1/aai@1.8.0 build /home/runner/work/agent/agent/packages/aai
2
+ > @alexkroman1/aai@1.8.1 build /home/runner/work/agent/agent/packages/aai
3
3
  > tsdown && tsc -p tsconfig.build.json
4
4
 
5
5
  ℹ tsdown v0.21.7 powered by rolldown v1.0.0-rc.12
@@ -8,7 +8,7 @@
8
8
  ℹ target: node22
9
9
  ℹ tsconfig: tsconfig.json
10
10
  ℹ Build start
11
- ℹ dist/host/runtime-barrel.js 108.22 kB │ gzip: 29.84 kB
11
+ ℹ dist/host/runtime-barrel.js 109.52 kB │ gzip: 30.12 kB
12
12
  ℹ dist/sdk/protocol.js  5.70 kB │ gzip: 1.92 kB
13
13
  ℹ dist/index.js  2.88 kB │ gzip: 1.24 kB
14
14
  ℹ dist/sdk/manifest-barrel.js  0.36 kB │ gzip: 0.20 kB
@@ -28,5 +28,5 @@
28
28
  ℹ dist/s3-BtCMvCod.js  0.76 kB │ gzip: 0.29 kB
29
29
  ℹ dist/pinecone-CeJ69aRs.js  0.48 kB │ gzip: 0.24 kB
30
30
  ℹ dist/openai-realtime-cjPAHMMx.js  0.27 kB │ gzip: 0.19 kB
31
- ℹ 20 files, total: 138.16 kB
32
- ✔ Build complete in 48ms
31
+ ℹ 20 files, total: 139.45 kB
32
+ ✔ Build complete in 62ms
package/CHANGELOG.md CHANGED
@@ -1,5 +1,18 @@
1
1
  # @alexkroman1/aai
2
2
 
3
+ ## 1.8.1
4
+
5
+ ### Patch Changes
6
+
7
+ - ba8effb: Make OpenAI Realtime usable end-to-end on gpt-realtime-2:
8
+
9
+ - Accept GA-renamed audio/transcript server events (`response.output_audio.{delta,done}`, `response.output_audio_transcript.{delta,done}`) alongside the legacy `response.audio.*` names so audio and transcript reach the client.
10
+ - Trigger the agent's `greeting` on connect by sending a one-shot `response.create` with quoted instructions, and honor `skipGreeting` so resumed sessions don't replay it.
11
+ - Coalesce `response.create` across multiple `sendToolResult` calls in the same tick. Multi-tool turns previously sent one `response.create` per tool, the second of which OpenAI rejected as `conversation_already_has_active_response`, stranding the turn so the model never received the tool results.
12
+ - Log unhandled event types and the full payload of `error` events to make silently rejected `session.update` fields visible.
13
+
14
+ - f4cc5ef: Migrate OpenAI Realtime transport to GA API schema (gpt-realtime-2). Drop OpenAI-Beta: realtime=v1 connect header and update session.update to session.type=realtime, output_modalities, and nested audio.input/audio.output with audio/pcm format.
15
+
3
16
  ## 1.8.0
4
17
 
5
18
  ### Minor Changes
@@ -1681,6 +1681,7 @@ function createOpenaiRealtimeTransport(opts) {
1681
1681
  const agentTranscriptBuffers = /* @__PURE__ */ new Map();
1682
1682
  const toolBuffers = /* @__PURE__ */ new Map();
1683
1683
  let currentResponseId = null;
1684
+ let responseCreateQueued = false;
1684
1685
  function send(payload) {
1685
1686
  if (!ws || ws.readyState !== 1) {
1686
1687
  log.debug("OpenAI Realtime send dropped: socket not open", { type: payload.type });
@@ -1688,17 +1689,39 @@ function createOpenaiRealtimeTransport(opts) {
1688
1689
  }
1689
1690
  ws.send(JSON.stringify(payload));
1690
1691
  }
1692
+ function sendGreeting() {
1693
+ if (opts.skipGreeting) return;
1694
+ const greeting = opts.sessionConfig.greeting;
1695
+ if (!greeting) return;
1696
+ send({
1697
+ type: "response.create",
1698
+ response: { instructions: `Say exactly: ${JSON.stringify(greeting)}` }
1699
+ });
1700
+ }
1691
1701
  function sendSessionUpdate() {
1692
1702
  send({
1693
1703
  type: "session.update",
1694
1704
  session: {
1695
- modalities: ["audio", "text"],
1696
- voice,
1705
+ type: "realtime",
1706
+ output_modalities: ["audio"],
1697
1707
  instructions: opts.sessionConfig.systemPrompt,
1698
- input_audio_format: "pcm16",
1699
- output_audio_format: "pcm16",
1700
- input_audio_transcription: { model: "whisper-1" },
1701
- turn_detection: { type: "server_vad" },
1708
+ audio: {
1709
+ input: {
1710
+ format: {
1711
+ type: "audio/pcm",
1712
+ rate: 24e3
1713
+ },
1714
+ turn_detection: { type: "server_vad" },
1715
+ transcription: { model: "whisper-1" }
1716
+ },
1717
+ output: {
1718
+ format: {
1719
+ type: "audio/pcm",
1720
+ rate: 24e3
1721
+ },
1722
+ voice
1723
+ }
1724
+ },
1702
1725
  tools: opts.toolSchemas,
1703
1726
  tool_choice: opts.toolChoice
1704
1727
  }
@@ -1708,15 +1731,13 @@ function createOpenaiRealtimeTransport(opts) {
1708
1731
  const url = `${baseUrl}?model=${encodeURIComponent(model)}`;
1709
1732
  log.info("OpenAI Realtime connecting", { url });
1710
1733
  return new Promise((resolve, reject) => {
1711
- const sock = createWs(url, { headers: {
1712
- Authorization: `Bearer ${opts.apiKey}`,
1713
- "OpenAI-Beta": "realtime=v1"
1714
- } });
1734
+ const sock = createWs(url, { headers: { Authorization: `Bearer ${opts.apiKey}` } });
1715
1735
  ws = sock;
1716
1736
  let opened = false;
1717
1737
  sock.addEventListener("open", () => {
1718
1738
  opened = true;
1719
1739
  sendSessionUpdate();
1740
+ sendGreeting();
1720
1741
  resolve();
1721
1742
  });
1722
1743
  sock.addEventListener("message", (ev) => handleMessage(ev.data));
@@ -1773,11 +1794,17 @@ function createOpenaiRealtimeTransport(opts) {
1773
1794
  function handleErrorEvent(obj) {
1774
1795
  const err = obj.error;
1775
1796
  const message = typeof err?.message === "string" ? err.message : "OpenAI Realtime error";
1797
+ log.warn("OpenAI Realtime error event", { error: obj.error });
1776
1798
  clearTurnBuffers();
1777
1799
  opts.callbacks.onError("internal", message);
1778
1800
  }
1779
1801
  function handleOutputItemAdded(obj) {
1780
1802
  const item = obj.item;
1803
+ log.info("OpenAI Realtime output_item.added", {
1804
+ itemType: item?.type,
1805
+ name: item?.name,
1806
+ callId: item?.call_id
1807
+ });
1781
1808
  if (item?.type !== "function_call" || !item.id) return;
1782
1809
  toolBuffers.set(item.id, {
1783
1810
  callId: item.call_id ?? "",
@@ -1810,7 +1837,13 @@ function createOpenaiRealtimeTransport(opts) {
1810
1837
  toolBuffers.delete(id);
1811
1838
  const callId = asString(obj.call_id) || (buf?.callId ?? "");
1812
1839
  const name = asString(obj.name) || (buf?.name ?? "");
1813
- const args = parseToolArgs(asString(obj.arguments) || (buf?.argsBuffer ?? ""), name, callId);
1840
+ const argsStr = asString(obj.arguments) || (buf?.argsBuffer ?? "");
1841
+ log.info("OpenAI Realtime tool call", {
1842
+ name,
1843
+ callId,
1844
+ args: argsStr
1845
+ });
1846
+ const args = parseToolArgs(argsStr, name, callId);
1814
1847
  opts.callbacks.onToolCall(callId, name, args);
1815
1848
  }
1816
1849
  function handleMessage(data) {
@@ -1824,9 +1857,11 @@ function createOpenaiRealtimeTransport(opts) {
1824
1857
  if (typeof raw !== "object" || raw === null) return;
1825
1858
  const obj = raw;
1826
1859
  switch (obj.type) {
1860
+ case "response.output_audio.delta":
1827
1861
  case "response.audio.delta":
1828
1862
  handleAudioDelta(obj);
1829
1863
  return;
1864
+ case "response.output_audio.done":
1830
1865
  case "response.audio.done":
1831
1866
  opts.callbacks.onAudioDone();
1832
1867
  return;
@@ -1842,9 +1877,11 @@ function createOpenaiRealtimeTransport(opts) {
1842
1877
  case "response.created":
1843
1878
  handleResponseCreated(obj);
1844
1879
  return;
1880
+ case "response.output_audio_transcript.delta":
1845
1881
  case "response.audio_transcript.delta":
1846
1882
  handleAgentTranscriptDelta(obj);
1847
1883
  return;
1884
+ case "response.output_audio_transcript.done":
1848
1885
  case "response.audio_transcript.done":
1849
1886
  handleAgentTranscriptDone(obj);
1850
1887
  return;
@@ -1863,7 +1900,9 @@ function createOpenaiRealtimeTransport(opts) {
1863
1900
  case "error":
1864
1901
  handleErrorEvent(obj);
1865
1902
  return;
1866
- default: return;
1903
+ default:
1904
+ log.debug("OpenAI Realtime: unhandled event", { type: obj.type });
1905
+ return;
1867
1906
  }
1868
1907
  }
1869
1908
  function handleClose(code, reason) {
@@ -1893,6 +1932,11 @@ function createOpenaiRealtimeTransport(opts) {
1893
1932
  ws.send(`{"type":"input_audio_buffer.append","audio":"${uint8ToBase64(bytes)}"}`);
1894
1933
  },
1895
1934
  sendToolResult(callId, result) {
1935
+ log.info("OpenAI Realtime sendToolResult", {
1936
+ callId,
1937
+ resultLen: result.length,
1938
+ preview: result.slice(0, 200)
1939
+ });
1896
1940
  send({
1897
1941
  type: "conversation.item.create",
1898
1942
  item: {
@@ -1901,7 +1945,13 @@ function createOpenaiRealtimeTransport(opts) {
1901
1945
  output: result
1902
1946
  }
1903
1947
  });
1904
- send({ type: "response.create" });
1948
+ if (!responseCreateQueued) {
1949
+ responseCreateQueued = true;
1950
+ queueMicrotask(() => {
1951
+ responseCreateQueued = false;
1952
+ send({ type: "response.create" });
1953
+ });
1954
+ }
1905
1955
  },
1906
1956
  cancelReply() {
1907
1957
  if (currentResponseId === null) return;
@@ -3085,6 +3135,7 @@ function createRuntime(opts) {
3085
3135
  callbacks,
3086
3136
  sid: sessionOpts.id,
3087
3137
  agent: sessionOpts.agent,
3138
+ skipGreeting: sessionOpts.skipGreeting ?? false,
3088
3139
  ...createOpenaiRealtimeWebSocket ? { createWebSocket: createOpenaiRealtimeWebSocket } : {},
3089
3140
  logger
3090
3141
  });
@@ -37,6 +37,8 @@ export type OpenaiRealtimeTransportOptions = {
37
37
  callbacks: TransportCallbacks;
38
38
  sid: string;
39
39
  agent: string;
40
+ /** Skip the initial greeting (used for session resume). */
41
+ skipGreeting?: boolean;
40
42
  createWebSocket?: CreateOpenaiRealtimeWebSocket;
41
43
  logger?: Logger;
42
44
  };
package/host/runtime.ts CHANGED
@@ -432,6 +432,7 @@ export function createRuntime(opts: RuntimeOptions): Runtime {
432
432
  callbacks,
433
433
  sid: sessionOpts.id,
434
434
  agent: sessionOpts.agent,
435
+ skipGreeting: sessionOpts.skipGreeting ?? false,
435
436
  ...(createOpenaiRealtimeWebSocket ? { createWebSocket: createOpenaiRealtimeWebSocket } : {}),
436
437
  logger,
437
438
  });
@@ -83,7 +83,6 @@ describe("openai-realtime-transport: connect and session.update", () => {
83
83
  options: { model: "gpt-realtime", voice: "cedar" },
84
84
  sessionConfig: {
85
85
  systemPrompt: "Be terse.",
86
- greeting: "Hi.",
87
86
  tools: [],
88
87
  },
89
88
  toolSchemas: [
@@ -109,10 +108,7 @@ describe("openai-realtime-transport: connect and session.update", () => {
109
108
  expect(createWs).toHaveBeenCalledWith(
110
109
  "wss://api.openai.com/v1/realtime?model=gpt-realtime",
111
110
  expect.objectContaining({
112
- headers: expect.objectContaining({
113
- Authorization: "Bearer sk-test",
114
- "OpenAI-Beta": "realtime=v1",
115
- }),
111
+ headers: { Authorization: "Bearer sk-test" },
116
112
  }),
117
113
  );
118
114
 
@@ -121,13 +117,14 @@ describe("openai-realtime-transport: connect and session.update", () => {
121
117
  if (first === undefined) throw new Error("expected one send");
122
118
  const msg = JSON.parse(first);
123
119
  expect(msg.type).toBe("session.update");
124
- expect(msg.session.voice).toBe("cedar");
120
+ expect(msg.session.type).toBe("realtime");
121
+ expect(msg.session.output_modalities).toEqual(["audio"]);
125
122
  expect(msg.session.instructions).toBe("Be terse.");
126
- expect(msg.session.input_audio_format).toBe("pcm16");
127
- expect(msg.session.output_audio_format).toBe("pcm16");
128
- expect(msg.session.modalities).toEqual(["audio", "text"]);
129
- expect(msg.session.input_audio_transcription).toEqual({ model: "whisper-1" });
130
- expect(msg.session.turn_detection.type).toBe("server_vad");
123
+ expect(msg.session.audio.input.format).toEqual({ type: "audio/pcm", rate: 24_000 });
124
+ expect(msg.session.audio.input.turn_detection.type).toBe("server_vad");
125
+ expect(msg.session.audio.input.transcription).toEqual({ model: "whisper-1" });
126
+ expect(msg.session.audio.output.format).toEqual({ type: "audio/pcm", rate: 24_000 });
127
+ expect(msg.session.audio.output.voice).toBe("cedar");
131
128
  expect(msg.session.tools).toEqual([
132
129
  expect.objectContaining({ type: "function", name: "lookup" }),
133
130
  ]);
@@ -135,6 +132,57 @@ describe("openai-realtime-transport: connect and session.update", () => {
135
132
  });
136
133
  });
137
134
 
135
+ describe("greeting", () => {
136
+ function makeWithGreeting(args: { greeting?: string; skipGreeting?: boolean }) {
137
+ const fake = makeFakeWs();
138
+ const transport = createOpenaiRealtimeTransport({
139
+ apiKey: "sk",
140
+ options: {},
141
+ sessionConfig: {
142
+ systemPrompt: "",
143
+ ...(args.greeting !== undefined ? { greeting: args.greeting } : {}),
144
+ },
145
+ toolSchemas: [],
146
+ toolChoice: "auto",
147
+ callbacks: noopCallbacks(),
148
+ sid: "s",
149
+ agent: "a",
150
+ ...(args.skipGreeting !== undefined ? { skipGreeting: args.skipGreeting } : {}),
151
+ createWebSocket: () => fake,
152
+ logger: silentLogger,
153
+ });
154
+ const ready = transport.start();
155
+ fake.fire("open");
156
+ return { fake, ready };
157
+ }
158
+
159
+ test("sends response.create with quoted greeting after session.update", async () => {
160
+ const { fake, ready } = makeWithGreeting({ greeting: 'Hello, "friend".' });
161
+ await ready;
162
+ expect(fake.sent.length).toBe(2);
163
+ expect(JSON.parse(fake.sent[0] ?? "{}").type).toBe("session.update");
164
+ const greetingMsg = JSON.parse(fake.sent[1] ?? "{}");
165
+ expect(greetingMsg.type).toBe("response.create");
166
+ // JSON.stringify quotes the greeting and escapes any embedded quotes —
167
+ // protects against prompt-injection by closing the instruction string.
168
+ expect(greetingMsg.response.instructions).toBe('Say exactly: "Hello, \\"friend\\"."');
169
+ });
170
+
171
+ test("no greeting send when greeting is undefined", async () => {
172
+ const { fake, ready } = makeWithGreeting({});
173
+ await ready;
174
+ expect(fake.sent.length).toBe(1);
175
+ expect(JSON.parse(fake.sent[0] ?? "{}").type).toBe("session.update");
176
+ });
177
+
178
+ test("skipGreeting suppresses the greeting send", async () => {
179
+ const { fake, ready } = makeWithGreeting({ greeting: "Hi.", skipGreeting: true });
180
+ await ready;
181
+ expect(fake.sent.length).toBe(1);
182
+ expect(JSON.parse(fake.sent[0] ?? "{}").type).toBe("session.update");
183
+ });
184
+ });
185
+
138
186
  describe("audio in/out", () => {
139
187
  test("sendUserAudio sends input_audio_buffer.append with base64 payload", async () => {
140
188
  const { fake, transport, ready } = startedTransport();
@@ -150,21 +198,25 @@ describe("audio in/out", () => {
150
198
  expect(Buffer.from(msg.audio, "base64")).toEqual(Buffer.from([1, 2, 3, 4]));
151
199
  });
152
200
 
153
- test("response.audio.delta calls onAudioChunk with decoded bytes", async () => {
201
+ test.each([
202
+ ["response.audio.delta"],
203
+ ["response.output_audio.delta"],
204
+ ])("%s calls onAudioChunk with decoded bytes", async (type) => {
154
205
  const { fake, cbs, ready } = startedTransport();
155
206
  await ready;
156
207
  const audio = Buffer.from([5, 6, 7, 8]).toString("base64");
157
- fake.fire("message", {
158
- data: JSON.stringify({ type: "response.audio.delta", delta: audio }),
159
- });
208
+ fake.fire("message", { data: JSON.stringify({ type, delta: audio }) });
160
209
  expect(cbs.onAudioChunk).toHaveBeenCalledTimes(1);
161
210
  expect(cbs.onAudioChunk).toHaveBeenCalledWith(new Uint8Array([5, 6, 7, 8]));
162
211
  });
163
212
 
164
- test("response.audio.done calls onAudioDone", async () => {
213
+ test.each([
214
+ ["response.audio.done"],
215
+ ["response.output_audio.done"],
216
+ ])("%s calls onAudioDone", async (type) => {
165
217
  const { fake, cbs, ready } = startedTransport();
166
218
  await ready;
167
- fake.fire("message", { data: JSON.stringify({ type: "response.audio.done" }) });
219
+ fake.fire("message", { data: JSON.stringify({ type }) });
168
220
  expect(cbs.onAudioDone).toHaveBeenCalledTimes(1);
169
221
  });
170
222
  });
@@ -206,27 +258,22 @@ describe("VAD, user transcript, reply lifecycle, agent transcript", () => {
206
258
  expect(cbs.onReplyDone).toHaveBeenCalledTimes(1);
207
259
  });
208
260
 
209
- test("agent transcript: deltas accumulated, emitted on done", async () => {
261
+ test.each([
262
+ ["response.audio_transcript", "legacy"],
263
+ ["response.output_audio_transcript", "GA"],
264
+ ])("agent transcript (%s): deltas accumulated, emitted on done", async (prefix) => {
210
265
  const { fake, cbs, ready } = startedTransport();
211
266
  await ready;
212
267
  const item_id = "item_x";
213
268
  fake.fire("message", {
214
- data: JSON.stringify({
215
- type: "response.audio_transcript.delta",
216
- item_id,
217
- delta: "Hi ",
218
- }),
269
+ data: JSON.stringify({ type: `${prefix}.delta`, item_id, delta: "Hi " }),
219
270
  });
220
271
  fake.fire("message", {
221
- data: JSON.stringify({
222
- type: "response.audio_transcript.delta",
223
- item_id,
224
- delta: "there.",
225
- }),
272
+ data: JSON.stringify({ type: `${prefix}.delta`, item_id, delta: "there." }),
226
273
  });
227
274
  expect(cbs.onAgentTranscript).not.toHaveBeenCalled();
228
275
  fake.fire("message", {
229
- data: JSON.stringify({ type: "response.audio_transcript.done", item_id }),
276
+ data: JSON.stringify({ type: `${prefix}.done`, item_id }),
230
277
  });
231
278
  expect(cbs.onAgentTranscript).toHaveBeenCalledWith("Hi there.", false);
232
279
  });
@@ -308,15 +355,36 @@ describe("tool calls", () => {
308
355
  await ready;
309
356
  fake.sent.length = 0; // drop session.update
310
357
  transport.sendToolResult("call_1", '{"ok":true}');
311
- expect(fake.sent.length).toBe(2);
358
+ // function_call_output is sent immediately; response.create is queued.
359
+ expect(fake.sent.length).toBe(1);
312
360
  const m1 = JSON.parse(fake.sent[0] ?? "{}");
313
361
  expect(m1.type).toBe("conversation.item.create");
314
362
  expect(m1.item.type).toBe("function_call_output");
315
363
  expect(m1.item.call_id).toBe("call_1");
316
364
  expect(m1.item.output).toBe('{"ok":true}');
365
+ await new Promise((r) => queueMicrotask(() => r(undefined)));
366
+ expect(fake.sent.length).toBe(2);
317
367
  const m2 = JSON.parse(fake.sent[1] ?? "{}");
318
368
  expect(m2.type).toBe("response.create");
319
369
  });
370
+
371
+ test("multiple sendToolResult calls coalesce into a single response.create", async () => {
372
+ const { fake, transport, ready } = startedTransport();
373
+ await ready;
374
+ fake.sent.length = 0;
375
+ // Synchronous burst — session-core flushes pending tool results in a loop.
376
+ transport.sendToolResult("call_1", '{"a":1}');
377
+ transport.sendToolResult("call_2", '{"b":2}');
378
+ transport.sendToolResult("call_3", '{"c":3}');
379
+ // Three function_call_outputs sent immediately, no response.create yet.
380
+ expect(fake.sent.length).toBe(3);
381
+ expect(fake.sent.every((s) => JSON.parse(s).type === "conversation.item.create")).toBe(true);
382
+ await new Promise((r) => queueMicrotask(() => r(undefined)));
383
+ // After the microtask, exactly one response.create — second one would be
384
+ // rejected as `conversation_already_has_active_response`.
385
+ expect(fake.sent.length).toBe(4);
386
+ expect(JSON.parse(fake.sent[3] ?? "{}").type).toBe("response.create");
387
+ });
320
388
  });
321
389
 
322
390
  describe("cancel, error, close", () => {
@@ -49,6 +49,8 @@ export type OpenaiRealtimeTransportOptions = {
49
49
  callbacks: TransportCallbacks;
50
50
  sid: string;
51
51
  agent: string;
52
+ /** Skip the initial greeting (used for session resume). */
53
+ skipGreeting?: boolean;
52
54
  createWebSocket?: CreateOpenaiRealtimeWebSocket;
53
55
  logger?: Logger;
54
56
  };
@@ -66,6 +68,7 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
66
68
  type ToolBuffer = { callId: string; name: string; argsBuffer: string };
67
69
  const toolBuffers = new Map<string, ToolBuffer>();
68
70
  let currentResponseId: string | null = null;
71
+ let responseCreateQueued = false;
69
72
 
70
73
  function send(payload: Record<string, unknown>): void {
71
74
  if (!ws || ws.readyState !== WS_OPEN) {
@@ -75,17 +78,37 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
75
78
  ws.send(JSON.stringify(payload));
76
79
  }
77
80
 
81
+ function sendGreeting(): void {
82
+ if (opts.skipGreeting) return;
83
+ const greeting = opts.sessionConfig.greeting;
84
+ if (!greeting) return;
85
+ // OpenAI Realtime has no native greeting field — trigger it as a one-shot
86
+ // response with custom instructions that override the system prompt for
87
+ // this turn only. Audio + transcript ride the normal response.* events.
88
+ send({
89
+ type: "response.create",
90
+ response: { instructions: `Say exactly: ${JSON.stringify(greeting)}` },
91
+ });
92
+ }
93
+
78
94
  function sendSessionUpdate(): void {
79
95
  send({
80
96
  type: "session.update",
81
97
  session: {
82
- modalities: ["audio", "text"],
83
- voice,
98
+ type: "realtime",
99
+ output_modalities: ["audio"],
84
100
  instructions: opts.sessionConfig.systemPrompt,
85
- input_audio_format: "pcm16",
86
- output_audio_format: "pcm16",
87
- input_audio_transcription: { model: "whisper-1" },
88
- turn_detection: { type: "server_vad" },
101
+ audio: {
102
+ input: {
103
+ format: { type: "audio/pcm", rate: 24_000 },
104
+ turn_detection: { type: "server_vad" },
105
+ transcription: { model: "whisper-1" },
106
+ },
107
+ output: {
108
+ format: { type: "audio/pcm", rate: 24_000 },
109
+ voice,
110
+ },
111
+ },
89
112
  tools: opts.toolSchemas,
90
113
  tool_choice: opts.toolChoice,
91
114
  },
@@ -99,7 +122,6 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
99
122
  const sock = createWs(url, {
100
123
  headers: {
101
124
  Authorization: `Bearer ${opts.apiKey}`,
102
- "OpenAI-Beta": "realtime=v1",
103
125
  },
104
126
  });
105
127
  ws = sock;
@@ -108,6 +130,7 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
108
130
  sock.addEventListener("open", () => {
109
131
  opened = true;
110
132
  sendSessionUpdate();
133
+ sendGreeting();
111
134
  resolve();
112
135
  });
113
136
  sock.addEventListener("message", (ev) => handleMessage(ev.data));
@@ -177,6 +200,7 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
177
200
  function handleErrorEvent(obj: Record<string, unknown>): void {
178
201
  const err = obj.error as { message?: unknown } | undefined;
179
202
  const message = typeof err?.message === "string" ? err.message : "OpenAI Realtime error";
203
+ log.warn("OpenAI Realtime error event", { error: obj.error });
180
204
  clearTurnBuffers();
181
205
  opts.callbacks.onError("internal", message);
182
206
  }
@@ -185,6 +209,11 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
185
209
  const item = obj.item as
186
210
  | { id?: string; type?: string; name?: string; call_id?: string }
187
211
  | undefined;
212
+ log.info("OpenAI Realtime output_item.added", {
213
+ itemType: item?.type,
214
+ name: item?.name,
215
+ callId: item?.call_id,
216
+ });
188
217
  if (item?.type !== "function_call" || !item.id) return;
189
218
  toolBuffers.set(item.id, {
190
219
  callId: item.call_id ?? "",
@@ -220,6 +249,7 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
220
249
  const callId = asString(obj.call_id) || (buf?.callId ?? "");
221
250
  const name = asString(obj.name) || (buf?.name ?? "");
222
251
  const argsStr = asString(obj.arguments) || (buf?.argsBuffer ?? "");
252
+ log.info("OpenAI Realtime tool call", { name, callId, args: argsStr });
223
253
  const args = parseToolArgs(argsStr, name, callId);
224
254
  opts.callbacks.onToolCall(callId, name, args);
225
255
  }
@@ -235,9 +265,14 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
235
265
  if (typeof raw !== "object" || raw === null) return;
236
266
  const obj = raw as Record<string, unknown>;
237
267
  switch (obj.type) {
268
+ // GA renamed audio output events to `response.output_audio.*` and
269
+ // transcript events to `response.output_audio_transcript.*`. The legacy
270
+ // (beta) names are kept as aliases so older snapshots still work.
271
+ case "response.output_audio.delta":
238
272
  case "response.audio.delta":
239
273
  handleAudioDelta(obj);
240
274
  return;
275
+ case "response.output_audio.done":
241
276
  case "response.audio.done":
242
277
  opts.callbacks.onAudioDone();
243
278
  return;
@@ -253,9 +288,11 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
253
288
  case "response.created":
254
289
  handleResponseCreated(obj);
255
290
  return;
291
+ case "response.output_audio_transcript.delta":
256
292
  case "response.audio_transcript.delta":
257
293
  handleAgentTranscriptDelta(obj);
258
294
  return;
295
+ case "response.output_audio_transcript.done":
259
296
  case "response.audio_transcript.done":
260
297
  handleAgentTranscriptDone(obj);
261
298
  return;
@@ -275,6 +312,7 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
275
312
  handleErrorEvent(obj);
276
313
  return;
277
314
  default:
315
+ log.debug("OpenAI Realtime: unhandled event", { type: obj.type });
278
316
  return;
279
317
  }
280
318
  }
@@ -302,11 +340,25 @@ export function createOpenaiRealtimeTransport(opts: OpenaiRealtimeTransportOptio
302
340
  ws.send(`{"type":"input_audio_buffer.append","audio":"${uint8ToBase64(bytes)}"}`);
303
341
  },
304
342
  sendToolResult(callId, result) {
343
+ log.info("OpenAI Realtime sendToolResult", {
344
+ callId,
345
+ resultLen: result.length,
346
+ preview: result.slice(0, 200),
347
+ });
305
348
  send({
306
349
  type: "conversation.item.create",
307
350
  item: { type: "function_call_output", call_id: callId, output: result },
308
351
  });
309
- send({ type: "response.create" });
352
+ // Multiple tool results from one turn arrive synchronously; coalesce them
353
+ // into a single response.create per tick. OpenAI rejects a second
354
+ // response.create while one is in flight, which strands the turn.
355
+ if (!responseCreateQueued) {
356
+ responseCreateQueued = true;
357
+ queueMicrotask(() => {
358
+ responseCreateQueued = false;
359
+ send({ type: "response.create" });
360
+ });
361
+ }
310
362
  },
311
363
  cancelReply() {
312
364
  if (currentResponseId === null) return;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@alexkroman1/aai",
3
- "version": "1.8.0",
3
+ "version": "1.8.1",
4
4
  "type": "module",
5
5
  "exports": {
6
6
  ".": {