@kernl-sdk/openai 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +5 -0
- package/CHANGELOG.md +15 -0
- package/dist/__tests__/realtime.integration.test.d.ts +2 -0
- package/dist/__tests__/realtime.integration.test.d.ts.map +1 -0
- package/dist/__tests__/realtime.integration.test.js +169 -0
- package/dist/__tests__/realtime.test.d.ts +2 -0
- package/dist/__tests__/realtime.test.d.ts.map +1 -0
- package/dist/__tests__/realtime.test.js +314 -0
- package/dist/convert/__tests__/event.test.d.ts +2 -0
- package/dist/convert/__tests__/event.test.d.ts.map +1 -0
- package/dist/convert/__tests__/event.test.js +514 -0
- package/dist/convert/event.d.ts +24 -0
- package/dist/convert/event.d.ts.map +1 -0
- package/dist/convert/event.js +398 -0
- package/dist/convert/types.d.ts +259 -0
- package/dist/convert/types.d.ts.map +1 -0
- package/dist/convert/types.js +1 -0
- package/dist/index.d.ts +36 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +16 -0
- package/dist/realtime.d.ts +30 -0
- package/dist/realtime.d.ts.map +1 -0
- package/dist/realtime.js +214 -0
- package/package.json +54 -0
- package/src/__tests__/realtime.integration.test.ts +217 -0
- package/src/__tests__/realtime.test.ts +421 -0
- package/src/convert/__tests__/event.test.ts +592 -0
- package/src/convert/event.ts +481 -0
- package/src/convert/types.ts +344 -0
- package/src/index.ts +41 -0
- package/src/realtime.ts +276 -0
- package/tsconfig.json +13 -0
package/CHANGELOG.md
ADDED
@@ -0,0 +1,15 @@
+# @kernl-sdk/openai
+
+## 0.1.0
+
+### Minor Changes
+
+- 572ae80: Add realtime voice agent support with memory capabilities.
+  - **protocol**: Add realtime model and event types for voice agents
+  - **kernl**: Extract BaseAgent class shared by Agent and RealtimeAgent, enabling memory support for realtime agents. Add `kind` discriminator for type narrowing.
+  - **openai**: Add OpenAI realtime voice provider with WebSocket-based streaming
+
+### Patch Changes
+
+- Updated dependencies [572ae80]
+  - @kernl-sdk/protocol@0.3.0
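For orientation, here is a minimal usage sketch of the realtime provider this changeset describes, pieced together from the API surface exercised by the tests later in this diff (`connect()`, `send()`, the `event` emitter with `kind`-discriminated payloads, `close()`). Anything beyond that surface, including the exact option names, is an assumption rather than documented API:

```ts
import { OpenAIRealtimeModel } from "@kernl-sdk/openai";

// Falls back to the OPENAI_API_KEY env var when no key is passed.
const model = new OpenAIRealtimeModel("gpt-realtime", {
  apiKey: process.env.OPENAI_API_KEY,
});

const conn = await model.connect();

// Events are discriminated by `kind`, so a narrowing check is enough.
conn.on("event", (e) => {
  if (e.kind === "text.output") console.log(e.text);
  if (e.kind === "response.done") conn.close();
});

// Configure the session, enqueue a user message, then request a response.
conn.send({
  kind: "session.update",
  config: { modalities: ["text"], instructions: "Be brief." },
});
conn.send({
  kind: "item.create",
  item: {
    kind: "message",
    id: "msg-1",
    role: "user",
    content: [{ kind: "text", text: "Say hello." }],
  },
});
conn.send({ kind: "response.create" });
```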
package/dist/__tests__/realtime.integration.test.d.ts.map
ADDED
@@ -0,0 +1 @@
+{"version":3,"file":"realtime.integration.test.d.ts","sourceRoot":"","sources":["../../src/__tests__/realtime.integration.test.ts"],"names":[],"mappings":""}
package/dist/__tests__/realtime.integration.test.js
ADDED
@@ -0,0 +1,169 @@
+import { describe, it, expect, beforeAll } from "vitest";
+import { OpenAIRealtimeModel } from "../realtime.js";
+const OPENAI_API_KEY = process.env.OPENAI_API_KEY;
+describe.skipIf(!OPENAI_API_KEY)("OpenAI Realtime Integration", () => {
+  let model;
+  beforeAll(() => {
+    model = new OpenAIRealtimeModel("gpt-realtime", {
+      apiKey: OPENAI_API_KEY,
+    });
+  });
+  it("should connect and receive session.created", async () => {
+    const conn = await model.connect();
+    const events = [];
+    const sessionCreated = new Promise((resolve, reject) => {
+      const timeout = setTimeout(() => reject(new Error("timeout")), 10000);
+      conn.on("event", (e) => {
+        events.push(e);
+        if (e.kind === "session.created") {
+          clearTimeout(timeout);
+          resolve();
+        }
+      });
+    });
+    await sessionCreated;
+    conn.close();
+    expect(events.some((e) => e.kind === "session.created")).toBe(true);
+    expect(conn.sessionId).toBeTruthy();
+  });
+  it("should complete text round-trip", async () => {
+    const conn = await model.connect();
+    const events = [];
+    conn.on("event", (e) => {
+      events.push(e);
+    });
+    // wait for session
+    await waitFor(conn, "session.created");
+    // configure text-only mode
+    conn.send({
+      kind: "session.update",
+      config: {
+        modalities: ["text"],
+        instructions: "You are a helpful assistant. Be very brief.",
+      },
+    });
+    await waitFor(conn, "session.updated");
+    // add user message
+    conn.send({
+      kind: "item.create",
+      item: {
+        kind: "message",
+        id: "test-msg-1",
+        role: "user",
+        content: [{ kind: "text", text: "Say exactly: hello world" }],
+      },
+    });
+    // trigger response
+    conn.send({ kind: "response.create" });
+    // wait for response to complete
+    await waitFor(conn, "response.done", 15000);
+    conn.close();
+    // verify event flow
+    const kinds = events.map((e) => e.kind);
+    expect(kinds).toContain("session.created");
+    expect(kinds).toContain("session.updated");
+    expect(kinds).toContain("response.created");
+    expect(kinds).toContain("response.done");
+    // verify we got text output
+    const textOutput = events.find((e) => e.kind === "text.output");
+    expect(textOutput).toBeDefined();
+    if (textOutput?.kind === "text.output") {
+      expect(textOutput.text.toLowerCase()).toContain("hello");
+    }
+    // verify response completed successfully
+    const done = events.find((e) => e.kind === "response.done");
+    if (done?.kind === "response.done") {
+      expect(done.status).toBe("completed");
+    }
+  });
+  it("should handle tool calling", { timeout: 10000 }, async () => {
+    const conn = await model.connect();
+    const events = [];
+    conn.on("event", (e) => {
+      events.push(e);
+    });
+    await waitFor(conn, "session.created");
+    // configure with a tool
+    conn.send({
+      kind: "session.update",
+      config: {
+        modalities: ["text"],
+        instructions: "You have access to tools. Use them when appropriate.",
+        tools: [
+          {
+            kind: "function",
+            name: "get_weather",
+            description: "Get the current weather for a location",
+            parameters: {
+              type: "object",
+              properties: {
+                location: { type: "string", description: "City name" },
+              },
+              required: ["location"],
+            },
+          },
+        ],
+      },
+    });
+    await waitFor(conn, "session.updated");
+    // ask about weather
+    conn.send({
+      kind: "item.create",
+      item: {
+        kind: "message",
+        id: "test-msg-2",
+        role: "user",
+        content: [{ kind: "text", text: "What is the weather in Tokyo?" }],
+      },
+    });
+    conn.send({ kind: "response.create" });
+    // wait for tool call
+    const toolCall = await waitFor(conn, "tool.call", 15000);
+    expect(toolCall.kind).toBe("tool.call");
+    if (toolCall.kind !== "tool.call") {
+      throw new Error("Expected tool.call");
+    }
+    expect(toolCall.toolId).toBe("get_weather");
+    const args = JSON.parse(toolCall.arguments);
+    expect(args.location.toLowerCase()).toContain("tokyo");
+    // wait for first response to complete before sending tool result
+    await waitFor(conn, "response.done", 15000);
+    // send tool result
+    conn.send({
+      kind: "tool.result",
+      callId: toolCall.callId,
+      result: JSON.stringify({ temperature: 22, condition: "sunny" }),
+    });
+    // trigger follow-up response
+    conn.send({ kind: "response.create" });
+    // wait for second response to complete
+    await waitFor(conn, "response.done", 15000);
+    conn.close();
+    // verify we got text mentioning the weather
+    const textEvents = events.filter((e) => e.kind === "text.output");
+    const allText = textEvents
+      .map((e) => (e.kind === "text.output" ? e.text : ""))
+      .join(" ")
+      .toLowerCase();
+    expect(allText).toMatch(/sunny|22|tokyo/i);
+  });
+});
+/**
+ * Wait for a specific event kind.
+ */
+function waitFor(conn, kind, timeout = 10000) {
+  return new Promise((resolve, reject) => {
+    const timer = setTimeout(() => {
+      conn.off("event", handler);
+      reject(new Error(`timeout waiting for ${kind}`));
+    }, timeout);
+    const handler = (e) => {
+      if (e.kind === kind) {
+        clearTimeout(timer);
+        conn.off("event", handler);
+        resolve(e);
+      }
+    };
+    conn.on("event", handler);
+  });
+}
package/dist/__tests__/realtime.test.d.ts.map
ADDED
@@ -0,0 +1 @@
+{"version":3,"file":"realtime.test.d.ts","sourceRoot":"","sources":["../../src/__tests__/realtime.test.ts"],"names":[],"mappings":""}
package/dist/__tests__/realtime.test.js
ADDED
@@ -0,0 +1,314 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { EventEmitter } from "node:events";
+// Track mock WebSocket instances
+const wsInstances = [];
+function createMockWebSocket() {
+  const emitter = new EventEmitter();
+  emitter.send = vi.fn();
+  emitter.close = vi.fn();
+  emitter.readyState = 1; // OPEN
+  emitter.OPEN = 1;
+  return emitter;
+}
+// Mock WebSocket with a proper constructor function
+vi.mock("ws", () => {
+  const MockWebSocket = function () {
+    const instance = createMockWebSocket();
+    wsInstances.push(instance);
+    return instance;
+  };
+  MockWebSocket.OPEN = 1;
+  return {
+    default: MockWebSocket,
+    WebSocket: MockWebSocket,
+  };
+});
+// Import after mock
+import { OpenAIRealtimeModel } from "../realtime.js";
+describe("OpenAIRealtimeModel", () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+    wsInstances.length = 0;
+  });
+  it("should require API key", () => {
+    const originalEnv = process.env.OPENAI_API_KEY;
+    delete process.env.OPENAI_API_KEY;
+    expect(() => new OpenAIRealtimeModel("gpt-4o-realtime")).toThrow("OpenAI API key is required");
+    process.env.OPENAI_API_KEY = originalEnv;
+  });
+  it("should accept API key via options", () => {
+    const model = new OpenAIRealtimeModel("gpt-4o-realtime", {
+      apiKey: "test-key",
+    });
+    expect(model.modelId).toBe("gpt-4o-realtime");
+    expect(model.provider).toBe("openai");
+    expect(model.spec).toBe("1.0");
+  });
+  it("should use OPENAI_API_KEY env var", () => {
+    const originalEnv = process.env.OPENAI_API_KEY;
+    process.env.OPENAI_API_KEY = "env-key";
+    const model = new OpenAIRealtimeModel("gpt-4o-realtime");
+    expect(model.modelId).toBe("gpt-4o-realtime");
+    process.env.OPENAI_API_KEY = originalEnv;
+  });
+});
+describe("base64ByteLength", () => {
+  // Test the helper function indirectly through the module
+  // The actual function is not exported, but we can verify the audio length calculation works
+  it("should calculate correct byte length for base64 without padding", () => {
+    // (len * 3) / 4 - padding gives the decoded byte count.
+    // "AAAA": base64 length 4, no padding characters,
+    // so bytes = (4 * 3) / 4 - 0 = 3.
+    // For comparison, "AAA=" (one pad) decodes to 2 bytes
+    // and "AA==" (two pads) decodes to 1 byte.
+    const b64NoPadding = "AAAA"; // 3 bytes
+    const expectedBytes = 3;
+    const padding = 0;
+    const calculated = (b64NoPadding.length * 3) / 4 - padding;
+    expect(calculated).toBe(expectedBytes);
+  });
+  it("should calculate correct byte length for base64 with single padding", () => {
+    // "AAA=" represents 2 bytes
+    const b64SinglePad = "AAA=";
+    const expectedBytes = 2;
+    const padding = 1;
+    const calculated = (b64SinglePad.length * 3) / 4 - padding;
+    expect(calculated).toBe(expectedBytes);
+  });
+  it("should calculate correct byte length for base64 with double padding", () => {
+    // "AA==" represents 1 byte
+    const b64DoublePad = "AA==";
+    const expectedBytes = 1;
+    const padding = 2;
+    const calculated = (b64DoublePad.length * 3) / 4 - padding;
+    expect(calculated).toBe(expectedBytes);
+  });
+});
+describe("audio length calculation", () => {
+  it("should calculate correct duration for 24kHz PCM16", () => {
+    // 24kHz PCM16 = 24000 samples/sec, 2 bytes/sample = 48000 bytes/sec
+    // 48000 bytes = 1000ms
+    // 4800 bytes = 100ms
+    const bytes = 4800;
+    const expectedMs = (bytes / 2 / 24000) * 1000;
+    expect(expectedMs).toBe(100);
+  });
+  it("should accumulate audio length from multiple chunks", () => {
+    // Simulate multiple audio chunks
+    const chunk1Bytes = 2400; // 50ms
+    const chunk2Bytes = 2400; // 50ms
+    const chunk3Bytes = 2400; // 50ms
+    let totalMs = 0;
+    totalMs += (chunk1Bytes / 2 / 24000) * 1000;
+    totalMs += (chunk2Bytes / 2 / 24000) * 1000;
+    totalMs += (chunk3Bytes / 2 / 24000) * 1000;
+    expect(totalMs).toBe(150);
+  });
+});
+describe("interruption timing", () => {
+  it("should calculate audio_end_ms as min of elapsed and total length", () => {
+    const firstAudioTimestamp = 1000;
+    const currentTime = 1150; // 150ms elapsed
+    const audioLengthMs = 200; // 200ms of audio received in total
+    const elapsed = currentTime - firstAudioTimestamp;
+    const audioEndMs = Math.max(0, Math.floor(Math.min(elapsed, audioLengthMs)));
+    expect(audioEndMs).toBe(150); // elapsed is less than total
+  });
+  it("should cap audio_end_ms at total audio length", () => {
+    const firstAudioTimestamp = 1000;
+    const currentTime = 1500; // 500ms elapsed
+    const audioLengthMs = 200; // but only 200ms of audio received
+    const elapsed = currentTime - firstAudioTimestamp;
+    const audioEndMs = Math.max(0, Math.floor(Math.min(elapsed, audioLengthMs)));
+    expect(audioEndMs).toBe(200); // capped at audio length
+  });
+  it("should handle zero elapsed time", () => {
+    const firstAudioTimestamp = 1000;
+    const currentTime = 1000; // 0ms elapsed
+    const audioLengthMs = 200;
+    const elapsed = currentTime - firstAudioTimestamp;
+    const audioEndMs = Math.max(0, Math.floor(Math.min(elapsed, audioLengthMs)));
+    expect(audioEndMs).toBe(0);
+  });
+});
+describe("OpenAIRealtimeConnection (mocked WebSocket)", () => {
+  const apiKey = "test-key";
+  beforeEach(() => {
+    vi.clearAllMocks();
+    wsInstances.length = 0;
+  });
+  const getLastSocket = () => {
+    if (wsInstances.length === 0) {
+      throw new Error("No WebSocket instances were created");
+    }
+    return wsInstances[wsInstances.length - 1];
+  };
+  const emitServerEvent = (socket, event) => {
+    const payload = Buffer.from(JSON.stringify(event));
+    socket.emit("message", payload);
+  };
+  const createConnectedRealtime = async () => {
+    // Ensure env key is set so constructor does not throw.
+    const originalEnv = process.env.OPENAI_API_KEY;
+    process.env.OPENAI_API_KEY = apiKey;
+    const model = new OpenAIRealtimeModel("gpt-4o-realtime");
+    const connectPromise = model.connect();
+    const socket = getLastSocket();
+    // Simulate successful WebSocket open.
+    socket.emit("open");
+    const connection = await connectPromise;
+    // Restore env to avoid side effects for other tests.
+    process.env.OPENAI_API_KEY = originalEnv;
+    return { connection, socket };
+  };
+  it("should process a basic conversation flow and emit events", async () => {
+    const { connection, socket } = await createConnectedRealtime();
+    const statusEvents = [];
+    const realtimeEvents = [];
+    connection.on("status", (status) => {
+      statusEvents.push(status);
+    });
+    connection.on("event", (event) => {
+      realtimeEvents.push(event);
+    });
+    // Verify initial status after open.
+    expect(connection.status).toBe("connected");
+    // session.created
+    emitServerEvent(socket, {
+      type: "session.created",
+      session: { id: "sess-1", instructions: "Be helpful" },
+    });
+    // response.created
+    emitServerEvent(socket, {
+      type: "response.created",
+      response: { id: "resp-1" },
+    });
+    // small audio delta then done
+    emitServerEvent(socket, {
+      type: "response.output_audio.delta",
+      response_id: "resp-1",
+      item_id: "item-1",
+      content_index: 0,
+      delta: "AAAA",
+    });
+    emitServerEvent(socket, {
+      type: "response.output_audio.done",
+      response_id: "resp-1",
+      item_id: "item-1",
+      content_index: 0,
+    });
+    // text delta then done
+    emitServerEvent(socket, {
+      type: "response.text.delta",
+      response_id: "resp-1",
+      item_id: "item-1",
+      content_index: 0,
+      delta: "Hello",
+    });
+    emitServerEvent(socket, {
+      type: "response.text.done",
+      response_id: "resp-1",
+      item_id: "item-1",
+      content_index: 0,
+      text: "Hello world",
+    });
+    // transcripts
+    emitServerEvent(socket, {
+      type: "conversation.item.input_audio_transcription.completed",
+      item_id: "item-1",
+      content_index: 0,
+      transcript: "User said hello",
+    });
+    emitServerEvent(socket, {
+      type: "response.output_audio_transcript.done",
+      response_id: "resp-1",
+      item_id: "item-1",
+      content_index: 0,
+      transcript: "Assistant said hi",
+    });
+    // response.done with usage
+    emitServerEvent(socket, {
+      type: "response.done",
+      response: {
+        id: "resp-1",
+        status: "completed",
+        usage: {
+          input_tokens: 10,
+          output_tokens: 20,
+          total_tokens: 30,
+        },
+      },
+    });
+    // Close socket to trigger status change and reset.
+    socket.emit("close");
+    // Status events should include closed (connected is emitted before we subscribe).
+    expect(statusEvents).toContain("closed");
+    // We should have seen several realtime events in a sensible order.
+    const kinds = realtimeEvents.map((e) => e?.kind);
+    expect(kinds).toContain("session.created");
+    expect(kinds).toContain("response.created");
+    expect(kinds).toContain("audio.output.delta");
+    expect(kinds).toContain("audio.output.done");
+    expect(kinds).toContain("text.output.delta");
+    expect(kinds).toContain("text.output");
+    expect(kinds).toContain("transcript.input");
+    expect(kinds).toContain("transcript.output");
+    expect(kinds).toContain("response.done");
+  });
+  it("should cancel and truncate correctly on speech start (interrupt)", async () => {
+    const { connection, socket } = await createConnectedRealtime();
+    const connectionWithInterrupt = connection;
+    // Control time so we can reason about audio_end_ms.
+    let now = 1000;
+    const dateSpy = vi
+      .spyOn(Date, "now")
+      .mockImplementation(() => now);
+    // Mark that a response is in progress with some audio.
+    emitServerEvent(socket, {
+      type: "response.created",
+      response: { id: "resp-1" },
+    });
+    // Single audio delta chunk; compute its duration with the same formula.
+    const deltaAudio = "AAAA";
+    emitServerEvent(socket, {
+      type: "response.output_audio.delta",
+      response_id: "resp-1",
+      item_id: "item-1",
+      content_index: 0,
+      delta: deltaAudio,
+    });
+    const base64ByteLength = (b64) => {
+      const padding = b64.endsWith("==")
+        ? 2
+        : b64.endsWith("=")
+          ? 1
+          : 0;
+      return (b64.length * 3) / 4 - padding;
+    };
+    const bytes = base64ByteLength(deltaAudio);
+    const totalAudioMs = (bytes / 2 / 24000) * 1000;
+    // Advance time so that some time has elapsed since first audio.
+    now = 1150; // 150ms elapsed
+    // speech_started should trigger interrupt logic.
+    emitServerEvent(socket, {
+      type: "input_audio_buffer.speech_started",
+      audio_start_ms: 0,
+      item_id: "item-2",
+    });
+    // We expect two outbound sends: response.cancel and item.truncate.
+    const sendMock = socket.send;
+    expect(sendMock.mock.calls.length).toBe(2);
+    const cancelPayload = JSON.parse(sendMock.mock.calls[0][0]);
+    expect(cancelPayload).toEqual({ type: "response.cancel" });
+    const truncatePayload = JSON.parse(sendMock.mock.calls[1][0]);
+    expect(truncatePayload.type).toBe("conversation.item.truncate");
+    expect(truncatePayload.item_id).toBe("item-1");
+    expect(truncatePayload.content_index).toBe(0);
+    const expectedEndMs = Math.max(0, Math.floor(Math.min(150, totalAudioMs)));
+    expect(truncatePayload.audio_end_ms).toBe(expectedEndMs);
+    // Calling interrupt again should be a no-op (state was reset).
+    connectionWithInterrupt.interrupt();
+    expect(sendMock.mock.calls.length).toBe(2);
+    dateSpy.mockRestore();
+  });
+});
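The unit tests above verify the truncation arithmetic piecewise (base64 byte length, 24kHz PCM16 duration, and the elapsed/total clamp) because the package does not export the helper itself. A consolidated sketch of that bookkeeping, assuming 24kHz PCM16 mono output; the function name and signature are illustrative, not the package's internals:

```ts
// Sketch of the audio_end_ms computation the tests exercise (names illustrative).
function computeAudioEndMs(
  firstAudioTimestamp: number, // Date.now() when the first audio delta arrived
  now: number,
  base64Chunks: string[], // accumulated response.output_audio.delta payloads
): number {
  const byteLength = (b64: string) => {
    const padding = b64.endsWith("==") ? 2 : b64.endsWith("=") ? 1 : 0;
    return (b64.length * 3) / 4 - padding;
  };
  // 24kHz PCM16 mono: 2 bytes per sample, 24000 samples per second.
  const totalAudioMs = base64Chunks.reduce(
    (ms, chunk) => ms + (byteLength(chunk) / 2 / 24000) * 1000,
    0,
  );
  const elapsed = now - firstAudioTimestamp;
  // Truncate at whichever is smaller: playback time elapsed or audio received.
  return Math.max(0, Math.floor(Math.min(elapsed, totalAudioMs)));
}
```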
package/dist/convert/__tests__/event.test.d.ts.map
ADDED
@@ -0,0 +1 @@
+{"version":3,"file":"event.test.d.ts","sourceRoot":"","sources":["../../../src/convert/__tests__/event.test.ts"],"names":[],"mappings":""}