@kernl-sdk/openai 0.1.0

@@ -0,0 +1,421 @@
+ import { describe, it, expect, vi, beforeEach } from "vitest";
+ import { EventEmitter } from "node:events";
+
+ import type {
+   RealtimeServerEvent,
+   TransportStatus,
+ } from "@kernl-sdk/protocol";
+ import type { OpenAIServerEvent } from "../convert/types";
+
+ // Track mock WebSocket instances
+ const wsInstances: TestWebSocket[] = [];
+
+ interface TestWebSocket extends EventEmitter {
+   send: ReturnType<typeof vi.fn>;
+   close: ReturnType<typeof vi.fn>;
+   readyState: number;
+   OPEN: number;
+ }
+
+ function createMockWebSocket(): TestWebSocket {
+   const emitter = new EventEmitter() as TestWebSocket;
+   emitter.send = vi.fn();
+   emitter.close = vi.fn();
+   emitter.readyState = 1; // OPEN
+   emitter.OPEN = 1;
+   return emitter;
+ }
+
+ // Mock WebSocket with a proper constructor function
+ vi.mock("ws", () => {
+   const MockWebSocket = function (this: TestWebSocket) {
+     const instance = createMockWebSocket();
+     wsInstances.push(instance);
+     return instance;
+   } as unknown as { new (): TestWebSocket; OPEN: number };
+   MockWebSocket.OPEN = 1;
+
+   return {
+     default: MockWebSocket,
+     WebSocket: MockWebSocket,
+   };
+ });
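+
+ // NOTE: Vitest hoists vi.mock() factories above module imports, so the
+ // factory above replaces "ws" before "../realtime" is loaded below.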
+
+ // Import after mock
+ import { OpenAIRealtimeModel } from "../realtime";
+
+ describe("OpenAIRealtimeModel", () => {
+   beforeEach(() => {
+     vi.clearAllMocks();
+     wsInstances.length = 0;
+   });
+
+   it("should require API key", () => {
+     const originalEnv = process.env.OPENAI_API_KEY;
+     delete process.env.OPENAI_API_KEY;
+
+     expect(() => new OpenAIRealtimeModel("gpt-4o-realtime")).toThrow(
+       "OpenAI API key is required",
+     );
+
+     // Restore carefully: assigning undefined would store the string
+     // "undefined" in process.env.
+     if (originalEnv === undefined) {
+       delete process.env.OPENAI_API_KEY;
+     } else {
+       process.env.OPENAI_API_KEY = originalEnv;
+     }
+   });
+
+   it("should accept API key via options", () => {
+     const model = new OpenAIRealtimeModel("gpt-4o-realtime", {
+       apiKey: "test-key",
+     });
+
+     expect(model.modelId).toBe("gpt-4o-realtime");
+     expect(model.provider).toBe("openai");
+     expect(model.spec).toBe("1.0");
+   });
+
+   it("should use OPENAI_API_KEY env var", () => {
+     const originalEnv = process.env.OPENAI_API_KEY;
+     process.env.OPENAI_API_KEY = "env-key";
+
+     const model = new OpenAIRealtimeModel("gpt-4o-realtime");
+     expect(model.modelId).toBe("gpt-4o-realtime");
+
+     if (originalEnv === undefined) {
+       delete process.env.OPENAI_API_KEY;
+     } else {
+       process.env.OPENAI_API_KEY = originalEnv;
+     }
+   });
+ });
+
+ describe("base64ByteLength", () => {
+   // The helper itself is not exported from the module, so these tests
+   // verify the base64 length/padding arithmetic it relies on for audio
+   // length accounting.
+
+   it("should calculate correct byte length for base64 without padding", () => {
+     // "AAAA" is 4 base64 chars with no padding, which decode to 3 bytes:
+     // bytes = (4 * 3) / 4 - 0 = 3
+     const b64NoPadding = "AAAA";
+     const expectedBytes = 3;
+     const padding = 0;
+     const calculated = (b64NoPadding.length * 3) / 4 - padding;
+     expect(calculated).toBe(expectedBytes);
+   });
+
+   it("should calculate correct byte length for base64 with single padding", () => {
+     // "AAA=" represents 2 bytes
+     const b64SinglePad = "AAA=";
+     const expectedBytes = 2;
+     const padding = 1;
+     const calculated = (b64SinglePad.length * 3) / 4 - padding;
+     expect(calculated).toBe(expectedBytes);
+   });
+
+   it("should calculate correct byte length for base64 with double padding", () => {
+     // "AA==" represents 1 byte
+     const b64DoublePad = "AA==";
+     const expectedBytes = 1;
+     const padding = 2;
+     const calculated = (b64DoublePad.length * 3) / 4 - padding;
+     expect(calculated).toBe(expectedBytes);
+   });
+ });
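+
+ // For reference, a sketch of the formula these tests exercise; it mirrors
+ // the local base64ByteLength helper defined in the interrupt test below.
+ // Illustrative only (assumes standard "="/"==" padding), not part of the
+ // package's public API.
+ function sketchBase64ByteLength(b64: string): number {
+   const padding = b64.endsWith("==") ? 2 : b64.endsWith("=") ? 1 : 0;
+   return (b64.length * 3) / 4 - padding;
+ }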
+
+ describe("audio length calculation", () => {
+   it("should calculate correct duration for 24kHz PCM16", () => {
+     // 24kHz PCM16 = 24000 samples/sec, 2 bytes/sample = 48000 bytes/sec
+     // 48000 bytes = 1000ms
+     // 4800 bytes = 100ms
+     const bytes = 4800;
+     const expectedMs = (bytes / 2 / 24000) * 1000;
+     expect(expectedMs).toBe(100);
+   });
+
+   it("should accumulate audio length from multiple chunks", () => {
+     // Simulate multiple audio chunks
+     const chunk1Bytes = 2400; // 50ms
+     const chunk2Bytes = 2400; // 50ms
+     const chunk3Bytes = 2400; // 50ms
+
+     let totalMs = 0;
+     totalMs += (chunk1Bytes / 2 / 24000) * 1000;
+     totalMs += (chunk2Bytes / 2 / 24000) * 1000;
+     totalMs += (chunk3Bytes / 2 / 24000) * 1000;
+
+     expect(totalMs).toBe(150);
+   });
+ });
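+
+ // Illustrative only: the same conversion factored into a helper. Assumes
+ // 16-bit mono PCM (2 bytes per sample) at the given sample rate; this is
+ // a sketch, not an export of the package under test.
+ function sketchPcm16DurationMs(bytes: number, sampleRate = 24000): number {
+   return (bytes / 2 / sampleRate) * 1000;
+ }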
+
+ describe("interruption timing", () => {
+   it("should calculate audio_end_ms as min of elapsed and total length", () => {
+     const firstAudioTimestamp = 1000;
+     const currentTime = 1150; // 150ms elapsed
+     const audioLengthMs = 200; // 200ms of audio received in total
+
+     const elapsed = currentTime - firstAudioTimestamp;
+     const audioEndMs = Math.max(0, Math.floor(Math.min(elapsed, audioLengthMs)));
+
+     expect(audioEndMs).toBe(150); // elapsed is less than total
+   });
+
+   it("should cap audio_end_ms at total audio length", () => {
+     const firstAudioTimestamp = 1000;
+     const currentTime = 1500; // 500ms elapsed
+     const audioLengthMs = 200; // but only 200ms of audio received
+
+     const elapsed = currentTime - firstAudioTimestamp;
+     const audioEndMs = Math.max(0, Math.floor(Math.min(elapsed, audioLengthMs)));
+
+     expect(audioEndMs).toBe(200); // capped at audio length
+   });
+
+   it("should handle zero elapsed time", () => {
+     const firstAudioTimestamp = 1000;
+     const currentTime = 1000; // 0ms elapsed
+     const audioLengthMs = 200;
+
+     const elapsed = currentTime - firstAudioTimestamp;
+     const audioEndMs = Math.max(0, Math.floor(Math.min(elapsed, audioLengthMs)));
+
+     expect(audioEndMs).toBe(0);
+   });
+ });
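+
+ // Illustrative only: the clamp spelled out inline above. On interruption,
+ // audio_end_ms is the elapsed playback time, capped at the audio actually
+ // received, floored, and never negative.
+ function sketchClampAudioEndMs(
+   elapsedMs: number,
+   audioLengthMs: number,
+ ): number {
+   return Math.max(0, Math.floor(Math.min(elapsedMs, audioLengthMs)));
+ }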
+
+ describe("OpenAIRealtimeConnection (mocked WebSocket)", () => {
+   const apiKey = "test-key";
+
+   beforeEach(() => {
+     vi.clearAllMocks();
+     wsInstances.length = 0;
+   });
+
+   const getLastSocket = (): TestWebSocket => {
+     if (wsInstances.length === 0) {
+       throw new Error("No WebSocket instances were created");
+     }
+     return wsInstances[wsInstances.length - 1];
+   };
+
+   const emitServerEvent = (
+     socket: TestWebSocket,
+     event: OpenAIServerEvent,
+   ): void => {
+     const payload = Buffer.from(JSON.stringify(event));
+     socket.emit("message", payload);
+   };
+
+   const createConnectedRealtime = async () => {
+     // Ensure env key is set so constructor does not throw.
+     const originalEnv = process.env.OPENAI_API_KEY;
+     process.env.OPENAI_API_KEY = apiKey;
+
+     const model = new OpenAIRealtimeModel("gpt-4o-realtime");
+     const connectPromise = model.connect();
+
+     const socket = getLastSocket();
+     // Simulate successful WebSocket open.
+     socket.emit("open");
+
+     const connection = await connectPromise;
+
+     // Restore env to avoid side effects for other tests.
+     if (originalEnv === undefined) {
+       delete process.env.OPENAI_API_KEY;
+     } else {
+       process.env.OPENAI_API_KEY = originalEnv;
+     }
+
+     return { connection, socket };
+   };
+
+   it("should process a basic conversation flow and emit events", async () => {
+     const { connection, socket } = await createConnectedRealtime();
+
+     const statusEvents: TransportStatus[] = [];
+     const realtimeEvents: RealtimeServerEvent[] = [];
+
+     (connection as unknown as EventEmitter).on(
+       "status",
+       (status: TransportStatus) => {
+         statusEvents.push(status);
+       },
+     );
+
+     (connection as unknown as EventEmitter).on(
+       "event",
+       (event: RealtimeServerEvent) => {
+         realtimeEvents.push(event);
+       },
+     );
+
+     // Verify initial status after open.
+     expect(
+       (connection as unknown as { status: TransportStatus }).status,
+     ).toBe("connected");
+
+     // session.created
+     emitServerEvent(socket, {
+       type: "session.created",
+       session: { id: "sess-1", instructions: "Be helpful" },
+     });
+
+     // response.created
+     emitServerEvent(socket, {
+       type: "response.created",
+       response: { id: "resp-1" },
+     });
+
+     // small audio delta then done
+     emitServerEvent(socket, {
+       type: "response.output_audio.delta",
+       response_id: "resp-1",
+       item_id: "item-1",
+       content_index: 0,
+       delta: "AAAA",
+     });
+
+     emitServerEvent(socket, {
+       type: "response.output_audio.done",
+       response_id: "resp-1",
+       item_id: "item-1",
+       content_index: 0,
+     });
+
+     // text delta then done
+     emitServerEvent(socket, {
+       type: "response.text.delta",
+       response_id: "resp-1",
+       item_id: "item-1",
+       content_index: 0,
+       delta: "Hello",
+     });
+
+     emitServerEvent(socket, {
+       type: "response.text.done",
+       response_id: "resp-1",
+       item_id: "item-1",
+       content_index: 0,
+       text: "Hello world",
+     });
+
+     // transcripts
+     emitServerEvent(socket, {
+       type: "conversation.item.input_audio_transcription.completed",
+       item_id: "item-1",
+       content_index: 0,
+       transcript: "User said hello",
+     });
+
+     emitServerEvent(socket, {
+       type: "response.output_audio_transcript.done",
+       response_id: "resp-1",
+       item_id: "item-1",
+       content_index: 0,
+       transcript: "Assistant said hi",
+     });
+
+     // response.done with usage
+     emitServerEvent(socket, {
+       type: "response.done",
+       response: {
+         id: "resp-1",
+         status: "completed",
+         usage: {
+           input_tokens: 10,
+           output_tokens: 20,
+           total_tokens: 30,
+         },
+       },
+     });
+
+     // Close socket to trigger status change and reset.
+     socket.emit("close");
+
+     // Status events should include closed (connected is emitted before we subscribe).
+     expect(statusEvents).toContain("closed");
+
+     // We should have seen all of the expected realtime event kinds.
+     const kinds = realtimeEvents.map((e) => e?.kind);
+     expect(kinds).toContain("session.created");
+     expect(kinds).toContain("response.created");
+     expect(kinds).toContain("audio.output.delta");
+     expect(kinds).toContain("audio.output.done");
+     expect(kinds).toContain("text.output.delta");
+     expect(kinds).toContain("text.output");
+     expect(kinds).toContain("transcript.input");
+     expect(kinds).toContain("transcript.output");
+     expect(kinds).toContain("response.done");
+   });
+
+   it("should cancel and truncate correctly on speech start (interrupt)", async () => {
+     const { connection, socket } = await createConnectedRealtime();
+
+     const connectionWithInterrupt = connection as unknown as {
+       interrupt: () => void;
+     };
+
+     // Control time so we can reason about audio_end_ms.
+     let now = 1000;
+     const dateSpy = vi
+       .spyOn(Date, "now")
+       .mockImplementation(() => now);
+
+     // Mark that a response is in progress with some audio.
+     emitServerEvent(socket, {
+       type: "response.created",
+       response: { id: "resp-1" },
+     });
+
+     // Single audio delta chunk; compute its duration with the same formula.
+     const deltaAudio = "AAAA";
+     emitServerEvent(socket, {
+       type: "response.output_audio.delta",
+       response_id: "resp-1",
+       item_id: "item-1",
+       content_index: 0,
+       delta: deltaAudio,
+     });
+
+     const base64ByteLength = (b64: string): number => {
+       const padding = b64.endsWith("==")
+         ? 2
+         : b64.endsWith("=")
+           ? 1
+           : 0;
+       return (b64.length * 3) / 4 - padding;
+     };
+
+     const bytes = base64ByteLength(deltaAudio);
+     const totalAudioMs = (bytes / 2 / 24000) * 1000;
+
+     // Advance time so that some time has elapsed since first audio.
+     now = 1150; // 150ms elapsed
+
+     // speech_started should trigger interrupt logic.
+     emitServerEvent(socket, {
+       type: "input_audio_buffer.speech_started",
+       audio_start_ms: 0,
+       item_id: "item-2",
+     });
+
+     // We expect two outbound sends: response.cancel and item.truncate.
+     const sendMock = socket.send as unknown as {
+       mock: { calls: [string][] };
+     };
+
+     expect(sendMock.mock.calls.length).toBe(2);
+
+     const cancelPayload = JSON.parse(sendMock.mock.calls[0][0]);
+     expect(cancelPayload).toEqual({ type: "response.cancel" });
+
+     const truncatePayload = JSON.parse(sendMock.mock.calls[1][0]);
+     expect(truncatePayload.type).toBe("conversation.item.truncate");
+     expect(truncatePayload.item_id).toBe("item-1");
+     expect(truncatePayload.content_index).toBe(0);
+
+     const expectedEndMs = Math.max(
+       0,
+       Math.floor(Math.min(150, totalAudioMs)),
+     );
+     expect(truncatePayload.audio_end_ms).toBe(expectedEndMs);
+
+     // Calling interrupt again should be a no-op (state was reset).
+     connectionWithInterrupt.interrupt();
+     expect(sendMock.mock.calls.length).toBe(2);
+
+     dateSpy.mockRestore();
+   });
+ });