@livekit/agents-plugin-openai 0.6.1 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. package/README.md +18 -0
  2. package/dist/index.cjs +55 -0
  3. package/dist/index.cjs.map +1 -0
  4. package/dist/index.js +13 -8
  5. package/dist/index.js.map +1 -1
  6. package/dist/llm.cjs +506 -0
  7. package/dist/llm.cjs.map +1 -0
  8. package/dist/llm.d.ts.map +1 -1
  9. package/dist/llm.js +438 -423
  10. package/dist/llm.js.map +1 -1
  11. package/dist/llm.test.cjs +8 -0
  12. package/dist/llm.test.cjs.map +1 -0
  13. package/dist/llm.test.d.ts +2 -0
  14. package/dist/llm.test.d.ts.map +1 -0
  15. package/dist/llm.test.js +7 -0
  16. package/dist/llm.test.js.map +1 -0
  17. package/dist/models.cjs +17 -0
  18. package/dist/models.cjs.map +1 -0
  19. package/dist/models.js +0 -4
  20. package/dist/models.js.map +1 -1
  21. package/dist/realtime/api_proto.cjs +41 -0
  22. package/dist/realtime/api_proto.cjs.map +1 -0
  23. package/dist/realtime/api_proto.js +12 -8
  24. package/dist/realtime/api_proto.js.map +1 -1
  25. package/dist/realtime/index.cjs +25 -0
  26. package/dist/realtime/index.cjs.map +1 -0
  27. package/dist/realtime/index.js +2 -5
  28. package/dist/realtime/index.js.map +1 -1
  29. package/dist/realtime/realtime_model.cjs +878 -0
  30. package/dist/realtime/realtime_model.cjs.map +1 -0
  31. package/dist/realtime/realtime_model.js +828 -777
  32. package/dist/realtime/realtime_model.js.map +1 -1
  33. package/dist/stt.cjs +130 -0
  34. package/dist/stt.cjs.map +1 -0
  35. package/dist/stt.js +99 -102
  36. package/dist/stt.js.map +1 -1
  37. package/dist/stt.test.cjs +9 -0
  38. package/dist/stt.test.cjs.map +1 -0
  39. package/dist/stt.test.d.ts +2 -0
  40. package/dist/stt.test.d.ts.map +1 -0
  41. package/dist/stt.test.js +8 -0
  42. package/dist/stt.test.js.map +1 -0
  43. package/dist/tts.cjs +100 -0
  44. package/dist/tts.cjs.map +1 -0
  45. package/dist/tts.d.ts +1 -1
  46. package/dist/tts.d.ts.map +1 -1
  47. package/dist/tts.js +67 -65
  48. package/dist/tts.js.map +1 -1
  49. package/dist/tts.test.cjs +9 -0
  50. package/dist/tts.test.cjs.map +1 -0
  51. package/dist/tts.test.d.ts +2 -0
  52. package/dist/tts.test.d.ts.map +1 -0
  53. package/dist/tts.test.js +8 -0
  54. package/dist/tts.test.js.map +1 -0
  55. package/package.json +20 -8
  56. package/src/llm.test.ts +10 -0
  57. package/src/llm.ts +7 -2
  58. package/src/stt.test.ts +11 -0
  59. package/src/tts.test.ts +11 -0
  60. package/src/tts.ts +2 -1
@@ -0,0 +1,878 @@
1
+ "use strict";
2
+ var __create = Object.create;
3
+ var __defProp = Object.defineProperty;
4
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
+ var __getOwnPropNames = Object.getOwnPropertyNames;
6
+ var __getProtoOf = Object.getPrototypeOf;
7
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
8
+ var __export = (target, all) => {
9
+ for (var name in all)
10
+ __defProp(target, name, { get: all[name], enumerable: true });
11
+ };
12
+ var __copyProps = (to, from, except, desc) => {
13
+ if (from && typeof from === "object" || typeof from === "function") {
14
+ for (let key of __getOwnPropNames(from))
15
+ if (!__hasOwnProp.call(to, key) && key !== except)
16
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
17
+ }
18
+ return to;
19
+ };
20
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
21
+ // If the importer is in node compatibility mode or this is not an ESM
22
+ // file that has been converted to a CommonJS file using a Babel-
23
+ // compatible transform (i.e. "__esModule" has not been set), then set
24
+ // "default" to the CommonJS "module.exports" for node compatibility.
25
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
26
+ mod
27
+ ));
28
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
29
+ var realtime_model_exports = {};
30
+ __export(realtime_model_exports, {
31
+ RealtimeModel: () => RealtimeModel,
32
+ RealtimeSession: () => RealtimeSession
33
+ });
34
+ module.exports = __toCommonJS(realtime_model_exports);
35
+ var import_agents = require("@livekit/agents");
36
+ var import_rtc_node = require("@livekit/rtc-node");
37
+ var import_node_events = require("node:events");
38
+ var import_ws = require("ws");
39
+ var api_proto = __toESM(require("./api_proto.cjs"), 1);
40
+ class InputAudioBuffer {
41
+ #session;
42
+ constructor(session) {
43
+ this.#session = session;
44
+ }
45
+ append(frame) {
46
+ this.#session.queueMsg({
47
+ type: "input_audio_buffer.append",
48
+ audio: Buffer.from(frame.data.buffer).toString("base64")
49
+ });
50
+ }
51
+ clear() {
52
+ this.#session.queueMsg({
53
+ type: "input_audio_buffer.clear"
54
+ });
55
+ }
56
+ commit() {
57
+ this.#session.queueMsg({
58
+ type: "input_audio_buffer.commit"
59
+ });
60
+ }
61
+ }
62
/**
 * Proxy over the session's `conversation.item.*` client events: create,
 * truncate, and delete items in the server-side conversation.
 *
 * Improvement: the assistant/system branches of `create` duplicated the same
 * text-collection loop and the ChatAudio type-guard was inlined three times
 * as an IIFE; both are extracted into private helpers. Behavior (including
 * every queued event shape and warning message) is unchanged.
 */
class ConversationItem {
  #session;
  #logger = (0, import_agents.log)();
  constructor(session) {
    this.#session = session;
  }
  /**
   * Truncate a previously played audio item at `audioEnd` milliseconds,
   * e.g. after the user interrupts playback.
   */
  truncate(itemId, contentIndex, audioEnd) {
    this.#session.queueMsg({
      type: "conversation.item.truncate",
      item_id: itemId,
      content_index: contentIndex,
      audio_end_ms: audioEnd
    });
  }
  /** Remove an item from the server-side conversation. */
  delete(itemId) {
    this.#session.queueMsg({
      type: "conversation.item.delete",
      item_id: itemId
    });
  }
  // Type guard distinguishing ChatAudio from ChatImage/string content
  // (ChatAudio is the variant carrying a `frame`).
  #isChatAudio(c) {
    return c.frame !== void 0;
  }
  /**
   * Collect text parts from `content` using the given content-part type.
   * Audio parts are not supported for this role and only produce a warning.
   */
  #textOnlyContents(content, textType, role) {
    const contents = [];
    for (const c of content) {
      if (typeof c === "string") {
        contents.push({ type: textType, text: c });
      } else if (this.#isChatAudio(c)) {
        this.#logger.warn(`audio content in ${role} message is not supported`);
      }
    }
    return contents;
  }
  /**
   * Create a conversation item from a ChatMessage. Tool results become
   * `function_call_output` items; user/assistant/system messages become
   * `message` items. Messages without content are silently ignored, as are
   * roles the realtime API does not support (logged with a warning).
   *
   * @throws {TypeError} when a tool result's content is not a string
   */
  create(message, previousItemId) {
    if (!message.content) {
      return;
    }
    let event;
    if (message.toolCallId) {
      if (typeof message.content !== "string") {
        throw new TypeError("message.content must be a string");
      }
      event = {
        type: "conversation.item.create",
        previous_item_id: previousItemId,
        item: {
          type: "function_call_output",
          call_id: message.toolCallId,
          output: message.content
        }
      };
    } else {
      let content = message.content;
      if (!Array.isArray(content)) {
        content = [content];
      }
      if (message.role === import_agents.llm.ChatRole.USER) {
        // User messages may carry audio alongside text.
        const contents = [];
        for (const c of content) {
          if (typeof c === "string") {
            contents.push({
              type: "input_text",
              text: c
            });
          } else if (this.#isChatAudio(c)) {
            contents.push({
              type: "input_audio",
              audio: Buffer.from((0, import_agents.mergeFrames)(c.frame).data.buffer).toString("base64")
            });
          }
        }
        event = {
          type: "conversation.item.create",
          previous_item_id: previousItemId,
          item: {
            type: "message",
            role: "user",
            content: contents
          }
        };
      } else if (message.role === import_agents.llm.ChatRole.ASSISTANT) {
        event = {
          type: "conversation.item.create",
          previous_item_id: previousItemId,
          item: {
            type: "message",
            role: "assistant",
            content: this.#textOnlyContents(content, "text", "assistant")
          }
        };
      } else if (message.role === import_agents.llm.ChatRole.SYSTEM) {
        event = {
          type: "conversation.item.create",
          previous_item_id: previousItemId,
          item: {
            type: "message",
            role: "system",
            content: this.#textOnlyContents(content, "input_text", "system")
          }
        };
      } else {
        this.#logger.child({ message }).warn("chat message is not supported inside the realtime API");
        return;
      }
    }
    this.#session.queueMsg(event);
  }
}
194
/** Accessor that scopes `conversation.item.*` operations to one session. */
class Conversation {
  #session;
  constructor(session) {
    this.#session = session;
  }
  /** A fresh ConversationItem proxy bound to this session. */
  get item() {
    return new ConversationItem(this.#session);
  }
}
203
/** Thin proxy over the session's `response.*` client events. */
class Response {
  #session;
  constructor(session) {
    this.#session = session;
  }
  /** Ask the server to start generating a model response. */
  create() {
    this.#session.queueMsg({ type: "response.create" });
  }
  /** Abort the in-progress response, if any. */
  cancel() {
    this.#session.queueMsg({ type: "response.cancel" });
  }
}
219
/**
 * multimodal.RealtimeModel implementation backed by the OpenAI Realtime API
 * (also usable against Azure OpenAI deployments via `withAzure`). Holds the
 * default options applied to every session created through `session()` and
 * tracks all open sessions so `close()` can tear them down.
 */
class RealtimeModel extends import_agents.multimodal.RealtimeModel {
  // Audio wire-format constants shared with api_proto.
  sampleRate = api_proto.SAMPLE_RATE;
  numChannels = api_proto.NUM_CHANNELS;
  inFrameSize = api_proto.IN_FRAME_SIZE;
  outFrameSize = api_proto.OUT_FRAME_SIZE;
  #defaultOpts;
  #sessions = [];
  /**
   * Build a RealtimeModel pointed at an Azure OpenAI deployment.
   * Auth uses `entraToken` or `apiKey`; which one is present is validated
   * later, when the session connects.
   */
  static withAzure({
    baseURL,
    azureDeployment,
    apiVersion = "2024-10-01-preview",
    apiKey = void 0,
    entraToken = void 0,
    instructions = "",
    modalities = ["text", "audio"],
    voice = "alloy",
    inputAudioFormat = "pcm16",
    outputAudioFormat = "pcm16",
    inputAudioTranscription = { model: "whisper-1" },
    turnDetection = { type: "server_vad" },
    temperature = 0.8,
    maxResponseOutputTokens = Infinity
  }) {
    return new RealtimeModel({
      isAzure: true,
      // resolve "openai" relative to the caller-supplied Azure endpoint
      baseURL: new URL("openai", baseURL).toString(),
      model: azureDeployment,
      apiVersion,
      apiKey,
      entraToken,
      instructions,
      modalities,
      voice,
      inputAudioFormat,
      outputAudioFormat,
      inputAudioTranscription,
      turnDetection,
      temperature,
      maxResponseOutputTokens
    });
  }
  /**
   * @throws {Error} when no API key is given and OPENAI_API_KEY is unset
   *   (only enforced for the plain-OpenAI path; Azure may use entraToken)
   */
  constructor({
    modalities = ["text", "audio"],
    instructions = "",
    voice = "alloy",
    inputAudioFormat = "pcm16",
    outputAudioFormat = "pcm16",
    inputAudioTranscription = { model: "whisper-1" },
    turnDetection = { type: "server_vad" },
    temperature = 0.8,
    maxResponseOutputTokens = Infinity,
    model = "gpt-4o-realtime-preview-2024-10-01",
    apiKey = process.env.OPENAI_API_KEY || "",
    baseURL = api_proto.BASE_URL,
    // used for microsoft
    isAzure = false,
    apiVersion = void 0,
    entraToken = void 0
  }) {
    super();
    if (apiKey === "") {
      throw new Error(
        "OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environmental variable"
      );
    }
    this.#defaultOpts = {
      modalities,
      instructions,
      voice,
      inputAudioFormat,
      outputAudioFormat,
      inputAudioTranscription,
      turnDetection,
      temperature,
      maxResponseOutputTokens,
      model,
      apiKey,
      baseURL,
      isAzure,
      apiVersion,
      entraToken
    };
  }
  /** All sessions ever opened by this model (closed ones are not removed). */
  get sessions() {
    return this.#sessions;
  }
  /**
   * Open a new realtime session. Per-call options override the model
   * defaults; connection settings (model, key, URL, Azure bits) always come
   * from the defaults.
   */
  session({
    fncCtx,
    chatCtx,
    modalities = this.#defaultOpts.modalities,
    instructions = this.#defaultOpts.instructions,
    voice = this.#defaultOpts.voice,
    inputAudioFormat = this.#defaultOpts.inputAudioFormat,
    outputAudioFormat = this.#defaultOpts.outputAudioFormat,
    inputAudioTranscription = this.#defaultOpts.inputAudioTranscription,
    turnDetection = this.#defaultOpts.turnDetection,
    temperature = this.#defaultOpts.temperature,
    maxResponseOutputTokens = this.#defaultOpts.maxResponseOutputTokens
  }) {
    const opts = {
      modalities,
      instructions,
      voice,
      inputAudioFormat,
      outputAudioFormat,
      inputAudioTranscription,
      turnDetection,
      temperature,
      maxResponseOutputTokens,
      model: this.#defaultOpts.model,
      apiKey: this.#defaultOpts.apiKey,
      baseURL: this.#defaultOpts.baseURL,
      isAzure: this.#defaultOpts.isAzure,
      apiVersion: this.#defaultOpts.apiVersion,
      entraToken: this.#defaultOpts.entraToken
    };
    const newSession = new RealtimeSession(opts, {
      chatCtx: chatCtx || new import_agents.llm.ChatContext(),
      fncCtx
    });
    this.#sessions.push(newSession);
    return newSession;
  }
  /** Close every session spawned by this model; never rejects. */
  async close() {
    await Promise.allSettled(this.#sessions.map((session) => session.close()));
  }
}
346
/**
 * One WebSocket connection to the OpenAI (or Azure OpenAI) Realtime API.
 * Outbound client events are queued via `queueMsg` and drained by a send
 * loop; inbound server events are dispatched to #handle* methods which
 * re-emit them as agent-level events.
 *
 * Fixes vs. previous version:
 *  - the connection log line used `console.debug`, bypassing the session's
 *    structured logger used everywhere else; it now uses `this.#logger`.
 *  - the Azure query string hard-coded `api-version=2024-10-01-preview`,
 *    ignoring the `apiVersion` option that `withAzure` accepts; the option
 *    is now honored with the old literal as fallback.
 */
class RealtimeSession extends import_agents.multimodal.RealtimeSession {
  #chatCtx = void 0;
  #fncCtx = void 0;
  #opts;
  // Responses created by the server, keyed by response id, until done.
  #pendingResponses = {};
  #sessionId = "not-connected";
  #ws = null;
  // Unix seconds at which the server will expire this session (from session.created).
  #expiresAt = null;
  #logger = (0, import_agents.log)();
  // Promise for the whole connection lifetime; resolves when the socket closes.
  #task;
  #closing = true;
  #sendQueue = new import_agents.Queue();
  constructor(opts, { fncCtx, chatCtx }) {
    super();
    this.#opts = opts;
    this.#chatCtx = chatCtx;
    this.#fncCtx = fncCtx;
    // Start connecting immediately; the initial session.update is queued and
    // flushed once the socket opens.
    this.#task = this.#start();
    this.sessionUpdate({
      modalities: this.#opts.modalities,
      instructions: this.#opts.instructions,
      voice: this.#opts.voice,
      inputAudioFormat: this.#opts.inputAudioFormat,
      outputAudioFormat: this.#opts.outputAudioFormat,
      inputAudioTranscription: this.#opts.inputAudioTranscription,
      turnDetection: this.#opts.turnDetection,
      temperature: this.#opts.temperature,
      maxResponseOutputTokens: this.#opts.maxResponseOutputTokens,
      toolChoice: "auto"
    });
  }
  get chatCtx() {
    return this.#chatCtx;
  }
  get fncCtx() {
    return this.#fncCtx;
  }
  set fncCtx(ctx) {
    this.#fncCtx = ctx;
  }
  get conversation() {
    return new Conversation(this);
  }
  get inputAudioBuffer() {
    return new InputAudioBuffer(this);
  }
  get response() {
    return new Response(this);
  }
  /**
   * Session expiration time in epoch milliseconds.
   * @throws {Error} before session.created has been received
   */
  get expiration() {
    if (!this.#expiresAt) {
      throw new Error("session not started");
    }
    return this.#expiresAt * 1e3;
  }
  /** Enqueue a client event; the send loop flushes it to the socket. */
  queueMsg(command) {
    this.#sendQueue.put(command);
  }
  /// Truncates the data field of the event to the specified maxLength to avoid overwhelming logs
  /// with large amounts of base64 audio data.
  #loggableEvent(event, maxLength = 30) {
    const untypedEvent = {};
    for (const [key, value] of Object.entries(event)) {
      if (value !== void 0) {
        untypedEvent[key] = value;
      }
    }
    if (untypedEvent.audio && typeof untypedEvent.audio === "string") {
      const truncatedData = untypedEvent.audio.slice(0, maxLength) + (untypedEvent.audio.length > maxLength ? "\u2026" : "");
      return { ...untypedEvent, audio: truncatedData };
    }
    if (untypedEvent.delta && typeof untypedEvent.delta === "string" && event.type === "response.audio.delta") {
      const truncatedDelta = untypedEvent.delta.slice(0, maxLength) + (untypedEvent.delta.length > maxLength ? "\u2026" : "");
      return { ...untypedEvent, delta: truncatedDelta };
    }
    return untypedEvent;
  }
  /**
   * Merge the given options into #opts (connection settings are preserved)
   * and queue a session.update event, including the current fncCtx as tools.
   */
  sessionUpdate({
    modalities = this.#opts.modalities,
    instructions = this.#opts.instructions,
    voice = this.#opts.voice,
    inputAudioFormat = this.#opts.inputAudioFormat,
    outputAudioFormat = this.#opts.outputAudioFormat,
    inputAudioTranscription = this.#opts.inputAudioTranscription,
    turnDetection = this.#opts.turnDetection,
    temperature = this.#opts.temperature,
    maxResponseOutputTokens = this.#opts.maxResponseOutputTokens,
    toolChoice = "auto"
  }) {
    this.#opts = {
      modalities,
      instructions,
      voice,
      inputAudioFormat,
      outputAudioFormat,
      inputAudioTranscription,
      turnDetection,
      temperature,
      maxResponseOutputTokens,
      model: this.#opts.model,
      apiKey: this.#opts.apiKey,
      baseURL: this.#opts.baseURL,
      isAzure: this.#opts.isAzure,
      apiVersion: this.#opts.apiVersion,
      entraToken: this.#opts.entraToken
    };
    const tools = this.#fncCtx ? Object.entries(this.#fncCtx).map(([name, func]) => ({
      type: "function",
      name,
      description: func.description,
      parameters: (
        // don't format parameters if they are raw openai params
        func.parameters.type === "object" ? func.parameters : import_agents.llm.oaiParams(func.parameters)
      )
    })) : [];
    const sessionUpdateEvent = {
      type: "session.update",
      session: {
        modalities: this.#opts.modalities,
        instructions: this.#opts.instructions,
        voice: this.#opts.voice,
        input_audio_format: this.#opts.inputAudioFormat,
        output_audio_format: this.#opts.outputAudioFormat,
        input_audio_transcription: this.#opts.inputAudioTranscription,
        turn_detection: this.#opts.turnDetection,
        temperature: this.#opts.temperature,
        max_response_output_tokens: this.#opts.maxResponseOutputTokens === Infinity ? "inf" : this.#opts.maxResponseOutputTokens,
        tools,
        tool_choice: toolChoice
      }
    };
    // Azure rejects "inf"; omit the field instead.
    if (this.#opts.isAzure && this.#opts.maxResponseOutputTokens === Infinity) {
      sessionUpdateEvent.session.max_response_output_tokens = void 0;
    }
    this.queueMsg(sessionUpdateEvent);
  }
  /**
   * Connect the WebSocket, wire up the receive dispatcher and send loop.
   * The returned promise rejects on auth/connect errors or an unexpected
   * close, and resolves when the socket closes.
   * NOTE(review): the async promise executor and the reject-after-resolve
   * in onclose are pre-existing; rejections after settlement are no-ops.
   */
  #start() {
    return new Promise(async (resolve, reject) => {
      const headers = {
        "User-Agent": "LiveKit-Agents-JS"
      };
      if (this.#opts.isAzure) {
        // Azure auth: Entra bearer token takes precedence over api-key.
        if (this.#opts.entraToken) {
          headers.Authorization = `Bearer ${this.#opts.entraToken}`;
        } else if (this.#opts.apiKey) {
          headers["api-key"] = this.#opts.apiKey;
        } else {
          reject(new Error("Microsoft API key or entraToken is required"));
          return;
        }
      } else {
        headers.Authorization = `Bearer ${this.#opts.apiKey}`;
        headers["OpenAI-Beta"] = "realtime=v1";
      }
      const url = new URL([this.#opts.baseURL, "realtime"].join("/"));
      if (url.protocol === "https:") {
        url.protocol = "wss:";
      }
      const queryParams = {};
      if (this.#opts.isAzure) {
        // honor the configured apiVersion (withAzure default: 2024-10-01-preview)
        queryParams["api-version"] = this.#opts.apiVersion ?? "2024-10-01-preview";
        queryParams["deployment"] = this.#opts.model;
      } else {
        queryParams["model"] = this.#opts.model;
      }
      for (const [key, value] of Object.entries(queryParams)) {
        url.searchParams.set(key, value);
      }
      // use the session logger (not console) so this respects log levels
      this.#logger.debug(`Connecting to OpenAI Realtime API at ${url.toString()}`);
      this.#ws = new import_ws.WebSocket(url.toString(), {
        headers
      });
      this.#ws.onerror = (error) => {
        reject(new Error("OpenAI Realtime WebSocket error: " + error.message));
      };
      await (0, import_node_events.once)(this.#ws, "open");
      this.#closing = false;
      this.#ws.onmessage = (message) => {
        const event = JSON.parse(message.data);
        this.#logger.debug(`<- ${JSON.stringify(this.#loggableEvent(event))}`);
        switch (event.type) {
          case "error":
            this.#handleError(event);
            break;
          case "session.created":
            this.#handleSessionCreated(event);
            break;
          case "session.updated":
            this.#handleSessionUpdated(event);
            break;
          case "conversation.created":
            this.#handleConversationCreated(event);
            break;
          case "input_audio_buffer.committed":
            this.#handleInputAudioBufferCommitted(event);
            break;
          case "input_audio_buffer.cleared":
            this.#handleInputAudioBufferCleared(event);
            break;
          case "input_audio_buffer.speech_started":
            this.#handleInputAudioBufferSpeechStarted(event);
            break;
          case "input_audio_buffer.speech_stopped":
            this.#handleInputAudioBufferSpeechStopped(event);
            break;
          case "conversation.item.created":
            this.#handleConversationItemCreated(event);
            break;
          case "conversation.item.input_audio_transcription.completed":
            this.#handleConversationItemInputAudioTranscriptionCompleted(event);
            break;
          case "conversation.item.input_audio_transcription.failed":
            this.#handleConversationItemInputAudioTranscriptionFailed(event);
            break;
          case "conversation.item.truncated":
            this.#handleConversationItemTruncated(event);
            break;
          case "conversation.item.deleted":
            this.#handleConversationItemDeleted(event);
            break;
          case "response.created":
            this.#handleResponseCreated(event);
            break;
          case "response.done":
            this.#handleResponseDone(event);
            break;
          case "response.output_item.added":
            this.#handleResponseOutputItemAdded(event);
            break;
          case "response.output_item.done":
            this.#handleResponseOutputItemDone(event);
            break;
          case "response.content_part.added":
            this.#handleResponseContentPartAdded(event);
            break;
          case "response.content_part.done":
            this.#handleResponseContentPartDone(event);
            break;
          case "response.text.delta":
            this.#handleResponseTextDelta(event);
            break;
          case "response.text.done":
            this.#handleResponseTextDone(event);
            break;
          case "response.audio_transcript.delta":
            this.#handleResponseAudioTranscriptDelta(event);
            break;
          case "response.audio_transcript.done":
            this.#handleResponseAudioTranscriptDone(event);
            break;
          case "response.audio.delta":
            this.#handleResponseAudioDelta(event);
            break;
          case "response.audio.done":
            this.#handleResponseAudioDone(event);
            break;
          case "response.function_call_arguments.delta":
            this.#handleResponseFunctionCallArgumentsDelta(event);
            break;
          case "response.function_call_arguments.done":
            this.#handleResponseFunctionCallArgumentsDone(event);
            break;
          case "rate_limits.updated":
            this.#handleRateLimitsUpdated(event);
            break;
        }
      };
      // Drain the send queue until the socket closes; audio appends are not
      // logged to keep log volume manageable.
      const sendTask = async () => {
        while (this.#ws && !this.#closing && this.#ws.readyState === import_ws.WebSocket.OPEN) {
          try {
            const event = await this.#sendQueue.get();
            if (event.type !== "input_audio_buffer.append") {
              this.#logger.debug(`-> ${JSON.stringify(this.#loggableEvent(event))}`);
            }
            this.#ws.send(JSON.stringify(event));
          } catch (error) {
            this.#logger.error("Error sending event:", error);
          }
        }
      };
      sendTask();
      this.#ws.onclose = () => {
        // A close at/after the server-announced expiry is expected.
        if (this.#expiresAt && Date.now() >= this.#expiresAt * 1e3) {
          this.#closing = true;
        }
        if (!this.#closing) {
          reject(new Error("OpenAI Realtime connection closed unexpectedly"));
        }
        this.#ws = null;
        resolve();
      };
    });
  }
  /** Close the socket and wait for the connection task to finish. */
  async close() {
    if (!this.#ws) return;
    this.#closing = true;
    this.#ws.close();
    await this.#task;
  }
  // Resolve a (response_id, output_index, content_index) pointer into the
  // tracked content object.
  #getContent(ptr) {
    const response = this.#pendingResponses[ptr.response_id];
    const output = response.output[ptr.output_index];
    const content = output.content[ptr.content_index];
    return content;
  }
  #handleError(event) {
    this.#logger.error(`OpenAI Realtime error ${JSON.stringify(event.error)}`);
  }
  #handleSessionCreated(event) {
    this.#sessionId = event.session.id;
    this.#expiresAt = event.session.expires_at;
    this.#logger = this.#logger.child({ sessionId: this.#sessionId });
  }
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  #handleSessionUpdated(event) {
  }
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  #handleConversationCreated(event) {
  }
  #handleInputAudioBufferCommitted(event) {
    this.emit("input_speech_committed", {
      itemId: event.item_id
    });
  }
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  #handleInputAudioBufferCleared(event) {
  }
  #handleInputAudioBufferSpeechStarted(event) {
    this.emit("input_speech_started", {
      itemId: event.item_id
    });
  }
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  #handleInputAudioBufferSpeechStopped(event) {
    this.emit("input_speech_stopped");
  }
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  #handleConversationItemCreated(event) {
  }
  #handleConversationItemInputAudioTranscriptionCompleted(event) {
    const transcript = event.transcript;
    this.emit("input_speech_transcription_completed", {
      itemId: event.item_id,
      transcript
    });
  }
  #handleConversationItemInputAudioTranscriptionFailed(event) {
    const error = event.error;
    this.#logger.error(`OpenAI Realtime failed to transcribe input audio: ${error.message}`);
    this.emit("input_speech_transcription_failed", {
      itemId: event.item_id,
      message: error.message
    });
  }
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  #handleConversationItemTruncated(event) {
  }
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  #handleConversationItemDeleted(event) {
  }
  #handleResponseCreated(responseCreated) {
    const response = responseCreated.response;
    const doneFut = new import_agents.Future();
    const newResponse = {
      id: response.id,
      status: response.status,
      statusDetails: response.status_details,
      usage: null,
      output: [],
      doneFut
    };
    this.#pendingResponses[newResponse.id] = newResponse;
    this.emit("response_created", newResponse);
  }
  #handleResponseDone(event) {
    const responseData = event.response;
    const responseId = responseData.id;
    const response = this.#pendingResponses[responseId];
    response.status = responseData.status;
    response.statusDetails = responseData.status_details;
    response.usage = responseData.usage ?? null;
    this.#pendingResponses[responseId] = response;
    response.doneFut.resolve();
    this.emit("response_done", response);
  }
  #handleResponseOutputItemAdded(event) {
    const responseId = event.response_id;
    const response = this.#pendingResponses[responseId];
    const itemData = event.item;
    if (itemData.type !== "message" && itemData.type !== "function_call") {
      throw new Error(`Unexpected item type: ${itemData.type}`);
    }
    let role;
    if (itemData.type === "function_call") {
      // function calls are always made by the assistant
      role = "assistant";
    } else {
      role = itemData.role;
    }
    const newOutput = {
      responseId,
      itemId: itemData.id,
      outputIndex: event.output_index,
      type: itemData.type,
      role,
      content: [],
      doneFut: new import_agents.Future()
    };
    response == null ? void 0 : response.output.push(newOutput);
    this.emit("response_output_added", newOutput);
  }
  #handleResponseOutputItemDone(event) {
    const responseId = event.response_id;
    const response = this.#pendingResponses[responseId];
    const outputIndex = event.output_index;
    const output = response.output[outputIndex];
    if ((output == null ? void 0 : output.type) === "function_call") {
      if (!this.#fncCtx) {
        this.#logger.error("function call received but no fncCtx is available");
        return;
      }
      const item = event.item;
      if (item.type !== "function_call") {
        throw new Error("Expected function_call item");
      }
      const func = this.#fncCtx[item.name];
      if (!func) {
        this.#logger.error(`no function with name ${item.name} in fncCtx`);
        return;
      }
      this.emit("function_call_started", {
        callId: item.call_id
      });
      const parsedArgs = JSON.parse(item.arguments);
      this.#logger.debug(
        `[Function Call ${item.call_id}] Executing ${item.name} with arguments ${parsedArgs}`
      );
      // Execute the tool, then feed the result back and request a new response.
      func.execute(parsedArgs).then(
        (content) => {
          this.#logger.debug(`[Function Call ${item.call_id}] ${item.name} returned ${content}`);
          this.emit("function_call_completed", {
            callId: item.call_id
          });
          this.conversation.item.create(
            import_agents.llm.ChatMessage.createToolFromFunctionResult({
              name: item.name,
              toolCallId: item.call_id,
              result: content
            }),
            output.itemId
          );
          this.response.create();
        },
        (error) => {
          this.#logger.error(`[Function Call ${item.call_id}] ${item.name} failed with ${error}`);
          this.emit("function_call_failed", {
            callId: item.call_id
          });
        }
      );
    }
    output == null ? void 0 : output.doneFut.resolve();
    this.emit("response_output_done", output);
  }
  #handleResponseContentPartAdded(event) {
    const responseId = event.response_id;
    const response = this.#pendingResponses[responseId];
    const outputIndex = event.output_index;
    const output = response.output[outputIndex];
    const textStream = new import_agents.AsyncIterableQueue();
    const audioStream = new import_agents.AsyncIterableQueue();
    const newContent = {
      responseId,
      itemId: event.item_id,
      outputIndex,
      contentIndex: event.content_index,
      text: "",
      audio: [],
      textStream,
      audioStream,
      toolCalls: []
    };
    output == null ? void 0 : output.content.push(newContent);
    this.emit("response_content_added", newContent);
  }
  #handleResponseContentPartDone(event) {
    const content = this.#getContent(event);
    this.emit("response_content_done", content);
  }
  #handleResponseTextDelta(event) {
    this.emit("response_text_delta", event);
  }
  #handleResponseTextDone(event) {
    this.emit("response_text_done", event);
  }
  #handleResponseAudioTranscriptDelta(event) {
    const content = this.#getContent(event);
    const transcript = event.delta;
    content.text += transcript;
    content.textStream.put(transcript);
  }
  #handleResponseAudioTranscriptDone(event) {
    const content = this.#getContent(event);
    content.textStream.close();
  }
  #handleResponseAudioDelta(event) {
    const content = this.#getContent(event);
    const data = Buffer.from(event.delta, "base64");
    // 16-bit PCM: sample count is byte length / 2
    const audio = new import_rtc_node.AudioFrame(
      new Int16Array(data.buffer),
      api_proto.SAMPLE_RATE,
      api_proto.NUM_CHANNELS,
      data.length / 2
    );
    content.audio.push(audio);
    content.audioStream.put(audio);
  }
  #handleResponseAudioDone(event) {
    const content = this.#getContent(event);
    content.audioStream.close();
  }
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  #handleResponseFunctionCallArgumentsDelta(event) {
  }
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  #handleResponseFunctionCallArgumentsDone(event) {
  }
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  #handleRateLimitsUpdated(event) {
  }
}
873
// Annotate the CommonJS export names for ESM import in node:
// (dead code by design: `0 && (...)` never executes, but Node's CJS
// named-exports detection parses it statically)
0 && (module.exports = {
  RealtimeModel,
  RealtimeSession
});
//# sourceMappingURL=realtime_model.cjs.map