@livekit/agents-plugin-openai 1.0.51 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/dist/index.cjs +1 -1
  2. package/dist/index.js +1 -1
  3. package/dist/llm.cjs +8 -0
  4. package/dist/llm.cjs.map +1 -1
  5. package/dist/llm.d.cts +1 -0
  6. package/dist/llm.d.ts +1 -0
  7. package/dist/llm.d.ts.map +1 -1
  8. package/dist/llm.js +8 -0
  9. package/dist/llm.js.map +1 -1
  10. package/dist/realtime/api_proto.cjs.map +1 -1
  11. package/dist/realtime/api_proto.d.cts +7 -3
  12. package/dist/realtime/api_proto.d.ts +7 -3
  13. package/dist/realtime/api_proto.d.ts.map +1 -1
  14. package/dist/realtime/api_proto.js.map +1 -1
  15. package/dist/realtime/realtime_model.cjs +46 -22
  16. package/dist/realtime/realtime_model.cjs.map +1 -1
  17. package/dist/realtime/realtime_model.d.cts +2 -1
  18. package/dist/realtime/realtime_model.d.ts +2 -1
  19. package/dist/realtime/realtime_model.d.ts.map +1 -1
  20. package/dist/realtime/realtime_model.js +46 -22
  21. package/dist/realtime/realtime_model.js.map +1 -1
  22. package/dist/realtime/realtime_model.test.cjs +104 -14
  23. package/dist/realtime/realtime_model.test.cjs.map +1 -1
  24. package/dist/realtime/realtime_model.test.js +104 -14
  25. package/dist/realtime/realtime_model.test.js.map +1 -1
  26. package/dist/realtime/realtime_model_beta.cjs +40 -22
  27. package/dist/realtime/realtime_model_beta.cjs.map +1 -1
  28. package/dist/realtime/realtime_model_beta.d.ts.map +1 -1
  29. package/dist/realtime/realtime_model_beta.js +40 -22
  30. package/dist/realtime/realtime_model_beta.js.map +1 -1
  31. package/dist/stt.cjs +11 -0
  32. package/dist/stt.cjs.map +1 -1
  33. package/dist/stt.d.cts +2 -0
  34. package/dist/stt.d.ts +2 -0
  35. package/dist/stt.d.ts.map +1 -1
  36. package/dist/stt.js +11 -0
  37. package/dist/stt.js.map +1 -1
  38. package/dist/tts.cjs +11 -0
  39. package/dist/tts.cjs.map +1 -1
  40. package/dist/tts.d.cts +2 -0
  41. package/dist/tts.d.ts +2 -0
  42. package/dist/tts.d.ts.map +1 -1
  43. package/dist/tts.js +11 -0
  44. package/dist/tts.js.map +1 -1
  45. package/package.json +5 -5
  46. package/src/llm.ts +9 -0
  47. package/src/realtime/api_proto.ts +8 -2
  48. package/src/realtime/realtime_model.test.ts +129 -14
  49. package/src/realtime/realtime_model.ts +51 -26
  50. package/src/realtime/realtime_model_beta.ts +42 -25
  51. package/src/stt.ts +13 -0
  52. package/src/tts.ts +13 -0
package/dist/tts.d.ts CHANGED
@@ -14,6 +14,8 @@ export declare class TTS extends tts.TTS {
14
14
  #private;
15
15
  label: string;
16
16
  private abortController;
17
+ get model(): string;
18
+ get provider(): string;
17
19
  /**
18
20
  * Create a new instance of OpenAI TTS.
19
21
  *
package/dist/tts.d.ts.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"tts.d.ts","sourceRoot":"","sources":["../src/tts.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,KAAK,iBAAiB,EAA8B,GAAG,EAAE,MAAM,iBAAiB,CAAC;AAE1F,OAAO,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAChC,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAKxD,MAAM,WAAW,UAAU;IACzB,KAAK,EAAE,SAAS,GAAG,MAAM,CAAC;IAC1B,KAAK,EAAE,SAAS,CAAC;IACjB,KAAK,EAAE,MAAM,CAAC;IACd,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AASD,qBAAa,GAAI,SAAQ,GAAG,CAAC,GAAG;;IAG9B,KAAK,SAAgB;IACrB,OAAO,CAAC,eAAe,CAAyB;IAEhD;;;;;;OAMG;gBACS,IAAI,GAAE,OAAO,CAAC,UAAU,CAAqB;IAgBzD,aAAa,CAAC,IAAI,EAAE;QAAE,KAAK,CAAC,EAAE,SAAS,GAAG,MAAM,CAAC;QAAC,KAAK,CAAC,EAAE,SAAS,CAAC;QAAC,KAAK,CAAC,EAAE,MAAM,CAAA;KAAE;IAIrF,UAAU,CACR,IAAI,EAAE,MAAM,EACZ,WAAW,CAAC,EAAE,iBAAiB,EAC/B,WAAW,CAAC,EAAE,WAAW,GACxB,aAAa;IAoBhB,MAAM,IAAI,GAAG,CAAC,gBAAgB;IAIxB,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;CAG7B;AAED,qBAAa,aAAc,SAAQ,GAAG,CAAC,aAAa;IAClD,KAAK,SAA0B;IAC/B,OAAO,CAAC,MAAM,CAAe;gBAI3B,GAAG,EAAE,GAAG,EACR,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,OAAO,CAAC,GAAG,CAAC,EACpB,WAAW,CAAC,EAAE,iBAAiB,EAC/B,WAAW,CAAC,EAAE,WAAW;cAMX,GAAG;CA+BpB"}
1
+ {"version":3,"file":"tts.d.ts","sourceRoot":"","sources":["../src/tts.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,KAAK,iBAAiB,EAA8B,GAAG,EAAE,MAAM,iBAAiB,CAAC;AAE1F,OAAO,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAChC,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAKxD,MAAM,WAAW,UAAU;IACzB,KAAK,EAAE,SAAS,GAAG,MAAM,CAAC;IAC1B,KAAK,EAAE,SAAS,CAAC;IACjB,KAAK,EAAE,MAAM,CAAC;IACd,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AASD,qBAAa,GAAI,SAAQ,GAAG,CAAC,GAAG;;IAG9B,KAAK,SAAgB;IACrB,OAAO,CAAC,eAAe,CAAyB;IAEhD,IAAI,KAAK,IAAI,MAAM,CAElB;IAED,IAAI,QAAQ,IAAI,MAAM,CAOrB;IAED;;;;;;OAMG;gBACS,IAAI,GAAE,OAAO,CAAC,UAAU,CAAqB;IAgBzD,aAAa,CAAC,IAAI,EAAE;QAAE,KAAK,CAAC,EAAE,SAAS,GAAG,MAAM,CAAC;QAAC,KAAK,CAAC,EAAE,SAAS,CAAC;QAAC,KAAK,CAAC,EAAE,MAAM,CAAA;KAAE;IAIrF,UAAU,CACR,IAAI,EAAE,MAAM,EACZ,WAAW,CAAC,EAAE,iBAAiB,EAC/B,WAAW,CAAC,EAAE,WAAW,GACxB,aAAa;IAoBhB,MAAM,IAAI,GAAG,CAAC,gBAAgB;IAIxB,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;CAG7B;AAED,qBAAa,aAAc,SAAQ,GAAG,CAAC,aAAa;IAClD,KAAK,SAA0B;IAC/B,OAAO,CAAC,MAAM,CAAe;gBAI3B,GAAG,EAAE,GAAG,EACR,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,OAAO,CAAC,GAAG,CAAC,EACpB,WAAW,CAAC,EAAE,iBAAiB,EAC/B,WAAW,CAAC,EAAE,WAAW;cAMX,GAAG;CA+BpB"}
package/dist/tts.js CHANGED
@@ -13,6 +13,17 @@ class TTS extends tts.TTS {
13
13
  #client;
14
14
  label = "openai.TTS";
15
15
  abortController = new AbortController();
16
+ get model() {
17
+ return this.#opts.model;
18
+ }
19
+ get provider() {
20
+ try {
21
+ const url = new URL(this.#client.baseURL);
22
+ return url.host;
23
+ } catch {
24
+ return "api.openai.com";
25
+ }
26
+ }
16
27
  /**
17
28
  * Create a new instance of OpenAI TTS.
18
29
  *
package/dist/tts.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/tts.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { type APIConnectOptions, AudioByteStream, shortuuid, tts } from '@livekit/agents';\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { OpenAI } from 'openai';\nimport type { TTSModels, TTSVoices } from './models.js';\n\nconst OPENAI_TTS_SAMPLE_RATE = 24000;\nconst OPENAI_TTS_CHANNELS = 1;\n\nexport interface TTSOptions {\n model: TTSModels | string;\n voice: TTSVoices;\n speed: number;\n instructions?: string;\n baseURL?: string;\n client?: OpenAI;\n apiKey?: string;\n}\n\nconst defaultTTSOptions: TTSOptions = {\n apiKey: process.env.OPENAI_API_KEY,\n model: 'tts-1',\n voice: 'alloy',\n speed: 1,\n};\n\nexport class TTS extends tts.TTS {\n #opts: TTSOptions;\n #client: OpenAI;\n label = 'openai.TTS';\n private abortController = new AbortController();\n\n /**\n * Create a new instance of OpenAI TTS.\n *\n * @remarks\n * `apiKey` must be set to your OpenAI API key, either using the argument or by setting the\n * `OPENAI_API_KEY` environment variable.\n */\n constructor(opts: Partial<TTSOptions> = defaultTTSOptions) {\n super(OPENAI_TTS_SAMPLE_RATE, OPENAI_TTS_CHANNELS, { streaming: false });\n\n this.#opts = { ...defaultTTSOptions, ...opts };\n if (this.#opts.apiKey === undefined && !this.#opts.client) {\n throw new Error('OpenAI API key is required, whether as an argument or as $OPENAI_API_KEY');\n }\n\n this.#client =\n this.#opts.client ||\n new OpenAI({\n baseURL: this.#opts.baseURL,\n apiKey: this.#opts.apiKey,\n });\n }\n\n updateOptions(opts: { model?: TTSModels | string; voice?: TTSVoices; speed?: number }) {\n this.#opts = { ...this.#opts, ...opts };\n }\n\n synthesize(\n text: string,\n connOptions?: APIConnectOptions,\n abortSignal?: AbortSignal,\n ): ChunkedStream {\n return new ChunkedStream(\n this,\n text,\n this.#client.audio.speech.create(\n {\n input: text,\n model: 
this.#opts.model,\n voice: this.#opts.voice,\n instructions: this.#opts.instructions,\n response_format: 'pcm',\n speed: this.#opts.speed,\n },\n { signal: abortSignal },\n ),\n connOptions,\n abortSignal,\n );\n }\n\n stream(): tts.SynthesizeStream {\n throw new Error('Streaming is not supported on OpenAI TTS');\n }\n\n async close(): Promise<void> {\n this.abortController.abort();\n }\n}\n\nexport class ChunkedStream extends tts.ChunkedStream {\n label = 'openai.ChunkedStream';\n private stream: Promise<any>;\n\n // set Promise<T> to any because OpenAI returns an annoying Response type\n constructor(\n tts: TTS,\n text: string,\n stream: Promise<any>,\n connOptions?: APIConnectOptions,\n abortSignal?: AbortSignal,\n ) {\n super(text, tts, connOptions, abortSignal);\n this.stream = stream;\n }\n\n protected async run() {\n try {\n const buffer = await this.stream.then((r) => r.arrayBuffer());\n const requestId = shortuuid();\n const audioByteStream = new AudioByteStream(OPENAI_TTS_SAMPLE_RATE, OPENAI_TTS_CHANNELS);\n const frames = audioByteStream.write(buffer);\n\n let lastFrame: AudioFrame | undefined;\n const sendLastFrame = (segmentId: string, final: boolean) => {\n if (lastFrame) {\n this.queue.put({ requestId, segmentId, frame: lastFrame, final });\n lastFrame = undefined;\n }\n };\n\n for (const frame of frames) {\n sendLastFrame(requestId, false);\n lastFrame = frame;\n }\n sendLastFrame(requestId, true);\n\n this.queue.close();\n } catch (error) {\n if (error instanceof Error && error.name === 'AbortError') {\n return;\n }\n throw error;\n } finally {\n this.queue.close();\n }\n 
}\n}\n"],"mappings":"AAGA,SAAiC,iBAAiB,WAAW,WAAW;AAExE,SAAS,cAAc;AAGvB,MAAM,yBAAyB;AAC/B,MAAM,sBAAsB;AAY5B,MAAM,oBAAgC;AAAA,EACpC,QAAQ,QAAQ,IAAI;AAAA,EACpB,OAAO;AAAA,EACP,OAAO;AAAA,EACP,OAAO;AACT;AAEO,MAAM,YAAY,IAAI,IAAI;AAAA,EAC/B;AAAA,EACA;AAAA,EACA,QAAQ;AAAA,EACA,kBAAkB,IAAI,gBAAgB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAS9C,YAAY,OAA4B,mBAAmB;AACzD,UAAM,wBAAwB,qBAAqB,EAAE,WAAW,MAAM,CAAC;AAEvE,SAAK,QAAQ,EAAE,GAAG,mBAAmB,GAAG,KAAK;AAC7C,QAAI,KAAK,MAAM,WAAW,UAAa,CAAC,KAAK,MAAM,QAAQ;AACzD,YAAM,IAAI,MAAM,0EAA0E;AAAA,IAC5F;AAEA,SAAK,UACH,KAAK,MAAM,UACX,IAAI,OAAO;AAAA,MACT,SAAS,KAAK,MAAM;AAAA,MACpB,QAAQ,KAAK,MAAM;AAAA,IACrB,CAAC;AAAA,EACL;AAAA,EAEA,cAAc,MAAyE;AACrF,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AAAA,EACxC;AAAA,EAEA,WACE,MACA,aACA,aACe;AACf,WAAO,IAAI;AAAA,MACT;AAAA,MACA;AAAA,MACA,KAAK,QAAQ,MAAM,OAAO;AAAA,QACxB;AAAA,UACE,OAAO;AAAA,UACP,OAAO,KAAK,MAAM;AAAA,UAClB,OAAO,KAAK,MAAM;AAAA,UAClB,cAAc,KAAK,MAAM;AAAA,UACzB,iBAAiB;AAAA,UACjB,OAAO,KAAK,MAAM;AAAA,QACpB;AAAA,QACA,EAAE,QAAQ,YAAY;AAAA,MACxB;AAAA,MACA;AAAA,MACA;AAAA,IACF;AAAA,EACF;AAAA,EAEA,SAA+B;AAC7B,UAAM,IAAI,MAAM,0CAA0C;AAAA,EAC5D;AAAA,EAEA,MAAM,QAAuB;AAC3B,SAAK,gBAAgB,MAAM;AAAA,EAC7B;AACF;AAEO,MAAM,sBAAsB,IAAI,cAAc;AAAA,EACnD,QAAQ;AAAA,EACA;AAAA;AAAA,EAGR,YACEA,MACA,MACA,QACA,aACA,aACA;AACA,UAAM,MAAMA,MAAK,aAAa,WAAW;AACzC,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,MAAgB,MAAM;AACpB,QAAI;AACF,YAAM,SAAS,MAAM,KAAK,OAAO,KAAK,CAAC,MAAM,EAAE,YAAY,CAAC;AAC5D,YAAM,YAAY,UAAU;AAC5B,YAAM,kBAAkB,IAAI,gBAAgB,wBAAwB,mBAAmB;AACvF,YAAM,SAAS,gBAAgB,MAAM,MAAM;AAE3C,UAAI;AACJ,YAAM,gBAAgB,CAAC,WAAmB,UAAmB;AAC3D,YAAI,WAAW;AACb,eAAK,MAAM,IAAI,EAAE,WAAW,WAAW,OAAO,WAAW,MAAM,CAAC;AAChE,sBAAY;AAAA,QACd;AAAA,MACF;AAEA,iBAAW,SAAS,QAAQ;AAC1B,sBAAc,WAAW,KAAK;AAC9B,oBAAY;AAAA,MACd;AACA,oBAAc,WAAW,IAAI;AAE7B,WAAK,MAAM,MAAM;AAAA,IACnB,SAAS,OAAO;AACd,UAAI,iBAAiB,SAAS,MAAM,SAAS,cAAc;AACzD;AAAA,MACF;AACA,YAAM;AAAA,IACR,UAAE;AACA,WAAK,MAAM,MAAM;AAAA,IACnB;AAAA,EACF;AACF;","names":["tts"]}
1
+ {"version":3,"sources":["../src/tts.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { type APIConnectOptions, AudioByteStream, shortuuid, tts } from '@livekit/agents';\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { OpenAI } from 'openai';\nimport type { TTSModels, TTSVoices } from './models.js';\n\nconst OPENAI_TTS_SAMPLE_RATE = 24000;\nconst OPENAI_TTS_CHANNELS = 1;\n\nexport interface TTSOptions {\n model: TTSModels | string;\n voice: TTSVoices;\n speed: number;\n instructions?: string;\n baseURL?: string;\n client?: OpenAI;\n apiKey?: string;\n}\n\nconst defaultTTSOptions: TTSOptions = {\n apiKey: process.env.OPENAI_API_KEY,\n model: 'tts-1',\n voice: 'alloy',\n speed: 1,\n};\n\nexport class TTS extends tts.TTS {\n #opts: TTSOptions;\n #client: OpenAI;\n label = 'openai.TTS';\n private abortController = new AbortController();\n\n get model(): string {\n return this.#opts.model;\n }\n\n get provider(): string {\n try {\n const url = new URL(this.#client.baseURL);\n return url.host;\n } catch {\n return 'api.openai.com';\n }\n }\n\n /**\n * Create a new instance of OpenAI TTS.\n *\n * @remarks\n * `apiKey` must be set to your OpenAI API key, either using the argument or by setting the\n * `OPENAI_API_KEY` environment variable.\n */\n constructor(opts: Partial<TTSOptions> = defaultTTSOptions) {\n super(OPENAI_TTS_SAMPLE_RATE, OPENAI_TTS_CHANNELS, { streaming: false });\n\n this.#opts = { ...defaultTTSOptions, ...opts };\n if (this.#opts.apiKey === undefined && !this.#opts.client) {\n throw new Error('OpenAI API key is required, whether as an argument or as $OPENAI_API_KEY');\n }\n\n this.#client =\n this.#opts.client ||\n new OpenAI({\n baseURL: this.#opts.baseURL,\n apiKey: this.#opts.apiKey,\n });\n }\n\n updateOptions(opts: { model?: TTSModels | string; voice?: TTSVoices; speed?: number }) {\n this.#opts = { ...this.#opts, ...opts };\n }\n\n synthesize(\n text: string,\n 
connOptions?: APIConnectOptions,\n abortSignal?: AbortSignal,\n ): ChunkedStream {\n return new ChunkedStream(\n this,\n text,\n this.#client.audio.speech.create(\n {\n input: text,\n model: this.#opts.model,\n voice: this.#opts.voice,\n instructions: this.#opts.instructions,\n response_format: 'pcm',\n speed: this.#opts.speed,\n },\n { signal: abortSignal },\n ),\n connOptions,\n abortSignal,\n );\n }\n\n stream(): tts.SynthesizeStream {\n throw new Error('Streaming is not supported on OpenAI TTS');\n }\n\n async close(): Promise<void> {\n this.abortController.abort();\n }\n}\n\nexport class ChunkedStream extends tts.ChunkedStream {\n label = 'openai.ChunkedStream';\n private stream: Promise<any>;\n\n // set Promise<T> to any because OpenAI returns an annoying Response type\n constructor(\n tts: TTS,\n text: string,\n stream: Promise<any>,\n connOptions?: APIConnectOptions,\n abortSignal?: AbortSignal,\n ) {\n super(text, tts, connOptions, abortSignal);\n this.stream = stream;\n }\n\n protected async run() {\n try {\n const buffer = await this.stream.then((r) => r.arrayBuffer());\n const requestId = shortuuid();\n const audioByteStream = new AudioByteStream(OPENAI_TTS_SAMPLE_RATE, OPENAI_TTS_CHANNELS);\n const frames = audioByteStream.write(buffer);\n\n let lastFrame: AudioFrame | undefined;\n const sendLastFrame = (segmentId: string, final: boolean) => {\n if (lastFrame) {\n this.queue.put({ requestId, segmentId, frame: lastFrame, final });\n lastFrame = undefined;\n }\n };\n\n for (const frame of frames) {\n sendLastFrame(requestId, false);\n lastFrame = frame;\n }\n sendLastFrame(requestId, true);\n\n this.queue.close();\n } catch (error) {\n if (error instanceof Error && error.name === 'AbortError') {\n return;\n }\n throw error;\n } finally {\n this.queue.close();\n }\n 
}\n}\n"],"mappings":"AAGA,SAAiC,iBAAiB,WAAW,WAAW;AAExE,SAAS,cAAc;AAGvB,MAAM,yBAAyB;AAC/B,MAAM,sBAAsB;AAY5B,MAAM,oBAAgC;AAAA,EACpC,QAAQ,QAAQ,IAAI;AAAA,EACpB,OAAO;AAAA,EACP,OAAO;AAAA,EACP,OAAO;AACT;AAEO,MAAM,YAAY,IAAI,IAAI;AAAA,EAC/B;AAAA,EACA;AAAA,EACA,QAAQ;AAAA,EACA,kBAAkB,IAAI,gBAAgB;AAAA,EAE9C,IAAI,QAAgB;AAClB,WAAO,KAAK,MAAM;AAAA,EACpB;AAAA,EAEA,IAAI,WAAmB;AACrB,QAAI;AACF,YAAM,MAAM,IAAI,IAAI,KAAK,QAAQ,OAAO;AACxC,aAAO,IAAI;AAAA,IACb,QAAQ;AACN,aAAO;AAAA,IACT;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EASA,YAAY,OAA4B,mBAAmB;AACzD,UAAM,wBAAwB,qBAAqB,EAAE,WAAW,MAAM,CAAC;AAEvE,SAAK,QAAQ,EAAE,GAAG,mBAAmB,GAAG,KAAK;AAC7C,QAAI,KAAK,MAAM,WAAW,UAAa,CAAC,KAAK,MAAM,QAAQ;AACzD,YAAM,IAAI,MAAM,0EAA0E;AAAA,IAC5F;AAEA,SAAK,UACH,KAAK,MAAM,UACX,IAAI,OAAO;AAAA,MACT,SAAS,KAAK,MAAM;AAAA,MACpB,QAAQ,KAAK,MAAM;AAAA,IACrB,CAAC;AAAA,EACL;AAAA,EAEA,cAAc,MAAyE;AACrF,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AAAA,EACxC;AAAA,EAEA,WACE,MACA,aACA,aACe;AACf,WAAO,IAAI;AAAA,MACT;AAAA,MACA;AAAA,MACA,KAAK,QAAQ,MAAM,OAAO;AAAA,QACxB;AAAA,UACE,OAAO;AAAA,UACP,OAAO,KAAK,MAAM;AAAA,UAClB,OAAO,KAAK,MAAM;AAAA,UAClB,cAAc,KAAK,MAAM;AAAA,UACzB,iBAAiB;AAAA,UACjB,OAAO,KAAK,MAAM;AAAA,QACpB;AAAA,QACA,EAAE,QAAQ,YAAY;AAAA,MACxB;AAAA,MACA;AAAA,MACA;AAAA,IACF;AAAA,EACF;AAAA,EAEA,SAA+B;AAC7B,UAAM,IAAI,MAAM,0CAA0C;AAAA,EAC5D;AAAA,EAEA,MAAM,QAAuB;AAC3B,SAAK,gBAAgB,MAAM;AAAA,EAC7B;AACF;AAEO,MAAM,sBAAsB,IAAI,cAAc;AAAA,EACnD,QAAQ;AAAA,EACA;AAAA;AAAA,EAGR,YACEA,MACA,MACA,QACA,aACA,aACA;AACA,UAAM,MAAMA,MAAK,aAAa,WAAW;AACzC,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,MAAgB,MAAM;AACpB,QAAI;AACF,YAAM,SAAS,MAAM,KAAK,OAAO,KAAK,CAAC,MAAM,EAAE,YAAY,CAAC;AAC5D,YAAM,YAAY,UAAU;AAC5B,YAAM,kBAAkB,IAAI,gBAAgB,wBAAwB,mBAAmB;AACvF,YAAM,SAAS,gBAAgB,MAAM,MAAM;AAE3C,UAAI;AACJ,YAAM,gBAAgB,CAAC,WAAmB,UAAmB;AAC3D,YAAI,WAAW;AACb,eAAK,MAAM,IAAI,EAAE,WAAW,WAAW,OAAO,WAAW,MAAM,CAAC;AAChE,sBAAY;AAAA,QACd;AAAA,MACF;AAEA,iBAAW,SAAS,QAAQ;AAC1B,sBAAc,WAAW,KAAK;AAC9B,oBAAY;AAAA,MACd;AACA,oBAAc,WAAW,IAAI;AAE7B,WAAK,MAAM,MAAM;AAAA,IACnB,SAAS,OAAO;AACd,UAAI,iBAAiB,SAAS,MA
AM,SAAS,cAAc;AACzD;AAAA,MACF;AACA,YAAM;AAAA,IACR,UAAE;AACA,WAAK,MAAM,MAAM;AAAA,IACnB;AAAA,EACF;AACF;","names":["tts"]}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@livekit/agents-plugin-openai",
3
- "version": "1.0.51",
3
+ "version": "1.1.0",
4
4
  "description": "OpenAI plugin for LiveKit Node Agents",
5
5
  "main": "dist/index.js",
6
6
  "require": "dist/index.cjs",
@@ -30,9 +30,9 @@
30
30
  "@types/ws": "^8.5.10",
31
31
  "tsup": "^8.3.5",
32
32
  "typescript": "^5.0.0",
33
- "@livekit/agents": "1.0.51",
34
- "@livekit/agents-plugin-silero": "1.0.51",
35
- "@livekit/agents-plugins-test": "1.0.51"
33
+ "@livekit/agents": "1.1.0",
34
+ "@livekit/agents-plugin-silero": "1.1.0",
35
+ "@livekit/agents-plugins-test": "1.1.0"
36
36
  },
37
37
  "dependencies": {
38
38
  "@livekit/mutex": "^1.1.1",
@@ -42,7 +42,7 @@
42
42
  "peerDependencies": {
43
43
  "@livekit/rtc-node": "^0.13.24",
44
44
  "zod": "^3.25.76 || ^4.1.8",
45
- "@livekit/agents": "1.0.51"
45
+ "@livekit/agents": "1.1.0"
46
46
  },
47
47
  "scripts": {
48
48
  "build": "tsup --onSuccess \"pnpm build:types\"",
package/src/llm.ts CHANGED
@@ -86,6 +86,15 @@ export class LLM extends llm.LLM {
86
86
  return this.#opts.model;
87
87
  }
88
88
 
89
+ get provider(): string {
90
+ try {
91
+ const url = new URL(this.#client.baseURL);
92
+ return url.host;
93
+ } catch {
94
+ return 'api.openai.com';
95
+ }
96
+ }
97
+
89
98
  /**
90
99
  * Create a new instance of OpenAI LLM with Azure.
91
100
  *
package/src/realtime/api_proto.ts CHANGED
@@ -162,6 +162,11 @@ export interface InputAudioContent {
162
162
  audio: AudioBase64Bytes;
163
163
  }
164
164
 
165
+ export interface InputImageContent {
166
+ type: 'input_image';
167
+ image_url: string;
168
+ }
169
+
165
170
  export interface TextContent {
166
171
  type: 'text';
167
172
  text: string;
@@ -181,6 +186,7 @@ export interface AudioContent {
181
186
  export type Content =
182
187
  | InputTextContent
183
188
  | InputAudioContent
189
+ | InputImageContent
184
190
  | TextContent
185
191
  | OutputTextContent
186
192
  | AudioContent;
@@ -206,7 +212,7 @@ export interface SystemItem extends BaseItem {
206
212
  export interface UserItem extends BaseItem {
207
213
  type: 'message';
208
214
  role: 'user';
209
- content: (InputTextContent | InputAudioContent)[];
215
+ content: (InputTextContent | InputAudioContent | InputImageContent)[];
210
216
  }
211
217
 
212
218
  export interface AssistantItem extends BaseItem {
@@ -361,7 +367,7 @@ export interface UserItemCreate {
361
367
  id: string;
362
368
  type: 'message';
363
369
  role: 'user';
364
- content: (InputTextContent | InputAudioContent)[];
370
+ content: (InputTextContent | InputAudioContent | InputImageContent)[];
365
371
  }
366
372
 
367
373
  export interface AssistantItemCreate {
package/src/realtime/realtime_model.test.ts CHANGED
@@ -8,14 +8,14 @@ import { livekitItemToOpenAIItem } from './realtime_model.js';
8
8
 
9
9
  describe('livekitItemToOpenAIItem', () => {
10
10
  describe('message items', () => {
11
- it('should use output_text type for assistant messages', () => {
11
+ it('should use output_text type for assistant messages', async () => {
12
12
  const assistantMessage = new llm.ChatMessage({
13
13
  role: 'assistant',
14
14
  content: 'Hello, how can I help you?',
15
15
  id: 'test-assistant-msg',
16
16
  });
17
17
 
18
- const result = livekitItemToOpenAIItem(assistantMessage) as api_proto.AssistantItem;
18
+ const result = (await livekitItemToOpenAIItem(assistantMessage)) as api_proto.AssistantItem;
19
19
 
20
20
  expect(result.type).toBe('message');
21
21
  expect(result.role).toBe('assistant');
@@ -25,14 +25,14 @@ describe('livekitItemToOpenAIItem', () => {
25
25
  expect((content as api_proto.OutputTextContent).text).toBe('Hello, how can I help you?');
26
26
  });
27
27
 
28
- it('should use input_text type for user messages', () => {
28
+ it('should use input_text type for user messages', async () => {
29
29
  const userMessage = new llm.ChatMessage({
30
30
  role: 'user',
31
31
  content: 'What is the weather like?',
32
32
  id: 'test-user-msg',
33
33
  });
34
34
 
35
- const result = livekitItemToOpenAIItem(userMessage) as api_proto.UserItem;
35
+ const result = (await livekitItemToOpenAIItem(userMessage)) as api_proto.UserItem;
36
36
 
37
37
  expect(result.type).toBe('message');
38
38
  expect(result.role).toBe('user');
@@ -42,14 +42,14 @@ describe('livekitItemToOpenAIItem', () => {
42
42
  expect((content as api_proto.InputTextContent).text).toBe('What is the weather like?');
43
43
  });
44
44
 
45
- it('should use input_text type for system messages', () => {
45
+ it('should use input_text type for system messages', async () => {
46
46
  const systemMessage = new llm.ChatMessage({
47
47
  role: 'system',
48
48
  content: 'You are a helpful assistant.',
49
49
  id: 'test-system-msg',
50
50
  });
51
51
 
52
- const result = livekitItemToOpenAIItem(systemMessage) as api_proto.UserItem;
52
+ const result = (await livekitItemToOpenAIItem(systemMessage)) as api_proto.UserItem;
53
53
 
54
54
  expect(result.type).toBe('message');
55
55
  expect(result.role).toBe('system');
@@ -58,14 +58,14 @@ describe('livekitItemToOpenAIItem', () => {
58
58
  expect(content.type).toBe('input_text');
59
59
  });
60
60
 
61
- it('should convert developer role to system role', () => {
61
+ it('should convert developer role to system role', async () => {
62
62
  const developerMessage = new llm.ChatMessage({
63
63
  role: 'developer',
64
64
  content: 'System instructions.',
65
65
  id: 'test-developer-msg',
66
66
  });
67
67
 
68
- const result = livekitItemToOpenAIItem(developerMessage) as api_proto.UserItem;
68
+ const result = (await livekitItemToOpenAIItem(developerMessage)) as api_proto.UserItem;
69
69
 
70
70
  expect(result.type).toBe('message');
71
71
  expect(result.role).toBe('system');
@@ -73,14 +73,16 @@ describe('livekitItemToOpenAIItem', () => {
73
73
  expect(content.type).toBe('input_text');
74
74
  });
75
75
 
76
- it('should handle multiple content items for assistant', () => {
76
+ it('should handle multiple content items for assistant', async () => {
77
77
  const multiContentMessage = new llm.ChatMessage({
78
78
  role: 'assistant',
79
79
  content: ['First part.', 'Second part.'],
80
80
  id: 'test-multi-msg',
81
81
  });
82
82
 
83
- const result = livekitItemToOpenAIItem(multiContentMessage) as api_proto.AssistantItem;
83
+ const result = (await livekitItemToOpenAIItem(
84
+ multiContentMessage,
85
+ )) as api_proto.AssistantItem;
84
86
 
85
87
  expect(result.content).toHaveLength(2);
86
88
  const content0 = result.content[0]!;
@@ -88,10 +90,121 @@ describe('livekitItemToOpenAIItem', () => {
88
90
  expect(content0.type).toBe('output_text');
89
91
  expect(content1.type).toBe('output_text');
90
92
  });
93
+
94
+ it('should convert image content to input_image for user messages', async () => {
95
+ const base64Data =
96
+ 'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg==';
97
+ const imageContent = llm.createImageContent({
98
+ image: `data:image/png;base64,${base64Data}`,
99
+ mimeType: 'image/png',
100
+ });
101
+
102
+ const userMessage = new llm.ChatMessage({
103
+ role: 'user',
104
+ content: [imageContent],
105
+ id: 'test-image-msg',
106
+ });
107
+
108
+ const result = (await livekitItemToOpenAIItem(userMessage)) as api_proto.UserItem;
109
+
110
+ expect(result.type).toBe('message');
111
+ expect(result.role).toBe('user');
112
+ expect(result.content).toHaveLength(1);
113
+ const content = result.content[0]!;
114
+ expect(content.type).toBe('input_image');
115
+ expect((content as api_proto.InputImageContent).image_url).toBe(
116
+ `data:image/png;base64,${base64Data}`,
117
+ );
118
+ });
119
+
120
+ it('should ignore image content for assistant messages', async () => {
121
+ const base64Data =
122
+ 'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg==';
123
+ const imageContent = llm.createImageContent({
124
+ image: `data:image/png;base64,${base64Data}`,
125
+ mimeType: 'image/png',
126
+ });
127
+
128
+ const assistantMessage = new llm.ChatMessage({
129
+ role: 'assistant',
130
+ content: [imageContent],
131
+ id: 'test-assistant-image-msg',
132
+ });
133
+
134
+ const result = (await livekitItemToOpenAIItem(assistantMessage)) as api_proto.AssistantItem;
135
+
136
+ expect(result.type).toBe('message');
137
+ expect(result.role).toBe('assistant');
138
+ expect(result.content).toHaveLength(0);
139
+ });
140
+
141
+ it('should ignore image content for system messages', async () => {
142
+ const base64Data =
143
+ 'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg==';
144
+ const imageContent = llm.createImageContent({
145
+ image: `data:image/png;base64,${base64Data}`,
146
+ mimeType: 'image/png',
147
+ });
148
+
149
+ const systemMessage = new llm.ChatMessage({
150
+ role: 'system',
151
+ content: [imageContent],
152
+ id: 'test-system-image-msg',
153
+ });
154
+
155
+ const result = (await livekitItemToOpenAIItem(systemMessage)) as api_proto.SystemItem;
156
+
157
+ expect(result.type).toBe('message');
158
+ expect(result.role).toBe('system');
159
+ expect(result.content).toHaveLength(0);
160
+ });
161
+
162
+ it('should ignore image content for developer messages mapped to system', async () => {
163
+ const base64Data =
164
+ 'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg==';
165
+ const imageContent = llm.createImageContent({
166
+ image: `data:image/png;base64,${base64Data}`,
167
+ mimeType: 'image/png',
168
+ });
169
+
170
+ const developerMessage = new llm.ChatMessage({
171
+ role: 'developer',
172
+ content: [imageContent],
173
+ id: 'test-developer-image-msg',
174
+ });
175
+
176
+ const result = (await livekitItemToOpenAIItem(developerMessage)) as api_proto.SystemItem;
177
+
178
+ expect(result.type).toBe('message');
179
+ expect(result.role).toBe('system');
180
+ expect(result.content).toHaveLength(0);
181
+ });
182
+
183
+ it('should handle mixed text and image content', async () => {
184
+ const base64Data =
185
+ 'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg==';
186
+ const imageContent = llm.createImageContent({
187
+ image: `data:image/png;base64,${base64Data}`,
188
+ mimeType: 'image/png',
189
+ });
190
+
191
+ const userMessage = new llm.ChatMessage({
192
+ role: 'user',
193
+ content: ['Describe this image:', imageContent],
194
+ id: 'test-mixed-msg',
195
+ });
196
+
197
+ const result = (await livekitItemToOpenAIItem(userMessage)) as api_proto.UserItem;
198
+
199
+ expect(result.type).toBe('message');
200
+ expect(result.content).toHaveLength(2);
201
+ expect(result.content[0]!.type).toBe('input_text');
202
+ expect(result.content[1]!.type).toBe('input_image');
203
+ });
91
204
  });
92
205
 
93
206
  describe('function_call items', () => {
94
- it('should convert function call items correctly', () => {
207
+ it('should convert function call items correctly', async () => {
95
208
  const functionCall = new llm.FunctionCall({
96
209
  callId: 'call-123',
97
210
  name: 'get_weather',
@@ -99,7 +212,7 @@ describe('livekitItemToOpenAIItem', () => {
99
212
  id: 'test-func-call',
100
213
  });
101
214
 
102
- const result = livekitItemToOpenAIItem(functionCall) as api_proto.FunctionCallItem;
215
+ const result = (await livekitItemToOpenAIItem(functionCall)) as api_proto.FunctionCallItem;
103
216
 
104
217
  expect(result.type).toBe('function_call');
105
218
  expect(result.id).toBe('test-func-call');
@@ -110,7 +223,7 @@ describe('livekitItemToOpenAIItem', () => {
110
223
  });
111
224
 
112
225
  describe('function_call_output items', () => {
113
- it('should convert function call output items correctly', () => {
226
+ it('should convert function call output items correctly', async () => {
114
227
  const functionOutput = new llm.FunctionCallOutput({
115
228
  callId: 'call-123',
116
229
  output: 'The weather in San Francisco is sunny.',
@@ -118,7 +231,9 @@ describe('livekitItemToOpenAIItem', () => {
118
231
  id: 'test-func-output',
119
232
  });
120
233
 
121
- const result = livekitItemToOpenAIItem(functionOutput) as api_proto.FunctionCallOutputItem;
234
+ const result = (await livekitItemToOpenAIItem(
235
+ functionOutput,
236
+ )) as api_proto.FunctionCallOutputItem;
122
237
 
123
238
  expect(result.type).toBe('function_call_output');
124
239
  expect(result.id).toBe('test-func-output');
package/src/realtime/realtime_model.ts CHANGED
@@ -144,6 +144,15 @@ export class RealtimeModel extends llm.RealtimeModel {
144
144
  return this._options.model;
145
145
  }
146
146
 
147
+ get provider(): string {
148
+ try {
149
+ const url = new URL(this._options.baseURL);
150
+ return url.host;
151
+ } catch {
152
+ return 'api.openai.com';
153
+ }
154
+ }
155
+
147
156
  constructor(
148
157
  options: {
149
158
  model?: string;
@@ -461,28 +470,27 @@ export class RealtimeSession extends llm.RealtimeSession {
461
470
 
462
471
  async updateChatCtx(_chatCtx: llm.ChatContext): Promise<void> {
463
472
  const unlock = await this.updateChatCtxLock.lock();
464
- const events = this.createChatCtxUpdateEvents(_chatCtx);
465
- const futures: Future<void>[] = [];
473
+ try {
474
+ const events = await this.createChatCtxUpdateEvents(_chatCtx);
475
+ const futures: Future<void>[] = [];
466
476
 
467
- for (const event of events) {
468
- const future = new Future<void>();
469
- futures.push(future);
477
+ for (const event of events) {
478
+ const future = new Future<void>();
479
+ futures.push(future);
470
480
 
471
- if (event.type === 'conversation.item.create') {
472
- this.itemCreateFutures[event.item.id] = future;
473
- } else if (event.type == 'conversation.item.delete') {
474
- this.itemDeleteFutures[event.item_id] = future;
475
- }
481
+ if (event.type === 'conversation.item.create') {
482
+ this.itemCreateFutures[event.item.id] = future;
483
+ } else if (event.type == 'conversation.item.delete') {
484
+ this.itemDeleteFutures[event.item_id] = future;
485
+ }
476
486
 
477
- this.sendEvent(event);
478
- }
487
+ this.sendEvent(event);
488
+ }
479
489
 
480
- if (futures.length === 0) {
481
- unlock();
482
- return;
483
- }
490
+ if (futures.length === 0) {
491
+ return;
492
+ }
484
493
 
485
- try {
486
494
  // wait for futures to resolve or timeout
487
495
  await Promise.race([
488
496
  Promise.all(futures),
@@ -498,10 +506,10 @@ export class RealtimeSession extends llm.RealtimeSession {
498
506
  }
499
507
  }
500
508
 
501
- private createChatCtxUpdateEvents(
509
+ private async createChatCtxUpdateEvents(
502
510
  chatCtx: llm.ChatContext,
503
511
  addMockAudio: boolean = false,
504
- ): (api_proto.ConversationItemCreateEvent | api_proto.ConversationItemDeleteEvent)[] {
512
+ ): Promise<(api_proto.ConversationItemCreateEvent | api_proto.ConversationItemDeleteEvent)[]> {
505
513
  const newChatCtx = chatCtx.copy();
506
514
  if (addMockAudio) {
507
515
  newChatCtx.items.push(createMockAudioItem());
@@ -533,7 +541,7 @@ export class RealtimeSession extends llm.RealtimeSession {
533
541
  }
534
542
  events.push({
535
543
  type: 'conversation.item.create',
536
- item: livekitItemToOpenAIItem(chatItem),
544
+ item: await livekitItemToOpenAIItem(chatItem),
537
545
  previous_item_id: previousId ?? undefined,
538
546
  event_id: shortuuid('chat_ctx_create_'),
539
547
  } as api_proto.ConversationItemCreateEvent);
@@ -704,7 +712,7 @@ export class RealtimeSession extends llm.RealtimeSession {
704
712
  content: [_options.audioTranscript],
705
713
  });
706
714
  chatCtx.items[idx] = newItem;
707
- const events = this.createChatCtxUpdateEvents(chatCtx);
715
+ const events = await this.createChatCtxUpdateEvents(chatCtx);
708
716
  for (const ev of events) {
709
717
  this.sendEvent(ev);
710
718
  }
@@ -845,7 +853,7 @@ export class RealtimeSession extends llm.RealtimeSession {
845
853
 
846
854
  const oldChatCtx = this.remoteChatCtx;
847
855
  this.remoteChatCtx = new llm.RemoteChatContext();
848
- events.push(...this.createChatCtxUpdateEvents(chatCtx));
856
+ events.push(...(await this.createChatCtxUpdateEvents(chatCtx)));
849
857
 
850
858
  try {
851
859
  for (const ev of events) {
@@ -1560,7 +1568,6 @@ export class RealtimeSession extends llm.RealtimeSession {
1560
1568
  if (event.error.message.startsWith('Cancellation failed')) {
1561
1569
  return;
1562
1570
  }
1563
-
1564
1571
  this.#logger.error({ error: event.error }, 'OpenAI Realtime API returned an error');
1565
1572
  this.emitError({
1566
1573
  error: new APIError(event.error.message, {
@@ -1622,7 +1629,7 @@ export class RealtimeSession extends llm.RealtimeSession {
1622
1629
  }
1623
1630
 
1624
1631
  /** @internal Exported for testing purposes */
1625
- export function livekitItemToOpenAIItem(item: llm.ChatItem): api_proto.ItemResource {
1632
+ export async function livekitItemToOpenAIItem(item: llm.ChatItem): Promise<api_proto.ItemResource> {
1626
1633
  switch (item.type) {
1627
1634
  case 'function_call':
1628
1635
  return {
@@ -1649,8 +1656,22 @@ export function livekitItemToOpenAIItem(item: llm.ChatItem): api_proto.ItemResou
1649
1656
  text: c,
1650
1657
  } as api_proto.InputTextContent | api_proto.OutputTextContent);
1651
1658
  } else if (c.type === 'image_content') {
1652
- // not supported for now
1653
- continue;
1659
+ // only user can send image
1660
+ if (role !== 'user') continue;
1661
+
1662
+ const serialized = await llm.serializeImage(c);
1663
+ if (serialized.externalUrl) {
1664
+ log().warn('External URL is not supported for input_image in realtime API');
1665
+ continue;
1666
+ }
1667
+ if (!serialized.base64Data) {
1668
+ log().warn('Serialized image has no data bytes');
1669
+ continue;
1670
+ }
1671
+ contentList.push({
1672
+ type: 'input_image',
1673
+ image_url: `data:${serialized.mimeType};base64,${serialized.base64Data}`,
1674
+ } as api_proto.InputImageContent);
1654
1675
  } else if (c.type === 'audio_content') {
1655
1676
  if (role === 'user') {
1656
1677
  const encodedAudio = Buffer.from(combineAudioFrames(c.frame).data).toString('base64');
@@ -1699,6 +1720,10 @@ function openAIItemToLivekitItem(item: api_proto.ItemResource): llm.ChatItem {
1699
1720
  for (const c of contents) {
1700
1721
  if (c.type === 'text' || c.type === 'input_text') {
1701
1722
  content.push(c.text);
1723
+ } else if (c.type === 'input_image' && (c as api_proto.InputImageContent).image_url) {
1724
+ content.push(
1725
+ llm.createImageContent({ image: (c as api_proto.InputImageContent).image_url }),
1726
+ );
1702
1727
  }
1703
1728
  }
1704
1729
  return llm.ChatMessage.create({
@@ -448,28 +448,27 @@ export class RealtimeSession extends llm.RealtimeSession {
448
448
 
449
449
  async updateChatCtx(_chatCtx: llm.ChatContext): Promise<void> {
450
450
  const unlock = await this.updateChatCtxLock.lock();
451
- const events = this.createChatCtxUpdateEvents(_chatCtx);
452
- const futures: Future<void>[] = [];
451
+ try {
452
+ const events = await this.createChatCtxUpdateEvents(_chatCtx);
453
+ const futures: Future<void>[] = [];
453
454
 
454
- for (const event of events) {
455
- const future = new Future<void>();
456
- futures.push(future);
455
+ for (const event of events) {
456
+ const future = new Future<void>();
457
+ futures.push(future);
457
458
 
458
- if (event.type === 'conversation.item.create') {
459
- this.itemCreateFutures[event.item.id] = future;
460
- } else if (event.type == 'conversation.item.delete') {
461
- this.itemDeleteFutures[event.item_id] = future;
462
- }
459
+ if (event.type === 'conversation.item.create') {
460
+ this.itemCreateFutures[event.item.id] = future;
461
+ } else if (event.type == 'conversation.item.delete') {
462
+ this.itemDeleteFutures[event.item_id] = future;
463
+ }
463
464
 
464
- this.sendEvent(event);
465
- }
465
+ this.sendEvent(event);
466
+ }
466
467
 
467
- if (futures.length === 0) {
468
- unlock();
469
- return;
470
- }
468
+ if (futures.length === 0) {
469
+ return;
470
+ }
471
471
 
472
- try {
473
472
  // wait for futures to resolve or timeout
474
473
  await Promise.race([
475
474
  Promise.all(futures),
@@ -485,10 +484,10 @@ export class RealtimeSession extends llm.RealtimeSession {
485
484
  }
486
485
  }
487
486
 
488
- private createChatCtxUpdateEvents(
487
+ private async createChatCtxUpdateEvents(
489
488
  chatCtx: llm.ChatContext,
490
489
  addMockAudio: boolean = false,
491
- ): (api_proto.ConversationItemCreateEvent | api_proto.ConversationItemDeleteEvent)[] {
490
+ ): Promise<(api_proto.ConversationItemCreateEvent | api_proto.ConversationItemDeleteEvent)[]> {
492
491
  const newChatCtx = chatCtx.copy();
493
492
  if (addMockAudio) {
494
493
  newChatCtx.items.push(createMockAudioItem());
@@ -520,7 +519,7 @@ export class RealtimeSession extends llm.RealtimeSession {
520
519
  }
521
520
  events.push({
522
521
  type: 'conversation.item.create',
523
- item: livekitItemToOpenAIItem(chatItem),
522
+ item: await livekitItemToOpenAIItem(chatItem),
524
523
  previous_item_id: previousId ?? undefined,
525
524
  event_id: shortuuid('chat_ctx_create_'),
526
525
  } as api_proto.ConversationItemCreateEvent);
@@ -682,7 +681,7 @@ export class RealtimeSession extends llm.RealtimeSession {
682
681
  content: [_options.audioTranscript],
683
682
  });
684
683
  chatCtx.items[idx] = newItem;
685
- const events = this.createChatCtxUpdateEvents(chatCtx);
684
+ const events = await this.createChatCtxUpdateEvents(chatCtx);
686
685
  for (const ev of events) {
687
686
  this.sendEvent(ev);
688
687
  }
@@ -805,7 +804,7 @@ export class RealtimeSession extends llm.RealtimeSession {
805
804
 
806
805
  const oldChatCtx = this.remoteChatCtx;
807
806
  this.remoteChatCtx = new llm.RemoteChatContext();
808
- events.push(...this.createChatCtxUpdateEvents(chatCtx));
807
+ events.push(...(await this.createChatCtxUpdateEvents(chatCtx)));
809
808
 
810
809
  try {
811
810
  for (const ev of events) {
@@ -1521,7 +1520,7 @@ export class RealtimeSession extends llm.RealtimeSession {
1521
1520
  }
1522
1521
  }
1523
1522
 
1524
- function livekitItemToOpenAIItem(item: llm.ChatItem): api_proto.ItemResource {
1523
+ async function livekitItemToOpenAIItem(item: llm.ChatItem): Promise<api_proto.ItemResource> {
1525
1524
  switch (item.type) {
1526
1525
  case 'function_call':
1527
1526
  return {
@@ -1548,8 +1547,22 @@ function livekitItemToOpenAIItem(item: llm.ChatItem): api_proto.ItemResource {
1548
1547
  text: c,
1549
1548
  } as api_proto.InputTextContent);
1550
1549
  } else if (c.type === 'image_content') {
1551
- // not supported for now
1552
- continue;
1550
+ if (role !== 'user') {
1551
+ continue;
1552
+ }
1553
+ const serialized = await llm.serializeImage(c);
1554
+ if (serialized.externalUrl) {
1555
+ log().warn('External URL is not supported for input_image in realtime API');
1556
+ continue;
1557
+ }
1558
+ if (!serialized.base64Data) {
1559
+ log().warn('Serialized image has no data bytes');
1560
+ continue;
1561
+ }
1562
+ contentList.push({
1563
+ type: 'input_image',
1564
+ image_url: `data:${serialized.mimeType};base64,${serialized.base64Data}`,
1565
+ } as api_proto.InputImageContent);
1553
1566
  } else if (c.type === 'audio_content') {
1554
1567
  if (role === 'user') {
1555
1568
  const encodedAudio = Buffer.from(combineAudioFrames(c.frame).data).toString('base64');
@@ -1598,6 +1611,10 @@ function openAIItemToLivekitItem(item: api_proto.ItemResource): llm.ChatItem {
1598
1611
  for (const c of contents) {
1599
1612
  if (c.type === 'text' || c.type === 'input_text') {
1600
1613
  content.push(c.text);
1614
+ } else if (c.type === 'input_image' && (c as api_proto.InputImageContent).image_url) {
1615
+ content.push(
1616
+ llm.createImageContent({ image: (c as api_proto.InputImageContent).image_url }),
1617
+ );
1601
1618
  }
1602
1619
  }
1603
1620
  return llm.ChatMessage.create({