@livekit/agents-plugin-openai 1.0.51 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/dist/llm.cjs +8 -0
- package/dist/llm.cjs.map +1 -1
- package/dist/llm.d.cts +1 -0
- package/dist/llm.d.ts +1 -0
- package/dist/llm.d.ts.map +1 -1
- package/dist/llm.js +8 -0
- package/dist/llm.js.map +1 -1
- package/dist/realtime/api_proto.cjs.map +1 -1
- package/dist/realtime/api_proto.d.cts +7 -3
- package/dist/realtime/api_proto.d.ts +7 -3
- package/dist/realtime/api_proto.d.ts.map +1 -1
- package/dist/realtime/api_proto.js.map +1 -1
- package/dist/realtime/realtime_model.cjs +46 -22
- package/dist/realtime/realtime_model.cjs.map +1 -1
- package/dist/realtime/realtime_model.d.cts +2 -1
- package/dist/realtime/realtime_model.d.ts +2 -1
- package/dist/realtime/realtime_model.d.ts.map +1 -1
- package/dist/realtime/realtime_model.js +46 -22
- package/dist/realtime/realtime_model.js.map +1 -1
- package/dist/realtime/realtime_model.test.cjs +104 -14
- package/dist/realtime/realtime_model.test.cjs.map +1 -1
- package/dist/realtime/realtime_model.test.js +104 -14
- package/dist/realtime/realtime_model.test.js.map +1 -1
- package/dist/realtime/realtime_model_beta.cjs +40 -22
- package/dist/realtime/realtime_model_beta.cjs.map +1 -1
- package/dist/realtime/realtime_model_beta.d.ts.map +1 -1
- package/dist/realtime/realtime_model_beta.js +40 -22
- package/dist/realtime/realtime_model_beta.js.map +1 -1
- package/dist/stt.cjs +11 -0
- package/dist/stt.cjs.map +1 -1
- package/dist/stt.d.cts +2 -0
- package/dist/stt.d.ts +2 -0
- package/dist/stt.d.ts.map +1 -1
- package/dist/stt.js +11 -0
- package/dist/stt.js.map +1 -1
- package/dist/tts.cjs +11 -0
- package/dist/tts.cjs.map +1 -1
- package/dist/tts.d.cts +2 -0
- package/dist/tts.d.ts +2 -0
- package/dist/tts.d.ts.map +1 -1
- package/dist/tts.js +11 -0
- package/dist/tts.js.map +1 -1
- package/package.json +5 -5
- package/src/llm.ts +9 -0
- package/src/realtime/api_proto.ts +8 -2
- package/src/realtime/realtime_model.test.ts +129 -14
- package/src/realtime/realtime_model.ts +51 -26
- package/src/realtime/realtime_model_beta.ts +42 -25
- package/src/stt.ts +13 -0
- package/src/tts.ts +13 -0
package/dist/tts.d.ts
CHANGED
package/dist/tts.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tts.d.ts","sourceRoot":"","sources":["../src/tts.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,KAAK,iBAAiB,EAA8B,GAAG,EAAE,MAAM,iBAAiB,CAAC;AAE1F,OAAO,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAChC,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAKxD,MAAM,WAAW,UAAU;IACzB,KAAK,EAAE,SAAS,GAAG,MAAM,CAAC;IAC1B,KAAK,EAAE,SAAS,CAAC;IACjB,KAAK,EAAE,MAAM,CAAC;IACd,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AASD,qBAAa,GAAI,SAAQ,GAAG,CAAC,GAAG;;IAG9B,KAAK,SAAgB;IACrB,OAAO,CAAC,eAAe,CAAyB;IAEhD;;;;;;OAMG;gBACS,IAAI,GAAE,OAAO,CAAC,UAAU,CAAqB;IAgBzD,aAAa,CAAC,IAAI,EAAE;QAAE,KAAK,CAAC,EAAE,SAAS,GAAG,MAAM,CAAC;QAAC,KAAK,CAAC,EAAE,SAAS,CAAC;QAAC,KAAK,CAAC,EAAE,MAAM,CAAA;KAAE;IAIrF,UAAU,CACR,IAAI,EAAE,MAAM,EACZ,WAAW,CAAC,EAAE,iBAAiB,EAC/B,WAAW,CAAC,EAAE,WAAW,GACxB,aAAa;IAoBhB,MAAM,IAAI,GAAG,CAAC,gBAAgB;IAIxB,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;CAG7B;AAED,qBAAa,aAAc,SAAQ,GAAG,CAAC,aAAa;IAClD,KAAK,SAA0B;IAC/B,OAAO,CAAC,MAAM,CAAe;gBAI3B,GAAG,EAAE,GAAG,EACR,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,OAAO,CAAC,GAAG,CAAC,EACpB,WAAW,CAAC,EAAE,iBAAiB,EAC/B,WAAW,CAAC,EAAE,WAAW;cAMX,GAAG;CA+BpB"}
|
|
1
|
+
{"version":3,"file":"tts.d.ts","sourceRoot":"","sources":["../src/tts.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,KAAK,iBAAiB,EAA8B,GAAG,EAAE,MAAM,iBAAiB,CAAC;AAE1F,OAAO,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAChC,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAKxD,MAAM,WAAW,UAAU;IACzB,KAAK,EAAE,SAAS,GAAG,MAAM,CAAC;IAC1B,KAAK,EAAE,SAAS,CAAC;IACjB,KAAK,EAAE,MAAM,CAAC;IACd,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AASD,qBAAa,GAAI,SAAQ,GAAG,CAAC,GAAG;;IAG9B,KAAK,SAAgB;IACrB,OAAO,CAAC,eAAe,CAAyB;IAEhD,IAAI,KAAK,IAAI,MAAM,CAElB;IAED,IAAI,QAAQ,IAAI,MAAM,CAOrB;IAED;;;;;;OAMG;gBACS,IAAI,GAAE,OAAO,CAAC,UAAU,CAAqB;IAgBzD,aAAa,CAAC,IAAI,EAAE;QAAE,KAAK,CAAC,EAAE,SAAS,GAAG,MAAM,CAAC;QAAC,KAAK,CAAC,EAAE,SAAS,CAAC;QAAC,KAAK,CAAC,EAAE,MAAM,CAAA;KAAE;IAIrF,UAAU,CACR,IAAI,EAAE,MAAM,EACZ,WAAW,CAAC,EAAE,iBAAiB,EAC/B,WAAW,CAAC,EAAE,WAAW,GACxB,aAAa;IAoBhB,MAAM,IAAI,GAAG,CAAC,gBAAgB;IAIxB,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;CAG7B;AAED,qBAAa,aAAc,SAAQ,GAAG,CAAC,aAAa;IAClD,KAAK,SAA0B;IAC/B,OAAO,CAAC,MAAM,CAAe;gBAI3B,GAAG,EAAE,GAAG,EACR,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,OAAO,CAAC,GAAG,CAAC,EACpB,WAAW,CAAC,EAAE,iBAAiB,EAC/B,WAAW,CAAC,EAAE,WAAW;cAMX,GAAG;CA+BpB"}
|
package/dist/tts.js
CHANGED
|
@@ -13,6 +13,17 @@ class TTS extends tts.TTS {
|
|
|
13
13
|
#client;
|
|
14
14
|
label = "openai.TTS";
|
|
15
15
|
abortController = new AbortController();
|
|
16
|
+
get model() {
|
|
17
|
+
return this.#opts.model;
|
|
18
|
+
}
|
|
19
|
+
get provider() {
|
|
20
|
+
try {
|
|
21
|
+
const url = new URL(this.#client.baseURL);
|
|
22
|
+
return url.host;
|
|
23
|
+
} catch {
|
|
24
|
+
return "api.openai.com";
|
|
25
|
+
}
|
|
26
|
+
}
|
|
16
27
|
/**
|
|
17
28
|
* Create a new instance of OpenAI TTS.
|
|
18
29
|
*
|
package/dist/tts.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/tts.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { type APIConnectOptions, AudioByteStream, shortuuid, tts } from '@livekit/agents';\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { OpenAI } from 'openai';\nimport type { TTSModels, TTSVoices } from './models.js';\n\nconst OPENAI_TTS_SAMPLE_RATE = 24000;\nconst OPENAI_TTS_CHANNELS = 1;\n\nexport interface TTSOptions {\n model: TTSModels | string;\n voice: TTSVoices;\n speed: number;\n instructions?: string;\n baseURL?: string;\n client?: OpenAI;\n apiKey?: string;\n}\n\nconst defaultTTSOptions: TTSOptions = {\n apiKey: process.env.OPENAI_API_KEY,\n model: 'tts-1',\n voice: 'alloy',\n speed: 1,\n};\n\nexport class TTS extends tts.TTS {\n #opts: TTSOptions;\n #client: OpenAI;\n label = 'openai.TTS';\n private abortController = new AbortController();\n\n /**\n * Create a new instance of OpenAI TTS.\n *\n * @remarks\n * `apiKey` must be set to your OpenAI API key, either using the argument or by setting the\n * `OPENAI_API_KEY` environment variable.\n */\n constructor(opts: Partial<TTSOptions> = defaultTTSOptions) {\n super(OPENAI_TTS_SAMPLE_RATE, OPENAI_TTS_CHANNELS, { streaming: false });\n\n this.#opts = { ...defaultTTSOptions, ...opts };\n if (this.#opts.apiKey === undefined && !this.#opts.client) {\n throw new Error('OpenAI API key is required, whether as an argument or as $OPENAI_API_KEY');\n }\n\n this.#client =\n this.#opts.client ||\n new OpenAI({\n baseURL: this.#opts.baseURL,\n apiKey: this.#opts.apiKey,\n });\n }\n\n updateOptions(opts: { model?: TTSModels | string; voice?: TTSVoices; speed?: number }) {\n this.#opts = { ...this.#opts, ...opts };\n }\n\n synthesize(\n text: string,\n connOptions?: APIConnectOptions,\n abortSignal?: AbortSignal,\n ): ChunkedStream {\n return new ChunkedStream(\n this,\n text,\n this.#client.audio.speech.create(\n {\n input: text,\n model: 
this.#opts.model,\n voice: this.#opts.voice,\n instructions: this.#opts.instructions,\n response_format: 'pcm',\n speed: this.#opts.speed,\n },\n { signal: abortSignal },\n ),\n connOptions,\n abortSignal,\n );\n }\n\n stream(): tts.SynthesizeStream {\n throw new Error('Streaming is not supported on OpenAI TTS');\n }\n\n async close(): Promise<void> {\n this.abortController.abort();\n }\n}\n\nexport class ChunkedStream extends tts.ChunkedStream {\n label = 'openai.ChunkedStream';\n private stream: Promise<any>;\n\n // set Promise<T> to any because OpenAI returns an annoying Response type\n constructor(\n tts: TTS,\n text: string,\n stream: Promise<any>,\n connOptions?: APIConnectOptions,\n abortSignal?: AbortSignal,\n ) {\n super(text, tts, connOptions, abortSignal);\n this.stream = stream;\n }\n\n protected async run() {\n try {\n const buffer = await this.stream.then((r) => r.arrayBuffer());\n const requestId = shortuuid();\n const audioByteStream = new AudioByteStream(OPENAI_TTS_SAMPLE_RATE, OPENAI_TTS_CHANNELS);\n const frames = audioByteStream.write(buffer);\n\n let lastFrame: AudioFrame | undefined;\n const sendLastFrame = (segmentId: string, final: boolean) => {\n if (lastFrame) {\n this.queue.put({ requestId, segmentId, frame: lastFrame, final });\n lastFrame = undefined;\n }\n };\n\n for (const frame of frames) {\n sendLastFrame(requestId, false);\n lastFrame = frame;\n }\n sendLastFrame(requestId, true);\n\n this.queue.close();\n } catch (error) {\n if (error instanceof Error && error.name === 'AbortError') {\n return;\n }\n throw error;\n } finally {\n this.queue.close();\n }\n }\n}\n"],"mappings":"AAGA,SAAiC,iBAAiB,WAAW,WAAW;AAExE,SAAS,cAAc;AAGvB,MAAM,yBAAyB;AAC/B,MAAM,sBAAsB;AAY5B,MAAM,oBAAgC;AAAA,EACpC,QAAQ,QAAQ,IAAI;AAAA,EACpB,OAAO;AAAA,EACP,OAAO;AAAA,EACP,OAAO;AACT;AAEO,MAAM,YAAY,IAAI,IAAI;AAAA,EAC/B;AAAA,EACA;AAAA,EACA,QAAQ;AAAA,EACA,kBAAkB,IAAI,gBAAgB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,
|
|
1
|
+
{"version":3,"sources":["../src/tts.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { type APIConnectOptions, AudioByteStream, shortuuid, tts } from '@livekit/agents';\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { OpenAI } from 'openai';\nimport type { TTSModels, TTSVoices } from './models.js';\n\nconst OPENAI_TTS_SAMPLE_RATE = 24000;\nconst OPENAI_TTS_CHANNELS = 1;\n\nexport interface TTSOptions {\n model: TTSModels | string;\n voice: TTSVoices;\n speed: number;\n instructions?: string;\n baseURL?: string;\n client?: OpenAI;\n apiKey?: string;\n}\n\nconst defaultTTSOptions: TTSOptions = {\n apiKey: process.env.OPENAI_API_KEY,\n model: 'tts-1',\n voice: 'alloy',\n speed: 1,\n};\n\nexport class TTS extends tts.TTS {\n #opts: TTSOptions;\n #client: OpenAI;\n label = 'openai.TTS';\n private abortController = new AbortController();\n\n get model(): string {\n return this.#opts.model;\n }\n\n get provider(): string {\n try {\n const url = new URL(this.#client.baseURL);\n return url.host;\n } catch {\n return 'api.openai.com';\n }\n }\n\n /**\n * Create a new instance of OpenAI TTS.\n *\n * @remarks\n * `apiKey` must be set to your OpenAI API key, either using the argument or by setting the\n * `OPENAI_API_KEY` environment variable.\n */\n constructor(opts: Partial<TTSOptions> = defaultTTSOptions) {\n super(OPENAI_TTS_SAMPLE_RATE, OPENAI_TTS_CHANNELS, { streaming: false });\n\n this.#opts = { ...defaultTTSOptions, ...opts };\n if (this.#opts.apiKey === undefined && !this.#opts.client) {\n throw new Error('OpenAI API key is required, whether as an argument or as $OPENAI_API_KEY');\n }\n\n this.#client =\n this.#opts.client ||\n new OpenAI({\n baseURL: this.#opts.baseURL,\n apiKey: this.#opts.apiKey,\n });\n }\n\n updateOptions(opts: { model?: TTSModels | string; voice?: TTSVoices; speed?: number }) {\n this.#opts = { ...this.#opts, ...opts };\n }\n\n synthesize(\n text: string,\n 
connOptions?: APIConnectOptions,\n abortSignal?: AbortSignal,\n ): ChunkedStream {\n return new ChunkedStream(\n this,\n text,\n this.#client.audio.speech.create(\n {\n input: text,\n model: this.#opts.model,\n voice: this.#opts.voice,\n instructions: this.#opts.instructions,\n response_format: 'pcm',\n speed: this.#opts.speed,\n },\n { signal: abortSignal },\n ),\n connOptions,\n abortSignal,\n );\n }\n\n stream(): tts.SynthesizeStream {\n throw new Error('Streaming is not supported on OpenAI TTS');\n }\n\n async close(): Promise<void> {\n this.abortController.abort();\n }\n}\n\nexport class ChunkedStream extends tts.ChunkedStream {\n label = 'openai.ChunkedStream';\n private stream: Promise<any>;\n\n // set Promise<T> to any because OpenAI returns an annoying Response type\n constructor(\n tts: TTS,\n text: string,\n stream: Promise<any>,\n connOptions?: APIConnectOptions,\n abortSignal?: AbortSignal,\n ) {\n super(text, tts, connOptions, abortSignal);\n this.stream = stream;\n }\n\n protected async run() {\n try {\n const buffer = await this.stream.then((r) => r.arrayBuffer());\n const requestId = shortuuid();\n const audioByteStream = new AudioByteStream(OPENAI_TTS_SAMPLE_RATE, OPENAI_TTS_CHANNELS);\n const frames = audioByteStream.write(buffer);\n\n let lastFrame: AudioFrame | undefined;\n const sendLastFrame = (segmentId: string, final: boolean) => {\n if (lastFrame) {\n this.queue.put({ requestId, segmentId, frame: lastFrame, final });\n lastFrame = undefined;\n }\n };\n\n for (const frame of frames) {\n sendLastFrame(requestId, false);\n lastFrame = frame;\n }\n sendLastFrame(requestId, true);\n\n this.queue.close();\n } catch (error) {\n if (error instanceof Error && error.name === 'AbortError') {\n return;\n }\n throw error;\n } finally {\n this.queue.close();\n }\n 
}\n}\n"],"mappings":"AAGA,SAAiC,iBAAiB,WAAW,WAAW;AAExE,SAAS,cAAc;AAGvB,MAAM,yBAAyB;AAC/B,MAAM,sBAAsB;AAY5B,MAAM,oBAAgC;AAAA,EACpC,QAAQ,QAAQ,IAAI;AAAA,EACpB,OAAO;AAAA,EACP,OAAO;AAAA,EACP,OAAO;AACT;AAEO,MAAM,YAAY,IAAI,IAAI;AAAA,EAC/B;AAAA,EACA;AAAA,EACA,QAAQ;AAAA,EACA,kBAAkB,IAAI,gBAAgB;AAAA,EAE9C,IAAI,QAAgB;AAClB,WAAO,KAAK,MAAM;AAAA,EACpB;AAAA,EAEA,IAAI,WAAmB;AACrB,QAAI;AACF,YAAM,MAAM,IAAI,IAAI,KAAK,QAAQ,OAAO;AACxC,aAAO,IAAI;AAAA,IACb,QAAQ;AACN,aAAO;AAAA,IACT;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EASA,YAAY,OAA4B,mBAAmB;AACzD,UAAM,wBAAwB,qBAAqB,EAAE,WAAW,MAAM,CAAC;AAEvE,SAAK,QAAQ,EAAE,GAAG,mBAAmB,GAAG,KAAK;AAC7C,QAAI,KAAK,MAAM,WAAW,UAAa,CAAC,KAAK,MAAM,QAAQ;AACzD,YAAM,IAAI,MAAM,0EAA0E;AAAA,IAC5F;AAEA,SAAK,UACH,KAAK,MAAM,UACX,IAAI,OAAO;AAAA,MACT,SAAS,KAAK,MAAM;AAAA,MACpB,QAAQ,KAAK,MAAM;AAAA,IACrB,CAAC;AAAA,EACL;AAAA,EAEA,cAAc,MAAyE;AACrF,SAAK,QAAQ,EAAE,GAAG,KAAK,OAAO,GAAG,KAAK;AAAA,EACxC;AAAA,EAEA,WACE,MACA,aACA,aACe;AACf,WAAO,IAAI;AAAA,MACT;AAAA,MACA;AAAA,MACA,KAAK,QAAQ,MAAM,OAAO;AAAA,QACxB;AAAA,UACE,OAAO;AAAA,UACP,OAAO,KAAK,MAAM;AAAA,UAClB,OAAO,KAAK,MAAM;AAAA,UAClB,cAAc,KAAK,MAAM;AAAA,UACzB,iBAAiB;AAAA,UACjB,OAAO,KAAK,MAAM;AAAA,QACpB;AAAA,QACA,EAAE,QAAQ,YAAY;AAAA,MACxB;AAAA,MACA;AAAA,MACA;AAAA,IACF;AAAA,EACF;AAAA,EAEA,SAA+B;AAC7B,UAAM,IAAI,MAAM,0CAA0C;AAAA,EAC5D;AAAA,EAEA,MAAM,QAAuB;AAC3B,SAAK,gBAAgB,MAAM;AAAA,EAC7B;AACF;AAEO,MAAM,sBAAsB,IAAI,cAAc;AAAA,EACnD,QAAQ;AAAA,EACA;AAAA;AAAA,EAGR,YACEA,MACA,MACA,QACA,aACA,aACA;AACA,UAAM,MAAMA,MAAK,aAAa,WAAW;AACzC,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,MAAgB,MAAM;AACpB,QAAI;AACF,YAAM,SAAS,MAAM,KAAK,OAAO,KAAK,CAAC,MAAM,EAAE,YAAY,CAAC;AAC5D,YAAM,YAAY,UAAU;AAC5B,YAAM,kBAAkB,IAAI,gBAAgB,wBAAwB,mBAAmB;AACvF,YAAM,SAAS,gBAAgB,MAAM,MAAM;AAE3C,UAAI;AACJ,YAAM,gBAAgB,CAAC,WAAmB,UAAmB;AAC3D,YAAI,WAAW;AACb,eAAK,MAAM,IAAI,EAAE,WAAW,WAAW,OAAO,WAAW,MAAM,CAAC;AAChE,sBAAY;AAAA,QACd;AAAA,MACF;AAEA,iBAAW,SAAS,QAAQ;AAC1B,sBAAc,WAAW,KAAK;AAC9B,oBAAY;AAAA,MACd;AACA,oBAAc,WAAW,IAAI;AAE7B,WAAK,MAAM,MAAM;AAAA,IACnB,SAAS,OAAO;AACd,UAAI,iBAAiB,SAAS,MA
AM,SAAS,cAAc;AACzD;AAAA,MACF;AACA,YAAM;AAAA,IACR,UAAE;AACA,WAAK,MAAM,MAAM;AAAA,IACnB;AAAA,EACF;AACF;","names":["tts"]}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@livekit/agents-plugin-openai",
|
|
3
|
-
"version": "1.0.51",
|
|
3
|
+
"version": "1.2.0",
|
|
4
4
|
"description": "OpenAI plugin for LiveKit Node Agents",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"require": "dist/index.cjs",
|
|
@@ -30,9 +30,9 @@
|
|
|
30
30
|
"@types/ws": "^8.5.10",
|
|
31
31
|
"tsup": "^8.3.5",
|
|
32
32
|
"typescript": "^5.0.0",
|
|
33
|
-
"@livekit/agents": "1.0.51",
|
|
34
|
-
"@livekit/agents-plugin-silero": "1.0.51",
|
|
35
|
-
"@livekit/agents-plugins-test": "1.0.51"
|
|
33
|
+
"@livekit/agents": "1.2.0",
|
|
34
|
+
"@livekit/agents-plugin-silero": "1.2.0",
|
|
35
|
+
"@livekit/agents-plugins-test": "1.2.0"
|
|
36
36
|
},
|
|
37
37
|
"dependencies": {
|
|
38
38
|
"@livekit/mutex": "^1.1.1",
|
|
@@ -42,7 +42,7 @@
|
|
|
42
42
|
"peerDependencies": {
|
|
43
43
|
"@livekit/rtc-node": "^0.13.24",
|
|
44
44
|
"zod": "^3.25.76 || ^4.1.8",
|
|
45
|
-
"@livekit/agents": "1.0.51"
|
|
45
|
+
"@livekit/agents": "1.2.0"
|
|
46
46
|
},
|
|
47
47
|
"scripts": {
|
|
48
48
|
"build": "tsup --onSuccess \"pnpm build:types\"",
|
package/src/llm.ts
CHANGED
|
@@ -86,6 +86,15 @@ export class LLM extends llm.LLM {
|
|
|
86
86
|
return this.#opts.model;
|
|
87
87
|
}
|
|
88
88
|
|
|
89
|
+
get provider(): string {
|
|
90
|
+
try {
|
|
91
|
+
const url = new URL(this.#client.baseURL);
|
|
92
|
+
return url.host;
|
|
93
|
+
} catch {
|
|
94
|
+
return 'api.openai.com';
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
|
|
89
98
|
/**
|
|
90
99
|
* Create a new instance of OpenAI LLM with Azure.
|
|
91
100
|
*
|
|
package/src/realtime/api_proto.ts
CHANGED
|
@@ -162,6 +162,11 @@ export interface InputAudioContent {
|
|
|
162
162
|
audio: AudioBase64Bytes;
|
|
163
163
|
}
|
|
164
164
|
|
|
165
|
+
export interface InputImageContent {
|
|
166
|
+
type: 'input_image';
|
|
167
|
+
image_url: string;
|
|
168
|
+
}
|
|
169
|
+
|
|
165
170
|
export interface TextContent {
|
|
166
171
|
type: 'text';
|
|
167
172
|
text: string;
|
|
@@ -181,6 +186,7 @@ export interface AudioContent {
|
|
|
181
186
|
export type Content =
|
|
182
187
|
| InputTextContent
|
|
183
188
|
| InputAudioContent
|
|
189
|
+
| InputImageContent
|
|
184
190
|
| TextContent
|
|
185
191
|
| OutputTextContent
|
|
186
192
|
| AudioContent;
|
|
@@ -206,7 +212,7 @@ export interface SystemItem extends BaseItem {
|
|
|
206
212
|
export interface UserItem extends BaseItem {
|
|
207
213
|
type: 'message';
|
|
208
214
|
role: 'user';
|
|
209
|
-
content: (InputTextContent | InputAudioContent)[];
|
|
215
|
+
content: (InputTextContent | InputAudioContent | InputImageContent)[];
|
|
210
216
|
}
|
|
211
217
|
|
|
212
218
|
export interface AssistantItem extends BaseItem {
|
|
@@ -361,7 +367,7 @@ export interface UserItemCreate {
|
|
|
361
367
|
id: string;
|
|
362
368
|
type: 'message';
|
|
363
369
|
role: 'user';
|
|
364
|
-
content: (InputTextContent | InputAudioContent)[];
|
|
370
|
+
content: (InputTextContent | InputAudioContent | InputImageContent)[];
|
|
365
371
|
}
|
|
366
372
|
|
|
367
373
|
export interface AssistantItemCreate {
|
|
package/src/realtime/realtime_model.test.ts
CHANGED
|
@@ -8,14 +8,14 @@ import { livekitItemToOpenAIItem } from './realtime_model.js';
|
|
|
8
8
|
|
|
9
9
|
describe('livekitItemToOpenAIItem', () => {
|
|
10
10
|
describe('message items', () => {
|
|
11
|
-
it('should use output_text type for assistant messages', () => {
|
|
11
|
+
it('should use output_text type for assistant messages', async () => {
|
|
12
12
|
const assistantMessage = new llm.ChatMessage({
|
|
13
13
|
role: 'assistant',
|
|
14
14
|
content: 'Hello, how can I help you?',
|
|
15
15
|
id: 'test-assistant-msg',
|
|
16
16
|
});
|
|
17
17
|
|
|
18
|
-
const result = livekitItemToOpenAIItem(assistantMessage) as api_proto.AssistantItem;
|
|
18
|
+
const result = (await livekitItemToOpenAIItem(assistantMessage)) as api_proto.AssistantItem;
|
|
19
19
|
|
|
20
20
|
expect(result.type).toBe('message');
|
|
21
21
|
expect(result.role).toBe('assistant');
|
|
@@ -25,14 +25,14 @@ describe('livekitItemToOpenAIItem', () => {
|
|
|
25
25
|
expect((content as api_proto.OutputTextContent).text).toBe('Hello, how can I help you?');
|
|
26
26
|
});
|
|
27
27
|
|
|
28
|
-
it('should use input_text type for user messages', () => {
|
|
28
|
+
it('should use input_text type for user messages', async () => {
|
|
29
29
|
const userMessage = new llm.ChatMessage({
|
|
30
30
|
role: 'user',
|
|
31
31
|
content: 'What is the weather like?',
|
|
32
32
|
id: 'test-user-msg',
|
|
33
33
|
});
|
|
34
34
|
|
|
35
|
-
const result = livekitItemToOpenAIItem(userMessage) as api_proto.UserItem;
|
|
35
|
+
const result = (await livekitItemToOpenAIItem(userMessage)) as api_proto.UserItem;
|
|
36
36
|
|
|
37
37
|
expect(result.type).toBe('message');
|
|
38
38
|
expect(result.role).toBe('user');
|
|
@@ -42,14 +42,14 @@ describe('livekitItemToOpenAIItem', () => {
|
|
|
42
42
|
expect((content as api_proto.InputTextContent).text).toBe('What is the weather like?');
|
|
43
43
|
});
|
|
44
44
|
|
|
45
|
-
it('should use input_text type for system messages', () => {
|
|
45
|
+
it('should use input_text type for system messages', async () => {
|
|
46
46
|
const systemMessage = new llm.ChatMessage({
|
|
47
47
|
role: 'system',
|
|
48
48
|
content: 'You are a helpful assistant.',
|
|
49
49
|
id: 'test-system-msg',
|
|
50
50
|
});
|
|
51
51
|
|
|
52
|
-
const result = livekitItemToOpenAIItem(systemMessage) as api_proto.UserItem;
|
|
52
|
+
const result = (await livekitItemToOpenAIItem(systemMessage)) as api_proto.UserItem;
|
|
53
53
|
|
|
54
54
|
expect(result.type).toBe('message');
|
|
55
55
|
expect(result.role).toBe('system');
|
|
@@ -58,14 +58,14 @@ describe('livekitItemToOpenAIItem', () => {
|
|
|
58
58
|
expect(content.type).toBe('input_text');
|
|
59
59
|
});
|
|
60
60
|
|
|
61
|
-
it('should convert developer role to system role', () => {
|
|
61
|
+
it('should convert developer role to system role', async () => {
|
|
62
62
|
const developerMessage = new llm.ChatMessage({
|
|
63
63
|
role: 'developer',
|
|
64
64
|
content: 'System instructions.',
|
|
65
65
|
id: 'test-developer-msg',
|
|
66
66
|
});
|
|
67
67
|
|
|
68
|
-
const result = livekitItemToOpenAIItem(developerMessage) as api_proto.UserItem;
|
|
68
|
+
const result = (await livekitItemToOpenAIItem(developerMessage)) as api_proto.UserItem;
|
|
69
69
|
|
|
70
70
|
expect(result.type).toBe('message');
|
|
71
71
|
expect(result.role).toBe('system');
|
|
@@ -73,14 +73,16 @@ describe('livekitItemToOpenAIItem', () => {
|
|
|
73
73
|
expect(content.type).toBe('input_text');
|
|
74
74
|
});
|
|
75
75
|
|
|
76
|
-
it('should handle multiple content items for assistant', () => {
|
|
76
|
+
it('should handle multiple content items for assistant', async () => {
|
|
77
77
|
const multiContentMessage = new llm.ChatMessage({
|
|
78
78
|
role: 'assistant',
|
|
79
79
|
content: ['First part.', 'Second part.'],
|
|
80
80
|
id: 'test-multi-msg',
|
|
81
81
|
});
|
|
82
82
|
|
|
83
|
-
const result = livekitItemToOpenAIItem(
|
|
83
|
+
const result = (await livekitItemToOpenAIItem(
|
|
84
|
+
multiContentMessage,
|
|
85
|
+
)) as api_proto.AssistantItem;
|
|
84
86
|
|
|
85
87
|
expect(result.content).toHaveLength(2);
|
|
86
88
|
const content0 = result.content[0]!;
|
|
@@ -88,10 +90,121 @@ describe('livekitItemToOpenAIItem', () => {
|
|
|
88
90
|
expect(content0.type).toBe('output_text');
|
|
89
91
|
expect(content1.type).toBe('output_text');
|
|
90
92
|
});
|
|
93
|
+
|
|
94
|
+
it('should convert image content to input_image for user messages', async () => {
|
|
95
|
+
const base64Data =
|
|
96
|
+
'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg==';
|
|
97
|
+
const imageContent = llm.createImageContent({
|
|
98
|
+
image: `data:image/png;base64,${base64Data}`,
|
|
99
|
+
mimeType: 'image/png',
|
|
100
|
+
});
|
|
101
|
+
|
|
102
|
+
const userMessage = new llm.ChatMessage({
|
|
103
|
+
role: 'user',
|
|
104
|
+
content: [imageContent],
|
|
105
|
+
id: 'test-image-msg',
|
|
106
|
+
});
|
|
107
|
+
|
|
108
|
+
const result = (await livekitItemToOpenAIItem(userMessage)) as api_proto.UserItem;
|
|
109
|
+
|
|
110
|
+
expect(result.type).toBe('message');
|
|
111
|
+
expect(result.role).toBe('user');
|
|
112
|
+
expect(result.content).toHaveLength(1);
|
|
113
|
+
const content = result.content[0]!;
|
|
114
|
+
expect(content.type).toBe('input_image');
|
|
115
|
+
expect((content as api_proto.InputImageContent).image_url).toBe(
|
|
116
|
+
`data:image/png;base64,${base64Data}`,
|
|
117
|
+
);
|
|
118
|
+
});
|
|
119
|
+
|
|
120
|
+
it('should ignore image content for assistant messages', async () => {
|
|
121
|
+
const base64Data =
|
|
122
|
+
'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg==';
|
|
123
|
+
const imageContent = llm.createImageContent({
|
|
124
|
+
image: `data:image/png;base64,${base64Data}`,
|
|
125
|
+
mimeType: 'image/png',
|
|
126
|
+
});
|
|
127
|
+
|
|
128
|
+
const assistantMessage = new llm.ChatMessage({
|
|
129
|
+
role: 'assistant',
|
|
130
|
+
content: [imageContent],
|
|
131
|
+
id: 'test-assistant-image-msg',
|
|
132
|
+
});
|
|
133
|
+
|
|
134
|
+
const result = (await livekitItemToOpenAIItem(assistantMessage)) as api_proto.AssistantItem;
|
|
135
|
+
|
|
136
|
+
expect(result.type).toBe('message');
|
|
137
|
+
expect(result.role).toBe('assistant');
|
|
138
|
+
expect(result.content).toHaveLength(0);
|
|
139
|
+
});
|
|
140
|
+
|
|
141
|
+
it('should ignore image content for system messages', async () => {
|
|
142
|
+
const base64Data =
|
|
143
|
+
'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg==';
|
|
144
|
+
const imageContent = llm.createImageContent({
|
|
145
|
+
image: `data:image/png;base64,${base64Data}`,
|
|
146
|
+
mimeType: 'image/png',
|
|
147
|
+
});
|
|
148
|
+
|
|
149
|
+
const systemMessage = new llm.ChatMessage({
|
|
150
|
+
role: 'system',
|
|
151
|
+
content: [imageContent],
|
|
152
|
+
id: 'test-system-image-msg',
|
|
153
|
+
});
|
|
154
|
+
|
|
155
|
+
const result = (await livekitItemToOpenAIItem(systemMessage)) as api_proto.SystemItem;
|
|
156
|
+
|
|
157
|
+
expect(result.type).toBe('message');
|
|
158
|
+
expect(result.role).toBe('system');
|
|
159
|
+
expect(result.content).toHaveLength(0);
|
|
160
|
+
});
|
|
161
|
+
|
|
162
|
+
it('should ignore image content for developer messages mapped to system', async () => {
|
|
163
|
+
const base64Data =
|
|
164
|
+
'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg==';
|
|
165
|
+
const imageContent = llm.createImageContent({
|
|
166
|
+
image: `data:image/png;base64,${base64Data}`,
|
|
167
|
+
mimeType: 'image/png',
|
|
168
|
+
});
|
|
169
|
+
|
|
170
|
+
const developerMessage = new llm.ChatMessage({
|
|
171
|
+
role: 'developer',
|
|
172
|
+
content: [imageContent],
|
|
173
|
+
id: 'test-developer-image-msg',
|
|
174
|
+
});
|
|
175
|
+
|
|
176
|
+
const result = (await livekitItemToOpenAIItem(developerMessage)) as api_proto.SystemItem;
|
|
177
|
+
|
|
178
|
+
expect(result.type).toBe('message');
|
|
179
|
+
expect(result.role).toBe('system');
|
|
180
|
+
expect(result.content).toHaveLength(0);
|
|
181
|
+
});
|
|
182
|
+
|
|
183
|
+
it('should handle mixed text and image content', async () => {
|
|
184
|
+
const base64Data =
|
|
185
|
+
'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg==';
|
|
186
|
+
const imageContent = llm.createImageContent({
|
|
187
|
+
image: `data:image/png;base64,${base64Data}`,
|
|
188
|
+
mimeType: 'image/png',
|
|
189
|
+
});
|
|
190
|
+
|
|
191
|
+
const userMessage = new llm.ChatMessage({
|
|
192
|
+
role: 'user',
|
|
193
|
+
content: ['Describe this image:', imageContent],
|
|
194
|
+
id: 'test-mixed-msg',
|
|
195
|
+
});
|
|
196
|
+
|
|
197
|
+
const result = (await livekitItemToOpenAIItem(userMessage)) as api_proto.UserItem;
|
|
198
|
+
|
|
199
|
+
expect(result.type).toBe('message');
|
|
200
|
+
expect(result.content).toHaveLength(2);
|
|
201
|
+
expect(result.content[0]!.type).toBe('input_text');
|
|
202
|
+
expect(result.content[1]!.type).toBe('input_image');
|
|
203
|
+
});
|
|
91
204
|
});
|
|
92
205
|
|
|
93
206
|
describe('function_call items', () => {
|
|
94
|
-
it('should convert function call items correctly', () => {
|
|
207
|
+
it('should convert function call items correctly', async () => {
|
|
95
208
|
const functionCall = new llm.FunctionCall({
|
|
96
209
|
callId: 'call-123',
|
|
97
210
|
name: 'get_weather',
|
|
@@ -99,7 +212,7 @@ describe('livekitItemToOpenAIItem', () => {
|
|
|
99
212
|
id: 'test-func-call',
|
|
100
213
|
});
|
|
101
214
|
|
|
102
|
-
const result = livekitItemToOpenAIItem(functionCall) as api_proto.FunctionCallItem;
|
|
215
|
+
const result = (await livekitItemToOpenAIItem(functionCall)) as api_proto.FunctionCallItem;
|
|
103
216
|
|
|
104
217
|
expect(result.type).toBe('function_call');
|
|
105
218
|
expect(result.id).toBe('test-func-call');
|
|
@@ -110,7 +223,7 @@ describe('livekitItemToOpenAIItem', () => {
|
|
|
110
223
|
});
|
|
111
224
|
|
|
112
225
|
describe('function_call_output items', () => {
|
|
113
|
-
it('should convert function call output items correctly', () => {
|
|
226
|
+
it('should convert function call output items correctly', async () => {
|
|
114
227
|
const functionOutput = new llm.FunctionCallOutput({
|
|
115
228
|
callId: 'call-123',
|
|
116
229
|
output: 'The weather in San Francisco is sunny.',
|
|
@@ -118,7 +231,9 @@ describe('livekitItemToOpenAIItem', () => {
|
|
|
118
231
|
id: 'test-func-output',
|
|
119
232
|
});
|
|
120
233
|
|
|
121
|
-
const result = livekitItemToOpenAIItem(
|
|
234
|
+
const result = (await livekitItemToOpenAIItem(
|
|
235
|
+
functionOutput,
|
|
236
|
+
)) as api_proto.FunctionCallOutputItem;
|
|
122
237
|
|
|
123
238
|
expect(result.type).toBe('function_call_output');
|
|
124
239
|
expect(result.id).toBe('test-func-output');
|
|
package/src/realtime/realtime_model.ts
CHANGED
|
@@ -144,6 +144,15 @@ export class RealtimeModel extends llm.RealtimeModel {
|
|
|
144
144
|
return this._options.model;
|
|
145
145
|
}
|
|
146
146
|
|
|
147
|
+
get provider(): string {
|
|
148
|
+
try {
|
|
149
|
+
const url = new URL(this._options.baseURL);
|
|
150
|
+
return url.host;
|
|
151
|
+
} catch {
|
|
152
|
+
return 'api.openai.com';
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
|
|
147
156
|
constructor(
|
|
148
157
|
options: {
|
|
149
158
|
model?: string;
|
|
@@ -461,28 +470,27 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
461
470
|
|
|
462
471
|
async updateChatCtx(_chatCtx: llm.ChatContext): Promise<void> {
|
|
463
472
|
const unlock = await this.updateChatCtxLock.lock();
|
|
464
|
-
|
|
465
|
-
|
|
473
|
+
try {
|
|
474
|
+
const events = await this.createChatCtxUpdateEvents(_chatCtx);
|
|
475
|
+
const futures: Future<void>[] = [];
|
|
466
476
|
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
477
|
+
for (const event of events) {
|
|
478
|
+
const future = new Future<void>();
|
|
479
|
+
futures.push(future);
|
|
470
480
|
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
481
|
+
if (event.type === 'conversation.item.create') {
|
|
482
|
+
this.itemCreateFutures[event.item.id] = future;
|
|
483
|
+
} else if (event.type == 'conversation.item.delete') {
|
|
484
|
+
this.itemDeleteFutures[event.item_id] = future;
|
|
485
|
+
}
|
|
476
486
|
|
|
477
|
-
|
|
478
|
-
|
|
487
|
+
this.sendEvent(event);
|
|
488
|
+
}
|
|
479
489
|
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
}
|
|
490
|
+
if (futures.length === 0) {
|
|
491
|
+
return;
|
|
492
|
+
}
|
|
484
493
|
|
|
485
|
-
try {
|
|
486
494
|
// wait for futures to resolve or timeout
|
|
487
495
|
await Promise.race([
|
|
488
496
|
Promise.all(futures),
|
|
@@ -498,10 +506,10 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
498
506
|
}
|
|
499
507
|
}
|
|
500
508
|
|
|
501
|
-
private createChatCtxUpdateEvents(
|
|
509
|
+
private async createChatCtxUpdateEvents(
|
|
502
510
|
chatCtx: llm.ChatContext,
|
|
503
511
|
addMockAudio: boolean = false,
|
|
504
|
-
): (api_proto.ConversationItemCreateEvent | api_proto.ConversationItemDeleteEvent)[] {
|
|
512
|
+
): Promise<(api_proto.ConversationItemCreateEvent | api_proto.ConversationItemDeleteEvent)[]> {
|
|
505
513
|
const newChatCtx = chatCtx.copy();
|
|
506
514
|
if (addMockAudio) {
|
|
507
515
|
newChatCtx.items.push(createMockAudioItem());
|
|
@@ -533,7 +541,7 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
533
541
|
}
|
|
534
542
|
events.push({
|
|
535
543
|
type: 'conversation.item.create',
|
|
536
|
-
item: livekitItemToOpenAIItem(chatItem),
|
|
544
|
+
item: await livekitItemToOpenAIItem(chatItem),
|
|
537
545
|
previous_item_id: previousId ?? undefined,
|
|
538
546
|
event_id: shortuuid('chat_ctx_create_'),
|
|
539
547
|
} as api_proto.ConversationItemCreateEvent);
|
|
@@ -704,7 +712,7 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
704
712
|
content: [_options.audioTranscript],
|
|
705
713
|
});
|
|
706
714
|
chatCtx.items[idx] = newItem;
|
|
707
|
-
const events = this.createChatCtxUpdateEvents(chatCtx);
|
|
715
|
+
const events = await this.createChatCtxUpdateEvents(chatCtx);
|
|
708
716
|
for (const ev of events) {
|
|
709
717
|
this.sendEvent(ev);
|
|
710
718
|
}
|
|
@@ -845,7 +853,7 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
845
853
|
|
|
846
854
|
const oldChatCtx = this.remoteChatCtx;
|
|
847
855
|
this.remoteChatCtx = new llm.RemoteChatContext();
|
|
848
|
-
events.push(...this.createChatCtxUpdateEvents(chatCtx));
|
|
856
|
+
events.push(...(await this.createChatCtxUpdateEvents(chatCtx)));
|
|
849
857
|
|
|
850
858
|
try {
|
|
851
859
|
for (const ev of events) {
|
|
@@ -1560,7 +1568,6 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
1560
1568
|
if (event.error.message.startsWith('Cancellation failed')) {
|
|
1561
1569
|
return;
|
|
1562
1570
|
}
|
|
1563
|
-
|
|
1564
1571
|
this.#logger.error({ error: event.error }, 'OpenAI Realtime API returned an error');
|
|
1565
1572
|
this.emitError({
|
|
1566
1573
|
error: new APIError(event.error.message, {
|
|
@@ -1622,7 +1629,7 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
1622
1629
|
}
|
|
1623
1630
|
|
|
1624
1631
|
/** @internal Exported for testing purposes */
|
|
1625
|
-
export function livekitItemToOpenAIItem(item: llm.ChatItem): api_proto.ItemResource {
|
|
1632
|
+
export async function livekitItemToOpenAIItem(item: llm.ChatItem): Promise<api_proto.ItemResource> {
|
|
1626
1633
|
switch (item.type) {
|
|
1627
1634
|
case 'function_call':
|
|
1628
1635
|
return {
|
|
@@ -1649,8 +1656,22 @@ export function livekitItemToOpenAIItem(item: llm.ChatItem): api_proto.ItemResou
|
|
|
1649
1656
|
text: c,
|
|
1650
1657
|
} as api_proto.InputTextContent | api_proto.OutputTextContent);
|
|
1651
1658
|
} else if (c.type === 'image_content') {
|
|
1652
|
-
//
|
|
1653
|
-
continue;
|
|
1659
|
+
// only user can send image
|
|
1660
|
+
if (role !== 'user') continue;
|
|
1661
|
+
|
|
1662
|
+
const serialized = await llm.serializeImage(c);
|
|
1663
|
+
if (serialized.externalUrl) {
|
|
1664
|
+
log().warn('External URL is not supported for input_image in realtime API');
|
|
1665
|
+
continue;
|
|
1666
|
+
}
|
|
1667
|
+
if (!serialized.base64Data) {
|
|
1668
|
+
log().warn('Serialized image has no data bytes');
|
|
1669
|
+
continue;
|
|
1670
|
+
}
|
|
1671
|
+
contentList.push({
|
|
1672
|
+
type: 'input_image',
|
|
1673
|
+
image_url: `data:${serialized.mimeType};base64,${serialized.base64Data}`,
|
|
1674
|
+
} as api_proto.InputImageContent);
|
|
1654
1675
|
} else if (c.type === 'audio_content') {
|
|
1655
1676
|
if (role === 'user') {
|
|
1656
1677
|
const encodedAudio = Buffer.from(combineAudioFrames(c.frame).data).toString('base64');
|
|
@@ -1699,6 +1720,10 @@ function openAIItemToLivekitItem(item: api_proto.ItemResource): llm.ChatItem {
|
|
|
1699
1720
|
for (const c of contents) {
|
|
1700
1721
|
if (c.type === 'text' || c.type === 'input_text') {
|
|
1701
1722
|
content.push(c.text);
|
|
1723
|
+
} else if (c.type === 'input_image' && (c as api_proto.InputImageContent).image_url) {
|
|
1724
|
+
content.push(
|
|
1725
|
+
llm.createImageContent({ image: (c as api_proto.InputImageContent).image_url }),
|
|
1726
|
+
);
|
|
1702
1727
|
}
|
|
1703
1728
|
}
|
|
1704
1729
|
return llm.ChatMessage.create({
|
|
@@ -448,28 +448,27 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
448
448
|
|
|
449
449
|
async updateChatCtx(_chatCtx: llm.ChatContext): Promise<void> {
|
|
450
450
|
const unlock = await this.updateChatCtxLock.lock();
|
|
451
|
-
|
|
452
|
-
|
|
451
|
+
try {
|
|
452
|
+
const events = await this.createChatCtxUpdateEvents(_chatCtx);
|
|
453
|
+
const futures: Future<void>[] = [];
|
|
453
454
|
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
455
|
+
for (const event of events) {
|
|
456
|
+
const future = new Future<void>();
|
|
457
|
+
futures.push(future);
|
|
457
458
|
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
459
|
+
if (event.type === 'conversation.item.create') {
|
|
460
|
+
this.itemCreateFutures[event.item.id] = future;
|
|
461
|
+
} else if (event.type == 'conversation.item.delete') {
|
|
462
|
+
this.itemDeleteFutures[event.item_id] = future;
|
|
463
|
+
}
|
|
463
464
|
|
|
464
|
-
|
|
465
|
-
|
|
465
|
+
this.sendEvent(event);
|
|
466
|
+
}
|
|
466
467
|
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
}
|
|
468
|
+
if (futures.length === 0) {
|
|
469
|
+
return;
|
|
470
|
+
}
|
|
471
471
|
|
|
472
|
-
try {
|
|
473
472
|
// wait for futures to resolve or timeout
|
|
474
473
|
await Promise.race([
|
|
475
474
|
Promise.all(futures),
|
|
@@ -485,10 +484,10 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
485
484
|
}
|
|
486
485
|
}
|
|
487
486
|
|
|
488
|
-
private createChatCtxUpdateEvents(
|
|
487
|
+
private async createChatCtxUpdateEvents(
|
|
489
488
|
chatCtx: llm.ChatContext,
|
|
490
489
|
addMockAudio: boolean = false,
|
|
491
|
-
): (api_proto.ConversationItemCreateEvent | api_proto.ConversationItemDeleteEvent)[] {
|
|
490
|
+
): Promise<(api_proto.ConversationItemCreateEvent | api_proto.ConversationItemDeleteEvent)[]> {
|
|
492
491
|
const newChatCtx = chatCtx.copy();
|
|
493
492
|
if (addMockAudio) {
|
|
494
493
|
newChatCtx.items.push(createMockAudioItem());
|
|
@@ -520,7 +519,7 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
520
519
|
}
|
|
521
520
|
events.push({
|
|
522
521
|
type: 'conversation.item.create',
|
|
523
|
-
item: livekitItemToOpenAIItem(chatItem),
|
|
522
|
+
item: await livekitItemToOpenAIItem(chatItem),
|
|
524
523
|
previous_item_id: previousId ?? undefined,
|
|
525
524
|
event_id: shortuuid('chat_ctx_create_'),
|
|
526
525
|
} as api_proto.ConversationItemCreateEvent);
|
|
@@ -682,7 +681,7 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
682
681
|
content: [_options.audioTranscript],
|
|
683
682
|
});
|
|
684
683
|
chatCtx.items[idx] = newItem;
|
|
685
|
-
const events = this.createChatCtxUpdateEvents(chatCtx);
|
|
684
|
+
const events = await this.createChatCtxUpdateEvents(chatCtx);
|
|
686
685
|
for (const ev of events) {
|
|
687
686
|
this.sendEvent(ev);
|
|
688
687
|
}
|
|
@@ -805,7 +804,7 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
805
804
|
|
|
806
805
|
const oldChatCtx = this.remoteChatCtx;
|
|
807
806
|
this.remoteChatCtx = new llm.RemoteChatContext();
|
|
808
|
-
events.push(...this.createChatCtxUpdateEvents(chatCtx));
|
|
807
|
+
events.push(...(await this.createChatCtxUpdateEvents(chatCtx)));
|
|
809
808
|
|
|
810
809
|
try {
|
|
811
810
|
for (const ev of events) {
|
|
@@ -1521,7 +1520,7 @@ export class RealtimeSession extends llm.RealtimeSession {
|
|
|
1521
1520
|
}
|
|
1522
1521
|
}
|
|
1523
1522
|
|
|
1524
|
-
function livekitItemToOpenAIItem(item: llm.ChatItem): api_proto.ItemResource {
|
|
1523
|
+
async function livekitItemToOpenAIItem(item: llm.ChatItem): Promise<api_proto.ItemResource> {
|
|
1525
1524
|
switch (item.type) {
|
|
1526
1525
|
case 'function_call':
|
|
1527
1526
|
return {
|
|
@@ -1548,8 +1547,22 @@ function livekitItemToOpenAIItem(item: llm.ChatItem): api_proto.ItemResource {
|
|
|
1548
1547
|
text: c,
|
|
1549
1548
|
} as api_proto.InputTextContent);
|
|
1550
1549
|
} else if (c.type === 'image_content') {
|
|
1551
|
-
|
|
1552
|
-
|
|
1550
|
+
if (role !== 'user') {
|
|
1551
|
+
continue;
|
|
1552
|
+
}
|
|
1553
|
+
const serialized = await llm.serializeImage(c);
|
|
1554
|
+
if (serialized.externalUrl) {
|
|
1555
|
+
log().warn('External URL is not supported for input_image in realtime API');
|
|
1556
|
+
continue;
|
|
1557
|
+
}
|
|
1558
|
+
if (!serialized.base64Data) {
|
|
1559
|
+
log().warn('Serialized image has no data bytes');
|
|
1560
|
+
continue;
|
|
1561
|
+
}
|
|
1562
|
+
contentList.push({
|
|
1563
|
+
type: 'input_image',
|
|
1564
|
+
image_url: `data:${serialized.mimeType};base64,${serialized.base64Data}`,
|
|
1565
|
+
} as api_proto.InputImageContent);
|
|
1553
1566
|
} else if (c.type === 'audio_content') {
|
|
1554
1567
|
if (role === 'user') {
|
|
1555
1568
|
const encodedAudio = Buffer.from(combineAudioFrames(c.frame).data).toString('base64');
|
|
@@ -1598,6 +1611,10 @@ function openAIItemToLivekitItem(item: api_proto.ItemResource): llm.ChatItem {
|
|
|
1598
1611
|
for (const c of contents) {
|
|
1599
1612
|
if (c.type === 'text' || c.type === 'input_text') {
|
|
1600
1613
|
content.push(c.text);
|
|
1614
|
+
} else if (c.type === 'input_image' && (c as api_proto.InputImageContent).image_url) {
|
|
1615
|
+
content.push(
|
|
1616
|
+
llm.createImageContent({ image: (c as api_proto.InputImageContent).image_url }),
|
|
1617
|
+
);
|
|
1601
1618
|
}
|
|
1602
1619
|
}
|
|
1603
1620
|
return llm.ChatMessage.create({
|