@livekit/agents-plugin-openai 1.0.31 → 1.0.33
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/realtime/api_proto.cjs.map +1 -1
- package/dist/realtime/api_proto.d.cts +50 -12
- package/dist/realtime/api_proto.d.ts +50 -12
- package/dist/realtime/api_proto.d.ts.map +1 -1
- package/dist/realtime/api_proto.js.map +1 -1
- package/dist/realtime/index.cjs +19 -0
- package/dist/realtime/index.cjs.map +1 -1
- package/dist/realtime/index.d.cts +1 -0
- package/dist/realtime/index.d.ts +1 -0
- package/dist/realtime/index.d.ts.map +1 -1
- package/dist/realtime/index.js +4 -0
- package/dist/realtime/index.js.map +1 -1
- package/dist/realtime/realtime_model.cjs +69 -33
- package/dist/realtime/realtime_model.cjs.map +1 -1
- package/dist/realtime/realtime_model.d.cts +14 -6
- package/dist/realtime/realtime_model.d.ts +14 -6
- package/dist/realtime/realtime_model.d.ts.map +1 -1
- package/dist/realtime/realtime_model.js +69 -33
- package/dist/realtime/realtime_model.js.map +1 -1
- package/dist/realtime/realtime_model_beta.cjs +1300 -0
- package/dist/realtime/realtime_model_beta.cjs.map +1 -0
- package/dist/realtime/realtime_model_beta.d.cts +165 -0
- package/dist/realtime/realtime_model_beta.d.ts +165 -0
- package/dist/realtime/realtime_model_beta.d.ts.map +1 -0
- package/dist/realtime/realtime_model_beta.js +1280 -0
- package/dist/realtime/realtime_model_beta.js.map +1 -0
- package/package.json +5 -5
- package/src/realtime/api_proto.ts +76 -17
- package/src/realtime/index.ts +1 -0
- package/src/realtime/realtime_model.ts +86 -49
- package/src/realtime/realtime_model_beta.ts +1665 -0
@@ -33,7 +33,6 @@ class CreateResponseHandle {
   }
 }
 const DEFAULT_FIRST_RETRY_INTERVAL_MS = 100;
-const DEFAULT_TEMPERATURE = 0.8;
 const DEFAULT_TURN_DETECTION = {
   type: "semantic_vad",
   eagerness: "medium",
@@ -59,14 +58,15 @@ const DEFAULT_MAX_SESSION_DURATION = 20 * 60 * 1e3;
 const DEFAULT_REALTIME_MODEL_OPTIONS = {
   model: "gpt-realtime",
   voice: "marin",
-  temperature: DEFAULT_TEMPERATURE,
   inputAudioTranscription: DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
+  inputAudioNoiseReduction: void 0,
   turnDetection: DEFAULT_TURN_DETECTION,
   toolChoice: DEFAULT_TOOL_CHOICE,
   maxResponseOutputTokens: DEFAULT_MAX_RESPONSE_OUTPUT_TOKENS,
   maxSessionDuration: DEFAULT_MAX_SESSION_DURATION,
   connOptions: DEFAULT_API_CONNECT_OPTIONS,
-  modalities: ["text", "audio"]
+  modalities: ["text", "audio"],
+  tracing: void 0
 };
 class RealtimeModel extends llm.RealtimeModel {
   sampleRate = api_proto.SAMPLE_RATE;
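
The updated defaults drop `temperature` and add `inputAudioNoiseReduction` and `tracing` passthroughs, and `modalities` remains a constructor option. A minimal usage sketch, assuming the plugin is consumed through its `realtime` export and that the public constructor accepts the same option names as these compiled defaults; the noise-reduction and tracing values are illustrative assumptions, not taken from this diff:

```ts
import * as openai from '@livekit/agents-plugin-openai';

// Sketch only: option names mirror DEFAULT_REALTIME_MODEL_OPTIONS above.
const model = new openai.realtime.RealtimeModel({
  model: 'gpt-realtime',
  voice: 'marin',
  modalities: ['text', 'audio'],
  inputAudioNoiseReduction: { type: 'near_field' }, // illustrative value; added in this diff
  tracing: 'auto',                                  // illustrative value; added in this diff
  // note: `temperature` is no longer part of the defaults
});
```
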
@@ -75,6 +75,9 @@ class RealtimeModel extends llm.RealtimeModel {
   outFrameSize = api_proto.OUT_FRAME_SIZE;
   /* @internal */
   _options;
+  get model() {
+    return this._options.model;
+  }
   constructor(options = {}) {
     const modalities = options.modalities || DEFAULT_REALTIME_MODEL_OPTIONS.modalities;
     super({
@@ -127,11 +130,10 @@ class RealtimeModel extends llm.RealtimeModel {
    * @param baseURL - Base URL for the API endpoint. If undefined, constructed from the azure_endpoint.
    * @param voice - Voice setting for audio outputs. Defaults to "alloy".
    * @param inputAudioTranscription - Options for transcribing input audio. Defaults to @see DEFAULT_INPUT_AUDIO_TRANSCRIPTION.
+   * @param inputAudioNoiseReduction - Options for noise reduction. Defaults to undefined.
    * @param turnDetection - Options for server-based voice activity detection (VAD). Defaults to @see DEFAULT_SERVER_VAD_OPTIONS.
-   * @param temperature - Sampling temperature for response generation. Defaults to @see DEFAULT_TEMPERATURE.
    * @param speed - Speed of the audio output. Defaults to 1.0.
-   * @param
-   * @param maxSessionDuration - Maximum duration of the session in milliseconds. Defaults to @see DEFAULT_MAX_SESSION_DURATION.
+   * @param tracing - Tracing configuration. Defaults to undefined.
    *
    * @returns A RealtimeModel instance configured for Azure OpenAI Service.
    *
@@ -145,10 +147,13 @@ class RealtimeModel extends llm.RealtimeModel {
     entraToken,
     baseURL,
     voice = "alloy",
+    temperature,
+    // eslint-disable-line @typescript-eslint/no-unused-vars
     inputAudioTranscription = AZURE_DEFAULT_INPUT_AUDIO_TRANSCRIPTION,
+    inputAudioNoiseReduction,
     turnDetection = AZURE_DEFAULT_TURN_DETECTION,
-
-
+    speed,
+    tracing
   }) {
     apiKey = apiKey || process.env.AZURE_OPENAI_API_KEY;
     if (!apiKey && !entraToken) {
@@ -174,9 +179,10 @@ class RealtimeModel extends llm.RealtimeModel {
     return new RealtimeModel({
       voice,
       inputAudioTranscription,
+      inputAudioNoiseReduction,
       turnDetection,
-      temperature,
       speed,
+      tracing,
       apiKey,
       azureDeployment,
       apiVersion,
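
The `withAzure` factory now forwards `inputAudioNoiseReduction` and `tracing` into the model options, while `temperature` is still accepted but intentionally unused. A hedged call sketch using only parameter names that appear in the diff above; the deployment name, API version, and base URL values are hypothetical placeholders:

```ts
import * as openai from '@livekit/agents-plugin-openai';

// Sketch only: connection values below are placeholders, not defaults from this package.
const azureModel = openai.realtime.RealtimeModel.withAzure({
  azureDeployment: 'my-gpt-realtime-deployment',        // hypothetical deployment name
  apiVersion: '2024-10-01-preview',                      // hypothetical API version
  apiKey: process.env.AZURE_OPENAI_API_KEY,              // or pass entraToken instead
  baseURL: 'https://my-resource.openai.azure.com/openai', // hypothetical; derived from the Azure endpoint when omitted
  voice: 'alloy',
  inputAudioNoiseReduction: { type: 'near_field' },      // forwarded to the session config
  tracing: 'auto',                                       // forwarded to the session config
  temperature: 0.8,                                      // accepted but now unused
});
```
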
@@ -253,24 +259,31 @@ class RealtimeSession extends llm.RealtimeSession {
     this.messageChannel.put(command);
   }
   createSessionUpdateEvent() {
-    const
+    const audioFormat = { type: "audio/pcm", rate: SAMPLE_RATE };
+    const modality = this.oaiRealtimeModel._options.modalities.includes("audio") ? "audio" : "text";
     return {
       type: "session.update",
       session: {
+        type: "realtime",
         model: this.oaiRealtimeModel._options.model,
-
-
-
-
-
-
-
-
+        output_modalities: [modality],
+        audio: {
+          input: {
+            format: audioFormat,
+            noise_reduction: this.oaiRealtimeModel._options.inputAudioNoiseReduction,
+            transcription: this.oaiRealtimeModel._options.inputAudioTranscription,
+            turn_detection: this.oaiRealtimeModel._options.turnDetection
+          },
+          output: {
+            format: audioFormat,
+            speed: this.oaiRealtimeModel._options.speed,
+            voice: this.oaiRealtimeModel._options.voice
+          }
+        },
+        max_output_tokens: this.oaiRealtimeModel._options.maxResponseOutputTokens === Infinity ? "inf" : this.oaiRealtimeModel._options.maxResponseOutputTokens,
         tool_choice: toOaiToolChoice(this.oaiRealtimeModel._options.toolChoice),
-
-
-        instructions: this.instructions,
-        speed: this.oaiRealtimeModel._options.speed
+        tracing: this.oaiRealtimeModel._options.tracing,
+        instructions: this.instructions
       }
     };
   }
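
The rewritten `createSessionUpdateEvent` targets the GA Realtime session shape: the session is tagged `type: "realtime"`, audio input/output settings are nested under `session.audio`, output modalities move to `output_modalities`, and `tracing` is sent alongside `tool_choice`. An illustrative payload produced by this code under the package defaults; the sample rate, transcription model, tool choice, and instructions values are assumptions for the sketch, not taken from this diff:

```ts
// Illustrative session.update payload; concrete values are assumed for the sketch.
const sessionUpdate = {
  type: 'session.update',
  session: {
    type: 'realtime',
    model: 'gpt-realtime',
    output_modalities: ['audio'],
    audio: {
      input: {
        format: { type: 'audio/pcm', rate: 24000 },    // rate assumed
        noise_reduction: undefined,
        transcription: { model: 'whisper-1' },          // model assumed
        turn_detection: { type: 'semantic_vad', eagerness: 'medium' },
      },
      output: {
        format: { type: 'audio/pcm', rate: 24000 },     // rate assumed
        speed: 1.0,
        voice: 'marin',
      },
    },
    max_output_tokens: 'inf',                            // Infinity is serialized as "inf"
    tool_choice: 'auto',                                 // assumed default
    tracing: undefined,
    instructions: 'You are a helpful assistant.',        // placeholder
  },
};
```
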
@@ -386,6 +399,7 @@ class RealtimeSession extends llm.RealtimeSession {
     return {
       type: "session.update",
       session: {
+        type: "realtime",
         model: this.oaiRealtimeModel._options.model,
         tools: oaiTools
       },
@@ -397,6 +411,7 @@ class RealtimeSession extends llm.RealtimeSession {
     this.sendEvent({
       type: "session.update",
       session: {
+        type: "realtime",
         instructions: _instructions
       },
       event_id: eventId
@@ -404,7 +419,9 @@ class RealtimeSession extends llm.RealtimeSession {
     this.instructions = _instructions;
   }
   updateOptions({ toolChoice }) {
-    const options = {
+    const options = {
+      type: "realtime"
+    };
     this.oaiRealtimeModel._options.toolChoice = toolChoice;
     options.tool_choice = toOaiToolChoice(toolChoice);
     this.sendEvent({
@@ -503,8 +520,12 @@ class RealtimeSession extends llm.RealtimeSession {
         throw new Error("Microsoft API key or entraToken is required");
       }
     } else {
+      if (!this.oaiRealtimeModel._options.apiKey) {
+        throw new Error(
+          "OpenAI API key is required but not set. Check OPENAI_API_KEY environment variable."
+        );
+      }
       headers.Authorization = `Bearer ${this.oaiRealtimeModel._options.apiKey}`;
-      headers["OpenAI-Beta"] = "realtime=v1";
     }
     const url = processBaseURL({
       baseURL: this.oaiRealtimeModel._options.baseURL,
@@ -671,6 +692,7 @@ class RealtimeSession extends llm.RealtimeSession {
       case "response.output_item.added":
        this.handleResponseOutputItemAdded(event);
        break;
+      case "conversation.item.added":
       case "conversation.item.created":
        this.handleConversationItemCreated(event);
        break;
@@ -689,21 +711,27 @@ class RealtimeSession extends llm.RealtimeSession {
       case "response.content_part.done":
        this.handleResponseContentPartDone(event);
        break;
+      case "response.output_text.delta":
       case "response.text.delta":
        this.handleResponseTextDelta(event);
        break;
+      case "response.output_text.done":
       case "response.text.done":
        this.handleResponseTextDone(event);
        break;
+      case "response.output_audio_transcript.delta":
       case "response.audio_transcript.delta":
        this.handleResponseAudioTranscriptDelta(event);
        break;
+      case "response.output_audio.delta":
       case "response.audio.delta":
        this.handleResponseAudioDelta(event);
        break;
+      case "response.output_audio_transcript.done":
       case "response.audio_transcript.done":
        this.handleResponseAudioTranscriptDone(event);
        break;
+      case "response.output_audio.done":
       case "response.audio.done":
        this.handleResponseAudioDone(event);
        break;
@@ -779,7 +807,8 @@ class RealtimeSession extends llm.RealtimeSession {
     const generationEv = {
       messageStream: this.currentGeneration.messageChannel.stream(),
       functionStream: this.currentGeneration.functionChannel.stream(),
-      userInitiated: false
+      userInitiated: false,
+      responseId: event.response.id
     };
     const clientEventId = (_a = event.response.metadata) == null ? void 0 : _a.client_event_id;
     if (clientEventId) {
@@ -899,11 +928,12 @@ class RealtimeSession extends llm.RealtimeSession {
       this.#logger.warn(`itemGeneration not found for itemId=${itemId}`);
       return;
     }
-
+    const isTextType = itemType === "text" || itemType === "output_text";
+    if (isTextType && this.oaiRealtimeModel.capabilities.audioOutput) {
       this.#logger.warn("Text response received from OpenAI Realtime API in audio modality.");
     }
     if (!itemGeneration.modalities.done) {
-      const modalityResult =
+      const modalityResult = isTextType ? ["text"] : ["audio", "text"];
       itemGeneration.modalities.resolve(modalityResult);
     }
     if (this.currentGeneration._firstTokenTimestamp === void 0) {
@@ -911,6 +941,9 @@ class RealtimeSession extends llm.RealtimeSession {
     }
   }
   handleResponseContentPartDone(event) {
+    if (!event.part) {
+      return;
+    }
     if (event.part.type !== "text") {
       return;
     }
@@ -1001,11 +1034,13 @@ class RealtimeSession extends llm.RealtimeSession {
       if (!item.call_id || !item.name || !item.arguments) {
        throw new Error("item is not a function call");
       }
-      this.currentGeneration.functionChannel.write(
-
-
-
-
+      this.currentGeneration.functionChannel.write(
+        llm.FunctionCall.create({
+          callId: item.call_id,
+          name: item.name,
+          args: item.arguments
+        })
+      );
     } else if (itemType === "message") {
       const itemGeneration = this.currentGeneration.messages.get(itemId);
       if (!itemGeneration) {
@@ -1138,7 +1173,8 @@ class RealtimeSession extends llm.RealtimeSession {
     const generation_ev = {
       messageStream: this.currentGeneration.messageChannel.stream(),
       functionStream: this.currentGeneration.functionChannel.stream(),
-      userInitiated: false
+      userInitiated: false,
+      responseId
     };
     const handle = this.responseCreatedFutures[responseId];
     if (handle) {