openai 4.87.4 → 4.89.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +29 -0
- package/helpers/audio.d.ts +12 -0
- package/helpers/audio.d.ts.map +1 -0
- package/helpers/audio.js +121 -0
- package/helpers/audio.js.map +1 -0
- package/helpers/audio.mjs +116 -0
- package/helpers/audio.mjs.map +1 -0
- package/index.d.mts +2 -0
- package/index.d.ts +2 -0
- package/index.d.ts.map +1 -1
- package/index.js.map +1 -1
- package/index.mjs.map +1 -1
- package/package.json +8 -29
- package/resources/audio/audio.d.ts +5 -4
- package/resources/audio/audio.d.ts.map +1 -1
- package/resources/audio/audio.js.map +1 -1
- package/resources/audio/audio.mjs.map +1 -1
- package/resources/audio/index.d.ts +1 -1
- package/resources/audio/index.d.ts.map +1 -1
- package/resources/audio/index.js.map +1 -1
- package/resources/audio/index.mjs.map +1 -1
- package/resources/audio/speech.d.ts +7 -2
- package/resources/audio/speech.d.ts.map +1 -1
- package/resources/audio/transcriptions.d.ts +172 -9
- package/resources/audio/transcriptions.d.ts.map +1 -1
- package/resources/audio/transcriptions.js.map +1 -1
- package/resources/audio/transcriptions.mjs.map +1 -1
- package/resources/audio/translations.d.ts +1 -1
- package/resources/audio/translations.d.ts.map +1 -1
- package/resources/beta/realtime/index.d.ts +1 -0
- package/resources/beta/realtime/index.d.ts.map +1 -1
- package/resources/beta/realtime/index.js +3 -1
- package/resources/beta/realtime/index.js.map +1 -1
- package/resources/beta/realtime/index.mjs +1 -0
- package/resources/beta/realtime/index.mjs.map +1 -1
- package/resources/beta/realtime/realtime.d.ts +383 -36
- package/resources/beta/realtime/realtime.d.ts.map +1 -1
- package/resources/beta/realtime/realtime.js +4 -0
- package/resources/beta/realtime/realtime.js.map +1 -1
- package/resources/beta/realtime/realtime.mjs +4 -0
- package/resources/beta/realtime/realtime.mjs.map +1 -1
- package/resources/beta/realtime/sessions.d.ts +169 -60
- package/resources/beta/realtime/sessions.d.ts.map +1 -1
- package/resources/beta/realtime/transcription-sessions.d.ts +262 -0
- package/resources/beta/realtime/transcription-sessions.d.ts.map +1 -0
- package/resources/beta/realtime/transcription-sessions.js +25 -0
- package/resources/beta/realtime/transcription-sessions.js.map +1 -0
- package/resources/beta/realtime/transcription-sessions.mjs +21 -0
- package/resources/beta/realtime/transcription-sessions.mjs.map +1 -0
- package/resources/chat/completions/completions.d.ts +1 -1
- package/resources/chat/completions/completions.d.ts.map +1 -1
- package/resources/responses/responses.d.ts +3 -3
- package/resources/responses/responses.d.ts.map +1 -1
- package/resources/shared.d.ts +3 -1
- package/resources/shared.d.ts.map +1 -1
- package/resources.d.ts +2 -0
- package/resources.d.ts.map +1 -0
- package/resources.js +18 -0
- package/resources.js.map +1 -0
- package/resources.mjs +2 -0
- package/resources.mjs.map +1 -0
- package/src/helpers/audio.ts +145 -0
- package/src/index.ts +2 -0
- package/src/resources/audio/audio.ts +15 -2
- package/src/resources/audio/index.ts +6 -0
- package/src/resources/audio/speech.ts +8 -2
- package/src/resources/audio/transcriptions.ts +215 -9
- package/src/resources/audio/translations.ts +1 -1
- package/src/resources/beta/realtime/index.ts +5 -0
- package/src/resources/beta/realtime/realtime.ts +465 -57
- package/src/resources/beta/realtime/sessions.ts +176 -60
- package/src/resources/beta/realtime/transcription-sessions.ts +308 -0
- package/src/resources/chat/completions/completions.ts +1 -1
- package/src/resources/responses/responses.ts +3 -3
- package/src/resources/shared.ts +22 -5
- package/src/resources.ts +1 -0
- package/src/version.ts +1 -1
- package/version.d.ts +1 -1
- package/version.js +1 -1
- package/version.mjs +1 -1
|
@@ -2,11 +2,15 @@
|
|
|
2
2
|
import { APIResource } from "../../../resource.mjs";
|
|
3
3
|
import * as SessionsAPI from "./sessions.mjs";
|
|
4
4
|
import { Sessions, } from "./sessions.mjs";
|
|
5
|
+
import * as TranscriptionSessionsAPI from "./transcription-sessions.mjs";
|
|
6
|
+
import { TranscriptionSessions, } from "./transcription-sessions.mjs";
|
|
5
7
|
export class Realtime extends APIResource {
|
|
6
8
|
constructor() {
|
|
7
9
|
super(...arguments);
|
|
8
10
|
this.sessions = new SessionsAPI.Sessions(this._client);
|
|
11
|
+
this.transcriptionSessions = new TranscriptionSessionsAPI.TranscriptionSessions(this._client);
|
|
9
12
|
}
|
|
10
13
|
}
|
|
11
14
|
Realtime.Sessions = Sessions;
|
|
15
|
+
Realtime.TranscriptionSessions = TranscriptionSessions;
|
|
12
16
|
//# sourceMappingURL=realtime.mjs.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"realtime.mjs","sourceRoot":"","sources":["../../../src/resources/beta/realtime/realtime.ts"],"names":[],"mappings":"AAAA,sFAAsF;OAE/E,EAAE,WAAW,EAAE;OAGf,KAAK,WAAW;OAChB,EAIL,QAAQ,GACT;AAED,MAAM,OAAO,QAAS,SAAQ,WAAW;IAAzC;;QACE,aAAQ,GAAyB,IAAI,WAAW,CAAC,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;
|
|
1
|
+
{"version":3,"file":"realtime.mjs","sourceRoot":"","sources":["../../../src/resources/beta/realtime/realtime.ts"],"names":[],"mappings":"AAAA,sFAAsF;OAE/E,EAAE,WAAW,EAAE;OAGf,KAAK,WAAW;OAChB,EAIL,QAAQ,GACT;OACM,KAAK,wBAAwB;OAC7B,EAGL,qBAAqB,GACtB;AAED,MAAM,OAAO,QAAS,SAAQ,WAAW;IAAzC;;QACE,aAAQ,GAAyB,IAAI,WAAW,CAAC,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QACxE,0BAAqB,GACnB,IAAI,wBAAwB,CAAC,qBAAqB,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IACrE,CAAC;CAAA;AA03ED,QAAQ,CAAC,QAAQ,GAAG,QAAQ,CAAC;AAC7B,QAAQ,CAAC,qBAAqB,GAAG,qBAAqB,CAAC"}
|
|
@@ -17,7 +17,7 @@ export declare class Sessions extends APIResource {
|
|
|
17
17
|
*/
|
|
18
18
|
export interface Session {
|
|
19
19
|
/**
|
|
20
|
-
* Unique identifier for the session
|
|
20
|
+
* Unique identifier for the session that looks like `sess_1234567890abcdef`.
|
|
21
21
|
*/
|
|
22
22
|
id?: string;
|
|
23
23
|
/**
|
|
@@ -26,12 +26,23 @@ export interface Session {
|
|
|
26
26
|
* (mono), and little-endian byte order.
|
|
27
27
|
*/
|
|
28
28
|
input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
|
|
29
|
+
/**
|
|
30
|
+
* Configuration for input audio noise reduction. This can be set to `null` to turn
|
|
31
|
+
* off. Noise reduction filters audio added to the input audio buffer before it is
|
|
32
|
+
* sent to VAD and the model. Filtering the audio can improve VAD and turn
|
|
33
|
+
* detection accuracy (reducing false positives) and model performance by improving
|
|
34
|
+
* perception of the input audio.
|
|
35
|
+
*/
|
|
36
|
+
input_audio_noise_reduction?: Session.InputAudioNoiseReduction;
|
|
29
37
|
/**
|
|
30
38
|
* Configuration for input audio transcription, defaults to off and can be set to
|
|
31
39
|
* `null` to turn off once on. Input audio transcription is not native to the
|
|
32
40
|
* model, since the model consumes audio directly. Transcription runs
|
|
33
|
-
* asynchronously through
|
|
34
|
-
*
|
|
41
|
+
* asynchronously through
|
|
42
|
+
* [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
|
|
43
|
+
* and should be treated as guidance of input audio content rather than precisely
|
|
44
|
+
* what the model heard. The client can optionally set the language and prompt for
|
|
45
|
+
* transcription, these offer additional guidance to the transcription service.
|
|
35
46
|
*/
|
|
36
47
|
input_audio_transcription?: Session.InputAudioTranscription;
|
|
37
48
|
/**
|
|
@@ -62,14 +73,15 @@ export interface Session {
|
|
|
62
73
|
/**
|
|
63
74
|
* The Realtime model used for this session.
|
|
64
75
|
*/
|
|
65
|
-
model?:
|
|
76
|
+
model?: 'gpt-4o-realtime-preview' | 'gpt-4o-realtime-preview-2024-10-01' | 'gpt-4o-realtime-preview-2024-12-17' | 'gpt-4o-mini-realtime-preview' | 'gpt-4o-mini-realtime-preview-2024-12-17';
|
|
66
77
|
/**
|
|
67
78
|
* The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
|
|
68
79
|
* For `pcm16`, output audio is sampled at a rate of 24kHz.
|
|
69
80
|
*/
|
|
70
81
|
output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
|
|
71
82
|
/**
|
|
72
|
-
* Sampling temperature for the model, limited to [0.6, 1.2].
|
|
83
|
+
* Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a
|
|
84
|
+
* temperature of 0.8 is highly recommended for best performance.
|
|
73
85
|
*/
|
|
74
86
|
temperature?: number;
|
|
75
87
|
/**
|
|
@@ -82,11 +94,18 @@ export interface Session {
|
|
|
82
94
|
*/
|
|
83
95
|
tools?: Array<Session.Tool>;
|
|
84
96
|
/**
|
|
85
|
-
* Configuration for turn detection
|
|
86
|
-
*
|
|
87
|
-
*
|
|
97
|
+
* Configuration for turn detection, either Server VAD or Semantic VAD. This can be
|
|
98
|
+
* set to `null` to turn off, in which case the client must manually trigger model
|
|
99
|
+
* response. Server VAD means that the model will detect the start and end of
|
|
100
|
+
* speech based on audio volume and respond at the end of user speech. Semantic VAD
|
|
101
|
+
* is more advanced and uses a turn detection model (in conjunction with VAD) to
|
|
102
|
+
* semantically estimate whether the user has finished speaking, then dynamically
|
|
103
|
+
* sets a timeout based on this probability. For example, if user audio trails off
|
|
104
|
+
* with "uhhm", the model will score a low probability of turn end and wait longer
|
|
105
|
+
* for the user to continue speaking. This can be useful for more natural
|
|
106
|
+
* conversations, but may have a higher latency.
|
|
88
107
|
*/
|
|
89
|
-
turn_detection?: Session.TurnDetection
|
|
108
|
+
turn_detection?: Session.TurnDetection;
|
|
90
109
|
/**
|
|
91
110
|
* The voice the model uses to respond. Voice cannot be changed during the session
|
|
92
111
|
* once the model has responded with audio at least once. Current voice options are
|
|
@@ -95,19 +114,51 @@ export interface Session {
|
|
|
95
114
|
voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
|
|
96
115
|
}
|
|
97
116
|
export declare namespace Session {
|
|
117
|
+
/**
|
|
118
|
+
* Configuration for input audio noise reduction. This can be set to `null` to turn
|
|
119
|
+
* off. Noise reduction filters audio added to the input audio buffer before it is
|
|
120
|
+
* sent to VAD and the model. Filtering the audio can improve VAD and turn
|
|
121
|
+
* detection accuracy (reducing false positives) and model performance by improving
|
|
122
|
+
* perception of the input audio.
|
|
123
|
+
*/
|
|
124
|
+
interface InputAudioNoiseReduction {
|
|
125
|
+
/**
|
|
126
|
+
* Type of noise reduction. `near_field` is for close-talking microphones such as
|
|
127
|
+
* headphones, `far_field` is for far-field microphones such as laptop or
|
|
128
|
+
* conference room microphones.
|
|
129
|
+
*/
|
|
130
|
+
type?: 'near_field' | 'far_field';
|
|
131
|
+
}
|
|
98
132
|
/**
|
|
99
133
|
* Configuration for input audio transcription, defaults to off and can be set to
|
|
100
134
|
* `null` to turn off once on. Input audio transcription is not native to the
|
|
101
135
|
* model, since the model consumes audio directly. Transcription runs
|
|
102
|
-
* asynchronously through
|
|
103
|
-
*
|
|
136
|
+
* asynchronously through
|
|
137
|
+
* [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
|
|
138
|
+
* and should be treated as guidance of input audio content rather than precisely
|
|
139
|
+
* what the model heard. The client can optionally set the language and prompt for
|
|
140
|
+
* transcription, these offer additional guidance to the transcription service.
|
|
104
141
|
*/
|
|
105
142
|
interface InputAudioTranscription {
|
|
106
143
|
/**
|
|
107
|
-
* The
|
|
108
|
-
*
|
|
144
|
+
* The language of the input audio. Supplying the input language in
|
|
145
|
+
* [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
|
|
146
|
+
* format will improve accuracy and latency.
|
|
147
|
+
*/
|
|
148
|
+
language?: string;
|
|
149
|
+
/**
|
|
150
|
+
* The model to use for transcription, current options are `gpt-4o-transcribe`,
|
|
151
|
+
* `gpt-4o-mini-transcribe`, and `whisper-1`.
|
|
109
152
|
*/
|
|
110
153
|
model?: string;
|
|
154
|
+
/**
|
|
155
|
+
* An optional text to guide the model's style or continue a previous audio
|
|
156
|
+
* segment. For `whisper-1`, the
|
|
157
|
+
* [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
|
|
158
|
+
* For `gpt-4o-transcribe` models, the prompt is a free text string, for example
|
|
159
|
+
* "expect words related to technology".
|
|
160
|
+
*/
|
|
161
|
+
prompt?: string;
|
|
111
162
|
}
|
|
112
163
|
interface Tool {
|
|
113
164
|
/**
|
|
@@ -129,43 +180,56 @@ export declare namespace Session {
|
|
|
129
180
|
type?: 'function';
|
|
130
181
|
}
|
|
131
182
|
/**
|
|
132
|
-
* Configuration for turn detection
|
|
133
|
-
*
|
|
134
|
-
*
|
|
183
|
+
* Configuration for turn detection, either Server VAD or Semantic VAD. This can be
|
|
184
|
+
* set to `null` to turn off, in which case the client must manually trigger model
|
|
185
|
+
* response. Server VAD means that the model will detect the start and end of
|
|
186
|
+
* speech based on audio volume and respond at the end of user speech. Semantic VAD
|
|
187
|
+
* is more advanced and uses a turn detection model (in conjunction with VAD) to
|
|
188
|
+
* semantically estimate whether the user has finished speaking, then dynamically
|
|
189
|
+
* sets a timeout based on this probability. For example, if user audio trails off
|
|
190
|
+
* with "uhhm", the model will score a low probability of turn end and wait longer
|
|
191
|
+
* for the user to continue speaking. This can be useful for more natural
|
|
192
|
+
* conversations, but may have a higher latency.
|
|
135
193
|
*/
|
|
136
194
|
interface TurnDetection {
|
|
137
195
|
/**
|
|
138
196
|
* Whether or not to automatically generate a response when a VAD stop event
|
|
139
|
-
* occurs.
|
|
197
|
+
* occurs.
|
|
140
198
|
*/
|
|
141
199
|
create_response?: boolean;
|
|
200
|
+
/**
|
|
201
|
+
* Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
|
|
202
|
+
* will wait longer for the user to continue speaking, `high` will respond more
|
|
203
|
+
* quickly. `auto` is the default and is equivalent to `medium`.
|
|
204
|
+
*/
|
|
205
|
+
eagerness?: 'low' | 'medium' | 'high' | 'auto';
|
|
142
206
|
/**
|
|
143
207
|
* Whether or not to automatically interrupt any ongoing response with output to
|
|
144
208
|
* the default conversation (i.e. `conversation` of `auto`) when a VAD start event
|
|
145
|
-
* occurs.
|
|
209
|
+
* occurs.
|
|
146
210
|
*/
|
|
147
211
|
interrupt_response?: boolean;
|
|
148
212
|
/**
|
|
149
|
-
* Amount of audio to include before the VAD
|
|
150
|
-
* Defaults to 300ms.
|
|
213
|
+
* Used only for `server_vad` mode. Amount of audio to include before the VAD
|
|
214
|
+
* detected speech (in milliseconds). Defaults to 300ms.
|
|
151
215
|
*/
|
|
152
216
|
prefix_padding_ms?: number;
|
|
153
217
|
/**
|
|
154
|
-
* Duration of silence to detect speech stop (in
|
|
155
|
-
* With shorter values the model will respond
|
|
156
|
-
* short pauses from the user.
|
|
218
|
+
* Used only for `server_vad` mode. Duration of silence to detect speech stop (in
|
|
219
|
+
* milliseconds). Defaults to 500ms. With shorter values the model will respond
|
|
220
|
+
* more quickly, but may jump in on short pauses from the user.
|
|
157
221
|
*/
|
|
158
222
|
silence_duration_ms?: number;
|
|
159
223
|
/**
|
|
160
|
-
* Activation threshold for VAD (0.0 to 1.0), this
|
|
161
|
-
* threshold will require louder audio to activate the
|
|
162
|
-
* perform better in noisy environments.
|
|
224
|
+
* Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
|
|
225
|
+
* defaults to 0.5. A higher threshold will require louder audio to activate the
|
|
226
|
+
* model, and thus might perform better in noisy environments.
|
|
163
227
|
*/
|
|
164
228
|
threshold?: number;
|
|
165
229
|
/**
|
|
166
|
-
* Type of turn detection
|
|
230
|
+
* Type of turn detection.
|
|
167
231
|
*/
|
|
168
|
-
type?: 'server_vad';
|
|
232
|
+
type?: 'server_vad' | 'semantic_vad';
|
|
169
233
|
}
|
|
170
234
|
}
|
|
171
235
|
/**
|
|
@@ -330,15 +394,23 @@ export interface SessionCreateParams {
|
|
|
330
394
|
* (mono), and little-endian byte order.
|
|
331
395
|
*/
|
|
332
396
|
input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
|
|
397
|
+
/**
|
|
398
|
+
* Configuration for input audio noise reduction. This can be set to `null` to turn
|
|
399
|
+
* off. Noise reduction filters audio added to the input audio buffer before it is
|
|
400
|
+
* sent to VAD and the model. Filtering the audio can improve VAD and turn
|
|
401
|
+
* detection accuracy (reducing false positives) and model performance by improving
|
|
402
|
+
* perception of the input audio.
|
|
403
|
+
*/
|
|
404
|
+
input_audio_noise_reduction?: SessionCreateParams.InputAudioNoiseReduction;
|
|
333
405
|
/**
|
|
334
406
|
* Configuration for input audio transcription, defaults to off and can be set to
|
|
335
407
|
* `null` to turn off once on. Input audio transcription is not native to the
|
|
336
408
|
* model, since the model consumes audio directly. Transcription runs
|
|
337
409
|
* asynchronously through
|
|
338
|
-
* [
|
|
339
|
-
* and should be treated as
|
|
340
|
-
*
|
|
341
|
-
*
|
|
410
|
+
* [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
|
|
411
|
+
* and should be treated as guidance of input audio content rather than precisely
|
|
412
|
+
* what the model heard. The client can optionally set the language and prompt for
|
|
413
|
+
* transcription, these offer additional guidance to the transcription service.
|
|
342
414
|
*/
|
|
343
415
|
input_audio_transcription?: SessionCreateParams.InputAudioTranscription;
|
|
344
416
|
/**
|
|
@@ -376,7 +448,8 @@ export interface SessionCreateParams {
|
|
|
376
448
|
*/
|
|
377
449
|
output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
|
|
378
450
|
/**
|
|
379
|
-
* Sampling temperature for the model, limited to [0.6, 1.2].
|
|
451
|
+
* Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a
|
|
452
|
+
* temperature of 0.8 is highly recommended for best performance.
|
|
380
453
|
*/
|
|
381
454
|
temperature?: number;
|
|
382
455
|
/**
|
|
@@ -389,9 +462,16 @@ export interface SessionCreateParams {
|
|
|
389
462
|
*/
|
|
390
463
|
tools?: Array<SessionCreateParams.Tool>;
|
|
391
464
|
/**
|
|
392
|
-
* Configuration for turn detection
|
|
393
|
-
*
|
|
394
|
-
*
|
|
465
|
+
* Configuration for turn detection, either Server VAD or Semantic VAD. This can be
|
|
466
|
+
* set to `null` to turn off, in which case the client must manually trigger model
|
|
467
|
+
* response. Server VAD means that the model will detect the start and end of
|
|
468
|
+
* speech based on audio volume and respond at the end of user speech. Semantic VAD
|
|
469
|
+
* is more advanced and uses a turn detection model (in conjunction with VAD) to
|
|
470
|
+
* semantically estimate whether the user has finished speaking, then dynamically
|
|
471
|
+
* sets a timeout based on this probability. For example, if user audio trails off
|
|
472
|
+
* with "uhhm", the model will score a low probability of turn end and wait longer
|
|
473
|
+
* for the user to continue speaking. This can be useful for more natural
|
|
474
|
+
* conversations, but may have a higher latency.
|
|
395
475
|
*/
|
|
396
476
|
turn_detection?: SessionCreateParams.TurnDetection;
|
|
397
477
|
/**
|
|
@@ -402,15 +482,30 @@ export interface SessionCreateParams {
|
|
|
402
482
|
voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
|
|
403
483
|
}
|
|
404
484
|
export declare namespace SessionCreateParams {
|
|
485
|
+
/**
|
|
486
|
+
* Configuration for input audio noise reduction. This can be set to `null` to turn
|
|
487
|
+
* off. Noise reduction filters audio added to the input audio buffer before it is
|
|
488
|
+
* sent to VAD and the model. Filtering the audio can improve VAD and turn
|
|
489
|
+
* detection accuracy (reducing false positives) and model performance by improving
|
|
490
|
+
* perception of the input audio.
|
|
491
|
+
*/
|
|
492
|
+
interface InputAudioNoiseReduction {
|
|
493
|
+
/**
|
|
494
|
+
* Type of noise reduction. `near_field` is for close-talking microphones such as
|
|
495
|
+
* headphones, `far_field` is for far-field microphones such as laptop or
|
|
496
|
+
* conference room microphones.
|
|
497
|
+
*/
|
|
498
|
+
type?: 'near_field' | 'far_field';
|
|
499
|
+
}
|
|
405
500
|
/**
|
|
406
501
|
* Configuration for input audio transcription, defaults to off and can be set to
|
|
407
502
|
* `null` to turn off once on. Input audio transcription is not native to the
|
|
408
503
|
* model, since the model consumes audio directly. Transcription runs
|
|
409
504
|
* asynchronously through
|
|
410
|
-
* [
|
|
411
|
-
* and should be treated as
|
|
412
|
-
*
|
|
413
|
-
*
|
|
505
|
+
* [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
|
|
506
|
+
* and should be treated as guidance of input audio content rather than precisely
|
|
507
|
+
* what the model heard. The client can optionally set the language and prompt for
|
|
508
|
+
* transcription, these offer additional guidance to the transcription service.
|
|
414
509
|
*/
|
|
415
510
|
interface InputAudioTranscription {
|
|
416
511
|
/**
|
|
@@ -420,15 +515,16 @@ export declare namespace SessionCreateParams {
|
|
|
420
515
|
*/
|
|
421
516
|
language?: string;
|
|
422
517
|
/**
|
|
423
|
-
* The model to use for transcription,
|
|
424
|
-
*
|
|
518
|
+
* The model to use for transcription, current options are `gpt-4o-transcribe`,
|
|
519
|
+
* `gpt-4o-mini-transcribe`, and `whisper-1`.
|
|
425
520
|
*/
|
|
426
521
|
model?: string;
|
|
427
522
|
/**
|
|
428
523
|
* An optional text to guide the model's style or continue a previous audio
|
|
429
|
-
* segment.
|
|
430
|
-
* [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
|
|
431
|
-
*
|
|
524
|
+
* segment. For `whisper-1`, the
|
|
525
|
+
* [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
|
|
526
|
+
* For `gpt-4o-transcribe` models, the prompt is a free text string, for example
|
|
527
|
+
* "expect words related to technology".
|
|
432
528
|
*/
|
|
433
529
|
prompt?: string;
|
|
434
530
|
}
|
|
@@ -452,43 +548,56 @@ export declare namespace SessionCreateParams {
|
|
|
452
548
|
type?: 'function';
|
|
453
549
|
}
|
|
454
550
|
/**
|
|
455
|
-
* Configuration for turn detection
|
|
456
|
-
*
|
|
457
|
-
*
|
|
551
|
+
* Configuration for turn detection, either Server VAD or Semantic VAD. This can be
|
|
552
|
+
* set to `null` to turn off, in which case the client must manually trigger model
|
|
553
|
+
* response. Server VAD means that the model will detect the start and end of
|
|
554
|
+
* speech based on audio volume and respond at the end of user speech. Semantic VAD
|
|
555
|
+
* is more advanced and uses a turn detection model (in conjunction with VAD) to
|
|
556
|
+
* semantically estimate whether the user has finished speaking, then dynamically
|
|
557
|
+
* sets a timeout based on this probability. For example, if user audio trails off
|
|
558
|
+
* with "uhhm", the model will score a low probability of turn end and wait longer
|
|
559
|
+
* for the user to continue speaking. This can be useful for more natural
|
|
560
|
+
* conversations, but may have a higher latency.
|
|
458
561
|
*/
|
|
459
562
|
interface TurnDetection {
|
|
460
563
|
/**
|
|
461
564
|
* Whether or not to automatically generate a response when a VAD stop event
|
|
462
|
-
* occurs.
|
|
565
|
+
* occurs.
|
|
463
566
|
*/
|
|
464
567
|
create_response?: boolean;
|
|
568
|
+
/**
|
|
569
|
+
* Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
|
|
570
|
+
* will wait longer for the user to continue speaking, `high` will respond more
|
|
571
|
+
* quickly. `auto` is the default and is equivalent to `medium`.
|
|
572
|
+
*/
|
|
573
|
+
eagerness?: 'low' | 'medium' | 'high' | 'auto';
|
|
465
574
|
/**
|
|
466
575
|
* Whether or not to automatically interrupt any ongoing response with output to
|
|
467
576
|
* the default conversation (i.e. `conversation` of `auto`) when a VAD start event
|
|
468
|
-
* occurs.
|
|
577
|
+
* occurs.
|
|
469
578
|
*/
|
|
470
579
|
interrupt_response?: boolean;
|
|
471
580
|
/**
|
|
472
|
-
* Amount of audio to include before the VAD
|
|
473
|
-
* Defaults to 300ms.
|
|
581
|
+
* Used only for `server_vad` mode. Amount of audio to include before the VAD
|
|
582
|
+
* detected speech (in milliseconds). Defaults to 300ms.
|
|
474
583
|
*/
|
|
475
584
|
prefix_padding_ms?: number;
|
|
476
585
|
/**
|
|
477
|
-
* Duration of silence to detect speech stop (in
|
|
478
|
-
* With shorter values the model will respond
|
|
479
|
-
* short pauses from the user.
|
|
586
|
+
* Used only for `server_vad` mode. Duration of silence to detect speech stop (in
|
|
587
|
+
* milliseconds). Defaults to 500ms. With shorter values the model will respond
|
|
588
|
+
* more quickly, but may jump in on short pauses from the user.
|
|
480
589
|
*/
|
|
481
590
|
silence_duration_ms?: number;
|
|
482
591
|
/**
|
|
483
|
-
* Activation threshold for VAD (0.0 to 1.0), this
|
|
484
|
-
* threshold will require louder audio to activate the
|
|
485
|
-
* perform better in noisy environments.
|
|
592
|
+
* Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
|
|
593
|
+
* defaults to 0.5. A higher threshold will require louder audio to activate the
|
|
594
|
+
* model, and thus might perform better in noisy environments.
|
|
486
595
|
*/
|
|
487
596
|
threshold?: number;
|
|
488
597
|
/**
|
|
489
|
-
* Type of turn detection
|
|
598
|
+
* Type of turn detection.
|
|
490
599
|
*/
|
|
491
|
-
type?:
|
|
600
|
+
type?: 'server_vad' | 'semantic_vad';
|
|
492
601
|
}
|
|
493
602
|
}
|
|
494
603
|
export declare namespace Sessions {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"sessions.d.ts","sourceRoot":"","sources":["../../../src/resources/beta/realtime/sessions.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAChD,OAAO,KAAK,IAAI,MAAM,eAAe,CAAC;AAEtC,qBAAa,QAAS,SAAQ,WAAW;IACvC;;;;;;;;OAQG;IACH,MAAM,CAAC,IAAI,EAAE,mBAAmB,EAAE,OAAO,CAAC,EAAE,IAAI,CAAC,cAAc,GAAG,IAAI,CAAC,UAAU,CAAC,qBAAqB,CAAC;CAOzG;AAED;;GAEG;AACH,MAAM,WAAW,OAAO;IACtB;;OAEG;IACH,EAAE,CAAC,EAAE,MAAM,CAAC;IAEZ;;;;OAIG;IACH,kBAAkB,CAAC,EAAE,OAAO,GAAG,WAAW,GAAG,WAAW,CAAC;IAEzD;;;;;;OAMG;IACH,yBAAyB,CAAC,EAAE,OAAO,CAAC,uBAAuB,CAAC;IAE5D;;;;;;;;;;;;OAYG;IACH,YAAY,CAAC,EAAE,MAAM,CAAC;IAEtB;;;;OAIG;IACH,0BAA0B,CAAC,EAAE,MAAM,GAAG,KAAK,CAAC;IAE5C;;;OAGG;IACH,UAAU,CAAC,EAAE,KAAK,CAAC,MAAM,GAAG,OAAO,CAAC,CAAC;IAErC;;OAEG;IACH,KAAK,CAAC,EACF,
|
|
1
|
+
{"version":3,"file":"sessions.d.ts","sourceRoot":"","sources":["../../../src/resources/beta/realtime/sessions.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAChD,OAAO,KAAK,IAAI,MAAM,eAAe,CAAC;AAEtC,qBAAa,QAAS,SAAQ,WAAW;IACvC;;;;;;;;OAQG;IACH,MAAM,CAAC,IAAI,EAAE,mBAAmB,EAAE,OAAO,CAAC,EAAE,IAAI,CAAC,cAAc,GAAG,IAAI,CAAC,UAAU,CAAC,qBAAqB,CAAC;CAOzG;AAED;;GAEG;AACH,MAAM,WAAW,OAAO;IACtB;;OAEG;IACH,EAAE,CAAC,EAAE,MAAM,CAAC;IAEZ;;;;OAIG;IACH,kBAAkB,CAAC,EAAE,OAAO,GAAG,WAAW,GAAG,WAAW,CAAC;IAEzD;;;;;;OAMG;IACH,2BAA2B,CAAC,EAAE,OAAO,CAAC,wBAAwB,CAAC;IAE/D;;;;;;;;;OASG;IACH,yBAAyB,CAAC,EAAE,OAAO,CAAC,uBAAuB,CAAC;IAE5D;;;;;;;;;;;;OAYG;IACH,YAAY,CAAC,EAAE,MAAM,CAAC;IAEtB;;;;OAIG;IACH,0BAA0B,CAAC,EAAE,MAAM,GAAG,KAAK,CAAC;IAE5C;;;OAGG;IACH,UAAU,CAAC,EAAE,KAAK,CAAC,MAAM,GAAG,OAAO,CAAC,CAAC;IAErC;;OAEG;IACH,KAAK,CAAC,EACF,yBAAyB,GACzB,oCAAoC,GACpC,oCAAoC,GACpC,8BAA8B,GAC9B,yCAAyC,CAAC;IAE9C;;;OAGG;IACH,mBAAmB,CAAC,EAAE,OAAO,GAAG,WAAW,GAAG,WAAW,CAAC;IAE1D;;;OAGG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IAErB;;;OAGG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IAErB;;OAEG;IACH,KAAK,CAAC,EAAE,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;IAE5B;;;;;;;;;;;OAWG;IACH,cAAc,CAAC,EAAE,OAAO,CAAC,aAAa,CAAC;IAEvC;;;;OAIG;IACH,KAAK,CAAC,EAAE,OAAO,GAAG,KAAK,GAAG,QAAQ,GAAG,OAAO,GAAG,MAAM,GAAG,MAAM,GAAG,SAAS,GAAG,OAAO,CAAC;CACtF;AAED,yBAAiB,OAAO,CAAC;IACvB;;;;;;OAMG;IACH,UAAiB,wBAAwB;QACvC;;;;WAIG;QACH,IAAI,CAAC,EAAE,YAAY,GAAG,WAAW,CAAC;KACnC;IAED;;;;;;;;;OASG;IACH,UAAiB,uBAAuB;QACtC;;;;WAIG;QACH,QAAQ,CAAC,EAAE,MAAM,CAAC;QAElB;;;WAGG;QACH,KAAK,CAAC,EAAE,MAAM,CAAC;QAEf;;;;;;WAMG;QACH,MAAM,CAAC,EAAE,MAAM,CAAC;KACjB;IAED,UAAiB,IAAI;QACnB;;;WAGG;QACH,WAAW,CAAC,EAAE,MAAM,CAAC;QAErB;;WAEG;QACH,IAAI,CAAC,EAAE,MAAM,CAAC;QAEd;;WAEG;QACH,UAAU,CAAC,EAAE,OAAO,CAAC;QAErB;;WAEG;QACH,IAAI,CAAC,EAAE,UAAU,CAAC;KACnB;IAED;;;;;;;;;;;OAWG;IACH,UAAiB,aAAa;QAC5B;;;WAGG;QACH,eAAe,CAAC,EAAE,OAAO,CAAC;QAE1B;;;;WAIG;QACH,SAAS,CAAC,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,GAAG,MAAM,CAAC;QAE/C;;;;WAIG;QACH,kBAAkB,CAAC,EAAE,OAAO,CAAC;QAE7B;;;WAGG;QACH,
iBAAiB,CAAC,EAAE,MAAM,CAAC;QAE3B;;;;WAIG;QACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;QAE7B;;;;WAIG;QACH,SAAS,CAAC,EAAE,MAAM,CAAC;QAEnB;;WAEG;QACH,IAAI,CAAC,EAAE,YAAY,GAAG,cAAc,CAAC;KACtC;CACF;AAED;;;GAGG;AACH,MAAM,WAAW,qBAAqB;IACpC;;OAEG;IACH,aAAa,EAAE,qBAAqB,CAAC,YAAY,CAAC;IAElD;;OAEG;IACH,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAE5B;;;;;;OAMG;IACH,yBAAyB,CAAC,EAAE,qBAAqB,CAAC,uBAAuB,CAAC;IAE1E;;;;;;;;;;;;OAYG;IACH,YAAY,CAAC,EAAE,MAAM,CAAC;IAEtB;;;;OAIG;IACH,0BAA0B,CAAC,EAAE,MAAM,GAAG,KAAK,CAAC;IAE5C;;;OAGG;IACH,UAAU,CAAC,EAAE,KAAK,CAAC,MAAM,GAAG,OAAO,CAAC,CAAC;IAErC;;OAEG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAE7B;;OAEG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IAErB;;;OAGG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IAErB;;OAEG;IACH,KAAK,CAAC,EAAE,KAAK,CAAC,qBAAqB,CAAC,IAAI,CAAC,CAAC;IAE1C;;;;OAIG;IACH,cAAc,CAAC,EAAE,qBAAqB,CAAC,aAAa,CAAC;IAErD;;;;OAIG;IACH,KAAK,CAAC,EAAE,OAAO,GAAG,KAAK,GAAG,QAAQ,GAAG,OAAO,GAAG,MAAM,GAAG,MAAM,GAAG,SAAS,GAAG,OAAO,CAAC;CACtF;AAED,yBAAiB,qBAAqB,CAAC;IACrC;;OAEG;IACH,UAAiB,YAAY;QAC3B;;;WAGG;QACH,UAAU,EAAE,MAAM,CAAC;QAEnB;;;;WAIG;QACH,KAAK,EAAE,MAAM,CAAC;KACf;IAED;;;;;;OAMG;IACH,UAAiB,uBAAuB;QACtC;;;WAGG;QACH,KAAK,CAAC,EAAE,MAAM,CAAC;KAChB;IAED,UAAiB,IAAI;QACnB;;;WAGG;QACH,WAAW,CAAC,EAAE,MAAM,CAAC;QAErB;;WAEG;QACH,IAAI,CAAC,EAAE,MAAM,CAAC;QAEd;;WAEG;QACH,UAAU,CAAC,EAAE,OAAO,CAAC;QAErB;;WAEG;QACH,IAAI,CAAC,EAAE,UAAU,CAAC;KACnB;IAED;;;;OAIG;IACH,UAAiB,aAAa;QAC5B;;;WAGG;QACH,iBAAiB,CAAC,EAAE,MAAM,CAAC;QAE3B;;;;WAIG;QACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;QAE7B;;;;WAIG;QACH,SAAS,CAAC,EAAE,MAAM,CAAC;QAEnB;;WAEG;QACH,IAAI,CAAC,EAAE,MAAM,CAAC;KACf;CACF;AAED,MAAM,WAAW,mBAAmB;IAClC;;;;OAIG;IACH,kBAAkB,CAAC,EAAE,OAAO,GAAG,WAAW,GAAG,WAAW,CAAC;IAEzD;;;;;;OAMG;IACH,2BAA2B,CAAC,EAAE,mBAAmB,CAAC,wBAAwB,CAAC;IAE3E;;;;;;;;;OASG;IACH,yBAAyB,CAAC,EAAE,mBAAmB,CAAC,uBAAuB,CAAC;IAExE;;;;;;;;;;;;OAYG;IACH,YAAY,CAAC,EAAE,MAAM,CAAC;IAEtB;;;;OAIG;IACH,0BAA0B,CAAC,EAAE,MAAM,GAAG,KAAK,CAAC;IAE5C;;;OAGG;IACH,UAAU,CAAC,EAAE,KAAK,CAAC,MAAM,GAAG,OAAO,CAAC,CAAC;IAErC;;OAEG;IACH,KAAK,CAAC,EACF,yBAAyB,GACzB,oCAAoC,GACpC,oCAAoC,GAC
pC,8BAA8B,GAC9B,yCAAyC,CAAC;IAE9C;;;OAGG;IACH,mBAAmB,CAAC,EAAE,OAAO,GAAG,WAAW,GAAG,WAAW,CAAC;IAE1D;;;OAGG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IAErB;;;OAGG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IAErB;;OAEG;IACH,KAAK,CAAC,EAAE,KAAK,CAAC,mBAAmB,CAAC,IAAI,CAAC,CAAC;IAExC;;;;;;;;;;;OAWG;IACH,cAAc,CAAC,EAAE,mBAAmB,CAAC,aAAa,CAAC;IAEnD;;;;OAIG;IACH,KAAK,CAAC,EAAE,OAAO,GAAG,KAAK,GAAG,QAAQ,GAAG,OAAO,GAAG,MAAM,GAAG,MAAM,GAAG,SAAS,GAAG,OAAO,CAAC;CACtF;AAED,yBAAiB,mBAAmB,CAAC;IACnC;;;;;;OAMG;IACH,UAAiB,wBAAwB;QACvC;;;;WAIG;QACH,IAAI,CAAC,EAAE,YAAY,GAAG,WAAW,CAAC;KACnC;IAED;;;;;;;;;OASG;IACH,UAAiB,uBAAuB;QACtC;;;;WAIG;QACH,QAAQ,CAAC,EAAE,MAAM,CAAC;QAElB;;;WAGG;QACH,KAAK,CAAC,EAAE,MAAM,CAAC;QAEf;;;;;;WAMG;QACH,MAAM,CAAC,EAAE,MAAM,CAAC;KACjB;IAED,UAAiB,IAAI;QACnB;;;WAGG;QACH,WAAW,CAAC,EAAE,MAAM,CAAC;QAErB;;WAEG;QACH,IAAI,CAAC,EAAE,MAAM,CAAC;QAEd;;WAEG;QACH,UAAU,CAAC,EAAE,OAAO,CAAC;QAErB;;WAEG;QACH,IAAI,CAAC,EAAE,UAAU,CAAC;KACnB;IAED;;;;;;;;;;;OAWG;IACH,UAAiB,aAAa;QAC5B;;;WAGG;QACH,eAAe,CAAC,EAAE,OAAO,CAAC;QAE1B;;;;WAIG;QACH,SAAS,CAAC,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,GAAG,MAAM,CAAC;QAE/C;;;;WAIG;QACH,kBAAkB,CAAC,EAAE,OAAO,CAAC;QAE7B;;;WAGG;QACH,iBAAiB,CAAC,EAAE,MAAM,CAAC;QAE3B;;;;WAIG;QACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;QAE7B;;;;WAIG;QACH,SAAS,CAAC,EAAE,MAAM,CAAC;QAEnB;;WAEG;QACH,IAAI,CAAC,EAAE,YAAY,GAAG,cAAc,CAAC;KACtC;CACF;AAED,MAAM,CAAC,OAAO,WAAW,QAAQ,CAAC;IAChC,OAAO,EACL,KAAK,OAAO,IAAI,OAAO,EACvB,KAAK,qBAAqB,IAAI,qBAAqB,EACnD,KAAK,mBAAmB,IAAI,mBAAmB,GAChD,CAAC;CACH"}
|