openai 4.87.4 → 4.89.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/CHANGELOG.md +29 -0
  2. package/helpers/audio.d.ts +12 -0
  3. package/helpers/audio.d.ts.map +1 -0
  4. package/helpers/audio.js +121 -0
  5. package/helpers/audio.js.map +1 -0
  6. package/helpers/audio.mjs +116 -0
  7. package/helpers/audio.mjs.map +1 -0
  8. package/index.d.mts +2 -0
  9. package/index.d.ts +2 -0
  10. package/index.d.ts.map +1 -1
  11. package/index.js.map +1 -1
  12. package/index.mjs.map +1 -1
  13. package/package.json +8 -29
  14. package/resources/audio/audio.d.ts +5 -4
  15. package/resources/audio/audio.d.ts.map +1 -1
  16. package/resources/audio/audio.js.map +1 -1
  17. package/resources/audio/audio.mjs.map +1 -1
  18. package/resources/audio/index.d.ts +1 -1
  19. package/resources/audio/index.d.ts.map +1 -1
  20. package/resources/audio/index.js.map +1 -1
  21. package/resources/audio/index.mjs.map +1 -1
  22. package/resources/audio/speech.d.ts +7 -2
  23. package/resources/audio/speech.d.ts.map +1 -1
  24. package/resources/audio/transcriptions.d.ts +172 -9
  25. package/resources/audio/transcriptions.d.ts.map +1 -1
  26. package/resources/audio/transcriptions.js.map +1 -1
  27. package/resources/audio/transcriptions.mjs.map +1 -1
  28. package/resources/audio/translations.d.ts +1 -1
  29. package/resources/audio/translations.d.ts.map +1 -1
  30. package/resources/beta/realtime/index.d.ts +1 -0
  31. package/resources/beta/realtime/index.d.ts.map +1 -1
  32. package/resources/beta/realtime/index.js +3 -1
  33. package/resources/beta/realtime/index.js.map +1 -1
  34. package/resources/beta/realtime/index.mjs +1 -0
  35. package/resources/beta/realtime/index.mjs.map +1 -1
  36. package/resources/beta/realtime/realtime.d.ts +383 -36
  37. package/resources/beta/realtime/realtime.d.ts.map +1 -1
  38. package/resources/beta/realtime/realtime.js +4 -0
  39. package/resources/beta/realtime/realtime.js.map +1 -1
  40. package/resources/beta/realtime/realtime.mjs +4 -0
  41. package/resources/beta/realtime/realtime.mjs.map +1 -1
  42. package/resources/beta/realtime/sessions.d.ts +169 -60
  43. package/resources/beta/realtime/sessions.d.ts.map +1 -1
  44. package/resources/beta/realtime/transcription-sessions.d.ts +262 -0
  45. package/resources/beta/realtime/transcription-sessions.d.ts.map +1 -0
  46. package/resources/beta/realtime/transcription-sessions.js +25 -0
  47. package/resources/beta/realtime/transcription-sessions.js.map +1 -0
  48. package/resources/beta/realtime/transcription-sessions.mjs +21 -0
  49. package/resources/beta/realtime/transcription-sessions.mjs.map +1 -0
  50. package/resources/chat/completions/completions.d.ts +1 -1
  51. package/resources/chat/completions/completions.d.ts.map +1 -1
  52. package/resources/responses/responses.d.ts +3 -3
  53. package/resources/responses/responses.d.ts.map +1 -1
  54. package/resources/shared.d.ts +3 -1
  55. package/resources/shared.d.ts.map +1 -1
  56. package/resources.d.ts +2 -0
  57. package/resources.d.ts.map +1 -0
  58. package/resources.js +18 -0
  59. package/resources.js.map +1 -0
  60. package/resources.mjs +2 -0
  61. package/resources.mjs.map +1 -0
  62. package/src/helpers/audio.ts +145 -0
  63. package/src/index.ts +2 -0
  64. package/src/resources/audio/audio.ts +15 -2
  65. package/src/resources/audio/index.ts +6 -0
  66. package/src/resources/audio/speech.ts +8 -2
  67. package/src/resources/audio/transcriptions.ts +215 -9
  68. package/src/resources/audio/translations.ts +1 -1
  69. package/src/resources/beta/realtime/index.ts +5 -0
  70. package/src/resources/beta/realtime/realtime.ts +465 -57
  71. package/src/resources/beta/realtime/sessions.ts +176 -60
  72. package/src/resources/beta/realtime/transcription-sessions.ts +308 -0
  73. package/src/resources/chat/completions/completions.ts +1 -1
  74. package/src/resources/responses/responses.ts +3 -3
  75. package/src/resources/shared.ts +22 -5
  76. package/src/resources.ts +1 -0
  77. package/src/version.ts +1 -1
  78. package/version.d.ts +1 -1
  79. package/version.js +1 -1
  80. package/version.mjs +1 -1
@@ -0,0 +1,308 @@
1
+ // File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
+
3
+ import { APIResource } from '../../../resource';
4
+ import * as Core from '../../../core';
5
+
6
+ export class TranscriptionSessions extends APIResource {
7
+ /**
8
+ * Create an ephemeral API token for use in client-side applications with the
9
+ * Realtime API specifically for realtime transcriptions. Can be configured with
10
+ * the same session parameters as the `transcription_session.update` client event.
11
+ *
12
+ * It responds with a session object, plus a `client_secret` key which contains a
13
+ * usable ephemeral API token that can be used to authenticate browser clients for
14
+ * the Realtime API.
15
+ */
16
+ create(
17
+ body: TranscriptionSessionCreateParams,
18
+ options?: Core.RequestOptions,
19
+ ): Core.APIPromise<TranscriptionSession> {
20
+ return this._client.post('/realtime/transcription_sessions', {
21
+ body,
22
+ ...options,
23
+ headers: { 'OpenAI-Beta': 'assistants=v2', ...options?.headers },
24
+ });
25
+ }
26
+ }
27
+
28
+ /**
29
+ * A new Realtime transcription session configuration.
30
+ *
31
+ * When a session is created on the server via REST API, the session object also
32
+ * contains an ephemeral key. Default TTL for keys is one minute. This property is
33
+ * not present when a session is updated via the WebSocket API.
34
+ */
35
+ export interface TranscriptionSession {
36
+ /**
37
+ * Ephemeral key returned by the API. Only present when the session is created on
38
+ * the server via REST API.
39
+ */
40
+ client_secret: TranscriptionSession.ClientSecret;
41
+
42
+ /**
43
+ * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
44
+ */
45
+ input_audio_format?: string;
46
+
47
+ /**
48
+ * Configuration of the transcription model.
49
+ */
50
+ input_audio_transcription?: TranscriptionSession.InputAudioTranscription;
51
+
52
+ /**
53
+ * The set of modalities the model can respond with. To disable audio, set this to
54
+ * ["text"].
55
+ */
56
+ modalities?: Array<'text' | 'audio'>;
57
+
58
+ /**
59
+ * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
60
+ * means that the model will detect the start and end of speech based on audio
61
+ * volume and respond at the end of user speech.
62
+ */
63
+ turn_detection?: TranscriptionSession.TurnDetection;
64
+ }
65
+
66
+ export namespace TranscriptionSession {
67
+ /**
68
+ * Ephemeral key returned by the API. Only present when the session is created on
69
+ * the server via REST API.
70
+ */
71
+ export interface ClientSecret {
72
+ /**
73
+ * Timestamp for when the token expires. Currently, all tokens expire after one
74
+ * minute.
75
+ */
76
+ expires_at: number;
77
+
78
+ /**
79
+ * Ephemeral key usable in client environments to authenticate connections to the
80
+ * Realtime API. Use this in client-side environments rather than a standard API
81
+ * token, which should only be used server-side.
82
+ */
83
+ value: string;
84
+ }
85
+
86
+ /**
87
+ * Configuration of the transcription model.
88
+ */
89
+ export interface InputAudioTranscription {
90
+ /**
91
+ * The language of the input audio. Supplying the input language in
92
+ * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
93
+ * format will improve accuracy and latency.
94
+ */
95
+ language?: string;
96
+
97
+ /**
98
+ * The model to use for transcription. Can be `gpt-4o-transcribe`,
99
+ * `gpt-4o-mini-transcribe`, or `whisper-1`.
100
+ */
101
+ model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1';
102
+
103
+ /**
104
+ * An optional text to guide the model's style or continue a previous audio
105
+ * segment. The
106
+ * [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
107
+ * should match the audio language.
108
+ */
109
+ prompt?: string;
110
+ }
111
+
112
+ /**
113
+ * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
114
+ * means that the model will detect the start and end of speech based on audio
115
+ * volume and respond at the end of user speech.
116
+ */
117
+ export interface TurnDetection {
118
+ /**
119
+ * Amount of audio to include before the VAD detected speech (in milliseconds).
120
+ * Defaults to 300ms.
121
+ */
122
+ prefix_padding_ms?: number;
123
+
124
+ /**
125
+ * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
126
+ * With shorter values the model will respond more quickly, but may jump in on
127
+ * short pauses from the user.
128
+ */
129
+ silence_duration_ms?: number;
130
+
131
+ /**
132
+ * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
133
+ * threshold will require louder audio to activate the model, and thus might
134
+ * perform better in noisy environments.
135
+ */
136
+ threshold?: number;
137
+
138
+ /**
139
+ * Type of turn detection, only `server_vad` is currently supported.
140
+ */
141
+ type?: string;
142
+ }
143
+ }
144
+
145
+ export interface TranscriptionSessionCreateParams {
146
+ /**
147
+ * The set of items to include in the transcription. Current available items are:
148
+ *
149
+ * - `item.input_audio_transcription.logprobs`
150
+ */
151
+ include?: Array<string>;
152
+
153
+ /**
154
+ * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
155
+ * `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
156
+ * (mono), and little-endian byte order.
157
+ */
158
+ input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
159
+
160
+ /**
161
+ * Configuration for input audio noise reduction. This can be set to `null` to turn
162
+ * off. Noise reduction filters audio added to the input audio buffer before it is
163
+ * sent to VAD and the model. Filtering the audio can improve VAD and turn
164
+ * detection accuracy (reducing false positives) and model performance by improving
165
+ * perception of the input audio.
166
+ */
167
+ input_audio_noise_reduction?: TranscriptionSessionCreateParams.InputAudioNoiseReduction;
168
+
169
+ /**
170
+ * Configuration for input audio transcription. The client can optionally set the
171
+ * language and prompt for transcription, these offer additional guidance to the
172
+ * transcription service.
173
+ */
174
+ input_audio_transcription?: TranscriptionSessionCreateParams.InputAudioTranscription;
175
+
176
+ /**
177
+ * The set of modalities the model can respond with. To disable audio, set this to
178
+ * ["text"].
179
+ */
180
+ modalities?: Array<'text' | 'audio'>;
181
+
182
+ /**
183
+ * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
184
+ * set to `null` to turn off, in which case the client must manually trigger model
185
+ * response. Server VAD means that the model will detect the start and end of
186
+ * speech based on audio volume and respond at the end of user speech. Semantic VAD
187
+ * is more advanced and uses a turn detection model (in conjunction with VAD) to
188
+ * semantically estimate whether the user has finished speaking, then dynamically
189
+ * sets a timeout based on this probability. For example, if user audio trails off
190
+ * with "uhhm", the model will score a low probability of turn end and wait longer
191
+ * for the user to continue speaking. This can be useful for more natural
192
+ * conversations, but may have a higher latency.
193
+ */
194
+ turn_detection?: TranscriptionSessionCreateParams.TurnDetection;
195
+ }
196
+
197
+ export namespace TranscriptionSessionCreateParams {
198
+ /**
199
+ * Configuration for input audio noise reduction. This can be set to `null` to turn
200
+ * off. Noise reduction filters audio added to the input audio buffer before it is
201
+ * sent to VAD and the model. Filtering the audio can improve VAD and turn
202
+ * detection accuracy (reducing false positives) and model performance by improving
203
+ * perception of the input audio.
204
+ */
205
+ export interface InputAudioNoiseReduction {
206
+ /**
207
+ * Type of noise reduction. `near_field` is for close-talking microphones such as
208
+ * headphones, `far_field` is for far-field microphones such as laptop or
209
+ * conference room microphones.
210
+ */
211
+ type?: 'near_field' | 'far_field';
212
+ }
213
+
214
+ /**
215
+ * Configuration for input audio transcription. The client can optionally set the
216
+ * language and prompt for transcription, these offer additional guidance to the
217
+ * transcription service.
218
+ */
219
+ export interface InputAudioTranscription {
220
+ /**
221
+ * The language of the input audio. Supplying the input language in
222
+ * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
223
+ * format will improve accuracy and latency.
224
+ */
225
+ language?: string;
226
+
227
+ /**
228
+ * The model to use for transcription, current options are `gpt-4o-transcribe`,
229
+ * `gpt-4o-mini-transcribe`, and `whisper-1`.
230
+ */
231
+ model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1';
232
+
233
+ /**
234
+ * An optional text to guide the model's style or continue a previous audio
235
+ * segment. For `whisper-1`, the
236
+ * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
237
+ * For `gpt-4o-transcribe` models, the prompt is a free text string, for example
238
+ * "expect words related to technology".
239
+ */
240
+ prompt?: string;
241
+ }
242
+
243
+ /**
244
+ * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
245
+ * set to `null` to turn off, in which case the client must manually trigger model
246
+ * response. Server VAD means that the model will detect the start and end of
247
+ * speech based on audio volume and respond at the end of user speech. Semantic VAD
248
+ * is more advanced and uses a turn detection model (in conjunction with VAD) to
249
+ * semantically estimate whether the user has finished speaking, then dynamically
250
+ * sets a timeout based on this probability. For example, if user audio trails off
251
+ * with "uhhm", the model will score a low probability of turn end and wait longer
252
+ * for the user to continue speaking. This can be useful for more natural
253
+ * conversations, but may have a higher latency.
254
+ */
255
+ export interface TurnDetection {
256
+ /**
257
+ * Whether or not to automatically generate a response when a VAD stop event
258
+ * occurs.
259
+ */
260
+ create_response?: boolean;
261
+
262
+ /**
263
+ * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
264
+ * will wait longer for the user to continue speaking, `high` will respond more
265
+ * quickly. `auto` is the default and is equivalent to `medium`.
266
+ */
267
+ eagerness?: 'low' | 'medium' | 'high' | 'auto';
268
+
269
+ /**
270
+ * Whether or not to automatically interrupt any ongoing response with output to
271
+ * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
272
+ * occurs.
273
+ */
274
+ interrupt_response?: boolean;
275
+
276
+ /**
277
+ * Used only for `server_vad` mode. Amount of audio to include before the VAD
278
+ * detected speech (in milliseconds). Defaults to 300ms.
279
+ */
280
+ prefix_padding_ms?: number;
281
+
282
+ /**
283
+ * Used only for `server_vad` mode. Duration of silence to detect speech stop (in
284
+ * milliseconds). Defaults to 500ms. With shorter values the model will respond
285
+ * more quickly, but may jump in on short pauses from the user.
286
+ */
287
+ silence_duration_ms?: number;
288
+
289
+ /**
290
+ * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
291
+ * defaults to 0.5. A higher threshold will require louder audio to activate the
292
+ * model, and thus might perform better in noisy environments.
293
+ */
294
+ threshold?: number;
295
+
296
+ /**
297
+ * Type of turn detection.
298
+ */
299
+ type?: 'server_vad' | 'semantic_vad';
300
+ }
301
+ }
302
+
303
+ export declare namespace TranscriptionSessions {
304
+ export {
305
+ type TranscriptionSession as TranscriptionSession,
306
+ type TranscriptionSessionCreateParams as TranscriptionSessionCreateParams,
307
+ };
308
+ }
@@ -383,7 +383,7 @@ export interface ChatCompletionChunk {
383
383
  * **NOTE:** If the stream is interrupted or cancelled, you may not receive the
384
384
  * final usage chunk which contains the total token usage for the request.
385
385
  */
386
- usage?: CompletionsAPI.CompletionUsage;
386
+ usage?: CompletionsAPI.CompletionUsage | null;
387
387
  }
388
388
 
389
389
  export namespace ChatCompletionChunk {
@@ -327,7 +327,7 @@ export interface Response {
327
327
  * [model guide](https://platform.openai.com/docs/models) to browse and compare
328
328
  * available models.
329
329
  */
330
- model: (string & {}) | Shared.ChatModel;
330
+ model: Shared.ResponsesModel;
331
331
 
332
332
  /**
333
333
  * The object type of this resource - always set to `response`.
@@ -1481,7 +1481,7 @@ export interface ResponseFunctionToolCall {
1481
1481
  */
1482
1482
  export interface ResponseFunctionToolCallItem extends ResponseFunctionToolCall {
1483
1483
  /**
1484
- * The unique ID of the function call tool output.
1484
+ * The unique ID of the function tool call.
1485
1485
  */
1486
1486
  id: string;
1487
1487
  }
@@ -2679,7 +2679,7 @@ export interface ResponseCreateParamsBase {
2679
2679
  * [model guide](https://platform.openai.com/docs/models) to browse and compare
2680
2680
  * available models.
2681
2681
  */
2682
- model: (string & {}) | Shared.ChatModel;
2682
+ model: Shared.ResponsesModel;
2683
2683
 
2684
2684
  /**
2685
2685
  * Specify additional output data to include in the model response. Currently
@@ -1,5 +1,15 @@
1
1
  // File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
2
 
3
+ export type AllModels =
4
+ | string
5
+ | ChatModel
6
+ | string
7
+ | ChatModel
8
+ | 'o1-pro'
9
+ | 'o1-pro-2025-03-19'
10
+ | 'computer-use-preview'
11
+ | 'computer-use-preview-2025-03-11';
12
+
3
13
  export type ChatModel =
4
14
  | 'o3-mini'
5
15
  | 'o3-mini-2025-01-31'
@@ -9,11 +19,6 @@ export type ChatModel =
9
19
  | 'o1-preview-2024-09-12'
10
20
  | 'o1-mini'
11
21
  | 'o1-mini-2024-09-12'
12
- | 'computer-use-preview'
13
- | 'computer-use-preview-2025-02-04'
14
- | 'computer-use-preview-2025-03-11'
15
- | 'gpt-4.5-preview'
16
- | 'gpt-4.5-preview-2025-02-27'
17
22
  | 'gpt-4o'
18
23
  | 'gpt-4o-2024-11-20'
19
24
  | 'gpt-4o-2024-08-06'
@@ -23,6 +28,10 @@ export type ChatModel =
23
28
  | 'gpt-4o-audio-preview-2024-12-17'
24
29
  | 'gpt-4o-mini-audio-preview'
25
30
  | 'gpt-4o-mini-audio-preview-2024-12-17'
31
+ | 'gpt-4o-search-preview'
32
+ | 'gpt-4o-mini-search-preview'
33
+ | 'gpt-4o-search-preview-2025-03-11'
34
+ | 'gpt-4o-mini-search-preview-2025-03-11'
26
35
  | 'chatgpt-4o-latest'
27
36
  | 'gpt-4o-mini'
28
37
  | 'gpt-4o-mini-2024-07-18'
@@ -265,3 +274,11 @@ export interface ResponseFormatText {
265
274
  */
266
275
  type: 'text';
267
276
  }
277
+
278
+ export type ResponsesModel =
279
+ | (string & {})
280
+ | ChatModel
281
+ | 'o1-pro'
282
+ | 'o1-pro-2025-03-19'
283
+ | 'computer-use-preview'
284
+ | 'computer-use-preview-2025-03-11';
@@ -0,0 +1 @@
1
+ export * from './resources/index';
package/src/version.ts CHANGED
@@ -1 +1 @@
1
- export const VERSION = '4.87.4'; // x-release-please-version
1
+ export const VERSION = '4.89.0'; // x-release-please-version
package/version.d.ts CHANGED
@@ -1,2 +1,2 @@
1
- export declare const VERSION = "4.87.4";
1
+ export declare const VERSION = "4.89.0";
2
2
  //# sourceMappingURL=version.d.ts.map
package/version.js CHANGED
@@ -1,5 +1,5 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.VERSION = void 0;
4
- exports.VERSION = '4.87.4'; // x-release-please-version
4
+ exports.VERSION = '4.89.0'; // x-release-please-version
5
5
  //# sourceMappingURL=version.js.map
package/version.mjs CHANGED
@@ -1,2 +1,2 @@
1
- export const VERSION = '4.87.4'; // x-release-please-version
1
+ export const VERSION = '4.89.0'; // x-release-please-version
2
2
  //# sourceMappingURL=version.mjs.map