openai 4.87.4 → 4.89.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/CHANGELOG.md +29 -0
  2. package/helpers/audio.d.ts +12 -0
  3. package/helpers/audio.d.ts.map +1 -0
  4. package/helpers/audio.js +121 -0
  5. package/helpers/audio.js.map +1 -0
  6. package/helpers/audio.mjs +116 -0
  7. package/helpers/audio.mjs.map +1 -0
  8. package/index.d.mts +2 -0
  9. package/index.d.ts +2 -0
  10. package/index.d.ts.map +1 -1
  11. package/index.js.map +1 -1
  12. package/index.mjs.map +1 -1
  13. package/package.json +8 -29
  14. package/resources/audio/audio.d.ts +5 -4
  15. package/resources/audio/audio.d.ts.map +1 -1
  16. package/resources/audio/audio.js.map +1 -1
  17. package/resources/audio/audio.mjs.map +1 -1
  18. package/resources/audio/index.d.ts +1 -1
  19. package/resources/audio/index.d.ts.map +1 -1
  20. package/resources/audio/index.js.map +1 -1
  21. package/resources/audio/index.mjs.map +1 -1
  22. package/resources/audio/speech.d.ts +7 -2
  23. package/resources/audio/speech.d.ts.map +1 -1
  24. package/resources/audio/transcriptions.d.ts +172 -9
  25. package/resources/audio/transcriptions.d.ts.map +1 -1
  26. package/resources/audio/transcriptions.js.map +1 -1
  27. package/resources/audio/transcriptions.mjs.map +1 -1
  28. package/resources/audio/translations.d.ts +1 -1
  29. package/resources/audio/translations.d.ts.map +1 -1
  30. package/resources/beta/realtime/index.d.ts +1 -0
  31. package/resources/beta/realtime/index.d.ts.map +1 -1
  32. package/resources/beta/realtime/index.js +3 -1
  33. package/resources/beta/realtime/index.js.map +1 -1
  34. package/resources/beta/realtime/index.mjs +1 -0
  35. package/resources/beta/realtime/index.mjs.map +1 -1
  36. package/resources/beta/realtime/realtime.d.ts +383 -36
  37. package/resources/beta/realtime/realtime.d.ts.map +1 -1
  38. package/resources/beta/realtime/realtime.js +4 -0
  39. package/resources/beta/realtime/realtime.js.map +1 -1
  40. package/resources/beta/realtime/realtime.mjs +4 -0
  41. package/resources/beta/realtime/realtime.mjs.map +1 -1
  42. package/resources/beta/realtime/sessions.d.ts +169 -60
  43. package/resources/beta/realtime/sessions.d.ts.map +1 -1
  44. package/resources/beta/realtime/transcription-sessions.d.ts +262 -0
  45. package/resources/beta/realtime/transcription-sessions.d.ts.map +1 -0
  46. package/resources/beta/realtime/transcription-sessions.js +25 -0
  47. package/resources/beta/realtime/transcription-sessions.js.map +1 -0
  48. package/resources/beta/realtime/transcription-sessions.mjs +21 -0
  49. package/resources/beta/realtime/transcription-sessions.mjs.map +1 -0
  50. package/resources/chat/completions/completions.d.ts +1 -1
  51. package/resources/chat/completions/completions.d.ts.map +1 -1
  52. package/resources/responses/responses.d.ts +3 -3
  53. package/resources/responses/responses.d.ts.map +1 -1
  54. package/resources/shared.d.ts +3 -1
  55. package/resources/shared.d.ts.map +1 -1
  56. package/resources.d.ts +2 -0
  57. package/resources.d.ts.map +1 -0
  58. package/resources.js +18 -0
  59. package/resources.js.map +1 -0
  60. package/resources.mjs +2 -0
  61. package/resources.mjs.map +1 -0
  62. package/src/helpers/audio.ts +145 -0
  63. package/src/index.ts +2 -0
  64. package/src/resources/audio/audio.ts +15 -2
  65. package/src/resources/audio/index.ts +6 -0
  66. package/src/resources/audio/speech.ts +8 -2
  67. package/src/resources/audio/transcriptions.ts +215 -9
  68. package/src/resources/audio/translations.ts +1 -1
  69. package/src/resources/beta/realtime/index.ts +5 -0
  70. package/src/resources/beta/realtime/realtime.ts +465 -57
  71. package/src/resources/beta/realtime/sessions.ts +176 -60
  72. package/src/resources/beta/realtime/transcription-sessions.ts +308 -0
  73. package/src/resources/chat/completions/completions.ts +1 -1
  74. package/src/resources/responses/responses.ts +3 -3
  75. package/src/resources/shared.ts +22 -5
  76. package/src/resources.ts +1 -0
  77. package/src/version.ts +1 -1
  78. package/version.d.ts +1 -1
  79. package/version.js +1 -1
  80. package/version.mjs +1 -1
@@ -27,7 +27,7 @@ export class Sessions extends APIResource {
27
27
  */
28
28
  export interface Session {
29
29
  /**
30
- * Unique identifier for the session object.
30
+ * Unique identifier for the session that looks like `sess_1234567890abcdef`.
31
31
  */
32
32
  id?: string;
33
33
 
@@ -38,12 +38,24 @@ export interface Session {
38
38
  */
39
39
  input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
40
40
 
41
+ /**
42
+ * Configuration for input audio noise reduction. This can be set to `null` to turn
43
+ * off. Noise reduction filters audio added to the input audio buffer before it is
44
+ * sent to VAD and the model. Filtering the audio can improve VAD and turn
45
+ * detection accuracy (reducing false positives) and model performance by improving
46
+ * perception of the input audio.
47
+ */
48
+ input_audio_noise_reduction?: Session.InputAudioNoiseReduction;
49
+
41
50
  /**
42
51
  * Configuration for input audio transcription, defaults to off and can be set to
43
52
  * `null` to turn off once on. Input audio transcription is not native to the
44
53
  * model, since the model consumes audio directly. Transcription runs
45
- * asynchronously through Whisper and should be treated as rough guidance rather
46
- * than the representation understood by the model.
54
+ * asynchronously through
55
+ * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
56
+ * and should be treated as guidance of input audio content rather than precisely
57
+ * what the model heard. The client can optionally set the language and prompt for
58
+ * transcription, these offer additional guidance to the transcription service.
47
59
  */
48
60
  input_audio_transcription?: Session.InputAudioTranscription;
49
61
 
@@ -79,7 +91,6 @@ export interface Session {
79
91
  * The Realtime model used for this session.
80
92
  */
81
93
  model?:
82
- | (string & {})
83
94
  | 'gpt-4o-realtime-preview'
84
95
  | 'gpt-4o-realtime-preview-2024-10-01'
85
96
  | 'gpt-4o-realtime-preview-2024-12-17'
@@ -93,7 +104,8 @@ export interface Session {
93
104
  output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
94
105
 
95
106
  /**
96
- * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
107
+ * Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a
108
+ * temperature of 0.8 is highly recommended for best performance.
97
109
  */
98
110
  temperature?: number;
99
111
 
@@ -109,11 +121,18 @@ export interface Session {
109
121
  tools?: Array<Session.Tool>;
110
122
 
111
123
  /**
112
- * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
113
- * means that the model will detect the start and end of speech based on audio
114
- * volume and respond at the end of user speech.
124
+ * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
125
+ * set to `null` to turn off, in which case the client must manually trigger model
126
+ * response. Server VAD means that the model will detect the start and end of
127
+ * speech based on audio volume and respond at the end of user speech. Semantic VAD
128
+ * is more advanced and uses a turn detection model (in conjunction with VAD) to
129
+ * semantically estimate whether the user has finished speaking, then dynamically
130
+ * sets a timeout based on this probability. For example, if user audio trails off
131
+ * with "uhhm", the model will score a low probability of turn end and wait longer
132
+ * for the user to continue speaking. This can be useful for more natural
133
+ * conversations, but may have a higher latency.
115
134
  */
116
- turn_detection?: Session.TurnDetection | null;
135
+ turn_detection?: Session.TurnDetection;
117
136
 
118
137
  /**
119
138
  * The voice the model uses to respond. Voice cannot be changed during the session
@@ -124,19 +143,54 @@ export interface Session {
124
143
  }
125
144
 
126
145
  export namespace Session {
146
+ /**
147
+ * Configuration for input audio noise reduction. This can be set to `null` to turn
148
+ * off. Noise reduction filters audio added to the input audio buffer before it is
149
+ * sent to VAD and the model. Filtering the audio can improve VAD and turn
150
+ * detection accuracy (reducing false positives) and model performance by improving
151
+ * perception of the input audio.
152
+ */
153
+ export interface InputAudioNoiseReduction {
154
+ /**
155
+ * Type of noise reduction. `near_field` is for close-talking microphones such as
156
+ * headphones, `far_field` is for far-field microphones such as laptop or
157
+ * conference room microphones.
158
+ */
159
+ type?: 'near_field' | 'far_field';
160
+ }
161
+
127
162
  /**
128
163
  * Configuration for input audio transcription, defaults to off and can be set to
129
164
  * `null` to turn off once on. Input audio transcription is not native to the
130
165
  * model, since the model consumes audio directly. Transcription runs
131
- * asynchronously through Whisper and should be treated as rough guidance rather
132
- * than the representation understood by the model.
166
+ * asynchronously through
167
+ * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
168
+ * and should be treated as guidance of input audio content rather than precisely
169
+ * what the model heard. The client can optionally set the language and prompt for
170
+ * transcription, these offer additional guidance to the transcription service.
133
171
  */
134
172
  export interface InputAudioTranscription {
135
173
  /**
136
- * The model to use for transcription, `whisper-1` is the only currently supported
137
- * model.
174
+ * The language of the input audio. Supplying the input language in
175
+ * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
176
+ * format will improve accuracy and latency.
177
+ */
178
+ language?: string;
179
+
180
+ /**
181
+ * The model to use for transcription, current options are `gpt-4o-transcribe`,
182
+ * `gpt-4o-mini-transcribe`, and `whisper-1`.
138
183
  */
139
184
  model?: string;
185
+
186
+ /**
187
+ * An optional text to guide the model's style or continue a previous audio
188
+ * segment. For `whisper-1`, the
189
+ * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
190
+ * For `gpt-4o-transcribe` models, the prompt is a free text string, for example
191
+ * "expect words related to technology".
192
+ */
193
+ prompt?: string;
140
194
  }
141
195
 
142
196
  export interface Tool {
@@ -163,48 +217,62 @@ export namespace Session {
163
217
  }
164
218
 
165
219
  /**
166
- * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
167
- * means that the model will detect the start and end of speech based on audio
168
- * volume and respond at the end of user speech.
220
+ * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
221
+ * set to `null` to turn off, in which case the client must manually trigger model
222
+ * response. Server VAD means that the model will detect the start and end of
223
+ * speech based on audio volume and respond at the end of user speech. Semantic VAD
224
+ * is more advanced and uses a turn detection model (in conjunction with VAD) to
225
+ * semantically estimate whether the user has finished speaking, then dynamically
226
+ * sets a timeout based on this probability. For example, if user audio trails off
227
+ * with "uhhm", the model will score a low probability of turn end and wait longer
228
+ * for the user to continue speaking. This can be useful for more natural
229
+ * conversations, but may have a higher latency.
169
230
  */
170
231
  export interface TurnDetection {
171
232
  /**
172
233
  * Whether or not to automatically generate a response when a VAD stop event
173
- * occurs. `true` by default.
234
+ * occurs.
174
235
  */
175
236
  create_response?: boolean;
176
237
 
238
+ /**
239
+ * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
240
+ * will wait longer for the user to continue speaking, `high` will respond more
241
+ * quickly. `auto` is the default and is equivalent to `medium`.
242
+ */
243
+ eagerness?: 'low' | 'medium' | 'high' | 'auto';
244
+
177
245
  /**
178
246
  * Whether or not to automatically interrupt any ongoing response with output to
179
247
  * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
180
- * occurs. `true` by default.
248
+ * occurs.
181
249
  */
182
250
  interrupt_response?: boolean;
183
251
 
184
252
  /**
185
- * Amount of audio to include before the VAD detected speech (in milliseconds).
186
- * Defaults to 300ms.
253
+ * Used only for `server_vad` mode. Amount of audio to include before the VAD
254
+ * detected speech (in milliseconds). Defaults to 300ms.
187
255
  */
188
256
  prefix_padding_ms?: number;
189
257
 
190
258
  /**
191
- * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
192
- * With shorter values the model will respond more quickly, but may jump in on
193
- * short pauses from the user.
259
+ * Used only for `server_vad` mode. Duration of silence to detect speech stop (in
260
+ * milliseconds). Defaults to 500ms. With shorter values the model will respond
261
+ * more quickly, but may jump in on short pauses from the user.
194
262
  */
195
263
  silence_duration_ms?: number;
196
264
 
197
265
  /**
198
- * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
199
- * threshold will require louder audio to activate the model, and thus might
200
- * perform better in noisy environments.
266
+ * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
267
+ * defaults to 0.5. A higher threshold will require louder audio to activate the
268
+ * model, and thus might perform better in noisy environments.
201
269
  */
202
270
  threshold?: number;
203
271
 
204
272
  /**
205
- * Type of turn detection, only `server_vad` is currently supported.
273
+ * Type of turn detection.
206
274
  */
207
- type?: 'server_vad';
275
+ type?: 'server_vad' | 'semantic_vad';
208
276
  }
209
277
  }
210
278
 
@@ -394,15 +462,24 @@ export interface SessionCreateParams {
394
462
  */
395
463
  input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
396
464
 
465
+ /**
466
+ * Configuration for input audio noise reduction. This can be set to `null` to turn
467
+ * off. Noise reduction filters audio added to the input audio buffer before it is
468
+ * sent to VAD and the model. Filtering the audio can improve VAD and turn
469
+ * detection accuracy (reducing false positives) and model performance by improving
470
+ * perception of the input audio.
471
+ */
472
+ input_audio_noise_reduction?: SessionCreateParams.InputAudioNoiseReduction;
473
+
397
474
  /**
398
475
  * Configuration for input audio transcription, defaults to off and can be set to
399
476
  * `null` to turn off once on. Input audio transcription is not native to the
400
477
  * model, since the model consumes audio directly. Transcription runs
401
478
  * asynchronously through
402
- * [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
403
- * and should be treated as rough guidance rather than the representation
404
- * understood by the model. The client can optionally set the language and prompt
405
- * for transcription, these fields will be passed to the Whisper API.
479
+ * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
480
+ * and should be treated as guidance of input audio content rather than precisely
481
+ * what the model heard. The client can optionally set the language and prompt for
482
+ * transcription, these offer additional guidance to the transcription service.
406
483
  */
407
484
  input_audio_transcription?: SessionCreateParams.InputAudioTranscription;
408
485
 
@@ -451,7 +528,8 @@ export interface SessionCreateParams {
451
528
  output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
452
529
 
453
530
  /**
454
- * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
531
+ * Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a
532
+ * temperature of 0.8 is highly recommended for best performance.
455
533
  */
456
534
  temperature?: number;
457
535
 
@@ -467,9 +545,16 @@ export interface SessionCreateParams {
467
545
  tools?: Array<SessionCreateParams.Tool>;
468
546
 
469
547
  /**
470
- * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
471
- * means that the model will detect the start and end of speech based on audio
472
- * volume and respond at the end of user speech.
548
+ * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
549
+ * set to `null` to turn off, in which case the client must manually trigger model
550
+ * response. Server VAD means that the model will detect the start and end of
551
+ * speech based on audio volume and respond at the end of user speech. Semantic VAD
552
+ * is more advanced and uses a turn detection model (in conjunction with VAD) to
553
+ * semantically estimate whether the user has finished speaking, then dynamically
554
+ * sets a timeout based on this probability. For example, if user audio trails off
555
+ * with "uhhm", the model will score a low probability of turn end and wait longer
556
+ * for the user to continue speaking. This can be useful for more natural
557
+ * conversations, but may have a higher latency.
473
558
  */
474
559
  turn_detection?: SessionCreateParams.TurnDetection;
475
560
 
@@ -482,15 +567,31 @@ export interface SessionCreateParams {
482
567
  }
483
568
 
484
569
  export namespace SessionCreateParams {
570
+ /**
571
+ * Configuration for input audio noise reduction. This can be set to `null` to turn
572
+ * off. Noise reduction filters audio added to the input audio buffer before it is
573
+ * sent to VAD and the model. Filtering the audio can improve VAD and turn
574
+ * detection accuracy (reducing false positives) and model performance by improving
575
+ * perception of the input audio.
576
+ */
577
+ export interface InputAudioNoiseReduction {
578
+ /**
579
+ * Type of noise reduction. `near_field` is for close-talking microphones such as
580
+ * headphones, `far_field` is for far-field microphones such as laptop or
581
+ * conference room microphones.
582
+ */
583
+ type?: 'near_field' | 'far_field';
584
+ }
585
+
485
586
  /**
486
587
  * Configuration for input audio transcription, defaults to off and can be set to
487
588
  * `null` to turn off once on. Input audio transcription is not native to the
488
589
  * model, since the model consumes audio directly. Transcription runs
489
590
  * asynchronously through
490
- * [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
491
- * and should be treated as rough guidance rather than the representation
492
- * understood by the model. The client can optionally set the language and prompt
493
- * for transcription, these fields will be passed to the Whisper API.
591
+ * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
592
+ * and should be treated as guidance of input audio content rather than precisely
593
+ * what the model heard. The client can optionally set the language and prompt for
594
+ * transcription, these offer additional guidance to the transcription service.
494
595
  */
495
596
  export interface InputAudioTranscription {
496
597
  /**
@@ -501,16 +602,17 @@ export namespace SessionCreateParams {
501
602
  language?: string;
502
603
 
503
604
  /**
504
- * The model to use for transcription, `whisper-1` is the only currently supported
505
- * model.
605
+ * The model to use for transcription, current options are `gpt-4o-transcribe`,
606
+ * `gpt-4o-mini-transcribe`, and `whisper-1`.
506
607
  */
507
608
  model?: string;
508
609
 
509
610
  /**
510
611
  * An optional text to guide the model's style or continue a previous audio
511
- * segment. The
512
- * [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
513
- * should match the audio language.
612
+ * segment. For `whisper-1`, the
613
+ * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
614
+ * For `gpt-4o-transcribe` models, the prompt is a free text string, for example
615
+ * "expect words related to technology".
514
616
  */
515
617
  prompt?: string;
516
618
  }
@@ -539,48 +641,62 @@ export namespace SessionCreateParams {
539
641
  }
540
642
 
541
643
  /**
542
- * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
543
- * means that the model will detect the start and end of speech based on audio
544
- * volume and respond at the end of user speech.
644
+ * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
645
+ * set to `null` to turn off, in which case the client must manually trigger model
646
+ * response. Server VAD means that the model will detect the start and end of
647
+ * speech based on audio volume and respond at the end of user speech. Semantic VAD
648
+ * is more advanced and uses a turn detection model (in conjunction with VAD) to
649
+ * semantically estimate whether the user has finished speaking, then dynamically
650
+ * sets a timeout based on this probability. For example, if user audio trails off
651
+ * with "uhhm", the model will score a low probability of turn end and wait longer
652
+ * for the user to continue speaking. This can be useful for more natural
653
+ * conversations, but may have a higher latency.
545
654
  */
546
655
  export interface TurnDetection {
547
656
  /**
548
657
  * Whether or not to automatically generate a response when a VAD stop event
549
- * occurs. `true` by default.
658
+ * occurs.
550
659
  */
551
660
  create_response?: boolean;
552
661
 
662
+ /**
663
+ * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
664
+ * will wait longer for the user to continue speaking, `high` will respond more
665
+ * quickly. `auto` is the default and is equivalent to `medium`.
666
+ */
667
+ eagerness?: 'low' | 'medium' | 'high' | 'auto';
668
+
553
669
  /**
554
670
  * Whether or not to automatically interrupt any ongoing response with output to
555
671
  * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
556
- * occurs. `true` by default.
672
+ * occurs.
557
673
  */
558
674
  interrupt_response?: boolean;
559
675
 
560
676
  /**
561
- * Amount of audio to include before the VAD detected speech (in milliseconds).
562
- * Defaults to 300ms.
677
+ * Used only for `server_vad` mode. Amount of audio to include before the VAD
678
+ * detected speech (in milliseconds). Defaults to 300ms.
563
679
  */
564
680
  prefix_padding_ms?: number;
565
681
 
566
682
  /**
567
- * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
568
- * With shorter values the model will respond more quickly, but may jump in on
569
- * short pauses from the user.
683
+ * Used only for `server_vad` mode. Duration of silence to detect speech stop (in
684
+ * milliseconds). Defaults to 500ms. With shorter values the model will respond
685
+ * more quickly, but may jump in on short pauses from the user.
570
686
  */
571
687
  silence_duration_ms?: number;
572
688
 
573
689
  /**
574
- * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
575
- * threshold will require louder audio to activate the model, and thus might
576
- * perform better in noisy environments.
690
+ * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
691
+ * defaults to 0.5. A higher threshold will require louder audio to activate the
692
+ * model, and thus might perform better in noisy environments.
577
693
  */
578
694
  threshold?: number;
579
695
 
580
696
  /**
581
- * Type of turn detection, only `server_vad` is currently supported.
697
+ * Type of turn detection.
582
698
  */
583
- type?: string;
699
+ type?: 'server_vad' | 'semantic_vad';
584
700
  }
585
701
  }
586
702