openai 4.87.4 → 4.89.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/CHANGELOG.md +29 -0
  2. package/helpers/audio.d.ts +12 -0
  3. package/helpers/audio.d.ts.map +1 -0
  4. package/helpers/audio.js +121 -0
  5. package/helpers/audio.js.map +1 -0
  6. package/helpers/audio.mjs +116 -0
  7. package/helpers/audio.mjs.map +1 -0
  8. package/index.d.mts +2 -0
  9. package/index.d.ts +2 -0
  10. package/index.d.ts.map +1 -1
  11. package/index.js.map +1 -1
  12. package/index.mjs.map +1 -1
  13. package/package.json +8 -29
  14. package/resources/audio/audio.d.ts +5 -4
  15. package/resources/audio/audio.d.ts.map +1 -1
  16. package/resources/audio/audio.js.map +1 -1
  17. package/resources/audio/audio.mjs.map +1 -1
  18. package/resources/audio/index.d.ts +1 -1
  19. package/resources/audio/index.d.ts.map +1 -1
  20. package/resources/audio/index.js.map +1 -1
  21. package/resources/audio/index.mjs.map +1 -1
  22. package/resources/audio/speech.d.ts +7 -2
  23. package/resources/audio/speech.d.ts.map +1 -1
  24. package/resources/audio/transcriptions.d.ts +172 -9
  25. package/resources/audio/transcriptions.d.ts.map +1 -1
  26. package/resources/audio/transcriptions.js.map +1 -1
  27. package/resources/audio/transcriptions.mjs.map +1 -1
  28. package/resources/audio/translations.d.ts +1 -1
  29. package/resources/audio/translations.d.ts.map +1 -1
  30. package/resources/beta/realtime/index.d.ts +1 -0
  31. package/resources/beta/realtime/index.d.ts.map +1 -1
  32. package/resources/beta/realtime/index.js +3 -1
  33. package/resources/beta/realtime/index.js.map +1 -1
  34. package/resources/beta/realtime/index.mjs +1 -0
  35. package/resources/beta/realtime/index.mjs.map +1 -1
  36. package/resources/beta/realtime/realtime.d.ts +383 -36
  37. package/resources/beta/realtime/realtime.d.ts.map +1 -1
  38. package/resources/beta/realtime/realtime.js +4 -0
  39. package/resources/beta/realtime/realtime.js.map +1 -1
  40. package/resources/beta/realtime/realtime.mjs +4 -0
  41. package/resources/beta/realtime/realtime.mjs.map +1 -1
  42. package/resources/beta/realtime/sessions.d.ts +169 -60
  43. package/resources/beta/realtime/sessions.d.ts.map +1 -1
  44. package/resources/beta/realtime/transcription-sessions.d.ts +262 -0
  45. package/resources/beta/realtime/transcription-sessions.d.ts.map +1 -0
  46. package/resources/beta/realtime/transcription-sessions.js +25 -0
  47. package/resources/beta/realtime/transcription-sessions.js.map +1 -0
  48. package/resources/beta/realtime/transcription-sessions.mjs +21 -0
  49. package/resources/beta/realtime/transcription-sessions.mjs.map +1 -0
  50. package/resources/chat/completions/completions.d.ts +1 -1
  51. package/resources/chat/completions/completions.d.ts.map +1 -1
  52. package/resources/responses/responses.d.ts +3 -3
  53. package/resources/responses/responses.d.ts.map +1 -1
  54. package/resources/shared.d.ts +3 -1
  55. package/resources/shared.d.ts.map +1 -1
  56. package/resources.d.ts +2 -0
  57. package/resources.d.ts.map +1 -0
  58. package/resources.js +18 -0
  59. package/resources.js.map +1 -0
  60. package/resources.mjs +2 -0
  61. package/resources.mjs.map +1 -0
  62. package/src/helpers/audio.ts +145 -0
  63. package/src/index.ts +2 -0
  64. package/src/resources/audio/audio.ts +15 -2
  65. package/src/resources/audio/index.ts +6 -0
  66. package/src/resources/audio/speech.ts +8 -2
  67. package/src/resources/audio/transcriptions.ts +215 -9
  68. package/src/resources/audio/translations.ts +1 -1
  69. package/src/resources/beta/realtime/index.ts +5 -0
  70. package/src/resources/beta/realtime/realtime.ts +465 -57
  71. package/src/resources/beta/realtime/sessions.ts +176 -60
  72. package/src/resources/beta/realtime/transcription-sessions.ts +308 -0
  73. package/src/resources/chat/completions/completions.ts +1 -1
  74. package/src/resources/responses/responses.ts +3 -3
  75. package/src/resources/shared.ts +22 -5
  76. package/src/resources.ts +1 -0
  77. package/src/version.ts +1 -1
  78. package/version.d.ts +1 -1
  79. package/version.js +1 -1
  80. package/version.mjs +1 -1
@@ -10,9 +10,17 @@ import {
10
10
  SessionCreateResponse,
11
11
  Sessions,
12
12
  } from './sessions';
13
+ import * as TranscriptionSessionsAPI from './transcription-sessions';
14
+ import {
15
+ TranscriptionSession,
16
+ TranscriptionSessionCreateParams,
17
+ TranscriptionSessions,
18
+ } from './transcription-sessions';
13
19
 
14
20
  export class Realtime extends APIResource {
15
21
  sessions: SessionsAPI.Sessions = new SessionsAPI.Sessions(this._client);
22
+ transcriptionSessions: TranscriptionSessionsAPI.TranscriptionSessions =
23
+ new TranscriptionSessionsAPI.TranscriptionSessions(this._client);
16
24
  }
17
25
 
18
26
  /**
@@ -300,6 +308,91 @@ export interface ConversationItemInputAudioTranscriptionCompletedEvent {
300
308
  * The event type, must be `conversation.item.input_audio_transcription.completed`.
301
309
  */
302
310
  type: 'conversation.item.input_audio_transcription.completed';
311
+
312
+ /**
313
+ * The log probabilities of the transcription.
314
+ */
315
+ logprobs?: Array<ConversationItemInputAudioTranscriptionCompletedEvent.Logprob> | null;
316
+ }
317
+
318
+ export namespace ConversationItemInputAudioTranscriptionCompletedEvent {
319
+ /**
320
+ * A log probability object.
321
+ */
322
+ export interface Logprob {
323
+ /**
324
+ * The token that was used to generate the log probability.
325
+ */
326
+ token: string;
327
+
328
+ /**
329
+ * The bytes that were used to generate the log probability.
330
+ */
331
+ bytes: Array<number>;
332
+
333
+ /**
334
+ * The log probability of the token.
335
+ */
336
+ logprob: number;
337
+ }
338
+ }
339
+
340
+ /**
341
+ * Returned when the text value of an input audio transcription content part is
342
+ * updated.
343
+ */
344
+ export interface ConversationItemInputAudioTranscriptionDeltaEvent {
345
+ /**
346
+ * The unique ID of the server event.
347
+ */
348
+ event_id: string;
349
+
350
+ /**
351
+ * The ID of the item.
352
+ */
353
+ item_id: string;
354
+
355
+ /**
356
+ * The event type, must be `conversation.item.input_audio_transcription.delta`.
357
+ */
358
+ type: 'conversation.item.input_audio_transcription.delta';
359
+
360
+ /**
361
+ * The index of the content part in the item's content array.
362
+ */
363
+ content_index?: number;
364
+
365
+ /**
366
+ * The text delta.
367
+ */
368
+ delta?: string;
369
+
370
+ /**
371
+ * The log probabilities of the transcription.
372
+ */
373
+ logprobs?: Array<ConversationItemInputAudioTranscriptionDeltaEvent.Logprob> | null;
374
+ }
375
+
376
+ export namespace ConversationItemInputAudioTranscriptionDeltaEvent {
377
+ /**
378
+ * A log probability object.
379
+ */
380
+ export interface Logprob {
381
+ /**
382
+ * The token that was used to generate the log probability.
383
+ */
384
+ token: string;
385
+
386
+ /**
387
+ * The bytes that were used to generate the log probability.
388
+ */
389
+ bytes: Array<number>;
390
+
391
+ /**
392
+ * The log probability of the token.
393
+ */
394
+ logprob: number;
395
+ }
303
396
  }
304
397
 
305
398
  /**
@@ -361,6 +454,30 @@ export namespace ConversationItemInputAudioTranscriptionFailedEvent {
361
454
  }
362
455
  }
363
456
 
457
+ /**
458
+ * Send this event when you want to retrieve the server's representation of a
459
+ * specific item in the conversation history. This is useful, for example, to
460
+ * inspect user audio after noise cancellation and VAD. The server will respond
461
+ * with a `conversation.item.retrieved` event, unless the item does not exist in
462
+ * the conversation history, in which case the server will respond with an error.
463
+ */
464
+ export interface ConversationItemRetrieveEvent {
465
+ /**
466
+ * The ID of the item to retrieve.
467
+ */
468
+ item_id: string;
469
+
470
+ /**
471
+ * The event type, must be `conversation.item.retrieve`.
472
+ */
473
+ type: 'conversation.item.retrieve';
474
+
475
+ /**
476
+ * Optional client-generated ID used to identify this event.
477
+ */
478
+ event_id?: string;
479
+ }
480
+
364
481
  /**
365
482
  * Send this event to truncate a previous assistant message’s audio. The server
366
483
  * will produce audio faster than realtime, so this event is useful when the user
@@ -789,18 +906,20 @@ export namespace RateLimitsUpdatedEvent {
789
906
  }
790
907
 
791
908
  /**
792
- * All events that the client can send to the Realtime API
909
+ * A realtime client event.
793
910
  */
794
911
  export type RealtimeClientEvent =
795
- | SessionUpdateEvent
796
- | InputAudioBufferAppendEvent
797
- | InputAudioBufferCommitEvent
798
- | InputAudioBufferClearEvent
799
912
  | ConversationItemCreateEvent
800
- | ConversationItemTruncateEvent
801
913
  | ConversationItemDeleteEvent
914
+ | ConversationItemRetrieveEvent
915
+ | ConversationItemTruncateEvent
916
+ | InputAudioBufferAppendEvent
917
+ | InputAudioBufferClearEvent
918
+ | InputAudioBufferCommitEvent
919
+ | ResponseCancelEvent
802
920
  | ResponseCreateEvent
803
- | ResponseCancelEvent;
921
+ | SessionUpdateEvent
922
+ | TranscriptionSessionUpdate;
804
923
 
805
924
  /**
806
925
  * The response resource.
@@ -1009,37 +1128,63 @@ export namespace RealtimeResponseUsage {
1009
1128
  }
1010
1129
 
1011
1130
  /**
1012
- * All events that the Realtime API can send back
1131
+ * A realtime server event.
1013
1132
  */
1014
1133
  export type RealtimeServerEvent =
1015
- | ErrorEvent
1016
- | SessionCreatedEvent
1017
- | SessionUpdatedEvent
1018
1134
  | ConversationCreatedEvent
1019
- | InputAudioBufferCommittedEvent
1020
- | InputAudioBufferClearedEvent
1021
- | InputAudioBufferSpeechStartedEvent
1022
- | InputAudioBufferSpeechStoppedEvent
1023
1135
  | ConversationItemCreatedEvent
1136
+ | ConversationItemDeletedEvent
1024
1137
  | ConversationItemInputAudioTranscriptionCompletedEvent
1138
+ | ConversationItemInputAudioTranscriptionDeltaEvent
1025
1139
  | ConversationItemInputAudioTranscriptionFailedEvent
1140
+ | RealtimeServerEvent.ConversationItemRetrieved
1026
1141
  | ConversationItemTruncatedEvent
1027
- | ConversationItemDeletedEvent
1142
+ | ErrorEvent
1143
+ | InputAudioBufferClearedEvent
1144
+ | InputAudioBufferCommittedEvent
1145
+ | InputAudioBufferSpeechStartedEvent
1146
+ | InputAudioBufferSpeechStoppedEvent
1147
+ | RateLimitsUpdatedEvent
1148
+ | ResponseAudioDeltaEvent
1149
+ | ResponseAudioDoneEvent
1150
+ | ResponseAudioTranscriptDeltaEvent
1151
+ | ResponseAudioTranscriptDoneEvent
1152
+ | ResponseContentPartAddedEvent
1153
+ | ResponseContentPartDoneEvent
1028
1154
  | ResponseCreatedEvent
1029
1155
  | ResponseDoneEvent
1156
+ | ResponseFunctionCallArgumentsDeltaEvent
1157
+ | ResponseFunctionCallArgumentsDoneEvent
1030
1158
  | ResponseOutputItemAddedEvent
1031
1159
  | ResponseOutputItemDoneEvent
1032
- | ResponseContentPartAddedEvent
1033
- | ResponseContentPartDoneEvent
1034
1160
  | ResponseTextDeltaEvent
1035
1161
  | ResponseTextDoneEvent
1036
- | ResponseAudioTranscriptDeltaEvent
1037
- | ResponseAudioTranscriptDoneEvent
1038
- | ResponseAudioDeltaEvent
1039
- | ResponseAudioDoneEvent
1040
- | ResponseFunctionCallArgumentsDeltaEvent
1041
- | ResponseFunctionCallArgumentsDoneEvent
1042
- | RateLimitsUpdatedEvent;
1162
+ | SessionCreatedEvent
1163
+ | SessionUpdatedEvent
1164
+ | TranscriptionSessionUpdatedEvent;
1165
+
1166
+ export namespace RealtimeServerEvent {
1167
+ /**
1168
+ * Returned when a conversation item is retrieved with
1169
+ * `conversation.item.retrieve`.
1170
+ */
1171
+ export interface ConversationItemRetrieved {
1172
+ /**
1173
+ * The unique ID of the server event.
1174
+ */
1175
+ event_id: string;
1176
+
1177
+ /**
1178
+ * The item that was retrieved from the conversation.
1179
+ */
1180
+ item: RealtimeAPI.ConversationItem;
1181
+
1182
+ /**
1183
+ * The event type, must be `conversation.item.retrieved`.
1184
+ */
1185
+ type: 'conversation.item.retrieved';
1186
+ }
1187
+ }
1043
1188
 
1044
1189
  /**
1045
1190
  * Returned when the model-generated audio is updated.
@@ -1834,15 +1979,24 @@ export namespace SessionUpdateEvent {
1834
1979
  */
1835
1980
  input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
1836
1981
 
1982
+ /**
1983
+ * Configuration for input audio noise reduction. This can be set to `null` to turn
1984
+ * off. Noise reduction filters audio added to the input audio buffer before it is
1985
+ * sent to VAD and the model. Filtering the audio can improve VAD and turn
1986
+ * detection accuracy (reducing false positives) and model performance by improving
1987
+ * perception of the input audio.
1988
+ */
1989
+ input_audio_noise_reduction?: Session.InputAudioNoiseReduction;
1990
+
1837
1991
  /**
1838
1992
  * Configuration for input audio transcription, defaults to off and can be set to
1839
1993
  * `null` to turn off once on. Input audio transcription is not native to the
1840
1994
  * model, since the model consumes audio directly. Transcription runs
1841
1995
  * asynchronously through
1842
- * [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
1843
- * and should be treated as rough guidance rather than the representation
1844
- * understood by the model. The client can optionally set the language and prompt
1845
- * for transcription, these fields will be passed to the Whisper API.
1996
+ * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
1997
+ * and should be treated as guidance of input audio content rather than precisely
1998
+ * what the model heard. The client can optionally set the language and prompt for
1999
+ * transcription, these offer additional guidance to the transcription service.
1846
2000
  */
1847
2001
  input_audio_transcription?: Session.InputAudioTranscription;
1848
2002
 
@@ -1891,7 +2045,8 @@ export namespace SessionUpdateEvent {
1891
2045
  output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
1892
2046
 
1893
2047
  /**
1894
- * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
2048
+ * Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a
2049
+ * temperature of 0.8 is highly recommended for best performance.
1895
2050
  */
1896
2051
  temperature?: number;
1897
2052
 
@@ -1907,9 +2062,16 @@ export namespace SessionUpdateEvent {
1907
2062
  tools?: Array<Session.Tool>;
1908
2063
 
1909
2064
  /**
1910
- * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
1911
- * means that the model will detect the start and end of speech based on audio
1912
- * volume and respond at the end of user speech.
2065
+ * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
2066
+ * set to `null` to turn off, in which case the client must manually trigger model
2067
+ * response. Server VAD means that the model will detect the start and end of
2068
+ * speech based on audio volume and respond at the end of user speech. Semantic VAD
2069
+ * is more advanced and uses a turn detection model (in conjunction with VAD) to
2070
+ * semantically estimate whether the user has finished speaking, then dynamically
2071
+ * sets a timeout based on this probability. For example, if user audio trails off
2072
+ * with "uhhm", the model will score a low probability of turn end and wait longer
2073
+ * for the user to continue speaking. This can be useful for more natural
2074
+ * conversations, but may have a higher latency.
1913
2075
  */
1914
2076
  turn_detection?: Session.TurnDetection;
1915
2077
 
@@ -1922,15 +2084,31 @@ export namespace SessionUpdateEvent {
1922
2084
  }
1923
2085
 
1924
2086
  export namespace Session {
2087
+ /**
2088
+ * Configuration for input audio noise reduction. This can be set to `null` to turn
2089
+ * off. Noise reduction filters audio added to the input audio buffer before it is
2090
+ * sent to VAD and the model. Filtering the audio can improve VAD and turn
2091
+ * detection accuracy (reducing false positives) and model performance by improving
2092
+ * perception of the input audio.
2093
+ */
2094
+ export interface InputAudioNoiseReduction {
2095
+ /**
2096
+ * Type of noise reduction. `near_field` is for close-talking microphones such as
2097
+ * headphones, `far_field` is for far-field microphones such as laptop or
2098
+ * conference room microphones.
2099
+ */
2100
+ type?: 'near_field' | 'far_field';
2101
+ }
2102
+
1925
2103
  /**
1926
2104
  * Configuration for input audio transcription, defaults to off and can be set to
1927
2105
  * `null` to turn off once on. Input audio transcription is not native to the
1928
2106
  * model, since the model consumes audio directly. Transcription runs
1929
2107
  * asynchronously through
1930
- * [OpenAI Whisper transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
1931
- * and should be treated as rough guidance rather than the representation
1932
- * understood by the model. The client can optionally set the language and prompt
1933
- * for transcription, these fields will be passed to the Whisper API.
2108
+ * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
2109
+ * and should be treated as guidance of input audio content rather than precisely
2110
+ * what the model heard. The client can optionally set the language and prompt for
2111
+ * transcription, these offer additional guidance to the transcription service.
1934
2112
  */
1935
2113
  export interface InputAudioTranscription {
1936
2114
  /**
@@ -1941,16 +2119,17 @@ export namespace SessionUpdateEvent {
1941
2119
  language?: string;
1942
2120
 
1943
2121
  /**
1944
- * The model to use for transcription, `whisper-1` is the only currently supported
1945
- * model.
2122
+ * The model to use for transcription, current options are `gpt-4o-transcribe`,
2123
+ * `gpt-4o-mini-transcribe`, and `whisper-1`.
1946
2124
  */
1947
2125
  model?: string;
1948
2126
 
1949
2127
  /**
1950
2128
  * An optional text to guide the model's style or continue a previous audio
1951
- * segment. The
1952
- * [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
1953
- * should match the audio language.
2129
+ * segment. For `whisper-1`, the
2130
+ * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
2131
+ * For `gpt-4o-transcribe` models, the prompt is a free text string, for example
2132
+ * "expect words related to technology".
1954
2133
  */
1955
2134
  prompt?: string;
1956
2135
  }
@@ -1979,48 +2158,62 @@ export namespace SessionUpdateEvent {
1979
2158
  }
1980
2159
 
1981
2160
  /**
1982
- * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
1983
- * means that the model will detect the start and end of speech based on audio
1984
- * volume and respond at the end of user speech.
2161
+ * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
2162
+ * set to `null` to turn off, in which case the client must manually trigger model
2163
+ * response. Server VAD means that the model will detect the start and end of
2164
+ * speech based on audio volume and respond at the end of user speech. Semantic VAD
2165
+ * is more advanced and uses a turn detection model (in conjunction with VAD) to
2166
+ * semantically estimate whether the user has finished speaking, then dynamically
2167
+ * sets a timeout based on this probability. For example, if user audio trails off
2168
+ * with "uhhm", the model will score a low probability of turn end and wait longer
2169
+ * for the user to continue speaking. This can be useful for more natural
2170
+ * conversations, but may have a higher latency.
1985
2171
  */
1986
2172
  export interface TurnDetection {
1987
2173
  /**
1988
2174
  * Whether or not to automatically generate a response when a VAD stop event
1989
- * occurs. `true` by default.
2175
+ * occurs.
1990
2176
  */
1991
2177
  create_response?: boolean;
1992
2178
 
2179
+ /**
2180
+ * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
2181
+ * will wait longer for the user to continue speaking, `high` will respond more
2182
+ * quickly. `auto` is the default and is equivalent to `medium`.
2183
+ */
2184
+ eagerness?: 'low' | 'medium' | 'high' | 'auto';
2185
+
1993
2186
  /**
1994
2187
  * Whether or not to automatically interrupt any ongoing response with output to
1995
2188
  * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
1996
- * occurs. `true` by default.
2189
+ * occurs.
1997
2190
  */
1998
2191
  interrupt_response?: boolean;
1999
2192
 
2000
2193
  /**
2001
- * Amount of audio to include before the VAD detected speech (in milliseconds).
2002
- * Defaults to 300ms.
2194
+ * Used only for `server_vad` mode. Amount of audio to include before the VAD
2195
+ * detected speech (in milliseconds). Defaults to 300ms.
2003
2196
  */
2004
2197
  prefix_padding_ms?: number;
2005
2198
 
2006
2199
  /**
2007
- * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
2008
- * With shorter values the model will respond more quickly, but may jump in on
2009
- * short pauses from the user.
2200
+ * Used only for `server_vad` mode. Duration of silence to detect speech stop (in
2201
+ * milliseconds). Defaults to 500ms. With shorter values the model will respond
2202
+ * more quickly, but may jump in on short pauses from the user.
2010
2203
  */
2011
2204
  silence_duration_ms?: number;
2012
2205
 
2013
2206
  /**
2014
- * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
2015
- * threshold will require louder audio to activate the model, and thus might
2016
- * perform better in noisy environments.
2207
+ * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
2208
+ * defaults to 0.5. A higher threshold will require louder audio to activate the
2209
+ * model, and thus might perform better in noisy environments.
2017
2210
  */
2018
2211
  threshold?: number;
2019
2212
 
2020
2213
  /**
2021
- * Type of turn detection, only `server_vad` is currently supported.
2214
+ * Type of turn detection.
2022
2215
  */
2023
- type?: string;
2216
+ type?: 'server_vad' | 'semantic_vad';
2024
2217
  }
2025
2218
  }
2026
2219
  }
@@ -2046,7 +2239,216 @@ export interface SessionUpdatedEvent {
2046
2239
  type: 'session.updated';
2047
2240
  }
2048
2241
 
2242
+ /**
2243
+ * Send this event to update a transcription session.
2244
+ */
2245
+ export interface TranscriptionSessionUpdate {
2246
+ /**
2247
+ * Realtime transcription session object configuration.
2248
+ */
2249
+ session: TranscriptionSessionUpdate.Session;
2250
+
2251
+ /**
2252
+ * The event type, must be `transcription_session.update`.
2253
+ */
2254
+ type: 'transcription_session.update';
2255
+
2256
+ /**
2257
+ * Optional client-generated ID used to identify this event.
2258
+ */
2259
+ event_id?: string;
2260
+ }
2261
+
2262
+ export namespace TranscriptionSessionUpdate {
2263
+ /**
2264
+ * Realtime transcription session object configuration.
2265
+ */
2266
+ export interface Session {
2267
+ /**
2268
+ * The set of items to include in the transcription. Current available items are:
2269
+ *
2270
+ * - `item.input_audio_transcription.logprobs`
2271
+ */
2272
+ include?: Array<string>;
2273
+
2274
+ /**
2275
+ * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
2276
+ * `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
2277
+ * (mono), and little-endian byte order.
2278
+ */
2279
+ input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
2280
+
2281
+ /**
2282
+ * Configuration for input audio noise reduction. This can be set to `null` to turn
2283
+ * off. Noise reduction filters audio added to the input audio buffer before it is
2284
+ * sent to VAD and the model. Filtering the audio can improve VAD and turn
2285
+ * detection accuracy (reducing false positives) and model performance by improving
2286
+ * perception of the input audio.
2287
+ */
2288
+ input_audio_noise_reduction?: Session.InputAudioNoiseReduction;
2289
+
2290
+ /**
2291
+ * Configuration for input audio transcription. The client can optionally set the
2292
+ * language and prompt for transcription, these offer additional guidance to the
2293
+ * transcription service.
2294
+ */
2295
+ input_audio_transcription?: Session.InputAudioTranscription;
2296
+
2297
+ /**
2298
+ * The set of modalities the model can respond with. To disable audio, set this to
2299
+ * ["text"].
2300
+ */
2301
+ modalities?: Array<'text' | 'audio'>;
2302
+
2303
+ /**
2304
+ * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
2305
+ * set to `null` to turn off, in which case the client must manually trigger model
2306
+ * response. Server VAD means that the model will detect the start and end of
2307
+ * speech based on audio volume and respond at the end of user speech. Semantic VAD
2308
+ * is more advanced and uses a turn detection model (in conjunction with VAD) to
2309
+ * semantically estimate whether the user has finished speaking, then dynamically
2310
+ * sets a timeout based on this probability. For example, if user audio trails off
2311
+ * with "uhhm", the model will score a low probability of turn end and wait longer
2312
+ * for the user to continue speaking. This can be useful for more natural
2313
+ * conversations, but may have a higher latency.
2314
+ */
2315
+ turn_detection?: Session.TurnDetection;
2316
+ }
2317
+
2318
+ export namespace Session {
2319
+ /**
2320
+ * Configuration for input audio noise reduction. This can be set to `null` to turn
2321
+ * off. Noise reduction filters audio added to the input audio buffer before it is
2322
+ * sent to VAD and the model. Filtering the audio can improve VAD and turn
2323
+ * detection accuracy (reducing false positives) and model performance by improving
2324
+ * perception of the input audio.
2325
+ */
2326
+ export interface InputAudioNoiseReduction {
2327
+ /**
2328
+ * Type of noise reduction. `near_field` is for close-talking microphones such as
2329
+ * headphones, `far_field` is for far-field microphones such as laptop or
2330
+ * conference room microphones.
2331
+ */
2332
+ type?: 'near_field' | 'far_field';
2333
+ }
2334
+
2335
+ /**
2336
+ * Configuration for input audio transcription. The client can optionally set the
2337
+ * language and prompt for transcription, these offer additional guidance to the
2338
+ * transcription service.
2339
+ */
2340
+ export interface InputAudioTranscription {
2341
+ /**
2342
+ * The language of the input audio. Supplying the input language in
2343
+ * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
2344
+ * format will improve accuracy and latency.
2345
+ */
2346
+ language?: string;
2347
+
2348
+ /**
2349
+ * The model to use for transcription, current options are `gpt-4o-transcribe`,
2350
+ * `gpt-4o-mini-transcribe`, and `whisper-1`.
2351
+ */
2352
+ model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1';
2353
+
2354
+ /**
2355
+ * An optional text to guide the model's style or continue a previous audio
2356
+ * segment. For `whisper-1`, the
2357
+ * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
2358
+ * For `gpt-4o-transcribe` models, the prompt is a free text string, for example
2359
+ * "expect words related to technology".
2360
+ */
2361
+ prompt?: string;
2362
+ }
2363
+
2364
+ /**
2365
+ * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
2366
+ * set to `null` to turn off, in which case the client must manually trigger model
2367
+ * response. Server VAD means that the model will detect the start and end of
2368
+ * speech based on audio volume and respond at the end of user speech. Semantic VAD
2369
+ * is more advanced and uses a turn detection model (in conjunction with VAD) to
2370
+ * semantically estimate whether the user has finished speaking, then dynamically
2371
+ * sets a timeout based on this probability. For example, if user audio trails off
2372
+ * with "uhhm", the model will score a low probability of turn end and wait longer
2373
+ * for the user to continue speaking. This can be useful for more natural
2374
+ * conversations, but may have a higher latency.
2375
+ */
2376
+ export interface TurnDetection {
2377
+ /**
2378
+ * Whether or not to automatically generate a response when a VAD stop event
2379
+ * occurs.
2380
+ */
2381
+ create_response?: boolean;
2382
+
2383
+ /**
2384
+ * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
2385
+ * will wait longer for the user to continue speaking, `high` will respond more
2386
+ * quickly. `auto` is the default and is equivalent to `medium`.
2387
+ */
2388
+ eagerness?: 'low' | 'medium' | 'high' | 'auto';
2389
+
2390
+ /**
2391
+ * Whether or not to automatically interrupt any ongoing response with output to
2392
+ * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
2393
+ * occurs.
2394
+ */
2395
+ interrupt_response?: boolean;
2396
+
2397
+ /**
2398
+ * Used only for `server_vad` mode. Amount of audio to include before the VAD
2399
+ * detected speech (in milliseconds). Defaults to 300ms.
2400
+ */
2401
+ prefix_padding_ms?: number;
2402
+
2403
+ /**
2404
+ * Used only for `server_vad` mode. Duration of silence to detect speech stop (in
2405
+ * milliseconds). Defaults to 500ms. With shorter values the model will respond
2406
+ * more quickly, but may jump in on short pauses from the user.
2407
+ */
2408
+ silence_duration_ms?: number;
2409
+
2410
+ /**
2411
+ * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
2412
+ * defaults to 0.5. A higher threshold will require louder audio to activate the
2413
+ * model, and thus might perform better in noisy environments.
2414
+ */
2415
+ threshold?: number;
2416
+
2417
+ /**
2418
+ * Type of turn detection.
2419
+ */
2420
+ type?: 'server_vad' | 'semantic_vad';
2421
+ }
2422
+ }
2423
+ }
2424
+
2425
+ /**
2426
+ * Returned when a transcription session is updated with a
2427
+ * `transcription_session.update` event, unless there is an error.
2428
+ */
2429
+ export interface TranscriptionSessionUpdatedEvent {
2430
+ /**
2431
+ * The unique ID of the server event.
2432
+ */
2433
+ event_id: string;
2434
+
2435
+ /**
2436
+ * A new Realtime transcription session configuration.
2437
+ *
2438
+ * When a session is created on the server via REST API, the session object also
2439
+ * contains an ephemeral key. Default TTL for keys is one minute. This property is
2440
+ * not present when a session is updated via the WebSocket API.
2441
+ */
2442
+ session: TranscriptionSessionsAPI.TranscriptionSession;
2443
+
2444
+ /**
2445
+ * The event type, must be `transcription_session.updated`.
2446
+ */
2447
+ type: 'transcription_session.updated';
2448
+ }
2449
+
2049
2450
  Realtime.Sessions = Sessions;
2451
+ Realtime.TranscriptionSessions = TranscriptionSessions;
2050
2452
 
2051
2453
  export declare namespace Realtime {
2052
2454
  export {
@@ -2055,4 +2457,10 @@ export declare namespace Realtime {
2055
2457
  type SessionCreateResponse as SessionCreateResponse,
2056
2458
  type SessionCreateParams as SessionCreateParams,
2057
2459
  };
2460
+
2461
+ export {
2462
+ TranscriptionSessions as TranscriptionSessions,
2463
+ type TranscriptionSession as TranscriptionSession,
2464
+ type TranscriptionSessionCreateParams as TranscriptionSessionCreateParams,
2465
+ };
2058
2466
  }