openai 0.22.1 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/README.md +1 -1
  4. data/lib/openai/models/realtime/audio_transcription.rb +60 -0
  5. data/lib/openai/models/realtime/client_secret_create_params.rb +18 -9
  6. data/lib/openai/models/realtime/client_secret_create_response.rb +11 -250
  7. data/lib/openai/models/realtime/conversation_item.rb +1 -1
  8. data/lib/openai/models/realtime/conversation_item_added.rb +14 -1
  9. data/lib/openai/models/realtime/conversation_item_done.rb +3 -0
  10. data/lib/openai/models/realtime/conversation_item_input_audio_transcription_completed_event.rb +10 -8
  11. data/lib/openai/models/realtime/conversation_item_input_audio_transcription_delta_event.rb +14 -5
  12. data/lib/openai/models/realtime/conversation_item_truncate_event.rb +2 -2
  13. data/lib/openai/models/realtime/input_audio_buffer_append_event.rb +10 -5
  14. data/lib/openai/models/realtime/models.rb +58 -0
  15. data/lib/openai/models/realtime/noise_reduction_type.rb +20 -0
  16. data/lib/openai/models/realtime/realtime_audio_config.rb +6 -427
  17. data/lib/openai/models/realtime/realtime_audio_config_input.rb +89 -0
  18. data/lib/openai/models/realtime/realtime_audio_config_output.rb +100 -0
  19. data/lib/openai/models/realtime/realtime_audio_formats.rb +121 -0
  20. data/lib/openai/models/realtime/realtime_audio_input_turn_detection.rb +131 -0
  21. data/lib/openai/models/realtime/realtime_client_event.rb +31 -23
  22. data/lib/openai/models/realtime/realtime_conversation_item_assistant_message.rb +43 -10
  23. data/lib/openai/models/realtime/realtime_conversation_item_function_call.rb +16 -7
  24. data/lib/openai/models/realtime/realtime_conversation_item_function_call_output.rb +15 -7
  25. data/lib/openai/models/realtime/realtime_conversation_item_system_message.rb +18 -6
  26. data/lib/openai/models/realtime/realtime_conversation_item_user_message.rb +62 -13
  27. data/lib/openai/models/realtime/realtime_response.rb +117 -107
  28. data/lib/openai/models/realtime/realtime_response_create_audio_output.rb +100 -0
  29. data/lib/openai/models/realtime/realtime_response_create_mcp_tool.rb +310 -0
  30. data/lib/openai/models/realtime/realtime_response_create_params.rb +225 -0
  31. data/lib/openai/models/realtime/realtime_response_status.rb +1 -1
  32. data/lib/openai/models/realtime/realtime_response_usage.rb +5 -2
  33. data/lib/openai/models/realtime/realtime_response_usage_input_token_details.rb +58 -8
  34. data/lib/openai/models/realtime/realtime_server_event.rb +21 -5
  35. data/lib/openai/models/realtime/realtime_session.rb +9 -125
  36. data/lib/openai/models/realtime/realtime_session_client_secret.rb +36 -0
  37. data/lib/openai/models/realtime/realtime_session_create_request.rb +50 -71
  38. data/lib/openai/models/realtime/realtime_session_create_response.rb +621 -219
  39. data/lib/openai/models/realtime/realtime_tools_config_union.rb +2 -53
  40. data/lib/openai/models/realtime/realtime_tracing_config.rb +7 -6
  41. data/lib/openai/models/realtime/realtime_transcription_session_audio.rb +19 -0
  42. data/lib/openai/models/realtime/realtime_transcription_session_audio_input.rb +90 -0
  43. data/lib/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rb +131 -0
  44. data/lib/openai/models/realtime/realtime_transcription_session_client_secret.rb +38 -0
  45. data/lib/openai/models/realtime/realtime_transcription_session_create_request.rb +12 -270
  46. data/lib/openai/models/realtime/realtime_transcription_session_create_response.rb +78 -0
  47. data/lib/openai/models/realtime/realtime_transcription_session_input_audio_transcription.rb +66 -0
  48. data/lib/openai/models/realtime/realtime_transcription_session_turn_detection.rb +57 -0
  49. data/lib/openai/models/realtime/realtime_truncation.rb +8 -40
  50. data/lib/openai/models/realtime/realtime_truncation_retention_ratio.rb +34 -0
  51. data/lib/openai/models/realtime/response_cancel_event.rb +3 -1
  52. data/lib/openai/models/realtime/response_create_event.rb +18 -348
  53. data/lib/openai/models/realtime/response_done_event.rb +7 -0
  54. data/lib/openai/models/realtime/session_created_event.rb +20 -4
  55. data/lib/openai/models/realtime/session_update_event.rb +36 -12
  56. data/lib/openai/models/realtime/session_updated_event.rb +20 -4
  57. data/lib/openai/models/realtime/transcription_session_created.rb +8 -243
  58. data/lib/openai/models/realtime/transcription_session_update.rb +179 -3
  59. data/lib/openai/models/realtime/transcription_session_updated_event.rb +8 -243
  60. data/lib/openai/resources/realtime/client_secrets.rb +2 -3
  61. data/lib/openai/version.rb +1 -1
  62. data/lib/openai.rb +19 -1
  63. data/rbi/openai/models/realtime/audio_transcription.rbi +132 -0
  64. data/rbi/openai/models/realtime/client_secret_create_params.rbi +25 -11
  65. data/rbi/openai/models/realtime/client_secret_create_response.rbi +2 -587
  66. data/rbi/openai/models/realtime/conversation_item_added.rbi +14 -1
  67. data/rbi/openai/models/realtime/conversation_item_done.rbi +3 -0
  68. data/rbi/openai/models/realtime/conversation_item_input_audio_transcription_completed_event.rbi +11 -8
  69. data/rbi/openai/models/realtime/conversation_item_input_audio_transcription_delta_event.rbi +15 -5
  70. data/rbi/openai/models/realtime/conversation_item_truncate_event.rbi +2 -2
  71. data/rbi/openai/models/realtime/input_audio_buffer_append_event.rbi +10 -5
  72. data/rbi/openai/models/realtime/models.rbi +97 -0
  73. data/rbi/openai/models/realtime/noise_reduction_type.rbi +31 -0
  74. data/rbi/openai/models/realtime/realtime_audio_config.rbi +8 -956
  75. data/rbi/openai/models/realtime/realtime_audio_config_input.rbi +221 -0
  76. data/rbi/openai/models/realtime/realtime_audio_config_output.rbi +222 -0
  77. data/rbi/openai/models/realtime/realtime_audio_formats.rbi +329 -0
  78. data/rbi/openai/models/realtime/realtime_audio_input_turn_detection.rbi +262 -0
  79. data/rbi/openai/models/realtime/realtime_conversation_item_assistant_message.rbi +51 -10
  80. data/rbi/openai/models/realtime/realtime_conversation_item_function_call.rbi +16 -7
  81. data/rbi/openai/models/realtime/realtime_conversation_item_function_call_output.rbi +14 -7
  82. data/rbi/openai/models/realtime/realtime_conversation_item_system_message.rbi +16 -6
  83. data/rbi/openai/models/realtime/realtime_conversation_item_user_message.rbi +110 -12
  84. data/rbi/openai/models/realtime/realtime_response.rbi +287 -212
  85. data/rbi/openai/models/realtime/realtime_response_create_audio_output.rbi +250 -0
  86. data/rbi/openai/models/realtime/realtime_response_create_mcp_tool.rbi +616 -0
  87. data/rbi/openai/models/realtime/realtime_response_create_params.rbi +529 -0
  88. data/rbi/openai/models/realtime/realtime_response_usage.rbi +8 -2
  89. data/rbi/openai/models/realtime/realtime_response_usage_input_token_details.rbi +106 -7
  90. data/rbi/openai/models/realtime/realtime_server_event.rbi +4 -1
  91. data/rbi/openai/models/realtime/realtime_session.rbi +12 -262
  92. data/rbi/openai/models/realtime/realtime_session_client_secret.rbi +49 -0
  93. data/rbi/openai/models/realtime/realtime_session_create_request.rbi +112 -133
  94. data/rbi/openai/models/realtime/realtime_session_create_response.rbi +1229 -405
  95. data/rbi/openai/models/realtime/realtime_tools_config_union.rbi +1 -117
  96. data/rbi/openai/models/realtime/realtime_tracing_config.rbi +11 -10
  97. data/rbi/openai/models/realtime/realtime_transcription_session_audio.rbi +50 -0
  98. data/rbi/openai/models/realtime/realtime_transcription_session_audio_input.rbi +226 -0
  99. data/rbi/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbi +259 -0
  100. data/rbi/openai/models/realtime/realtime_transcription_session_client_secret.rbi +51 -0
  101. data/rbi/openai/models/realtime/realtime_transcription_session_create_request.rbi +25 -597
  102. data/rbi/openai/models/realtime/realtime_transcription_session_create_response.rbi +195 -0
  103. data/rbi/openai/models/realtime/realtime_transcription_session_input_audio_transcription.rbi +144 -0
  104. data/rbi/openai/models/realtime/realtime_transcription_session_turn_detection.rbi +94 -0
  105. data/rbi/openai/models/realtime/realtime_truncation.rbi +5 -56
  106. data/rbi/openai/models/realtime/realtime_truncation_retention_ratio.rbi +45 -0
  107. data/rbi/openai/models/realtime/response_cancel_event.rbi +3 -1
  108. data/rbi/openai/models/realtime/response_create_event.rbi +19 -786
  109. data/rbi/openai/models/realtime/response_done_event.rbi +7 -0
  110. data/rbi/openai/models/realtime/session_created_event.rbi +42 -9
  111. data/rbi/openai/models/realtime/session_update_event.rbi +57 -19
  112. data/rbi/openai/models/realtime/session_updated_event.rbi +42 -9
  113. data/rbi/openai/models/realtime/transcription_session_created.rbi +17 -591
  114. data/rbi/openai/models/realtime/transcription_session_update.rbi +425 -7
  115. data/rbi/openai/models/realtime/transcription_session_updated_event.rbi +14 -591
  116. data/rbi/openai/resources/realtime/client_secrets.rbi +5 -3
  117. data/sig/openai/models/realtime/audio_transcription.rbs +57 -0
  118. data/sig/openai/models/realtime/client_secret_create_response.rbs +1 -251
  119. data/sig/openai/models/realtime/models.rbs +57 -0
  120. data/sig/openai/models/realtime/noise_reduction_type.rbs +16 -0
  121. data/sig/openai/models/realtime/realtime_audio_config.rbs +12 -331
  122. data/sig/openai/models/realtime/realtime_audio_config_input.rbs +72 -0
  123. data/sig/openai/models/realtime/realtime_audio_config_output.rbs +72 -0
  124. data/sig/openai/models/realtime/realtime_audio_formats.rbs +128 -0
  125. data/sig/openai/models/realtime/realtime_audio_input_turn_detection.rbs +99 -0
  126. data/sig/openai/models/realtime/realtime_conversation_item_assistant_message.rbs +17 -2
  127. data/sig/openai/models/realtime/realtime_conversation_item_user_message.rbs +30 -1
  128. data/sig/openai/models/realtime/realtime_response.rbs +103 -82
  129. data/sig/openai/models/realtime/realtime_response_create_audio_output.rbs +84 -0
  130. data/sig/openai/models/realtime/realtime_response_create_mcp_tool.rbs +218 -0
  131. data/sig/openai/models/realtime/realtime_response_create_params.rbs +148 -0
  132. data/sig/openai/models/realtime/realtime_response_usage_input_token_details.rbs +50 -1
  133. data/sig/openai/models/realtime/realtime_session.rbs +16 -106
  134. data/sig/openai/models/realtime/realtime_session_client_secret.rbs +20 -0
  135. data/sig/openai/models/realtime/realtime_session_create_request.rbs +27 -43
  136. data/sig/openai/models/realtime/realtime_session_create_response.rbs +389 -187
  137. data/sig/openai/models/realtime/realtime_tools_config_union.rbs +1 -53
  138. data/sig/openai/models/realtime/realtime_transcription_session_audio.rbs +24 -0
  139. data/sig/openai/models/realtime/realtime_transcription_session_audio_input.rbs +72 -0
  140. data/sig/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbs +99 -0
  141. data/sig/openai/models/realtime/realtime_transcription_session_client_secret.rbs +20 -0
  142. data/sig/openai/models/realtime/realtime_transcription_session_create_request.rbs +11 -203
  143. data/sig/openai/models/realtime/realtime_transcription_session_create_response.rbs +69 -0
  144. data/sig/openai/models/realtime/realtime_transcription_session_input_audio_transcription.rbs +59 -0
  145. data/sig/openai/models/realtime/realtime_transcription_session_turn_detection.rbs +47 -0
  146. data/sig/openai/models/realtime/realtime_truncation.rbs +1 -28
  147. data/sig/openai/models/realtime/realtime_truncation_retention_ratio.rbs +21 -0
  148. data/sig/openai/models/realtime/response_create_event.rbs +6 -249
  149. data/sig/openai/models/realtime/session_created_event.rbs +14 -4
  150. data/sig/openai/models/realtime/session_update_event.rbs +14 -4
  151. data/sig/openai/models/realtime/session_updated_event.rbs +14 -4
  152. data/sig/openai/models/realtime/transcription_session_created.rbs +4 -254
  153. data/sig/openai/models/realtime/transcription_session_update.rbs +154 -4
  154. data/sig/openai/models/realtime/transcription_session_updated_event.rbs +4 -254
  155. metadata +59 -5
  156. data/lib/openai/models/realtime/realtime_client_secret_config.rb +0 -64
  157. data/rbi/openai/models/realtime/realtime_client_secret_config.rbi +0 -147
  158. data/sig/openai/models/realtime/realtime_client_secret_config.rbs +0 -60
@@ -4,14 +4,6 @@ module OpenAI
4
4
  module Models
5
5
  module Realtime
6
6
  class RealtimeTranscriptionSessionCreateRequest < OpenAI::Internal::Type::BaseModel
7
- # @!attribute model
8
- # ID of the model to use. The options are `gpt-4o-transcribe`,
9
- # `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
10
- # Whisper V2 model).
11
- #
12
- # @return [String, Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::Model]
13
- required :model, union: -> { OpenAI::Realtime::RealtimeTranscriptionSessionCreateRequest::Model }
14
-
15
7
  # @!attribute type
16
8
  # The type of session to create. Always `transcription` for transcription
17
9
  # sessions.
@@ -19,106 +11,35 @@ module OpenAI
19
11
  # @return [Symbol, :transcription]
20
12
  required :type, const: :transcription
21
13
 
14
+ # @!attribute audio
15
+ # Configuration for input and output audio.
16
+ #
17
+ # @return [OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudio, nil]
18
+ optional :audio, -> { OpenAI::Realtime::RealtimeTranscriptionSessionAudio }
19
+
22
20
  # @!attribute include
23
- # The set of items to include in the transcription. Current available items are:
21
+ # Additional fields to include in server outputs.
24
22
  #
25
- # - `item.input_audio_transcription.logprobs`
23
+ # `item.input_audio_transcription.logprobs`: Include logprobs for input audio
24
+ # transcription.
26
25
  #
27
26
  # @return [Array<Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::Include>, nil]
28
27
  optional :include,
29
28
  -> { OpenAI::Internal::Type::ArrayOf[enum: OpenAI::Realtime::RealtimeTranscriptionSessionCreateRequest::Include] }
30
29
 
31
- # @!attribute input_audio_format
32
- # The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
33
- # `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
34
- # (mono), and little-endian byte order.
35
- #
36
- # @return [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::InputAudioFormat, nil]
37
- optional :input_audio_format,
38
- enum: -> { OpenAI::Realtime::RealtimeTranscriptionSessionCreateRequest::InputAudioFormat }
39
-
40
- # @!attribute input_audio_noise_reduction
41
- # Configuration for input audio noise reduction. This can be set to `null` to turn
42
- # off. Noise reduction filters audio added to the input audio buffer before it is
43
- # sent to VAD and the model. Filtering the audio can improve VAD and turn
44
- # detection accuracy (reducing false positives) and model performance by improving
45
- # perception of the input audio.
46
- #
47
- # @return [OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::InputAudioNoiseReduction, nil]
48
- optional :input_audio_noise_reduction,
49
- -> { OpenAI::Realtime::RealtimeTranscriptionSessionCreateRequest::InputAudioNoiseReduction }
50
-
51
- # @!attribute input_audio_transcription
52
- # Configuration for input audio transcription. The client can optionally set the
53
- # language and prompt for transcription, these offer additional guidance to the
54
- # transcription service.
55
- #
56
- # @return [OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::InputAudioTranscription, nil]
57
- optional :input_audio_transcription,
58
- -> { OpenAI::Realtime::RealtimeTranscriptionSessionCreateRequest::InputAudioTranscription }
59
-
60
- # @!attribute turn_detection
61
- # Configuration for turn detection. Can be set to `null` to turn off. Server VAD
62
- # means that the model will detect the start and end of speech based on audio
63
- # volume and respond at the end of user speech.
64
- #
65
- # @return [OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::TurnDetection, nil]
66
- optional :turn_detection,
67
- -> { OpenAI::Realtime::RealtimeTranscriptionSessionCreateRequest::TurnDetection }
68
-
69
- # @!method initialize(model:, include: nil, input_audio_format: nil, input_audio_noise_reduction: nil, input_audio_transcription: nil, turn_detection: nil, type: :transcription)
30
+ # @!method initialize(audio: nil, include: nil, type: :transcription)
70
31
  # Some parameter documentation has been truncated, see
71
32
  # {OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest} for more
72
33
  # details.
73
34
  #
74
35
  # Realtime transcription session object configuration.
75
36
  #
76
- # @param model [String, Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::Model] ID of the model to use. The options are `gpt-4o-transcribe`, `gpt-4o-mini-transc
77
- #
78
- # @param include [Array<Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::Include>] The set of items to include in the transcription. Current available items are:
79
- #
80
- # @param input_audio_format [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::InputAudioFormat] The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
81
- #
82
- # @param input_audio_noise_reduction [OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::InputAudioNoiseReduction] Configuration for input audio noise reduction. This can be set to `null` to turn
37
+ # @param audio [OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudio] Configuration for input and output audio.
83
38
  #
84
- # @param input_audio_transcription [OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::InputAudioTranscription] Configuration for input audio transcription. The client can optionally set the l
85
- #
86
- # @param turn_detection [OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::TurnDetection] Configuration for turn detection. Can be set to `null` to turn off. Server VAD m
39
+ # @param include [Array<Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::Include>] Additional fields to include in server outputs.
87
40
  #
88
41
  # @param type [Symbol, :transcription] The type of session to create. Always `transcription` for transcription sessions
89
42
 
90
- # ID of the model to use. The options are `gpt-4o-transcribe`,
91
- # `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
92
- # Whisper V2 model).
93
- #
94
- # @see OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest#model
95
- module Model
96
- extend OpenAI::Internal::Type::Union
97
-
98
- variant String
99
-
100
- variant const: -> { OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::Model::WHISPER_1 }
101
-
102
- variant const: -> { OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::Model::GPT_4O_TRANSCRIBE }
103
-
104
- variant const: -> { OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::Model::GPT_4O_MINI_TRANSCRIBE }
105
-
106
- # @!method self.variants
107
- # @return [Array(String, Symbol)]
108
-
109
- define_sorbet_constant!(:Variants) do
110
- T.type_alias { T.any(String, OpenAI::Realtime::RealtimeTranscriptionSessionCreateRequest::Model::TaggedSymbol) }
111
- end
112
-
113
- # @!group
114
-
115
- WHISPER_1 = :"whisper-1"
116
- GPT_4O_TRANSCRIBE = :"gpt-4o-transcribe"
117
- GPT_4O_MINI_TRANSCRIBE = :"gpt-4o-mini-transcribe"
118
-
119
- # @!endgroup
120
- end
121
-
122
43
  module Include
123
44
  extend OpenAI::Internal::Type::Enum
124
45
 
@@ -127,185 +48,6 @@ module OpenAI
127
48
  # @!method self.values
128
49
  # @return [Array<Symbol>]
129
50
  end
130
-
131
- # The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
132
- # `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
133
- # (mono), and little-endian byte order.
134
- #
135
- # @see OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest#input_audio_format
136
- module InputAudioFormat
137
- extend OpenAI::Internal::Type::Enum
138
-
139
- PCM16 = :pcm16
140
- G711_ULAW = :g711_ulaw
141
- G711_ALAW = :g711_alaw
142
-
143
- # @!method self.values
144
- # @return [Array<Symbol>]
145
- end
146
-
147
- # @see OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest#input_audio_noise_reduction
148
- class InputAudioNoiseReduction < OpenAI::Internal::Type::BaseModel
149
- # @!attribute type
150
- # Type of noise reduction. `near_field` is for close-talking microphones such as
151
- # headphones, `far_field` is for far-field microphones such as laptop or
152
- # conference room microphones.
153
- #
154
- # @return [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::InputAudioNoiseReduction::Type, nil]
155
- optional :type,
156
- enum: -> { OpenAI::Realtime::RealtimeTranscriptionSessionCreateRequest::InputAudioNoiseReduction::Type }
157
-
158
- # @!method initialize(type: nil)
159
- # Some parameter documentations has been truncated, see
160
- # {OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::InputAudioNoiseReduction}
161
- # for more details.
162
- #
163
- # Configuration for input audio noise reduction. This can be set to `null` to turn
164
- # off. Noise reduction filters audio added to the input audio buffer before it is
165
- # sent to VAD and the model. Filtering the audio can improve VAD and turn
166
- # detection accuracy (reducing false positives) and model performance by improving
167
- # perception of the input audio.
168
- #
169
- # @param type [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::InputAudioNoiseReduction::Type] Type of noise reduction. `near_field` is for close-talking microphones such as h
170
-
171
- # Type of noise reduction. `near_field` is for close-talking microphones such as
172
- # headphones, `far_field` is for far-field microphones such as laptop or
173
- # conference room microphones.
174
- #
175
- # @see OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::InputAudioNoiseReduction#type
176
- module Type
177
- extend OpenAI::Internal::Type::Enum
178
-
179
- NEAR_FIELD = :near_field
180
- FAR_FIELD = :far_field
181
-
182
- # @!method self.values
183
- # @return [Array<Symbol>]
184
- end
185
- end
186
-
187
- # @see OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest#input_audio_transcription
188
- class InputAudioTranscription < OpenAI::Internal::Type::BaseModel
189
- # @!attribute language
190
- # The language of the input audio. Supplying the input language in
191
- # [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
192
- # format will improve accuracy and latency.
193
- #
194
- # @return [String, nil]
195
- optional :language, String
196
-
197
- # @!attribute model
198
- # The model to use for transcription, current options are `gpt-4o-transcribe`,
199
- # `gpt-4o-mini-transcribe`, and `whisper-1`.
200
- #
201
- # @return [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::InputAudioTranscription::Model, nil]
202
- optional :model,
203
- enum: -> { OpenAI::Realtime::RealtimeTranscriptionSessionCreateRequest::InputAudioTranscription::Model }
204
-
205
- # @!attribute prompt
206
- # An optional text to guide the model's style or continue a previous audio
207
- # segment. For `whisper-1`, the
208
- # [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
209
- # For `gpt-4o-transcribe` models, the prompt is a free text string, for example
210
- # "expect words related to technology".
211
- #
212
- # @return [String, nil]
213
- optional :prompt, String
214
-
215
- # @!method initialize(language: nil, model: nil, prompt: nil)
216
- # Some parameter documentations has been truncated, see
217
- # {OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::InputAudioTranscription}
218
- # for more details.
219
- #
220
- # Configuration for input audio transcription. The client can optionally set the
221
- # language and prompt for transcription, these offer additional guidance to the
222
- # transcription service.
223
- #
224
- # @param language [String] The language of the input audio. Supplying the input language in
225
- #
226
- # @param model [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::InputAudioTranscription::Model] The model to use for transcription, current options are `gpt-4o-transcribe`, `gp
227
- #
228
- # @param prompt [String] An optional text to guide the model's style or continue a previous audio
229
-
230
- # The model to use for transcription, current options are `gpt-4o-transcribe`,
231
- # `gpt-4o-mini-transcribe`, and `whisper-1`.
232
- #
233
- # @see OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::InputAudioTranscription#model
234
- module Model
235
- extend OpenAI::Internal::Type::Enum
236
-
237
- GPT_4O_TRANSCRIBE = :"gpt-4o-transcribe"
238
- GPT_4O_MINI_TRANSCRIBE = :"gpt-4o-mini-transcribe"
239
- WHISPER_1 = :"whisper-1"
240
-
241
- # @!method self.values
242
- # @return [Array<Symbol>]
243
- end
244
- end
245
-
246
- # @see OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest#turn_detection
247
- class TurnDetection < OpenAI::Internal::Type::BaseModel
248
- # @!attribute prefix_padding_ms
249
- # Amount of audio to include before the VAD detected speech (in milliseconds).
250
- # Defaults to 300ms.
251
- #
252
- # @return [Integer, nil]
253
- optional :prefix_padding_ms, Integer
254
-
255
- # @!attribute silence_duration_ms
256
- # Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
257
- # With shorter values the model will respond more quickly, but may jump in on
258
- # short pauses from the user.
259
- #
260
- # @return [Integer, nil]
261
- optional :silence_duration_ms, Integer
262
-
263
- # @!attribute threshold
264
- # Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
265
- # threshold will require louder audio to activate the model, and thus might
266
- # perform better in noisy environments.
267
- #
268
- # @return [Float, nil]
269
- optional :threshold, Float
270
-
271
- # @!attribute type
272
- # Type of turn detection. Only `server_vad` is currently supported for
273
- # transcription sessions.
274
- #
275
- # @return [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::TurnDetection::Type, nil]
276
- optional :type,
277
- enum: -> { OpenAI::Realtime::RealtimeTranscriptionSessionCreateRequest::TurnDetection::Type }
278
-
279
- # @!method initialize(prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: nil)
280
- # Some parameter documentations has been truncated, see
281
- # {OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::TurnDetection}
282
- # for more details.
283
- #
284
- # Configuration for turn detection. Can be set to `null` to turn off. Server VAD
285
- # means that the model will detect the start and end of speech based on audio
286
- # volume and respond at the end of user speech.
287
- #
288
- # @param prefix_padding_ms [Integer] Amount of audio to include before the VAD detected speech (in
289
- #
290
- # @param silence_duration_ms [Integer] Duration of silence to detect speech stop (in milliseconds). Defaults
291
- #
292
- # @param threshold [Float] Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A
293
- #
294
- # @param type [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::TurnDetection::Type] Type of turn detection. Only `server_vad` is currently supported for transcripti
295
-
296
- # Type of turn detection. Only `server_vad` is currently supported for
297
- # transcription sessions.
298
- #
299
- # @see OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateRequest::TurnDetection#type
300
- module Type
301
- extend OpenAI::Internal::Type::Enum
302
-
303
- SERVER_VAD = :server_vad
304
-
305
- # @!method self.values
306
- # @return [Array<Symbol>]
307
- end
308
- end
309
51
  end
310
52
  end
311
53
  end
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
+ module OpenAI
4
+ module Models
5
+ module Realtime
6
+ class RealtimeTranscriptionSessionCreateResponse < OpenAI::Internal::Type::BaseModel
7
+ # @!attribute client_secret
8
+ # Ephemeral key returned by the API. Only present when the session is created on
9
+ # the server via REST API.
10
+ #
11
+ # @return [OpenAI::Models::Realtime::RealtimeTranscriptionSessionClientSecret]
12
+ required :client_secret, -> { OpenAI::Realtime::RealtimeTranscriptionSessionClientSecret }
13
+
14
+ # @!attribute input_audio_format
15
+ # The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
16
+ #
17
+ # @return [String, nil]
18
+ optional :input_audio_format, String
19
+
20
+ # @!attribute input_audio_transcription
21
+ # Configuration of the transcription model.
22
+ #
23
+ # @return [OpenAI::Models::Realtime::RealtimeTranscriptionSessionInputAudioTranscription, nil]
24
+ optional :input_audio_transcription,
25
+ -> { OpenAI::Realtime::RealtimeTranscriptionSessionInputAudioTranscription }
26
+
27
+ # @!attribute modalities
28
+ # The set of modalities the model can respond with. To disable audio, set this to
29
+ # ["text"].
30
+ #
31
+ # @return [Array<Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateResponse::Modality>, nil]
32
+ optional :modalities,
33
+ -> { OpenAI::Internal::Type::ArrayOf[enum: OpenAI::Realtime::RealtimeTranscriptionSessionCreateResponse::Modality] }
34
+
35
+ # @!attribute turn_detection
36
+ # Configuration for turn detection. Can be set to `null` to turn off. Server VAD
37
+ # means that the model will detect the start and end of speech based on audio
38
+ # volume and respond at the end of user speech.
39
+ #
40
+ # @return [OpenAI::Models::Realtime::RealtimeTranscriptionSessionTurnDetection, nil]
41
+ optional :turn_detection, -> { OpenAI::Realtime::RealtimeTranscriptionSessionTurnDetection }
42
+
43
+ # @!method initialize(client_secret:, input_audio_format: nil, input_audio_transcription: nil, modalities: nil, turn_detection: nil)
44
+ # Some parameter documentation has been truncated, see
45
+ # {OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateResponse} for more
46
+ # details.
47
+ #
48
+ # A new Realtime transcription session configuration.
49
+ #
50
+ # When a session is created on the server via REST API, the session object also
51
+ # contains an ephemeral key. Default TTL for keys is 10 minutes. This property is
52
+ # not present when a session is updated via the WebSocket API.
53
+ #
54
+ # @param client_secret [OpenAI::Models::Realtime::RealtimeTranscriptionSessionClientSecret] Ephemeral key returned by the API. Only present when the session is
55
+ #
56
+ # @param input_audio_format [String] The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
57
+ #
58
+ # @param input_audio_transcription [OpenAI::Models::Realtime::RealtimeTranscriptionSessionInputAudioTranscription] Configuration of the transcription model.
59
+ #
60
+ # @param modalities [Array<Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionCreateResponse::Modality>] The set of modalities the model can respond with. To disable audio,
61
+ #
62
+ # @param turn_detection [OpenAI::Models::Realtime::RealtimeTranscriptionSessionTurnDetection] Configuration for turn detection. Can be set to `null` to turn off. Server
63
+
64
+ module Modality
65
+ extend OpenAI::Internal::Type::Enum
66
+
67
+ TEXT = :text
68
+ AUDIO = :audio
69
+
70
+ # @!method self.values
71
+ # @return [Array<Symbol>]
72
+ end
73
+ end
74
+ end
75
+
76
+ RealtimeTranscriptionSessionCreateResponse = Realtime::RealtimeTranscriptionSessionCreateResponse
77
+ end
78
+ end
@@ -0,0 +1,66 @@
1
+ # frozen_string_literal: true
2
+
3
+ module OpenAI
4
+ module Models
5
+ module Realtime
6
+ class RealtimeTranscriptionSessionInputAudioTranscription < OpenAI::Internal::Type::BaseModel
7
+ # @!attribute language
8
+ # The language of the input audio. Supplying the input language in
9
+ # [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
10
+ # format will improve accuracy and latency.
11
+ #
12
+ # @return [String, nil]
13
+ optional :language, String
14
+
15
+ # @!attribute model
16
+ # The model to use for transcription. Current options are `whisper-1`,
17
+ # `gpt-4o-transcribe-latest`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`.
18
+ #
19
+ # @return [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionInputAudioTranscription::Model, nil]
20
+ optional :model, enum: -> { OpenAI::Realtime::RealtimeTranscriptionSessionInputAudioTranscription::Model }
21
+
22
+ # @!attribute prompt
23
+ # An optional text to guide the model's style or continue a previous audio
24
+ # segment. For `whisper-1`, the
25
+ # [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
26
+ # For `gpt-4o-transcribe` models, the prompt is a free text string, for example
27
+ # "expect words related to technology".
28
+ #
29
+ # @return [String, nil]
30
+ optional :prompt, String
31
+
32
+ # @!method initialize(language: nil, model: nil, prompt: nil)
33
+ # Some parameter documentation has been truncated, see
34
+ # {OpenAI::Models::Realtime::RealtimeTranscriptionSessionInputAudioTranscription}
35
+ # for more details.
36
+ #
37
+ # Configuration of the transcription model.
38
+ #
39
+ # @param language [String] The language of the input audio. Supplying the input language in
40
+ #
41
+ # @param model [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionInputAudioTranscription::Model] The model to use for transcription. Current options are `whisper-1`, `gpt-4o-tra
42
+ #
43
+ # @param prompt [String] An optional text to guide the model's style or continue a previous audio
44
+
45
+ # The model to use for transcription. Current options are `whisper-1`,
46
+ # `gpt-4o-transcribe-latest`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`.
47
+ #
48
+ # @see OpenAI::Models::Realtime::RealtimeTranscriptionSessionInputAudioTranscription#model
49
+ module Model
50
+ extend OpenAI::Internal::Type::Enum
51
+
52
+ WHISPER_1 = :"whisper-1"
53
+ GPT_4O_TRANSCRIBE_LATEST = :"gpt-4o-transcribe-latest"
54
+ GPT_4O_MINI_TRANSCRIBE = :"gpt-4o-mini-transcribe"
55
+ GPT_4O_TRANSCRIBE = :"gpt-4o-transcribe"
56
+
57
+ # @!method self.values
58
+ # @return [Array<Symbol>]
59
+ end
60
+ end
61
+ end
62
+
63
+ RealtimeTranscriptionSessionInputAudioTranscription =
64
+ Realtime::RealtimeTranscriptionSessionInputAudioTranscription
65
+ end
66
+ end
@@ -0,0 +1,57 @@
1
+ # frozen_string_literal: true
2
+
3
+ module OpenAI
4
+ module Models
5
+ module Realtime
6
+ class RealtimeTranscriptionSessionTurnDetection < OpenAI::Internal::Type::BaseModel
7
+ # @!attribute prefix_padding_ms
8
+ # Amount of audio to include before the VAD detected speech (in milliseconds).
9
+ # Defaults to 300ms.
10
+ #
11
+ # @return [Integer, nil]
12
+ optional :prefix_padding_ms, Integer
13
+
14
+ # @!attribute silence_duration_ms
15
+ # Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
16
+ # With shorter values the model will respond more quickly, but may jump in on
17
+ # short pauses from the user.
18
+ #
19
+ # @return [Integer, nil]
20
+ optional :silence_duration_ms, Integer
21
+
22
+ # @!attribute threshold
23
+ # Activation threshold for VAD (0.0 to 1.0); this defaults to 0.5. A higher
24
+ # threshold will require louder audio to activate the model, and thus might
25
+ # perform better in noisy environments.
26
+ #
27
+ # @return [Float, nil]
28
+ optional :threshold, Float
29
+
30
+ # @!attribute type
31
+ # Type of turn detection, only `server_vad` is currently supported.
32
+ #
33
+ # @return [String, nil]
34
+ optional :type, String
35
+
36
+ # @!method initialize(prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: nil)
37
+ # Some parameter documentation has been truncated, see
38
+ # {OpenAI::Models::Realtime::RealtimeTranscriptionSessionTurnDetection} for more
39
+ # details.
40
+ #
41
+ # Configuration for turn detection. Can be set to `null` to turn off. Server VAD
42
+ # means that the model will detect the start and end of speech based on audio
43
+ # volume and respond at the end of user speech.
44
+ #
45
+ # @param prefix_padding_ms [Integer] Amount of audio to include before the VAD detected speech (in
46
+ #
47
+ # @param silence_duration_ms [Integer] Duration of silence to detect speech stop (in milliseconds). Defaults
48
+ #
49
+ # @param threshold [Float] Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A
50
+ #
51
+ # @param type [String] Type of turn detection, only `server_vad` is currently supported.
52
+ end
53
+ end
54
+
55
+ RealtimeTranscriptionSessionTurnDetection = Realtime::RealtimeTranscriptionSessionTurnDetection
56
+ end
57
+ end
@@ -4,18 +4,19 @@ module OpenAI
4
4
  module Models
5
5
  module Realtime
6
6
  # Controls how the realtime conversation is truncated prior to model inference.
7
- # The default is `auto`. When set to `retention_ratio`, the server retains a
8
- # fraction of the conversation tokens prior to the instructions.
7
+ # The default is `auto`.
9
8
  module RealtimeTruncation
10
9
  extend OpenAI::Internal::Type::Union
11
10
 
12
- # The truncation strategy to use for the session.
11
+ # The truncation strategy to use for the session. `auto` is the default truncation strategy. `disabled` will disable truncation and emit errors when the conversation exceeds the input token limit.
13
12
  variant enum: -> { OpenAI::Realtime::RealtimeTruncation::RealtimeTruncationStrategy }
14
13
 
15
- # Retain a fraction of the conversation tokens.
16
- variant -> { OpenAI::Realtime::RealtimeTruncation::RetentionRatioTruncation }
14
+ # Retain a fraction of the conversation tokens when the conversation exceeds the input token limit. This allows you to amortize truncations across multiple turns, which can help improve cached token usage.
15
+ variant -> { OpenAI::Realtime::RealtimeTruncationRetentionRatio }
17
16
 
18
- # The truncation strategy to use for the session.
17
+ # The truncation strategy to use for the session. `auto` is the default truncation
18
+ # strategy. `disabled` will disable truncation and emit errors when the
19
+ # conversation exceeds the input token limit.
19
20
  module RealtimeTruncationStrategy
20
21
  extend OpenAI::Internal::Type::Enum
21
22
 
@@ -26,41 +27,8 @@ module OpenAI
26
27
  # @return [Array<Symbol>]
27
28
  end
28
29
 
29
- class RetentionRatioTruncation < OpenAI::Internal::Type::BaseModel
30
- # @!attribute retention_ratio
31
- # Fraction of pre-instruction conversation tokens to retain (0.0 - 1.0).
32
- #
33
- # @return [Float]
34
- required :retention_ratio, Float
35
-
36
- # @!attribute type
37
- # Use retention ratio truncation.
38
- #
39
- # @return [Symbol, :retention_ratio]
40
- required :type, const: :retention_ratio
41
-
42
- # @!attribute post_instructions_token_limit
43
- # Optional cap on tokens allowed after the instructions.
44
- #
45
- # @return [Integer, nil]
46
- optional :post_instructions_token_limit, Integer, nil?: true
47
-
48
- # @!method initialize(retention_ratio:, post_instructions_token_limit: nil, type: :retention_ratio)
49
- # Some parameter documentations has been truncated, see
50
- # {OpenAI::Models::Realtime::RealtimeTruncation::RetentionRatioTruncation} for
51
- # more details.
52
- #
53
- # Retain a fraction of the conversation tokens.
54
- #
55
- # @param retention_ratio [Float] Fraction of pre-instruction conversation tokens to retain (0.0 - 1.0).
56
- #
57
- # @param post_instructions_token_limit [Integer, nil] Optional cap on tokens allowed after the instructions.
58
- #
59
- # @param type [Symbol, :retention_ratio] Use retention ratio truncation.
60
- end
61
-
62
30
  # @!method self.variants
63
- # @return [Array(Symbol, OpenAI::Models::Realtime::RealtimeTruncation::RealtimeTruncationStrategy, OpenAI::Models::Realtime::RealtimeTruncation::RetentionRatioTruncation)]
31
+ # @return [Array(Symbol, OpenAI::Models::Realtime::RealtimeTruncation::RealtimeTruncationStrategy, OpenAI::Models::Realtime::RealtimeTruncationRetentionRatio)]
64
32
  end
65
33
  end
66
34
  end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ module OpenAI
4
+ module Models
5
+ module Realtime
6
+ class RealtimeTruncationRetentionRatio < OpenAI::Internal::Type::BaseModel
7
+ # @!attribute retention_ratio
8
+ # Fraction of post-instruction conversation tokens to retain (0.0 - 1.0) when the
9
+ # conversation exceeds the input token limit.
10
+ #
11
+ # @return [Float]
12
+ required :retention_ratio, Float
13
+
14
+ # @!attribute type
15
+ # Use retention ratio truncation.
16
+ #
17
+ # @return [Symbol, :retention_ratio]
18
+ required :type, const: :retention_ratio
19
+
20
+ # @!method initialize(retention_ratio:, type: :retention_ratio)
21
+ # Some parameter documentation has been truncated, see
22
+ # {OpenAI::Models::Realtime::RealtimeTruncationRetentionRatio} for more details.
23
+ #
24
+ # Retain a fraction of the conversation tokens when the conversation exceeds the
25
+ # input token limit. This allows you to amortize truncations across multiple
26
+ # turns, which can help improve cached token usage.
27
+ #
28
+ # @param retention_ratio [Float] Fraction of post-instruction conversation tokens to retain (0.0 - 1.0) when the
29
+ #
30
+ # @param type [Symbol, :retention_ratio] Use retention ratio truncation.
31
+ end
32
+ end
33
+ end
34
+ end
@@ -29,7 +29,9 @@ module OpenAI
29
29
  #
30
30
  # Send this event to cancel an in-progress response. The server will respond with
31
31
  # a `response.done` event with a status of `response.status=cancelled`. If there
32
- # is no response to cancel, the server will respond with an error.
32
+ # is no response to cancel, the server will respond with an error. It's safe to
33
+ # call `response.cancel` even if no response is in progress; an error will be
34
+ # returned, but the session will remain unaffected.
33
35
  #
34
36
  # @param event_id [String] Optional client-generated ID used to identify this event.
35
37
  #