openai 0.23.1 → 0.23.2

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (31)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/README.md +1 -1
  4. data/lib/openai/models/realtime/input_audio_buffer_timeout_triggered.rb +25 -5
  5. data/lib/openai/models/realtime/realtime_audio_config_input.rb +14 -11
  6. data/lib/openai/models/realtime/realtime_audio_input_turn_detection.rb +173 -117
  7. data/lib/openai/models/realtime/realtime_server_event.rb +13 -1
  8. data/lib/openai/models/realtime/realtime_session.rb +179 -118
  9. data/lib/openai/models/realtime/realtime_session_create_response.rb +184 -122
  10. data/lib/openai/models/realtime/realtime_transcription_session_audio_input.rb +16 -11
  11. data/lib/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rb +175 -117
  12. data/lib/openai/models/responses/response.rb +8 -8
  13. data/lib/openai/models/responses/response_create_params.rb +8 -8
  14. data/lib/openai/version.rb +1 -1
  15. data/rbi/openai/models/realtime/input_audio_buffer_timeout_triggered.rbi +24 -5
  16. data/rbi/openai/models/realtime/realtime_audio_config_input.rbi +44 -28
  17. data/rbi/openai/models/realtime/realtime_audio_input_turn_detection.rbi +264 -203
  18. data/rbi/openai/models/realtime/realtime_session.rbi +306 -231
  19. data/rbi/openai/models/realtime/realtime_session_create_response.rbi +298 -232
  20. data/rbi/openai/models/realtime/realtime_transcription_session_audio_input.rbi +39 -28
  21. data/rbi/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbi +264 -200
  22. data/rbi/openai/models/responses/response.rbi +12 -12
  23. data/rbi/openai/models/responses/response_create_params.rbi +12 -12
  24. data/rbi/openai/resources/responses.rbi +8 -8
  25. data/sig/openai/models/realtime/realtime_audio_config_input.rbs +4 -8
  26. data/sig/openai/models/realtime/realtime_audio_input_turn_detection.rbs +91 -65
  27. data/sig/openai/models/realtime/realtime_session.rbs +95 -69
  28. data/sig/openai/models/realtime/realtime_session_create_response.rbs +95 -73
  29. data/sig/openai/models/realtime/realtime_transcription_session_audio_input.rbs +4 -8
  30. data/sig/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbs +91 -65
  31. metadata +2 -2
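
The headline change in this release is to the realtime turn-detection types: RealtimeAudioInputTurnDetection (and its session and transcription-session counterparts) is converted from a single model carrying a Type enum into a union of dedicated ServerVad and SemanticVad classes. The Eagerness enum now lives under SemanticVad, and the server-VAD-only idle_timeout_ms field gains detailed documentation on ServerVad. The .rbi diff below shows the change in full. As a minimal sketch of what construction looks like after this change (keyword names are taken from the .rbi below; the numeric values are illustrative, not library defaults):

require "openai"

# Server VAD variant: detects speech start/stop from audio volume.
server_vad = OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad.new(
  threshold: 0.6,            # louder audio needed to trigger VAD
  prefix_padding_ms: 300,    # audio kept before detected speech
  silence_duration_ms: 500,  # silence needed to end the turn
  idle_timeout_ms: 10_000,   # emits input_audio_buffer.timeout_triggered when idle
  create_response: true,
  interrupt_response: true
)                            # type: defaults to :server_vad

# Semantic VAD variant: eagerness accepts :low, :medium, :high, or :auto
# per the Eagerness enum in the diff below.
semantic_vad = OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad.new(
  eagerness: :high,
  create_response: true,
  interrupt_response: true
)                            # type: defaults to :semantic_vad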
data/rbi/openai/models/realtime/realtime_audio_input_turn_detection.rbi
@@ -3,259 +3,320 @@
  module OpenAI
  module Models
  module Realtime
- class RealtimeAudioInputTurnDetection < OpenAI::Internal::Type::BaseModel
- OrHash =
+ # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
+ # set to `null` to turn off, in which case the client must manually trigger model
+ # response.
+ #
+ # Server VAD means that the model will detect the start and end of speech based on
+ # audio volume and respond at the end of user speech.
+ #
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ # with VAD) to semantically estimate whether the user has finished speaking, then
+ # dynamically sets a timeout based on this probability. For example, if user audio
+ # trails off with "uhhm", the model will score a low probability of turn end and
+ # wait longer for the user to continue speaking. This can be useful for more
+ # natural conversations, but may have a higher latency.
+ module RealtimeAudioInputTurnDetection
+ extend OpenAI::Internal::Type::Union
+
+ Variants =
  T.type_alias do
  T.any(
- OpenAI::Realtime::RealtimeAudioInputTurnDetection,
- OpenAI::Internal::AnyHash
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad,
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad
  )
  end

- # Whether or not to automatically generate a response when a VAD stop event
- # occurs.
- sig { returns(T.nilable(T::Boolean)) }
- attr_reader :create_response
-
- sig { params(create_response: T::Boolean).void }
- attr_writer :create_response
-
- # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
- # will wait longer for the user to continue speaking, `high` will respond more
- # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
- # and `high` have max timeouts of 8s, 4s, and 2s respectively.
- sig do
- returns(
- T.nilable(
- OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::OrSymbol
- )
- )
- end
- attr_reader :eagerness
-
- sig do
- params(
- eagerness:
- OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::OrSymbol
- ).void
- end
- attr_writer :eagerness
-
- # Optional idle timeout after which turn detection will auto-timeout when no
- # additional audio is received and emits a `timeout_triggered` event.
- sig { returns(T.nilable(Integer)) }
- attr_accessor :idle_timeout_ms
-
- # Whether or not to automatically interrupt any ongoing response with output to
- # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
- # occurs.
- sig { returns(T.nilable(T::Boolean)) }
- attr_reader :interrupt_response
-
- sig { params(interrupt_response: T::Boolean).void }
- attr_writer :interrupt_response
-
- # Used only for `server_vad` mode. Amount of audio to include before the VAD
- # detected speech (in milliseconds). Defaults to 300ms.
- sig { returns(T.nilable(Integer)) }
- attr_reader :prefix_padding_ms
-
- sig { params(prefix_padding_ms: Integer).void }
- attr_writer :prefix_padding_ms
-
- # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
- # milliseconds). Defaults to 500ms. With shorter values the model will respond
- # more quickly, but may jump in on short pauses from the user.
- sig { returns(T.nilable(Integer)) }
- attr_reader :silence_duration_ms
+ class ServerVad < OpenAI::Internal::Type::BaseModel
+ OrHash =
+ T.type_alias do
+ T.any(
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad,
+ OpenAI::Internal::AnyHash
+ )
+ end

- sig { params(silence_duration_ms: Integer).void }
- attr_writer :silence_duration_ms
+ # Type of turn detection, `server_vad` to turn on simple Server VAD.
+ sig { returns(Symbol) }
+ attr_accessor :type

- # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
- # defaults to 0.5. A higher threshold will require louder audio to activate the
- # model, and thus might perform better in noisy environments.
- sig { returns(T.nilable(Float)) }
- attr_reader :threshold
+ # Whether or not to automatically generate a response when a VAD stop event
+ # occurs.
+ sig { returns(T.nilable(T::Boolean)) }
+ attr_reader :create_response

- sig { params(threshold: Float).void }
- attr_writer :threshold
+ sig { params(create_response: T::Boolean).void }
+ attr_writer :create_response

- # Type of turn detection.
- sig do
- returns(
- T.nilable(
- OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type::OrSymbol
- )
- )
- end
- attr_reader :type
+ # Optional timeout after which a model response will be triggered automatically.
+ # This is useful for situations in which a long pause from the user is unexpected,
+ # such as a phone call. The model will effectively prompt the user to continue the
+ # conversation based on the current context.
+ #
+ # The timeout value will be applied after the last model response's audio has
+ # finished playing, i.e. it's set to the `response.done` time plus audio playback
+ # duration.
+ #
+ # An `input_audio_buffer.timeout_triggered` event (plus events associated with the
+ # Response) will be emitted when the timeout is reached. Idle timeout is currently
+ # only supported for `server_vad` mode.
+ sig { returns(T.nilable(Integer)) }
+ attr_accessor :idle_timeout_ms

- sig do
- params(
- type:
- OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type::OrSymbol
- ).void
- end
- attr_writer :type
-
- # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
- # set to `null` to turn off, in which case the client must manually trigger model
- # response. Server VAD means that the model will detect the start and end of
- # speech based on audio volume and respond at the end of user speech. Semantic VAD
- # is more advanced and uses a turn detection model (in conjunction with VAD) to
- # semantically estimate whether the user has finished speaking, then dynamically
- # sets a timeout based on this probability. For example, if user audio trails off
- # with "uhhm", the model will score a low probability of turn end and wait longer
- # for the user to continue speaking. This can be useful for more natural
- # conversations, but may have a higher latency.
- sig do
- params(
- create_response: T::Boolean,
- eagerness:
- OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::OrSymbol,
- idle_timeout_ms: T.nilable(Integer),
- interrupt_response: T::Boolean,
- prefix_padding_ms: Integer,
- silence_duration_ms: Integer,
- threshold: Float,
- type:
- OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type::OrSymbol
- ).returns(T.attached_class)
- end
- def self.new(
- # Whether or not to automatically generate a response when a VAD stop event
- # occurs.
- create_response: nil,
- # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
- # will wait longer for the user to continue speaking, `high` will respond more
- # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
- # and `high` have max timeouts of 8s, 4s, and 2s respectively.
- eagerness: nil,
- # Optional idle timeout after which turn detection will auto-timeout when no
- # additional audio is received and emits a `timeout_triggered` event.
- idle_timeout_ms: nil,
  # Whether or not to automatically interrupt any ongoing response with output to
  # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
  # occurs.
- interrupt_response: nil,
+ sig { returns(T.nilable(T::Boolean)) }
+ attr_reader :interrupt_response
+
+ sig { params(interrupt_response: T::Boolean).void }
+ attr_writer :interrupt_response
+
  # Used only for `server_vad` mode. Amount of audio to include before the VAD
  # detected speech (in milliseconds). Defaults to 300ms.
- prefix_padding_ms: nil,
+ sig { returns(T.nilable(Integer)) }
+ attr_reader :prefix_padding_ms
+
+ sig { params(prefix_padding_ms: Integer).void }
+ attr_writer :prefix_padding_ms
+
  # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
  # milliseconds). Defaults to 500ms. With shorter values the model will respond
  # more quickly, but may jump in on short pauses from the user.
- silence_duration_ms: nil,
+ sig { returns(T.nilable(Integer)) }
+ attr_reader :silence_duration_ms
+
+ sig { params(silence_duration_ms: Integer).void }
+ attr_writer :silence_duration_ms
+
  # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
  # defaults to 0.5. A higher threshold will require louder audio to activate the
  # model, and thus might perform better in noisy environments.
- threshold: nil,
- # Type of turn detection.
- type: nil
- )
- end
+ sig { returns(T.nilable(Float)) }
+ attr_reader :threshold

- sig do
- override.returns(
- {
+ sig { params(threshold: Float).void }
+ attr_writer :threshold
+
+ # Server-side voice activity detection (VAD) which flips on when user speech is
+ # detected and off after a period of silence.
+ sig do
+ params(
  create_response: T::Boolean,
- eagerness:
- OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::OrSymbol,
  idle_timeout_ms: T.nilable(Integer),
  interrupt_response: T::Boolean,
  prefix_padding_ms: Integer,
  silence_duration_ms: Integer,
  threshold: Float,
- type:
- OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type::OrSymbol
- }
+ type: Symbol
+ ).returns(T.attached_class)
+ end
+ def self.new(
+ # Whether or not to automatically generate a response when a VAD stop event
+ # occurs.
+ create_response: nil,
+ # Optional timeout after which a model response will be triggered automatically.
+ # This is useful for situations in which a long pause from the user is unexpected,
+ # such as a phone call. The model will effectively prompt the user to continue the
+ # conversation based on the current context.
+ #
+ # The timeout value will be applied after the last model response's audio has
+ # finished playing, i.e. it's set to the `response.done` time plus audio playback
+ # duration.
+ #
+ # An `input_audio_buffer.timeout_triggered` event (plus events associated with the
+ # Response) will be emitted when the timeout is reached. Idle timeout is currently
+ # only supported for `server_vad` mode.
+ idle_timeout_ms: nil,
+ # Whether or not to automatically interrupt any ongoing response with output to
+ # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+ # occurs.
+ interrupt_response: nil,
+ # Used only for `server_vad` mode. Amount of audio to include before the VAD
+ # detected speech (in milliseconds). Defaults to 300ms.
+ prefix_padding_ms: nil,
+ # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
+ # milliseconds). Defaults to 500ms. With shorter values the model will respond
+ # more quickly, but may jump in on short pauses from the user.
+ silence_duration_ms: nil,
+ # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
+ # defaults to 0.5. A higher threshold will require louder audio to activate the
+ # model, and thus might perform better in noisy environments.
+ threshold: nil,
+ # Type of turn detection, `server_vad` to turn on simple Server VAD.
+ type: :server_vad
  )
- end
- def to_hash
- end
+ end

- # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
- # will wait longer for the user to continue speaking, `high` will respond more
- # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
- # and `high` have max timeouts of 8s, 4s, and 2s respectively.
- module Eagerness
- extend OpenAI::Internal::Type::Enum
+ sig do
+ override.returns(
+ {
+ type: Symbol,
+ create_response: T::Boolean,
+ idle_timeout_ms: T.nilable(Integer),
+ interrupt_response: T::Boolean,
+ prefix_padding_ms: Integer,
+ silence_duration_ms: Integer,
+ threshold: Float
+ }
+ )
+ end
+ def to_hash
+ end
+ end

- TaggedSymbol =
+ class SemanticVad < OpenAI::Internal::Type::BaseModel
+ OrHash =
  T.type_alias do
- T.all(
- Symbol,
- OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness
+ T.any(
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad,
+ OpenAI::Internal::AnyHash
  )
  end
- OrSymbol = T.type_alias { T.any(Symbol, String) }

- LOW =
- T.let(
- :low,
- OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::TaggedSymbol
- )
- MEDIUM =
- T.let(
- :medium,
- OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::TaggedSymbol
- )
- HIGH =
- T.let(
- :high,
- OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::TaggedSymbol
- )
- AUTO =
- T.let(
- :auto,
- OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::TaggedSymbol
- )
+ # Type of turn detection, `semantic_vad` to turn on Semantic VAD.
+ sig { returns(Symbol) }
+ attr_accessor :type
+
+ # Whether or not to automatically generate a response when a VAD stop event
+ # occurs.
+ sig { returns(T.nilable(T::Boolean)) }
+ attr_reader :create_response
+
+ sig { params(create_response: T::Boolean).void }
+ attr_writer :create_response

+ # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
+ # will wait longer for the user to continue speaking, `high` will respond more
+ # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
+ # and `high` have max timeouts of 8s, 4s, and 2s respectively.
  sig do
- override.returns(
- T::Array[
- OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness::TaggedSymbol
- ]
+ returns(
+ T.nilable(
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol
+ )
  )
  end
- def self.values
+ attr_reader :eagerness
+
+ sig do
+ params(
+ eagerness:
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol
+ ).void
  end
- end
+ attr_writer :eagerness

- # Type of turn detection.
- module Type
- extend OpenAI::Internal::Type::Enum
+ # Whether or not to automatically interrupt any ongoing response with output to
+ # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+ # occurs.
+ sig { returns(T.nilable(T::Boolean)) }
+ attr_reader :interrupt_response

- TaggedSymbol =
- T.type_alias do
- T.all(
- Symbol,
- OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type
- )
- end
- OrSymbol = T.type_alias { T.any(Symbol, String) }
+ sig { params(interrupt_response: T::Boolean).void }
+ attr_writer :interrupt_response

- SERVER_VAD =
- T.let(
- :server_vad,
- OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type::TaggedSymbol
- )
- SEMANTIC_VAD =
- T.let(
- :semantic_vad,
- OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type::TaggedSymbol
- )
+ # Server-side semantic turn detection which uses a model to determine when the
+ # user has finished speaking.
+ sig do
+ params(
+ create_response: T::Boolean,
+ eagerness:
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol,
+ interrupt_response: T::Boolean,
+ type: Symbol
+ ).returns(T.attached_class)
+ end
+ def self.new(
+ # Whether or not to automatically generate a response when a VAD stop event
+ # occurs.
+ create_response: nil,
+ # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
+ # will wait longer for the user to continue speaking, `high` will respond more
+ # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
+ # and `high` have max timeouts of 8s, 4s, and 2s respectively.
+ eagerness: nil,
+ # Whether or not to automatically interrupt any ongoing response with output to
+ # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+ # occurs.
+ interrupt_response: nil,
+ # Type of turn detection, `semantic_vad` to turn on Semantic VAD.
+ type: :semantic_vad
+ )
+ end

  sig do
  override.returns(
- T::Array[
- OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type::TaggedSymbol
- ]
+ {
+ type: Symbol,
+ create_response: T::Boolean,
+ eagerness:
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol,
+ interrupt_response: T::Boolean
+ }
  )
  end
- def self.values
+ def to_hash
+ end
+
+ # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
+ # will wait longer for the user to continue speaking, `high` will respond more
+ # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
+ # and `high` have max timeouts of 8s, 4s, and 2s respectively.
+ module Eagerness
+ extend OpenAI::Internal::Type::Enum
+
+ TaggedSymbol =
+ T.type_alias do
+ T.all(
+ Symbol,
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness
+ )
+ end
+ OrSymbol = T.type_alias { T.any(Symbol, String) }
+
+ LOW =
+ T.let(
+ :low,
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol
+ )
+ MEDIUM =
+ T.let(
+ :medium,
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol
+ )
+ HIGH =
+ T.let(
+ :high,
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol
+ )
+ AUTO =
+ T.let(
+ :auto,
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol
+ )
+
+ sig do
+ override.returns(
+ T::Array[
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol
+ ]
+ )
+ end
+ def self.values
+ end
  end
  end
+
+ sig do
+ override.returns(
+ T::Array[
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::Variants
+ ]
+ )
+ end
+ def self.variants
+ end
  end
  end
  end
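
For completeness, a sketch of consuming the new union on the read side. Here turn_detection stands for an already-parsed value typed with this union (for example from one of the session or session-create-response models listed above); how that value is obtained from a live connection is outside the scope of this diff:

case turn_detection
when OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad
  # Server VAD: the new idle_timeout_ms reader is nil unless configured.
  puts "server_vad, idle_timeout_ms=#{turn_detection.idle_timeout_ms.inspect}"
when OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad
  puts "semantic_vad, eagerness=#{turn_detection.eagerness.inspect}"
when nil
  puts "turn detection disabled; responses must be triggered manually"
end

When idle_timeout_ms is set on the server VAD variant, the server emits an input_audio_buffer.timeout_triggered event once the timeout elapses after the last response's audio finishes playing; the corresponding InputAudioBufferTimeoutTriggered model is also updated in this release (see the file list above).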