openai 0.23.1 → 0.23.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/README.md +1 -1
  4. data/lib/openai/models/realtime/input_audio_buffer_timeout_triggered.rb +25 -5
  5. data/lib/openai/models/realtime/realtime_audio_config_input.rb +14 -11
  6. data/lib/openai/models/realtime/realtime_audio_input_turn_detection.rb +173 -117
  7. data/lib/openai/models/realtime/realtime_server_event.rb +13 -1
  8. data/lib/openai/models/realtime/realtime_session.rb +179 -118
  9. data/lib/openai/models/realtime/realtime_session_create_response.rb +184 -122
  10. data/lib/openai/models/realtime/realtime_transcription_session_audio_input.rb +16 -11
  11. data/lib/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rb +175 -117
  12. data/lib/openai/models/responses/response.rb +8 -8
  13. data/lib/openai/models/responses/response_create_params.rb +8 -8
  14. data/lib/openai/version.rb +1 -1
  15. data/rbi/openai/models/realtime/input_audio_buffer_timeout_triggered.rbi +24 -5
  16. data/rbi/openai/models/realtime/realtime_audio_config_input.rbi +44 -28
  17. data/rbi/openai/models/realtime/realtime_audio_input_turn_detection.rbi +264 -203
  18. data/rbi/openai/models/realtime/realtime_session.rbi +306 -231
  19. data/rbi/openai/models/realtime/realtime_session_create_response.rbi +298 -232
  20. data/rbi/openai/models/realtime/realtime_transcription_session_audio_input.rbi +39 -28
  21. data/rbi/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbi +264 -200
  22. data/rbi/openai/models/responses/response.rbi +12 -12
  23. data/rbi/openai/models/responses/response_create_params.rbi +12 -12
  24. data/rbi/openai/resources/responses.rbi +8 -8
  25. data/sig/openai/models/realtime/realtime_audio_config_input.rbs +4 -8
  26. data/sig/openai/models/realtime/realtime_audio_input_turn_detection.rbs +91 -65
  27. data/sig/openai/models/realtime/realtime_session.rbs +95 -69
  28. data/sig/openai/models/realtime/realtime_session_create_response.rbs +95 -73
  29. data/sig/openai/models/realtime/realtime_transcription_session_audio_input.rbs +4 -8
  30. data/sig/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbs +91 -65
  31. metadata +2 -2
@@ -80,30 +80,28 @@ module OpenAI
80
80
 
81
81
  # Configuration for turn detection, either Server VAD or Semantic VAD. This can be
82
82
  # set to `null` to turn off, in which case the client must manually trigger model
83
- # response. Server VAD means that the model will detect the start and end of
84
- # speech based on audio volume and respond at the end of user speech. Semantic VAD
85
- # is more advanced and uses a turn detection model (in conjunction with VAD) to
86
- # semantically estimate whether the user has finished speaking, then dynamically
87
- # sets a timeout based on this probability. For example, if user audio trails off
88
- # with "uhhm", the model will score a low probability of turn end and wait longer
89
- # for the user to continue speaking. This can be useful for more natural
90
- # conversations, but may have a higher latency.
83
+ # response.
84
+ #
85
+ # Server VAD means that the model will detect the start and end of speech based on
86
+ # audio volume and respond at the end of user speech.
87
+ #
88
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
89
+ # with VAD) to semantically estimate whether the user has finished speaking, then
90
+ # dynamically sets a timeout based on this probability. For example, if user audio
91
+ # trails off with "uhhm", the model will score a low probability of turn end and
92
+ # wait longer for the user to continue speaking. This can be useful for more
93
+ # natural conversations, but may have a higher latency.
91
94
  sig do
92
95
  returns(
93
96
  T.nilable(
94
- OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection
97
+ T.any(
98
+ OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad,
99
+ OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad
100
+ )
95
101
  )
96
102
  )
97
103
  end
98
- attr_reader :turn_detection
99
-
100
- sig do
101
- params(
102
- turn_detection:
103
- OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::OrHash
104
- ).void
105
- end
106
- attr_writer :turn_detection
104
+ attr_accessor :turn_detection
107
105
 
108
106
  sig do
109
107
  params(
@@ -117,7 +115,12 @@ module OpenAI
117
115
  OpenAI::Realtime::RealtimeTranscriptionSessionAudioInput::NoiseReduction::OrHash,
118
116
  transcription: OpenAI::Realtime::AudioTranscription::OrHash,
119
117
  turn_detection:
120
- OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::OrHash
118
+ T.nilable(
119
+ T.any(
120
+ OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad::OrHash,
121
+ OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::OrHash
122
+ )
123
+ )
121
124
  ).returns(T.attached_class)
122
125
  end
123
126
  def self.new(
@@ -140,14 +143,17 @@ module OpenAI
140
143
  transcription: nil,
141
144
  # Configuration for turn detection, either Server VAD or Semantic VAD. This can be
142
145
  # set to `null` to turn off, in which case the client must manually trigger model
143
- # response. Server VAD means that the model will detect the start and end of
144
- # speech based on audio volume and respond at the end of user speech. Semantic VAD
145
- # is more advanced and uses a turn detection model (in conjunction with VAD) to
146
- # semantically estimate whether the user has finished speaking, then dynamically
147
- # sets a timeout based on this probability. For example, if user audio trails off
148
- # with "uhhm", the model will score a low probability of turn end and wait longer
149
- # for the user to continue speaking. This can be useful for more natural
150
- # conversations, but may have a higher latency.
146
+ # response.
147
+ #
148
+ # Server VAD means that the model will detect the start and end of speech based on
149
+ # audio volume and respond at the end of user speech.
150
+ #
151
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
152
+ # with VAD) to semantically estimate whether the user has finished speaking, then
153
+ # dynamically sets a timeout based on this probability. For example, if user audio
154
+ # trails off with "uhhm", the model will score a low probability of turn end and
155
+ # wait longer for the user to continue speaking. This can be useful for more
156
+ # natural conversations, but may have a higher latency.
151
157
  turn_detection: nil
152
158
  )
153
159
  end
@@ -165,7 +171,12 @@ module OpenAI
165
171
  OpenAI::Realtime::RealtimeTranscriptionSessionAudioInput::NoiseReduction,
166
172
  transcription: OpenAI::Realtime::AudioTranscription,
167
173
  turn_detection:
168
- OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection
174
+ T.nilable(
175
+ T.any(
176
+ OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad,
177
+ OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad
178
+ )
179
+ )
169
180
  }
170
181
  )
171
182
  end
@@ -3,256 +3,320 @@
3
3
  module OpenAI
4
4
  module Models
5
5
  module Realtime
6
- class RealtimeTranscriptionSessionAudioInputTurnDetection < OpenAI::Internal::Type::BaseModel
7
- OrHash =
6
+ # Configuration for turn detection, either Server VAD or Semantic VAD. This can be
7
+ # set to `null` to turn off, in which case the client must manually trigger model
8
+ # response.
9
+ #
10
+ # Server VAD means that the model will detect the start and end of speech based on
11
+ # audio volume and respond at the end of user speech.
12
+ #
13
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
14
+ # with VAD) to semantically estimate whether the user has finished speaking, then
15
+ # dynamically sets a timeout based on this probability. For example, if user audio
16
+ # trails off with "uhhm", the model will score a low probability of turn end and
17
+ # wait longer for the user to continue speaking. This can be useful for more
18
+ # natural conversations, but may have a higher latency.
19
+ module RealtimeTranscriptionSessionAudioInputTurnDetection
20
+ extend OpenAI::Internal::Type::Union
21
+
22
+ Variants =
8
23
  T.type_alias do
9
24
  T.any(
10
- OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection,
11
- OpenAI::Internal::AnyHash
25
+ OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad,
26
+ OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad
12
27
  )
13
28
  end
14
29
 
15
- # Whether or not to automatically generate a response when a VAD stop event
16
- # occurs.
17
- sig { returns(T.nilable(T::Boolean)) }
18
- attr_reader :create_response
19
-
20
- sig { params(create_response: T::Boolean).void }
21
- attr_writer :create_response
22
-
23
- # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
24
- # will wait longer for the user to continue speaking, `high` will respond more
25
- # quickly. `auto` is the default and is equivalent to `medium`.
26
- sig do
27
- returns(
28
- T.nilable(
29
- OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::OrSymbol
30
- )
31
- )
32
- end
33
- attr_reader :eagerness
34
-
35
- sig do
36
- params(
37
- eagerness:
38
- OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::OrSymbol
39
- ).void
40
- end
41
- attr_writer :eagerness
42
-
43
- # Optional idle timeout after which turn detection will auto-timeout when no
44
- # additional audio is received.
45
- sig { returns(T.nilable(Integer)) }
46
- attr_accessor :idle_timeout_ms
47
-
48
- # Whether or not to automatically interrupt any ongoing response with output to
49
- # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
50
- # occurs.
51
- sig { returns(T.nilable(T::Boolean)) }
52
- attr_reader :interrupt_response
53
-
54
- sig { params(interrupt_response: T::Boolean).void }
55
- attr_writer :interrupt_response
56
-
57
- # Used only for `server_vad` mode. Amount of audio to include before the VAD
58
- # detected speech (in milliseconds). Defaults to 300ms.
59
- sig { returns(T.nilable(Integer)) }
60
- attr_reader :prefix_padding_ms
61
-
62
- sig { params(prefix_padding_ms: Integer).void }
63
- attr_writer :prefix_padding_ms
30
+ class ServerVad < OpenAI::Internal::Type::BaseModel
31
+ OrHash =
32
+ T.type_alias do
33
+ T.any(
34
+ OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad,
35
+ OpenAI::Internal::AnyHash
36
+ )
37
+ end
64
38
 
65
- # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
66
- # milliseconds). Defaults to 500ms. With shorter values the model will respond
67
- # more quickly, but may jump in on short pauses from the user.
68
- sig { returns(T.nilable(Integer)) }
69
- attr_reader :silence_duration_ms
39
+ # Type of turn detection, `server_vad` to turn on simple Server VAD.
40
+ sig { returns(Symbol) }
41
+ attr_accessor :type
70
42
 
71
- sig { params(silence_duration_ms: Integer).void }
72
- attr_writer :silence_duration_ms
43
+ # Whether or not to automatically generate a response when a VAD stop event
44
+ # occurs.
45
+ sig { returns(T.nilable(T::Boolean)) }
46
+ attr_reader :create_response
73
47
 
74
- # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
75
- # defaults to 0.5. A higher threshold will require louder audio to activate the
76
- # model, and thus might perform better in noisy environments.
77
- sig { returns(T.nilable(Float)) }
78
- attr_reader :threshold
48
+ sig { params(create_response: T::Boolean).void }
49
+ attr_writer :create_response
79
50
 
80
- sig { params(threshold: Float).void }
81
- attr_writer :threshold
51
+ # Optional timeout after which a model response will be triggered automatically.
52
+ # This is useful for situations in which a long pause from the user is unexpected,
53
+ # such as a phone call. The model will effectively prompt the user to continue the
54
+ # conversation based on the current context.
55
+ #
56
+ # The timeout value will be applied after the last model response's audio has
57
+ # finished playing, i.e. it's set to the `response.done` time plus audio playback
58
+ # duration.
59
+ #
60
+ # An `input_audio_buffer.timeout_triggered` event (plus events associated with the
61
+ # Response) will be emitted when the timeout is reached. Idle timeout is currently
62
+ # only supported for `server_vad` mode.
63
+ sig { returns(T.nilable(Integer)) }
64
+ attr_accessor :idle_timeout_ms
82
65
 
83
- # Type of turn detection.
84
- sig do
85
- returns(
86
- T.nilable(
87
- OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::OrSymbol
88
- )
89
- )
90
- end
91
- attr_reader :type
92
-
93
- sig do
94
- params(
95
- type:
96
- OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::OrSymbol
97
- ).void
98
- end
99
- attr_writer :type
100
-
101
- # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
102
- # set to `null` to turn off, in which case the client must manually trigger model
103
- # response. Server VAD means that the model will detect the start and end of
104
- # speech based on audio volume and respond at the end of user speech. Semantic VAD
105
- # is more advanced and uses a turn detection model (in conjunction with VAD) to
106
- # semantically estimate whether the user has finished speaking, then dynamically
107
- # sets a timeout based on this probability. For example, if user audio trails off
108
- # with "uhhm", the model will score a low probability of turn end and wait longer
109
- # for the user to continue speaking. This can be useful for more natural
110
- # conversations, but may have a higher latency.
111
- sig do
112
- params(
113
- create_response: T::Boolean,
114
- eagerness:
115
- OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::OrSymbol,
116
- idle_timeout_ms: T.nilable(Integer),
117
- interrupt_response: T::Boolean,
118
- prefix_padding_ms: Integer,
119
- silence_duration_ms: Integer,
120
- threshold: Float,
121
- type:
122
- OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::OrSymbol
123
- ).returns(T.attached_class)
124
- end
125
- def self.new(
126
- # Whether or not to automatically generate a response when a VAD stop event
127
- # occurs.
128
- create_response: nil,
129
- # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
130
- # will wait longer for the user to continue speaking, `high` will respond more
131
- # quickly. `auto` is the default and is equivalent to `medium`.
132
- eagerness: nil,
133
- # Optional idle timeout after which turn detection will auto-timeout when no
134
- # additional audio is received.
135
- idle_timeout_ms: nil,
136
66
  # Whether or not to automatically interrupt any ongoing response with output to
137
67
  # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
138
68
  # occurs.
139
- interrupt_response: nil,
69
+ sig { returns(T.nilable(T::Boolean)) }
70
+ attr_reader :interrupt_response
71
+
72
+ sig { params(interrupt_response: T::Boolean).void }
73
+ attr_writer :interrupt_response
74
+
140
75
  # Used only for `server_vad` mode. Amount of audio to include before the VAD
141
76
  # detected speech (in milliseconds). Defaults to 300ms.
142
- prefix_padding_ms: nil,
77
+ sig { returns(T.nilable(Integer)) }
78
+ attr_reader :prefix_padding_ms
79
+
80
+ sig { params(prefix_padding_ms: Integer).void }
81
+ attr_writer :prefix_padding_ms
82
+
143
83
  # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
144
84
  # milliseconds). Defaults to 500ms. With shorter values the model will respond
145
85
  # more quickly, but may jump in on short pauses from the user.
146
- silence_duration_ms: nil,
86
+ sig { returns(T.nilable(Integer)) }
87
+ attr_reader :silence_duration_ms
88
+
89
+ sig { params(silence_duration_ms: Integer).void }
90
+ attr_writer :silence_duration_ms
91
+
147
92
  # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
148
93
  # defaults to 0.5. A higher threshold will require louder audio to activate the
149
94
  # model, and thus might perform better in noisy environments.
150
- threshold: nil,
151
- # Type of turn detection.
152
- type: nil
153
- )
154
- end
95
+ sig { returns(T.nilable(Float)) }
96
+ attr_reader :threshold
155
97
 
156
- sig do
157
- override.returns(
158
- {
98
+ sig { params(threshold: Float).void }
99
+ attr_writer :threshold
100
+
101
+ # Server-side voice activity detection (VAD) which flips on when user speech is
102
+ # detected and off after a period of silence.
103
+ sig do
104
+ params(
159
105
  create_response: T::Boolean,
160
- eagerness:
161
- OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::OrSymbol,
162
106
  idle_timeout_ms: T.nilable(Integer),
163
107
  interrupt_response: T::Boolean,
164
108
  prefix_padding_ms: Integer,
165
109
  silence_duration_ms: Integer,
166
110
  threshold: Float,
167
- type:
168
- OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::OrSymbol
169
- }
111
+ type: Symbol
112
+ ).returns(T.attached_class)
113
+ end
114
+ def self.new(
115
+ # Whether or not to automatically generate a response when a VAD stop event
116
+ # occurs.
117
+ create_response: nil,
118
+ # Optional timeout after which a model response will be triggered automatically.
119
+ # This is useful for situations in which a long pause from the user is unexpected,
120
+ # such as a phone call. The model will effectively prompt the user to continue the
121
+ # conversation based on the current context.
122
+ #
123
+ # The timeout value will be applied after the last model response's audio has
124
+ # finished playing, i.e. it's set to the `response.done` time plus audio playback
125
+ # duration.
126
+ #
127
+ # An `input_audio_buffer.timeout_triggered` event (plus events associated with the
128
+ # Response) will be emitted when the timeout is reached. Idle timeout is currently
129
+ # only supported for `server_vad` mode.
130
+ idle_timeout_ms: nil,
131
+ # Whether or not to automatically interrupt any ongoing response with output to
132
+ # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
133
+ # occurs.
134
+ interrupt_response: nil,
135
+ # Used only for `server_vad` mode. Amount of audio to include before the VAD
136
+ # detected speech (in milliseconds). Defaults to 300ms.
137
+ prefix_padding_ms: nil,
138
+ # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
139
+ # milliseconds). Defaults to 500ms. With shorter values the model will respond
140
+ # more quickly, but may jump in on short pauses from the user.
141
+ silence_duration_ms: nil,
142
+ # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
143
+ # defaults to 0.5. A higher threshold will require louder audio to activate the
144
+ # model, and thus might perform better in noisy environments.
145
+ threshold: nil,
146
+ # Type of turn detection, `server_vad` to turn on simple Server VAD.
147
+ type: :server_vad
170
148
  )
171
- end
172
- def to_hash
173
- end
149
+ end
174
150
 
175
- # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
176
- # will wait longer for the user to continue speaking, `high` will respond more
177
- # quickly. `auto` is the default and is equivalent to `medium`.
178
- module Eagerness
179
- extend OpenAI::Internal::Type::Enum
151
+ sig do
152
+ override.returns(
153
+ {
154
+ type: Symbol,
155
+ create_response: T::Boolean,
156
+ idle_timeout_ms: T.nilable(Integer),
157
+ interrupt_response: T::Boolean,
158
+ prefix_padding_ms: Integer,
159
+ silence_duration_ms: Integer,
160
+ threshold: Float
161
+ }
162
+ )
163
+ end
164
+ def to_hash
165
+ end
166
+ end
180
167
 
181
- TaggedSymbol =
168
+ class SemanticVad < OpenAI::Internal::Type::BaseModel
169
+ OrHash =
182
170
  T.type_alias do
183
- T.all(
184
- Symbol,
185
- OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness
171
+ T.any(
172
+ OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad,
173
+ OpenAI::Internal::AnyHash
186
174
  )
187
175
  end
188
- OrSymbol = T.type_alias { T.any(Symbol, String) }
189
176
 
190
- LOW =
191
- T.let(
192
- :low,
193
- OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::TaggedSymbol
194
- )
195
- MEDIUM =
196
- T.let(
197
- :medium,
198
- OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::TaggedSymbol
199
- )
200
- HIGH =
201
- T.let(
202
- :high,
203
- OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::TaggedSymbol
204
- )
205
- AUTO =
206
- T.let(
207
- :auto,
208
- OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::TaggedSymbol
209
- )
177
+ # Type of turn detection, `semantic_vad` to turn on Semantic VAD.
178
+ sig { returns(Symbol) }
179
+ attr_accessor :type
180
+
181
+ # Whether or not to automatically generate a response when a VAD stop event
182
+ # occurs.
183
+ sig { returns(T.nilable(T::Boolean)) }
184
+ attr_reader :create_response
210
185
 
186
+ sig { params(create_response: T::Boolean).void }
187
+ attr_writer :create_response
188
+
189
+ # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
190
+ # will wait longer for the user to continue speaking, `high` will respond more
191
+ # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
192
+ # and `high` have max timeouts of 8s, 4s, and 2s respectively.
211
193
  sig do
212
- override.returns(
213
- T::Array[
214
- OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::TaggedSymbol
215
- ]
194
+ returns(
195
+ T.nilable(
196
+ OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol
197
+ )
216
198
  )
217
199
  end
218
- def self.values
200
+ attr_reader :eagerness
201
+
202
+ sig do
203
+ params(
204
+ eagerness:
205
+ OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol
206
+ ).void
219
207
  end
220
- end
208
+ attr_writer :eagerness
221
209
 
222
- # Type of turn detection.
223
- module Type
224
- extend OpenAI::Internal::Type::Enum
210
+ # Whether or not to automatically interrupt any ongoing response with output to
211
+ # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
212
+ # occurs.
213
+ sig { returns(T.nilable(T::Boolean)) }
214
+ attr_reader :interrupt_response
225
215
 
226
- TaggedSymbol =
227
- T.type_alias do
228
- T.all(
229
- Symbol,
230
- OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type
231
- )
232
- end
233
- OrSymbol = T.type_alias { T.any(Symbol, String) }
216
+ sig { params(interrupt_response: T::Boolean).void }
217
+ attr_writer :interrupt_response
234
218
 
235
- SERVER_VAD =
236
- T.let(
237
- :server_vad,
238
- OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::TaggedSymbol
239
- )
240
- SEMANTIC_VAD =
241
- T.let(
242
- :semantic_vad,
243
- OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::TaggedSymbol
244
- )
219
+ # Server-side semantic turn detection which uses a model to determine when the
220
+ # user has finished speaking.
221
+ sig do
222
+ params(
223
+ create_response: T::Boolean,
224
+ eagerness:
225
+ OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol,
226
+ interrupt_response: T::Boolean,
227
+ type: Symbol
228
+ ).returns(T.attached_class)
229
+ end
230
+ def self.new(
231
+ # Whether or not to automatically generate a response when a VAD stop event
232
+ # occurs.
233
+ create_response: nil,
234
+ # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
235
+ # will wait longer for the user to continue speaking, `high` will respond more
236
+ # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
237
+ # and `high` have max timeouts of 8s, 4s, and 2s respectively.
238
+ eagerness: nil,
239
+ # Whether or not to automatically interrupt any ongoing response with output to
240
+ # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
241
+ # occurs.
242
+ interrupt_response: nil,
243
+ # Type of turn detection, `semantic_vad` to turn on Semantic VAD.
244
+ type: :semantic_vad
245
+ )
246
+ end
245
247
 
246
248
  sig do
247
249
  override.returns(
248
- T::Array[
249
- OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::TaggedSymbol
250
- ]
250
+ {
251
+ type: Symbol,
252
+ create_response: T::Boolean,
253
+ eagerness:
254
+ OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol,
255
+ interrupt_response: T::Boolean
256
+ }
251
257
  )
252
258
  end
253
- def self.values
259
+ def to_hash
260
+ end
261
+
262
+ # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
263
+ # will wait longer for the user to continue speaking, `high` will respond more
264
+ # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
265
+ # and `high` have max timeouts of 8s, 4s, and 2s respectively.
266
+ module Eagerness
267
+ extend OpenAI::Internal::Type::Enum
268
+
269
+ TaggedSymbol =
270
+ T.type_alias do
271
+ T.all(
272
+ Symbol,
273
+ OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness
274
+ )
275
+ end
276
+ OrSymbol = T.type_alias { T.any(Symbol, String) }
277
+
278
+ LOW =
279
+ T.let(
280
+ :low,
281
+ OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol
282
+ )
283
+ MEDIUM =
284
+ T.let(
285
+ :medium,
286
+ OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol
287
+ )
288
+ HIGH =
289
+ T.let(
290
+ :high,
291
+ OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol
292
+ )
293
+ AUTO =
294
+ T.let(
295
+ :auto,
296
+ OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol
297
+ )
298
+
299
+ sig do
300
+ override.returns(
301
+ T::Array[
302
+ OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol
303
+ ]
304
+ )
305
+ end
306
+ def self.values
307
+ end
254
308
  end
255
309
  end
310
+
311
+ sig do
312
+ override.returns(
313
+ T::Array[
314
+ OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Variants
315
+ ]
316
+ )
317
+ end
318
+ def self.variants
319
+ end
256
320
  end
257
321
  end
258
322
  end