openai 0.23.1 → 0.23.2
This diff shows the contents of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +1 -1
- data/lib/openai/models/realtime/input_audio_buffer_timeout_triggered.rb +25 -5
- data/lib/openai/models/realtime/realtime_audio_config_input.rb +14 -11
- data/lib/openai/models/realtime/realtime_audio_input_turn_detection.rb +173 -117
- data/lib/openai/models/realtime/realtime_server_event.rb +13 -1
- data/lib/openai/models/realtime/realtime_session.rb +179 -118
- data/lib/openai/models/realtime/realtime_session_create_response.rb +184 -122
- data/lib/openai/models/realtime/realtime_transcription_session_audio_input.rb +16 -11
- data/lib/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rb +175 -117
- data/lib/openai/models/responses/response.rb +8 -8
- data/lib/openai/models/responses/response_create_params.rb +8 -8
- data/lib/openai/version.rb +1 -1
- data/rbi/openai/models/realtime/input_audio_buffer_timeout_triggered.rbi +24 -5
- data/rbi/openai/models/realtime/realtime_audio_config_input.rbi +44 -28
- data/rbi/openai/models/realtime/realtime_audio_input_turn_detection.rbi +264 -203
- data/rbi/openai/models/realtime/realtime_session.rbi +306 -231
- data/rbi/openai/models/realtime/realtime_session_create_response.rbi +298 -232
- data/rbi/openai/models/realtime/realtime_transcription_session_audio_input.rbi +39 -28
- data/rbi/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbi +264 -200
- data/rbi/openai/models/responses/response.rbi +12 -12
- data/rbi/openai/models/responses/response_create_params.rbi +12 -12
- data/rbi/openai/resources/responses.rbi +8 -8
- data/sig/openai/models/realtime/realtime_audio_config_input.rbs +4 -8
- data/sig/openai/models/realtime/realtime_audio_input_turn_detection.rbs +91 -65
- data/sig/openai/models/realtime/realtime_session.rbs +95 -69
- data/sig/openai/models/realtime/realtime_session_create_response.rbs +95 -73
- data/sig/openai/models/realtime/realtime_transcription_session_audio_input.rbs +4 -8
- data/sig/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbs +91 -65
- metadata +2 -2
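
The headline change in this release: `turn_detection` across the realtime session and transcription-session models is now typed as a union of two discriminated classes, `ServerVad` and `SemanticVad` (previously one flat model with a `type` enum), and the field is explicitly nilable. A minimal sketch of constructing the new variants, using the class and parameter names from the `.rbi` signatures below; the chosen values are illustrative assumptions, not verified against the released gem:

```ruby
# Sketch only: names come from the .rbi signatures in this diff; values are assumptions.
require "openai"

TurnDetection = OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection

# Simple server-side VAD; `type` defaults to :server_vad in self.new.
server_vad = TurnDetection::ServerVad.new(
  prefix_padding_ms: 300,   # audio included before detected speech (default 300ms)
  silence_duration_ms: 500, # silence that ends a turn (default 500ms)
  threshold: 0.5,           # VAD activation threshold (0.0..1.0)
  idle_timeout_ms: nil      # nil disables the idle timeout
)

# Semantic VAD; `type` defaults to :semantic_vad in self.new.
semantic_vad = TurnDetection::SemanticVad.new(
  eagerness: :auto          # :low | :medium | :high | :auto
)
```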
data/rbi/openai/models/realtime/realtime_transcription_session_audio_input.rbi CHANGED
@@ -80,30 +80,28 @@ module OpenAI
 
         # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
         # set to `null` to turn off, in which case the client must manually trigger model
-        # response. Server VAD means that the model will detect the start and end of
-        # speech based on audio volume and respond at the end of user speech. Semantic VAD
-        # is more advanced and uses a turn detection model (in conjunction with VAD) to
-        # semantically estimate whether the user has finished speaking, then dynamically
-        # sets a timeout based on this probability. For example, if user audio trails off
-        # with "uhhm", the model will score a low probability of turn end and wait longer
-        # for the user to continue speaking. This can be useful for more natural
-        # conversations, but may have a higher latency.
+        # response.
+        #
+        # Server VAD means that the model will detect the start and end of speech based on
+        # audio volume and respond at the end of user speech.
+        #
+        # Semantic VAD is more advanced and uses a turn detection model (in conjunction
+        # with VAD) to semantically estimate whether the user has finished speaking, then
+        # dynamically sets a timeout based on this probability. For example, if user audio
+        # trails off with "uhhm", the model will score a low probability of turn end and
+        # wait longer for the user to continue speaking. This can be useful for more
+        # natural conversations, but may have a higher latency.
         sig do
           returns(
             T.nilable(
-              OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection
+              T.any(
+                OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad,
+                OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad
+              )
             )
           )
         end
-
-
-        sig do
-          params(
-            turn_detection:
-              OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::OrHash
-          ).void
-        end
-        attr_writer :turn_detection
+        attr_accessor :turn_detection
 
         sig do
           params(
@@ -117,7 +115,12 @@ module OpenAI
               OpenAI::Realtime::RealtimeTranscriptionSessionAudioInput::NoiseReduction::OrHash,
             transcription: OpenAI::Realtime::AudioTranscription::OrHash,
             turn_detection:
-              OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::OrHash
+              T.nilable(
+                T.any(
+                  OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad::OrHash,
+                  OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::OrHash
+                )
+              )
           ).returns(T.attached_class)
         end
         def self.new(
@@ -140,14 +143,17 @@ module OpenAI
           transcription: nil,
           # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
           # set to `null` to turn off, in which case the client must manually trigger model
-          # response. Server VAD means that the model will detect the start and end of
-          # speech based on audio volume and respond at the end of user speech. Semantic VAD
-          # is more advanced and uses a turn detection model (in conjunction with VAD) to
-          # semantically estimate whether the user has finished speaking, then dynamically
-          # sets a timeout based on this probability. For example, if user audio trails off
-          # with "uhhm", the model will score a low probability of turn end and wait longer
-          # for the user to continue speaking. This can be useful for more natural
-          # conversations, but may have a higher latency.
+          # response.
+          #
+          # Server VAD means that the model will detect the start and end of speech based on
+          # audio volume and respond at the end of user speech.
+          #
+          # Semantic VAD is more advanced and uses a turn detection model (in conjunction
+          # with VAD) to semantically estimate whether the user has finished speaking, then
+          # dynamically sets a timeout based on this probability. For example, if user audio
+          # trails off with "uhhm", the model will score a low probability of turn end and
+          # wait longer for the user to continue speaking. This can be useful for more
+          # natural conversations, but may have a higher latency.
           turn_detection: nil
         )
         end
@@ -165,7 +171,12 @@ module OpenAI
               OpenAI::Realtime::RealtimeTranscriptionSessionAudioInput::NoiseReduction,
               transcription: OpenAI::Realtime::AudioTranscription,
               turn_detection:
-                OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection
+                T.nilable(
+                  T.any(
+                    OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad,
+                    OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad
+                  )
+                )
             }
           )
         end
data/rbi/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbi CHANGED
@@ -3,256 +3,320 @@
 module OpenAI
   module Models
     module Realtime
-      class RealtimeTranscriptionSessionAudioInputTurnDetection < OpenAI::Internal::Type::BaseModel
-        OrHash =
+      # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
+      # set to `null` to turn off, in which case the client must manually trigger model
+      # response.
+      #
+      # Server VAD means that the model will detect the start and end of speech based on
+      # audio volume and respond at the end of user speech.
+      #
+      # Semantic VAD is more advanced and uses a turn detection model (in conjunction
+      # with VAD) to semantically estimate whether the user has finished speaking, then
+      # dynamically sets a timeout based on this probability. For example, if user audio
+      # trails off with "uhhm", the model will score a low probability of turn end and
+      # wait longer for the user to continue speaking. This can be useful for more
+      # natural conversations, but may have a higher latency.
+      module RealtimeTranscriptionSessionAudioInputTurnDetection
+        extend OpenAI::Internal::Type::Union
+
+        Variants =
           T.type_alias do
             T.any(
-              OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection,
-              OpenAI::Internal::AnyHash
+              OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad,
+              OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad
             )
           end
 
-        # Whether or not to automatically generate a response when a VAD stop event
-        # occurs.
-        sig { returns(T.nilable(T::Boolean)) }
-        attr_reader :create_response
-
-        sig { params(create_response: T::Boolean).void }
-        attr_writer :create_response
-
-        # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
-        # will wait longer for the user to continue speaking, `high` will respond more
-        # quickly. `auto` is the default and is equivalent to `medium`.
-        sig do
-          returns(
-            T.nilable(
-              OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::OrSymbol
-            )
-          )
-        end
-        attr_reader :eagerness
-
-        sig do
-          params(
-            eagerness:
-              OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::OrSymbol
-          ).void
-        end
-        attr_writer :eagerness
-
-        # Optional idle timeout after which turn detection will auto-timeout when no
-        # additional audio is received.
-        sig { returns(T.nilable(Integer)) }
-        attr_accessor :idle_timeout_ms
-
-        # Whether or not to automatically interrupt any ongoing response with output to
-        # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
-        # occurs.
-        sig { returns(T.nilable(T::Boolean)) }
-        attr_reader :interrupt_response
-
-        sig { params(interrupt_response: T::Boolean).void }
-        attr_writer :interrupt_response
-
-        # Used only for `server_vad` mode. Amount of audio to include before the VAD
-        # detected speech (in milliseconds). Defaults to 300ms.
-        sig { returns(T.nilable(Integer)) }
-        attr_reader :prefix_padding_ms
-
-        sig { params(prefix_padding_ms: Integer).void }
-        attr_writer :prefix_padding_ms
+        class ServerVad < OpenAI::Internal::Type::BaseModel
+          OrHash =
+            T.type_alias do
+              T.any(
+                OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad,
+                OpenAI::Internal::AnyHash
+              )
+            end
 
-        # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
-        # milliseconds). Defaults to 500ms. With shorter values the model will respond
-        # more quickly, but may jump in on short pauses from the user.
-        sig { returns(T.nilable(Integer)) }
-        attr_reader :silence_duration_ms
+          # Type of turn detection, `server_vad` to turn on simple Server VAD.
+          sig { returns(Symbol) }
+          attr_accessor :type
 
-        sig { params(silence_duration_ms: Integer).void }
-        attr_writer :silence_duration_ms
+          # Whether or not to automatically generate a response when a VAD stop event
+          # occurs.
+          sig { returns(T.nilable(T::Boolean)) }
+          attr_reader :create_response
 
-        # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
-        # defaults to 0.5. A higher threshold will require louder audio to activate the
-        # model, and thus might perform better in noisy environments.
-        sig { returns(T.nilable(Float)) }
-        attr_reader :threshold
+          sig { params(create_response: T::Boolean).void }
+          attr_writer :create_response
 
-        sig { params(threshold: Float).void }
-        attr_writer :threshold
+          # Optional timeout after which a model response will be triggered automatically.
+          # This is useful for situations in which a long pause from the user is unexpected,
+          # such as a phone call. The model will effectively prompt the user to continue the
+          # conversation based on the current context.
+          #
+          # The timeout value will be applied after the last model response's audio has
+          # finished playing, i.e. it's set to the `response.done` time plus audio playback
+          # duration.
+          #
+          # An `input_audio_buffer.timeout_triggered` event (plus events associated with the
+          # Response) will be emitted when the timeout is reached. Idle timeout is currently
+          # only supported for `server_vad` mode.
+          sig { returns(T.nilable(Integer)) }
+          attr_accessor :idle_timeout_ms
 
-        # Type of turn detection.
-        sig do
-          returns(
-            T.nilable(
-              OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::OrSymbol
-            )
-          )
-        end
-        attr_reader :type
-
-        sig do
-          params(
-            type:
-              OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::OrSymbol
-          ).void
-        end
-        attr_writer :type
-
-        # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
-        # set to `null` to turn off, in which case the client must manually trigger model
-        # response. Server VAD means that the model will detect the start and end of
-        # speech based on audio volume and respond at the end of user speech. Semantic VAD
-        # is more advanced and uses a turn detection model (in conjunction with VAD) to
-        # semantically estimate whether the user has finished speaking, then dynamically
-        # sets a timeout based on this probability. For example, if user audio trails off
-        # with "uhhm", the model will score a low probability of turn end and wait longer
-        # for the user to continue speaking. This can be useful for more natural
-        # conversations, but may have a higher latency.
-        sig do
-          params(
-            create_response: T::Boolean,
-            eagerness:
-              OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::OrSymbol,
-            idle_timeout_ms: T.nilable(Integer),
-            interrupt_response: T::Boolean,
-            prefix_padding_ms: Integer,
-            silence_duration_ms: Integer,
-            threshold: Float,
-            type:
-              OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::OrSymbol
-          ).returns(T.attached_class)
-        end
-        def self.new(
-          # Whether or not to automatically generate a response when a VAD stop event
-          # occurs.
-          create_response: nil,
-          # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
-          # will wait longer for the user to continue speaking, `high` will respond more
-          # quickly. `auto` is the default and is equivalent to `medium`.
-          eagerness: nil,
-          # Optional idle timeout after which turn detection will auto-timeout when no
-          # additional audio is received.
-          idle_timeout_ms: nil,
           # Whether or not to automatically interrupt any ongoing response with output to
           # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
           # occurs.
-          interrupt_response: nil,
+          sig { returns(T.nilable(T::Boolean)) }
+          attr_reader :interrupt_response
+
+          sig { params(interrupt_response: T::Boolean).void }
+          attr_writer :interrupt_response
+
           # Used only for `server_vad` mode. Amount of audio to include before the VAD
           # detected speech (in milliseconds). Defaults to 300ms.
-          prefix_padding_ms: nil,
+          sig { returns(T.nilable(Integer)) }
+          attr_reader :prefix_padding_ms
+
+          sig { params(prefix_padding_ms: Integer).void }
+          attr_writer :prefix_padding_ms
+
           # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
           # milliseconds). Defaults to 500ms. With shorter values the model will respond
           # more quickly, but may jump in on short pauses from the user.
-          silence_duration_ms: nil,
+          sig { returns(T.nilable(Integer)) }
+          attr_reader :silence_duration_ms
+
+          sig { params(silence_duration_ms: Integer).void }
+          attr_writer :silence_duration_ms
+
           # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
           # defaults to 0.5. A higher threshold will require louder audio to activate the
           # model, and thus might perform better in noisy environments.
-          threshold: nil,
-          # Type of turn detection.
-          type: nil
-        )
-        end
+          sig { returns(T.nilable(Float)) }
+          attr_reader :threshold
 
-        sig do
-          override.returns(
-            {
+          sig { params(threshold: Float).void }
+          attr_writer :threshold
+
+          # Server-side voice activity detection (VAD) which flips on when user speech is
+          # detected and off after a period of silence.
+          sig do
+            params(
               create_response: T::Boolean,
-              eagerness:
-                OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::OrSymbol,
               idle_timeout_ms: T.nilable(Integer),
               interrupt_response: T::Boolean,
               prefix_padding_ms: Integer,
               silence_duration_ms: Integer,
               threshold: Float,
-              type:
-                OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::OrSymbol
-            }
+              type: Symbol
+            ).returns(T.attached_class)
+          end
+          def self.new(
+            # Whether or not to automatically generate a response when a VAD stop event
+            # occurs.
+            create_response: nil,
+            # Optional timeout after which a model response will be triggered automatically.
+            # This is useful for situations in which a long pause from the user is unexpected,
+            # such as a phone call. The model will effectively prompt the user to continue the
+            # conversation based on the current context.
+            #
+            # The timeout value will be applied after the last model response's audio has
+            # finished playing, i.e. it's set to the `response.done` time plus audio playback
+            # duration.
+            #
+            # An `input_audio_buffer.timeout_triggered` event (plus events associated with the
+            # Response) will be emitted when the timeout is reached. Idle timeout is currently
+            # only supported for `server_vad` mode.
+            idle_timeout_ms: nil,
+            # Whether or not to automatically interrupt any ongoing response with output to
+            # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+            # occurs.
+            interrupt_response: nil,
+            # Used only for `server_vad` mode. Amount of audio to include before the VAD
+            # detected speech (in milliseconds). Defaults to 300ms.
+            prefix_padding_ms: nil,
+            # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
+            # milliseconds). Defaults to 500ms. With shorter values the model will respond
+            # more quickly, but may jump in on short pauses from the user.
+            silence_duration_ms: nil,
+            # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
+            # defaults to 0.5. A higher threshold will require louder audio to activate the
+            # model, and thus might perform better in noisy environments.
+            threshold: nil,
+            # Type of turn detection, `server_vad` to turn on simple Server VAD.
+            type: :server_vad
           )
-        end
-        def to_hash
-        end
+          end
 
-        # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
-        # will wait longer for the user to continue speaking, `high` will respond more
-        # quickly. `auto` is the default and is equivalent to `medium`.
-        module Eagerness
-          extend OpenAI::Internal::Type::Enum
+          sig do
+            override.returns(
+              {
+                type: Symbol,
+                create_response: T::Boolean,
+                idle_timeout_ms: T.nilable(Integer),
+                interrupt_response: T::Boolean,
+                prefix_padding_ms: Integer,
+                silence_duration_ms: Integer,
+                threshold: Float
+              }
+            )
+          end
+          def to_hash
+          end
+        end
 
-          TaggedSymbol =
+        class SemanticVad < OpenAI::Internal::Type::BaseModel
+          OrHash =
             T.type_alias do
-              T.all(
-                Symbol,
-                OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness
+              T.any(
+                OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad,
+                OpenAI::Internal::AnyHash
               )
             end
-          OrSymbol = T.type_alias { T.any(Symbol, String) }
 
-          LOW =
-            T.let(
-              :low,
-              OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::TaggedSymbol
-            )
-          MEDIUM =
-            T.let(
-              :medium,
-              OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::TaggedSymbol
-            )
-          HIGH =
-            T.let(
-              :high,
-              OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::TaggedSymbol
-            )
-          AUTO =
-            T.let(
-              :auto,
-              OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::TaggedSymbol
-            )
+          # Type of turn detection, `semantic_vad` to turn on Semantic VAD.
+          sig { returns(Symbol) }
+          attr_accessor :type
+
+          # Whether or not to automatically generate a response when a VAD stop event
+          # occurs.
+          sig { returns(T.nilable(T::Boolean)) }
+          attr_reader :create_response
 
+          sig { params(create_response: T::Boolean).void }
+          attr_writer :create_response
+
+          # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
+          # will wait longer for the user to continue speaking, `high` will respond more
+          # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
+          # and `high` have max timeouts of 8s, 4s, and 2s respectively.
           sig do
-            override.returns(
-              T::Array[
-                OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness::TaggedSymbol
-              ]
+            returns(
+              T.nilable(
+                OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol
+              )
             )
           end
-          def self.values
+          attr_reader :eagerness
+
+          sig do
+            params(
+              eagerness:
+                OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol
+            ).void
           end
-        end
+          attr_writer :eagerness
 
-        # Type of turn detection.
-        module Type
-          extend OpenAI::Internal::Type::Enum
+          # Whether or not to automatically interrupt any ongoing response with output to
+          # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+          # occurs.
+          sig { returns(T.nilable(T::Boolean)) }
+          attr_reader :interrupt_response
 
-          TaggedSymbol =
-            T.type_alias do
-              T.all(
-                Symbol,
-                OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type
-              )
-            end
-          OrSymbol = T.type_alias { T.any(Symbol, String) }
+          sig { params(interrupt_response: T::Boolean).void }
+          attr_writer :interrupt_response
 
-          SERVER_VAD =
-            T.let(
-              :server_vad,
-              OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::TaggedSymbol
-            )
-          SEMANTIC_VAD =
-            T.let(
-              :semantic_vad,
-              OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::TaggedSymbol
-            )
+          # Server-side semantic turn detection which uses a model to determine when the
+          # user has finished speaking.
+          sig do
+            params(
+              create_response: T::Boolean,
+              eagerness:
+                OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol,
+              interrupt_response: T::Boolean,
+              type: Symbol
+            ).returns(T.attached_class)
+          end
+          def self.new(
+            # Whether or not to automatically generate a response when a VAD stop event
+            # occurs.
+            create_response: nil,
+            # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
+            # will wait longer for the user to continue speaking, `high` will respond more
+            # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
+            # and `high` have max timeouts of 8s, 4s, and 2s respectively.
+            eagerness: nil,
+            # Whether or not to automatically interrupt any ongoing response with output to
+            # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+            # occurs.
+            interrupt_response: nil,
+            # Type of turn detection, `semantic_vad` to turn on Semantic VAD.
+            type: :semantic_vad
+          )
+          end
 
           sig do
             override.returns(
-              T::Array[
-                OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type::TaggedSymbol
-              ]
+              {
+                type: Symbol,
+                create_response: T::Boolean,
+                eagerness:
+                  OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::OrSymbol,
+                interrupt_response: T::Boolean
+              }
             )
           end
-          def self.values
+          def to_hash
+          end
+
+          # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
+          # will wait longer for the user to continue speaking, `high` will respond more
+          # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
+          # and `high` have max timeouts of 8s, 4s, and 2s respectively.
+          module Eagerness
+            extend OpenAI::Internal::Type::Enum
+
+            TaggedSymbol =
+              T.type_alias do
+                T.all(
+                  Symbol,
+                  OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness
+                )
+              end
+            OrSymbol = T.type_alias { T.any(Symbol, String) }
+
+            LOW =
+              T.let(
+                :low,
+                OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol
+              )
+            MEDIUM =
+              T.let(
+                :medium,
+                OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol
+              )
+            HIGH =
+              T.let(
+                :high,
+                OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol
+              )
+            AUTO =
+              T.let(
+                :auto,
+                OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol
+              )
+
+            sig do
+              override.returns(
+                T::Array[
+                  OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness::TaggedSymbol
+                ]
+              )
+            end
+            def self.values
+            end
           end
         end
+
+        sig do
+          override.returns(
+            T::Array[
+              OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Variants
+            ]
+          )
+        end
+        def self.variants
+        end
       end
     end
   end