openai 0.23.1 → 0.23.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +1 -1
- data/lib/openai/models/realtime/input_audio_buffer_timeout_triggered.rb +25 -5
- data/lib/openai/models/realtime/realtime_audio_config_input.rb +14 -11
- data/lib/openai/models/realtime/realtime_audio_input_turn_detection.rb +173 -117
- data/lib/openai/models/realtime/realtime_server_event.rb +13 -1
- data/lib/openai/models/realtime/realtime_session.rb +179 -118
- data/lib/openai/models/realtime/realtime_session_create_response.rb +184 -122
- data/lib/openai/models/realtime/realtime_transcription_session_audio_input.rb +16 -11
- data/lib/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rb +175 -117
- data/lib/openai/models/responses/response.rb +8 -8
- data/lib/openai/models/responses/response_create_params.rb +8 -8
- data/lib/openai/version.rb +1 -1
- data/rbi/openai/models/realtime/input_audio_buffer_timeout_triggered.rbi +24 -5
- data/rbi/openai/models/realtime/realtime_audio_config_input.rbi +44 -28
- data/rbi/openai/models/realtime/realtime_audio_input_turn_detection.rbi +264 -203
- data/rbi/openai/models/realtime/realtime_session.rbi +306 -231
- data/rbi/openai/models/realtime/realtime_session_create_response.rbi +298 -232
- data/rbi/openai/models/realtime/realtime_transcription_session_audio_input.rbi +39 -28
- data/rbi/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbi +264 -200
- data/rbi/openai/models/responses/response.rbi +12 -12
- data/rbi/openai/models/responses/response_create_params.rbi +12 -12
- data/rbi/openai/resources/responses.rbi +8 -8
- data/sig/openai/models/realtime/realtime_audio_config_input.rbs +4 -8
- data/sig/openai/models/realtime/realtime_audio_input_turn_detection.rbs +91 -65
- data/sig/openai/models/realtime/realtime_session.rbs +95 -69
- data/sig/openai/models/realtime/realtime_session_create_response.rbs +95 -73
- data/sig/openai/models/realtime/realtime_transcription_session_audio_input.rbs +4 -8
- data/sig/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbs +91 -65
- metadata +2 -2
data/lib/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rb
CHANGED
@@ -3,128 +3,186 @@
|
|
3
3
|
module OpenAI
|
4
4
|
module Models
|
5
5
|
module Realtime
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
#
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
#
|
113
|
-
#
|
6
|
+
# Configuration for turn detection, either Server VAD or Semantic VAD. This can be
|
7
|
+
# set to `null` to turn off, in which case the client must manually trigger model
|
8
|
+
# response.
|
9
|
+
#
|
10
|
+
# Server VAD means that the model will detect the start and end of speech based on
|
11
|
+
# audio volume and respond at the end of user speech.
|
12
|
+
#
|
13
|
+
# Semantic VAD is more advanced and uses a turn detection model (in conjunction
|
14
|
+
# with VAD) to semantically estimate whether the user has finished speaking, then
|
15
|
+
# dynamically sets a timeout based on this probability. For example, if user audio
|
16
|
+
# trails off with "uhhm", the model will score a low probability of turn end and
|
17
|
+
# wait longer for the user to continue speaking. This can be useful for more
|
18
|
+
# natural conversations, but may have a higher latency.
|
19
|
+
module RealtimeTranscriptionSessionAudioInputTurnDetection
|
20
|
+
extend OpenAI::Internal::Type::Union
|
21
|
+
|
22
|
+
discriminator :type
|
23
|
+
|
24
|
+
# Server-side voice activity detection (VAD) which flips on when user speech is detected and off after a period of silence.
|
25
|
+
variant :server_vad,
|
26
|
+
-> { OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad }
|
27
|
+
|
28
|
+
# Server-side semantic turn detection which uses a model to determine when the user has finished speaking.
|
29
|
+
variant :semantic_vad,
|
30
|
+
-> { OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad }
|
31
|
+
|
32
|
+
class ServerVad < OpenAI::Internal::Type::BaseModel
|
33
|
+
# @!attribute type
|
34
|
+
# Type of turn detection, `server_vad` to turn on simple Server VAD.
|
35
|
+
#
|
36
|
+
# @return [Symbol, :server_vad]
|
37
|
+
required :type, const: :server_vad
|
38
|
+
|
39
|
+
# @!attribute create_response
|
40
|
+
# Whether or not to automatically generate a response when a VAD stop event
|
41
|
+
# occurs.
|
42
|
+
#
|
43
|
+
# @return [Boolean, nil]
|
44
|
+
optional :create_response, OpenAI::Internal::Type::Boolean
|
45
|
+
|
46
|
+
# @!attribute idle_timeout_ms
|
47
|
+
# Optional timeout after which a model response will be triggered automatically.
|
48
|
+
# This is useful for situations in which a long pause from the user is unexpected,
|
49
|
+
# such as a phone call. The model will effectively prompt the user to continue the
|
50
|
+
# conversation based on the current context.
|
51
|
+
#
|
52
|
+
# The timeout value will be applied after the last model response's audio has
|
53
|
+
# finished playing, i.e. it's set to the `response.done` time plus audio playback
|
54
|
+
# duration.
|
55
|
+
#
|
56
|
+
# An `input_audio_buffer.timeout_triggered` event (plus events associated with the
|
57
|
+
# Response) will be emitted when the timeout is reached. Idle timeout is currently
|
58
|
+
# only supported for `server_vad` mode.
|
59
|
+
#
|
60
|
+
# @return [Integer, nil]
|
61
|
+
optional :idle_timeout_ms, Integer, nil?: true
|
62
|
+
|
63
|
+
# @!attribute interrupt_response
|
64
|
+
# Whether or not to automatically interrupt any ongoing response with output to
|
65
|
+
# the default conversation (i.e. `conversation` of `auto`) when a VAD start event
|
66
|
+
# occurs.
|
67
|
+
#
|
68
|
+
# @return [Boolean, nil]
|
69
|
+
optional :interrupt_response, OpenAI::Internal::Type::Boolean
|
70
|
+
|
71
|
+
# @!attribute prefix_padding_ms
|
72
|
+
# Used only for `server_vad` mode. Amount of audio to include before the VAD
|
73
|
+
# detected speech (in milliseconds). Defaults to 300ms.
|
74
|
+
#
|
75
|
+
# @return [Integer, nil]
|
76
|
+
optional :prefix_padding_ms, Integer
|
77
|
+
|
78
|
+
# @!attribute silence_duration_ms
|
79
|
+
# Used only for `server_vad` mode. Duration of silence to detect speech stop (in
|
80
|
+
# milliseconds). Defaults to 500ms. With shorter values the model will respond
|
81
|
+
# more quickly, but may jump in on short pauses from the user.
|
82
|
+
#
|
83
|
+
# @return [Integer, nil]
|
84
|
+
optional :silence_duration_ms, Integer
|
85
|
+
|
86
|
+
# @!attribute threshold
|
87
|
+
# Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
|
88
|
+
# defaults to 0.5. A higher threshold will require louder audio to activate the
|
89
|
+
# model, and thus might perform better in noisy environments.
|
90
|
+
#
|
91
|
+
# @return [Float, nil]
|
92
|
+
optional :threshold, Float
|
93
|
+
|
94
|
+
# @!method initialize(create_response: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: :server_vad)
|
95
|
+
# Some parameter documentation has been truncated, see
|
96
|
+
# {OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad}
|
97
|
+
# for more details.
|
98
|
+
#
|
99
|
+
# Server-side voice activity detection (VAD) which flips on when user speech is
|
100
|
+
# detected and off after a period of silence.
|
101
|
+
#
|
102
|
+
# @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs
|
103
|
+
#
|
104
|
+
# @param idle_timeout_ms [Integer, nil] Optional timeout after which a model response will be triggered automatically. T
|
105
|
+
#
|
106
|
+
# @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th
|
107
|
+
#
|
108
|
+
# @param prefix_padding_ms [Integer] Used only for `server_vad` mode. Amount of audio to include before the VAD detec
|
109
|
+
#
|
110
|
+
# @param silence_duration_ms [Integer] Used only for `server_vad` mode. Duration of silence to detect speech stop (in m
|
111
|
+
#
|
112
|
+
# @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
|
113
|
+
#
|
114
|
+
# @param type [Symbol, :server_vad] Type of turn detection, `server_vad` to turn on simple Server VAD.
|
114
115
|
end
|
115
116
|
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
117
|
+
class SemanticVad < OpenAI::Internal::Type::BaseModel
|
118
|
+
# @!attribute type
|
119
|
+
# Type of turn detection, `semantic_vad` to turn on Semantic VAD.
|
120
|
+
#
|
121
|
+
# @return [Symbol, :semantic_vad]
|
122
|
+
required :type, const: :semantic_vad
|
123
|
+
|
124
|
+
# @!attribute create_response
|
125
|
+
# Whether or not to automatically generate a response when a VAD stop event
|
126
|
+
# occurs.
|
127
|
+
#
|
128
|
+
# @return [Boolean, nil]
|
129
|
+
optional :create_response, OpenAI::Internal::Type::Boolean
|
130
|
+
|
131
|
+
# @!attribute eagerness
|
132
|
+
# Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
|
133
|
+
# will wait longer for the user to continue speaking, `high` will respond more
|
134
|
+
# quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
|
135
|
+
# and `high` have max timeouts of 8s, 4s, and 2s respectively.
|
136
|
+
#
|
137
|
+
# @return [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness, nil]
|
138
|
+
optional :eagerness,
|
139
|
+
enum: -> { OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness }
|
121
140
|
|
122
|
-
|
123
|
-
|
141
|
+
# @!attribute interrupt_response
|
142
|
+
# Whether or not to automatically interrupt any ongoing response with output to
|
143
|
+
# the default conversation (i.e. `conversation` of `auto`) when a VAD start event
|
144
|
+
# occurs.
|
145
|
+
#
|
146
|
+
# @return [Boolean, nil]
|
147
|
+
optional :interrupt_response, OpenAI::Internal::Type::Boolean
|
124
148
|
|
125
|
-
# @!method
|
126
|
-
#
|
149
|
+
# @!method initialize(create_response: nil, eagerness: nil, interrupt_response: nil, type: :semantic_vad)
|
150
|
+
# Some parameter documentation has been truncated, see
|
151
|
+
# {OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad}
|
152
|
+
# for more details.
|
153
|
+
#
|
154
|
+
# Server-side semantic turn detection which uses a model to determine when the
|
155
|
+
# user has finished speaking.
|
156
|
+
#
|
157
|
+
# @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs
|
158
|
+
#
|
159
|
+
# @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
|
160
|
+
#
|
161
|
+
# @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th
|
162
|
+
#
|
163
|
+
# @param type [Symbol, :semantic_vad] Type of turn detection, `semantic_vad` to turn on Semantic VAD.
|
164
|
+
|
165
|
+
# Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
|
166
|
+
# will wait longer for the user to continue speaking, `high` will respond more
|
167
|
+
# quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
|
168
|
+
# and `high` have max timeouts of 8s, 4s, and 2s respectively.
|
169
|
+
#
|
170
|
+
# @see OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad#eagerness
|
171
|
+
module Eagerness
|
172
|
+
extend OpenAI::Internal::Type::Enum
|
173
|
+
|
174
|
+
LOW = :low
|
175
|
+
MEDIUM = :medium
|
176
|
+
HIGH = :high
|
177
|
+
AUTO = :auto
|
178
|
+
|
179
|
+
# @!method self.values
|
180
|
+
# @return [Array<Symbol>]
|
181
|
+
end
|
127
182
|
end
|
183
|
+
|
184
|
+
# @!method self.variants
|
185
|
+
# @return [Array(OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad)]
|
128
186
|
end
|
129
187
|
end
|
130
188
|
end
|
@@ -259,10 +259,10 @@ module OpenAI
|
|
259
259
|
# @!attribute truncation
|
260
260
|
# The truncation strategy to use for the model response.
|
261
261
|
#
|
262
|
-
# - `auto`: If the
|
263
|
-
#
|
264
|
-
#
|
265
|
-
# - `disabled` (default): If
|
262
|
+
# - `auto`: If the input to this Response exceeds the model's context window size,
|
263
|
+
# the model will truncate the response to fit the context window by dropping
|
264
|
+
# items from the beginning of the conversation.
|
265
|
+
# - `disabled` (default): If the input size will exceed the context window size
|
266
266
|
# for a model, the request will fail with a 400 error.
|
267
267
|
#
|
268
268
|
# @return [Symbol, OpenAI::Models::Responses::Response::Truncation, nil]
|
@@ -510,10 +510,10 @@ module OpenAI
|
|
510
510
|
|
511
511
|
# The truncation strategy to use for the model response.
|
512
512
|
#
|
513
|
-
# - `auto`: If the
|
514
|
-
#
|
515
|
-
#
|
516
|
-
# - `disabled` (default): If
|
513
|
+
# - `auto`: If the input to this Response exceeds the model's context window size,
|
514
|
+
# the model will truncate the response to fit the context window by dropping
|
515
|
+
# items from the beginning of the conversation.
|
516
|
+
# - `disabled` (default): If the input size will exceed the context window size
|
517
517
|
# for a model, the request will fail with a 400 error.
|
518
518
|
#
|
519
519
|
# @see OpenAI::Models::Responses::Response#truncation
|
@@ -276,10 +276,10 @@ module OpenAI
|
|
276
276
|
# @!attribute truncation
|
277
277
|
# The truncation strategy to use for the model response.
|
278
278
|
#
|
279
|
-
# - `auto`: If the
|
280
|
-
#
|
281
|
-
#
|
282
|
-
# - `disabled` (default): If
|
279
|
+
# - `auto`: If the input to this Response exceeds the model's context window size,
|
280
|
+
# the model will truncate the response to fit the context window by dropping
|
281
|
+
# items from the beginning of the conversation.
|
282
|
+
# - `disabled` (default): If the input size will exceed the context window size
|
283
283
|
# for a model, the request will fail with a 400 error.
|
284
284
|
#
|
285
285
|
# @return [Symbol, OpenAI::Models::Responses::ResponseCreateParams::Truncation, nil]
|
@@ -485,10 +485,10 @@ module OpenAI
|
|
485
485
|
|
486
486
|
# The truncation strategy to use for the model response.
|
487
487
|
#
|
488
|
-
# - `auto`: If the
|
489
|
-
#
|
490
|
-
#
|
491
|
-
# - `disabled` (default): If
|
488
|
+
# - `auto`: If the input to this Response exceeds the model's context window size,
|
489
|
+
# the model will truncate the response to fit the context window by dropping
|
490
|
+
# items from the beginning of the conversation.
|
491
|
+
# - `disabled` (default): If the input size will exceed the context window size
|
492
492
|
# for a model, the request will fail with a 400 error.
|
493
493
|
module Truncation
|
494
494
|
extend OpenAI::Internal::Type::Enum
|
data/lib/openai/version.rb
CHANGED
@@ -12,11 +12,13 @@ module OpenAI
|
|
12
12
|
)
|
13
13
|
end
|
14
14
|
|
15
|
-
# Millisecond offset
|
15
|
+
# Millisecond offset of audio written to the input audio buffer at the time the
|
16
|
+
# timeout was triggered.
|
16
17
|
sig { returns(Integer) }
|
17
18
|
attr_accessor :audio_end_ms
|
18
19
|
|
19
|
-
# Millisecond offset
|
20
|
+
# Millisecond offset of audio written to the input audio buffer that was after the
|
21
|
+
# playback time of the last model response.
|
20
22
|
sig { returns(Integer) }
|
21
23
|
attr_accessor :audio_start_ms
|
22
24
|
|
@@ -32,7 +34,22 @@ module OpenAI
|
|
32
34
|
sig { returns(Symbol) }
|
33
35
|
attr_accessor :type
|
34
36
|
|
35
|
-
# Returned when the
|
37
|
+
# Returned when the Server VAD timeout is triggered for the input audio buffer.
|
38
|
+
# This is configured with `idle_timeout_ms` in the `turn_detection` settings of
|
39
|
+
# the session, and it indicates that there hasn't been any speech detected for the
|
40
|
+
# configured duration.
|
41
|
+
#
|
42
|
+
# The `audio_start_ms` and `audio_end_ms` fields indicate the segment of audio
|
43
|
+
# after the last model response up to the triggering time, as an offset from the
|
44
|
+
# beginning of audio written to the input audio buffer. This means it demarcates
|
45
|
+
# the segment of audio that was silent and the difference between the start and
|
46
|
+
# end values will roughly match the configured timeout.
|
47
|
+
#
|
48
|
+
# The empty audio will be committed to the conversation as an `input_audio` item
|
49
|
+
# (there will be an `input_audio_buffer.committed` event) and a model response will
|
50
|
+
# be generated. There may be speech that didn't trigger VAD but is still detected
|
51
|
+
# by the model, so the model may respond with something relevant to the
|
52
|
+
# conversation or a prompt to continue speaking.
|
36
53
|
sig do
|
37
54
|
params(
|
38
55
|
audio_end_ms: Integer,
|
@@ -43,9 +60,11 @@ module OpenAI
|
|
43
60
|
).returns(T.attached_class)
|
44
61
|
end
|
45
62
|
def self.new(
|
46
|
-
# Millisecond offset
|
63
|
+
# Millisecond offset of audio written to the input audio buffer at the time the
|
64
|
+
# timeout was triggered.
|
47
65
|
audio_end_ms:,
|
48
|
-
# Millisecond offset
|
66
|
+
# Millisecond offset of audio written to the input audio buffer that was after the
|
67
|
+
# playback time of the last model response.
|
49
68
|
audio_start_ms:,
|
50
69
|
# The unique ID of the server event.
|
51
70
|
event_id:,
|
@@ -80,26 +80,28 @@ module OpenAI
|
|
80
80
|
|
81
81
|
# Configuration for turn detection, either Server VAD or Semantic VAD. This can be
|
82
82
|
# set to `null` to turn off, in which case the client must manually trigger model
|
83
|
-
# response.
|
84
|
-
#
|
85
|
-
#
|
86
|
-
#
|
87
|
-
#
|
88
|
-
#
|
89
|
-
#
|
90
|
-
#
|
83
|
+
# response.
|
84
|
+
#
|
85
|
+
# Server VAD means that the model will detect the start and end of speech based on
|
86
|
+
# audio volume and respond at the end of user speech.
|
87
|
+
#
|
88
|
+
# Semantic VAD is more advanced and uses a turn detection model (in conjunction
|
89
|
+
# with VAD) to semantically estimate whether the user has finished speaking, then
|
90
|
+
# dynamically sets a timeout based on this probability. For example, if user audio
|
91
|
+
# trails off with "uhhm", the model will score a low probability of turn end and
|
92
|
+
# wait longer for the user to continue speaking. This can be useful for more
|
93
|
+
# natural conversations, but may have a higher latency.
|
91
94
|
sig do
|
92
|
-
returns(
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
).void
|
95
|
+
returns(
|
96
|
+
T.nilable(
|
97
|
+
T.any(
|
98
|
+
OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad,
|
99
|
+
OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad
|
100
|
+
)
|
101
|
+
)
|
102
|
+
)
|
101
103
|
end
|
102
|
-
|
104
|
+
attr_accessor :turn_detection
|
103
105
|
|
104
106
|
sig do
|
105
107
|
params(
|
@@ -113,7 +115,12 @@ module OpenAI
|
|
113
115
|
OpenAI::Realtime::RealtimeAudioConfigInput::NoiseReduction::OrHash,
|
114
116
|
transcription: OpenAI::Realtime::AudioTranscription::OrHash,
|
115
117
|
turn_detection:
|
116
|
-
|
118
|
+
T.nilable(
|
119
|
+
T.any(
|
120
|
+
OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad::OrHash,
|
121
|
+
OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::OrHash
|
122
|
+
)
|
123
|
+
)
|
117
124
|
).returns(T.attached_class)
|
118
125
|
end
|
119
126
|
def self.new(
|
@@ -136,14 +143,17 @@ module OpenAI
|
|
136
143
|
transcription: nil,
|
137
144
|
# Configuration for turn detection, either Server VAD or Semantic VAD. This can be
|
138
145
|
# set to `null` to turn off, in which case the client must manually trigger model
|
139
|
-
# response.
|
140
|
-
#
|
141
|
-
#
|
142
|
-
#
|
143
|
-
#
|
144
|
-
#
|
145
|
-
#
|
146
|
-
#
|
146
|
+
# response.
|
147
|
+
#
|
148
|
+
# Server VAD means that the model will detect the start and end of speech based on
|
149
|
+
# audio volume and respond at the end of user speech.
|
150
|
+
#
|
151
|
+
# Semantic VAD is more advanced and uses a turn detection model (in conjunction
|
152
|
+
# with VAD) to semantically estimate whether the user has finished speaking, then
|
153
|
+
# dynamically sets a timeout based on this probability. For example, if user audio
|
154
|
+
# trails off with "uhhm", the model will score a low probability of turn end and
|
155
|
+
# wait longer for the user to continue speaking. This can be useful for more
|
156
|
+
# natural conversations, but may have a higher latency.
|
147
157
|
turn_detection: nil
|
148
158
|
)
|
149
159
|
end
|
@@ -160,7 +170,13 @@ module OpenAI
|
|
160
170
|
noise_reduction:
|
161
171
|
OpenAI::Realtime::RealtimeAudioConfigInput::NoiseReduction,
|
162
172
|
transcription: OpenAI::Realtime::AudioTranscription,
|
163
|
-
turn_detection:
|
173
|
+
turn_detection:
|
174
|
+
T.nilable(
|
175
|
+
T.any(
|
176
|
+
OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad,
|
177
|
+
OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad
|
178
|
+
)
|
179
|
+
)
|
164
180
|
}
|
165
181
|
)
|
166
182
|
end
|