openai 0.23.1 → 0.23.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/README.md +1 -1
  4. data/lib/openai/models/realtime/input_audio_buffer_timeout_triggered.rb +25 -5
  5. data/lib/openai/models/realtime/realtime_audio_config_input.rb +14 -11
  6. data/lib/openai/models/realtime/realtime_audio_input_turn_detection.rb +173 -117
  7. data/lib/openai/models/realtime/realtime_server_event.rb +13 -1
  8. data/lib/openai/models/realtime/realtime_session.rb +179 -118
  9. data/lib/openai/models/realtime/realtime_session_create_response.rb +184 -122
  10. data/lib/openai/models/realtime/realtime_transcription_session_audio_input.rb +16 -11
  11. data/lib/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rb +175 -117
  12. data/lib/openai/models/responses/response.rb +8 -8
  13. data/lib/openai/models/responses/response_create_params.rb +8 -8
  14. data/lib/openai/version.rb +1 -1
  15. data/rbi/openai/models/realtime/input_audio_buffer_timeout_triggered.rbi +24 -5
  16. data/rbi/openai/models/realtime/realtime_audio_config_input.rbi +44 -28
  17. data/rbi/openai/models/realtime/realtime_audio_input_turn_detection.rbi +264 -203
  18. data/rbi/openai/models/realtime/realtime_session.rbi +306 -231
  19. data/rbi/openai/models/realtime/realtime_session_create_response.rbi +298 -232
  20. data/rbi/openai/models/realtime/realtime_transcription_session_audio_input.rbi +39 -28
  21. data/rbi/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbi +264 -200
  22. data/rbi/openai/models/responses/response.rbi +12 -12
  23. data/rbi/openai/models/responses/response_create_params.rbi +12 -12
  24. data/rbi/openai/resources/responses.rbi +8 -8
  25. data/sig/openai/models/realtime/realtime_audio_config_input.rbs +4 -8
  26. data/sig/openai/models/realtime/realtime_audio_input_turn_detection.rbs +91 -65
  27. data/sig/openai/models/realtime/realtime_session.rbs +95 -69
  28. data/sig/openai/models/realtime/realtime_session_create_response.rbs +95 -73
  29. data/sig/openai/models/realtime/realtime_transcription_session_audio_input.rbs +4 -8
  30. data/sig/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbs +91 -65
  31. metadata +2 -2
@@ -3,128 +3,186 @@
3
3
  module OpenAI
4
4
  module Models
5
5
  module Realtime
6
- class RealtimeTranscriptionSessionAudioInputTurnDetection < OpenAI::Internal::Type::BaseModel
7
- # @!attribute create_response
8
- # Whether or not to automatically generate a response when a VAD stop event
9
- # occurs.
10
- #
11
- # @return [Boolean, nil]
12
- optional :create_response, OpenAI::Internal::Type::Boolean
13
-
14
- # @!attribute eagerness
15
- # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
16
- # will wait longer for the user to continue speaking, `high` will respond more
17
- # quickly. `auto` is the default and is equivalent to `medium`.
18
- #
19
- # @return [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness, nil]
20
- optional :eagerness,
21
- enum: -> { OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness }
22
-
23
- # @!attribute idle_timeout_ms
24
- # Optional idle timeout after which turn detection will auto-timeout when no
25
- # additional audio is received.
26
- #
27
- # @return [Integer, nil]
28
- optional :idle_timeout_ms, Integer, nil?: true
29
-
30
- # @!attribute interrupt_response
31
- # Whether or not to automatically interrupt any ongoing response with output to
32
- # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
33
- # occurs.
34
- #
35
- # @return [Boolean, nil]
36
- optional :interrupt_response, OpenAI::Internal::Type::Boolean
37
-
38
- # @!attribute prefix_padding_ms
39
- # Used only for `server_vad` mode. Amount of audio to include before the VAD
40
- # detected speech (in milliseconds). Defaults to 300ms.
41
- #
42
- # @return [Integer, nil]
43
- optional :prefix_padding_ms, Integer
44
-
45
- # @!attribute silence_duration_ms
46
- # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
47
- # milliseconds). Defaults to 500ms. With shorter values the model will respond
48
- # more quickly, but may jump in on short pauses from the user.
49
- #
50
- # @return [Integer, nil]
51
- optional :silence_duration_ms, Integer
52
-
53
- # @!attribute threshold
54
- # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
55
- # defaults to 0.5. A higher threshold will require louder audio to activate the
56
- # model, and thus might perform better in noisy environments.
57
- #
58
- # @return [Float, nil]
59
- optional :threshold, Float
60
-
61
- # @!attribute type
62
- # Type of turn detection.
63
- #
64
- # @return [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type, nil]
65
- optional :type, enum: -> { OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type }
66
-
67
- # @!method initialize(create_response: nil, eagerness: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: nil)
68
- # Some parameter documentation has been truncated, see
69
- # {OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection}
70
- # for more details.
71
- #
72
- # Configuration for turn detection, either Server VAD or Semantic VAD. This can be
73
- # set to `null` to turn off, in which case the client must manually trigger model
74
- # response. Server VAD means that the model will detect the start and end of
75
- # speech based on audio volume and respond at the end of user speech. Semantic VAD
76
- # is more advanced and uses a turn detection model (in conjunction with VAD) to
77
- # semantically estimate whether the user has finished speaking, then dynamically
78
- # sets a timeout based on this probability. For example, if user audio trails off
79
- # with "uhhm", the model will score a low probability of turn end and wait longer
80
- # for the user to continue speaking. This can be useful for more natural
81
- # conversations, but may have a higher latency.
82
- #
83
- # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs
84
- #
85
- # @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
86
- #
87
- # @param idle_timeout_ms [Integer, nil] Optional idle timeout after which turn detection will auto-timeout when
88
- #
89
- # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th
90
- #
91
- # @param prefix_padding_ms [Integer] Used only for `server_vad` mode. Amount of audio to include before the VAD detec
92
- #
93
- # @param silence_duration_ms [Integer] Used only for `server_vad` mode. Duration of silence to detect speech stop (in m
94
- #
95
- # @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
96
- #
97
- # @param type [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::Type] Type of turn detection.
98
-
99
- # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
100
- # will wait longer for the user to continue speaking, `high` will respond more
101
- # quickly. `auto` is the default and is equivalent to `medium`.
102
- #
103
- # @see OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection#eagerness
104
- module Eagerness
105
- extend OpenAI::Internal::Type::Enum
106
-
107
- LOW = :low
108
- MEDIUM = :medium
109
- HIGH = :high
110
- AUTO = :auto
111
-
112
- # @!method self.values
113
- # @return [Array<Symbol>]
6
+ # Configuration for turn detection, either Server VAD or Semantic VAD. This can be
7
+ # set to `null` to turn off, in which case the client must manually trigger model
8
+ # response.
9
+ #
10
+ # Server VAD means that the model will detect the start and end of speech based on
11
+ # audio volume and respond at the end of user speech.
12
+ #
13
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
14
+ # with VAD) to semantically estimate whether the user has finished speaking, then
15
+ # dynamically sets a timeout based on this probability. For example, if user audio
16
+ # trails off with "uhhm", the model will score a low probability of turn end and
17
+ # wait longer for the user to continue speaking. This can be useful for more
18
+ # natural conversations, but may have a higher latency.
19
+ module RealtimeTranscriptionSessionAudioInputTurnDetection
20
+ extend OpenAI::Internal::Type::Union
21
+
22
+ discriminator :type
23
+
24
+ # Server-side voice activity detection (VAD) which flips on when user speech is detected and off after a period of silence.
25
+ variant :server_vad,
26
+ -> { OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad }
27
+
28
+ # Server-side semantic turn detection which uses a model to determine when the user has finished speaking.
29
+ variant :semantic_vad,
30
+ -> { OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad }
31
+
32
+ class ServerVad < OpenAI::Internal::Type::BaseModel
33
+ # @!attribute type
34
+ # Type of turn detection, `server_vad` to turn on simple Server VAD.
35
+ #
36
+ # @return [Symbol, :server_vad]
37
+ required :type, const: :server_vad
38
+
39
+ # @!attribute create_response
40
+ # Whether or not to automatically generate a response when a VAD stop event
41
+ # occurs.
42
+ #
43
+ # @return [Boolean, nil]
44
+ optional :create_response, OpenAI::Internal::Type::Boolean
45
+
46
+ # @!attribute idle_timeout_ms
47
+ # Optional timeout after which a model response will be triggered automatically.
48
+ # This is useful for situations in which a long pause from the user is unexpected,
49
+ # such as a phone call. The model will effectively prompt the user to continue the
50
+ # conversation based on the current context.
51
+ #
52
+ # The timeout value will be applied after the last model response's audio has
53
+ # finished playing, i.e. it's set to the `response.done` time plus audio playback
54
+ # duration.
55
+ #
56
+ # An `input_audio_buffer.timeout_triggered` event (plus events associated with the
57
+ # Response) will be emitted when the timeout is reached. Idle timeout is currently
58
+ # only supported for `server_vad` mode.
59
+ #
60
+ # @return [Integer, nil]
61
+ optional :idle_timeout_ms, Integer, nil?: true
62
+
63
+ # @!attribute interrupt_response
64
+ # Whether or not to automatically interrupt any ongoing response with output to
65
+ # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
66
+ # occurs.
67
+ #
68
+ # @return [Boolean, nil]
69
+ optional :interrupt_response, OpenAI::Internal::Type::Boolean
70
+
71
+ # @!attribute prefix_padding_ms
72
+ # Used only for `server_vad` mode. Amount of audio to include before the VAD
73
+ # detected speech (in milliseconds). Defaults to 300ms.
74
+ #
75
+ # @return [Integer, nil]
76
+ optional :prefix_padding_ms, Integer
77
+
78
+ # @!attribute silence_duration_ms
79
+ # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
80
+ # milliseconds). Defaults to 500ms. With shorter values the model will respond
81
+ # more quickly, but may jump in on short pauses from the user.
82
+ #
83
+ # @return [Integer, nil]
84
+ optional :silence_duration_ms, Integer
85
+
86
+ # @!attribute threshold
87
+ # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
88
+ # defaults to 0.5. A higher threshold will require louder audio to activate the
89
+ # model, and thus might perform better in noisy environments.
90
+ #
91
+ # @return [Float, nil]
92
+ optional :threshold, Float
93
+
94
+ # @!method initialize(create_response: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: :server_vad)
95
+ # Some parameter documentation has been truncated, see
96
+ # {OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad}
97
+ # for more details.
98
+ #
99
+ # Server-side voice activity detection (VAD) which flips on when user speech is
100
+ # detected and off after a period of silence.
101
+ #
102
+ # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs
103
+ #
104
+ # @param idle_timeout_ms [Integer, nil] Optional timeout after which a model response will be triggered automatically. T
105
+ #
106
+ # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th
107
+ #
108
+ # @param prefix_padding_ms [Integer] Used only for `server_vad` mode. Amount of audio to include before the VAD detec
109
+ #
110
+ # @param silence_duration_ms [Integer] Used only for `server_vad` mode. Duration of silence to detect speech stop (in m
111
+ #
112
+ # @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
113
+ #
114
+ # @param type [Symbol, :server_vad] Type of turn detection, `server_vad` to turn on simple Server VAD.
114
115
  end
115
116
 
116
- # Type of turn detection.
117
- #
118
- # @see OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection#type
119
- module Type
120
- extend OpenAI::Internal::Type::Enum
117
+ class SemanticVad < OpenAI::Internal::Type::BaseModel
118
+ # @!attribute type
119
+ # Type of turn detection, `semantic_vad` to turn on Semantic VAD.
120
+ #
121
+ # @return [Symbol, :semantic_vad]
122
+ required :type, const: :semantic_vad
123
+
124
+ # @!attribute create_response
125
+ # Whether or not to automatically generate a response when a VAD stop event
126
+ # occurs.
127
+ #
128
+ # @return [Boolean, nil]
129
+ optional :create_response, OpenAI::Internal::Type::Boolean
130
+
131
+ # @!attribute eagerness
132
+ # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
133
+ # will wait longer for the user to continue speaking, `high` will respond more
134
+ # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
135
+ # and `high` have max timeouts of 8s, 4s, and 2s respectively.
136
+ #
137
+ # @return [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness, nil]
138
+ optional :eagerness,
139
+ enum: -> { OpenAI::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness }
121
140
 
122
- SERVER_VAD = :server_vad
123
- SEMANTIC_VAD = :semantic_vad
141
+ # @!attribute interrupt_response
142
+ # Whether or not to automatically interrupt any ongoing response with output to
143
+ # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
144
+ # occurs.
145
+ #
146
+ # @return [Boolean, nil]
147
+ optional :interrupt_response, OpenAI::Internal::Type::Boolean
124
148
 
125
- # @!method self.values
126
- # @return [Array<Symbol>]
149
+ # @!method initialize(create_response: nil, eagerness: nil, interrupt_response: nil, type: :semantic_vad)
150
+ # Some parameter documentation has been truncated, see
151
+ # {OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad}
152
+ # for more details.
153
+ #
154
+ # Server-side semantic turn detection which uses a model to determine when the
155
+ # user has finished speaking.
156
+ #
157
+ # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs
158
+ #
159
+ # @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
160
+ #
161
+ # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th
162
+ #
163
+ # @param type [Symbol, :semantic_vad] Type of turn detection, `semantic_vad` to turn on Semantic VAD.
164
+
165
+ # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
166
+ # will wait longer for the user to continue speaking, `high` will respond more
167
+ # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
168
+ # and `high` have max timeouts of 8s, 4s, and 2s respectively.
169
+ #
170
+ # @see OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad#eagerness
171
+ module Eagerness
172
+ extend OpenAI::Internal::Type::Enum
173
+
174
+ LOW = :low
175
+ MEDIUM = :medium
176
+ HIGH = :high
177
+ AUTO = :auto
178
+
179
+ # @!method self.values
180
+ # @return [Array<Symbol>]
181
+ end
127
182
  end
183
+
184
+ # @!method self.variants
185
+ # @return [Array(OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeTranscriptionSessionAudioInputTurnDetection::SemanticVad)]
128
186
  end
129
187
  end
130
188
  end
@@ -259,10 +259,10 @@ module OpenAI
259
259
  # @!attribute truncation
260
260
  # The truncation strategy to use for the model response.
261
261
  #
262
- # - `auto`: If the context of this response and previous ones exceeds the model's
263
- # context window size, the model will truncate the response to fit the context
264
- # window by dropping input items in the middle of the conversation.
265
- # - `disabled` (default): If a model response will exceed the context window size
262
+ # - `auto`: If the input to this Response exceeds the model's context window size,
263
+ # the model will truncate the response to fit the context window by dropping
264
+ # items from the beginning of the conversation.
265
+ # - `disabled` (default): If the input size will exceed the context window size
266
266
  # for a model, the request will fail with a 400 error.
267
267
  #
268
268
  # @return [Symbol, OpenAI::Models::Responses::Response::Truncation, nil]
@@ -510,10 +510,10 @@ module OpenAI
510
510
 
511
511
  # The truncation strategy to use for the model response.
512
512
  #
513
- # - `auto`: If the context of this response and previous ones exceeds the model's
514
- # context window size, the model will truncate the response to fit the context
515
- # window by dropping input items in the middle of the conversation.
516
- # - `disabled` (default): If a model response will exceed the context window size
513
+ # - `auto`: If the input to this Response exceeds the model's context window size,
514
+ # the model will truncate the response to fit the context window by dropping
515
+ # items from the beginning of the conversation.
516
+ # - `disabled` (default): If the input size will exceed the context window size
517
517
  # for a model, the request will fail with a 400 error.
518
518
  #
519
519
  # @see OpenAI::Models::Responses::Response#truncation
@@ -276,10 +276,10 @@ module OpenAI
276
276
  # @!attribute truncation
277
277
  # The truncation strategy to use for the model response.
278
278
  #
279
- # - `auto`: If the context of this response and previous ones exceeds the model's
280
- # context window size, the model will truncate the response to fit the context
281
- # window by dropping input items in the middle of the conversation.
282
- # - `disabled` (default): If a model response will exceed the context window size
279
+ # - `auto`: If the input to this Response exceeds the model's context window size,
280
+ # the model will truncate the response to fit the context window by dropping
281
+ # items from the beginning of the conversation.
282
+ # - `disabled` (default): If the input size will exceed the context window size
283
283
  # for a model, the request will fail with a 400 error.
284
284
  #
285
285
  # @return [Symbol, OpenAI::Models::Responses::ResponseCreateParams::Truncation, nil]
@@ -485,10 +485,10 @@ module OpenAI
485
485
 
486
486
  # The truncation strategy to use for the model response.
487
487
  #
488
- # - `auto`: If the context of this response and previous ones exceeds the model's
489
- # context window size, the model will truncate the response to fit the context
490
- # window by dropping input items in the middle of the conversation.
491
- # - `disabled` (default): If a model response will exceed the context window size
488
+ # - `auto`: If the input to this Response exceeds the model's context window size,
489
+ # the model will truncate the response to fit the context window by dropping
490
+ # items from the beginning of the conversation.
491
+ # - `disabled` (default): If the input size will exceed the context window size
492
492
  # for a model, the request will fail with a 400 error.
493
493
  module Truncation
494
494
  extend OpenAI::Internal::Type::Enum
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module OpenAI
4
- VERSION = "0.23.1"
4
+ VERSION = "0.23.2"
5
5
  end
@@ -12,11 +12,13 @@ module OpenAI
12
12
  )
13
13
  end
14
14
 
15
- # Millisecond offset where speech ended within the buffered audio.
15
+ # Millisecond offset of audio written to the input audio buffer at the time the
16
+ # timeout was triggered.
16
17
  sig { returns(Integer) }
17
18
  attr_accessor :audio_end_ms
18
19
 
19
- # Millisecond offset where speech started within the buffered audio.
20
+ # Millisecond offset of audio written to the input audio buffer that was after the
21
+ # playback time of the last model response.
20
22
  sig { returns(Integer) }
21
23
  attr_accessor :audio_start_ms
22
24
 
@@ -32,7 +34,22 @@ module OpenAI
32
34
  sig { returns(Symbol) }
33
35
  attr_accessor :type
34
36
 
35
- # Returned when the server VAD timeout is triggered for the input audio buffer.
37
+ # Returned when the Server VAD timeout is triggered for the input audio buffer.
38
+ # This is configured with `idle_timeout_ms` in the `turn_detection` settings of
39
+ # the session, and it indicates that there hasn't been any speech detected for the
40
+ # configured duration.
41
+ #
42
+ # The `audio_start_ms` and `audio_end_ms` fields indicate the segment of audio
43
+ # after the last model response up to the triggering time, as an offset from the
44
+ # beginning of audio written to the input audio buffer. This means it demarcates
45
+ # the segment of audio that was silent and the difference between the start and
46
+ # end values will roughly match the configured timeout.
47
+ #
48
+ # The empty audio will be committed to the conversation as an `input_audio` item
49
+ # (there will be a `input_audio_buffer.committed` event) and a model response will
50
+ # be generated. There may be speech that didn't trigger VAD but is still detected
51
+ # by the model, so the model may respond with something relevant to the
52
+ # conversation or a prompt to continue speaking.
36
53
  sig do
37
54
  params(
38
55
  audio_end_ms: Integer,
@@ -43,9 +60,11 @@ module OpenAI
43
60
  ).returns(T.attached_class)
44
61
  end
45
62
  def self.new(
46
- # Millisecond offset where speech ended within the buffered audio.
63
+ # Millisecond offset of audio written to the input audio buffer at the time the
64
+ # timeout was triggered.
47
65
  audio_end_ms:,
48
- # Millisecond offset where speech started within the buffered audio.
66
+ # Millisecond offset of audio written to the input audio buffer that was after the
67
+ # playback time of the last model response.
49
68
  audio_start_ms:,
50
69
  # The unique ID of the server event.
51
70
  event_id:,
@@ -80,26 +80,28 @@ module OpenAI
80
80
 
81
81
  # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
82
82
  # set to `null` to turn off, in which case the client must manually trigger model
83
- # response. Server VAD means that the model will detect the start and end of
84
- # speech based on audio volume and respond at the end of user speech. Semantic VAD
85
- # is more advanced and uses a turn detection model (in conjunction with VAD) to
86
- # semantically estimate whether the user has finished speaking, then dynamically
87
- # sets a timeout based on this probability. For example, if user audio trails off
88
- # with "uhhm", the model will score a low probability of turn end and wait longer
89
- # for the user to continue speaking. This can be useful for more natural
90
- # conversations, but may have a higher latency.
83
+ # response.
84
+ #
85
+ # Server VAD means that the model will detect the start and end of speech based on
86
+ # audio volume and respond at the end of user speech.
87
+ #
88
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
89
+ # with VAD) to semantically estimate whether the user has finished speaking, then
90
+ # dynamically sets a timeout based on this probability. For example, if user audio
91
+ # trails off with "uhhm", the model will score a low probability of turn end and
92
+ # wait longer for the user to continue speaking. This can be useful for more
93
+ # natural conversations, but may have a higher latency.
91
94
  sig do
92
- returns(T.nilable(OpenAI::Realtime::RealtimeAudioInputTurnDetection))
93
- end
94
- attr_reader :turn_detection
95
-
96
- sig do
97
- params(
98
- turn_detection:
99
- OpenAI::Realtime::RealtimeAudioInputTurnDetection::OrHash
100
- ).void
95
+ returns(
96
+ T.nilable(
97
+ T.any(
98
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad,
99
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad
100
+ )
101
+ )
102
+ )
101
103
  end
102
- attr_writer :turn_detection
104
+ attr_accessor :turn_detection
103
105
 
104
106
  sig do
105
107
  params(
@@ -113,7 +115,12 @@ module OpenAI
113
115
  OpenAI::Realtime::RealtimeAudioConfigInput::NoiseReduction::OrHash,
114
116
  transcription: OpenAI::Realtime::AudioTranscription::OrHash,
115
117
  turn_detection:
116
- OpenAI::Realtime::RealtimeAudioInputTurnDetection::OrHash
118
+ T.nilable(
119
+ T.any(
120
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad::OrHash,
121
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::OrHash
122
+ )
123
+ )
117
124
  ).returns(T.attached_class)
118
125
  end
119
126
  def self.new(
@@ -136,14 +143,17 @@ module OpenAI
136
143
  transcription: nil,
137
144
  # Configuration for turn detection, either Server VAD or Semantic VAD. This can be
138
145
  # set to `null` to turn off, in which case the client must manually trigger model
139
- # response. Server VAD means that the model will detect the start and end of
140
- # speech based on audio volume and respond at the end of user speech. Semantic VAD
141
- # is more advanced and uses a turn detection model (in conjunction with VAD) to
142
- # semantically estimate whether the user has finished speaking, then dynamically
143
- # sets a timeout based on this probability. For example, if user audio trails off
144
- # with "uhhm", the model will score a low probability of turn end and wait longer
145
- # for the user to continue speaking. This can be useful for more natural
146
- # conversations, but may have a higher latency.
146
+ # response.
147
+ #
148
+ # Server VAD means that the model will detect the start and end of speech based on
149
+ # audio volume and respond at the end of user speech.
150
+ #
151
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
152
+ # with VAD) to semantically estimate whether the user has finished speaking, then
153
+ # dynamically sets a timeout based on this probability. For example, if user audio
154
+ # trails off with "uhhm", the model will score a low probability of turn end and
155
+ # wait longer for the user to continue speaking. This can be useful for more
156
+ # natural conversations, but may have a higher latency.
147
157
  turn_detection: nil
148
158
  )
149
159
  end
@@ -160,7 +170,13 @@ module OpenAI
160
170
  noise_reduction:
161
171
  OpenAI::Realtime::RealtimeAudioConfigInput::NoiseReduction,
162
172
  transcription: OpenAI::Realtime::AudioTranscription,
163
- turn_detection: OpenAI::Realtime::RealtimeAudioInputTurnDetection
173
+ turn_detection:
174
+ T.nilable(
175
+ T.any(
176
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad,
177
+ OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad
178
+ )
179
+ )
164
180
  }
165
181
  )
166
182
  end