openai 0.23.1 → 0.23.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/README.md +1 -1
  4. data/lib/openai/models/realtime/input_audio_buffer_timeout_triggered.rb +25 -5
  5. data/lib/openai/models/realtime/realtime_audio_config_input.rb +14 -11
  6. data/lib/openai/models/realtime/realtime_audio_input_turn_detection.rb +173 -117
  7. data/lib/openai/models/realtime/realtime_server_event.rb +13 -1
  8. data/lib/openai/models/realtime/realtime_session.rb +179 -118
  9. data/lib/openai/models/realtime/realtime_session_create_response.rb +184 -122
  10. data/lib/openai/models/realtime/realtime_transcription_session_audio_input.rb +16 -11
  11. data/lib/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rb +175 -117
  12. data/lib/openai/models/responses/response.rb +8 -8
  13. data/lib/openai/models/responses/response_create_params.rb +8 -8
  14. data/lib/openai/version.rb +1 -1
  15. data/rbi/openai/models/realtime/input_audio_buffer_timeout_triggered.rbi +24 -5
  16. data/rbi/openai/models/realtime/realtime_audio_config_input.rbi +44 -28
  17. data/rbi/openai/models/realtime/realtime_audio_input_turn_detection.rbi +264 -203
  18. data/rbi/openai/models/realtime/realtime_session.rbi +306 -231
  19. data/rbi/openai/models/realtime/realtime_session_create_response.rbi +298 -232
  20. data/rbi/openai/models/realtime/realtime_transcription_session_audio_input.rbi +39 -28
  21. data/rbi/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbi +264 -200
  22. data/rbi/openai/models/responses/response.rbi +12 -12
  23. data/rbi/openai/models/responses/response_create_params.rbi +12 -12
  24. data/rbi/openai/resources/responses.rbi +8 -8
  25. data/sig/openai/models/realtime/realtime_audio_config_input.rbs +4 -8
  26. data/sig/openai/models/realtime/realtime_audio_input_turn_detection.rbs +91 -65
  27. data/sig/openai/models/realtime/realtime_session.rbs +95 -69
  28. data/sig/openai/models/realtime/realtime_session_create_response.rbs +95 -73
  29. data/sig/openai/models/realtime/realtime_transcription_session_audio_input.rbs +4 -8
  30. data/sig/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbs +91 -65
  31. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c61159341c0fe919d9f1042f0c705f57d05fc656407e03498bdf1367f704b6ca
4
- data.tar.gz: 26c9718404d8d1519acb3d9364f9918c623952679a0b29d11c159ebb070a6227
3
+ metadata.gz: 7603886aae923da50eb83881be4d69cf1b5be3616b9903a784f16bcd124c023d
4
+ data.tar.gz: 03feb2933c7a27590301dc9c805d66ce2e7f45e743515e7e46bf9bbf9d61dfe3
5
5
  SHA512:
6
- metadata.gz: e5bebf2d7459cf64493d9ce74b839ad8c9326df50953ff064ecd9b5a912c077ebef0c0c38b6a95b40f01b7f37e161ed42be30d985e7b608ca76db18908da2d8d
7
- data.tar.gz: 0a1fb39beb43b71336e7d0a398bbb56705c850de30f6eb110821b974967e21848d6cc5b626e57434df8bc247b4cd23f71f3c0b41ff425fe39b606ea44454fe2d
6
+ metadata.gz: 6de05c188535d0ed867bf936a6e97a7f452bd6924678b672d23f7868feeda7144df86dac454bfa29300d81844799c1ab77adc42767a314975d566acacb3aaefd
7
+ data.tar.gz: bf693ec699be028060d13db1fa9bd5ef3bb980118d5c4b292877bc6a16a88bd743ee10cd6c6a9d3828aa15d31e8e55058d660d831af3179050c68b110d8c705f
data/CHANGELOG.md CHANGED
@@ -1,5 +1,13 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.23.2 (2025-09-11)
4
+
5
+ Full Changelog: [v0.23.1...v0.23.2](https://github.com/openai/openai-ruby/compare/v0.23.1...v0.23.2)
6
+
7
+ ### Chores
8
+
9
+ * **api:** Minor docs and type updates for realtime ([ccef982](https://github.com/openai/openai-ruby/commit/ccef9827b31206fc9ba40d2b6165eeefda7621f5))
10
+
3
11
  ## 0.23.1 (2025-09-10)
4
12
 
5
13
  Full Changelog: [v0.23.0...v0.23.1](https://github.com/openai/openai-ruby/compare/v0.23.0...v0.23.1)
data/README.md CHANGED
@@ -15,7 +15,7 @@ To use this gem, install via Bundler by adding the following to your application
15
15
  <!-- x-release-please-start-version -->
16
16
 
17
17
  ```ruby
18
- gem "openai", "~> 0.23.1"
18
+ gem "openai", "~> 0.23.2"
19
19
  ```
20
20
 
21
21
  <!-- x-release-please-end -->
@@ -5,13 +5,15 @@ module OpenAI
5
5
  module Realtime
6
6
  class InputAudioBufferTimeoutTriggered < OpenAI::Internal::Type::BaseModel
7
7
  # @!attribute audio_end_ms
8
- # Millisecond offset where speech ended within the buffered audio.
8
+ # Millisecond offset of audio written to the input audio buffer at the time the
9
+ # timeout was triggered.
9
10
  #
10
11
  # @return [Integer]
11
12
  required :audio_end_ms, Integer
12
13
 
13
14
  # @!attribute audio_start_ms
14
- # Millisecond offset where speech started within the buffered audio.
15
+ # Millisecond offset of audio written to the input audio buffer that was after the
16
+ # playback time of the last model response.
15
17
  #
16
18
  # @return [Integer]
17
19
  required :audio_start_ms, Integer
@@ -35,11 +37,29 @@ module OpenAI
35
37
  required :type, const: :"input_audio_buffer.timeout_triggered"
36
38
 
37
39
  # @!method initialize(audio_end_ms:, audio_start_ms:, event_id:, item_id:, type: :"input_audio_buffer.timeout_triggered")
38
- # Returned when the server VAD timeout is triggered for the input audio buffer.
40
+ # Some parameter documentations has been truncated, see
41
+ # {OpenAI::Models::Realtime::InputAudioBufferTimeoutTriggered} for more details.
39
42
  #
40
- # @param audio_end_ms [Integer] Millisecond offset where speech ended within the buffered audio.
43
+ # Returned when the Server VAD timeout is triggered for the input audio buffer.
44
+ # This is configured with `idle_timeout_ms` in the `turn_detection` settings of
45
+ # the session, and it indicates that there hasn't been any speech detected for the
46
+ # configured duration.
41
47
  #
42
- # @param audio_start_ms [Integer] Millisecond offset where speech started within the buffered audio.
48
+ # The `audio_start_ms` and `audio_end_ms` fields indicate the segment of audio
49
+ # after the last model response up to the triggering time, as an offset from the
50
+ # beginning of audio written to the input audio buffer. This means it demarcates
51
+ # the segment of audio that was silent and the difference between the start and
52
+ # end values will roughly match the configured timeout.
53
+ #
54
+ # The empty audio will be committed to the conversation as an `input_audio` item
55
+ # (there will be a `input_audio_buffer.committed` event) and a model response will
56
+ # be generated. There may be speech that didn't trigger VAD but is still detected
57
+ # by the model, so the model may respond with something relevant to the
58
+ # conversation or a prompt to continue speaking.
59
+ #
60
+ # @param audio_end_ms [Integer] Millisecond offset of audio written to the input audio buffer at the time the ti
61
+ #
62
+ # @param audio_start_ms [Integer] Millisecond offset of audio written to the input audio buffer that was after the
43
63
  #
44
64
  # @param event_id [String] The unique ID of the server event.
45
65
  #
@@ -36,17 +36,20 @@ module OpenAI
36
36
  # @!attribute turn_detection
37
37
  # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
38
38
  # set to `null` to turn off, in which case the client must manually trigger model
39
- # response. Server VAD means that the model will detect the start and end of
40
- # speech based on audio volume and respond at the end of user speech. Semantic VAD
41
- # is more advanced and uses a turn detection model (in conjunction with VAD) to
42
- # semantically estimate whether the user has finished speaking, then dynamically
43
- # sets a timeout based on this probability. For example, if user audio trails off
44
- # with "uhhm", the model will score a low probability of turn end and wait longer
45
- # for the user to continue speaking. This can be useful for more natural
46
- # conversations, but may have a higher latency.
39
+ # response.
47
40
  #
48
- # @return [OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection, nil]
49
- optional :turn_detection, -> { OpenAI::Realtime::RealtimeAudioInputTurnDetection }
41
+ # Server VAD means that the model will detect the start and end of speech based on
42
+ # audio volume and respond at the end of user speech.
43
+ #
44
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
45
+ # with VAD) to semantically estimate whether the user has finished speaking, then
46
+ # dynamically sets a timeout based on this probability. For example, if user audio
47
+ # trails off with "uhhm", the model will score a low probability of turn end and
48
+ # wait longer for the user to continue speaking. This can be useful for more
49
+ # natural conversations, but may have a higher latency.
50
+ #
51
+ # @return [OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad, nil]
52
+ optional :turn_detection, union: -> { OpenAI::Realtime::RealtimeAudioInputTurnDetection }, nil?: true
50
53
 
51
54
  # @!method initialize(format_: nil, noise_reduction: nil, transcription: nil, turn_detection: nil)
52
55
  # Some parameter documentations has been truncated, see
@@ -58,7 +61,7 @@ module OpenAI
58
61
  #
59
62
  # @param transcription [OpenAI::Models::Realtime::AudioTranscription] Configuration for input audio transcription, defaults to off and can be set to `
60
63
  #
61
- # @param turn_detection [OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection] Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
64
+ # @param turn_detection [OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad, nil] Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
62
65
 
63
66
  # @see OpenAI::Models::Realtime::RealtimeAudioConfigInput#noise_reduction
64
67
  class NoiseReduction < OpenAI::Internal::Type::BaseModel
@@ -3,128 +3,184 @@
3
3
  module OpenAI
4
4
  module Models
5
5
  module Realtime
6
- class RealtimeAudioInputTurnDetection < OpenAI::Internal::Type::BaseModel
7
- # @!attribute create_response
8
- # Whether or not to automatically generate a response when a VAD stop event
9
- # occurs.
10
- #
11
- # @return [Boolean, nil]
12
- optional :create_response, OpenAI::Internal::Type::Boolean
13
-
14
- # @!attribute eagerness
15
- # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
16
- # will wait longer for the user to continue speaking, `high` will respond more
17
- # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
18
- # and `high` have max timeouts of 8s, 4s, and 2s respectively.
19
- #
20
- # @return [Symbol, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::Eagerness, nil]
21
- optional :eagerness, enum: -> { OpenAI::Realtime::RealtimeAudioInputTurnDetection::Eagerness }
22
-
23
- # @!attribute idle_timeout_ms
24
- # Optional idle timeout after which turn detection will auto-timeout when no
25
- # additional audio is received and emits a `timeout_triggered` event.
26
- #
27
- # @return [Integer, nil]
28
- optional :idle_timeout_ms, Integer, nil?: true
29
-
30
- # @!attribute interrupt_response
31
- # Whether or not to automatically interrupt any ongoing response with output to
32
- # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
33
- # occurs.
34
- #
35
- # @return [Boolean, nil]
36
- optional :interrupt_response, OpenAI::Internal::Type::Boolean
37
-
38
- # @!attribute prefix_padding_ms
39
- # Used only for `server_vad` mode. Amount of audio to include before the VAD
40
- # detected speech (in milliseconds). Defaults to 300ms.
41
- #
42
- # @return [Integer, nil]
43
- optional :prefix_padding_ms, Integer
44
-
45
- # @!attribute silence_duration_ms
46
- # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
47
- # milliseconds). Defaults to 500ms. With shorter values the model will respond
48
- # more quickly, but may jump in on short pauses from the user.
49
- #
50
- # @return [Integer, nil]
51
- optional :silence_duration_ms, Integer
52
-
53
- # @!attribute threshold
54
- # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
55
- # defaults to 0.5. A higher threshold will require louder audio to activate the
56
- # model, and thus might perform better in noisy environments.
57
- #
58
- # @return [Float, nil]
59
- optional :threshold, Float
60
-
61
- # @!attribute type
62
- # Type of turn detection.
63
- #
64
- # @return [Symbol, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::Type, nil]
65
- optional :type, enum: -> { OpenAI::Realtime::RealtimeAudioInputTurnDetection::Type }
66
-
67
- # @!method initialize(create_response: nil, eagerness: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: nil)
68
- # Some parameter documentations has been truncated, see
69
- # {OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection} for more details.
70
- #
71
- # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
72
- # set to `null` to turn off, in which case the client must manually trigger model
73
- # response. Server VAD means that the model will detect the start and end of
74
- # speech based on audio volume and respond at the end of user speech. Semantic VAD
75
- # is more advanced and uses a turn detection model (in conjunction with VAD) to
76
- # semantically estimate whether the user has finished speaking, then dynamically
77
- # sets a timeout based on this probability. For example, if user audio trails off
78
- # with "uhhm", the model will score a low probability of turn end and wait longer
79
- # for the user to continue speaking. This can be useful for more natural
80
- # conversations, but may have a higher latency.
81
- #
82
- # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs
83
- #
84
- # @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
85
- #
86
- # @param idle_timeout_ms [Integer, nil] Optional idle timeout after which turn detection will auto-timeout when
87
- #
88
- # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th
89
- #
90
- # @param prefix_padding_ms [Integer] Used only for `server_vad` mode. Amount of audio to include before the VAD detec
91
- #
92
- # @param silence_duration_ms [Integer] Used only for `server_vad` mode. Duration of silence to detect speech stop (in m
93
- #
94
- # @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
95
- #
96
- # @param type [Symbol, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::Type] Type of turn detection.
97
-
98
- # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
99
- # will wait longer for the user to continue speaking, `high` will respond more
100
- # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
101
- # and `high` have max timeouts of 8s, 4s, and 2s respectively.
102
- #
103
- # @see OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection#eagerness
104
- module Eagerness
105
- extend OpenAI::Internal::Type::Enum
106
-
107
- LOW = :low
108
- MEDIUM = :medium
109
- HIGH = :high
110
- AUTO = :auto
111
-
112
- # @!method self.values
113
- # @return [Array<Symbol>]
6
+ # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
7
+ # set to `null` to turn off, in which case the client must manually trigger model
8
+ # response.
9
+ #
10
+ # Server VAD means that the model will detect the start and end of speech based on
11
+ # audio volume and respond at the end of user speech.
12
+ #
13
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
14
+ # with VAD) to semantically estimate whether the user has finished speaking, then
15
+ # dynamically sets a timeout based on this probability. For example, if user audio
16
+ # trails off with "uhhm", the model will score a low probability of turn end and
17
+ # wait longer for the user to continue speaking. This can be useful for more
18
+ # natural conversations, but may have a higher latency.
19
+ module RealtimeAudioInputTurnDetection
20
+ extend OpenAI::Internal::Type::Union
21
+
22
+ discriminator :type
23
+
24
+ # Server-side voice activity detection (VAD) which flips on when user speech is detected and off after a period of silence.
25
+ variant :server_vad, -> { OpenAI::Realtime::RealtimeAudioInputTurnDetection::ServerVad }
26
+
27
+ # Server-side semantic turn detection which uses a model to determine when the user has finished speaking.
28
+ variant :semantic_vad, -> { OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad }
29
+
30
+ class ServerVad < OpenAI::Internal::Type::BaseModel
31
+ # @!attribute type
32
+ # Type of turn detection, `server_vad` to turn on simple Server VAD.
33
+ #
34
+ # @return [Symbol, :server_vad]
35
+ required :type, const: :server_vad
36
+
37
+ # @!attribute create_response
38
+ # Whether or not to automatically generate a response when a VAD stop event
39
+ # occurs.
40
+ #
41
+ # @return [Boolean, nil]
42
+ optional :create_response, OpenAI::Internal::Type::Boolean
43
+
44
+ # @!attribute idle_timeout_ms
45
+ # Optional timeout after which a model response will be triggered automatically.
46
+ # This is useful for situations in which a long pause from the user is unexpected,
47
+ # such as a phone call. The model will effectively prompt the user to continue the
48
+ # conversation based on the current context.
49
+ #
50
+ # The timeout value will be applied after the last model response's audio has
51
+ # finished playing, i.e. it's set to the `response.done` time plus audio playback
52
+ # duration.
53
+ #
54
+ # An `input_audio_buffer.timeout_triggered` event (plus events associated with the
55
+ # Response) will be emitted when the timeout is reached. Idle timeout is currently
56
+ # only supported for `server_vad` mode.
57
+ #
58
+ # @return [Integer, nil]
59
+ optional :idle_timeout_ms, Integer, nil?: true
60
+
61
+ # @!attribute interrupt_response
62
+ # Whether or not to automatically interrupt any ongoing response with output to
63
+ # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
64
+ # occurs.
65
+ #
66
+ # @return [Boolean, nil]
67
+ optional :interrupt_response, OpenAI::Internal::Type::Boolean
68
+
69
+ # @!attribute prefix_padding_ms
70
+ # Used only for `server_vad` mode. Amount of audio to include before the VAD
71
+ # detected speech (in milliseconds). Defaults to 300ms.
72
+ #
73
+ # @return [Integer, nil]
74
+ optional :prefix_padding_ms, Integer
75
+
76
+ # @!attribute silence_duration_ms
77
+ # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
78
+ # milliseconds). Defaults to 500ms. With shorter values the model will respond
79
+ # more quickly, but may jump in on short pauses from the user.
80
+ #
81
+ # @return [Integer, nil]
82
+ optional :silence_duration_ms, Integer
83
+
84
+ # @!attribute threshold
85
+ # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
86
+ # defaults to 0.5. A higher threshold will require louder audio to activate the
87
+ # model, and thus might perform better in noisy environments.
88
+ #
89
+ # @return [Float, nil]
90
+ optional :threshold, Float
91
+
92
+ # @!method initialize(create_response: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: :server_vad)
93
+ # Some parameter documentations has been truncated, see
94
+ # {OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::ServerVad} for more
95
+ # details.
96
+ #
97
+ # Server-side voice activity detection (VAD) which flips on when user speech is
98
+ # detected and off after a period of silence.
99
+ #
100
+ # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs
101
+ #
102
+ # @param idle_timeout_ms [Integer, nil] Optional timeout after which a model response will be triggered automatically. T
103
+ #
104
+ # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th
105
+ #
106
+ # @param prefix_padding_ms [Integer] Used only for `server_vad` mode. Amount of audio to include before the VAD detec
107
+ #
108
+ # @param silence_duration_ms [Integer] Used only for `server_vad` mode. Duration of silence to detect speech stop (in m
109
+ #
110
+ # @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
111
+ #
112
+ # @param type [Symbol, :server_vad] Type of turn detection, `server_vad` to turn on simple Server VAD.
114
113
  end
115
114
 
116
- # Type of turn detection.
117
- #
118
- # @see OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection#type
119
- module Type
120
- extend OpenAI::Internal::Type::Enum
115
+ class SemanticVad < OpenAI::Internal::Type::BaseModel
116
+ # @!attribute type
117
+ # Type of turn detection, `semantic_vad` to turn on Semantic VAD.
118
+ #
119
+ # @return [Symbol, :semantic_vad]
120
+ required :type, const: :semantic_vad
121
+
122
+ # @!attribute create_response
123
+ # Whether or not to automatically generate a response when a VAD stop event
124
+ # occurs.
125
+ #
126
+ # @return [Boolean, nil]
127
+ optional :create_response, OpenAI::Internal::Type::Boolean
128
+
129
+ # @!attribute eagerness
130
+ # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
131
+ # will wait longer for the user to continue speaking, `high` will respond more
132
+ # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
133
+ # and `high` have max timeouts of 8s, 4s, and 2s respectively.
134
+ #
135
+ # @return [Symbol, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness, nil]
136
+ optional :eagerness,
137
+ enum: -> { OpenAI::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness }
121
138
 
122
- SERVER_VAD = :server_vad
123
- SEMANTIC_VAD = :semantic_vad
139
+ # @!attribute interrupt_response
140
+ # Whether or not to automatically interrupt any ongoing response with output to
141
+ # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
142
+ # occurs.
143
+ #
144
+ # @return [Boolean, nil]
145
+ optional :interrupt_response, OpenAI::Internal::Type::Boolean
124
146
 
125
- # @!method self.values
126
- # @return [Array<Symbol>]
147
+ # @!method initialize(create_response: nil, eagerness: nil, interrupt_response: nil, type: :semantic_vad)
148
+ # Some parameter documentations has been truncated, see
149
+ # {OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad} for
150
+ # more details.
151
+ #
152
+ # Server-side semantic turn detection which uses a model to determine when the
153
+ # user has finished speaking.
154
+ #
155
+ # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs
156
+ #
157
+ # @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
158
+ #
159
+ # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th
160
+ #
161
+ # @param type [Symbol, :semantic_vad] Type of turn detection, `semantic_vad` to turn on Semantic VAD.
162
+
163
+ # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
164
+ # will wait longer for the user to continue speaking, `high` will respond more
165
+ # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
166
+ # and `high` have max timeouts of 8s, 4s, and 2s respectively.
167
+ #
168
+ # @see OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad#eagerness
169
+ module Eagerness
170
+ extend OpenAI::Internal::Type::Enum
171
+
172
+ LOW = :low
173
+ MEDIUM = :medium
174
+ HIGH = :high
175
+ AUTO = :auto
176
+
177
+ # @!method self.values
178
+ # @return [Array<Symbol>]
179
+ end
127
180
  end
181
+
182
+ # @!method self.variants
183
+ # @return [Array(OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeAudioInputTurnDetection::SemanticVad)]
128
184
  end
129
185
  end
130
186
  end
@@ -208,7 +208,19 @@ module OpenAI
208
208
  # The event will include the full content of the Item except for audio data, which can be retrieved separately with a `conversation.item.retrieve` event if needed.
209
209
  variant :"conversation.item.done", -> { OpenAI::Realtime::ConversationItemDone }
210
210
 
211
- # Returned when the server VAD timeout is triggered for the input audio buffer.
211
+ # Returned when the Server VAD timeout is triggered for the input audio buffer. This is configured
212
+ # with `idle_timeout_ms` in the `turn_detection` settings of the session, and it indicates that
213
+ # there hasn't been any speech detected for the configured duration.
214
+ #
215
+ # The `audio_start_ms` and `audio_end_ms` fields indicate the segment of audio after the last
216
+ # model response up to the triggering time, as an offset from the beginning of audio written
217
+ # to the input audio buffer. This means it demarcates the segment of audio that was silent and
218
+ # the difference between the start and end values will roughly match the configured timeout.
219
+ #
220
+ # The empty audio will be committed to the conversation as an `input_audio` item (there will be a
221
+ # `input_audio_buffer.committed` event) and a model response will be generated. There may be speech
222
+ # that didn't trigger VAD but is still detected by the model, so the model may respond with
223
+ # something relevant to the conversation or a prompt to continue speaking.
212
224
  variant :"input_audio_buffer.timeout_triggered", -> { OpenAI::Realtime::InputAudioBufferTimeoutTriggered }
213
225
 
214
226
  # Returned when an input audio transcription segment is identified for an item.