openai 0.23.1 → 0.23.2
This diff shows the changes between publicly released versions of this package as they appear in their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +1 -1
- data/lib/openai/models/realtime/input_audio_buffer_timeout_triggered.rb +25 -5
- data/lib/openai/models/realtime/realtime_audio_config_input.rb +14 -11
- data/lib/openai/models/realtime/realtime_audio_input_turn_detection.rb +173 -117
- data/lib/openai/models/realtime/realtime_server_event.rb +13 -1
- data/lib/openai/models/realtime/realtime_session.rb +179 -118
- data/lib/openai/models/realtime/realtime_session_create_response.rb +184 -122
- data/lib/openai/models/realtime/realtime_transcription_session_audio_input.rb +16 -11
- data/lib/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rb +175 -117
- data/lib/openai/models/responses/response.rb +8 -8
- data/lib/openai/models/responses/response_create_params.rb +8 -8
- data/lib/openai/version.rb +1 -1
- data/rbi/openai/models/realtime/input_audio_buffer_timeout_triggered.rbi +24 -5
- data/rbi/openai/models/realtime/realtime_audio_config_input.rbi +44 -28
- data/rbi/openai/models/realtime/realtime_audio_input_turn_detection.rbi +264 -203
- data/rbi/openai/models/realtime/realtime_session.rbi +306 -231
- data/rbi/openai/models/realtime/realtime_session_create_response.rbi +298 -232
- data/rbi/openai/models/realtime/realtime_transcription_session_audio_input.rbi +39 -28
- data/rbi/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbi +264 -200
- data/rbi/openai/models/responses/response.rbi +12 -12
- data/rbi/openai/models/responses/response_create_params.rbi +12 -12
- data/rbi/openai/resources/responses.rbi +8 -8
- data/sig/openai/models/realtime/realtime_audio_config_input.rbs +4 -8
- data/sig/openai/models/realtime/realtime_audio_input_turn_detection.rbs +91 -65
- data/sig/openai/models/realtime/realtime_session.rbs +95 -69
- data/sig/openai/models/realtime/realtime_session_create_response.rbs +95 -73
- data/sig/openai/models/realtime/realtime_transcription_session_audio_input.rbs +4 -8
- data/sig/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbs +91 -65
- metadata +2 -2
@@ -256,28 +256,28 @@ module OpenAI
 
  # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
  # set to `null` to turn off, in which case the client must manually trigger model
- # response.
- #
- #
- #
- #
- #
- #
- #
-
-
-
- attr_reader :turn_detection
-
+ # response.
+ #
+ # Server VAD means that the model will detect the start and end of speech based on
+ # audio volume and respond at the end of user speech.
+ #
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ # with VAD) to semantically estimate whether the user has finished speaking, then
+ # dynamically sets a timeout based on this probability. For example, if user audio
+ # trails off with "uhhm", the model will score a low probability of turn end and
+ # wait longer for the user to continue speaking. This can be useful for more
+ # natural conversations, but may have a higher latency.
  sig do
-
-
- T.
- OpenAI::Realtime::RealtimeSession::TurnDetection::
+ returns(
+ T.nilable(
+ T.any(
+ OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad,
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad
  )
-
+ )
+ )
  end
-
+ attr_accessor :turn_detection
 
  # The voice the model uses to respond. Voice cannot be changed during the session
  # once the model has responded with audio at least once. Current voice options are
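In practical terms, this hunk changes `turn_detection` on `OpenAI::Realtime::RealtimeSession` from a single `TurnDetection` model to a nilable union of the new `ServerVad` and `SemanticVad` classes, exposed through `attr_accessor`. A minimal reading sketch under that new shape (the `session` variable and the printing are illustrative, not from the gem):

# Sketch only: dispatch on the union introduced in 0.23.2.
case session.turn_detection
when OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad
  puts "server VAD, threshold=#{session.turn_detection.threshold.inspect}"
when OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad
  puts "semantic VAD, eagerness=#{session.turn_detection.eagerness.inspect}"
when nil
  puts "turn detection disabled"
end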
@@ -299,7 +299,7 @@ module OpenAI
  end
  attr_writer :voice
 
- # Realtime session object.
+ # Realtime session object for the beta interface.
  sig do
  params(
  id: String,
@@ -336,7 +336,10 @@ module OpenAI
  ),
  turn_detection:
  T.nilable(
-
+ T.any(
+ OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad::OrHash,
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::OrHash
+ )
  ),
  voice:
  T.any(String, OpenAI::Realtime::RealtimeSession::Voice::OrSymbol)
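With the change above, the `RealtimeSession` constructor sig accepts `turn_detection` either as one of the new model instances or as their `OrHash` form, i.e. a plain hash. A hedged sketch (it assumes the other `RealtimeSession.new` keywords can be omitted; the values are illustrative):

# Sketch only: passing turn_detection as a hash, per the ServerVad::OrHash type above.
session = OpenAI::Realtime::RealtimeSession.new(
  turn_detection: {
    type: :server_vad,        # discriminates between ServerVad and SemanticVad
    threshold: 0.6,           # VAD activation threshold (0.0 to 1.0)
    silence_duration_ms: 400  # silence, in ms, that ends the user's turn
  }
)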
@@ -420,14 +423,17 @@ module OpenAI
  tracing: nil,
  # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
  # set to `null` to turn off, in which case the client must manually trigger model
- # response.
- #
- #
- #
- #
- #
- #
- #
+ # response.
+ #
+ # Server VAD means that the model will detect the start and end of speech based on
+ # audio volume and respond at the end of user speech.
+ #
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ # with VAD) to semantically estimate whether the user has finished speaking, then
+ # dynamically sets a timeout based on this probability. For example, if user audio
+ # trails off with "uhhm", the model will score a low probability of turn end and
+ # wait longer for the user to continue speaking. This can be useful for more
+ # natural conversations, but may have a higher latency.
  turn_detection: nil,
  # The voice the model uses to respond. Voice cannot be changed during the session
  # once the model has responded with audio at least once. Current voice options are
@@ -472,7 +478,12 @@ module OpenAI
  )
  ),
  turn_detection:
- T.nilable(
+ T.nilable(
+ T.any(
+ OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad,
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad
+ )
+ ),
  voice:
  T.any(
  String,
@@ -864,256 +875,320 @@ module OpenAI
  end
  end
 
-
-
+ # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
+ # set to `null` to turn off, in which case the client must manually trigger model
+ # response.
+ #
+ # Server VAD means that the model will detect the start and end of speech based on
+ # audio volume and respond at the end of user speech.
+ #
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ # with VAD) to semantically estimate whether the user has finished speaking, then
+ # dynamically sets a timeout based on this probability. For example, if user audio
+ # trails off with "uhhm", the model will score a low probability of turn end and
+ # wait longer for the user to continue speaking. This can be useful for more
+ # natural conversations, but may have a higher latency.
+ module TurnDetection
+ extend OpenAI::Internal::Type::Union
+
+ Variants =
  T.type_alias do
  T.any(
- OpenAI::Realtime::RealtimeSession::TurnDetection,
- OpenAI::
+ OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad,
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad
  )
  end
 
-
-
-
-
-
-
-
-
- # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
- # will wait longer for the user to continue speaking, `high` will respond more
- # quickly. `auto` is the default and is equivalent to `medium`.
- sig do
- returns(
- T.nilable(
- OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::OrSymbol
- )
- )
- end
- attr_reader :eagerness
-
- sig do
- params(
- eagerness:
- OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::OrSymbol
- ).void
- end
- attr_writer :eagerness
-
- # Optional idle timeout after which turn detection will auto-timeout when no
- # additional audio is received.
- sig { returns(T.nilable(Integer)) }
- attr_accessor :idle_timeout_ms
-
- # Whether or not to automatically interrupt any ongoing response with output to
- # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
- # occurs.
- sig { returns(T.nilable(T::Boolean)) }
- attr_reader :interrupt_response
-
- sig { params(interrupt_response: T::Boolean).void }
- attr_writer :interrupt_response
-
- # Used only for `server_vad` mode. Amount of audio to include before the VAD
- # detected speech (in milliseconds). Defaults to 300ms.
- sig { returns(T.nilable(Integer)) }
- attr_reader :prefix_padding_ms
-
- sig { params(prefix_padding_ms: Integer).void }
- attr_writer :prefix_padding_ms
-
- # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
- # milliseconds). Defaults to 500ms. With shorter values the model will respond
- # more quickly, but may jump in on short pauses from the user.
- sig { returns(T.nilable(Integer)) }
- attr_reader :silence_duration_ms
-
- sig { params(silence_duration_ms: Integer).void }
- attr_writer :silence_duration_ms
-
- # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
- # defaults to 0.5. A higher threshold will require louder audio to activate the
- # model, and thus might perform better in noisy environments.
- sig { returns(T.nilable(Float)) }
- attr_reader :threshold
-
- sig { params(threshold: Float).void }
- attr_writer :threshold
-
- # Type of turn detection.
- sig do
- returns(
- T.nilable(
- OpenAI::Realtime::RealtimeSession::TurnDetection::Type::OrSymbol
- )
- )
- end
- attr_reader :type
+ class ServerVad < OpenAI::Internal::Type::BaseModel
+ OrHash =
+ T.type_alias do
+ T.any(
+ OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad,
+ OpenAI::Internal::AnyHash
+ )
+ end
 
-
-
-
- OpenAI::Realtime::RealtimeSession::TurnDetection::Type::OrSymbol
- ).void
- end
- attr_writer :type
+ # Type of turn detection, `server_vad` to turn on simple Server VAD.
+ sig { returns(Symbol) }
+ attr_accessor :type
 
- # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
- # set to `null` to turn off, in which case the client must manually trigger model
- # response. Server VAD means that the model will detect the start and end of
- # speech based on audio volume and respond at the end of user speech. Semantic VAD
- # is more advanced and uses a turn detection model (in conjunction with VAD) to
- # semantically estimate whether the user has finished speaking, then dynamically
- # sets a timeout based on this probability. For example, if user audio trails off
- # with "uhhm", the model will score a low probability of turn end and wait longer
- # for the user to continue speaking. This can be useful for more natural
- # conversations, but may have a higher latency.
- sig do
- params(
- create_response: T::Boolean,
- eagerness:
- OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::OrSymbol,
- idle_timeout_ms: T.nilable(Integer),
- interrupt_response: T::Boolean,
- prefix_padding_ms: Integer,
- silence_duration_ms: Integer,
- threshold: Float,
- type:
- OpenAI::Realtime::RealtimeSession::TurnDetection::Type::OrSymbol
- ).returns(T.attached_class)
- end
- def self.new(
  # Whether or not to automatically generate a response when a VAD stop event
  # occurs.
-
-
-
-
-
-
- #
-
+ sig { returns(T.nilable(T::Boolean)) }
+ attr_reader :create_response
+
+ sig { params(create_response: T::Boolean).void }
+ attr_writer :create_response
+
+ # Optional timeout after which a model response will be triggered automatically.
+ # This is useful for situations in which a long pause from the user is unexpected,
+ # such as a phone call. The model will effectively prompt the user to continue the
+ # conversation based on the current context.
+ #
+ # The timeout value will be applied after the last model response's audio has
+ # finished playing, i.e. it's set to the `response.done` time plus audio playback
+ # duration.
+ #
+ # An `input_audio_buffer.timeout_triggered` event (plus events associated with the
+ # Response) will be emitted when the timeout is reached. Idle timeout is currently
+ # only supported for `server_vad` mode.
+ sig { returns(T.nilable(Integer)) }
+ attr_accessor :idle_timeout_ms
+
  # Whether or not to automatically interrupt any ongoing response with output to
  # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
  # occurs.
-
+ sig { returns(T.nilable(T::Boolean)) }
+ attr_reader :interrupt_response
+
+ sig { params(interrupt_response: T::Boolean).void }
+ attr_writer :interrupt_response
+
  # Used only for `server_vad` mode. Amount of audio to include before the VAD
  # detected speech (in milliseconds). Defaults to 300ms.
-
+ sig { returns(T.nilable(Integer)) }
+ attr_reader :prefix_padding_ms
+
+ sig { params(prefix_padding_ms: Integer).void }
+ attr_writer :prefix_padding_ms
+
  # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
  # milliseconds). Defaults to 500ms. With shorter values the model will respond
  # more quickly, but may jump in on short pauses from the user.
-
+ sig { returns(T.nilable(Integer)) }
+ attr_reader :silence_duration_ms
+
+ sig { params(silence_duration_ms: Integer).void }
+ attr_writer :silence_duration_ms
+
  # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
  # defaults to 0.5. A higher threshold will require louder audio to activate the
  # model, and thus might perform better in noisy environments.
-
-
- type: nil
- )
- end
+ sig { returns(T.nilable(Float)) }
+ attr_reader :threshold
 
-
-
-
+ sig { params(threshold: Float).void }
+ attr_writer :threshold
+
+ # Server-side voice activity detection (VAD) which flips on when user speech is
+ # detected and off after a period of silence.
+ sig do
+ params(
  create_response: T::Boolean,
- eagerness:
- OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::OrSymbol,
  idle_timeout_ms: T.nilable(Integer),
  interrupt_response: T::Boolean,
  prefix_padding_ms: Integer,
  silence_duration_ms: Integer,
  threshold: Float,
- type:
-
-
+ type: Symbol
+ ).returns(T.attached_class)
+ end
+ def self.new(
+ # Whether or not to automatically generate a response when a VAD stop event
+ # occurs.
+ create_response: nil,
+ # Optional timeout after which a model response will be triggered automatically.
+ # This is useful for situations in which a long pause from the user is unexpected,
+ # such as a phone call. The model will effectively prompt the user to continue the
+ # conversation based on the current context.
+ #
+ # The timeout value will be applied after the last model response's audio has
+ # finished playing, i.e. it's set to the `response.done` time plus audio playback
+ # duration.
+ #
+ # An `input_audio_buffer.timeout_triggered` event (plus events associated with the
+ # Response) will be emitted when the timeout is reached. Idle timeout is currently
+ # only supported for `server_vad` mode.
+ idle_timeout_ms: nil,
+ # Whether or not to automatically interrupt any ongoing response with output to
+ # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+ # occurs.
+ interrupt_response: nil,
+ # Used only for `server_vad` mode. Amount of audio to include before the VAD
+ # detected speech (in milliseconds). Defaults to 300ms.
+ prefix_padding_ms: nil,
+ # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
+ # milliseconds). Defaults to 500ms. With shorter values the model will respond
+ # more quickly, but may jump in on short pauses from the user.
+ silence_duration_ms: nil,
+ # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
+ # defaults to 0.5. A higher threshold will require louder audio to activate the
+ # model, and thus might perform better in noisy environments.
+ threshold: nil,
+ # Type of turn detection, `server_vad` to turn on simple Server VAD.
+ type: :server_vad
  )
-
- def to_hash
- end
+ end
 
-
-
-
-
-
+ sig do
+ override.returns(
+ {
+ type: Symbol,
+ create_response: T::Boolean,
+ idle_timeout_ms: T.nilable(Integer),
+ interrupt_response: T::Boolean,
+ prefix_padding_ms: Integer,
+ silence_duration_ms: Integer,
+ threshold: Float
+ }
+ )
+ end
+ def to_hash
+ end
+ end
 
-
+ class SemanticVad < OpenAI::Internal::Type::BaseModel
+ OrHash =
  T.type_alias do
- T.
-
- OpenAI::
+ T.any(
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad,
+ OpenAI::Internal::AnyHash
  )
  end
- OrSymbol = T.type_alias { T.any(Symbol, String) }
 
-
-
-
- OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::TaggedSymbol
- )
- MEDIUM =
- T.let(
- :medium,
- OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::TaggedSymbol
- )
- HIGH =
- T.let(
- :high,
- OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::TaggedSymbol
- )
- AUTO =
- T.let(
- :auto,
- OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::TaggedSymbol
- )
+ # Type of turn detection, `semantic_vad` to turn on Semantic VAD.
+ sig { returns(Symbol) }
+ attr_accessor :type
 
+ # Whether or not to automatically generate a response when a VAD stop event
+ # occurs.
+ sig { returns(T.nilable(T::Boolean)) }
+ attr_reader :create_response
+
+ sig { params(create_response: T::Boolean).void }
+ attr_writer :create_response
+
+ # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
+ # will wait longer for the user to continue speaking, `high` will respond more
+ # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
+ # and `high` have max timeouts of 8s, 4s, and 2s respectively.
  sig do
-
- T
- OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::
-
+ returns(
+ T.nilable(
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::OrSymbol
+ )
  )
  end
-
+ attr_reader :eagerness
+
+ sig do
+ params(
+ eagerness:
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::OrSymbol
+ ).void
  end
-
+ attr_writer :eagerness
 
-
-
-
+ # Whether or not to automatically interrupt any ongoing response with output to
+ # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+ # occurs.
+ sig { returns(T.nilable(T::Boolean)) }
+ attr_reader :interrupt_response
 
-
-
- T.all(
- Symbol,
- OpenAI::Realtime::RealtimeSession::TurnDetection::Type
- )
- end
- OrSymbol = T.type_alias { T.any(Symbol, String) }
+ sig { params(interrupt_response: T::Boolean).void }
+ attr_writer :interrupt_response
 
-
-
-
-
-
-
-
- :
-
- )
+ # Server-side semantic turn detection which uses a model to determine when the
+ # user has finished speaking.
+ sig do
+ params(
+ create_response: T::Boolean,
+ eagerness:
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::OrSymbol,
+ interrupt_response: T::Boolean,
+ type: Symbol
+ ).returns(T.attached_class)
+ end
+ def self.new(
+ # Whether or not to automatically generate a response when a VAD stop event
+ # occurs.
+ create_response: nil,
+ # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
+ # will wait longer for the user to continue speaking, `high` will respond more
+ # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
+ # and `high` have max timeouts of 8s, 4s, and 2s respectively.
+ eagerness: nil,
+ # Whether or not to automatically interrupt any ongoing response with output to
+ # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+ # occurs.
+ interrupt_response: nil,
+ # Type of turn detection, `semantic_vad` to turn on Semantic VAD.
+ type: :semantic_vad
+ )
+ end
 
  sig do
  override.returns(
-
-
-
+ {
+ type: Symbol,
+ create_response: T::Boolean,
+ eagerness:
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::OrSymbol,
+ interrupt_response: T::Boolean
+ }
  )
  end
- def
+ def to_hash
+ end
+
+ # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
+ # will wait longer for the user to continue speaking, `high` will respond more
+ # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
+ # and `high` have max timeouts of 8s, 4s, and 2s respectively.
+ module Eagerness
+ extend OpenAI::Internal::Type::Enum
+
+ TaggedSymbol =
+ T.type_alias do
+ T.all(
+ Symbol,
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness
+ )
+ end
+ OrSymbol = T.type_alias { T.any(Symbol, String) }
+
+ LOW =
+ T.let(
+ :low,
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::TaggedSymbol
+ )
+ MEDIUM =
+ T.let(
+ :medium,
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::TaggedSymbol
+ )
+ HIGH =
+ T.let(
+ :high,
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::TaggedSymbol
+ )
+ AUTO =
+ T.let(
+ :auto,
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::TaggedSymbol
+ )
+
+ sig do
+ override.returns(
+ T::Array[
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::TaggedSymbol
+ ]
+ )
+ end
+ def self.values
+ end
  end
  end
+
+ sig do
+ override.returns(
+ T::Array[
+ OpenAI::Realtime::RealtimeSession::TurnDetection::Variants
+ ]
+ )
+ end
+ def self.variants
+ end
  end
 
  # The voice the model uses to respond. Voice cannot be changed during the session
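Taken together, this hunk replaces the old flat `TurnDetection` model (with its own `Eagerness` and `Type` enums) with a `TurnDetection` union over two `BaseModel` subclasses, `ServerVad` and `SemanticVad`, whose `type` fields default to `:server_vad` and `:semantic_vad`. A hedged construction sketch based on the sigs above (values are illustrative):

# Sketch only: building each turn-detection variant added in 0.23.2.
server_vad = OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad.new(
  idle_timeout_ms: 6_000,    # emits `input_audio_buffer.timeout_triggered` when reached
  prefix_padding_ms: 300,    # audio included before detected speech, in ms
  silence_duration_ms: 500,  # silence that ends the turn, in ms
  threshold: 0.5             # `type` defaults to :server_vad
)

semantic_vad = OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad.new(
  eagerness: :high,          # :low / :medium / :high / :auto (max timeouts 8s / 4s / 2s)
  interrupt_response: true   # `type` defaults to :semantic_vad
)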
|