openai 0.23.1 → 0.23.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/README.md +1 -1
  4. data/lib/openai/models/realtime/input_audio_buffer_timeout_triggered.rb +25 -5
  5. data/lib/openai/models/realtime/realtime_audio_config_input.rb +14 -11
  6. data/lib/openai/models/realtime/realtime_audio_input_turn_detection.rb +173 -117
  7. data/lib/openai/models/realtime/realtime_server_event.rb +13 -1
  8. data/lib/openai/models/realtime/realtime_session.rb +179 -118
  9. data/lib/openai/models/realtime/realtime_session_create_response.rb +184 -122
  10. data/lib/openai/models/realtime/realtime_transcription_session_audio_input.rb +16 -11
  11. data/lib/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rb +175 -117
  12. data/lib/openai/models/responses/response.rb +8 -8
  13. data/lib/openai/models/responses/response_create_params.rb +8 -8
  14. data/lib/openai/version.rb +1 -1
  15. data/rbi/openai/models/realtime/input_audio_buffer_timeout_triggered.rbi +24 -5
  16. data/rbi/openai/models/realtime/realtime_audio_config_input.rbi +44 -28
  17. data/rbi/openai/models/realtime/realtime_audio_input_turn_detection.rbi +264 -203
  18. data/rbi/openai/models/realtime/realtime_session.rbi +306 -231
  19. data/rbi/openai/models/realtime/realtime_session_create_response.rbi +298 -232
  20. data/rbi/openai/models/realtime/realtime_transcription_session_audio_input.rbi +39 -28
  21. data/rbi/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbi +264 -200
  22. data/rbi/openai/models/responses/response.rbi +12 -12
  23. data/rbi/openai/models/responses/response_create_params.rbi +12 -12
  24. data/rbi/openai/resources/responses.rbi +8 -8
  25. data/sig/openai/models/realtime/realtime_audio_config_input.rbs +4 -8
  26. data/sig/openai/models/realtime/realtime_audio_input_turn_detection.rbs +91 -65
  27. data/sig/openai/models/realtime/realtime_session.rbs +95 -69
  28. data/sig/openai/models/realtime/realtime_session_create_response.rbs +95 -73
  29. data/sig/openai/models/realtime/realtime_transcription_session_audio_input.rbs +4 -8
  30. data/sig/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbs +91 -65
  31. metadata +2 -2
@@ -256,28 +256,28 @@ module OpenAI
256
256
 
257
257
  # Configuration for turn detection, either Server VAD or Semantic VAD. This can be
258
258
  # set to `null` to turn off, in which case the client must manually trigger model
259
- # response. Server VAD means that the model will detect the start and end of
260
- # speech based on audio volume and respond at the end of user speech. Semantic VAD
261
- # is more advanced and uses a turn detection model (in conjunction with VAD) to
262
- # semantically estimate whether the user has finished speaking, then dynamically
263
- # sets a timeout based on this probability. For example, if user audio trails off
264
- # with "uhhm", the model will score a low probability of turn end and wait longer
265
- # for the user to continue speaking. This can be useful for more natural
266
- # conversations, but may have a higher latency.
267
- sig do
268
- returns(T.nilable(OpenAI::Realtime::RealtimeSession::TurnDetection))
269
- end
270
- attr_reader :turn_detection
271
-
259
+ # response.
260
+ #
261
+ # Server VAD means that the model will detect the start and end of speech based on
262
+ # audio volume and respond at the end of user speech.
263
+ #
264
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
265
+ # with VAD) to semantically estimate whether the user has finished speaking, then
266
+ # dynamically sets a timeout based on this probability. For example, if user audio
267
+ # trails off with "uhhm", the model will score a low probability of turn end and
268
+ # wait longer for the user to continue speaking. This can be useful for more
269
+ # natural conversations, but may have a higher latency.
272
270
  sig do
273
- params(
274
- turn_detection:
275
- T.nilable(
276
- OpenAI::Realtime::RealtimeSession::TurnDetection::OrHash
271
+ returns(
272
+ T.nilable(
273
+ T.any(
274
+ OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad,
275
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad
277
276
  )
278
- ).void
277
+ )
278
+ )
279
279
  end
280
- attr_writer :turn_detection
280
+ attr_accessor :turn_detection
281
281
 
282
282
  # The voice the model uses to respond. Voice cannot be changed during the session
283
283
  # once the model has responded with audio at least once. Current voice options are
@@ -299,7 +299,7 @@ module OpenAI
299
299
  end
300
300
  attr_writer :voice
301
301
 
302
- # Realtime session object.
302
+ # Realtime session object for the beta interface.
303
303
  sig do
304
304
  params(
305
305
  id: String,
@@ -336,7 +336,10 @@ module OpenAI
336
336
  ),
337
337
  turn_detection:
338
338
  T.nilable(
339
- OpenAI::Realtime::RealtimeSession::TurnDetection::OrHash
339
+ T.any(
340
+ OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad::OrHash,
341
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::OrHash
342
+ )
340
343
  ),
341
344
  voice:
342
345
  T.any(String, OpenAI::Realtime::RealtimeSession::Voice::OrSymbol)
@@ -420,14 +423,17 @@ module OpenAI
420
423
  tracing: nil,
421
424
  # Configuration for turn detection, either Server VAD or Semantic VAD. This can be
422
425
  # set to `null` to turn off, in which case the client must manually trigger model
423
- # response. Server VAD means that the model will detect the start and end of
424
- # speech based on audio volume and respond at the end of user speech. Semantic VAD
425
- # is more advanced and uses a turn detection model (in conjunction with VAD) to
426
- # semantically estimate whether the user has finished speaking, then dynamically
427
- # sets a timeout based on this probability. For example, if user audio trails off
428
- # with "uhhm", the model will score a low probability of turn end and wait longer
429
- # for the user to continue speaking. This can be useful for more natural
430
- # conversations, but may have a higher latency.
426
+ # response.
427
+ #
428
+ # Server VAD means that the model will detect the start and end of speech based on
429
+ # audio volume and respond at the end of user speech.
430
+ #
431
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
432
+ # with VAD) to semantically estimate whether the user has finished speaking, then
433
+ # dynamically sets a timeout based on this probability. For example, if user audio
434
+ # trails off with "uhhm", the model will score a low probability of turn end and
435
+ # wait longer for the user to continue speaking. This can be useful for more
436
+ # natural conversations, but may have a higher latency.
431
437
  turn_detection: nil,
432
438
  # The voice the model uses to respond. Voice cannot be changed during the session
433
439
  # once the model has responded with audio at least once. Current voice options are
@@ -472,7 +478,12 @@ module OpenAI
472
478
  )
473
479
  ),
474
480
  turn_detection:
475
- T.nilable(OpenAI::Realtime::RealtimeSession::TurnDetection),
481
+ T.nilable(
482
+ T.any(
483
+ OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad,
484
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad
485
+ )
486
+ ),
476
487
  voice:
477
488
  T.any(
478
489
  String,
@@ -864,256 +875,320 @@ module OpenAI
864
875
  end
865
876
  end
866
877
 
867
- class TurnDetection < OpenAI::Internal::Type::BaseModel
868
- OrHash =
878
+ # Configuration for turn detection, either Server VAD or Semantic VAD. This can be
879
+ # set to `null` to turn off, in which case the client must manually trigger model
880
+ # response.
881
+ #
882
+ # Server VAD means that the model will detect the start and end of speech based on
883
+ # audio volume and respond at the end of user speech.
884
+ #
885
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
886
+ # with VAD) to semantically estimate whether the user has finished speaking, then
887
+ # dynamically sets a timeout based on this probability. For example, if user audio
888
+ # trails off with "uhhm", the model will score a low probability of turn end and
889
+ # wait longer for the user to continue speaking. This can be useful for more
890
+ # natural conversations, but may have a higher latency.
891
+ module TurnDetection
892
+ extend OpenAI::Internal::Type::Union
893
+
894
+ Variants =
869
895
  T.type_alias do
870
896
  T.any(
871
- OpenAI::Realtime::RealtimeSession::TurnDetection,
872
- OpenAI::Internal::AnyHash
897
+ OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad,
898
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad
873
899
  )
874
900
  end
875
901
 
876
- # Whether or not to automatically generate a response when a VAD stop event
877
- # occurs.
878
- sig { returns(T.nilable(T::Boolean)) }
879
- attr_reader :create_response
880
-
881
- sig { params(create_response: T::Boolean).void }
882
- attr_writer :create_response
883
-
884
- # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
885
- # will wait longer for the user to continue speaking, `high` will respond more
886
- # quickly. `auto` is the default and is equivalent to `medium`.
887
- sig do
888
- returns(
889
- T.nilable(
890
- OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::OrSymbol
891
- )
892
- )
893
- end
894
- attr_reader :eagerness
895
-
896
- sig do
897
- params(
898
- eagerness:
899
- OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::OrSymbol
900
- ).void
901
- end
902
- attr_writer :eagerness
903
-
904
- # Optional idle timeout after which turn detection will auto-timeout when no
905
- # additional audio is received.
906
- sig { returns(T.nilable(Integer)) }
907
- attr_accessor :idle_timeout_ms
908
-
909
- # Whether or not to automatically interrupt any ongoing response with output to
910
- # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
911
- # occurs.
912
- sig { returns(T.nilable(T::Boolean)) }
913
- attr_reader :interrupt_response
914
-
915
- sig { params(interrupt_response: T::Boolean).void }
916
- attr_writer :interrupt_response
917
-
918
- # Used only for `server_vad` mode. Amount of audio to include before the VAD
919
- # detected speech (in milliseconds). Defaults to 300ms.
920
- sig { returns(T.nilable(Integer)) }
921
- attr_reader :prefix_padding_ms
922
-
923
- sig { params(prefix_padding_ms: Integer).void }
924
- attr_writer :prefix_padding_ms
925
-
926
- # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
927
- # milliseconds). Defaults to 500ms. With shorter values the model will respond
928
- # more quickly, but may jump in on short pauses from the user.
929
- sig { returns(T.nilable(Integer)) }
930
- attr_reader :silence_duration_ms
931
-
932
- sig { params(silence_duration_ms: Integer).void }
933
- attr_writer :silence_duration_ms
934
-
935
- # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
936
- # defaults to 0.5. A higher threshold will require louder audio to activate the
937
- # model, and thus might perform better in noisy environments.
938
- sig { returns(T.nilable(Float)) }
939
- attr_reader :threshold
940
-
941
- sig { params(threshold: Float).void }
942
- attr_writer :threshold
943
-
944
- # Type of turn detection.
945
- sig do
946
- returns(
947
- T.nilable(
948
- OpenAI::Realtime::RealtimeSession::TurnDetection::Type::OrSymbol
949
- )
950
- )
951
- end
952
- attr_reader :type
902
+ class ServerVad < OpenAI::Internal::Type::BaseModel
903
+ OrHash =
904
+ T.type_alias do
905
+ T.any(
906
+ OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad,
907
+ OpenAI::Internal::AnyHash
908
+ )
909
+ end
953
910
 
954
- sig do
955
- params(
956
- type:
957
- OpenAI::Realtime::RealtimeSession::TurnDetection::Type::OrSymbol
958
- ).void
959
- end
960
- attr_writer :type
911
+ # Type of turn detection, `server_vad` to turn on simple Server VAD.
912
+ sig { returns(Symbol) }
913
+ attr_accessor :type
961
914
 
962
- # Configuration for turn detection, either Server VAD or Semantic VAD. This can be
963
- # set to `null` to turn off, in which case the client must manually trigger model
964
- # response. Server VAD means that the model will detect the start and end of
965
- # speech based on audio volume and respond at the end of user speech. Semantic VAD
966
- # is more advanced and uses a turn detection model (in conjunction with VAD) to
967
- # semantically estimate whether the user has finished speaking, then dynamically
968
- # sets a timeout based on this probability. For example, if user audio trails off
969
- # with "uhhm", the model will score a low probability of turn end and wait longer
970
- # for the user to continue speaking. This can be useful for more natural
971
- # conversations, but may have a higher latency.
972
- sig do
973
- params(
974
- create_response: T::Boolean,
975
- eagerness:
976
- OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::OrSymbol,
977
- idle_timeout_ms: T.nilable(Integer),
978
- interrupt_response: T::Boolean,
979
- prefix_padding_ms: Integer,
980
- silence_duration_ms: Integer,
981
- threshold: Float,
982
- type:
983
- OpenAI::Realtime::RealtimeSession::TurnDetection::Type::OrSymbol
984
- ).returns(T.attached_class)
985
- end
986
- def self.new(
987
915
  # Whether or not to automatically generate a response when a VAD stop event
988
916
  # occurs.
989
- create_response: nil,
990
- # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
991
- # will wait longer for the user to continue speaking, `high` will respond more
992
- # quickly. `auto` is the default and is equivalent to `medium`.
993
- eagerness: nil,
994
- # Optional idle timeout after which turn detection will auto-timeout when no
995
- # additional audio is received.
996
- idle_timeout_ms: nil,
917
+ sig { returns(T.nilable(T::Boolean)) }
918
+ attr_reader :create_response
919
+
920
+ sig { params(create_response: T::Boolean).void }
921
+ attr_writer :create_response
922
+
923
+ # Optional timeout after which a model response will be triggered automatically.
924
+ # This is useful for situations in which a long pause from the user is unexpected,
925
+ # such as a phone call. The model will effectively prompt the user to continue the
926
+ # conversation based on the current context.
927
+ #
928
+ # The timeout value will be applied after the last model response's audio has
929
+ # finished playing, i.e. it's set to the `response.done` time plus audio playback
930
+ # duration.
931
+ #
932
+ # An `input_audio_buffer.timeout_triggered` event (plus events associated with the
933
+ # Response) will be emitted when the timeout is reached. Idle timeout is currently
934
+ # only supported for `server_vad` mode.
935
+ sig { returns(T.nilable(Integer)) }
936
+ attr_accessor :idle_timeout_ms
937
+
997
938
  # Whether or not to automatically interrupt any ongoing response with output to
998
939
  # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
999
940
  # occurs.
1000
- interrupt_response: nil,
941
+ sig { returns(T.nilable(T::Boolean)) }
942
+ attr_reader :interrupt_response
943
+
944
+ sig { params(interrupt_response: T::Boolean).void }
945
+ attr_writer :interrupt_response
946
+
1001
947
  # Used only for `server_vad` mode. Amount of audio to include before the VAD
1002
948
  # detected speech (in milliseconds). Defaults to 300ms.
1003
- prefix_padding_ms: nil,
949
+ sig { returns(T.nilable(Integer)) }
950
+ attr_reader :prefix_padding_ms
951
+
952
+ sig { params(prefix_padding_ms: Integer).void }
953
+ attr_writer :prefix_padding_ms
954
+
1004
955
  # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
1005
956
  # milliseconds). Defaults to 500ms. With shorter values the model will respond
1006
957
  # more quickly, but may jump in on short pauses from the user.
1007
- silence_duration_ms: nil,
958
+ sig { returns(T.nilable(Integer)) }
959
+ attr_reader :silence_duration_ms
960
+
961
+ sig { params(silence_duration_ms: Integer).void }
962
+ attr_writer :silence_duration_ms
963
+
1008
964
  # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
1009
965
  # defaults to 0.5. A higher threshold will require louder audio to activate the
1010
966
  # model, and thus might perform better in noisy environments.
1011
- threshold: nil,
1012
- # Type of turn detection.
1013
- type: nil
1014
- )
1015
- end
967
+ sig { returns(T.nilable(Float)) }
968
+ attr_reader :threshold
1016
969
 
1017
- sig do
1018
- override.returns(
1019
- {
970
+ sig { params(threshold: Float).void }
971
+ attr_writer :threshold
972
+
973
+ # Server-side voice activity detection (VAD) which flips on when user speech is
974
+ # detected and off after a period of silence.
975
+ sig do
976
+ params(
1020
977
  create_response: T::Boolean,
1021
- eagerness:
1022
- OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::OrSymbol,
1023
978
  idle_timeout_ms: T.nilable(Integer),
1024
979
  interrupt_response: T::Boolean,
1025
980
  prefix_padding_ms: Integer,
1026
981
  silence_duration_ms: Integer,
1027
982
  threshold: Float,
1028
- type:
1029
- OpenAI::Realtime::RealtimeSession::TurnDetection::Type::OrSymbol
1030
- }
983
+ type: Symbol
984
+ ).returns(T.attached_class)
985
+ end
986
+ def self.new(
987
+ # Whether or not to automatically generate a response when a VAD stop event
988
+ # occurs.
989
+ create_response: nil,
990
+ # Optional timeout after which a model response will be triggered automatically.
991
+ # This is useful for situations in which a long pause from the user is unexpected,
992
+ # such as a phone call. The model will effectively prompt the user to continue the
993
+ # conversation based on the current context.
994
+ #
995
+ # The timeout value will be applied after the last model response's audio has
996
+ # finished playing, i.e. it's set to the `response.done` time plus audio playback
997
+ # duration.
998
+ #
999
+ # An `input_audio_buffer.timeout_triggered` event (plus events associated with the
1000
+ # Response) will be emitted when the timeout is reached. Idle timeout is currently
1001
+ # only supported for `server_vad` mode.
1002
+ idle_timeout_ms: nil,
1003
+ # Whether or not to automatically interrupt any ongoing response with output to
1004
+ # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
1005
+ # occurs.
1006
+ interrupt_response: nil,
1007
+ # Used only for `server_vad` mode. Amount of audio to include before the VAD
1008
+ # detected speech (in milliseconds). Defaults to 300ms.
1009
+ prefix_padding_ms: nil,
1010
+ # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
1011
+ # milliseconds). Defaults to 500ms. With shorter values the model will respond
1012
+ # more quickly, but may jump in on short pauses from the user.
1013
+ silence_duration_ms: nil,
1014
+ # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
1015
+ # defaults to 0.5. A higher threshold will require louder audio to activate the
1016
+ # model, and thus might perform better in noisy environments.
1017
+ threshold: nil,
1018
+ # Type of turn detection, `server_vad` to turn on simple Server VAD.
1019
+ type: :server_vad
1031
1020
  )
1032
- end
1033
- def to_hash
1034
- end
1021
+ end
1035
1022
 
1036
- # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
1037
- # will wait longer for the user to continue speaking, `high` will respond more
1038
- # quickly. `auto` is the default and is equivalent to `medium`.
1039
- module Eagerness
1040
- extend OpenAI::Internal::Type::Enum
1023
+ sig do
1024
+ override.returns(
1025
+ {
1026
+ type: Symbol,
1027
+ create_response: T::Boolean,
1028
+ idle_timeout_ms: T.nilable(Integer),
1029
+ interrupt_response: T::Boolean,
1030
+ prefix_padding_ms: Integer,
1031
+ silence_duration_ms: Integer,
1032
+ threshold: Float
1033
+ }
1034
+ )
1035
+ end
1036
+ def to_hash
1037
+ end
1038
+ end
1041
1039
 
1042
- TaggedSymbol =
1040
+ class SemanticVad < OpenAI::Internal::Type::BaseModel
1041
+ OrHash =
1043
1042
  T.type_alias do
1044
- T.all(
1045
- Symbol,
1046
- OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness
1043
+ T.any(
1044
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad,
1045
+ OpenAI::Internal::AnyHash
1047
1046
  )
1048
1047
  end
1049
- OrSymbol = T.type_alias { T.any(Symbol, String) }
1050
1048
 
1051
- LOW =
1052
- T.let(
1053
- :low,
1054
- OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::TaggedSymbol
1055
- )
1056
- MEDIUM =
1057
- T.let(
1058
- :medium,
1059
- OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::TaggedSymbol
1060
- )
1061
- HIGH =
1062
- T.let(
1063
- :high,
1064
- OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::TaggedSymbol
1065
- )
1066
- AUTO =
1067
- T.let(
1068
- :auto,
1069
- OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::TaggedSymbol
1070
- )
1049
+ # Type of turn detection, `semantic_vad` to turn on Semantic VAD.
1050
+ sig { returns(Symbol) }
1051
+ attr_accessor :type
1071
1052
 
1053
+ # Whether or not to automatically generate a response when a VAD stop event
1054
+ # occurs.
1055
+ sig { returns(T.nilable(T::Boolean)) }
1056
+ attr_reader :create_response
1057
+
1058
+ sig { params(create_response: T::Boolean).void }
1059
+ attr_writer :create_response
1060
+
1061
+ # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
1062
+ # will wait longer for the user to continue speaking, `high` will respond more
1063
+ # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
1064
+ # and `high` have max timeouts of 8s, 4s, and 2s respectively.
1072
1065
  sig do
1073
- override.returns(
1074
- T::Array[
1075
- OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness::TaggedSymbol
1076
- ]
1066
+ returns(
1067
+ T.nilable(
1068
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::OrSymbol
1069
+ )
1077
1070
  )
1078
1071
  end
1079
- def self.values
1072
+ attr_reader :eagerness
1073
+
1074
+ sig do
1075
+ params(
1076
+ eagerness:
1077
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::OrSymbol
1078
+ ).void
1080
1079
  end
1081
- end
1080
+ attr_writer :eagerness
1082
1081
 
1083
- # Type of turn detection.
1084
- module Type
1085
- extend OpenAI::Internal::Type::Enum
1082
+ # Whether or not to automatically interrupt any ongoing response with output to
1083
+ # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
1084
+ # occurs.
1085
+ sig { returns(T.nilable(T::Boolean)) }
1086
+ attr_reader :interrupt_response
1086
1087
 
1087
- TaggedSymbol =
1088
- T.type_alias do
1089
- T.all(
1090
- Symbol,
1091
- OpenAI::Realtime::RealtimeSession::TurnDetection::Type
1092
- )
1093
- end
1094
- OrSymbol = T.type_alias { T.any(Symbol, String) }
1088
+ sig { params(interrupt_response: T::Boolean).void }
1089
+ attr_writer :interrupt_response
1095
1090
 
1096
- SERVER_VAD =
1097
- T.let(
1098
- :server_vad,
1099
- OpenAI::Realtime::RealtimeSession::TurnDetection::Type::TaggedSymbol
1100
- )
1101
- SEMANTIC_VAD =
1102
- T.let(
1103
- :semantic_vad,
1104
- OpenAI::Realtime::RealtimeSession::TurnDetection::Type::TaggedSymbol
1105
- )
1091
+ # Server-side semantic turn detection which uses a model to determine when the
1092
+ # user has finished speaking.
1093
+ sig do
1094
+ params(
1095
+ create_response: T::Boolean,
1096
+ eagerness:
1097
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::OrSymbol,
1098
+ interrupt_response: T::Boolean,
1099
+ type: Symbol
1100
+ ).returns(T.attached_class)
1101
+ end
1102
+ def self.new(
1103
+ # Whether or not to automatically generate a response when a VAD stop event
1104
+ # occurs.
1105
+ create_response: nil,
1106
+ # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
1107
+ # will wait longer for the user to continue speaking, `high` will respond more
1108
+ # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
1109
+ # and `high` have max timeouts of 8s, 4s, and 2s respectively.
1110
+ eagerness: nil,
1111
+ # Whether or not to automatically interrupt any ongoing response with output to
1112
+ # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
1113
+ # occurs.
1114
+ interrupt_response: nil,
1115
+ # Type of turn detection, `semantic_vad` to turn on Semantic VAD.
1116
+ type: :semantic_vad
1117
+ )
1118
+ end
1106
1119
 
1107
1120
  sig do
1108
1121
  override.returns(
1109
- T::Array[
1110
- OpenAI::Realtime::RealtimeSession::TurnDetection::Type::TaggedSymbol
1111
- ]
1122
+ {
1123
+ type: Symbol,
1124
+ create_response: T::Boolean,
1125
+ eagerness:
1126
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::OrSymbol,
1127
+ interrupt_response: T::Boolean
1128
+ }
1112
1129
  )
1113
1130
  end
1114
- def self.values
1131
+ def to_hash
1132
+ end
1133
+
1134
+ # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
1135
+ # will wait longer for the user to continue speaking, `high` will respond more
1136
+ # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
1137
+ # and `high` have max timeouts of 8s, 4s, and 2s respectively.
1138
+ module Eagerness
1139
+ extend OpenAI::Internal::Type::Enum
1140
+
1141
+ TaggedSymbol =
1142
+ T.type_alias do
1143
+ T.all(
1144
+ Symbol,
1145
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness
1146
+ )
1147
+ end
1148
+ OrSymbol = T.type_alias { T.any(Symbol, String) }
1149
+
1150
+ LOW =
1151
+ T.let(
1152
+ :low,
1153
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::TaggedSymbol
1154
+ )
1155
+ MEDIUM =
1156
+ T.let(
1157
+ :medium,
1158
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::TaggedSymbol
1159
+ )
1160
+ HIGH =
1161
+ T.let(
1162
+ :high,
1163
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::TaggedSymbol
1164
+ )
1165
+ AUTO =
1166
+ T.let(
1167
+ :auto,
1168
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::TaggedSymbol
1169
+ )
1170
+
1171
+ sig do
1172
+ override.returns(
1173
+ T::Array[
1174
+ OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness::TaggedSymbol
1175
+ ]
1176
+ )
1177
+ end
1178
+ def self.values
1179
+ end
1115
1180
  end
1116
1181
  end
1182
+
1183
+ sig do
1184
+ override.returns(
1185
+ T::Array[
1186
+ OpenAI::Realtime::RealtimeSession::TurnDetection::Variants
1187
+ ]
1188
+ )
1189
+ end
1190
+ def self.variants
1191
+ end
1117
1192
  end
1118
1193
 
1119
1194
  # The voice the model uses to respond. Voice cannot be changed during the session