openai 0.23.1 → 0.23.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/README.md +1 -1
  4. data/lib/openai/models/realtime/input_audio_buffer_timeout_triggered.rb +25 -5
  5. data/lib/openai/models/realtime/realtime_audio_config_input.rb +14 -11
  6. data/lib/openai/models/realtime/realtime_audio_input_turn_detection.rb +173 -117
  7. data/lib/openai/models/realtime/realtime_server_event.rb +13 -1
  8. data/lib/openai/models/realtime/realtime_session.rb +179 -118
  9. data/lib/openai/models/realtime/realtime_session_create_response.rb +184 -122
  10. data/lib/openai/models/realtime/realtime_transcription_session_audio_input.rb +16 -11
  11. data/lib/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rb +175 -117
  12. data/lib/openai/models/responses/response.rb +8 -8
  13. data/lib/openai/models/responses/response_create_params.rb +8 -8
  14. data/lib/openai/version.rb +1 -1
  15. data/rbi/openai/models/realtime/input_audio_buffer_timeout_triggered.rbi +24 -5
  16. data/rbi/openai/models/realtime/realtime_audio_config_input.rbi +44 -28
  17. data/rbi/openai/models/realtime/realtime_audio_input_turn_detection.rbi +264 -203
  18. data/rbi/openai/models/realtime/realtime_session.rbi +306 -231
  19. data/rbi/openai/models/realtime/realtime_session_create_response.rbi +298 -232
  20. data/rbi/openai/models/realtime/realtime_transcription_session_audio_input.rbi +39 -28
  21. data/rbi/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbi +264 -200
  22. data/rbi/openai/models/responses/response.rbi +12 -12
  23. data/rbi/openai/models/responses/response_create_params.rbi +12 -12
  24. data/rbi/openai/resources/responses.rbi +8 -8
  25. data/sig/openai/models/realtime/realtime_audio_config_input.rbs +4 -8
  26. data/sig/openai/models/realtime/realtime_audio_input_turn_detection.rbs +91 -65
  27. data/sig/openai/models/realtime/realtime_session.rbs +95 -69
  28. data/sig/openai/models/realtime/realtime_session_create_response.rbs +95 -73
  29. data/sig/openai/models/realtime/realtime_transcription_session_audio_input.rbs +4 -8
  30. data/sig/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbs +91 -65
  31. metadata +2 -2
@@ -525,30 +525,25 @@ module OpenAI
525
525
 
526
526
  # Configuration for turn detection, either Server VAD or Semantic VAD. This can be
527
527
  # set to `null` to turn off, in which case the client must manually trigger model
528
- # response. Server VAD means that the model will detect the start and end of
529
- # speech based on audio volume and respond at the end of user speech. Semantic VAD
530
- # is more advanced and uses a turn detection model (in conjunction with VAD) to
531
- # semantically estimate whether the user has finished speaking, then dynamically
532
- # sets a timeout based on this probability. For example, if user audio trails off
533
- # with "uhhm", the model will score a low probability of turn end and wait longer
534
- # for the user to continue speaking. This can be useful for more natural
535
- # conversations, but may have a higher latency.
528
+ # response.
529
+ #
530
+ # Server VAD means that the model will detect the start and end of speech based on
531
+ # audio volume and respond at the end of user speech.
532
+ #
533
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
534
+ # with VAD) to semantically estimate whether the user has finished speaking, then
535
+ # dynamically sets a timeout based on this probability. For example, if user audio
536
+ # trails off with "uhhm", the model will score a low probability of turn end and
537
+ # wait longer for the user to continue speaking. This can be useful for more
538
+ # natural conversations, but may have a higher latency.
536
539
  sig do
537
540
  returns(
538
541
  T.nilable(
539
- OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection
542
+ OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Variants
540
543
  )
541
544
  )
542
545
  end
543
- attr_reader :turn_detection
544
-
545
- sig do
546
- params(
547
- turn_detection:
548
- OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::OrHash
549
- ).void
550
- end
551
- attr_writer :turn_detection
546
+ attr_accessor :turn_detection
552
547
 
553
548
  sig do
554
549
  params(
@@ -562,7 +557,12 @@ module OpenAI
562
557
  OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::NoiseReduction::OrHash,
563
558
  transcription: OpenAI::Realtime::AudioTranscription::OrHash,
564
559
  turn_detection:
565
- OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::OrHash
560
+ T.nilable(
561
+ T.any(
562
+ OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad::OrHash,
563
+ OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::OrHash
564
+ )
565
+ )
566
566
  ).returns(T.attached_class)
567
567
  end
568
568
  def self.new(
@@ -585,14 +585,17 @@ module OpenAI
585
585
  transcription: nil,
586
586
  # Configuration for turn detection, either Server VAD or Semantic VAD. This can be
587
587
  # set to `null` to turn off, in which case the client must manually trigger model
588
- # response. Server VAD means that the model will detect the start and end of
589
- # speech based on audio volume and respond at the end of user speech. Semantic VAD
590
- # is more advanced and uses a turn detection model (in conjunction with VAD) to
591
- # semantically estimate whether the user has finished speaking, then dynamically
592
- # sets a timeout based on this probability. For example, if user audio trails off
593
- # with "uhhm", the model will score a low probability of turn end and wait longer
594
- # for the user to continue speaking. This can be useful for more natural
595
- # conversations, but may have a higher latency.
588
+ # response.
589
+ #
590
+ # Server VAD means that the model will detect the start and end of speech based on
591
+ # audio volume and respond at the end of user speech.
592
+ #
593
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
594
+ # with VAD) to semantically estimate whether the user has finished speaking, then
595
+ # dynamically sets a timeout based on this probability. For example, if user audio
596
+ # trails off with "uhhm", the model will score a low probability of turn end and
597
+ # wait longer for the user to continue speaking. This can be useful for more
598
+ # natural conversations, but may have a higher latency.
596
599
  turn_detection: nil
597
600
  )
598
601
  end
@@ -605,7 +608,9 @@ module OpenAI
605
608
  OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::NoiseReduction,
606
609
  transcription: OpenAI::Realtime::AudioTranscription,
607
610
  turn_detection:
608
- OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection
611
+ T.nilable(
612
+ OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Variants
613
+ )
609
614
  }
610
615
  )
611
616
  end
@@ -665,259 +670,320 @@ module OpenAI
665
670
  end
666
671
  end
667
672
 
668
- class TurnDetection < OpenAI::Internal::Type::BaseModel
669
- OrHash =
673
 + # Configuration for turn detection, either Server VAD or Semantic VAD. This can be
674
+ # set to `null` to turn off, in which case the client must manually trigger model
675
+ # response.
676
+ #
677
+ # Server VAD means that the model will detect the start and end of speech based on
678
+ # audio volume and respond at the end of user speech.
679
+ #
680
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
681
+ # with VAD) to semantically estimate whether the user has finished speaking, then
682
+ # dynamically sets a timeout based on this probability. For example, if user audio
683
+ # trails off with "uhhm", the model will score a low probability of turn end and
684
+ # wait longer for the user to continue speaking. This can be useful for more
685
+ # natural conversations, but may have a higher latency.
686
+ module TurnDetection
687
+ extend OpenAI::Internal::Type::Union
688
+
689
+ Variants =
670
690
  T.type_alias do
671
691
  T.any(
672
- OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection,
673
- OpenAI::Internal::AnyHash
692
+ OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad,
693
+ OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad
674
694
  )
675
695
  end
676
696
 
677
- # Whether or not to automatically generate a response when a VAD stop event
678
- # occurs.
679
- sig { returns(T.nilable(T::Boolean)) }
680
- attr_reader :create_response
681
-
682
- sig { params(create_response: T::Boolean).void }
683
- attr_writer :create_response
684
-
685
- # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
686
- # will wait longer for the user to continue speaking, `high` will respond more
687
- # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
688
- # and `high` have max timeouts of 8s, 4s, and 2s respectively.
689
- sig do
690
- returns(
691
- T.nilable(
692
- OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol
693
- )
694
- )
695
- end
696
- attr_reader :eagerness
697
-
698
- sig do
699
- params(
700
- eagerness:
701
- OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::OrSymbol
702
- ).void
703
- end
704
- attr_writer :eagerness
705
-
706
- # Optional idle timeout after which turn detection will auto-timeout when no
707
- # additional audio is received and emits a `timeout_triggered` event.
708
- sig { returns(T.nilable(Integer)) }
709
- attr_accessor :idle_timeout_ms
710
-
711
- # Whether or not to automatically interrupt any ongoing response with output to
712
- # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
713
- # occurs.
714
- sig { returns(T.nilable(T::Boolean)) }
715
- attr_reader :interrupt_response
716
-
717
- sig { params(interrupt_response: T::Boolean).void }
718
- attr_writer :interrupt_response
719
-
720
- # Used only for `server_vad` mode. Amount of audio to include before the VAD
721
- # detected speech (in milliseconds). Defaults to 300ms.
722
- sig { returns(T.nilable(Integer)) }
723
- attr_reader :prefix_padding_ms
724
-
725
- sig { params(prefix_padding_ms: Integer).void }
726
- attr_writer :prefix_padding_ms
727
-
728
- # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
729
- # milliseconds). Defaults to 500ms. With shorter values the model will respond
730
- # more quickly, but may jump in on short pauses from the user.
731
- sig { returns(T.nilable(Integer)) }
732
- attr_reader :silence_duration_ms
733
-
734
- sig { params(silence_duration_ms: Integer).void }
735
- attr_writer :silence_duration_ms
736
-
737
- # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
738
- # defaults to 0.5. A higher threshold will require louder audio to activate the
739
- # model, and thus might perform better in noisy environments.
740
- sig { returns(T.nilable(Float)) }
741
- attr_reader :threshold
742
-
743
- sig { params(threshold: Float).void }
744
- attr_writer :threshold
745
-
746
- # Type of turn detection.
747
- sig do
748
- returns(
749
- T.nilable(
750
- OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::TaggedSymbol
751
- )
752
- )
753
- end
754
- attr_reader :type
697
+ class ServerVad < OpenAI::Internal::Type::BaseModel
698
+ OrHash =
699
+ T.type_alias do
700
+ T.any(
701
+ OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad,
702
+ OpenAI::Internal::AnyHash
703
+ )
704
+ end
755
705
 
756
- sig do
757
- params(
758
- type:
759
- OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::OrSymbol
760
- ).void
761
- end
762
- attr_writer :type
706
+ # Type of turn detection, `server_vad` to turn on simple Server VAD.
707
+ sig { returns(Symbol) }
708
+ attr_accessor :type
763
709
 
764
- # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
765
- # set to `null` to turn off, in which case the client must manually trigger model
766
- # response. Server VAD means that the model will detect the start and end of
767
- # speech based on audio volume and respond at the end of user speech. Semantic VAD
768
- # is more advanced and uses a turn detection model (in conjunction with VAD) to
769
- # semantically estimate whether the user has finished speaking, then dynamically
770
- # sets a timeout based on this probability. For example, if user audio trails off
771
- # with "uhhm", the model will score a low probability of turn end and wait longer
772
- # for the user to continue speaking. This can be useful for more natural
773
- # conversations, but may have a higher latency.
774
- sig do
775
- params(
776
- create_response: T::Boolean,
777
- eagerness:
778
- OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::OrSymbol,
779
- idle_timeout_ms: T.nilable(Integer),
780
- interrupt_response: T::Boolean,
781
- prefix_padding_ms: Integer,
782
- silence_duration_ms: Integer,
783
- threshold: Float,
784
- type:
785
- OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::OrSymbol
786
- ).returns(T.attached_class)
787
- end
788
- def self.new(
789
710
  # Whether or not to automatically generate a response when a VAD stop event
790
711
  # occurs.
791
- create_response: nil,
792
- # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
793
- # will wait longer for the user to continue speaking, `high` will respond more
794
- # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
795
- # and `high` have max timeouts of 8s, 4s, and 2s respectively.
796
- eagerness: nil,
797
- # Optional idle timeout after which turn detection will auto-timeout when no
798
- # additional audio is received and emits a `timeout_triggered` event.
799
- idle_timeout_ms: nil,
712
+ sig { returns(T.nilable(T::Boolean)) }
713
+ attr_reader :create_response
714
+
715
+ sig { params(create_response: T::Boolean).void }
716
+ attr_writer :create_response
717
+
718
+ # Optional timeout after which a model response will be triggered automatically.
719
+ # This is useful for situations in which a long pause from the user is unexpected,
720
+ # such as a phone call. The model will effectively prompt the user to continue the
721
+ # conversation based on the current context.
722
+ #
723
+ # The timeout value will be applied after the last model response's audio has
724
+ # finished playing, i.e. it's set to the `response.done` time plus audio playback
725
+ # duration.
726
+ #
727
+ # An `input_audio_buffer.timeout_triggered` event (plus events associated with the
728
+ # Response) will be emitted when the timeout is reached. Idle timeout is currently
729
+ # only supported for `server_vad` mode.
730
+ sig { returns(T.nilable(Integer)) }
731
+ attr_accessor :idle_timeout_ms
732
+
800
733
  # Whether or not to automatically interrupt any ongoing response with output to
801
734
  # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
802
735
  # occurs.
803
- interrupt_response: nil,
736
+ sig { returns(T.nilable(T::Boolean)) }
737
+ attr_reader :interrupt_response
738
+
739
+ sig { params(interrupt_response: T::Boolean).void }
740
+ attr_writer :interrupt_response
741
+
804
742
  # Used only for `server_vad` mode. Amount of audio to include before the VAD
805
743
  # detected speech (in milliseconds). Defaults to 300ms.
806
- prefix_padding_ms: nil,
744
+ sig { returns(T.nilable(Integer)) }
745
+ attr_reader :prefix_padding_ms
746
+
747
+ sig { params(prefix_padding_ms: Integer).void }
748
+ attr_writer :prefix_padding_ms
749
+
807
750
  # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
808
751
  # milliseconds). Defaults to 500ms. With shorter values the model will respond
809
752
  # more quickly, but may jump in on short pauses from the user.
810
- silence_duration_ms: nil,
753
+ sig { returns(T.nilable(Integer)) }
754
+ attr_reader :silence_duration_ms
755
+
756
+ sig { params(silence_duration_ms: Integer).void }
757
+ attr_writer :silence_duration_ms
758
+
811
759
  # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
812
760
  # defaults to 0.5. A higher threshold will require louder audio to activate the
813
761
  # model, and thus might perform better in noisy environments.
814
- threshold: nil,
815
- # Type of turn detection.
816
- type: nil
817
- )
818
- end
762
+ sig { returns(T.nilable(Float)) }
763
+ attr_reader :threshold
819
764
 
820
- sig do
821
- override.returns(
822
- {
765
+ sig { params(threshold: Float).void }
766
+ attr_writer :threshold
767
+
768
+ # Server-side voice activity detection (VAD) which flips on when user speech is
769
+ # detected and off after a period of silence.
770
+ sig do
771
+ params(
823
772
  create_response: T::Boolean,
824
- eagerness:
825
- OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol,
826
773
  idle_timeout_ms: T.nilable(Integer),
827
774
  interrupt_response: T::Boolean,
828
775
  prefix_padding_ms: Integer,
829
776
  silence_duration_ms: Integer,
830
777
  threshold: Float,
831
- type:
832
- OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::TaggedSymbol
833
- }
778
+ type: Symbol
779
+ ).returns(T.attached_class)
780
+ end
781
+ def self.new(
782
+ # Whether or not to automatically generate a response when a VAD stop event
783
+ # occurs.
784
+ create_response: nil,
785
+ # Optional timeout after which a model response will be triggered automatically.
786
+ # This is useful for situations in which a long pause from the user is unexpected,
787
+ # such as a phone call. The model will effectively prompt the user to continue the
788
+ # conversation based on the current context.
789
+ #
790
+ # The timeout value will be applied after the last model response's audio has
791
+ # finished playing, i.e. it's set to the `response.done` time plus audio playback
792
+ # duration.
793
+ #
794
+ # An `input_audio_buffer.timeout_triggered` event (plus events associated with the
795
+ # Response) will be emitted when the timeout is reached. Idle timeout is currently
796
+ # only supported for `server_vad` mode.
797
+ idle_timeout_ms: nil,
798
+ # Whether or not to automatically interrupt any ongoing response with output to
799
+ # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
800
+ # occurs.
801
+ interrupt_response: nil,
802
+ # Used only for `server_vad` mode. Amount of audio to include before the VAD
803
+ # detected speech (in milliseconds). Defaults to 300ms.
804
+ prefix_padding_ms: nil,
805
+ # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
806
+ # milliseconds). Defaults to 500ms. With shorter values the model will respond
807
+ # more quickly, but may jump in on short pauses from the user.
808
+ silence_duration_ms: nil,
809
+ # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
810
+ # defaults to 0.5. A higher threshold will require louder audio to activate the
811
+ # model, and thus might perform better in noisy environments.
812
+ threshold: nil,
813
+ # Type of turn detection, `server_vad` to turn on simple Server VAD.
814
+ type: :server_vad
834
815
  )
835
- end
836
- def to_hash
837
- end
816
+ end
838
817
 
839
- # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
840
- # will wait longer for the user to continue speaking, `high` will respond more
841
- # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
842
- # and `high` have max timeouts of 8s, 4s, and 2s respectively.
843
- module Eagerness
844
- extend OpenAI::Internal::Type::Enum
818
+ sig do
819
+ override.returns(
820
+ {
821
+ type: Symbol,
822
+ create_response: T::Boolean,
823
+ idle_timeout_ms: T.nilable(Integer),
824
+ interrupt_response: T::Boolean,
825
+ prefix_padding_ms: Integer,
826
+ silence_duration_ms: Integer,
827
+ threshold: Float
828
+ }
829
+ )
830
+ end
831
+ def to_hash
832
+ end
833
+ end
845
834
 
846
- TaggedSymbol =
835
+ class SemanticVad < OpenAI::Internal::Type::BaseModel
836
+ OrHash =
847
837
  T.type_alias do
848
- T.all(
849
- Symbol,
850
- OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness
838
+ T.any(
839
+ OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad,
840
+ OpenAI::Internal::AnyHash
851
841
  )
852
842
  end
853
- OrSymbol = T.type_alias { T.any(Symbol, String) }
854
843
 
855
- LOW =
856
- T.let(
857
- :low,
858
- OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol
859
- )
860
- MEDIUM =
861
- T.let(
862
- :medium,
863
- OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol
864
- )
865
- HIGH =
866
- T.let(
867
- :high,
868
- OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol
869
- )
870
- AUTO =
871
- T.let(
872
- :auto,
873
- OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol
874
- )
844
+ # Type of turn detection, `semantic_vad` to turn on Semantic VAD.
845
+ sig { returns(Symbol) }
846
+ attr_accessor :type
875
847
 
848
+ # Whether or not to automatically generate a response when a VAD stop event
849
+ # occurs.
850
+ sig { returns(T.nilable(T::Boolean)) }
851
+ attr_reader :create_response
852
+
853
+ sig { params(create_response: T::Boolean).void }
854
+ attr_writer :create_response
855
+
856
+ # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
857
+ # will wait longer for the user to continue speaking, `high` will respond more
858
+ # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
859
+ # and `high` have max timeouts of 8s, 4s, and 2s respectively.
876
860
  sig do
877
- override.returns(
878
- T::Array[
879
- OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol
880
- ]
861
+ returns(
862
+ T.nilable(
863
+ OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol
864
+ )
881
865
  )
882
866
  end
883
- def self.values
867
+ attr_reader :eagerness
868
+
869
+ sig do
870
+ params(
871
+ eagerness:
872
+ OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::OrSymbol
873
+ ).void
884
874
  end
885
- end
875
+ attr_writer :eagerness
886
876
 
887
- # Type of turn detection.
888
- module Type
889
- extend OpenAI::Internal::Type::Enum
877
+ # Whether or not to automatically interrupt any ongoing response with output to
878
+ # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
879
+ # occurs.
880
+ sig { returns(T.nilable(T::Boolean)) }
881
+ attr_reader :interrupt_response
890
882
 
891
- TaggedSymbol =
892
- T.type_alias do
893
- T.all(
894
- Symbol,
895
- OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type
896
- )
897
- end
898
- OrSymbol = T.type_alias { T.any(Symbol, String) }
883
+ sig { params(interrupt_response: T::Boolean).void }
884
+ attr_writer :interrupt_response
899
885
 
900
- SERVER_VAD =
901
- T.let(
902
- :server_vad,
903
- OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::TaggedSymbol
904
- )
905
- SEMANTIC_VAD =
906
- T.let(
907
- :semantic_vad,
908
- OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::TaggedSymbol
909
- )
886
+ # Server-side semantic turn detection which uses a model to determine when the
887
+ # user has finished speaking.
888
+ sig do
889
+ params(
890
+ create_response: T::Boolean,
891
+ eagerness:
892
+ OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::OrSymbol,
893
+ interrupt_response: T::Boolean,
894
+ type: Symbol
895
+ ).returns(T.attached_class)
896
+ end
897
+ def self.new(
898
+ # Whether or not to automatically generate a response when a VAD stop event
899
+ # occurs.
900
+ create_response: nil,
901
+ # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
902
+ # will wait longer for the user to continue speaking, `high` will respond more
903
+ # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
904
+ # and `high` have max timeouts of 8s, 4s, and 2s respectively.
905
+ eagerness: nil,
906
+ # Whether or not to automatically interrupt any ongoing response with output to
907
+ # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
908
+ # occurs.
909
+ interrupt_response: nil,
910
+ # Type of turn detection, `semantic_vad` to turn on Semantic VAD.
911
+ type: :semantic_vad
912
+ )
913
+ end
910
914
 
911
915
  sig do
912
916
  override.returns(
913
- T::Array[
914
- OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::TaggedSymbol
915
- ]
917
+ {
918
+ type: Symbol,
919
+ create_response: T::Boolean,
920
+ eagerness:
921
+ OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol,
922
+ interrupt_response: T::Boolean
923
+ }
916
924
  )
917
925
  end
918
- def self.values
926
+ def to_hash
927
+ end
928
+
929
+ # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
930
+ # will wait longer for the user to continue speaking, `high` will respond more
931
+ # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
932
+ # and `high` have max timeouts of 8s, 4s, and 2s respectively.
933
+ module Eagerness
934
+ extend OpenAI::Internal::Type::Enum
935
+
936
+ TaggedSymbol =
937
+ T.type_alias do
938
+ T.all(
939
+ Symbol,
940
+ OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness
941
+ )
942
+ end
943
+ OrSymbol = T.type_alias { T.any(Symbol, String) }
944
+
945
+ LOW =
946
+ T.let(
947
+ :low,
948
+ OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol
949
+ )
950
+ MEDIUM =
951
+ T.let(
952
+ :medium,
953
+ OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol
954
+ )
955
+ HIGH =
956
+ T.let(
957
+ :high,
958
+ OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol
959
+ )
960
+ AUTO =
961
+ T.let(
962
+ :auto,
963
+ OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol
964
+ )
965
+
966
+ sig do
967
+ override.returns(
968
+ T::Array[
969
+ OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol
970
+ ]
971
+ )
972
+ end
973
+ def self.values
974
+ end
919
975
  end
920
976
  end
977
+
978
+ sig do
979
+ override.returns(
980
+ T::Array[
981
+ OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Variants
982
+ ]
983
+ )
984
+ end
985
+ def self.variants
986
+ end
921
987
  end
922
988
  end
923
989