openai 0.23.1 → 0.23.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +1 -1
- data/lib/openai/models/realtime/input_audio_buffer_timeout_triggered.rb +25 -5
- data/lib/openai/models/realtime/realtime_audio_config_input.rb +14 -11
- data/lib/openai/models/realtime/realtime_audio_input_turn_detection.rb +173 -117
- data/lib/openai/models/realtime/realtime_server_event.rb +13 -1
- data/lib/openai/models/realtime/realtime_session.rb +179 -118
- data/lib/openai/models/realtime/realtime_session_create_response.rb +184 -122
- data/lib/openai/models/realtime/realtime_transcription_session_audio_input.rb +16 -11
- data/lib/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rb +175 -117
- data/lib/openai/models/responses/response.rb +8 -8
- data/lib/openai/models/responses/response_create_params.rb +8 -8
- data/lib/openai/version.rb +1 -1
- data/rbi/openai/models/realtime/input_audio_buffer_timeout_triggered.rbi +24 -5
- data/rbi/openai/models/realtime/realtime_audio_config_input.rbi +44 -28
- data/rbi/openai/models/realtime/realtime_audio_input_turn_detection.rbi +264 -203
- data/rbi/openai/models/realtime/realtime_session.rbi +306 -231
- data/rbi/openai/models/realtime/realtime_session_create_response.rbi +298 -232
- data/rbi/openai/models/realtime/realtime_transcription_session_audio_input.rbi +39 -28
- data/rbi/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbi +264 -200
- data/rbi/openai/models/responses/response.rbi +12 -12
- data/rbi/openai/models/responses/response_create_params.rbi +12 -12
- data/rbi/openai/resources/responses.rbi +8 -8
- data/sig/openai/models/realtime/realtime_audio_config_input.rbs +4 -8
- data/sig/openai/models/realtime/realtime_audio_input_turn_detection.rbs +91 -65
- data/sig/openai/models/realtime/realtime_session.rbs +95 -69
- data/sig/openai/models/realtime/realtime_session_create_response.rbs +95 -73
- data/sig/openai/models/realtime/realtime_transcription_session_audio_input.rbs +4 -8
- data/sig/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbs +91 -65
- metadata +2 -2
@@ -525,30 +525,25 @@ module OpenAI
|
|
525
525
|
|
526
526
|
# Configuration for turn detection, either Server VAD or Semantic VAD. This can be
|
527
527
|
# set to `null` to turn off, in which case the client must manually trigger model
|
528
|
-
# response.
|
529
|
-
#
|
530
|
-
#
|
531
|
-
#
|
532
|
-
#
|
533
|
-
#
|
534
|
-
#
|
535
|
-
#
|
528
|
+
# response.
|
529
|
+
#
|
530
|
+
# Server VAD means that the model will detect the start and end of speech based on
|
531
|
+
# audio volume and respond at the end of user speech.
|
532
|
+
#
|
533
|
+
# Semantic VAD is more advanced and uses a turn detection model (in conjunction
|
534
|
+
# with VAD) to semantically estimate whether the user has finished speaking, then
|
535
|
+
# dynamically sets a timeout based on this probability. For example, if user audio
|
536
|
+
# trails off with "uhhm", the model will score a low probability of turn end and
|
537
|
+
# wait longer for the user to continue speaking. This can be useful for more
|
538
|
+
# natural conversations, but may have a higher latency.
|
536
539
|
sig do
|
537
540
|
returns(
|
538
541
|
T.nilable(
|
539
|
-
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection
|
542
|
+
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Variants
|
540
543
|
)
|
541
544
|
)
|
542
545
|
end
|
543
|
-
|
544
|
-
|
545
|
-
sig do
|
546
|
-
params(
|
547
|
-
turn_detection:
|
548
|
-
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::OrHash
|
549
|
-
).void
|
550
|
-
end
|
551
|
-
attr_writer :turn_detection
|
546
|
+
attr_accessor :turn_detection
|
552
547
|
|
553
548
|
sig do
|
554
549
|
params(
|
@@ -562,7 +557,12 @@ module OpenAI
|
|
562
557
|
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::NoiseReduction::OrHash,
|
563
558
|
transcription: OpenAI::Realtime::AudioTranscription::OrHash,
|
564
559
|
turn_detection:
|
565
|
-
|
560
|
+
T.nilable(
|
561
|
+
T.any(
|
562
|
+
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad::OrHash,
|
563
|
+
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::OrHash
|
564
|
+
)
|
565
|
+
)
|
566
566
|
).returns(T.attached_class)
|
567
567
|
end
|
568
568
|
def self.new(
|
@@ -585,14 +585,17 @@ module OpenAI
|
|
585
585
|
transcription: nil,
|
586
586
|
# Configuration for turn detection, either Server VAD or Semantic VAD. This can be
|
587
587
|
# set to `null` to turn off, in which case the client must manually trigger model
|
588
|
-
# response.
|
589
|
-
#
|
590
|
-
#
|
591
|
-
#
|
592
|
-
#
|
593
|
-
#
|
594
|
-
#
|
595
|
-
#
|
588
|
+
# response.
|
589
|
+
#
|
590
|
+
# Server VAD means that the model will detect the start and end of speech based on
|
591
|
+
# audio volume and respond at the end of user speech.
|
592
|
+
#
|
593
|
+
# Semantic VAD is more advanced and uses a turn detection model (in conjunction
|
594
|
+
# with VAD) to semantically estimate whether the user has finished speaking, then
|
595
|
+
# dynamically sets a timeout based on this probability. For example, if user audio
|
596
|
+
# trails off with "uhhm", the model will score a low probability of turn end and
|
597
|
+
# wait longer for the user to continue speaking. This can be useful for more
|
598
|
+
# natural conversations, but may have a higher latency.
|
596
599
|
turn_detection: nil
|
597
600
|
)
|
598
601
|
end
|
@@ -605,7 +608,9 @@ module OpenAI
|
|
605
608
|
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::NoiseReduction,
|
606
609
|
transcription: OpenAI::Realtime::AudioTranscription,
|
607
610
|
turn_detection:
|
608
|
-
|
611
|
+
T.nilable(
|
612
|
+
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Variants
|
613
|
+
)
|
609
614
|
}
|
610
615
|
)
|
611
616
|
end
|
@@ -665,259 +670,320 @@ module OpenAI
|
|
665
670
|
end
|
666
671
|
end
|
667
672
|
|
668
|
-
|
669
|
-
|
673
|
+
# Configuration for turn detection, either Server VAD or Semantic VAD. This can be
|
674
|
+
# set to `null` to turn off, in which case the client must manually trigger model
|
675
|
+
# response.
|
676
|
+
#
|
677
|
+
# Server VAD means that the model will detect the start and end of speech based on
|
678
|
+
# audio volume and respond at the end of user speech.
|
679
|
+
#
|
680
|
+
# Semantic VAD is more advanced and uses a turn detection model (in conjunction
|
681
|
+
# with VAD) to semantically estimate whether the user has finished speaking, then
|
682
|
+
# dynamically sets a timeout based on this probability. For example, if user audio
|
683
|
+
# trails off with "uhhm", the model will score a low probability of turn end and
|
684
|
+
# wait longer for the user to continue speaking. This can be useful for more
|
685
|
+
# natural conversations, but may have a higher latency.
|
686
|
+
module TurnDetection
|
687
|
+
extend OpenAI::Internal::Type::Union
|
688
|
+
|
689
|
+
Variants =
|
670
690
|
T.type_alias do
|
671
691
|
T.any(
|
672
|
-
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection,
|
673
|
-
OpenAI::
|
692
|
+
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad,
|
693
|
+
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad
|
674
694
|
)
|
675
695
|
end
|
676
696
|
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
# Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
|
686
|
-
# will wait longer for the user to continue speaking, `high` will respond more
|
687
|
-
# quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
|
688
|
-
# and `high` have max timeouts of 8s, 4s, and 2s respectively.
|
689
|
-
sig do
|
690
|
-
returns(
|
691
|
-
T.nilable(
|
692
|
-
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol
|
693
|
-
)
|
694
|
-
)
|
695
|
-
end
|
696
|
-
attr_reader :eagerness
|
697
|
-
|
698
|
-
sig do
|
699
|
-
params(
|
700
|
-
eagerness:
|
701
|
-
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::OrSymbol
|
702
|
-
).void
|
703
|
-
end
|
704
|
-
attr_writer :eagerness
|
705
|
-
|
706
|
-
# Optional idle timeout after which turn detection will auto-timeout when no
|
707
|
-
# additional audio is received and emits a `timeout_triggered` event.
|
708
|
-
sig { returns(T.nilable(Integer)) }
|
709
|
-
attr_accessor :idle_timeout_ms
|
710
|
-
|
711
|
-
# Whether or not to automatically interrupt any ongoing response with output to
|
712
|
-
# the default conversation (i.e. `conversation` of `auto`) when a VAD start event
|
713
|
-
# occurs.
|
714
|
-
sig { returns(T.nilable(T::Boolean)) }
|
715
|
-
attr_reader :interrupt_response
|
716
|
-
|
717
|
-
sig { params(interrupt_response: T::Boolean).void }
|
718
|
-
attr_writer :interrupt_response
|
719
|
-
|
720
|
-
# Used only for `server_vad` mode. Amount of audio to include before the VAD
|
721
|
-
# detected speech (in milliseconds). Defaults to 300ms.
|
722
|
-
sig { returns(T.nilable(Integer)) }
|
723
|
-
attr_reader :prefix_padding_ms
|
724
|
-
|
725
|
-
sig { params(prefix_padding_ms: Integer).void }
|
726
|
-
attr_writer :prefix_padding_ms
|
727
|
-
|
728
|
-
# Used only for `server_vad` mode. Duration of silence to detect speech stop (in
|
729
|
-
# milliseconds). Defaults to 500ms. With shorter values the model will respond
|
730
|
-
# more quickly, but may jump in on short pauses from the user.
|
731
|
-
sig { returns(T.nilable(Integer)) }
|
732
|
-
attr_reader :silence_duration_ms
|
733
|
-
|
734
|
-
sig { params(silence_duration_ms: Integer).void }
|
735
|
-
attr_writer :silence_duration_ms
|
736
|
-
|
737
|
-
# Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
|
738
|
-
# defaults to 0.5. A higher threshold will require louder audio to activate the
|
739
|
-
# model, and thus might perform better in noisy environments.
|
740
|
-
sig { returns(T.nilable(Float)) }
|
741
|
-
attr_reader :threshold
|
742
|
-
|
743
|
-
sig { params(threshold: Float).void }
|
744
|
-
attr_writer :threshold
|
745
|
-
|
746
|
-
# Type of turn detection.
|
747
|
-
sig do
|
748
|
-
returns(
|
749
|
-
T.nilable(
|
750
|
-
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::TaggedSymbol
|
751
|
-
)
|
752
|
-
)
|
753
|
-
end
|
754
|
-
attr_reader :type
|
697
|
+
class ServerVad < OpenAI::Internal::Type::BaseModel
|
698
|
+
OrHash =
|
699
|
+
T.type_alias do
|
700
|
+
T.any(
|
701
|
+
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::ServerVad,
|
702
|
+
OpenAI::Internal::AnyHash
|
703
|
+
)
|
704
|
+
end
|
755
705
|
|
756
|
-
|
757
|
-
|
758
|
-
|
759
|
-
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::OrSymbol
|
760
|
-
).void
|
761
|
-
end
|
762
|
-
attr_writer :type
|
706
|
+
# Type of turn detection, `server_vad` to turn on simple Server VAD.
|
707
|
+
sig { returns(Symbol) }
|
708
|
+
attr_accessor :type
|
763
709
|
|
764
|
-
# Configuration for turn detection, either Server VAD or Semantic VAD. This can be
|
765
|
-
# set to `null` to turn off, in which case the client must manually trigger model
|
766
|
-
# response. Server VAD means that the model will detect the start and end of
|
767
|
-
# speech based on audio volume and respond at the end of user speech. Semantic VAD
|
768
|
-
# is more advanced and uses a turn detection model (in conjunction with VAD) to
|
769
|
-
# semantically estimate whether the user has finished speaking, then dynamically
|
770
|
-
# sets a timeout based on this probability. For example, if user audio trails off
|
771
|
-
# with "uhhm", the model will score a low probability of turn end and wait longer
|
772
|
-
# for the user to continue speaking. This can be useful for more natural
|
773
|
-
# conversations, but may have a higher latency.
|
774
|
-
sig do
|
775
|
-
params(
|
776
|
-
create_response: T::Boolean,
|
777
|
-
eagerness:
|
778
|
-
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::OrSymbol,
|
779
|
-
idle_timeout_ms: T.nilable(Integer),
|
780
|
-
interrupt_response: T::Boolean,
|
781
|
-
prefix_padding_ms: Integer,
|
782
|
-
silence_duration_ms: Integer,
|
783
|
-
threshold: Float,
|
784
|
-
type:
|
785
|
-
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type::OrSymbol
|
786
|
-
).returns(T.attached_class)
|
787
|
-
end
|
788
|
-
def self.new(
|
789
710
|
# Whether or not to automatically generate a response when a VAD stop event
|
790
711
|
# occurs.
|
791
|
-
|
792
|
-
|
793
|
-
|
794
|
-
|
795
|
-
|
796
|
-
|
797
|
-
# Optional
|
798
|
-
#
|
799
|
-
|
712
|
+
sig { returns(T.nilable(T::Boolean)) }
|
713
|
+
attr_reader :create_response
|
714
|
+
|
715
|
+
sig { params(create_response: T::Boolean).void }
|
716
|
+
attr_writer :create_response
|
717
|
+
|
718
|
+
# Optional timeout after which a model response will be triggered automatically.
|
719
|
+
# This is useful for situations in which a long pause from the user is unexpected,
|
720
|
+
# such as a phone call. The model will effectively prompt the user to continue the
|
721
|
+
# conversation based on the current context.
|
722
|
+
#
|
723
|
+
# The timeout value will be applied after the last model response's audio has
|
724
|
+
# finished playing, i.e. it's set to the `response.done` time plus audio playback
|
725
|
+
# duration.
|
726
|
+
#
|
727
|
+
# An `input_audio_buffer.timeout_triggered` event (plus events associated with the
|
728
|
+
# Response) will be emitted when the timeout is reached. Idle timeout is currently
|
729
|
+
# only supported for `server_vad` mode.
|
730
|
+
sig { returns(T.nilable(Integer)) }
|
731
|
+
attr_accessor :idle_timeout_ms
|
732
|
+
|
800
733
|
# Whether or not to automatically interrupt any ongoing response with output to
|
801
734
|
# the default conversation (i.e. `conversation` of `auto`) when a VAD start event
|
802
735
|
# occurs.
|
803
|
-
|
736
|
+
sig { returns(T.nilable(T::Boolean)) }
|
737
|
+
attr_reader :interrupt_response
|
738
|
+
|
739
|
+
sig { params(interrupt_response: T::Boolean).void }
|
740
|
+
attr_writer :interrupt_response
|
741
|
+
|
804
742
|
# Used only for `server_vad` mode. Amount of audio to include before the VAD
|
805
743
|
# detected speech (in milliseconds). Defaults to 300ms.
|
806
|
-
|
744
|
+
sig { returns(T.nilable(Integer)) }
|
745
|
+
attr_reader :prefix_padding_ms
|
746
|
+
|
747
|
+
sig { params(prefix_padding_ms: Integer).void }
|
748
|
+
attr_writer :prefix_padding_ms
|
749
|
+
|
807
750
|
# Used only for `server_vad` mode. Duration of silence to detect speech stop (in
|
808
751
|
# milliseconds). Defaults to 500ms. With shorter values the model will respond
|
809
752
|
# more quickly, but may jump in on short pauses from the user.
|
810
|
-
|
753
|
+
sig { returns(T.nilable(Integer)) }
|
754
|
+
attr_reader :silence_duration_ms
|
755
|
+
|
756
|
+
sig { params(silence_duration_ms: Integer).void }
|
757
|
+
attr_writer :silence_duration_ms
|
758
|
+
|
811
759
|
# Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
|
812
760
|
# defaults to 0.5. A higher threshold will require louder audio to activate the
|
813
761
|
# model, and thus might perform better in noisy environments.
|
814
|
-
|
815
|
-
|
816
|
-
type: nil
|
817
|
-
)
|
818
|
-
end
|
762
|
+
sig { returns(T.nilable(Float)) }
|
763
|
+
attr_reader :threshold
|
819
764
|
|
820
|
-
|
821
|
-
|
822
|
-
|
765
|
+
sig { params(threshold: Float).void }
|
766
|
+
attr_writer :threshold
|
767
|
+
|
768
|
+
# Server-side voice activity detection (VAD) which flips on when user speech is
|
769
|
+
# detected and off after a period of silence.
|
770
|
+
sig do
|
771
|
+
params(
|
823
772
|
create_response: T::Boolean,
|
824
|
-
eagerness:
|
825
|
-
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol,
|
826
773
|
idle_timeout_ms: T.nilable(Integer),
|
827
774
|
interrupt_response: T::Boolean,
|
828
775
|
prefix_padding_ms: Integer,
|
829
776
|
silence_duration_ms: Integer,
|
830
777
|
threshold: Float,
|
831
|
-
type:
|
832
|
-
|
833
|
-
|
778
|
+
type: Symbol
|
779
|
+
).returns(T.attached_class)
|
780
|
+
end
|
781
|
+
def self.new(
|
782
|
+
# Whether or not to automatically generate a response when a VAD stop event
|
783
|
+
# occurs.
|
784
|
+
create_response: nil,
|
785
|
+
# Optional timeout after which a model response will be triggered automatically.
|
786
|
+
# This is useful for situations in which a long pause from the user is unexpected,
|
787
|
+
# such as a phone call. The model will effectively prompt the user to continue the
|
788
|
+
# conversation based on the current context.
|
789
|
+
#
|
790
|
+
# The timeout value will be applied after the last model response's audio has
|
791
|
+
# finished playing, i.e. it's set to the `response.done` time plus audio playback
|
792
|
+
# duration.
|
793
|
+
#
|
794
|
+
# An `input_audio_buffer.timeout_triggered` event (plus events associated with the
|
795
|
+
# Response) will be emitted when the timeout is reached. Idle timeout is currently
|
796
|
+
# only supported for `server_vad` mode.
|
797
|
+
idle_timeout_ms: nil,
|
798
|
+
# Whether or not to automatically interrupt any ongoing response with output to
|
799
|
+
# the default conversation (i.e. `conversation` of `auto`) when a VAD start event
|
800
|
+
# occurs.
|
801
|
+
interrupt_response: nil,
|
802
|
+
# Used only for `server_vad` mode. Amount of audio to include before the VAD
|
803
|
+
# detected speech (in milliseconds). Defaults to 300ms.
|
804
|
+
prefix_padding_ms: nil,
|
805
|
+
# Used only for `server_vad` mode. Duration of silence to detect speech stop (in
|
806
|
+
# milliseconds). Defaults to 500ms. With shorter values the model will respond
|
807
|
+
# more quickly, but may jump in on short pauses from the user.
|
808
|
+
silence_duration_ms: nil,
|
809
|
+
# Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
|
810
|
+
# defaults to 0.5. A higher threshold will require louder audio to activate the
|
811
|
+
# model, and thus might perform better in noisy environments.
|
812
|
+
threshold: nil,
|
813
|
+
# Type of turn detection, `server_vad` to turn on simple Server VAD.
|
814
|
+
type: :server_vad
|
834
815
|
)
|
835
|
-
|
836
|
-
def to_hash
|
837
|
-
end
|
816
|
+
end
|
838
817
|
|
839
|
-
|
840
|
-
|
841
|
-
|
842
|
-
|
843
|
-
|
844
|
-
|
818
|
+
sig do
|
819
|
+
override.returns(
|
820
|
+
{
|
821
|
+
type: Symbol,
|
822
|
+
create_response: T::Boolean,
|
823
|
+
idle_timeout_ms: T.nilable(Integer),
|
824
|
+
interrupt_response: T::Boolean,
|
825
|
+
prefix_padding_ms: Integer,
|
826
|
+
silence_duration_ms: Integer,
|
827
|
+
threshold: Float
|
828
|
+
}
|
829
|
+
)
|
830
|
+
end
|
831
|
+
def to_hash
|
832
|
+
end
|
833
|
+
end
|
845
834
|
|
846
|
-
|
835
|
+
class SemanticVad < OpenAI::Internal::Type::BaseModel
|
836
|
+
OrHash =
|
847
837
|
T.type_alias do
|
848
|
-
T.
|
849
|
-
|
850
|
-
OpenAI::
|
838
|
+
T.any(
|
839
|
+
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad,
|
840
|
+
OpenAI::Internal::AnyHash
|
851
841
|
)
|
852
842
|
end
|
853
|
-
OrSymbol = T.type_alias { T.any(Symbol, String) }
|
854
843
|
|
855
|
-
|
856
|
-
|
857
|
-
|
858
|
-
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol
|
859
|
-
)
|
860
|
-
MEDIUM =
|
861
|
-
T.let(
|
862
|
-
:medium,
|
863
|
-
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol
|
864
|
-
)
|
865
|
-
HIGH =
|
866
|
-
T.let(
|
867
|
-
:high,
|
868
|
-
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol
|
869
|
-
)
|
870
|
-
AUTO =
|
871
|
-
T.let(
|
872
|
-
:auto,
|
873
|
-
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol
|
874
|
-
)
|
844
|
+
# Type of turn detection, `semantic_vad` to turn on Semantic VAD.
|
845
|
+
sig { returns(Symbol) }
|
846
|
+
attr_accessor :type
|
875
847
|
|
848
|
+
# Whether or not to automatically generate a response when a VAD stop event
|
849
|
+
# occurs.
|
850
|
+
sig { returns(T.nilable(T::Boolean)) }
|
851
|
+
attr_reader :create_response
|
852
|
+
|
853
|
+
sig { params(create_response: T::Boolean).void }
|
854
|
+
attr_writer :create_response
|
855
|
+
|
856
|
+
# Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
|
857
|
+
# will wait longer for the user to continue speaking, `high` will respond more
|
858
|
+
# quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
|
859
|
+
# and `high` have max timeouts of 8s, 4s, and 2s respectively.
|
876
860
|
sig do
|
877
|
-
|
878
|
-
T
|
879
|
-
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Eagerness::TaggedSymbol
|
880
|
-
|
861
|
+
returns(
|
862
|
+
T.nilable(
|
863
|
+
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol
|
864
|
+
)
|
881
865
|
)
|
882
866
|
end
|
883
|
-
|
867
|
+
attr_reader :eagerness
|
868
|
+
|
869
|
+
sig do
|
870
|
+
params(
|
871
|
+
eagerness:
|
872
|
+
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::OrSymbol
|
873
|
+
).void
|
884
874
|
end
|
885
|
-
|
875
|
+
attr_writer :eagerness
|
886
876
|
|
887
|
-
|
888
|
-
|
889
|
-
|
877
|
+
# Whether or not to automatically interrupt any ongoing response with output to
|
878
|
+
# the default conversation (i.e. `conversation` of `auto`) when a VAD start event
|
879
|
+
# occurs.
|
880
|
+
sig { returns(T.nilable(T::Boolean)) }
|
881
|
+
attr_reader :interrupt_response
|
890
882
|
|
891
|
-
|
892
|
-
|
893
|
-
T.all(
|
894
|
-
Symbol,
|
895
|
-
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Type
|
896
|
-
)
|
897
|
-
end
|
898
|
-
OrSymbol = T.type_alias { T.any(Symbol, String) }
|
883
|
+
sig { params(interrupt_response: T::Boolean).void }
|
884
|
+
attr_writer :interrupt_response
|
899
885
|
|
900
|
-
|
901
|
-
|
902
|
-
|
903
|
-
|
904
|
-
|
905
|
-
|
906
|
-
|
907
|
-
:
|
908
|
-
|
909
|
-
)
|
886
|
+
# Server-side semantic turn detection which uses a model to determine when the
|
887
|
+
# user has finished speaking.
|
888
|
+
sig do
|
889
|
+
params(
|
890
|
+
create_response: T::Boolean,
|
891
|
+
eagerness:
|
892
|
+
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::OrSymbol,
|
893
|
+
interrupt_response: T::Boolean,
|
894
|
+
type: Symbol
|
895
|
+
).returns(T.attached_class)
|
896
|
+
end
|
897
|
+
def self.new(
|
898
|
+
# Whether or not to automatically generate a response when a VAD stop event
|
899
|
+
# occurs.
|
900
|
+
create_response: nil,
|
901
|
+
# Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
|
902
|
+
# will wait longer for the user to continue speaking, `high` will respond more
|
903
|
+
# quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
|
904
|
+
# and `high` have max timeouts of 8s, 4s, and 2s respectively.
|
905
|
+
eagerness: nil,
|
906
|
+
# Whether or not to automatically interrupt any ongoing response with output to
|
907
|
+
# the default conversation (i.e. `conversation` of `auto`) when a VAD start event
|
908
|
+
# occurs.
|
909
|
+
interrupt_response: nil,
|
910
|
+
# Type of turn detection, `semantic_vad` to turn on Semantic VAD.
|
911
|
+
type: :semantic_vad
|
912
|
+
)
|
913
|
+
end
|
910
914
|
|
911
915
|
sig do
|
912
916
|
override.returns(
|
913
|
-
|
914
|
-
|
915
|
-
|
917
|
+
{
|
918
|
+
type: Symbol,
|
919
|
+
create_response: T::Boolean,
|
920
|
+
eagerness:
|
921
|
+
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol,
|
922
|
+
interrupt_response: T::Boolean
|
923
|
+
}
|
916
924
|
)
|
917
925
|
end
|
918
|
-
def
|
926
|
+
def to_hash
|
927
|
+
end
|
928
|
+
|
929
|
+
# Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
|
930
|
+
# will wait longer for the user to continue speaking, `high` will respond more
|
931
|
+
# quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
|
932
|
+
# and `high` have max timeouts of 8s, 4s, and 2s respectively.
|
933
|
+
module Eagerness
|
934
|
+
extend OpenAI::Internal::Type::Enum
|
935
|
+
|
936
|
+
TaggedSymbol =
|
937
|
+
T.type_alias do
|
938
|
+
T.all(
|
939
|
+
Symbol,
|
940
|
+
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness
|
941
|
+
)
|
942
|
+
end
|
943
|
+
OrSymbol = T.type_alias { T.any(Symbol, String) }
|
944
|
+
|
945
|
+
LOW =
|
946
|
+
T.let(
|
947
|
+
:low,
|
948
|
+
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol
|
949
|
+
)
|
950
|
+
MEDIUM =
|
951
|
+
T.let(
|
952
|
+
:medium,
|
953
|
+
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol
|
954
|
+
)
|
955
|
+
HIGH =
|
956
|
+
T.let(
|
957
|
+
:high,
|
958
|
+
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol
|
959
|
+
)
|
960
|
+
AUTO =
|
961
|
+
T.let(
|
962
|
+
:auto,
|
963
|
+
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol
|
964
|
+
)
|
965
|
+
|
966
|
+
sig do
|
967
|
+
override.returns(
|
968
|
+
T::Array[
|
969
|
+
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::SemanticVad::Eagerness::TaggedSymbol
|
970
|
+
]
|
971
|
+
)
|
972
|
+
end
|
973
|
+
def self.values
|
974
|
+
end
|
919
975
|
end
|
920
976
|
end
|
977
|
+
|
978
|
+
sig do
|
979
|
+
override.returns(
|
980
|
+
T::Array[
|
981
|
+
OpenAI::Realtime::RealtimeSessionCreateResponse::Audio::Input::TurnDetection::Variants
|
982
|
+
]
|
983
|
+
)
|
984
|
+
end
|
985
|
+
def self.variants
|
986
|
+
end
|
921
987
|
end
|
922
988
|
end
|
923
989
|
|