openai 0.23.1 → 0.23.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +1 -1
- data/lib/openai/models/realtime/input_audio_buffer_timeout_triggered.rb +25 -5
- data/lib/openai/models/realtime/realtime_audio_config_input.rb +14 -11
- data/lib/openai/models/realtime/realtime_audio_input_turn_detection.rb +173 -117
- data/lib/openai/models/realtime/realtime_server_event.rb +13 -1
- data/lib/openai/models/realtime/realtime_session.rb +179 -118
- data/lib/openai/models/realtime/realtime_session_create_response.rb +184 -122
- data/lib/openai/models/realtime/realtime_transcription_session_audio_input.rb +16 -11
- data/lib/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rb +175 -117
- data/lib/openai/models/responses/response.rb +8 -8
- data/lib/openai/models/responses/response_create_params.rb +8 -8
- data/lib/openai/version.rb +1 -1
- data/rbi/openai/models/realtime/input_audio_buffer_timeout_triggered.rbi +24 -5
- data/rbi/openai/models/realtime/realtime_audio_config_input.rbi +44 -28
- data/rbi/openai/models/realtime/realtime_audio_input_turn_detection.rbi +264 -203
- data/rbi/openai/models/realtime/realtime_session.rbi +306 -231
- data/rbi/openai/models/realtime/realtime_session_create_response.rbi +298 -232
- data/rbi/openai/models/realtime/realtime_transcription_session_audio_input.rbi +39 -28
- data/rbi/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbi +264 -200
- data/rbi/openai/models/responses/response.rbi +12 -12
- data/rbi/openai/models/responses/response_create_params.rbi +12 -12
- data/rbi/openai/resources/responses.rbi +8 -8
- data/sig/openai/models/realtime/realtime_audio_config_input.rbs +4 -8
- data/sig/openai/models/realtime/realtime_audio_input_turn_detection.rbs +91 -65
- data/sig/openai/models/realtime/realtime_session.rbs +95 -69
- data/sig/openai/models/realtime/realtime_session_create_response.rbs +95 -73
- data/sig/openai/models/realtime/realtime_transcription_session_audio_input.rbs +4 -8
- data/sig/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbs +91 -65
- metadata +2 -2
@@ -158,17 +158,20 @@ module OpenAI
|
|
158
158
|
# @!attribute turn_detection
|
159
159
|
# Configuration for turn detection, either Server VAD or Semantic VAD. This can be
|
160
160
|
# set to `null` to turn off, in which case the client must manually trigger model
|
161
|
-
# response.
|
162
|
-
#
|
163
|
-
#
|
164
|
-
#
|
165
|
-
#
|
166
|
-
#
|
167
|
-
#
|
168
|
-
#
|
169
|
-
#
|
170
|
-
#
|
171
|
-
|
161
|
+
# response.
|
162
|
+
#
|
163
|
+
# Server VAD means that the model will detect the start and end of speech based on
|
164
|
+
# audio volume and respond at the end of user speech.
|
165
|
+
#
|
166
|
+
# Semantic VAD is more advanced and uses a turn detection model (in conjunction
|
167
|
+
# with VAD) to semantically estimate whether the user has finished speaking, then
|
168
|
+
# dynamically sets a timeout based on this probability. For example, if user audio
|
169
|
+
# trails off with "uhhm", the model will score a low probability of turn end and
|
170
|
+
# wait longer for the user to continue speaking. This can be useful for more
|
171
|
+
# natural conversations, but may have a higher latency.
|
172
|
+
#
|
173
|
+
# @return [OpenAI::Models::Realtime::RealtimeSession::TurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad, nil]
|
174
|
+
optional :turn_detection, union: -> { OpenAI::Realtime::RealtimeSession::TurnDetection }, nil?: true
|
172
175
|
|
173
176
|
# @!attribute voice
|
174
177
|
# The voice the model uses to respond. Voice cannot be changed during the session
|
@@ -182,7 +185,7 @@ module OpenAI
|
|
182
185
|
# Some parameter documentation has been truncated, see
|
183
186
|
# {OpenAI::Models::Realtime::RealtimeSession} for more details.
|
184
187
|
#
|
185
|
-
# Realtime session object.
|
188
|
+
# Realtime session object for the beta interface.
|
186
189
|
#
|
187
190
|
# @param id [String] Unique identifier for the session that looks like `sess_1234567890abcdef`.
|
188
191
|
#
|
@@ -220,7 +223,7 @@ module OpenAI
|
|
220
223
|
#
|
221
224
|
# @param tracing [Symbol, :auto, OpenAI::Models::Realtime::RealtimeSession::Tracing::TracingConfiguration, nil] Configuration options for tracing. Set to null to disable tracing. Once
|
222
225
|
#
|
223
|
-
# @param turn_detection [OpenAI::Models::Realtime::RealtimeSession::TurnDetection, nil] Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
|
226
|
+
# @param turn_detection [OpenAI::Models::Realtime::RealtimeSession::TurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad, nil] Configuration for turn detection, either Server VAD or Semantic VAD. This can be
|
224
227
|
#
|
225
228
|
# @param voice [String, Symbol, OpenAI::Models::Realtime::RealtimeSession::Voice] The voice the model uses to respond. Voice cannot be changed during the
|
226
229
|
|
@@ -401,127 +404,185 @@ module OpenAI
|
|
401
404
|
# @return [Array(Symbol, :auto, OpenAI::Models::Realtime::RealtimeSession::Tracing::TracingConfiguration)]
|
402
405
|
end
|
403
406
|
|
407
|
+
# Configuration for turn detection, either Server VAD or Semantic VAD. This can be
|
408
|
+
# set to `null` to turn off, in which case the client must manually trigger model
|
409
|
+
# response.
|
410
|
+
#
|
411
|
+
# Server VAD means that the model will detect the start and end of speech based on
|
412
|
+
# audio volume and respond at the end of user speech.
|
413
|
+
#
|
414
|
+
# Semantic VAD is more advanced and uses a turn detection model (in conjunction
|
415
|
+
# with VAD) to semantically estimate whether the user has finished speaking, then
|
416
|
+
# dynamically sets a timeout based on this probability. For example, if user audio
|
417
|
+
# trails off with "uhhm", the model will score a low probability of turn end and
|
418
|
+
# wait longer for the user to continue speaking. This can be useful for more
|
419
|
+
# natural conversations, but may have a higher latency.
|
420
|
+
#
|
404
421
|
# @see OpenAI::Models::Realtime::RealtimeSession#turn_detection
|
405
|
-
|
406
|
-
|
407
|
-
# Whether or not to automatically generate a response when a VAD stop event
|
408
|
-
# occurs.
|
409
|
-
#
|
410
|
-
# @return [Boolean, nil]
|
411
|
-
optional :create_response, OpenAI::Internal::Type::Boolean
|
412
|
-
|
413
|
-
# @!attribute eagerness
|
414
|
-
# Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
|
415
|
-
# will wait longer for the user to continue speaking, `high` will respond more
|
416
|
-
# quickly. `auto` is the default and is equivalent to `medium`.
|
417
|
-
#
|
418
|
-
# @return [Symbol, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::Eagerness, nil]
|
419
|
-
optional :eagerness, enum: -> { OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness }
|
422
|
+
module TurnDetection
|
423
|
+
extend OpenAI::Internal::Type::Union
|
420
424
|
|
421
|
-
|
422
|
-
# Optional idle timeout after which turn detection will auto-timeout when no
|
423
|
-
# additional audio is received.
|
424
|
-
#
|
425
|
-
# @return [Integer, nil]
|
426
|
-
optional :idle_timeout_ms, Integer, nil?: true
|
425
|
+
discriminator :type
|
427
426
|
|
428
|
-
#
|
429
|
-
|
430
|
-
# the default conversation (i.e. `conversation` of `auto`) when a VAD start event
|
431
|
-
# occurs.
|
432
|
-
#
|
433
|
-
# @return [Boolean, nil]
|
434
|
-
optional :interrupt_response, OpenAI::Internal::Type::Boolean
|
427
|
+
# Server-side voice activity detection (VAD) which flips on when user speech is detected and off after a period of silence.
|
428
|
+
variant :server_vad, -> { OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad }
|
435
429
|
|
436
|
-
#
|
437
|
-
|
438
|
-
# detected speech (in milliseconds). Defaults to 300ms.
|
439
|
-
#
|
440
|
-
# @return [Integer, nil]
|
441
|
-
optional :prefix_padding_ms, Integer
|
430
|
+
# Server-side semantic turn detection which uses a model to determine when the user has finished speaking.
|
431
|
+
variant :semantic_vad, -> { OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad }
|
442
432
|
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
optional :silence_duration_ms, Integer
|
433
|
+
class ServerVad < OpenAI::Internal::Type::BaseModel
|
434
|
+
# @!attribute type
|
435
|
+
# Type of turn detection, `server_vad` to turn on simple Server VAD.
|
436
|
+
#
|
437
|
+
# @return [Symbol, :server_vad]
|
438
|
+
required :type, const: :server_vad
|
450
439
|
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
440
|
+
# @!attribute create_response
|
441
|
+
# Whether or not to automatically generate a response when a VAD stop event
|
442
|
+
# occurs.
|
443
|
+
#
|
444
|
+
# @return [Boolean, nil]
|
445
|
+
optional :create_response, OpenAI::Internal::Type::Boolean
|
446
|
+
|
447
|
+
# @!attribute idle_timeout_ms
|
448
|
+
# Optional timeout after which a model response will be triggered automatically.
|
449
|
+
# This is useful for situations in which a long pause from the user is unexpected,
|
450
|
+
# such as a phone call. The model will effectively prompt the user to continue the
|
451
|
+
# conversation based on the current context.
|
452
|
+
#
|
453
|
+
# The timeout value will be applied after the last model response's audio has
|
454
|
+
# finished playing, i.e. it's set to the `response.done` time plus audio playback
|
455
|
+
# duration.
|
456
|
+
#
|
457
|
+
# An `input_audio_buffer.timeout_triggered` event (plus events associated with the
|
458
|
+
# Response) will be emitted when the timeout is reached. Idle timeout is currently
|
459
|
+
# only supported for `server_vad` mode.
|
460
|
+
#
|
461
|
+
# @return [Integer, nil]
|
462
|
+
optional :idle_timeout_ms, Integer, nil?: true
|
458
463
|
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
+
# @!attribute interrupt_response
|
465
|
+
# Whether or not to automatically interrupt any ongoing response with output to
|
466
|
+
# the default conversation (i.e. `conversation` of `auto`) when a VAD start event
|
467
|
+
# occurs.
|
468
|
+
#
|
469
|
+
# @return [Boolean, nil]
|
470
|
+
optional :interrupt_response, OpenAI::Internal::Type::Boolean
|
464
471
|
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
# response. Server VAD means that the model will detect the start and end of
|
472
|
-
# speech based on audio volume and respond at the end of user speech. Semantic VAD
|
473
|
-
# is more advanced and uses a turn detection model (in conjunction with VAD) to
|
474
|
-
# semantically estimate whether the user has finished speaking, then dynamically
|
475
|
-
# sets a timeout based on this probability. For example, if user audio trails off
|
476
|
-
# with "uhhm", the model will score a low probability of turn end and wait longer
|
477
|
-
# for the user to continue speaking. This can be useful for more natural
|
478
|
-
# conversations, but may have a higher latency.
|
479
|
-
#
|
480
|
-
# @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs
|
481
|
-
#
|
482
|
-
# @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
|
483
|
-
#
|
484
|
-
# @param idle_timeout_ms [Integer, nil] Optional idle timeout after which turn detection will auto-timeout when
|
485
|
-
#
|
486
|
-
# @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th
|
487
|
-
#
|
488
|
-
# @param prefix_padding_ms [Integer] Used only for `server_vad` mode. Amount of audio to include before the VAD detec
|
489
|
-
#
|
490
|
-
# @param silence_duration_ms [Integer] Used only for `server_vad` mode. Duration of silence to detect speech stop (in m
|
491
|
-
#
|
492
|
-
# @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
|
493
|
-
#
|
494
|
-
# @param type [Symbol, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::Type] Type of turn detection.
|
472
|
+
# @!attribute prefix_padding_ms
|
473
|
+
# Used only for `server_vad` mode. Amount of audio to include before the VAD
|
474
|
+
# detected speech (in milliseconds). Defaults to 300ms.
|
475
|
+
#
|
476
|
+
# @return [Integer, nil]
|
477
|
+
optional :prefix_padding_ms, Integer
|
495
478
|
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
479
|
+
# @!attribute silence_duration_ms
|
480
|
+
# Used only for `server_vad` mode. Duration of silence to detect speech stop (in
|
481
|
+
# milliseconds). Defaults to 500ms. With shorter values the model will respond
|
482
|
+
# more quickly, but may jump in on short pauses from the user.
|
483
|
+
#
|
484
|
+
# @return [Integer, nil]
|
485
|
+
optional :silence_duration_ms, Integer
|
503
486
|
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
487
|
+
# @!attribute threshold
|
488
|
+
# Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
|
489
|
+
# defaults to 0.5. A higher threshold will require louder audio to activate the
|
490
|
+
# model, and thus might perform better in noisy environments.
|
491
|
+
#
|
492
|
+
# @return [Float, nil]
|
493
|
+
optional :threshold, Float
|
508
494
|
|
509
|
-
# @!method
|
510
|
-
#
|
495
|
+
# @!method initialize(create_response: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: :server_vad)
|
496
|
+
# Some parameter documentation has been truncated, see
|
497
|
+
# {OpenAI::Models::Realtime::RealtimeSession::TurnDetection::ServerVad} for more
|
498
|
+
# details.
|
499
|
+
#
|
500
|
+
# Server-side voice activity detection (VAD) which flips on when user speech is
|
501
|
+
# detected and off after a period of silence.
|
502
|
+
#
|
503
|
+
# @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs
|
504
|
+
#
|
505
|
+
# @param idle_timeout_ms [Integer, nil] Optional timeout after which a model response will be triggered automatically. T
|
506
|
+
#
|
507
|
+
# @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th
|
508
|
+
#
|
509
|
+
# @param prefix_padding_ms [Integer] Used only for `server_vad` mode. Amount of audio to include before the VAD detec
|
510
|
+
#
|
511
|
+
# @param silence_duration_ms [Integer] Used only for `server_vad` mode. Duration of silence to detect speech stop (in m
|
512
|
+
#
|
513
|
+
# @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
|
514
|
+
#
|
515
|
+
# @param type [Symbol, :server_vad] Type of turn detection, `server_vad` to turn on simple Server VAD.
|
511
516
|
end
|
512
517
|
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
+
class SemanticVad < OpenAI::Internal::Type::BaseModel
|
519
|
+
# @!attribute type
|
520
|
+
# Type of turn detection, `semantic_vad` to turn on Semantic VAD.
|
521
|
+
#
|
522
|
+
# @return [Symbol, :semantic_vad]
|
523
|
+
required :type, const: :semantic_vad
|
524
|
+
|
525
|
+
# @!attribute create_response
|
526
|
+
# Whether or not to automatically generate a response when a VAD stop event
|
527
|
+
# occurs.
|
528
|
+
#
|
529
|
+
# @return [Boolean, nil]
|
530
|
+
optional :create_response, OpenAI::Internal::Type::Boolean
|
531
|
+
|
532
|
+
# @!attribute eagerness
|
533
|
+
# Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
|
534
|
+
# will wait longer for the user to continue speaking, `high` will respond more
|
535
|
+
# quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
|
536
|
+
# and `high` have max timeouts of 8s, 4s, and 2s respectively.
|
537
|
+
#
|
538
|
+
# @return [Symbol, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness, nil]
|
539
|
+
optional :eagerness, enum: -> { OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness }
|
518
540
|
|
519
|
-
|
520
|
-
|
541
|
+
# @!attribute interrupt_response
|
542
|
+
# Whether or not to automatically interrupt any ongoing response with output to
|
543
|
+
# the default conversation (i.e. `conversation` of `auto`) when a VAD start event
|
544
|
+
# occurs.
|
545
|
+
#
|
546
|
+
# @return [Boolean, nil]
|
547
|
+
optional :interrupt_response, OpenAI::Internal::Type::Boolean
|
521
548
|
|
522
|
-
# @!method
|
523
|
-
#
|
549
|
+
# @!method initialize(create_response: nil, eagerness: nil, interrupt_response: nil, type: :semantic_vad)
|
550
|
+
# Some parameter documentation has been truncated, see
|
551
|
+
# {OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad} for more
|
552
|
+
# details.
|
553
|
+
#
|
554
|
+
# Server-side semantic turn detection which uses a model to determine when the
|
555
|
+
# user has finished speaking.
|
556
|
+
#
|
557
|
+
# @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs
|
558
|
+
#
|
559
|
+
# @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
|
560
|
+
#
|
561
|
+
# @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th
|
562
|
+
#
|
563
|
+
# @param type [Symbol, :semantic_vad] Type of turn detection, `semantic_vad` to turn on Semantic VAD.
|
564
|
+
|
565
|
+
# Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
|
566
|
+
# will wait longer for the user to continue speaking, `high` will respond more
|
567
|
+
# quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
|
568
|
+
# and `high` have max timeouts of 8s, 4s, and 2s respectively.
|
569
|
+
#
|
570
|
+
# @see OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad#eagerness
|
571
|
+
module Eagerness
|
572
|
+
extend OpenAI::Internal::Type::Enum
|
573
|
+
|
574
|
+
LOW = :low
|
575
|
+
MEDIUM = :medium
|
576
|
+
HIGH = :high
|
577
|
+
AUTO = :auto
|
578
|
+
|
579
|
+
# @!method self.values
|
580
|
+
# @return [Array<Symbol>]
|
581
|
+
end
|
524
582
|
end
|
583
|
+
|
584
|
+
# @!method self.variants
|
585
|
+
# @return [Array(OpenAI::Models::Realtime::RealtimeSession::TurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad)]
|
525
586
|
end
|
526
587
|
|
527
588
|
# The voice the model uses to respond. Voice cannot be changed during the session
|