openai 0.23.1 → 0.23.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/README.md +1 -1
  4. data/lib/openai/models/realtime/input_audio_buffer_timeout_triggered.rb +25 -5
  5. data/lib/openai/models/realtime/realtime_audio_config_input.rb +14 -11
  6. data/lib/openai/models/realtime/realtime_audio_input_turn_detection.rb +173 -117
  7. data/lib/openai/models/realtime/realtime_server_event.rb +13 -1
  8. data/lib/openai/models/realtime/realtime_session.rb +179 -118
  9. data/lib/openai/models/realtime/realtime_session_create_response.rb +184 -122
  10. data/lib/openai/models/realtime/realtime_transcription_session_audio_input.rb +16 -11
  11. data/lib/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rb +175 -117
  12. data/lib/openai/models/responses/response.rb +8 -8
  13. data/lib/openai/models/responses/response_create_params.rb +8 -8
  14. data/lib/openai/version.rb +1 -1
  15. data/rbi/openai/models/realtime/input_audio_buffer_timeout_triggered.rbi +24 -5
  16. data/rbi/openai/models/realtime/realtime_audio_config_input.rbi +44 -28
  17. data/rbi/openai/models/realtime/realtime_audio_input_turn_detection.rbi +264 -203
  18. data/rbi/openai/models/realtime/realtime_session.rbi +306 -231
  19. data/rbi/openai/models/realtime/realtime_session_create_response.rbi +298 -232
  20. data/rbi/openai/models/realtime/realtime_transcription_session_audio_input.rbi +39 -28
  21. data/rbi/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbi +264 -200
  22. data/rbi/openai/models/responses/response.rbi +12 -12
  23. data/rbi/openai/models/responses/response_create_params.rbi +12 -12
  24. data/rbi/openai/resources/responses.rbi +8 -8
  25. data/sig/openai/models/realtime/realtime_audio_config_input.rbs +4 -8
  26. data/sig/openai/models/realtime/realtime_audio_input_turn_detection.rbs +91 -65
  27. data/sig/openai/models/realtime/realtime_session.rbs +95 -69
  28. data/sig/openai/models/realtime/realtime_session_create_response.rbs +95 -73
  29. data/sig/openai/models/realtime/realtime_transcription_session_audio_input.rbs +4 -8
  30. data/sig/openai/models/realtime/realtime_transcription_session_audio_input_turn_detection.rbs +91 -65
  31. metadata +2 -2
@@ -158,17 +158,20 @@ module OpenAI
158
158
  # @!attribute turn_detection
159
159
  # Configuration for turn detection, either Server VAD or Semantic VAD. This can be
160
160
  # set to `null` to turn off, in which case the client must manually trigger model
161
- # response. Server VAD means that the model will detect the start and end of
162
- # speech based on audio volume and respond at the end of user speech. Semantic VAD
163
- # is more advanced and uses a turn detection model (in conjunction with VAD) to
164
- # semantically estimate whether the user has finished speaking, then dynamically
165
- # sets a timeout based on this probability. For example, if user audio trails off
166
- # with "uhhm", the model will score a low probability of turn end and wait longer
167
- # for the user to continue speaking. This can be useful for more natural
168
- # conversations, but may have a higher latency.
169
- #
170
- # @return [OpenAI::Models::Realtime::RealtimeSession::TurnDetection, nil]
171
- optional :turn_detection, -> { OpenAI::Realtime::RealtimeSession::TurnDetection }, nil?: true
161
+ # response.
162
+ #
163
+ # Server VAD means that the model will detect the start and end of speech based on
164
+ # audio volume and respond at the end of user speech.
165
+ #
166
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
167
+ # with VAD) to semantically estimate whether the user has finished speaking, then
168
+ # dynamically sets a timeout based on this probability. For example, if user audio
169
+ # trails off with "uhhm", the model will score a low probability of turn end and
170
+ # wait longer for the user to continue speaking. This can be useful for more
171
+ # natural conversations, but may have a higher latency.
172
+ #
173
+ # @return [OpenAI::Models::Realtime::RealtimeSession::TurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad, nil]
174
+ optional :turn_detection, union: -> { OpenAI::Realtime::RealtimeSession::TurnDetection }, nil?: true
172
175
 
173
176
  # @!attribute voice
174
177
  # The voice the model uses to respond. Voice cannot be changed during the session
@@ -182,7 +185,7 @@ module OpenAI
182
185
  # Some parameter documentation has been truncated, see
183
186
  # {OpenAI::Models::Realtime::RealtimeSession} for more details.
184
187
  #
185
- # Realtime session object.
188
+ # Realtime session object for the beta interface.
186
189
  #
187
190
  # @param id [String] Unique identifier for the session that looks like `sess_1234567890abcdef`.
188
191
  #
@@ -220,7 +223,7 @@ module OpenAI
220
223
  #
221
224
  # @param tracing [Symbol, :auto, OpenAI::Models::Realtime::RealtimeSession::Tracing::TracingConfiguration, nil] Configuration options for tracing. Set to null to disable tracing. Once
222
225
  #
223
- # @param turn_detection [OpenAI::Models::Realtime::RealtimeSession::TurnDetection, nil] Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
226
+ # @param turn_detection [OpenAI::Models::Realtime::RealtimeSession::TurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad, nil] Configuration for turn detection, either Server VAD or Semantic VAD. This can be
224
227
  #
225
228
  # @param voice [String, Symbol, OpenAI::Models::Realtime::RealtimeSession::Voice] The voice the model uses to respond. Voice cannot be changed during the
226
229
 
@@ -401,127 +404,185 @@ module OpenAI
401
404
  # @return [Array(Symbol, :auto, OpenAI::Models::Realtime::RealtimeSession::Tracing::TracingConfiguration)]
402
405
  end
403
406
 
407
+ # Configuration for turn detection, either Server VAD or Semantic VAD. This can be
408
+ # set to `null` to turn off, in which case the client must manually trigger model
409
+ # response.
410
+ #
411
+ # Server VAD means that the model will detect the start and end of speech based on
412
+ # audio volume and respond at the end of user speech.
413
+ #
414
+ # Semantic VAD is more advanced and uses a turn detection model (in conjunction
415
+ # with VAD) to semantically estimate whether the user has finished speaking, then
416
+ # dynamically sets a timeout based on this probability. For example, if user audio
417
+ # trails off with "uhhm", the model will score a low probability of turn end and
418
+ # wait longer for the user to continue speaking. This can be useful for more
419
+ # natural conversations, but may have a higher latency.
420
+ #
404
421
  # @see OpenAI::Models::Realtime::RealtimeSession#turn_detection
405
- class TurnDetection < OpenAI::Internal::Type::BaseModel
406
- # @!attribute create_response
407
- # Whether or not to automatically generate a response when a VAD stop event
408
- # occurs.
409
- #
410
- # @return [Boolean, nil]
411
- optional :create_response, OpenAI::Internal::Type::Boolean
412
-
413
- # @!attribute eagerness
414
- # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
415
- # will wait longer for the user to continue speaking, `high` will respond more
416
- # quickly. `auto` is the default and is equivalent to `medium`.
417
- #
418
- # @return [Symbol, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::Eagerness, nil]
419
- optional :eagerness, enum: -> { OpenAI::Realtime::RealtimeSession::TurnDetection::Eagerness }
422
+ module TurnDetection
423
+ extend OpenAI::Internal::Type::Union
420
424
 
421
- # @!attribute idle_timeout_ms
422
- # Optional idle timeout after which turn detection will auto-timeout when no
423
- # additional audio is received.
424
- #
425
- # @return [Integer, nil]
426
- optional :idle_timeout_ms, Integer, nil?: true
425
+ discriminator :type
427
426
 
428
- # @!attribute interrupt_response
429
- # Whether or not to automatically interrupt any ongoing response with output to
430
- # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
431
- # occurs.
432
- #
433
- # @return [Boolean, nil]
434
- optional :interrupt_response, OpenAI::Internal::Type::Boolean
427
+ # Server-side voice activity detection (VAD) which flips on when user speech is detected and off after a period of silence.
428
+ variant :server_vad, -> { OpenAI::Realtime::RealtimeSession::TurnDetection::ServerVad }
435
429
 
436
- # @!attribute prefix_padding_ms
437
- # Used only for `server_vad` mode. Amount of audio to include before the VAD
438
- # detected speech (in milliseconds). Defaults to 300ms.
439
- #
440
- # @return [Integer, nil]
441
- optional :prefix_padding_ms, Integer
430
+ # Server-side semantic turn detection which uses a model to determine when the user has finished speaking.
431
+ variant :semantic_vad, -> { OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad }
442
432
 
443
- # @!attribute silence_duration_ms
444
- # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
445
- # milliseconds). Defaults to 500ms. With shorter values the model will respond
446
- # more quickly, but may jump in on short pauses from the user.
447
- #
448
- # @return [Integer, nil]
449
- optional :silence_duration_ms, Integer
433
+ class ServerVad < OpenAI::Internal::Type::BaseModel
434
+ # @!attribute type
435
+ # Type of turn detection, `server_vad` to turn on simple Server VAD.
436
+ #
437
+ # @return [Symbol, :server_vad]
438
+ required :type, const: :server_vad
450
439
 
451
- # @!attribute threshold
452
- # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
453
- # defaults to 0.5. A higher threshold will require louder audio to activate the
454
- # model, and thus might perform better in noisy environments.
455
- #
456
- # @return [Float, nil]
457
- optional :threshold, Float
440
+ # @!attribute create_response
441
+ # Whether or not to automatically generate a response when a VAD stop event
442
+ # occurs.
443
+ #
444
+ # @return [Boolean, nil]
445
+ optional :create_response, OpenAI::Internal::Type::Boolean
446
+
447
+ # @!attribute idle_timeout_ms
448
+ # Optional timeout after which a model response will be triggered automatically.
449
+ # This is useful for situations in which a long pause from the user is unexpected,
450
+ # such as a phone call. The model will effectively prompt the user to continue the
451
+ # conversation based on the current context.
452
+ #
453
+ # The timeout value will be applied after the last model response's audio has
454
+ # finished playing, i.e. it's set to the `response.done` time plus audio playback
455
+ # duration.
456
+ #
457
+ # An `input_audio_buffer.timeout_triggered` event (plus events associated with the
458
+ # Response) will be emitted when the timeout is reached. Idle timeout is currently
459
+ # only supported for `server_vad` mode.
460
+ #
461
+ # @return [Integer, nil]
462
+ optional :idle_timeout_ms, Integer, nil?: true
458
463
 
459
- # @!attribute type
460
- # Type of turn detection.
461
- #
462
- # @return [Symbol, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::Type, nil]
463
- optional :type, enum: -> { OpenAI::Realtime::RealtimeSession::TurnDetection::Type }
464
+ # @!attribute interrupt_response
465
+ # Whether or not to automatically interrupt any ongoing response with output to
466
+ # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
467
+ # occurs.
468
+ #
469
+ # @return [Boolean, nil]
470
+ optional :interrupt_response, OpenAI::Internal::Type::Boolean
464
471
 
465
- # @!method initialize(create_response: nil, eagerness: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: nil)
466
- # Some parameter documentations has been truncated, see
467
- # {OpenAI::Models::Realtime::RealtimeSession::TurnDetection} for more details.
468
- #
469
- # Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
470
- # set to `null` to turn off, in which case the client must manually trigger model
471
- # response. Server VAD means that the model will detect the start and end of
472
- # speech based on audio volume and respond at the end of user speech. Semantic VAD
473
- # is more advanced and uses a turn detection model (in conjunction with VAD) to
474
- # semantically estimate whether the user has finished speaking, then dynamically
475
- # sets a timeout based on this probability. For example, if user audio trails off
476
- # with "uhhm", the model will score a low probability of turn end and wait longer
477
- # for the user to continue speaking. This can be useful for more natural
478
- # conversations, but may have a higher latency.
479
- #
480
- # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs
481
- #
482
- # @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
483
- #
484
- # @param idle_timeout_ms [Integer, nil] Optional idle timeout after which turn detection will auto-timeout when
485
- #
486
- # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th
487
- #
488
- # @param prefix_padding_ms [Integer] Used only for `server_vad` mode. Amount of audio to include before the VAD detec
489
- #
490
- # @param silence_duration_ms [Integer] Used only for `server_vad` mode. Duration of silence to detect speech stop (in m
491
- #
492
- # @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
493
- #
494
- # @param type [Symbol, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::Type] Type of turn detection.
472
+ # @!attribute prefix_padding_ms
473
+ # Used only for `server_vad` mode. Amount of audio to include before the VAD
474
+ # detected speech (in milliseconds). Defaults to 300ms.
475
+ #
476
+ # @return [Integer, nil]
477
+ optional :prefix_padding_ms, Integer
495
478
 
496
- # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
497
- # will wait longer for the user to continue speaking, `high` will respond more
498
- # quickly. `auto` is the default and is equivalent to `medium`.
499
- #
500
- # @see OpenAI::Models::Realtime::RealtimeSession::TurnDetection#eagerness
501
- module Eagerness
502
- extend OpenAI::Internal::Type::Enum
479
+ # @!attribute silence_duration_ms
480
+ # Used only for `server_vad` mode. Duration of silence to detect speech stop (in
481
+ # milliseconds). Defaults to 500ms. With shorter values the model will respond
482
+ # more quickly, but may jump in on short pauses from the user.
483
+ #
484
+ # @return [Integer, nil]
485
+ optional :silence_duration_ms, Integer
503
486
 
504
- LOW = :low
505
- MEDIUM = :medium
506
- HIGH = :high
507
- AUTO = :auto
487
+ # @!attribute threshold
488
+ # Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
489
+ # defaults to 0.5. A higher threshold will require louder audio to activate the
490
+ # model, and thus might perform better in noisy environments.
491
+ #
492
+ # @return [Float, nil]
493
+ optional :threshold, Float
508
494
 
509
- # @!method self.values
510
- # @return [Array<Symbol>]
495
+ # @!method initialize(create_response: nil, idle_timeout_ms: nil, interrupt_response: nil, prefix_padding_ms: nil, silence_duration_ms: nil, threshold: nil, type: :server_vad)
496
+ # Some parameter documentation has been truncated, see
497
+ # {OpenAI::Models::Realtime::RealtimeSession::TurnDetection::ServerVad} for more
498
+ # details.
499
+ #
500
+ # Server-side voice activity detection (VAD) which flips on when user speech is
501
+ # detected and off after a period of silence.
502
+ #
503
+ # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs
504
+ #
505
+ # @param idle_timeout_ms [Integer, nil] Optional timeout after which a model response will be triggered automatically. T
506
+ #
507
+ # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th
508
+ #
509
+ # @param prefix_padding_ms [Integer] Used only for `server_vad` mode. Amount of audio to include before the VAD detec
510
+ #
511
+ # @param silence_duration_ms [Integer] Used only for `server_vad` mode. Duration of silence to detect speech stop (in m
512
+ #
513
+ # @param threshold [Float] Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
514
+ #
515
+ # @param type [Symbol, :server_vad] Type of turn detection, `server_vad` to turn on simple Server VAD.
511
516
  end
512
517
 
513
- # Type of turn detection.
514
- #
515
- # @see OpenAI::Models::Realtime::RealtimeSession::TurnDetection#type
516
- module Type
517
- extend OpenAI::Internal::Type::Enum
518
+ class SemanticVad < OpenAI::Internal::Type::BaseModel
519
+ # @!attribute type
520
+ # Type of turn detection, `semantic_vad` to turn on Semantic VAD.
521
+ #
522
+ # @return [Symbol, :semantic_vad]
523
+ required :type, const: :semantic_vad
524
+
525
+ # @!attribute create_response
526
+ # Whether or not to automatically generate a response when a VAD stop event
527
+ # occurs.
528
+ #
529
+ # @return [Boolean, nil]
530
+ optional :create_response, OpenAI::Internal::Type::Boolean
531
+
532
+ # @!attribute eagerness
533
+ # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
534
+ # will wait longer for the user to continue speaking, `high` will respond more
535
+ # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
536
+ # and `high` have max timeouts of 8s, 4s, and 2s respectively.
537
+ #
538
+ # @return [Symbol, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness, nil]
539
+ optional :eagerness, enum: -> { OpenAI::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness }
518
540
 
519
- SERVER_VAD = :server_vad
520
- SEMANTIC_VAD = :semantic_vad
541
+ # @!attribute interrupt_response
542
+ # Whether or not to automatically interrupt any ongoing response with output to
543
+ # the default conversation (i.e. `conversation` of `auto`) when a VAD start event
544
+ # occurs.
545
+ #
546
+ # @return [Boolean, nil]
547
+ optional :interrupt_response, OpenAI::Internal::Type::Boolean
521
548
 
522
- # @!method self.values
523
- # @return [Array<Symbol>]
549
+ # @!method initialize(create_response: nil, eagerness: nil, interrupt_response: nil, type: :semantic_vad)
550
+ # Some parameter documentation has been truncated, see
551
+ # {OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad} for more
552
+ # details.
553
+ #
554
+ # Server-side semantic turn detection which uses a model to determine when the
555
+ # user has finished speaking.
556
+ #
557
+ # @param create_response [Boolean] Whether or not to automatically generate a response when a VAD stop event occurs
558
+ #
559
+ # @param eagerness [Symbol, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad::Eagerness] Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
560
+ #
561
+ # @param interrupt_response [Boolean] Whether or not to automatically interrupt any ongoing response with output to th
562
+ #
563
+ # @param type [Symbol, :semantic_vad] Type of turn detection, `semantic_vad` to turn on Semantic VAD.
564
+
565
+ # Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
566
+ # will wait longer for the user to continue speaking, `high` will respond more
567
+ # quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
568
+ # and `high` have max timeouts of 8s, 4s, and 2s respectively.
569
+ #
570
+ # @see OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad#eagerness
571
+ module Eagerness
572
+ extend OpenAI::Internal::Type::Enum
573
+
574
+ LOW = :low
575
+ MEDIUM = :medium
576
+ HIGH = :high
577
+ AUTO = :auto
578
+
579
+ # @!method self.values
580
+ # @return [Array<Symbol>]
581
+ end
524
582
  end
583
+
584
+ # @!method self.variants
585
+ # @return [Array(OpenAI::Models::Realtime::RealtimeSession::TurnDetection::ServerVad, OpenAI::Models::Realtime::RealtimeSession::TurnDetection::SemanticVad)]
525
586
  end
526
587
 
527
588
  # The voice the model uses to respond. Voice cannot be changed during the session