openai 0.31.0 → 0.32.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +14 -0
  3. data/README.md +1 -1
  4. data/lib/openai/internal/util.rb +5 -5
  5. data/lib/openai/models/audio/transcription_create_params.rb +42 -11
  6. data/lib/openai/models/audio/transcription_create_response.rb +4 -1
  7. data/lib/openai/models/audio/transcription_diarized.rb +160 -0
  8. data/lib/openai/models/audio/transcription_diarized_segment.rb +65 -0
  9. data/lib/openai/models/audio/transcription_stream_event.rb +7 -4
  10. data/lib/openai/models/audio/transcription_text_delta_event.rb +10 -1
  11. data/lib/openai/models/audio/transcription_text_segment_event.rb +63 -0
  12. data/lib/openai/models/audio_model.rb +1 -0
  13. data/lib/openai/models/audio_response_format.rb +5 -2
  14. data/lib/openai/models/realtime/audio_transcription.rb +8 -6
  15. data/lib/openai/models/vector_store_create_params.rb +10 -1
  16. data/lib/openai/resources/audio/transcriptions.rb +12 -4
  17. data/lib/openai/resources/vector_stores.rb +3 -1
  18. data/lib/openai/version.rb +1 -1
  19. data/lib/openai.rb +3 -0
  20. data/rbi/openai/models/audio/transcription_create_params.rbi +66 -16
  21. data/rbi/openai/models/audio/transcription_create_response.rbi +1 -0
  22. data/rbi/openai/models/audio/transcription_diarized.rbi +281 -0
  23. data/rbi/openai/models/audio/transcription_diarized_segment.rbi +87 -0
  24. data/rbi/openai/models/audio/transcription_stream_event.rbi +4 -3
  25. data/rbi/openai/models/audio/transcription_text_delta_event.rbi +14 -1
  26. data/rbi/openai/models/audio/transcription_text_segment_event.rbi +86 -0
  27. data/rbi/openai/models/audio_model.rbi +2 -0
  28. data/rbi/openai/models/audio_response_format.rbi +6 -2
  29. data/rbi/openai/models/realtime/audio_transcription.rbi +15 -12
  30. data/rbi/openai/models/vector_store_create_params.rbi +13 -0
  31. data/rbi/openai/resources/audio/transcriptions.rbi +52 -14
  32. data/rbi/openai/resources/vector_stores.rbi +4 -0
  33. data/sig/openai/models/audio/transcription_create_params.rbs +14 -0
  34. data/sig/openai/models/audio/transcription_create_response.rbs +3 -1
  35. data/sig/openai/models/audio/transcription_diarized.rbs +129 -0
  36. data/sig/openai/models/audio/transcription_diarized_segment.rbs +47 -0
  37. data/sig/openai/models/audio/transcription_stream_event.rbs +2 -1
  38. data/sig/openai/models/audio/transcription_text_delta_event.rbs +9 -2
  39. data/sig/openai/models/audio/transcription_text_segment_event.rbs +47 -0
  40. data/sig/openai/models/audio_model.rbs +5 -1
  41. data/sig/openai/models/audio_response_format.rbs +3 -1
  42. data/sig/openai/models/realtime/audio_transcription.rbs +2 -2
  43. data/sig/openai/models/vector_store_create_params.rbs +7 -0
  44. data/sig/openai/resources/audio/transcriptions.rbs +4 -0
  45. data/sig/openai/resources/vector_stores.rbs +1 -0
  46. metadata +11 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 2c2acb13f76000aa282621830c5724ffe7ceebf464e7cd4d58b4d8b99d6ddd5d
- data.tar.gz: e55e41d0e9dd00daa7ab4dd6409ea8f4fc68c91ccf09b8660725634ab563c56b
+ metadata.gz: 93eb98c219356a4e424930695d90aad0d6100f2d7547416e1ab30ccc7527e69a
+ data.tar.gz: 6fe4f2e35ec1f2ce294eb14d92fd8fff1e872d1e2f2c93e33b0f0a7014bdda6b
  SHA512:
- metadata.gz: ffb82dfdebeca385d0f584be09ea4bbf5ec79e0031209f362a93fc1167f78b04ddc2743e77ad18f87f9e9b1a33909f9d532fb5229379139479ef09317b263289
- data.tar.gz: dd09d9967b4cb16fd70bbbcc153ab34bf144a1dd3e922100f3f92fa597a3756fcb112763392745934b647d61ba48925070e55889a09a0b94ee89ff435f326cc7
+ metadata.gz: e13c1e23f3e17b8a8cbf52696205fbe81ab4d2d369d61bb327acaf9dad77673ab8adf636b109a117bd8ccf9fb619081a5e63d37493f85cfe5839a52a78653024
+ data.tar.gz: 1ace0e832f9f9542b5d8bdd968d8cf127b9501d074846e66edcd265e362fbcef5a83a7a59d5feb4a064ab4e76c1271109070cbf2da9e3dc2d249d79691ca395e
data/CHANGELOG.md CHANGED
@@ -1,5 +1,19 @@
  # Changelog

+ ## 0.32.0 (2025-10-16)
+
+ Full Changelog: [v0.31.0...v0.32.0](https://github.com/openai/openai-ruby/compare/v0.31.0...v0.32.0)
+
+ ### Features
+
+ * **api:** Add support for gpt-4o-transcribe-diarize on audio/transcriptions endpoint ([b31bd7f](https://github.com/openai/openai-ruby/commit/b31bd7f20ca702160873fa26ab39479fd8102f85))
+
+
+ ### Bug Fixes
+
+ * absolutely qualified uris should always override the default ([14fdff8](https://github.com/openai/openai-ruby/commit/14fdff8de533a1002c64c9086016777a1e152a97))
+ * should not reuse buffers for `IO.copy_stream` interop ([8f33de1](https://github.com/openai/openai-ruby/commit/8f33de18bb104d5003a4d459ad244c0813e5a07e))
+
  ## 0.31.0 (2025-10-10)

  Full Changelog: [v0.30.0...v0.31.0](https://github.com/openai/openai-ruby/compare/v0.30.0...v0.31.0)
data/README.md CHANGED
@@ -15,7 +15,7 @@ To use this gem, install via Bundler by adding the following to your application
  <!-- x-release-please-start-version -->

  ```ruby
- gem "openai", "~> 0.31.0"
+ gem "openai", "~> 0.32.0"
  ```

  <!-- x-release-please-end -->
data/lib/openai/internal/util.rb CHANGED
@@ -346,8 +346,9 @@ module OpenAI
  base_path, base_query = lhs.fetch_values(:path, :query)
  slashed = base_path.end_with?("/") ? base_path : "#{base_path}/"

- parsed_path, parsed_query = parse_uri(rhs.fetch(:path)).fetch_values(:path, :query)
- override = URI::Generic.build(**rhs.slice(:scheme, :host, :port), path: parsed_path)
+ merged = {**parse_uri(rhs.fetch(:path)), **rhs.except(:path, :query)}
+ parsed_path, parsed_query = merged.fetch_values(:path, :query)
+ override = URI::Generic.build(**merged.slice(:scheme, :host, :port), path: parsed_path)

  joined = URI.join(URI::Generic.build(lhs.except(:path, :query)), slashed, override)
  query = deep_merge(
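This change makes a request path that is itself an absolutely qualified URI win over the client's default base URL (the first bug fix in the changelog). A rough sketch of the intended semantics using only the stdlib `URI` (the `resolve` helper and URLs are illustrative, not the gem's internals):

```ruby
require "uri"

# Merge an override URI over a default base: if the override carries its own
# host (i.e. it is absolutely qualified), it replaces the base entirely;
# otherwise it is joined onto the base as a relative path.
def resolve(base, request_path)
  parsed = URI.parse(request_path)
  parsed.host ? parsed : URI.join(base, request_path)
end

base = URI.parse("https://api.openai.com/v1/")
puts resolve(base, "audio/transcriptions")          # => https://api.openai.com/v1/audio/transcriptions
puts resolve(base, "https://example.test/override") # => https://example.test/override
```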
@@ -473,10 +474,9 @@ module OpenAI
  # @return [Enumerable<String>]
  def writable_enum(&blk)
  Enumerator.new do |y|
- buf = String.new
  y.define_singleton_method(:write) do
- self << buf.replace(_1)
- buf.bytesize
+ self << _1.dup
+ _1.bytesize
  end

  blk.call(y)
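The fix above stops funnelling every write through one shared `buf` that was mutated in place: `IO.copy_stream` also reuses its read buffer, so a consumer that retained a previously yielded chunk saw it clobbered by the next write. A minimal re-creation of the fixed behavior (a standalone sketch simplified from the gem's internals, not its actual method):

```ruby
require "stringio"

# Yields each written chunk as an enumerator element. Dup-ing the chunk is
# essential: IO.copy_stream may pass the same (reused) String buffer to every
# write call, so yielding it directly would alias all chunks together.
def writable_enum(&blk)
  Enumerator.new do |y|
    y.define_singleton_method(:write) do |chunk|
      self << chunk.dup  # copy before yielding; the caller may reuse `chunk`
      chunk.bytesize     # report bytes written, as IO.copy_stream expects
    end
    blk.call(y)
  end
end

chunks = writable_enum do |writer|
  IO.copy_stream(StringIO.new("a" * 20_000), writer)
end.to_a

puts chunks.sum(&:bytesize) # => 20000, with every chunk intact
```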
data/lib/openai/models/audio/transcription_create_params.rb CHANGED
@@ -19,8 +19,8 @@ module OpenAI

  # @!attribute model
  # ID of the model to use. The options are `gpt-4o-transcribe`,
- # `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
- # Whisper V2 model).
+ # `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
+ # Whisper V2 model), and `gpt-4o-transcribe-diarize`.
  #
  # @return [String, Symbol, OpenAI::Models::AudioModel]
  required :model, union: -> { OpenAI::Audio::TranscriptionCreateParams::Model }
@@ -30,6 +30,8 @@ module OpenAI
  # first normalizes loudness and then uses voice activity detection (VAD) to choose
  # boundaries. `server_vad` object can be provided to tweak VAD detection
  # parameters manually. If unset, the audio is transcribed as a single block.
+ # Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
+ # seconds.
  #
  # @return [Symbol, :auto, OpenAI::Models::Audio::TranscriptionCreateParams::ChunkingStrategy::VadConfig, nil]
  optional :chunking_strategy,
@@ -41,11 +43,30 @@ module OpenAI
  # return the log probabilities of the tokens in the response to understand the
  # model's confidence in the transcription. `logprobs` only works with
  # response_format set to `json` and only with the models `gpt-4o-transcribe` and
- # `gpt-4o-mini-transcribe`.
+ # `gpt-4o-mini-transcribe`. This field is not supported when using
+ # `gpt-4o-transcribe-diarize`.
  #
  # @return [Array<Symbol, OpenAI::Models::Audio::TranscriptionInclude>, nil]
  optional :include, -> { OpenAI::Internal::Type::ArrayOf[enum: OpenAI::Audio::TranscriptionInclude] }

+ # @!attribute known_speaker_names
+ # Optional list of speaker names that correspond to the audio samples provided in
+ # `known_speaker_references[]`. Each entry should be a short identifier (for
+ # example `customer` or `agent`). Up to 4 speakers are supported.
+ #
+ # @return [Array<String>, nil]
+ optional :known_speaker_names, OpenAI::Internal::Type::ArrayOf[String]
+
+ # @!attribute known_speaker_references
+ # Optional list of audio samples (as
+ # [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
+ # that contain known speaker references matching `known_speaker_names[]`. Each
+ # sample must be between 2 and 10 seconds, and can use any of the same input audio
+ # formats supported by `file`.
+ #
+ # @return [Array<String>, nil]
+ optional :known_speaker_references, OpenAI::Internal::Type::ArrayOf[String]
+
  # @!attribute language
  # The language of the input audio. Supplying the input language in
  # [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
@@ -58,15 +79,18 @@ module OpenAI
  # An optional text to guide the model's style or continue a previous audio
  # segment. The
  # [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
- # should match the audio language.
+ # should match the audio language. This field is not supported when using
+ # `gpt-4o-transcribe-diarize`.
  #
  # @return [String, nil]
  optional :prompt, String

  # @!attribute response_format
  # The format of the output, in one of these options: `json`, `text`, `srt`,
- # `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
- # the only supported format is `json`.
+ # `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+ # `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+ # `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+ # `diarized_json`, with `diarized_json` required to receive speaker annotations.
  #
  # @return [Symbol, OpenAI::Models::AudioResponseFormat, nil]
  optional :response_format, enum: -> { OpenAI::AudioResponseFormat }
@@ -86,13 +110,14 @@ module OpenAI
  # `response_format` must be set `verbose_json` to use timestamp granularities.
  # Either or both of these options are supported: `word`, or `segment`. Note: There
  # is no additional latency for segment timestamps, but generating word timestamps
- # incurs additional latency.
+ # incurs additional latency. This option is not available for
+ # `gpt-4o-transcribe-diarize`.
  #
  # @return [Array<Symbol, OpenAI::Models::Audio::TranscriptionCreateParams::TimestampGranularity>, nil]
  optional :timestamp_granularities,
  -> { OpenAI::Internal::Type::ArrayOf[enum: OpenAI::Audio::TranscriptionCreateParams::TimestampGranularity] }

- # @!method initialize(file:, model:, chunking_strategy: nil, include: nil, language: nil, prompt: nil, response_format: nil, temperature: nil, timestamp_granularities: nil, request_options: {})
+ # @!method initialize(file:, model:, chunking_strategy: nil, include: nil, known_speaker_names: nil, known_speaker_references: nil, language: nil, prompt: nil, response_format: nil, temperature: nil, timestamp_granularities: nil, request_options: {})
  # Some parameter documentations has been truncated, see
  # {OpenAI::Models::Audio::TranscriptionCreateParams} for more details.
  #
@@ -104,6 +129,10 @@ module OpenAI
  #
  # @param include [Array<Symbol, OpenAI::Models::Audio::TranscriptionInclude>] Additional information to include in the transcription response.
  #
+ # @param known_speaker_names [Array<String>] Optional list of speaker names that correspond to the audio samples provided in
+ #
+ # @param known_speaker_references [Array<String>] Optional list of audio samples (as [data URLs](https://developer.mozilla.org/en-
+ #
  # @param language [String] The language of the input audio. Supplying the input language in [ISO-639-1](htt
  #
  # @param prompt [String] An optional text to guide the model's style or continue a previous audio segment
@@ -117,14 +146,14 @@ module OpenAI
  # @param request_options [OpenAI::RequestOptions, Hash{Symbol=>Object}]

  # ID of the model to use. The options are `gpt-4o-transcribe`,
- # `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
- # Whisper V2 model).
+ # `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
+ # Whisper V2 model), and `gpt-4o-transcribe-diarize`.
  module Model
  extend OpenAI::Internal::Type::Union

  variant String

- # ID of the model to use. The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source Whisper V2 model).
+ # ID of the model to use. The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source Whisper V2 model), and `gpt-4o-transcribe-diarize`.
  variant enum: -> { OpenAI::AudioModel }

  # @!method self.variants
@@ -135,6 +164,8 @@ module OpenAI
  # first normalizes loudness and then uses voice activity detection (VAD) to choose
  # boundaries. `server_vad` object can be provided to tweak VAD detection
  # parameters manually. If unset, the audio is transcribed as a single block.
+ # Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
+ # seconds.
  module ChunkingStrategy
  extend OpenAI::Internal::Type::Union

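Together, the new parameters enable diarized requests with pinned speaker labels. A hedged usage sketch against this release (the audio file names and surrounding setup are hypothetical; `wav` is just one accepted input format):

```ruby
require "openai"
require "base64"
require "pathname"

client = OpenAI::Client.new(api_key: ENV["OPENAI_API_KEY"])

# Hypothetical local files: a meeting recording plus 2-10 second reference clips.
to_data_url = ->(path) { "data:audio/wav;base64,#{Base64.strict_encode64(File.binread(path))}" }

transcription = client.audio.transcriptions.create(
  file: Pathname("meeting.wav"),
  model: :"gpt-4o-transcribe-diarize",
  response_format: :diarized_json,         # required to receive speaker annotations
  chunking_strategy: :auto,                # required for inputs longer than 30 seconds
  known_speaker_names: %w[agent customer], # up to 4 speakers
  known_speaker_references: [to_data_url.call("agent.wav"), to_data_url.call("customer.wav")]
)
```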
data/lib/openai/models/audio/transcription_create_response.rb CHANGED
@@ -15,11 +15,14 @@ module OpenAI
  # Represents a transcription response returned by model, based on the provided input.
  variant -> { OpenAI::Audio::Transcription }

+ # Represents a diarized transcription response returned by the model, including the combined transcript and speaker-segment annotations.
+ variant -> { OpenAI::Audio::TranscriptionDiarized }
+
  # Represents a verbose json transcription response returned by model, based on the provided input.
  variant -> { OpenAI::Audio::TranscriptionVerbose }

  # @!method self.variants
- # @return [Array(OpenAI::Models::Audio::Transcription, OpenAI::Models::Audio::TranscriptionVerbose)]
+ # @return [Array(OpenAI::Models::Audio::Transcription, OpenAI::Models::Audio::TranscriptionDiarized, OpenAI::Models::Audio::TranscriptionVerbose)]
  end
  end
  end
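Because `create` can now return any of the three variants, callers can branch on the concrete class. A short sketch, continuing the hypothetical request above:

```ruby
case transcription
when OpenAI::Models::Audio::TranscriptionDiarized
  transcription.segments.each do |seg|
    # `end` is exposed as `end_` (it maps to api_name: :end, see the segment model below)
    puts format("[%.2f-%.2f] %s: %s", seg.start, seg.end_, seg.speaker, seg.text)
  end
else
  puts transcription.text # plain or verbose transcription
end
```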
data/lib/openai/models/audio/transcription_diarized.rb ADDED
@@ -0,0 +1,160 @@
+ # frozen_string_literal: true
+
+ module OpenAI
+ module Models
+ module Audio
+ class TranscriptionDiarized < OpenAI::Internal::Type::BaseModel
+ # @!attribute duration
+ # Duration of the input audio in seconds.
+ #
+ # @return [Float]
+ required :duration, Float
+
+ # @!attribute segments
+ # Segments of the transcript annotated with timestamps and speaker labels.
+ #
+ # @return [Array<OpenAI::Models::Audio::TranscriptionDiarizedSegment>]
+ required :segments, -> { OpenAI::Internal::Type::ArrayOf[OpenAI::Audio::TranscriptionDiarizedSegment] }
+
+ # @!attribute task
+ # The type of task that was run. Always `transcribe`.
+ #
+ # @return [Symbol, :transcribe]
+ required :task, const: :transcribe
+
+ # @!attribute text
+ # The concatenated transcript text for the entire audio input.
+ #
+ # @return [String]
+ required :text, String
+
+ # @!attribute usage
+ # Token or duration usage statistics for the request.
+ #
+ # @return [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens, OpenAI::Models::Audio::TranscriptionDiarized::Usage::Duration, nil]
+ optional :usage, union: -> { OpenAI::Audio::TranscriptionDiarized::Usage }
+
+ # @!method initialize(duration:, segments:, text:, usage: nil, task: :transcribe)
+ # Represents a diarized transcription response returned by the model, including
+ # the combined transcript and speaker-segment annotations.
+ #
+ # @param duration [Float] Duration of the input audio in seconds.
+ #
+ # @param segments [Array<OpenAI::Models::Audio::TranscriptionDiarizedSegment>] Segments of the transcript annotated with timestamps and speaker labels.
+ #
+ # @param text [String] The concatenated transcript text for the entire audio input.
+ #
+ # @param usage [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens, OpenAI::Models::Audio::TranscriptionDiarized::Usage::Duration] Token or duration usage statistics for the request.
+ #
+ # @param task [Symbol, :transcribe] The type of task that was run. Always `transcribe`.
+
+ # Token or duration usage statistics for the request.
+ #
+ # @see OpenAI::Models::Audio::TranscriptionDiarized#usage
+ module Usage
+ extend OpenAI::Internal::Type::Union
+
+ discriminator :type
+
+ # Usage statistics for models billed by token usage.
+ variant :tokens, -> { OpenAI::Audio::TranscriptionDiarized::Usage::Tokens }
+
+ # Usage statistics for models billed by audio input duration.
+ variant :duration, -> { OpenAI::Audio::TranscriptionDiarized::Usage::Duration }
+
+ class Tokens < OpenAI::Internal::Type::BaseModel
+ # @!attribute input_tokens
+ # Number of input tokens billed for this request.
+ #
+ # @return [Integer]
+ required :input_tokens, Integer
+
+ # @!attribute output_tokens
+ # Number of output tokens generated.
+ #
+ # @return [Integer]
+ required :output_tokens, Integer
+
+ # @!attribute total_tokens
+ # Total number of tokens used (input + output).
+ #
+ # @return [Integer]
+ required :total_tokens, Integer
+
+ # @!attribute type
+ # The type of the usage object. Always `tokens` for this variant.
+ #
+ # @return [Symbol, :tokens]
+ required :type, const: :tokens
+
+ # @!attribute input_token_details
+ # Details about the input tokens billed for this request.
+ #
+ # @return [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails, nil]
+ optional :input_token_details,
+ -> { OpenAI::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails }
+
+ # @!method initialize(input_tokens:, output_tokens:, total_tokens:, input_token_details: nil, type: :tokens)
+ # Usage statistics for models billed by token usage.
+ #
+ # @param input_tokens [Integer] Number of input tokens billed for this request.
+ #
+ # @param output_tokens [Integer] Number of output tokens generated.
+ #
+ # @param total_tokens [Integer] Total number of tokens used (input + output).
+ #
+ # @param input_token_details [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails] Details about the input tokens billed for this request.
+ #
+ # @param type [Symbol, :tokens] The type of the usage object. Always `tokens` for this variant.
+
+ # @see OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens#input_token_details
+ class InputTokenDetails < OpenAI::Internal::Type::BaseModel
+ # @!attribute audio_tokens
+ # Number of audio tokens billed for this request.
+ #
+ # @return [Integer, nil]
+ optional :audio_tokens, Integer
+
+ # @!attribute text_tokens
+ # Number of text tokens billed for this request.
+ #
+ # @return [Integer, nil]
+ optional :text_tokens, Integer
+
+ # @!method initialize(audio_tokens: nil, text_tokens: nil)
+ # Details about the input tokens billed for this request.
+ #
+ # @param audio_tokens [Integer] Number of audio tokens billed for this request.
+ #
+ # @param text_tokens [Integer] Number of text tokens billed for this request.
+ end
+ end
+
+ class Duration < OpenAI::Internal::Type::BaseModel
+ # @!attribute seconds
+ # Duration of the input audio in seconds.
+ #
+ # @return [Float]
+ required :seconds, Float
+
+ # @!attribute type
+ # The type of the usage object. Always `duration` for this variant.
+ #
+ # @return [Symbol, :duration]
+ required :type, const: :duration
+
+ # @!method initialize(seconds:, type: :duration)
+ # Usage statistics for models billed by audio input duration.
+ #
+ # @param seconds [Float] Duration of the input audio in seconds.
+ #
+ # @param type [Symbol, :duration] The type of the usage object. Always `duration` for this variant.
+ end
+
+ # @!method self.variants
+ # @return [Array(OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens, OpenAI::Models::Audio::TranscriptionDiarized::Usage::Duration)]
+ end
+ end
+ end
+ end
+ end
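The `usage` union declared above discriminates on `type`, so it can be handled the same way as the response union; a brief sketch:

```ruby
usage = transcription.usage if transcription.is_a?(OpenAI::Models::Audio::TranscriptionDiarized)

case usage
when OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens
  puts "billed #{usage.total_tokens} tokens (audio: #{usage.input_token_details&.audio_tokens})"
when OpenAI::Models::Audio::TranscriptionDiarized::Usage::Duration
  puts "billed #{usage.seconds} seconds of audio"
end # nil usage falls through
```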
data/lib/openai/models/audio/transcription_diarized_segment.rb ADDED
@@ -0,0 +1,65 @@
+ # frozen_string_literal: true
+
+ module OpenAI
+ module Models
+ module Audio
+ class TranscriptionDiarizedSegment < OpenAI::Internal::Type::BaseModel
+ # @!attribute id
+ # Unique identifier for the segment.
+ #
+ # @return [String]
+ required :id, String
+
+ # @!attribute end_
+ # End timestamp of the segment in seconds.
+ #
+ # @return [Float]
+ required :end_, Float, api_name: :end
+
+ # @!attribute speaker
+ # Speaker label for this segment. When known speakers are provided, the label
+ # matches `known_speaker_names[]`. Otherwise speakers are labeled sequentially
+ # using capital letters (`A`, `B`, ...).
+ #
+ # @return [String]
+ required :speaker, String
+
+ # @!attribute start
+ # Start timestamp of the segment in seconds.
+ #
+ # @return [Float]
+ required :start, Float
+
+ # @!attribute text
+ # Transcript text for this segment.
+ #
+ # @return [String]
+ required :text, String
+
+ # @!attribute type
+ # The type of the segment. Always `transcript.text.segment`.
+ #
+ # @return [Symbol, :"transcript.text.segment"]
+ required :type, const: :"transcript.text.segment"
+
+ # @!method initialize(id:, end_:, speaker:, start:, text:, type: :"transcript.text.segment")
+ # Some parameter documentations has been truncated, see
+ # {OpenAI::Models::Audio::TranscriptionDiarizedSegment} for more details.
+ #
+ # A segment of diarized transcript text with speaker metadata.
+ #
+ # @param id [String] Unique identifier for the segment.
+ #
+ # @param end_ [Float] End timestamp of the segment in seconds.
+ #
+ # @param speaker [String] Speaker label for this segment. When known speakers are provided, the label matc
+ #
+ # @param start [Float] Start timestamp of the segment in seconds.
+ #
+ # @param text [String] Transcript text for this segment.
+ #
+ # @param type [Symbol, :"transcript.text.segment"] The type of the segment. Always `transcript.text.segment`.
+ end
+ end
+ end
+ end
data/lib/openai/models/audio/transcription_stream_event.rb CHANGED
@@ -3,15 +3,18 @@
  module OpenAI
  module Models
  module Audio
- # Emitted when there is an additional text delta. This is also the first event
- # emitted when the transcription starts. Only emitted when you
+ # Emitted when a diarized transcription returns a completed segment with speaker
+ # information. Only emitted when you
  # [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
- # with the `Stream` parameter set to `true`.
+ # with `stream` set to `true` and `response_format` set to `diarized_json`.
  module TranscriptionStreamEvent
  extend OpenAI::Internal::Type::Union

  discriminator :type

+ # Emitted when a diarized transcription returns a completed segment with speaker information. Only emitted when you [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription) with `stream` set to `true` and `response_format` set to `diarized_json`.
+ variant :"transcript.text.segment", -> { OpenAI::Audio::TranscriptionTextSegmentEvent }
+
  # Emitted when there is an additional text delta. This is also the first event emitted when the transcription starts. Only emitted when you [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription) with the `Stream` parameter set to `true`.
  variant :"transcript.text.delta", -> { OpenAI::Audio::TranscriptionTextDeltaEvent }

@@ -19,7 +22,7 @@ module OpenAI
  variant :"transcript.text.done", -> { OpenAI::Audio::TranscriptionTextDoneEvent }

  # @!method self.variants
- # @return [Array(OpenAI::Models::Audio::TranscriptionTextDeltaEvent, OpenAI::Models::Audio::TranscriptionTextDoneEvent)]
+ # @return [Array(OpenAI::Models::Audio::TranscriptionTextSegmentEvent, OpenAI::Models::Audio::TranscriptionTextDeltaEvent, OpenAI::Models::Audio::TranscriptionTextDoneEvent)]
  end
  end
  end
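Streaming consumers will now see segment events interleaved with the existing delta/done events. A sketch, assuming `create_streaming` as the streaming counterpart to `create` in this SDK and reusing the hypothetical `client` and file from earlier:

```ruby
stream = client.audio.transcriptions.create_streaming(
  file: Pathname("meeting.wav"),
  model: :"gpt-4o-transcribe-diarize",
  response_format: :diarized_json # segment events require this format
)

stream.each do |event|
  case event
  when OpenAI::Models::Audio::TranscriptionTextSegmentEvent
    puts "#{event.speaker} [#{event.start}-#{event.end_}]: #{event.text}"
  when OpenAI::Models::Audio::TranscriptionTextDeltaEvent
    print event.delta # carries segment_id when using gpt-4o-transcribe-diarize
  when OpenAI::Models::Audio::TranscriptionTextDoneEvent
    puts
  end
end
```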
data/lib/openai/models/audio/transcription_text_delta_event.rb CHANGED
@@ -25,7 +25,14 @@ module OpenAI
  optional :logprobs,
  -> { OpenAI::Internal::Type::ArrayOf[OpenAI::Audio::TranscriptionTextDeltaEvent::Logprob] }

- # @!method initialize(delta:, logprobs: nil, type: :"transcript.text.delta")
+ # @!attribute segment_id
+ # Identifier of the diarized segment that this delta belongs to. Only present when
+ # using `gpt-4o-transcribe-diarize`.
+ #
+ # @return [String, nil]
+ optional :segment_id, String
+
+ # @!method initialize(delta:, logprobs: nil, segment_id: nil, type: :"transcript.text.delta")
  # Some parameter documentations has been truncated, see
  # {OpenAI::Models::Audio::TranscriptionTextDeltaEvent} for more details.
  #
@@ -38,6 +45,8 @@ module OpenAI
  #
  # @param logprobs [Array<OpenAI::Models::Audio::TranscriptionTextDeltaEvent::Logprob>] The log probabilities of the delta. Only included if you [create a transcription
  #
+ # @param segment_id [String] Identifier of the diarized segment that this delta belongs to. Only present when
+ #
  # @param type [Symbol, :"transcript.text.delta"] The type of the event. Always `transcript.text.delta`.

  class Logprob < OpenAI::Internal::Type::BaseModel
data/lib/openai/models/audio/transcription_text_segment_event.rb ADDED
@@ -0,0 +1,63 @@
+ # frozen_string_literal: true
+
+ module OpenAI
+ module Models
+ module Audio
+ class TranscriptionTextSegmentEvent < OpenAI::Internal::Type::BaseModel
+ # @!attribute id
+ # Unique identifier for the segment.
+ #
+ # @return [String]
+ required :id, String
+
+ # @!attribute end_
+ # End timestamp of the segment in seconds.
+ #
+ # @return [Float]
+ required :end_, Float, api_name: :end
+
+ # @!attribute speaker
+ # Speaker label for this segment.
+ #
+ # @return [String]
+ required :speaker, String
+
+ # @!attribute start
+ # Start timestamp of the segment in seconds.
+ #
+ # @return [Float]
+ required :start, Float
+
+ # @!attribute text
+ # Transcript text for this segment.
+ #
+ # @return [String]
+ required :text, String
+
+ # @!attribute type
+ # The type of the event. Always `transcript.text.segment`.
+ #
+ # @return [Symbol, :"transcript.text.segment"]
+ required :type, const: :"transcript.text.segment"
+
+ # @!method initialize(id:, end_:, speaker:, start:, text:, type: :"transcript.text.segment")
+ # Emitted when a diarized transcription returns a completed segment with speaker
+ # information. Only emitted when you
+ # [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
+ # with `stream` set to `true` and `response_format` set to `diarized_json`.
+ #
+ # @param id [String] Unique identifier for the segment.
+ #
+ # @param end_ [Float] End timestamp of the segment in seconds.
+ #
+ # @param speaker [String] Speaker label for this segment.
+ #
+ # @param start [Float] Start timestamp of the segment in seconds.
+ #
+ # @param text [String] Transcript text for this segment.
+ #
+ # @param type [Symbol, :"transcript.text.segment"] The type of the event. Always `transcript.text.segment`.
+ end
+ end
+ end
+ end
data/lib/openai/models/audio_model.rb CHANGED
@@ -8,6 +8,7 @@ module OpenAI
  WHISPER_1 = :"whisper-1"
  GPT_4O_TRANSCRIBE = :"gpt-4o-transcribe"
  GPT_4O_MINI_TRANSCRIBE = :"gpt-4o-mini-transcribe"
+ GPT_4O_TRANSCRIBE_DIARIZE = :"gpt-4o-transcribe-diarize"

  # @!method self.values
  # @return [Array<Symbol>]
data/lib/openai/models/audio_response_format.rb CHANGED
@@ -3,8 +3,10 @@
  module OpenAI
  module Models
  # The format of the output, in one of these options: `json`, `text`, `srt`,
- # `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
- # the only supported format is `json`.
+ # `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+ # `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+ # `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+ # `diarized_json`, with `diarized_json` required to receive speaker annotations.
  module AudioResponseFormat
  extend OpenAI::Internal::Type::Enum

@@ -13,6 +15,7 @@ module OpenAI
  SRT = :srt
  VERBOSE_JSON = :verbose_json
  VTT = :vtt
+ DIARIZED_JSON = :diarized_json

  # @!method self.values
  # @return [Array<Symbol>]
data/lib/openai/models/realtime/audio_transcription.rb CHANGED
@@ -14,7 +14,8 @@ module OpenAI

  # @!attribute model
  # The model to use for transcription. Current options are `whisper-1`,
- # `gpt-4o-transcribe-latest`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`.
+ # `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`.
+ # Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.
  #
  # @return [Symbol, OpenAI::Models::Realtime::AudioTranscription::Model, nil]
  optional :model, enum: -> { OpenAI::Realtime::AudioTranscription::Model }
@@ -23,8 +24,8 @@ module OpenAI
  # An optional text to guide the model's style or continue a previous audio
  # segment. For `whisper-1`, the
  # [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
- # For `gpt-4o-transcribe` models, the prompt is a free text string, for example
- # "expect words related to technology".
+ # For `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the
+ # prompt is a free text string, for example "expect words related to technology".
  #
  # @return [String, nil]
  optional :prompt, String
@@ -35,21 +36,22 @@ module OpenAI
  #
  # @param language [String] The language of the input audio. Supplying the input language in
  #
- # @param model [Symbol, OpenAI::Models::Realtime::AudioTranscription::Model] The model to use for transcription. Current options are `whisper-1`, `gpt-4o-tra
+ # @param model [Symbol, OpenAI::Models::Realtime::AudioTranscription::Model] The model to use for transcription. Current options are `whisper-1`, `gpt-4o-min
  #
  # @param prompt [String] An optional text to guide the model's style or continue a previous audio

  # The model to use for transcription. Current options are `whisper-1`,
- # `gpt-4o-transcribe-latest`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`.
+ # `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`.
+ # Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.
  #
  # @see OpenAI::Models::Realtime::AudioTranscription#model
  module Model
  extend OpenAI::Internal::Type::Enum

  WHISPER_1 = :"whisper-1"
- GPT_4O_TRANSCRIBE_LATEST = :"gpt-4o-transcribe-latest"
  GPT_4O_MINI_TRANSCRIBE = :"gpt-4o-mini-transcribe"
  GPT_4O_TRANSCRIBE = :"gpt-4o-transcribe"
+ GPT_4O_TRANSCRIBE_DIARIZE = :"gpt-4o-transcribe-diarize"

  # @!method self.values
  # @return [Array<Symbol>]
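On the realtime side, the reshuffled enum means diarization can be selected in a session's input-audio transcription config. A minimal sketch (direct `.new` construction shown for illustration; `prompt` is omitted because it is not supported by the diarize model):

```ruby
transcription_config = OpenAI::Models::Realtime::AudioTranscription.new(
  model: :"gpt-4o-transcribe-diarize", # note :"gpt-4o-transcribe-latest" was removed in this release
  language: "en"
)
```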