openai 0.31.0 → 0.32.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/README.md +1 -1
- data/lib/openai/internal/util.rb +5 -5
- data/lib/openai/models/audio/transcription_create_params.rb +42 -11
- data/lib/openai/models/audio/transcription_create_response.rb +4 -1
- data/lib/openai/models/audio/transcription_diarized.rb +160 -0
- data/lib/openai/models/audio/transcription_diarized_segment.rb +65 -0
- data/lib/openai/models/audio/transcription_stream_event.rb +7 -4
- data/lib/openai/models/audio/transcription_text_delta_event.rb +10 -1
- data/lib/openai/models/audio/transcription_text_segment_event.rb +63 -0
- data/lib/openai/models/audio_model.rb +1 -0
- data/lib/openai/models/audio_response_format.rb +5 -2
- data/lib/openai/models/realtime/audio_transcription.rb +8 -6
- data/lib/openai/models/vector_store_create_params.rb +10 -1
- data/lib/openai/resources/audio/transcriptions.rb +12 -4
- data/lib/openai/resources/vector_stores.rb +3 -1
- data/lib/openai/version.rb +1 -1
- data/lib/openai.rb +3 -0
- data/rbi/openai/models/audio/transcription_create_params.rbi +66 -16
- data/rbi/openai/models/audio/transcription_create_response.rbi +1 -0
- data/rbi/openai/models/audio/transcription_diarized.rbi +281 -0
- data/rbi/openai/models/audio/transcription_diarized_segment.rbi +87 -0
- data/rbi/openai/models/audio/transcription_stream_event.rbi +4 -3
- data/rbi/openai/models/audio/transcription_text_delta_event.rbi +14 -1
- data/rbi/openai/models/audio/transcription_text_segment_event.rbi +86 -0
- data/rbi/openai/models/audio_model.rbi +2 -0
- data/rbi/openai/models/audio_response_format.rbi +6 -2
- data/rbi/openai/models/realtime/audio_transcription.rbi +15 -12
- data/rbi/openai/models/vector_store_create_params.rbi +13 -0
- data/rbi/openai/resources/audio/transcriptions.rbi +52 -14
- data/rbi/openai/resources/vector_stores.rbi +4 -0
- data/sig/openai/models/audio/transcription_create_params.rbs +14 -0
- data/sig/openai/models/audio/transcription_create_response.rbs +3 -1
- data/sig/openai/models/audio/transcription_diarized.rbs +129 -0
- data/sig/openai/models/audio/transcription_diarized_segment.rbs +47 -0
- data/sig/openai/models/audio/transcription_stream_event.rbs +2 -1
- data/sig/openai/models/audio/transcription_text_delta_event.rbs +9 -2
- data/sig/openai/models/audio/transcription_text_segment_event.rbs +47 -0
- data/sig/openai/models/audio_model.rbs +5 -1
- data/sig/openai/models/audio_response_format.rbs +3 -1
- data/sig/openai/models/realtime/audio_transcription.rbs +2 -2
- data/sig/openai/models/vector_store_create_params.rbs +7 -0
- data/sig/openai/resources/audio/transcriptions.rbs +4 -0
- data/sig/openai/resources/vector_stores.rbs +1 -0
- metadata +11 -2
checksums.yaml
CHANGED

```diff
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 93eb98c219356a4e424930695d90aad0d6100f2d7547416e1ab30ccc7527e69a
+  data.tar.gz: 6fe4f2e35ec1f2ce294eb14d92fd8fff1e872d1e2f2c93e33b0f0a7014bdda6b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e13c1e23f3e17b8a8cbf52696205fbe81ab4d2d369d61bb327acaf9dad77673ab8adf636b109a117bd8ccf9fb619081a5e63d37493f85cfe5839a52a78653024
+  data.tar.gz: 1ace0e832f9f9542b5d8bdd968d8cf127b9501d074846e66edcd265e362fbcef5a83a7a59d5feb4a064ab4e76c1271109070cbf2da9e3dc2d249d79691ca395e
```
data/CHANGELOG.md
CHANGED

```diff
@@ -1,5 +1,19 @@
 # Changelog
 
+## 0.32.0 (2025-10-16)
+
+Full Changelog: [v0.31.0...v0.32.0](https://github.com/openai/openai-ruby/compare/v0.31.0...v0.32.0)
+
+### Features
+
+* **api:** Add support for gpt-4o-transcribe-diarize on audio/transcriptions endpoint ([b31bd7f](https://github.com/openai/openai-ruby/commit/b31bd7f20ca702160873fa26ab39479fd8102f85))
+
+
+### Bug Fixes
+
+* absolutely qualified uris should always override the default ([14fdff8](https://github.com/openai/openai-ruby/commit/14fdff8de533a1002c64c9086016777a1e152a97))
+* should not reuse buffers for `IO.copy_stream` interop ([8f33de1](https://github.com/openai/openai-ruby/commit/8f33de18bb104d5003a4d459ad244c0813e5a07e))
+
 ## 0.31.0 (2025-10-10)
 
 Full Changelog: [v0.30.0...v0.31.0](https://github.com/openai/openai-ruby/compare/v0.30.0...v0.31.0)
```
data/README.md
CHANGED
data/lib/openai/internal/util.rb
CHANGED

```diff
@@ -346,8 +346,9 @@ module OpenAI
       base_path, base_query = lhs.fetch_values(:path, :query)
       slashed = base_path.end_with?("/") ? base_path : "#{base_path}/"
 
-
-
+      merged = {**parse_uri(rhs.fetch(:path)), **rhs.except(:path, :query)}
+      parsed_path, parsed_query = merged.fetch_values(:path, :query)
+      override = URI::Generic.build(**merged.slice(:scheme, :host, :port), path: parsed_path)
 
       joined = URI.join(URI::Generic.build(lhs.except(:path, :query)), slashed, override)
       query = deep_merge(
@@ -473,10 +474,9 @@ module OpenAI
       # @return [Enumerable<String>]
       def writable_enum(&blk)
         Enumerator.new do |y|
-          buf = String.new
           y.define_singleton_method(:write) do
-            self <<
-
+            self << _1.dup
+            _1.bytesize
           end
 
           blk.call(y)
```
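The second hunk is the `IO.copy_stream` interop fix from the changelog: `IO.copy_stream` may hand the same internal read buffer to the sink's `write` on every chunk, so storing the chunk by reference can leave every stored element pointing at whatever was read last. A standalone sketch of the hazard the fix guards against (plain Ruby stdlib, not the gem's code):

```ruby
require "stringio"

chunks = []
sink = Object.new
# IO.copy_stream only requires that the destination respond to `write`
# and return the number of bytes it consumed.
sink.define_singleton_method(:write) do |chunk|
  chunks << chunk.dup # without `.dup`, `chunk` may alias a buffer reused by the next read
  chunk.bytesize
end

IO.copy_stream(StringIO.new("abc" * 10_000), sink)
puts chunks.sum(&:bytesize) # => 30000
```

Duplicating each chunk (`_1.dup`) before appending it to the enumerator is the per-chunk-allocation fix the gem adopts in `writable_enum`.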
data/lib/openai/models/audio/transcription_create_params.rb
CHANGED

```diff
@@ -19,8 +19,8 @@ module OpenAI
 
       # @!attribute model
       #   ID of the model to use. The options are `gpt-4o-transcribe`,
-      #   `gpt-4o-mini-transcribe`,
-      #   Whisper V2 model)
+      #   `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
+      #   Whisper V2 model), and `gpt-4o-transcribe-diarize`.
       #
       #   @return [String, Symbol, OpenAI::Models::AudioModel]
       required :model, union: -> { OpenAI::Audio::TranscriptionCreateParams::Model }
@@ -30,6 +30,8 @@ module OpenAI
       #   first normalizes loudness and then uses voice activity detection (VAD) to choose
       #   boundaries. `server_vad` object can be provided to tweak VAD detection
       #   parameters manually. If unset, the audio is transcribed as a single block.
+      #   Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
+      #   seconds.
       #
       #   @return [Symbol, :auto, OpenAI::Models::Audio::TranscriptionCreateParams::ChunkingStrategy::VadConfig, nil]
       optional :chunking_strategy,
@@ -41,11 +43,30 @@ module OpenAI
       #   return the log probabilities of the tokens in the response to understand the
       #   model's confidence in the transcription. `logprobs` only works with
       #   response_format set to `json` and only with the models `gpt-4o-transcribe` and
-      #   `gpt-4o-mini-transcribe`.
+      #   `gpt-4o-mini-transcribe`. This field is not supported when using
+      #   `gpt-4o-transcribe-diarize`.
       #
       #   @return [Array<Symbol, OpenAI::Models::Audio::TranscriptionInclude>, nil]
       optional :include, -> { OpenAI::Internal::Type::ArrayOf[enum: OpenAI::Audio::TranscriptionInclude] }
 
+      # @!attribute known_speaker_names
+      #   Optional list of speaker names that correspond to the audio samples provided in
+      #   `known_speaker_references[]`. Each entry should be a short identifier (for
+      #   example `customer` or `agent`). Up to 4 speakers are supported.
+      #
+      #   @return [Array<String>, nil]
+      optional :known_speaker_names, OpenAI::Internal::Type::ArrayOf[String]
+
+      # @!attribute known_speaker_references
+      #   Optional list of audio samples (as
+      #   [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
+      #   that contain known speaker references matching `known_speaker_names[]`. Each
+      #   sample must be between 2 and 10 seconds, and can use any of the same input audio
+      #   formats supported by `file`.
+      #
+      #   @return [Array<String>, nil]
+      optional :known_speaker_references, OpenAI::Internal::Type::ArrayOf[String]
+
       # @!attribute language
       #   The language of the input audio. Supplying the input language in
       #   [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
@@ -58,15 +79,18 @@ module OpenAI
       #   An optional text to guide the model's style or continue a previous audio
       #   segment. The
       #   [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
-      #   should match the audio language.
+      #   should match the audio language. This field is not supported when using
+      #   `gpt-4o-transcribe-diarize`.
       #
       #   @return [String, nil]
       optional :prompt, String
 
       # @!attribute response_format
       #   The format of the output, in one of these options: `json`, `text`, `srt`,
-      #   `verbose_json`, or `
-      #   the only supported format is `json`.
+      #   `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+      #   `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+      #   `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+      #   `diarized_json`, with `diarized_json` required to receive speaker annotations.
       #
       #   @return [Symbol, OpenAI::Models::AudioResponseFormat, nil]
       optional :response_format, enum: -> { OpenAI::AudioResponseFormat }
@@ -86,13 +110,14 @@ module OpenAI
       #   `response_format` must be set `verbose_json` to use timestamp granularities.
       #   Either or both of these options are supported: `word`, or `segment`. Note: There
       #   is no additional latency for segment timestamps, but generating word timestamps
-      #   incurs additional latency.
+      #   incurs additional latency. This option is not available for
+      #   `gpt-4o-transcribe-diarize`.
       #
       #   @return [Array<Symbol, OpenAI::Models::Audio::TranscriptionCreateParams::TimestampGranularity>, nil]
       optional :timestamp_granularities,
                -> { OpenAI::Internal::Type::ArrayOf[enum: OpenAI::Audio::TranscriptionCreateParams::TimestampGranularity] }
 
-      # @!method initialize(file:, model:, chunking_strategy: nil, include: nil, language: nil, prompt: nil, response_format: nil, temperature: nil, timestamp_granularities: nil, request_options: {})
+      # @!method initialize(file:, model:, chunking_strategy: nil, include: nil, known_speaker_names: nil, known_speaker_references: nil, language: nil, prompt: nil, response_format: nil, temperature: nil, timestamp_granularities: nil, request_options: {})
       #   Some parameter documentations has been truncated, see
       #   {OpenAI::Models::Audio::TranscriptionCreateParams} for more details.
       #
@@ -104,6 +129,10 @@ module OpenAI
       #
       #   @param include [Array<Symbol, OpenAI::Models::Audio::TranscriptionInclude>] Additional information to include in the transcription response.
       #
+      #   @param known_speaker_names [Array<String>] Optional list of speaker names that correspond to the audio samples provided in
+      #
+      #   @param known_speaker_references [Array<String>] Optional list of audio samples (as [data URLs](https://developer.mozilla.org/en-
+      #
       #   @param language [String] The language of the input audio. Supplying the input language in [ISO-639-1](htt
       #
       #   @param prompt [String] An optional text to guide the model's style or continue a previous audio segment
@@ -117,14 +146,14 @@ module OpenAI
       #   @param request_options [OpenAI::RequestOptions, Hash{Symbol=>Object}]
 
       # ID of the model to use. The options are `gpt-4o-transcribe`,
-      # `gpt-4o-mini-transcribe`,
-      # Whisper V2 model)
+      # `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
+      # Whisper V2 model), and `gpt-4o-transcribe-diarize`.
       module Model
         extend OpenAI::Internal::Type::Union
 
         variant String
 
-        # ID of the model to use. The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`,
+        # ID of the model to use. The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source Whisper V2 model), and `gpt-4o-transcribe-diarize`.
         variant enum: -> { OpenAI::AudioModel }
 
         # @!method self.variants
@@ -135,6 +164,8 @@ module OpenAI
       # first normalizes loudness and then uses voice activity detection (VAD) to choose
       # boundaries. `server_vad` object can be provided to tweak VAD detection
       # parameters manually. If unset, the audio is transcribed as a single block.
+      # Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
+      # seconds.
       module ChunkingStrategy
         extend OpenAI::Internal::Type::Union
 
```
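Put together, a diarized request over the new parameter surface looks like the sketch below. The file path, speaker names, and the two `*_data_url` variables are illustrative placeholders; `response_format: :diarized_json` is what unlocks speaker annotations, and `chunking_strategy` is required once the diarize model's input exceeds 30 seconds:

```ruby
require "openai"
require "pathname"

client = OpenAI::Client.new # reads OPENAI_API_KEY from the environment

transcription = client.audio.transcriptions.create(
  file: Pathname("meeting.wav"),           # hypothetical input file
  model: :"gpt-4o-transcribe-diarize",
  response_format: :diarized_json,         # required to receive speaker annotations
  chunking_strategy: :auto,                # required for inputs longer than 30 seconds
  known_speaker_names: %w[agent customer], # up to 4 short identifiers
  known_speaker_references: [agent_data_url, customer_data_url] # 2-10s samples as data: URLs
)
```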
data/lib/openai/models/audio/transcription_create_response.rb
CHANGED

```diff
@@ -15,11 +15,14 @@
       # Represents a transcription response returned by model, based on the provided input.
       variant -> { OpenAI::Audio::Transcription }
 
+      # Represents a diarized transcription response returned by the model, including the combined transcript and speaker-segment annotations.
+      variant -> { OpenAI::Audio::TranscriptionDiarized }
+
       # Represents a verbose json transcription response returned by model, based on the provided input.
       variant -> { OpenAI::Audio::TranscriptionVerbose }
 
       # @!method self.variants
-      #   @return [Array(OpenAI::Models::Audio::Transcription, OpenAI::Models::Audio::TranscriptionVerbose)]
+      #   @return [Array(OpenAI::Models::Audio::Transcription, OpenAI::Models::Audio::TranscriptionDiarized, OpenAI::Models::Audio::TranscriptionVerbose)]
     end
   end
 end
```
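Since the response union now has three variants, code that inspects the concrete class of the returned object needs a `TranscriptionDiarized` branch; a minimal sketch, continuing with the `transcription` from the request above:

```ruby
case transcription
when OpenAI::Models::Audio::TranscriptionDiarized
  transcription.segments.each { |seg| puts "#{seg.speaker}: #{seg.text}" }
when OpenAI::Models::Audio::TranscriptionVerbose
  puts transcription.text # verbose payloads also carry word/segment timing
else # OpenAI::Models::Audio::Transcription
  puts transcription.text
end
```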
data/lib/openai/models/audio/transcription_diarized.rb
ADDED

```diff
@@ -0,0 +1,160 @@
+# frozen_string_literal: true
+
+module OpenAI
+  module Models
+    module Audio
+      class TranscriptionDiarized < OpenAI::Internal::Type::BaseModel
+        # @!attribute duration
+        #   Duration of the input audio in seconds.
+        #
+        #   @return [Float]
+        required :duration, Float
+
+        # @!attribute segments
+        #   Segments of the transcript annotated with timestamps and speaker labels.
+        #
+        #   @return [Array<OpenAI::Models::Audio::TranscriptionDiarizedSegment>]
+        required :segments, -> { OpenAI::Internal::Type::ArrayOf[OpenAI::Audio::TranscriptionDiarizedSegment] }
+
+        # @!attribute task
+        #   The type of task that was run. Always `transcribe`.
+        #
+        #   @return [Symbol, :transcribe]
+        required :task, const: :transcribe
+
+        # @!attribute text
+        #   The concatenated transcript text for the entire audio input.
+        #
+        #   @return [String]
+        required :text, String
+
+        # @!attribute usage
+        #   Token or duration usage statistics for the request.
+        #
+        #   @return [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens, OpenAI::Models::Audio::TranscriptionDiarized::Usage::Duration, nil]
+        optional :usage, union: -> { OpenAI::Audio::TranscriptionDiarized::Usage }
+
+        # @!method initialize(duration:, segments:, text:, usage: nil, task: :transcribe)
+        #   Represents a diarized transcription response returned by the model, including
+        #   the combined transcript and speaker-segment annotations.
+        #
+        #   @param duration [Float] Duration of the input audio in seconds.
+        #
+        #   @param segments [Array<OpenAI::Models::Audio::TranscriptionDiarizedSegment>] Segments of the transcript annotated with timestamps and speaker labels.
+        #
+        #   @param text [String] The concatenated transcript text for the entire audio input.
+        #
+        #   @param usage [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens, OpenAI::Models::Audio::TranscriptionDiarized::Usage::Duration] Token or duration usage statistics for the request.
+        #
+        #   @param task [Symbol, :transcribe] The type of task that was run. Always `transcribe`.
+
+        # Token or duration usage statistics for the request.
+        #
+        # @see OpenAI::Models::Audio::TranscriptionDiarized#usage
+        module Usage
+          extend OpenAI::Internal::Type::Union
+
+          discriminator :type
+
+          # Usage statistics for models billed by token usage.
+          variant :tokens, -> { OpenAI::Audio::TranscriptionDiarized::Usage::Tokens }
+
+          # Usage statistics for models billed by audio input duration.
+          variant :duration, -> { OpenAI::Audio::TranscriptionDiarized::Usage::Duration }
+
+          class Tokens < OpenAI::Internal::Type::BaseModel
+            # @!attribute input_tokens
+            #   Number of input tokens billed for this request.
+            #
+            #   @return [Integer]
+            required :input_tokens, Integer
+
+            # @!attribute output_tokens
+            #   Number of output tokens generated.
+            #
+            #   @return [Integer]
+            required :output_tokens, Integer
+
+            # @!attribute total_tokens
+            #   Total number of tokens used (input + output).
+            #
+            #   @return [Integer]
+            required :total_tokens, Integer
+
+            # @!attribute type
+            #   The type of the usage object. Always `tokens` for this variant.
+            #
+            #   @return [Symbol, :tokens]
+            required :type, const: :tokens
+
+            # @!attribute input_token_details
+            #   Details about the input tokens billed for this request.
+            #
+            #   @return [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails, nil]
+            optional :input_token_details,
+                     -> { OpenAI::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails }
+
+            # @!method initialize(input_tokens:, output_tokens:, total_tokens:, input_token_details: nil, type: :tokens)
+            #   Usage statistics for models billed by token usage.
+            #
+            #   @param input_tokens [Integer] Number of input tokens billed for this request.
+            #
+            #   @param output_tokens [Integer] Number of output tokens generated.
+            #
+            #   @param total_tokens [Integer] Total number of tokens used (input + output).
+            #
+            #   @param input_token_details [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails] Details about the input tokens billed for this request.
+            #
+            #   @param type [Symbol, :tokens] The type of the usage object. Always `tokens` for this variant.
+
+            # @see OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens#input_token_details
+            class InputTokenDetails < OpenAI::Internal::Type::BaseModel
+              # @!attribute audio_tokens
+              #   Number of audio tokens billed for this request.
+              #
+              #   @return [Integer, nil]
+              optional :audio_tokens, Integer
+
+              # @!attribute text_tokens
+              #   Number of text tokens billed for this request.
+              #
+              #   @return [Integer, nil]
+              optional :text_tokens, Integer
+
+              # @!method initialize(audio_tokens: nil, text_tokens: nil)
+              #   Details about the input tokens billed for this request.
+              #
+              #   @param audio_tokens [Integer] Number of audio tokens billed for this request.
+              #
+              #   @param text_tokens [Integer] Number of text tokens billed for this request.
+            end
+          end
+
+          class Duration < OpenAI::Internal::Type::BaseModel
+            # @!attribute seconds
+            #   Duration of the input audio in seconds.
+            #
+            #   @return [Float]
+            required :seconds, Float
+
+            # @!attribute type
+            #   The type of the usage object. Always `duration` for this variant.
+            #
+            #   @return [Symbol, :duration]
+            required :type, const: :duration
+
+            # @!method initialize(seconds:, type: :duration)
+            #   Usage statistics for models billed by audio input duration.
+            #
+            #   @param seconds [Float] Duration of the input audio in seconds.
+            #
+            #   @param type [Symbol, :duration] The type of the usage object. Always `duration` for this variant.
+          end
+
+          # @!method self.variants
+          #   @return [Array(OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens, OpenAI::Models::Audio::TranscriptionDiarized::Usage::Duration)]
+        end
+      end
+    end
+  end
+end
```
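Note that `usage` is itself a discriminated union (`type: :tokens` vs. `type: :duration`), so billing introspection should branch on the variant class; a sketch, assuming `transcription` is the `TranscriptionDiarized` from the earlier example:

```ruby
case (usage = transcription.usage)
when OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens
  detail = usage.input_token_details
  puts "tokens: #{usage.input_tokens} in (#{detail&.audio_tokens} audio), #{usage.output_tokens} out"
when OpenAI::Models::Audio::TranscriptionDiarized::Usage::Duration
  puts "billed by duration: #{usage.seconds}s"
when nil
  puts "no usage reported" # `usage` is optional
end
```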
data/lib/openai/models/audio/transcription_diarized_segment.rb
ADDED

```diff
@@ -0,0 +1,65 @@
+# frozen_string_literal: true
+
+module OpenAI
+  module Models
+    module Audio
+      class TranscriptionDiarizedSegment < OpenAI::Internal::Type::BaseModel
+        # @!attribute id
+        #   Unique identifier for the segment.
+        #
+        #   @return [String]
+        required :id, String
+
+        # @!attribute end_
+        #   End timestamp of the segment in seconds.
+        #
+        #   @return [Float]
+        required :end_, Float, api_name: :end
+
+        # @!attribute speaker
+        #   Speaker label for this segment. When known speakers are provided, the label
+        #   matches `known_speaker_names[]`. Otherwise speakers are labeled sequentially
+        #   using capital letters (`A`, `B`, ...).
+        #
+        #   @return [String]
+        required :speaker, String
+
+        # @!attribute start
+        #   Start timestamp of the segment in seconds.
+        #
+        #   @return [Float]
+        required :start, Float
+
+        # @!attribute text
+        #   Transcript text for this segment.
+        #
+        #   @return [String]
+        required :text, String
+
+        # @!attribute type
+        #   The type of the segment. Always `transcript.text.segment`.
+        #
+        #   @return [Symbol, :"transcript.text.segment"]
+        required :type, const: :"transcript.text.segment"
+
+        # @!method initialize(id:, end_:, speaker:, start:, text:, type: :"transcript.text.segment")
+        #   Some parameter documentations has been truncated, see
+        #   {OpenAI::Models::Audio::TranscriptionDiarizedSegment} for more details.
+        #
+        #   A segment of diarized transcript text with speaker metadata.
+        #
+        #   @param id [String] Unique identifier for the segment.
+        #
+        #   @param end_ [Float] End timestamp of the segment in seconds.
+        #
+        #   @param speaker [String] Speaker label for this segment. When known speakers are provided, the label matc
+        #
+        #   @param start [Float] Start timestamp of the segment in seconds.
+        #
+        #   @param text [String] Transcript text for this segment.
+        #
+        #   @param type [Symbol, :"transcript.text.segment"] The type of the segment. Always `transcript.text.segment`.
+      end
+    end
+  end
+end
```
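One Ruby-specific wrinkle: the wire field is named `end`, which is a Ruby keyword, so the model exposes it as `end_` via `api_name: :end` (while `start` needs no renaming, and the serialized key remains `end`). A small formatting sketch:

```ruby
def format_segment(seg)
  # seg.end_ reads the JSON field `end`
  format("[%7.2fs - %7.2fs] %-10s %s", seg.start, seg.end_, seg.speaker, seg.text)
end

transcription.segments.each { |seg| puts format_segment(seg) }
```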
data/lib/openai/models/audio/transcription_stream_event.rb
CHANGED

```diff
@@ -3,15 +3,18 @@
 module OpenAI
   module Models
     module Audio
-      # Emitted when
-      #
+      # Emitted when a diarized transcription returns a completed segment with speaker
+      # information. Only emitted when you
       # [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
-      # with
+      # with `stream` set to `true` and `response_format` set to `diarized_json`.
       module TranscriptionStreamEvent
         extend OpenAI::Internal::Type::Union
 
         discriminator :type
 
+        # Emitted when a diarized transcription returns a completed segment with speaker information. Only emitted when you [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription) with `stream` set to `true` and `response_format` set to `diarized_json`.
+        variant :"transcript.text.segment", -> { OpenAI::Audio::TranscriptionTextSegmentEvent }
+
         # Emitted when there is an additional text delta. This is also the first event emitted when the transcription starts. Only emitted when you [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription) with the `Stream` parameter set to `true`.
         variant :"transcript.text.delta", -> { OpenAI::Audio::TranscriptionTextDeltaEvent }
 
@@ -19,7 +22,7 @@ module OpenAI
         variant :"transcript.text.done", -> { OpenAI::Audio::TranscriptionTextDoneEvent }
 
         # @!method self.variants
-        #   @return [Array(OpenAI::Models::Audio::TranscriptionTextDeltaEvent, OpenAI::Models::Audio::TranscriptionTextDoneEvent)]
+        #   @return [Array(OpenAI::Models::Audio::TranscriptionTextSegmentEvent, OpenAI::Models::Audio::TranscriptionTextDeltaEvent, OpenAI::Models::Audio::TranscriptionTextDoneEvent)]
       end
     end
   end
```
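A streaming consumer can then dispatch on the discriminator. The sketch below assumes the resource's streaming variant of the create call (named `create_streaming` in this SDK; verify against `data/lib/openai/resources/audio/transcriptions.rb`) with the same diarized options as before:

```ruby
stream = client.audio.transcriptions.create_streaming(
  file: Pathname("meeting.wav"),
  model: :"gpt-4o-transcribe-diarize",
  response_format: :diarized_json,
  chunking_strategy: :auto
)

stream.each do |event|
  case event.type
  when :"transcript.text.delta"
    print event.delta                         # incremental text, possibly tagged with segment_id
  when :"transcript.text.segment"
    puts "\n[#{event.speaker}] #{event.text}" # finalized, speaker-attributed segment
  when :"transcript.text.done"
    puts "\n(final transcript: #{event.text.length} chars)"
  end
end
```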
data/lib/openai/models/audio/transcription_text_delta_event.rb
CHANGED

```diff
@@ -25,7 +25,14 @@ module OpenAI
       optional :logprobs,
                -> { OpenAI::Internal::Type::ArrayOf[OpenAI::Audio::TranscriptionTextDeltaEvent::Logprob] }
 
-      # @!
+      # @!attribute segment_id
+      #   Identifier of the diarized segment that this delta belongs to. Only present when
+      #   using `gpt-4o-transcribe-diarize`.
+      #
+      #   @return [String, nil]
+      optional :segment_id, String
+
+      # @!method initialize(delta:, logprobs: nil, segment_id: nil, type: :"transcript.text.delta")
       #   Some parameter documentations has been truncated, see
       #   {OpenAI::Models::Audio::TranscriptionTextDeltaEvent} for more details.
       #
@@ -38,6 +45,8 @@ module OpenAI
       #
       #   @param logprobs [Array<OpenAI::Models::Audio::TranscriptionTextDeltaEvent::Logprob>] The log probabilities of the delta. Only included if you [create a transcription
       #
+      #   @param segment_id [String] Identifier of the diarized segment that this delta belongs to. Only present when
+      #
       #   @param type [Symbol, :"transcript.text.delta"] The type of the event. Always `transcript.text.delta`.
 
       class Logprob < OpenAI::Internal::Type::BaseModel
```
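Because deltas from the diarize model carry `segment_id`, a consumer can assemble per-segment text before the closing segment event arrives; a sketch reusing `stream` from the previous example:

```ruby
partials = Hash.new { |h, k| h[k] = +"" } # segment_id => accumulated delta text

stream.each do |event|
  case event.type
  when :"transcript.text.delta"
    partials[event.segment_id] << event.delta if event.segment_id
  when :"transcript.text.segment"
    partials.delete(event.id) # the segment event's `text` is now authoritative
    puts "[#{event.speaker}] #{event.text}"
  end
end
```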
data/lib/openai/models/audio/transcription_text_segment_event.rb
ADDED

```diff
@@ -0,0 +1,63 @@
+# frozen_string_literal: true
+
+module OpenAI
+  module Models
+    module Audio
+      class TranscriptionTextSegmentEvent < OpenAI::Internal::Type::BaseModel
+        # @!attribute id
+        #   Unique identifier for the segment.
+        #
+        #   @return [String]
+        required :id, String
+
+        # @!attribute end_
+        #   End timestamp of the segment in seconds.
+        #
+        #   @return [Float]
+        required :end_, Float, api_name: :end
+
+        # @!attribute speaker
+        #   Speaker label for this segment.
+        #
+        #   @return [String]
+        required :speaker, String
+
+        # @!attribute start
+        #   Start timestamp of the segment in seconds.
+        #
+        #   @return [Float]
+        required :start, Float
+
+        # @!attribute text
+        #   Transcript text for this segment.
+        #
+        #   @return [String]
+        required :text, String
+
+        # @!attribute type
+        #   The type of the event. Always `transcript.text.segment`.
+        #
+        #   @return [Symbol, :"transcript.text.segment"]
+        required :type, const: :"transcript.text.segment"
+
+        # @!method initialize(id:, end_:, speaker:, start:, text:, type: :"transcript.text.segment")
+        #   Emitted when a diarized transcription returns a completed segment with speaker
+        #   information. Only emitted when you
+        #   [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
+        #   with `stream` set to `true` and `response_format` set to `diarized_json`.
+        #
+        #   @param id [String] Unique identifier for the segment.
+        #
+        #   @param end_ [Float] End timestamp of the segment in seconds.
+        #
+        #   @param speaker [String] Speaker label for this segment.
+        #
+        #   @param start [Float] Start timestamp of the segment in seconds.
+        #
+        #   @param text [String] Transcript text for this segment.
+        #
+        #   @param type [Symbol, :"transcript.text.segment"] The type of the event. Always `transcript.text.segment`.
+      end
+    end
+  end
+end
```
data/lib/openai/models/audio_response_format.rb
CHANGED

```diff
@@ -3,8 +3,10 @@
 module OpenAI
   module Models
     # The format of the output, in one of these options: `json`, `text`, `srt`,
-    # `verbose_json`, or `
-    # the only supported format is `json`.
+    # `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+    # `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+    # `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+    # `diarized_json`, with `diarized_json` required to receive speaker annotations.
     module AudioResponseFormat
       extend OpenAI::Internal::Type::Enum
 
@@ -13,6 +15,7 @@ module OpenAI
       SRT = :srt
       VERBOSE_JSON = :verbose_json
       VTT = :vtt
+      DIARIZED_JSON = :diarized_json
 
       # @!method self.values
       #   @return [Array<Symbol>]
```
data/lib/openai/models/realtime/audio_transcription.rb
CHANGED

```diff
@@ -14,7 +14,8 @@ module OpenAI
 
       # @!attribute model
       #   The model to use for transcription. Current options are `whisper-1`,
-      #   `gpt-4o-transcribe
+      #   `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`.
+      #   Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.
       #
       #   @return [Symbol, OpenAI::Models::Realtime::AudioTranscription::Model, nil]
       optional :model, enum: -> { OpenAI::Realtime::AudioTranscription::Model }
@@ -23,8 +24,8 @@ module OpenAI
       #   An optional text to guide the model's style or continue a previous audio
       #   segment. For `whisper-1`, the
       #   [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
-      #   For `gpt-4o-transcribe` models
-      #   "expect words related to technology".
+      #   For `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the
+      #   prompt is a free text string, for example "expect words related to technology".
       #
       #   @return [String, nil]
       optional :prompt, String
@@ -35,21 +36,22 @@ module OpenAI
       #
       #   @param language [String] The language of the input audio. Supplying the input language in
       #
-      #   @param model [Symbol, OpenAI::Models::Realtime::AudioTranscription::Model] The model to use for transcription. Current options are `whisper-1`, `gpt-4o-
+      #   @param model [Symbol, OpenAI::Models::Realtime::AudioTranscription::Model] The model to use for transcription. Current options are `whisper-1`, `gpt-4o-min
       #
       #   @param prompt [String] An optional text to guide the model's style or continue a previous audio
 
       # The model to use for transcription. Current options are `whisper-1`,
-      # `gpt-4o-transcribe
+      # `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`.
+      # Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.
       #
       # @see OpenAI::Models::Realtime::AudioTranscription#model
       module Model
         extend OpenAI::Internal::Type::Enum
 
         WHISPER_1 = :"whisper-1"
-        GPT_4O_TRANSCRIBE_LATEST = :"gpt-4o-transcribe-latest"
         GPT_4O_MINI_TRANSCRIBE = :"gpt-4o-mini-transcribe"
         GPT_4O_TRANSCRIBE = :"gpt-4o-transcribe"
+        GPT_4O_TRANSCRIBE_DIARIZE = :"gpt-4o-transcribe-diarize"
 
         # @!method self.values
         #   @return [Array<Symbol>]
```
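On the Realtime side the enum swap means `gpt-4o-transcribe-latest` is gone and `gpt-4o-transcribe-diarize` is available. A sketch of a transcription config using the new constant (construction via `.new` follows this SDK's model classes; the choice to omit `prompt` reflects the doc note above that free-text prompts do not apply to the diarize model):

```ruby
audio_transcription = OpenAI::Models::Realtime::AudioTranscription.new(
  model: OpenAI::Models::Realtime::AudioTranscription::Model::GPT_4O_TRANSCRIBE_DIARIZE,
  language: "en"
  # `prompt` omitted: not supported by `gpt-4o-transcribe-diarize`
)
```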