openai 0.31.0 → 0.32.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/README.md +1 -1
- data/lib/openai/internal/util.rb +5 -5
- data/lib/openai/models/audio/transcription_create_params.rb +42 -11
- data/lib/openai/models/audio/transcription_create_response.rb +4 -1
- data/lib/openai/models/audio/transcription_diarized.rb +160 -0
- data/lib/openai/models/audio/transcription_diarized_segment.rb +65 -0
- data/lib/openai/models/audio/transcription_stream_event.rb +7 -4
- data/lib/openai/models/audio/transcription_text_delta_event.rb +10 -1
- data/lib/openai/models/audio/transcription_text_segment_event.rb +63 -0
- data/lib/openai/models/audio_model.rb +1 -0
- data/lib/openai/models/audio_response_format.rb +5 -2
- data/lib/openai/models/realtime/audio_transcription.rb +8 -6
- data/lib/openai/models/vector_store_create_params.rb +10 -1
- data/lib/openai/resources/audio/transcriptions.rb +12 -4
- data/lib/openai/resources/vector_stores.rb +3 -1
- data/lib/openai/version.rb +1 -1
- data/lib/openai.rb +3 -0
- data/rbi/openai/models/audio/transcription_create_params.rbi +66 -16
- data/rbi/openai/models/audio/transcription_create_response.rbi +1 -0
- data/rbi/openai/models/audio/transcription_diarized.rbi +281 -0
- data/rbi/openai/models/audio/transcription_diarized_segment.rbi +87 -0
- data/rbi/openai/models/audio/transcription_stream_event.rbi +4 -3
- data/rbi/openai/models/audio/transcription_text_delta_event.rbi +14 -1
- data/rbi/openai/models/audio/transcription_text_segment_event.rbi +86 -0
- data/rbi/openai/models/audio_model.rbi +2 -0
- data/rbi/openai/models/audio_response_format.rbi +6 -2
- data/rbi/openai/models/realtime/audio_transcription.rbi +15 -12
- data/rbi/openai/models/vector_store_create_params.rbi +13 -0
- data/rbi/openai/resources/audio/transcriptions.rbi +52 -14
- data/rbi/openai/resources/vector_stores.rbi +4 -0
- data/sig/openai/models/audio/transcription_create_params.rbs +14 -0
- data/sig/openai/models/audio/transcription_create_response.rbs +3 -1
- data/sig/openai/models/audio/transcription_diarized.rbs +129 -0
- data/sig/openai/models/audio/transcription_diarized_segment.rbs +47 -0
- data/sig/openai/models/audio/transcription_stream_event.rbs +2 -1
- data/sig/openai/models/audio/transcription_text_delta_event.rbs +9 -2
- data/sig/openai/models/audio/transcription_text_segment_event.rbs +47 -0
- data/sig/openai/models/audio_model.rbs +5 -1
- data/sig/openai/models/audio_response_format.rbs +3 -1
- data/sig/openai/models/realtime/audio_transcription.rbs +2 -2
- data/sig/openai/models/vector_store_create_params.rbs +7 -0
- data/sig/openai/resources/audio/transcriptions.rbs +4 -0
- data/sig/openai/resources/vector_stores.rbs +1 -0
- metadata +11 -2
data/rbi/openai/models/audio/transcription_diarized_segment.rbi
@@ -0,0 +1,87 @@
+# typed: strong
+
+module OpenAI
+  module Models
+    module Audio
+      class TranscriptionDiarizedSegment < OpenAI::Internal::Type::BaseModel
+        OrHash =
+          T.type_alias do
+            T.any(
+              OpenAI::Audio::TranscriptionDiarizedSegment,
+              OpenAI::Internal::AnyHash
+            )
+          end
+
+        # Unique identifier for the segment.
+        sig { returns(String) }
+        attr_accessor :id
+
+        # End timestamp of the segment in seconds.
+        sig { returns(Float) }
+        attr_accessor :end_
+
+        # Speaker label for this segment. When known speakers are provided, the label
+        # matches `known_speaker_names[]`. Otherwise speakers are labeled sequentially
+        # using capital letters (`A`, `B`, ...).
+        sig { returns(String) }
+        attr_accessor :speaker
+
+        # Start timestamp of the segment in seconds.
+        sig { returns(Float) }
+        attr_accessor :start
+
+        # Transcript text for this segment.
+        sig { returns(String) }
+        attr_accessor :text
+
+        # The type of the segment. Always `transcript.text.segment`.
+        sig { returns(Symbol) }
+        attr_accessor :type
+
+        # A segment of diarized transcript text with speaker metadata.
+        sig do
+          params(
+            id: String,
+            end_: Float,
+            speaker: String,
+            start: Float,
+            text: String,
+            type: Symbol
+          ).returns(T.attached_class)
+        end
+        def self.new(
+          # Unique identifier for the segment.
+          id:,
+          # End timestamp of the segment in seconds.
+          end_:,
+          # Speaker label for this segment. When known speakers are provided, the label
+          # matches `known_speaker_names[]`. Otherwise speakers are labeled sequentially
+          # using capital letters (`A`, `B`, ...).
+          speaker:,
+          # Start timestamp of the segment in seconds.
+          start:,
+          # Transcript text for this segment.
+          text:,
+          # The type of the segment. Always `transcript.text.segment`.
+          type: :"transcript.text.segment"
+        )
+        end
+
+        sig do
+          override.returns(
+            {
+              id: String,
+              end_: Float,
+              speaker: String,
+              start: Float,
+              text: String,
+              type: Symbol
+            }
+          )
+        end
+        def to_hash
+        end
+      end
+    end
+  end
+end
data/rbi/openai/models/audio/transcription_stream_event.rbi
@@ -3,16 +3,17 @@
 module OpenAI
   module Models
     module Audio
-      # Emitted when
-      #
+      # Emitted when a diarized transcription returns a completed segment with speaker
+      # information. Only emitted when you
       # [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
-      # with
+      # with `stream` set to `true` and `response_format` set to `diarized_json`.
      module TranscriptionStreamEvent
        extend OpenAI::Internal::Type::Union
 
        Variants =
          T.type_alias do
            T.any(
+              OpenAI::Audio::TranscriptionTextSegmentEvent,
              OpenAI::Audio::TranscriptionTextDeltaEvent,
              OpenAI::Audio::TranscriptionTextDoneEvent
            )
data/rbi/openai/models/audio/transcription_text_delta_event.rbi
@@ -42,6 +42,14 @@ module OpenAI
         end
         attr_writer :logprobs
 
+        # Identifier of the diarized segment that this delta belongs to. Only present when
+        # using `gpt-4o-transcribe-diarize`.
+        sig { returns(T.nilable(String)) }
+        attr_reader :segment_id
+
+        sig { params(segment_id: String).void }
+        attr_writer :segment_id
+
         # Emitted when there is an additional text delta. This is also the first event
         # emitted when the transcription starts. Only emitted when you
         # [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
@@ -53,6 +61,7 @@ module OpenAI
             T::Array[
               OpenAI::Audio::TranscriptionTextDeltaEvent::Logprob::OrHash
             ],
+            segment_id: String,
             type: Symbol
           ).returns(T.attached_class)
         end
@@ -63,6 +72,9 @@ module OpenAI
           # [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
           # with the `include[]` parameter set to `logprobs`.
           logprobs: nil,
+          # Identifier of the diarized segment that this delta belongs to. Only present when
+          # using `gpt-4o-transcribe-diarize`.
+          segment_id: nil,
           # The type of the event. Always `transcript.text.delta`.
           type: :"transcript.text.delta"
         )
@@ -74,7 +86,8 @@ module OpenAI
              delta: String,
              type: Symbol,
              logprobs:
-                T::Array[OpenAI::Audio::TranscriptionTextDeltaEvent::Logprob]
+                T::Array[OpenAI::Audio::TranscriptionTextDeltaEvent::Logprob],
+              segment_id: String
            }
          )
        end
data/rbi/openai/models/audio/transcription_text_segment_event.rbi
@@ -0,0 +1,86 @@
+# typed: strong
+
+module OpenAI
+  module Models
+    module Audio
+      class TranscriptionTextSegmentEvent < OpenAI::Internal::Type::BaseModel
+        OrHash =
+          T.type_alias do
+            T.any(
+              OpenAI::Audio::TranscriptionTextSegmentEvent,
+              OpenAI::Internal::AnyHash
+            )
+          end
+
+        # Unique identifier for the segment.
+        sig { returns(String) }
+        attr_accessor :id
+
+        # End timestamp of the segment in seconds.
+        sig { returns(Float) }
+        attr_accessor :end_
+
+        # Speaker label for this segment.
+        sig { returns(String) }
+        attr_accessor :speaker
+
+        # Start timestamp of the segment in seconds.
+        sig { returns(Float) }
+        attr_accessor :start
+
+        # Transcript text for this segment.
+        sig { returns(String) }
+        attr_accessor :text
+
+        # The type of the event. Always `transcript.text.segment`.
+        sig { returns(Symbol) }
+        attr_accessor :type
+
+        # Emitted when a diarized transcription returns a completed segment with speaker
+        # information. Only emitted when you
+        # [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
+        # with `stream` set to `true` and `response_format` set to `diarized_json`.
+        sig do
+          params(
+            id: String,
+            end_: Float,
+            speaker: String,
+            start: Float,
+            text: String,
+            type: Symbol
+          ).returns(T.attached_class)
+        end
+        def self.new(
+          # Unique identifier for the segment.
+          id:,
+          # End timestamp of the segment in seconds.
+          end_:,
+          # Speaker label for this segment.
+          speaker:,
+          # Start timestamp of the segment in seconds.
+          start:,
+          # Transcript text for this segment.
+          text:,
+          # The type of the event. Always `transcript.text.segment`.
+          type: :"transcript.text.segment"
+        )
+        end
+
+        sig do
+          override.returns(
+            {
+              id: String,
+              end_: Float,
+              speaker: String,
+              start: Float,
+              text: String,
+              type: Symbol
+            }
+          )
+        end
+        def to_hash
+        end
+      end
+    end
+  end
+end
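The new segment event slots into the existing streaming union alongside the delta and done events. A rough consumption sketch (not part of the diff) follows; it assumes `OpenAI::Client` is configured via `ENV["OPENAI_API_KEY"]`, that `#create_streaming` (referenced in the resource signatures later in this diff) yields these event objects as an enumerable stream, and that the input file name is illustrative.

require "openai"
require "pathname"

client = OpenAI::Client.new

# Stream a diarized transcription and react to the three event variants.
stream = client.audio.transcriptions.create_streaming(
  file: Pathname("meeting.mp3"),            # hypothetical input file
  model: :"gpt-4o-transcribe-diarize",
  response_format: :diarized_json
)

stream.each do |event|
  case event
  when OpenAI::Audio::TranscriptionTextSegmentEvent
    # Completed segment with speaker metadata (new in 0.32.0).
    puts format("[%.1f-%.1fs] %s: %s", event.start, event.end_, event.speaker, event.text)
  when OpenAI::Audio::TranscriptionTextDeltaEvent
    # Incremental text; the new `segment_id` ties the delta to a diarized segment.
    print event.delta
  when OpenAI::Audio::TranscriptionTextDoneEvent
    puts "\n(done)"
  end
end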
data/rbi/openai/models/audio_model.rbi
@@ -13,6 +13,8 @@ module OpenAI
        T.let(:"gpt-4o-transcribe", OpenAI::AudioModel::TaggedSymbol)
      GPT_4O_MINI_TRANSCRIBE =
        T.let(:"gpt-4o-mini-transcribe", OpenAI::AudioModel::TaggedSymbol)
+      GPT_4O_TRANSCRIBE_DIARIZE =
+        T.let(:"gpt-4o-transcribe-diarize", OpenAI::AudioModel::TaggedSymbol)
 
      sig { override.returns(T::Array[OpenAI::AudioModel::TaggedSymbol]) }
      def self.values
data/rbi/openai/models/audio_response_format.rbi
@@ -3,8 +3,10 @@
 module OpenAI
   module Models
     # The format of the output, in one of these options: `json`, `text`, `srt`,
-    # `verbose_json`, or `
-    # the only supported format is `json`.
+    # `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+    # `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+    # `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+    # `diarized_json`, with `diarized_json` required to receive speaker annotations.
    module AudioResponseFormat
      extend OpenAI::Internal::Type::Enum
 
@@ -17,6 +19,8 @@ module OpenAI
      VERBOSE_JSON =
        T.let(:verbose_json, OpenAI::AudioResponseFormat::TaggedSymbol)
      VTT = T.let(:vtt, OpenAI::AudioResponseFormat::TaggedSymbol)
+      DIARIZED_JSON =
+        T.let(:diarized_json, OpenAI::AudioResponseFormat::TaggedSymbol)
 
      sig do
        override.returns(T::Array[OpenAI::AudioResponseFormat::TaggedSymbol])
data/rbi/openai/models/realtime/audio_transcription.rbi
@@ -22,7 +22,8 @@ module OpenAI
         attr_writer :language
 
         # The model to use for transcription. Current options are `whisper-1`,
-        # `gpt-4o-transcribe
+        # `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`.
+        # Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.
         sig do
           returns(
             T.nilable(OpenAI::Realtime::AudioTranscription::Model::OrSymbol)
@@ -40,8 +41,8 @@ module OpenAI
         # An optional text to guide the model's style or continue a previous audio
         # segment. For `whisper-1`, the
         # [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
-        # For `gpt-4o-transcribe` models
-        # "expect words related to technology".
+        # For `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the
+        # prompt is a free text string, for example "expect words related to technology".
         sig { returns(T.nilable(String)) }
         attr_reader :prompt
 
@@ -61,13 +62,14 @@ module OpenAI
           # format will improve accuracy and latency.
           language: nil,
           # The model to use for transcription. Current options are `whisper-1`,
-          # `gpt-4o-transcribe
+          # `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`.
+          # Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.
           model: nil,
           # An optional text to guide the model's style or continue a previous audio
           # segment. For `whisper-1`, the
           # [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
-          # For `gpt-4o-transcribe` models
-          # "expect words related to technology".
+          # For `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the
+          # prompt is a free text string, for example "expect words related to technology".
           prompt: nil
         )
         end
@@ -85,7 +87,8 @@ module OpenAI
         end
 
         # The model to use for transcription. Current options are `whisper-1`,
-        # `gpt-4o-transcribe
+        # `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`.
+        # Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.
         module Model
           extend OpenAI::Internal::Type::Enum
 
@@ -100,11 +103,6 @@ module OpenAI
              :"whisper-1",
              OpenAI::Realtime::AudioTranscription::Model::TaggedSymbol
            )
-          GPT_4O_TRANSCRIBE_LATEST =
-            T.let(
-              :"gpt-4o-transcribe-latest",
-              OpenAI::Realtime::AudioTranscription::Model::TaggedSymbol
-            )
          GPT_4O_MINI_TRANSCRIBE =
            T.let(
              :"gpt-4o-mini-transcribe",
@@ -115,6 +113,11 @@ module OpenAI
              :"gpt-4o-transcribe",
              OpenAI::Realtime::AudioTranscription::Model::TaggedSymbol
            )
+          GPT_4O_TRANSCRIBE_DIARIZE =
+            T.let(
+              :"gpt-4o-transcribe-diarize",
+              OpenAI::Realtime::AudioTranscription::Model::TaggedSymbol
+            )
 
          sig do
            override.returns(
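For realtime sessions, the same model becomes selectable on the input-audio transcription config. A minimal sketch, assuming only the constructor shown in the signatures above (how the object is attached to a realtime session is unchanged and not shown here):

transcription = OpenAI::Realtime::AudioTranscription.new(
  model: :"gpt-4o-transcribe-diarize",
  language: "en"
)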
data/rbi/openai/models/vector_store_create_params.rbi
@@ -36,6 +36,14 @@ module OpenAI
      end
      attr_writer :chunking_strategy
 
+      # A description for the vector store. Can be used to describe the vector store's
+      # purpose.
+      sig { returns(T.nilable(String)) }
+      attr_reader :description
+
+      sig { params(description: String).void }
+      attr_writer :description
+
      # The expiration policy for a vector store.
      sig { returns(T.nilable(OpenAI::VectorStoreCreateParams::ExpiresAfter)) }
      attr_reader :expires_after
@@ -79,6 +87,7 @@ module OpenAI
            OpenAI::AutoFileChunkingStrategyParam::OrHash,
            OpenAI::StaticFileChunkingStrategyObjectParam::OrHash
          ),
+          description: String,
          expires_after: OpenAI::VectorStoreCreateParams::ExpiresAfter::OrHash,
          file_ids: T::Array[String],
          metadata: T.nilable(T::Hash[Symbol, String]),
@@ -90,6 +99,9 @@ module OpenAI
        # The chunking strategy used to chunk the file(s). If not set, will use the `auto`
        # strategy. Only applicable if `file_ids` is non-empty.
        chunking_strategy: nil,
+        # A description for the vector store. Can be used to describe the vector store's
+        # purpose.
+        description: nil,
        # The expiration policy for a vector store.
        expires_after: nil,
        # A list of [File](https://platform.openai.com/docs/api-reference/files) IDs that
@@ -117,6 +129,7 @@ module OpenAI
            OpenAI::AutoFileChunkingStrategyParam,
            OpenAI::StaticFileChunkingStrategyObjectParam
          ),
+          description: String,
          expires_after: OpenAI::VectorStoreCreateParams::ExpiresAfter,
          file_ids: T::Array[String],
          metadata: T.nilable(T::Hash[Symbol, String]),
data/rbi/openai/resources/audio/transcriptions.rbi
@@ -20,6 +20,8 @@ module OpenAI
              )
            ),
            include: T::Array[OpenAI::Audio::TranscriptionInclude::OrSymbol],
+            known_speaker_names: T::Array[String],
+            known_speaker_references: T::Array[String],
            language: String,
            prompt: String,
            response_format: OpenAI::AudioResponseFormat::OrSymbol,
@@ -39,20 +41,33 @@ module OpenAI
          # flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
          file:,
          # ID of the model to use. The options are `gpt-4o-transcribe`,
-          # `gpt-4o-mini-transcribe`,
-          # Whisper V2 model)
+          # `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
+          # Whisper V2 model), and `gpt-4o-transcribe-diarize`.
          model:,
          # Controls how the audio is cut into chunks. When set to `"auto"`, the server
          # first normalizes loudness and then uses voice activity detection (VAD) to choose
          # boundaries. `server_vad` object can be provided to tweak VAD detection
          # parameters manually. If unset, the audio is transcribed as a single block.
+          # Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
+          # seconds.
          chunking_strategy: nil,
          # Additional information to include in the transcription response. `logprobs` will
          # return the log probabilities of the tokens in the response to understand the
          # model's confidence in the transcription. `logprobs` only works with
          # response_format set to `json` and only with the models `gpt-4o-transcribe` and
-          # `gpt-4o-mini-transcribe`.
+          # `gpt-4o-mini-transcribe`. This field is not supported when using
+          # `gpt-4o-transcribe-diarize`.
          include: nil,
+          # Optional list of speaker names that correspond to the audio samples provided in
+          # `known_speaker_references[]`. Each entry should be a short identifier (for
+          # example `customer` or `agent`). Up to 4 speakers are supported.
+          known_speaker_names: nil,
+          # Optional list of audio samples (as
+          # [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
+          # that contain known speaker references matching `known_speaker_names[]`. Each
+          # sample must be between 2 and 10 seconds, and can use any of the same input audio
+          # formats supported by `file`.
+          known_speaker_references: nil,
          # The language of the input audio. Supplying the input language in
          # [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
          # format will improve accuracy and latency.
@@ -60,11 +75,14 @@ module OpenAI
          # An optional text to guide the model's style or continue a previous audio
          # segment. The
          # [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
-          # should match the audio language.
+          # should match the audio language. This field is not supported when using
+          # `gpt-4o-transcribe-diarize`.
          prompt: nil,
          # The format of the output, in one of these options: `json`, `text`, `srt`,
-          # `verbose_json`, or `
-          # the only supported format is `json`.
+          # `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+          # `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+          # `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+          # `diarized_json`, with `diarized_json` required to receive speaker annotations.
          response_format: nil,
          # The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
          # output more random, while lower values like 0.2 will make it more focused and
@@ -76,7 +94,8 @@ module OpenAI
          # `response_format` must be set `verbose_json` to use timestamp granularities.
          # Either or both of these options are supported: `word`, or `segment`. Note: There
          # is no additional latency for segment timestamps, but generating word timestamps
-          # incurs additional latency.
+          # incurs additional latency. This option is not available for
+          # `gpt-4o-transcribe-diarize`.
          timestamp_granularities: nil,
          # There is no need to provide `stream:`. Instead, use `#create_streaming` or
          # `#create` for streaming and non-streaming use cases, respectively.
@@ -101,6 +120,8 @@ module OpenAI
              )
            ),
            include: T::Array[OpenAI::Audio::TranscriptionInclude::OrSymbol],
+            known_speaker_names: T::Array[String],
+            known_speaker_references: T::Array[String],
            language: String,
            prompt: String,
            response_format: OpenAI::AudioResponseFormat::OrSymbol,
@@ -122,20 +143,33 @@ module OpenAI
          # flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
          file:,
          # ID of the model to use. The options are `gpt-4o-transcribe`,
-          # `gpt-4o-mini-transcribe`,
-          # Whisper V2 model)
+          # `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
+          # Whisper V2 model), and `gpt-4o-transcribe-diarize`.
          model:,
          # Controls how the audio is cut into chunks. When set to `"auto"`, the server
          # first normalizes loudness and then uses voice activity detection (VAD) to choose
          # boundaries. `server_vad` object can be provided to tweak VAD detection
          # parameters manually. If unset, the audio is transcribed as a single block.
+          # Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
+          # seconds.
          chunking_strategy: nil,
          # Additional information to include in the transcription response. `logprobs` will
          # return the log probabilities of the tokens in the response to understand the
          # model's confidence in the transcription. `logprobs` only works with
          # response_format set to `json` and only with the models `gpt-4o-transcribe` and
-          # `gpt-4o-mini-transcribe`.
+          # `gpt-4o-mini-transcribe`. This field is not supported when using
+          # `gpt-4o-transcribe-diarize`.
          include: nil,
+          # Optional list of speaker names that correspond to the audio samples provided in
+          # `known_speaker_references[]`. Each entry should be a short identifier (for
+          # example `customer` or `agent`). Up to 4 speakers are supported.
+          known_speaker_names: nil,
+          # Optional list of audio samples (as
+          # [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
+          # that contain known speaker references matching `known_speaker_names[]`. Each
+          # sample must be between 2 and 10 seconds, and can use any of the same input audio
+          # formats supported by `file`.
+          known_speaker_references: nil,
          # The language of the input audio. Supplying the input language in
          # [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
          # format will improve accuracy and latency.
@@ -143,11 +177,14 @@ module OpenAI
          # An optional text to guide the model's style or continue a previous audio
          # segment. The
          # [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
-          # should match the audio language.
+          # should match the audio language. This field is not supported when using
+          # `gpt-4o-transcribe-diarize`.
          prompt: nil,
          # The format of the output, in one of these options: `json`, `text`, `srt`,
-          # `verbose_json`, or `
-          # the only supported format is `json`.
+          # `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+          # `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+          # `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+          # `diarized_json`, with `diarized_json` required to receive speaker annotations.
          response_format: nil,
          # The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
          # output more random, while lower values like 0.2 will make it more focused and
@@ -159,7 +196,8 @@ module OpenAI
          # `response_format` must be set `verbose_json` to use timestamp granularities.
          # Either or both of these options are supported: `word`, or `segment`. Note: There
          # is no additional latency for segment timestamps, but generating word timestamps
-          # incurs additional latency.
+          # incurs additional latency. This option is not available for
+          # `gpt-4o-transcribe-diarize`.
          timestamp_granularities: nil,
          # There is no need to provide `stream:`. Instead, use `#create_streaming` or
          # `#create` for streaming and non-streaming use cases, respectively.
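Putting the new create parameters together: a hedged, non-streaming sketch (not part of the diff). The file names are illustrative, the speaker reference is assumed to be a 2-10 second clip encoded as a data URL per the comment above, and reading `segments` off the response relies on the new `TranscriptionDiarized` model added in this release, whose fields are not shown in these hunks.

require "openai"
require "pathname"
require "base64"

client = OpenAI::Client.new

# Encode a short reference clip of a known speaker as a data URL.
agent_ref = "data:audio/wav;base64," +
  Base64.strict_encode64(File.binread("agent_sample.wav"))

transcription = client.audio.transcriptions.create(
  file: Pathname("support_call.wav"),
  model: :"gpt-4o-transcribe-diarize",
  response_format: :diarized_json,   # required to receive speaker annotations
  chunking_strategy: :auto,          # required for inputs longer than 30 seconds
  known_speaker_names: ["agent"],
  known_speaker_references: [agent_ref]
)

# Assumed shape: the diarized response exposes segments with speaker labels.
transcription.segments.each do |segment|
  puts "#{segment.speaker}: #{segment.text}"
end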
data/rbi/openai/resources/vector_stores.rbi
@@ -17,6 +17,7 @@ module OpenAI
          OpenAI::AutoFileChunkingStrategyParam::OrHash,
          OpenAI::StaticFileChunkingStrategyObjectParam::OrHash
        ),
+        description: String,
        expires_after: OpenAI::VectorStoreCreateParams::ExpiresAfter::OrHash,
        file_ids: T::Array[String],
        metadata: T.nilable(T::Hash[Symbol, String]),
@@ -28,6 +29,9 @@ module OpenAI
        # The chunking strategy used to chunk the file(s). If not set, will use the `auto`
        # strategy. Only applicable if `file_ids` is non-empty.
        chunking_strategy: nil,
+        # A description for the vector store. Can be used to describe the vector store's
+        # purpose.
+        description: nil,
        # The expiration policy for a vector store.
        expires_after: nil,
        # A list of [File](https://platform.openai.com/docs/api-reference/files) IDs that
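The vector store `description` flows straight through the create call. A short sketch assuming a configured client; `name` is an existing parameter not shown in these hunks:

vector_store = client.vector_stores.create(
  name: "support-kb",
  description: "Product manuals and FAQ articles used by the support assistant."
)
puts vector_store.id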
data/sig/openai/models/audio/transcription_create_params.rbs
@@ -7,6 +7,8 @@ module OpenAI
        model: OpenAI::Models::Audio::TranscriptionCreateParams::model,
        chunking_strategy: OpenAI::Models::Audio::TranscriptionCreateParams::chunking_strategy?,
        include: ::Array[OpenAI::Models::Audio::transcription_include],
+        known_speaker_names: ::Array[String],
+        known_speaker_references: ::Array[String],
        language: String,
        prompt: String,
        response_format: OpenAI::Models::audio_response_format,
@@ -31,6 +33,14 @@ module OpenAI
          ::Array[OpenAI::Models::Audio::transcription_include]
        ) -> ::Array[OpenAI::Models::Audio::transcription_include]
 
+        attr_reader known_speaker_names: ::Array[String]?
+
+        def known_speaker_names=: (::Array[String]) -> ::Array[String]
+
+        attr_reader known_speaker_references: ::Array[String]?
+
+        def known_speaker_references=: (::Array[String]) -> ::Array[String]
+
        attr_reader language: String?
 
        def language=: (String) -> String
@@ -60,6 +70,8 @@ module OpenAI
          model: OpenAI::Models::Audio::TranscriptionCreateParams::model,
          ?chunking_strategy: OpenAI::Models::Audio::TranscriptionCreateParams::chunking_strategy?,
          ?include: ::Array[OpenAI::Models::Audio::transcription_include],
+          ?known_speaker_names: ::Array[String],
+          ?known_speaker_references: ::Array[String],
          ?language: String,
          ?prompt: String,
          ?response_format: OpenAI::Models::audio_response_format,
@@ -73,6 +85,8 @@ module OpenAI
          model: OpenAI::Models::Audio::TranscriptionCreateParams::model,
          chunking_strategy: OpenAI::Models::Audio::TranscriptionCreateParams::chunking_strategy?,
          include: ::Array[OpenAI::Models::Audio::transcription_include],
+          known_speaker_names: ::Array[String],
+          known_speaker_references: ::Array[String],
          language: String,
          prompt: String,
          response_format: OpenAI::Models::audio_response_format,
data/sig/openai/models/audio/transcription_create_response.rbs
@@ -2,7 +2,9 @@ module OpenAI
   module Models
     module Audio
       type transcription_create_response =
-        OpenAI::Audio::Transcription
+        OpenAI::Audio::Transcription
+        | OpenAI::Audio::TranscriptionDiarized
+        | OpenAI::Audio::TranscriptionVerbose
 
      module TranscriptionCreateResponse
        extend OpenAI::Internal::Type::Union