openai 0.31.0 → 0.32.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +14 -0
  3. data/README.md +1 -1
  4. data/lib/openai/internal/util.rb +5 -5
  5. data/lib/openai/models/audio/transcription_create_params.rb +42 -11
  6. data/lib/openai/models/audio/transcription_create_response.rb +4 -1
  7. data/lib/openai/models/audio/transcription_diarized.rb +160 -0
  8. data/lib/openai/models/audio/transcription_diarized_segment.rb +65 -0
  9. data/lib/openai/models/audio/transcription_stream_event.rb +7 -4
  10. data/lib/openai/models/audio/transcription_text_delta_event.rb +10 -1
  11. data/lib/openai/models/audio/transcription_text_segment_event.rb +63 -0
  12. data/lib/openai/models/audio_model.rb +1 -0
  13. data/lib/openai/models/audio_response_format.rb +5 -2
  14. data/lib/openai/models/realtime/audio_transcription.rb +8 -6
  15. data/lib/openai/models/vector_store_create_params.rb +10 -1
  16. data/lib/openai/resources/audio/transcriptions.rb +12 -4
  17. data/lib/openai/resources/vector_stores.rb +3 -1
  18. data/lib/openai/version.rb +1 -1
  19. data/lib/openai.rb +3 -0
  20. data/rbi/openai/models/audio/transcription_create_params.rbi +66 -16
  21. data/rbi/openai/models/audio/transcription_create_response.rbi +1 -0
  22. data/rbi/openai/models/audio/transcription_diarized.rbi +281 -0
  23. data/rbi/openai/models/audio/transcription_diarized_segment.rbi +87 -0
  24. data/rbi/openai/models/audio/transcription_stream_event.rbi +4 -3
  25. data/rbi/openai/models/audio/transcription_text_delta_event.rbi +14 -1
  26. data/rbi/openai/models/audio/transcription_text_segment_event.rbi +86 -0
  27. data/rbi/openai/models/audio_model.rbi +2 -0
  28. data/rbi/openai/models/audio_response_format.rbi +6 -2
  29. data/rbi/openai/models/realtime/audio_transcription.rbi +15 -12
  30. data/rbi/openai/models/vector_store_create_params.rbi +13 -0
  31. data/rbi/openai/resources/audio/transcriptions.rbi +52 -14
  32. data/rbi/openai/resources/vector_stores.rbi +4 -0
  33. data/sig/openai/models/audio/transcription_create_params.rbs +14 -0
  34. data/sig/openai/models/audio/transcription_create_response.rbs +3 -1
  35. data/sig/openai/models/audio/transcription_diarized.rbs +129 -0
  36. data/sig/openai/models/audio/transcription_diarized_segment.rbs +47 -0
  37. data/sig/openai/models/audio/transcription_stream_event.rbs +2 -1
  38. data/sig/openai/models/audio/transcription_text_delta_event.rbs +9 -2
  39. data/sig/openai/models/audio/transcription_text_segment_event.rbs +47 -0
  40. data/sig/openai/models/audio_model.rbs +5 -1
  41. data/sig/openai/models/audio_response_format.rbs +3 -1
  42. data/sig/openai/models/realtime/audio_transcription.rbs +2 -2
  43. data/sig/openai/models/vector_store_create_params.rbs +7 -0
  44. data/sig/openai/resources/audio/transcriptions.rbs +4 -0
  45. data/sig/openai/resources/vector_stores.rbs +1 -0
  46. metadata +11 -2

data/rbi/openai/models/audio/transcription_diarized_segment.rbi

@@ -0,0 +1,87 @@
+# typed: strong
+
+module OpenAI
+  module Models
+    module Audio
+      class TranscriptionDiarizedSegment < OpenAI::Internal::Type::BaseModel
+        OrHash =
+          T.type_alias do
+            T.any(
+              OpenAI::Audio::TranscriptionDiarizedSegment,
+              OpenAI::Internal::AnyHash
+            )
+          end
+
+        # Unique identifier for the segment.
+        sig { returns(String) }
+        attr_accessor :id
+
+        # End timestamp of the segment in seconds.
+        sig { returns(Float) }
+        attr_accessor :end_
+
+        # Speaker label for this segment. When known speakers are provided, the label
+        # matches `known_speaker_names[]`. Otherwise speakers are labeled sequentially
+        # using capital letters (`A`, `B`, ...).
+        sig { returns(String) }
+        attr_accessor :speaker
+
+        # Start timestamp of the segment in seconds.
+        sig { returns(Float) }
+        attr_accessor :start
+
+        # Transcript text for this segment.
+        sig { returns(String) }
+        attr_accessor :text
+
+        # The type of the segment. Always `transcript.text.segment`.
+        sig { returns(Symbol) }
+        attr_accessor :type
+
+        # A segment of diarized transcript text with speaker metadata.
+        sig do
+          params(
+            id: String,
+            end_: Float,
+            speaker: String,
+            start: Float,
+            text: String,
+            type: Symbol
+          ).returns(T.attached_class)
+        end
+        def self.new(
+          # Unique identifier for the segment.
+          id:,
+          # End timestamp of the segment in seconds.
+          end_:,
+          # Speaker label for this segment. When known speakers are provided, the label
+          # matches `known_speaker_names[]`. Otherwise speakers are labeled sequentially
+          # using capital letters (`A`, `B`, ...).
+          speaker:,
+          # Start timestamp of the segment in seconds.
+          start:,
+          # Transcript text for this segment.
+          text:,
+          # The type of the segment. Always `transcript.text.segment`.
+          type: :"transcript.text.segment"
+        )
+        end
+
+        sig do
+          override.returns(
+            {
+              id: String,
+              end_: Float,
+              speaker: String,
+              start: Float,
+              text: String,
+              type: Symbol
+            }
+          )
+        end
+        def to_hash
+        end
+      end
+    end
+  end
+end
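
The segment model above backs the new `TranscriptionDiarized` response type (see `transcription_diarized.rb` in the file list). A minimal usage sketch, assuming a configured `OpenAI::Client` and that the diarized response exposes its segments through a `segments` accessor; the file name is illustrative:

# Example (not part of the diff): reading diarized segments from a completed transcription.
require "openai"
require "pathname"

client = OpenAI::Client.new(api_key: ENV["OPENAI_API_KEY"])
transcription = client.audio.transcriptions.create(
  file: Pathname("interview.mp3"),
  model: :"gpt-4o-transcribe-diarize",
  response_format: :diarized_json
)

if transcription.is_a?(OpenAI::Audio::TranscriptionDiarized)
  transcription.segments.each do |segment|
    # Each segment carries a speaker label plus start/end timestamps in seconds.
    puts format("[%s] %.1fs-%.1fs %s", segment.speaker, segment.start, segment.end_, segment.text)
  end
end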

data/rbi/openai/models/audio/transcription_stream_event.rbi

@@ -3,16 +3,17 @@
 module OpenAI
   module Models
     module Audio
-      # Emitted when there is an additional text delta. This is also the first event
-      # emitted when the transcription starts. Only emitted when you
+      # Emitted when a diarized transcription returns a completed segment with speaker
+      # information. Only emitted when you
       # [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
-      # with the `Stream` parameter set to `true`.
+      # with `stream` set to `true` and `response_format` set to `diarized_json`.
       module TranscriptionStreamEvent
         extend OpenAI::Internal::Type::Union

         Variants =
           T.type_alias do
             T.any(
+              OpenAI::Audio::TranscriptionTextSegmentEvent,
               OpenAI::Audio::TranscriptionTextDeltaEvent,
               OpenAI::Audio::TranscriptionTextDoneEvent
             )

data/rbi/openai/models/audio/transcription_text_delta_event.rbi

@@ -42,6 +42,14 @@ module OpenAI
         end
         attr_writer :logprobs

+        # Identifier of the diarized segment that this delta belongs to. Only present when
+        # using `gpt-4o-transcribe-diarize`.
+        sig { returns(T.nilable(String)) }
+        attr_reader :segment_id
+
+        sig { params(segment_id: String).void }
+        attr_writer :segment_id
+
         # Emitted when there is an additional text delta. This is also the first event
         # emitted when the transcription starts. Only emitted when you
         # [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
@@ -53,6 +61,7 @@ module OpenAI
              T::Array[
                OpenAI::Audio::TranscriptionTextDeltaEvent::Logprob::OrHash
              ],
+            segment_id: String,
             type: Symbol
           ).returns(T.attached_class)
         end
@@ -63,6 +72,9 @@ module OpenAI
           # [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
           # with the `include[]` parameter set to `logprobs`.
           logprobs: nil,
+          # Identifier of the diarized segment that this delta belongs to. Only present when
+          # using `gpt-4o-transcribe-diarize`.
+          segment_id: nil,
           # The type of the event. Always `transcript.text.delta`.
           type: :"transcript.text.delta"
         )
@@ -74,7 +86,8 @@ module OpenAI
              delta: String,
              type: Symbol,
              logprobs:
-                T::Array[OpenAI::Audio::TranscriptionTextDeltaEvent::Logprob]
+                T::Array[OpenAI::Audio::TranscriptionTextDeltaEvent::Logprob],
+              segment_id: String
             }
           )
         end

data/rbi/openai/models/audio/transcription_text_segment_event.rbi

@@ -0,0 +1,86 @@
+# typed: strong
+
+module OpenAI
+  module Models
+    module Audio
+      class TranscriptionTextSegmentEvent < OpenAI::Internal::Type::BaseModel
+        OrHash =
+          T.type_alias do
+            T.any(
+              OpenAI::Audio::TranscriptionTextSegmentEvent,
+              OpenAI::Internal::AnyHash
+            )
+          end
+
+        # Unique identifier for the segment.
+        sig { returns(String) }
+        attr_accessor :id
+
+        # End timestamp of the segment in seconds.
+        sig { returns(Float) }
+        attr_accessor :end_
+
+        # Speaker label for this segment.
+        sig { returns(String) }
+        attr_accessor :speaker
+
+        # Start timestamp of the segment in seconds.
+        sig { returns(Float) }
+        attr_accessor :start
+
+        # Transcript text for this segment.
+        sig { returns(String) }
+        attr_accessor :text
+
+        # The type of the event. Always `transcript.text.segment`.
+        sig { returns(Symbol) }
+        attr_accessor :type
+
+        # Emitted when a diarized transcription returns a completed segment with speaker
+        # information. Only emitted when you
+        # [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
+        # with `stream` set to `true` and `response_format` set to `diarized_json`.
+        sig do
+          params(
+            id: String,
+            end_: Float,
+            speaker: String,
+            start: Float,
+            text: String,
+            type: Symbol
+          ).returns(T.attached_class)
+        end
+        def self.new(
+          # Unique identifier for the segment.
+          id:,
+          # End timestamp of the segment in seconds.
+          end_:,
+          # Speaker label for this segment.
+          speaker:,
+          # Start timestamp of the segment in seconds.
+          start:,
+          # Transcript text for this segment.
+          text:,
+          # The type of the event. Always `transcript.text.segment`.
+          type: :"transcript.text.segment"
+        )
+        end
+
+        sig do
+          override.returns(
+            {
+              id: String,
+              end_: Float,
+              speaker: String,
+              start: Float,
+              text: String,
+              type: Symbol
+            }
+          )
+        end
+        def to_hash
+        end
+      end
+    end
+  end
+end
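
Together with the `segment_id` added to `TranscriptionTextDeltaEvent`, these segment events let a streaming consumer attribute partial text to speakers. A hedged sketch of consuming the stream, assuming a configured client; the event classes and the `#create_streaming` method come from this gem, while the file name and printing logic are illustrative:

# Example (not part of the diff): handling diarized streaming events.
require "openai"
require "pathname"

client = OpenAI::Client.new(api_key: ENV["OPENAI_API_KEY"])
stream = client.audio.transcriptions.create_streaming(
  file: Pathname("support_call.wav"),
  model: :"gpt-4o-transcribe-diarize",
  response_format: :diarized_json,
  chunking_strategy: :auto # required for diarized inputs longer than 30 seconds
)

stream.each do |event|
  case event
  when OpenAI::Audio::TranscriptionTextDeltaEvent
    print(event.delta) # event.segment_id links the delta to its diarized segment
  when OpenAI::Audio::TranscriptionTextSegmentEvent
    puts("\n[#{event.speaker}] #{event.start.round(1)}s-#{event.end_.round(1)}s #{event.text}")
  when OpenAI::Audio::TranscriptionTextDoneEvent
    puts("\n(transcription complete)")
  end
end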

data/rbi/openai/models/audio_model.rbi

@@ -13,6 +13,8 @@ module OpenAI
        T.let(:"gpt-4o-transcribe", OpenAI::AudioModel::TaggedSymbol)
      GPT_4O_MINI_TRANSCRIBE =
        T.let(:"gpt-4o-mini-transcribe", OpenAI::AudioModel::TaggedSymbol)
+      GPT_4O_TRANSCRIBE_DIARIZE =
+        T.let(:"gpt-4o-transcribe-diarize", OpenAI::AudioModel::TaggedSymbol)

      sig { override.returns(T::Array[OpenAI::AudioModel::TaggedSymbol]) }
      def self.values

data/rbi/openai/models/audio_response_format.rbi

@@ -3,8 +3,10 @@
 module OpenAI
   module Models
     # The format of the output, in one of these options: `json`, `text`, `srt`,
-    # `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
-    # the only supported format is `json`.
+    # `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+    # `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+    # `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+    # `diarized_json`, with `diarized_json` required to receive speaker annotations.
     module AudioResponseFormat
       extend OpenAI::Internal::Type::Enum

@@ -17,6 +19,8 @@ module OpenAI
       VERBOSE_JSON =
         T.let(:verbose_json, OpenAI::AudioResponseFormat::TaggedSymbol)
       VTT = T.let(:vtt, OpenAI::AudioResponseFormat::TaggedSymbol)
+      DIARIZED_JSON =
+        T.let(:diarized_json, OpenAI::AudioResponseFormat::TaggedSymbol)

       sig do
         override.returns(T::Array[OpenAI::AudioResponseFormat::TaggedSymbol])

data/rbi/openai/models/realtime/audio_transcription.rbi

@@ -22,7 +22,8 @@ module OpenAI
         attr_writer :language

         # The model to use for transcription. Current options are `whisper-1`,
-        # `gpt-4o-transcribe-latest`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`.
+        # `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`.
+        # Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.
         sig do
           returns(
             T.nilable(OpenAI::Realtime::AudioTranscription::Model::OrSymbol)
@@ -40,8 +41,8 @@ module OpenAI
         # An optional text to guide the model's style or continue a previous audio
         # segment. For `whisper-1`, the
         # [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
-        # For `gpt-4o-transcribe` models, the prompt is a free text string, for example
-        # "expect words related to technology".
+        # For `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the
+        # prompt is a free text string, for example "expect words related to technology".
         sig { returns(T.nilable(String)) }
         attr_reader :prompt

@@ -61,13 +62,14 @@ module OpenAI
           # format will improve accuracy and latency.
           language: nil,
           # The model to use for transcription. Current options are `whisper-1`,
-          # `gpt-4o-transcribe-latest`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`.
+          # `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`.
+          # Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.
           model: nil,
           # An optional text to guide the model's style or continue a previous audio
           # segment. For `whisper-1`, the
           # [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
-          # For `gpt-4o-transcribe` models, the prompt is a free text string, for example
-          # "expect words related to technology".
+          # For `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the
+          # prompt is a free text string, for example "expect words related to technology".
           prompt: nil
         )
         end
@@ -85,7 +87,8 @@ module OpenAI
         end

         # The model to use for transcription. Current options are `whisper-1`,
-        # `gpt-4o-transcribe-latest`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`.
+        # `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`.
+        # Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.
         module Model
           extend OpenAI::Internal::Type::Enum

@@ -100,11 +103,6 @@ module OpenAI
              :"whisper-1",
              OpenAI::Realtime::AudioTranscription::Model::TaggedSymbol
            )
-          GPT_4O_TRANSCRIBE_LATEST =
-            T.let(
-              :"gpt-4o-transcribe-latest",
-              OpenAI::Realtime::AudioTranscription::Model::TaggedSymbol
-            )
           GPT_4O_MINI_TRANSCRIBE =
             T.let(
               :"gpt-4o-mini-transcribe",
@@ -115,6 +113,11 @@ module OpenAI
              :"gpt-4o-transcribe",
              OpenAI::Realtime::AudioTranscription::Model::TaggedSymbol
            )
+          GPT_4O_TRANSCRIBE_DIARIZE =
+            T.let(
+              :"gpt-4o-transcribe-diarize",
+              OpenAI::Realtime::AudioTranscription::Model::TaggedSymbol
+            )

           sig do
             override.returns(
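
For realtime sessions, the enum change above drops `gpt-4o-transcribe-latest` and adds `gpt-4o-transcribe-diarize` as an accepted transcription model. A small sketch of building that config, assuming it is then attached to a realtime session's input-audio transcription settings:

# Example (not part of the diff): selecting the new model for realtime transcription.
require "openai"

transcription_config = OpenAI::Realtime::AudioTranscription.new(
  model: :"gpt-4o-transcribe-diarize", # enum value added in 0.32.0
  language: "en"
)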

data/rbi/openai/models/vector_store_create_params.rbi

@@ -36,6 +36,14 @@ module OpenAI
       end
       attr_writer :chunking_strategy

+      # A description for the vector store. Can be used to describe the vector store's
+      # purpose.
+      sig { returns(T.nilable(String)) }
+      attr_reader :description
+
+      sig { params(description: String).void }
+      attr_writer :description
+
       # The expiration policy for a vector store.
       sig { returns(T.nilable(OpenAI::VectorStoreCreateParams::ExpiresAfter)) }
       attr_reader :expires_after
@@ -79,6 +87,7 @@ module OpenAI
              OpenAI::AutoFileChunkingStrategyParam::OrHash,
              OpenAI::StaticFileChunkingStrategyObjectParam::OrHash
            ),
+          description: String,
           expires_after: OpenAI::VectorStoreCreateParams::ExpiresAfter::OrHash,
           file_ids: T::Array[String],
           metadata: T.nilable(T::Hash[Symbol, String]),
@@ -90,6 +99,9 @@ module OpenAI
        # The chunking strategy used to chunk the file(s). If not set, will use the `auto`
        # strategy. Only applicable if `file_ids` is non-empty.
        chunking_strategy: nil,
+        # A description for the vector store. Can be used to describe the vector store's
+        # purpose.
+        description: nil,
        # The expiration policy for a vector store.
        expires_after: nil,
        # A list of [File](https://platform.openai.com/docs/api-reference/files) IDs that
@@ -117,6 +129,7 @@ module OpenAI
                OpenAI::AutoFileChunkingStrategyParam,
                OpenAI::StaticFileChunkingStrategyObjectParam
              ),
+            description: String,
             expires_after: OpenAI::VectorStoreCreateParams::ExpiresAfter,
             file_ids: T::Array[String],
             metadata: T.nilable(T::Hash[Symbol, String]),
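
A minimal sketch of passing the new field at creation time, assuming a configured client; the `name` value is illustrative, and the corresponding resource method appears in the `vector_stores.rbi` hunk further below:

# Example (not part of the diff): creating a vector store with a description.
require "openai"

client = OpenAI::Client.new(api_key: ENV["OPENAI_API_KEY"])
vector_store = client.vector_stores.create(
  name: "support-articles",
  description: "Help-center articles used to ground the support assistant."
)
puts vector_store.id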

data/rbi/openai/resources/audio/transcriptions.rbi

@@ -20,6 +20,8 @@ module OpenAI
                )
              ),
            include: T::Array[OpenAI::Audio::TranscriptionInclude::OrSymbol],
+            known_speaker_names: T::Array[String],
+            known_speaker_references: T::Array[String],
            language: String,
            prompt: String,
            response_format: OpenAI::AudioResponseFormat::OrSymbol,
@@ -39,20 +41,33 @@ module OpenAI
          # flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
          file:,
          # ID of the model to use. The options are `gpt-4o-transcribe`,
-          # `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
-          # Whisper V2 model).
+          # `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
+          # Whisper V2 model), and `gpt-4o-transcribe-diarize`.
          model:,
          # Controls how the audio is cut into chunks. When set to `"auto"`, the server
          # first normalizes loudness and then uses voice activity detection (VAD) to choose
          # boundaries. `server_vad` object can be provided to tweak VAD detection
          # parameters manually. If unset, the audio is transcribed as a single block.
+          # Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
+          # seconds.
          chunking_strategy: nil,
          # Additional information to include in the transcription response. `logprobs` will
          # return the log probabilities of the tokens in the response to understand the
          # model's confidence in the transcription. `logprobs` only works with
          # response_format set to `json` and only with the models `gpt-4o-transcribe` and
-          # `gpt-4o-mini-transcribe`.
+          # `gpt-4o-mini-transcribe`. This field is not supported when using
+          # `gpt-4o-transcribe-diarize`.
          include: nil,
+          # Optional list of speaker names that correspond to the audio samples provided in
+          # `known_speaker_references[]`. Each entry should be a short identifier (for
+          # example `customer` or `agent`). Up to 4 speakers are supported.
+          known_speaker_names: nil,
+          # Optional list of audio samples (as
+          # [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
+          # that contain known speaker references matching `known_speaker_names[]`. Each
+          # sample must be between 2 and 10 seconds, and can use any of the same input audio
+          # formats supported by `file`.
+          known_speaker_references: nil,
          # The language of the input audio. Supplying the input language in
          # [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
          # format will improve accuracy and latency.
@@ -60,11 +75,14 @@ module OpenAI
          # An optional text to guide the model's style or continue a previous audio
          # segment. The
          # [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
-          # should match the audio language.
+          # should match the audio language. This field is not supported when using
+          # `gpt-4o-transcribe-diarize`.
          prompt: nil,
          # The format of the output, in one of these options: `json`, `text`, `srt`,
-          # `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
-          # the only supported format is `json`.
+          # `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+          # `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+          # `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+          # `diarized_json`, with `diarized_json` required to receive speaker annotations.
          response_format: nil,
          # The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
          # output more random, while lower values like 0.2 will make it more focused and
@@ -76,7 +94,8 @@ module OpenAI
          # `response_format` must be set `verbose_json` to use timestamp granularities.
          # Either or both of these options are supported: `word`, or `segment`. Note: There
          # is no additional latency for segment timestamps, but generating word timestamps
-          # incurs additional latency.
+          # incurs additional latency. This option is not available for
+          # `gpt-4o-transcribe-diarize`.
          timestamp_granularities: nil,
          # There is no need to provide `stream:`. Instead, use `#create_streaming` or
          # `#create` for streaming and non-streaming use cases, respectively.
@@ -101,6 +120,8 @@ module OpenAI
                )
              ),
            include: T::Array[OpenAI::Audio::TranscriptionInclude::OrSymbol],
+            known_speaker_names: T::Array[String],
+            known_speaker_references: T::Array[String],
            language: String,
            prompt: String,
            response_format: OpenAI::AudioResponseFormat::OrSymbol,
@@ -122,20 +143,33 @@ module OpenAI
          # flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
          file:,
          # ID of the model to use. The options are `gpt-4o-transcribe`,
-          # `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
-          # Whisper V2 model).
+          # `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
+          # Whisper V2 model), and `gpt-4o-transcribe-diarize`.
          model:,
          # Controls how the audio is cut into chunks. When set to `"auto"`, the server
          # first normalizes loudness and then uses voice activity detection (VAD) to choose
          # boundaries. `server_vad` object can be provided to tweak VAD detection
          # parameters manually. If unset, the audio is transcribed as a single block.
+          # Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
+          # seconds.
          chunking_strategy: nil,
          # Additional information to include in the transcription response. `logprobs` will
          # return the log probabilities of the tokens in the response to understand the
          # model's confidence in the transcription. `logprobs` only works with
          # response_format set to `json` and only with the models `gpt-4o-transcribe` and
-          # `gpt-4o-mini-transcribe`.
+          # `gpt-4o-mini-transcribe`. This field is not supported when using
+          # `gpt-4o-transcribe-diarize`.
          include: nil,
+          # Optional list of speaker names that correspond to the audio samples provided in
+          # `known_speaker_references[]`. Each entry should be a short identifier (for
+          # example `customer` or `agent`). Up to 4 speakers are supported.
+          known_speaker_names: nil,
+          # Optional list of audio samples (as
+          # [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
+          # that contain known speaker references matching `known_speaker_names[]`. Each
+          # sample must be between 2 and 10 seconds, and can use any of the same input audio
+          # formats supported by `file`.
+          known_speaker_references: nil,
          # The language of the input audio. Supplying the input language in
          # [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
          # format will improve accuracy and latency.
@@ -143,11 +177,14 @@ module OpenAI
          # An optional text to guide the model's style or continue a previous audio
          # segment. The
          # [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
-          # should match the audio language.
+          # should match the audio language. This field is not supported when using
+          # `gpt-4o-transcribe-diarize`.
          prompt: nil,
          # The format of the output, in one of these options: `json`, `text`, `srt`,
-          # `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
-          # the only supported format is `json`.
+          # `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
+          # `gpt-4o-mini-transcribe`, the only supported format is `json`. For
+          # `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
+          # `diarized_json`, with `diarized_json` required to receive speaker annotations.
          response_format: nil,
          # The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
          # output more random, while lower values like 0.2 will make it more focused and
@@ -159,7 +196,8 @@ module OpenAI
          # `response_format` must be set `verbose_json` to use timestamp granularities.
          # Either or both of these options are supported: `word`, or `segment`. Note: There
          # is no additional latency for segment timestamps, but generating word timestamps
-          # incurs additional latency.
+          # incurs additional latency. This option is not available for
+          # `gpt-4o-transcribe-diarize`.
          timestamp_granularities: nil,
          # There is no need to provide `stream:`. Instead, use `#create_streaming` or
          # `#create` for streaming and non-streaming use cases, respectively.
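
Putting the new create parameters together: a hedged sketch of a diarized, non-streaming request with known speaker references, assuming a configured client. The parameter names and constraints (up to 4 speakers, 2-10 second samples supplied as data URLs) come from the documentation above; the file names and the data-URL helper are illustrative:

# Example (not part of the diff): diarized transcription with known speakers.
require "openai"
require "base64"
require "pathname"

# Illustrative helper: wrap a short reference clip as a data URL.
def audio_data_url(path)
  "data:audio/wav;base64,#{Base64.strict_encode64(File.binread(path))}"
end

client = OpenAI::Client.new(api_key: ENV["OPENAI_API_KEY"])
transcription = client.audio.transcriptions.create(
  file: Pathname("support_call.wav"),
  model: :"gpt-4o-transcribe-diarize",
  response_format: :diarized_json,
  chunking_strategy: :auto, # required for inputs longer than 30 seconds
  known_speaker_names: ["agent", "customer"],
  known_speaker_references: [
    audio_data_url("agent_sample.wav"),    # 2-10 second clip of the agent
    audio_data_url("customer_sample.wav")  # 2-10 second clip of the customer
  ]
)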

data/rbi/openai/resources/vector_stores.rbi

@@ -17,6 +17,7 @@ module OpenAI
            OpenAI::AutoFileChunkingStrategyParam::OrHash,
            OpenAI::StaticFileChunkingStrategyObjectParam::OrHash
          ),
+        description: String,
         expires_after: OpenAI::VectorStoreCreateParams::ExpiresAfter::OrHash,
         file_ids: T::Array[String],
         metadata: T.nilable(T::Hash[Symbol, String]),
@@ -28,6 +29,9 @@ module OpenAI
      # The chunking strategy used to chunk the file(s). If not set, will use the `auto`
      # strategy. Only applicable if `file_ids` is non-empty.
      chunking_strategy: nil,
+      # A description for the vector store. Can be used to describe the vector store's
+      # purpose.
+      description: nil,
      # The expiration policy for a vector store.
      expires_after: nil,
      # A list of [File](https://platform.openai.com/docs/api-reference/files) IDs that

data/sig/openai/models/audio/transcription_create_params.rbs

@@ -7,6 +7,8 @@ module OpenAI
          model: OpenAI::Models::Audio::TranscriptionCreateParams::model,
          chunking_strategy: OpenAI::Models::Audio::TranscriptionCreateParams::chunking_strategy?,
          include: ::Array[OpenAI::Models::Audio::transcription_include],
+          known_speaker_names: ::Array[String],
+          known_speaker_references: ::Array[String],
          language: String,
          prompt: String,
          response_format: OpenAI::Models::audio_response_format,
@@ -31,6 +33,14 @@ module OpenAI
          ::Array[OpenAI::Models::Audio::transcription_include]
        ) -> ::Array[OpenAI::Models::Audio::transcription_include]

+        attr_reader known_speaker_names: ::Array[String]?
+
+        def known_speaker_names=: (::Array[String]) -> ::Array[String]
+
+        attr_reader known_speaker_references: ::Array[String]?
+
+        def known_speaker_references=: (::Array[String]) -> ::Array[String]
+
        attr_reader language: String?

        def language=: (String) -> String
@@ -60,6 +70,8 @@ module OpenAI
          model: OpenAI::Models::Audio::TranscriptionCreateParams::model,
          ?chunking_strategy: OpenAI::Models::Audio::TranscriptionCreateParams::chunking_strategy?,
          ?include: ::Array[OpenAI::Models::Audio::transcription_include],
+          ?known_speaker_names: ::Array[String],
+          ?known_speaker_references: ::Array[String],
          ?language: String,
          ?prompt: String,
          ?response_format: OpenAI::Models::audio_response_format,
@@ -73,6 +85,8 @@ module OpenAI
          model: OpenAI::Models::Audio::TranscriptionCreateParams::model,
          chunking_strategy: OpenAI::Models::Audio::TranscriptionCreateParams::chunking_strategy?,
          include: ::Array[OpenAI::Models::Audio::transcription_include],
+          known_speaker_names: ::Array[String],
+          known_speaker_references: ::Array[String],
          language: String,
          prompt: String,
          response_format: OpenAI::Models::audio_response_format,

data/sig/openai/models/audio/transcription_create_response.rbs

@@ -2,7 +2,9 @@ module OpenAI
   module Models
     module Audio
       type transcription_create_response =
-        OpenAI::Audio::Transcription | OpenAI::Audio::TranscriptionVerbose
+        OpenAI::Audio::Transcription
+        | OpenAI::Audio::TranscriptionDiarized
+        | OpenAI::Audio::TranscriptionVerbose

       module TranscriptionCreateResponse
         extend OpenAI::Internal::Type::Union