youtube-transcript-rb 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of youtube-transcript-rb might be problematic. Click here for more details.

Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +42 -42
  3. data/lib/youtube-transcript-rb.rb +3 -0
  4. data/lib/youtube_rb/transcript/api.rb +148 -0
  5. data/lib/youtube_rb/transcript/errors.rb +215 -0
  6. data/lib/youtube_rb/transcript/formatters.rb +267 -0
  7. data/lib/youtube_rb/transcript/settings.rb +26 -0
  8. data/lib/youtube_rb/transcript/transcript.rb +237 -0
  9. data/lib/youtube_rb/transcript/transcript_list.rb +168 -0
  10. data/lib/youtube_rb/transcript/transcript_list_fetcher.rb +223 -0
  11. data/lib/youtube_rb/transcript/transcript_parser.rb +81 -0
  12. data/lib/{youtube/transcript/rb → youtube_rb/transcript}/version.rb +2 -4
  13. data/lib/youtube_rb/transcript.rb +35 -0
  14. data/sig/youtube_rb/transcript.rbs +6 -0
  15. data/spec/api_spec.rb +20 -20
  16. data/spec/errors_spec.rb +39 -39
  17. data/spec/formatters_spec.rb +36 -36
  18. data/spec/integration_spec.rb +32 -32
  19. data/spec/settings_spec.rb +16 -16
  20. data/spec/spec_helper.rb +1 -1
  21. data/spec/transcript_list_fetcher_spec.rb +27 -27
  22. data/spec/transcript_list_spec.rb +6 -6
  23. data/spec/transcript_parser_spec.rb +3 -3
  24. data/spec/transcript_spec.rb +16 -16
  25. metadata +13 -12
  26. data/lib/youtube/transcript/rb/api.rb +0 -150
  27. data/lib/youtube/transcript/rb/errors.rb +0 -217
  28. data/lib/youtube/transcript/rb/formatters.rb +0 -269
  29. data/lib/youtube/transcript/rb/settings.rb +0 -28
  30. data/lib/youtube/transcript/rb/transcript.rb +0 -239
  31. data/lib/youtube/transcript/rb/transcript_list.rb +0 -170
  32. data/lib/youtube/transcript/rb/transcript_list_fetcher.rb +0 -225
  33. data/lib/youtube/transcript/rb/transcript_parser.rb +0 -83
  34. data/lib/youtube/transcript/rb.rb +0 -37
  35. data/sig/youtube/transcript/rb.rbs +0 -8
@@ -0,0 +1,267 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
+ module YoutubeRb
6
+ module Transcript
7
+ # Module containing all transcript formatters
8
+ module Formatters
9
+ # Base formatter class. All formatters should inherit from this class
10
+ # and implement their own format_transcript and format_transcripts methods.
11
class Formatter
  # Render a single transcript. The base class has no output format of its
  # own, so this implementation always raises.
  #
  # @param transcript [FetchedTranscript] the transcript to render (unused here)
  # @param options [Hash] formatter-specific options (unused here)
  # @return [String] the rendered transcript (in subclasses)
  # @raise [NotImplementedError] always, in the base class
  def format_transcript(transcript, **options)
    message = "Subclass must implement #format_transcript"
    raise NotImplementedError, message
  end

  # Render several transcripts at once. The base class always raises.
  #
  # @param transcripts [Array<FetchedTranscript>] the transcripts to render (unused here)
  # @param options [Hash] formatter-specific options (unused here)
  # @return [String] the rendered transcripts (in subclasses)
  # @raise [NotImplementedError] always, in the base class
  def format_transcripts(transcripts, **options)
    message = "Subclass must implement #format_transcripts"
    raise NotImplementedError, message
  end
end
30
+
31
+ # Formats transcript as pretty-printed Ruby data structures
32
+ class PrettyPrintFormatter < Formatter
33
+ # Format a single transcript as pretty-printed output
34
+ #
35
+ # @param transcript [FetchedTranscript] The transcript to format
36
+ # @param options [Hash] Options passed to PP.pp
37
+ # @return [String] Pretty-printed transcript data
38
def format_transcript(transcript, **options)
  # pp is loaded lazily so it is only pulled in when this formatter is used.
  require "pp"

  # PP.pp appends to (and returns) the buffer it is given; the unary plus
  # yields an unfrozen string despite frozen string literals.
  buffer = +""
  line_width = options[:width] || 79
  PP.pp(transcript.to_raw_data, buffer, line_width)
end
42
+
43
+ # Format multiple transcripts as pretty-printed output
44
+ #
45
+ # @param transcripts [Array<FetchedTranscript>] The transcripts to format
46
+ # @param options [Hash] Options passed to PP.pp
47
+ # @return [String] Pretty-printed transcripts data
48
def format_transcripts(transcripts, **options)
  # pp is loaded lazily so it is only pulled in when this formatter is used.
  require "pp"

  # One raw-data array per transcript, pretty-printed as a single nested array.
  raw_sets = transcripts.map { |transcript| transcript.to_raw_data }
  line_width = options[:width] || 79
  PP.pp(raw_sets, +"", line_width)
end
53
+ end
54
+
55
+ # Formats transcript as JSON
56
+ class JSONFormatter < Formatter
57
+ # Format a single transcript as JSON
58
+ #
59
+ # @param transcript [FetchedTranscript] The transcript to format
60
+ # @param options [Hash] Options passed to JSON.generate (e.g., :indent, :space)
61
+ # @return [String] JSON representation of the transcript
62
def format_transcript(transcript, **options)
  # The collected keyword options are handed to JSON.generate as its options hash.
  rows = transcript.to_raw_data
  JSON.generate(rows, options)
end
65
+
66
+ # Format multiple transcripts as JSON array
67
+ #
68
+ # @param transcripts [Array<FetchedTranscript>] The transcripts to format
69
+ # @param options [Hash] Options passed to JSON.generate
70
+ # @return [String] JSON array representation of the transcripts
71
def format_transcripts(transcripts, **options)
  # Each transcript contributes one inner array, giving a JSON array of arrays.
  raw_sets = transcripts.map { |transcript| transcript.to_raw_data }
  JSON.generate(raw_sets, options)
end
75
+ end
76
+
77
+ # Formats transcript as plain text (text only, no timestamps)
78
+ class TextFormatter < Formatter
79
+ # Format a single transcript as plain text
80
+ #
81
+ # @param transcript [FetchedTranscript] The transcript to format
82
+ # @param options [Hash] Unused options
83
+ # @return [String] Plain text with each line separated by newlines
84
def format_transcript(transcript, **options)
  # Only the text of each snippet is kept; timing information is dropped.
  text_lines = transcript.map { |snippet| snippet.text }
  text_lines.join("\n")
end
87
+
88
+ # Format multiple transcripts as plain text
89
+ #
90
+ # @param transcripts [Array<FetchedTranscript>] The transcripts to format
91
+ # @param options [Hash] Unused options
92
+ # @return [String] Plain text with transcripts separated by triple newlines
93
def format_transcripts(transcripts, **options)
  # Render each transcript on its own, then separate them with two blank lines.
  rendered = transcripts.map do |transcript|
    format_transcript(transcript, **options)
  end
  rendered.join("\n\n\n")
end
96
+ end
97
+
98
+ # Base class for timestamp-based formatters (SRT, WebVTT)
99
+ class TextBasedFormatter < TextFormatter
100
+ # Format a single transcript with timestamps
101
+ #
102
+ # @param transcript [FetchedTranscript] The transcript to format
103
+ # @param options [Hash] Unused options
104
+ # @return [String] Formatted transcript with timestamps
105
def format_transcript(transcript, **options)
  entries = transcript.to_a

  rendered = entries.each_with_index.map do |entry, position|
    finish = entry.start + entry.duration

    # Avoid overlapping cues: if the following snippet starts before this
    # one would end, cut this one off at the follower's start time.
    follower = entries[position + 1]
    finish = follower.start if follower && follower.start < finish

    time_range = "#{seconds_to_timestamp(entry.start)} --> #{seconds_to_timestamp(finish)}"
    format_transcript_helper(position, time_range, entry)
  end

  format_transcript_header(rendered)
end
123
+
124
+ protected
125
+
126
+ # Format a timestamp from components
127
+ #
128
+ # @param hours [Integer] Hours component
129
+ # @param mins [Integer] Minutes component
130
+ # @param secs [Integer] Seconds component
131
+ # @param ms [Integer] Milliseconds component
132
+ # @return [String] Formatted timestamp
133
def format_timestamp(hours, mins, secs, ms)
  # Abstract hook: each concrete subtitle format supplies its own layout.
  message = "Subclass must implement #format_timestamp"
  raise NotImplementedError, message
end
136
+
137
+ # Format the transcript header/wrapper
138
+ #
139
+ # @param lines [Array<String>] The formatted lines
140
+ # @return [String] The complete formatted transcript
141
def format_transcript_header(lines)
  # Abstract hook: subclasses decide how the entries are joined and wrapped.
  message = "Subclass must implement #format_transcript_header"
  raise NotImplementedError, message
end
144
+
145
+ # Format a single transcript entry
146
+ #
147
+ # @param index [Integer] The entry index (0-based)
148
+ # @param time_text [String] The formatted time range
149
+ # @param snippet [TranscriptSnippet] The snippet to format
150
+ # @return [String] The formatted entry
151
def format_transcript_helper(index, time_text, snippet)
  # Abstract hook: subclasses decide how a single entry is laid out.
  message = "Subclass must implement #format_transcript_helper"
  raise NotImplementedError, message
end
154
+
155
+ private
156
+
157
+ # Convert seconds to timestamp string
158
+ #
159
+ # @param time [Float] Time in seconds
160
+ # @return [String] Formatted timestamp
161
def seconds_to_timestamp(time)
  # Round once to whole milliseconds and only then split into components.
  # Rounding the fractional part separately could yield ms == 1000 (for
  # example 59.9996 became "00:00:59,1000") because the carry never reached
  # the seconds, minutes or hours fields.
  total_ms = (time.to_f * 1000).round

  total_secs, ms = total_ms.divmod(1000)
  total_mins, secs = total_secs.divmod(60)
  hours, mins = total_mins.divmod(60)

  # The concrete layout is delegated to the subclass hook.
  format_timestamp(hours, mins, secs, ms)
end
170
+ end
171
+
172
+ # Formats transcript as SRT (SubRip) subtitle format
173
+ #
174
+ # @example SRT format
175
+ # 1
176
+ # 00:00:00,000 --> 00:00:02,500
177
+ # Hello world
178
+ #
179
+ # 2
180
+ # 00:00:02,500 --> 00:00:05,000
181
+ # This is a test
182
+ #
183
+ class SRTFormatter < TextBasedFormatter
184
+ protected
185
+
186
def format_timestamp(hours, mins, secs, ms)
  # This layout uses a comma between seconds and milliseconds.
  "%02d:%02d:%02d,%03d" % [hours, mins, secs, ms]
end
189
+
190
def format_transcript_header(lines)
  # Entries are separated by one blank line; the output ends with a newline.
  "#{lines.join("\n\n")}\n"
end
193
+
194
def format_transcript_helper(index, time_text, snippet)
  # The incoming index is 0-based; the emitted sequence number starts at 1.
  sequence_number = index + 1
  "#{sequence_number}\n#{time_text}\n#{snippet.text}"
end
197
+ end
198
+
199
+ # Formats transcript as WebVTT (Web Video Text Tracks) format
200
+ #
201
+ # @example WebVTT format
202
+ # WEBVTT
203
+ #
204
+ # 00:00:00.000 --> 00:00:02.500
205
+ # Hello world
206
+ #
207
+ # 00:00:02.500 --> 00:00:05.000
208
+ # This is a test
209
+ #
210
+ class WebVTTFormatter < TextBasedFormatter
211
+ protected
212
+
213
def format_timestamp(hours, mins, secs, ms)
  # This layout uses a dot between seconds and milliseconds.
  "%02d:%02d:%02d.%03d" % [hours, mins, secs, ms]
end
216
+
217
def format_transcript_header(lines)
  # The output starts with the "WEBVTT" line followed by a blank line; cues
  # are separated by blank lines and the output ends with a newline.
  "WEBVTT\n\n#{lines.join("\n\n")}\n"
end
220
+
221
def format_transcript_helper(index, time_text, snippet)
  # The index is accepted to match the shared hook signature but is not
  # part of the output here.
  [time_text, snippet.text].join("\n")
end
224
+ end
225
+
226
+ # Utility class to load formatters by type name
227
class FormatterLoader
  # Supported format names mapped to the formatter class that implements them.
  TYPES = {
    "json" => JSONFormatter,
    "pretty" => PrettyPrintFormatter,
    "text" => TextFormatter,
    "webvtt" => WebVTTFormatter,
    "srt" => SRTFormatter
  }.freeze

  # Raised when a format name is not one of the keys in TYPES.
  class UnknownFormatterType < StandardError
    # @param formatter_type [String] the rejected format name
    def initialize(formatter_type)
      supported = TYPES.keys.join(", ")
      super("The format '#{formatter_type}' is not supported. Choose one of the following formats: #{supported}")
    end
  end

  # Instantiate the formatter registered under the given name.
  #
  # @param formatter_type [String, #to_s] the format name (json, pretty, text, webvtt, srt)
  # @return [Formatter] a new instance of the matching formatter class
  # @raise [UnknownFormatterType] if the name is not a key of TYPES
  #
  # @example
  #   formatter = FormatterLoader.new.load("json")
  #   output = formatter.format_transcript(transcript)
  def load(formatter_type = "pretty")
    # Normalise to a String so symbols such as :json are accepted too.
    type_name = formatter_type.to_s
    formatter_class = TYPES.fetch(type_name) { raise UnknownFormatterType, type_name }
    formatter_class.new
  end
end
265
+ end
266
+ end
267
+ end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
module YoutubeRb
  module Transcript
    # YouTube watch URL template
    # @example
    #   format(WATCH_URL, video_id: "abc123")
    #   # => "https://www.youtube.com/watch?v=abc123"
    WATCH_URL = "https://www.youtube.com/watch?v=%<video_id>s"

    # YouTube Innertube API URL template
    # @example
    #   format(INNERTUBE_API_URL, api_key: "key123")
    #   # => "https://www.youtube.com/youtubei/v1/player?key=key123"
    INNERTUBE_API_URL = "https://www.youtube.com/youtubei/v1/player?key=%<api_key>s"

    # Innertube API context describing an Android client.
    #
    # Both the outer and the nested hash are frozen: freezing only the outer
    # hash would still let callers mutate the shared "client" entry in place.
    INNERTUBE_CONTEXT = {
      "client" => {
        "clientName" => "ANDROID",
        "clientVersion" => "20.10.38"
      }.freeze
    }.freeze
  end
end
@@ -0,0 +1,237 @@
1
+ # frozen_string_literal: true
2
+
3
+ module YoutubeRb
4
+ module Transcript
5
+ # Represents a language available for translation
6
class TranslationLanguage
  # @!attribute [r] language
  #   @return [String] the language name (e.g., "Spanish")
  # @!attribute [r] language_code
  #   @return [String] the language code (e.g., "es")
  attr_reader :language, :language_code

  # Store the name/code pair exactly as given.
  #
  # @param language [String] the language name
  # @param language_code [String] the language code
  def initialize(language:, language_code:)
    @language_code = language_code
    @language = language
  end
end
20
+
21
+ # Represents a single transcript snippet/segment
22
class TranscriptSnippet
  # @!attribute [r] text
  #   @return [String] the text content of the snippet
  # @!attribute [r] start
  #   @return [Float] the start time in seconds
  # @!attribute [r] duration
  #   @return [Float] the duration in seconds
  attr_reader :text, :start, :duration

  # Both timing values are converted with #to_f, so numeric strings and
  # integers are accepted.
  #
  # @param text [String] the text content
  # @param start [#to_f] the start time in seconds
  # @param duration [#to_f] the duration in seconds
  def initialize(text:, start:, duration:)
    @text = text
    @start, @duration = start.to_f, duration.to_f
  end

  # Hash form of the snippet, with string keys.
  #
  # @return [Hash] hash with "text", "start" and "duration" keys
  def to_h
    { "text" => @text, "start" => @start, "duration" => @duration }
  end
end
51
+
52
+ # Represents a fetched transcript containing multiple snippets
53
+ # This class is Enumerable, allowing iteration over snippets
54
class FetchedTranscript
  include Enumerable

  # @!attribute [r] video_id
  #   @return [String] the video ID
  # @!attribute [r] language
  #   @return [String] the language name (e.g., "English")
  # @!attribute [r] language_code
  #   @return [String] the language code (e.g., "en")
  # @!attribute [r] is_generated
  #   @return [Boolean] whether the transcript was auto-generated
  # @!attribute [r] snippets
  #   @return [Array<TranscriptSnippet>] the transcript snippets
  attr_reader :video_id, :language, :language_code, :is_generated, :snippets

  # @param video_id [String] the YouTube video ID
  # @param language [String] the language name
  # @param language_code [String] the language code
  # @param is_generated [Boolean] whether auto-generated
  # @param snippets [Array<TranscriptSnippet>] initial snippets (optional);
  #   the array is stored as given, so #add_snippet appends to it in place
  def initialize(video_id:, language:, language_code:, is_generated:, snippets: [])
    @snippets = snippets
    @is_generated = is_generated
    @language_code = language_code
    @language = language
    @video_id = video_id
  end

  # Append a snippet; returns the transcript itself so calls can be chained.
  #
  # @param snippet [TranscriptSnippet] the snippet to add
  # @return [self]
  def add_snippet(snippet)
    @snippets.push(snippet)
    self
  end

  # Iterate over the snippets in order (this is what Enumerable builds on).
  #
  # @yield [TranscriptSnippet] each snippet in the transcript
  def each(&block)
    @snippets.each(&block)
  end

  # Look up a snippet by position; the argument is passed straight to the
  # underlying array.
  #
  # @param index [Integer] the index
  # @return [TranscriptSnippet] the snippet at the given index
  def [](index)
    @snippets[index]
  end

  # @return [Integer] the number of snippets
  def length
    @snippets.size
  end
  alias_method :size, :length

  # Raw representation: each snippet converted with #to_h.
  #
  # @return [Array<Hash>] array of snippet hashes
  def to_raw_data
    @snippets.map { |snippet| snippet.to_h }
  end

  # @return [Boolean] whether the transcript was auto-generated
  def generated?
    @is_generated
  end
end
125
+
126
+ # Represents transcript metadata and provides fetch/translate capabilities
127
class TranscriptMetadata
  # @!attribute [r] video_id
  #   @return [String] the video ID
  # @!attribute [r] language
  #   @return [String] the language name
  # @!attribute [r] language_code
  #   @return [String] the language code
  # @!attribute [r] is_generated
  #   @return [Boolean] whether auto-generated
  # @!attribute [r] translation_languages
  #   @return [Array<TranslationLanguage>] available translation languages
  attr_reader :video_id, :language, :language_code, :is_generated, :translation_languages

  # @param http_client [Faraday::Connection] the HTTP client
  # @param video_id [String] the YouTube video ID
  # @param url [String] the transcript URL
  # @param language [String] the language name
  # @param language_code [String] the language code
  # @param is_generated [Boolean] whether auto-generated
  # @param translation_languages [Array<TranslationLanguage>] available translations
  def initialize(http_client:, video_id:, url:, language:, language_code:, is_generated:, translation_languages:)
    @http_client = http_client
    @video_id = video_id
    @url = url
    @language = language
    @language_code = language_code
    @is_generated = is_generated
    @translation_languages = translation_languages
    # Lookup table from language code to language name, used by #translate.
    @translation_languages_dict = translation_languages.map do |translation|
      [translation.language_code, translation.language]
    end.to_h
  end

  # Download and parse the transcript behind the stored URL.
  #
  # @param preserve_formatting [Boolean] forwarded to TranscriptParser
  # @return [FetchedTranscript] the fetched transcript
  # @raise [PoTokenRequired] if the URL contains the "&exp=xpe" marker
  # @raise [IpBlocked] if the response status is 429
  # @raise [YouTubeRequestFailed] for any other status in 400..599
  def fetch(preserve_formatting: false)
    # URLs carrying this marker are rejected before any request is made.
    raise PoTokenRequired, @video_id if @url.include?("&exp=xpe")

    response = @http_client.get(@url)
    raise_http_errors(response)

    parsed_snippets = TranscriptParser
                      .new(preserve_formatting: preserve_formatting)
                      .parse(response.body)

    FetchedTranscript.new(
      video_id: @video_id,
      language: @language,
      language_code: @language_code,
      is_generated: @is_generated,
      snippets: parsed_snippets
    )
  end

  # A transcript is translatable when at least one translation language
  # was supplied at construction time.
  #
  # @return [Boolean]
  def translatable?
    !@translation_languages.empty?
  end
  alias_method :is_translatable, :translatable?

  # Build the metadata for a translated version of this transcript.
  #
  # @param language_code [String] the target language code
  # @return [TranscriptMetadata] a new TranscriptMetadata object for the translated version
  # @raise [NotTranslatable] if the transcript cannot be translated
  # @raise [TranslationLanguageNotAvailable] if the language is not available
  def translate(language_code)
    raise NotTranslatable, @video_id unless translatable?

    target_language = @translation_languages_dict.fetch(language_code) do
      raise TranslationLanguageNotAvailable, @video_id
    end

    # The translated variant is marked as generated and is given no
    # translation languages of its own, so it cannot be translated again.
    TranscriptMetadata.new(
      http_client: @http_client,
      video_id: @video_id,
      url: "#{@url}&tlang=#{language_code}",
      language: target_language,
      language_code: language_code,
      is_generated: true,
      translation_languages: []
    )
  end

  # @return [Boolean] whether the transcript was auto-generated
  def generated?
    @is_generated
  end

  # Short human-readable description, e.g. `en ("English")[TRANSLATABLE]`.
  #
  # @return [String]
  def to_s
    suffix = translatable? ? "[TRANSLATABLE]" : ""
    "#{@language_code} (\"#{@language}\")#{suffix}"
  end

  private

  # Turn HTTP error statuses into library errors; other statuses pass through.
  #
  # @param response [#status] the HTTP response
  # @return [nil] when the status is not an error
  def raise_http_errors(response)
    status = response.status
    raise IpBlocked, @video_id if status == 429
    return unless (400..599).cover?(status)

    raise YouTubeRequestFailed.new(@video_id, StandardError.new("HTTP #{response.status}"))
  end
end
236
+ end
237
+ end
@@ -0,0 +1,168 @@
1
+ # frozen_string_literal: true
2
+
3
+ module YoutubeRb
4
+ module Transcript
5
+ # Represents a list of available transcripts for a YouTube video.
6
+ # This class is Enumerable, allowing iteration over all available transcripts.
7
+ # It provides functionality to search for transcripts in specific languages.
8
class TranscriptList
  include Enumerable

  # @return [String] the video ID this TranscriptList is for
  attr_reader :video_id

  # Build a TranscriptList from captions JSON data
  #
  # @param http_client [Faraday::Connection] the HTTP client for fetching transcripts
  # @param video_id [String] the YouTube video ID
  # @param captions_json [Hash] the captions JSON parsed from YouTube
  # @return [TranscriptList] the created TranscriptList
  def self.build(http_client:, video_id:, captions_json:)
    translation_languages = (captions_json["translationLanguages"] || []).map do |entry|
      TranslationLanguage.new(
        language: entry.dig("languageName", "runs", 0, "text") || "",
        language_code: entry["languageCode"]
      )
    end

    manually_created_transcripts = {}
    generated_transcripts = {}

    (captions_json["captionTracks"] || []).each do |caption|
      # Tracks whose "kind" is "asr" are filed as generated; all others as
      # manually created.
      is_generated = caption.fetch("kind", "") == "asr"
      bucket = is_generated ? generated_transcripts : manually_created_transcripts
      language_code = caption["languageCode"]

      # A later track with the same language code replaces an earlier one
      # within the same bucket.
      bucket[language_code] = TranscriptMetadata.new(
        http_client: http_client,
        video_id: video_id,
        url: caption["baseUrl"].to_s.gsub("&fmt=srv3", ""),
        language: caption.dig("name", "runs", 0, "text") || "",
        language_code: language_code,
        is_generated: is_generated,
        # Only tracks flagged as translatable get the translation languages.
        translation_languages: caption.fetch("isTranslatable", false) ? translation_languages : []
      )
    end

    new(
      video_id: video_id,
      manually_created_transcripts: manually_created_transcripts,
      generated_transcripts: generated_transcripts,
      translation_languages: translation_languages
    )
  end

  # @param video_id [String] the YouTube video ID
  # @param manually_created_transcripts [Hash<String, TranscriptMetadata>] manually created transcripts by language code
  # @param generated_transcripts [Hash<String, TranscriptMetadata>] auto-generated transcripts by language code
  # @param translation_languages [Array<TranslationLanguage>] available translation languages
  def initialize(video_id:, manually_created_transcripts:, generated_transcripts:, translation_languages:)
    @video_id = video_id
    @manually_created_transcripts = manually_created_transcripts
    @generated_transcripts = generated_transcripts
    @translation_languages = translation_languages
  end

  # Iterate over all transcripts (manually created first, then generated)
  #
  # @yield [TranscriptMetadata] each available transcript
  # @return [Enumerator] if no block given
  def each(&block)
    return to_enum(:each) unless block_given?

    @manually_created_transcripts.each_value(&block)
    @generated_transcripts.each_value(&block)
  end

  # Find a transcript for the given language codes.
  # For each code, manually created transcripts are checked before generated ones.
  #
  # @param language_codes [Array<String>, String] language codes in descending
  #   priority; a single code may be passed without wrapping it in an array
  # @return [TranscriptMetadata] the found transcript
  # @raise [NoTranscriptFound] if no transcript matches the requested languages
  def find_transcript(language_codes)
    find_transcript_in(
      language_codes,
      [@manually_created_transcripts, @generated_transcripts]
    )
  end

  # Find an automatically generated transcript for the given language codes.
  #
  # @param language_codes [Array<String>, String] language codes in descending priority
  # @return [TranscriptMetadata] the found transcript
  # @raise [NoTranscriptFound] if no generated transcript matches
  def find_generated_transcript(language_codes)
    find_transcript_in(language_codes, [@generated_transcripts])
  end

  # Find a manually created transcript for the given language codes.
  #
  # @param language_codes [Array<String>, String] language codes in descending priority
  # @return [TranscriptMetadata] the found transcript
  # @raise [NoTranscriptFound] if no manually created transcript matches
  def find_manually_created_transcript(language_codes)
    find_transcript_in(language_codes, [@manually_created_transcripts])
  end

  # String representation of the transcript list
  #
  # @return [String] human-readable description of available transcripts
  def to_s
    <<~DESC
      For this video (#{@video_id}) transcripts are available in the following languages:

      (MANUALLY CREATED)
      #{format_language_list(@manually_created_transcripts.values)}

      (GENERATED)
      #{format_language_list(@generated_transcripts.values)}

      (TRANSLATION LANGUAGES)
      #{format_translation_languages}
    DESC
  end

  private

  # Find a transcript from the given dictionaries
  #
  # @param language_codes [Array<String>, String] language codes to search for
  # @param transcript_dicts [Array<Hash>] transcript dictionaries to search, in priority order
  # @return [TranscriptMetadata] the found transcript
  # @raise [NoTranscriptFound] if no transcript matches
  def find_transcript_in(language_codes, transcript_dicts)
    # Array() lets callers pass a bare "en" instead of ["en"]; without it a
    # String argument failed with NoMethodError because String has no #each.
    codes = Array(language_codes)

    codes.each do |code|
      holder = transcript_dicts.find { |dict| dict.key?(code) }
      return holder[code] if holder
    end

    raise NoTranscriptFound.new(@video_id, codes, self)
  end

  # Format a list of transcripts for display
  #
  # @param transcripts [Array<TranscriptMetadata>] transcripts to format
  # @return [String] formatted list or "None"
  def format_language_list(transcripts)
    return "None" if transcripts.empty?

    transcripts.map { |transcript| " - #{transcript}" }.join("\n")
  end

  # Format translation languages for display
  #
  # @return [String] formatted list or "None"
  def format_translation_languages
    return "None" if @translation_languages.empty?

    @translation_languages.map do |translation|
      " - #{translation.language_code} (\"#{translation.language}\")"
    end.join("\n")
  end
end
167
+ end
168
+ end