youtube-transcript-rb 0.1.0 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +9 -0
  3. data/.rubocop_todo.yml +166 -0
  4. data/README.md +42 -42
  5. data/lib/youtube-transcript-rb.rb +4 -0
  6. data/lib/youtube_rb/formatters.rb +263 -0
  7. data/lib/youtube_rb/transcript/api.rb +144 -0
  8. data/lib/youtube_rb/transcript/errors.rb +215 -0
  9. data/lib/youtube_rb/transcript/settings.rb +26 -0
  10. data/lib/youtube_rb/transcript/transcript.rb +237 -0
  11. data/lib/youtube_rb/transcript/transcript_list.rb +168 -0
  12. data/lib/youtube_rb/transcript/transcript_list_fetcher.rb +220 -0
  13. data/lib/youtube_rb/transcript/transcript_parser.rb +81 -0
  14. data/lib/youtube_rb/transcript.rb +33 -0
  15. data/lib/youtube_rb/version.rb +5 -0
  16. data/sig/youtube_rb/transcript.rbs +4 -0
  17. data/spec/api_spec.rb +27 -27
  18. data/spec/errors_spec.rb +41 -41
  19. data/spec/formatters_spec.rb +45 -46
  20. data/spec/integration_spec.rb +39 -48
  21. data/spec/settings_spec.rb +16 -16
  22. data/spec/spec_helper.rb +52 -52
  23. data/spec/transcript_list_fetcher_spec.rb +38 -33
  24. data/spec/transcript_list_spec.rb +16 -19
  25. data/spec/transcript_parser_spec.rb +3 -3
  26. data/spec/transcript_spec.rb +23 -24
  27. metadata +17 -13
  28. data/lib/youtube/transcript/rb/api.rb +0 -150
  29. data/lib/youtube/transcript/rb/errors.rb +0 -217
  30. data/lib/youtube/transcript/rb/formatters.rb +0 -269
  31. data/lib/youtube/transcript/rb/settings.rb +0 -28
  32. data/lib/youtube/transcript/rb/transcript.rb +0 -239
  33. data/lib/youtube/transcript/rb/transcript_list.rb +0 -170
  34. data/lib/youtube/transcript/rb/transcript_list_fetcher.rb +0 -225
  35. data/lib/youtube/transcript/rb/transcript_parser.rb +0 -83
  36. data/lib/youtube/transcript/rb/version.rb +0 -9
  37. data/lib/youtube/transcript/rb.rb +0 -37
  38. data/sig/youtube/transcript/rb.rbs +0 -8
@@ -0,0 +1,144 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "faraday"
4
+ require "faraday/follow_redirects"
5
+
6
+ module YoutubeRb
7
+ module Transcript
8
+ # Main entry point for fetching YouTube transcripts.
9
+ # This class provides a simple API for retrieving transcripts from YouTube videos.
10
+ #
11
+ # @example Basic usage
12
+ # api = YoutubeRb::Transcript::YouTubeTranscriptApi.new
13
+ # transcript = api.fetch("dQw4w9WgXcQ")
14
+ # transcript.each { |snippet| puts snippet.text }
15
+ #
16
+ # @example With language preference
17
+ # api = YoutubeRb::Transcript::YouTubeTranscriptApi.new
18
+ # transcript = api.fetch("dQw4w9WgXcQ", languages: ["es", "en"])
19
+ #
20
+ # @example Listing available transcripts
21
+ # api = YoutubeRb::Transcript::YouTubeTranscriptApi.new
22
+ # transcript_list = api.list("dQw4w9WgXcQ")
23
+ # transcript_list.each { |t| puts t }
24
+ #
25
class YouTubeTranscriptApi
  # Seconds used for both the request timeout and the connection-open
  # timeout of the default HTTP client.
  DEFAULT_TIMEOUT = 30

  # Stores the HTTP client and proxy configuration and builds the
  # TranscriptListFetcher that every lookup goes through.
  #
  # @param http_client [Faraday::Connection, nil] client to use; a default
  #   Faraday connection is built when this is nil
  # @param proxy_config [Object, nil] passed through to the fetcher as-is
  def initialize(http_client: nil, proxy_config: nil)
    @http_client = http_client || build_default_http_client
    @proxy_config = proxy_config
    @fetcher = TranscriptListFetcher.new(http_client: @http_client, proxy_config: @proxy_config)
  end

  # Retrieve one transcript for a video: list what is available, pick the
  # transcript matching +languages+, then download it.
  #
  # @param video_id [String] the YouTube video ID
  # @param languages [Array<String>] language codes in order of preference
  # @param preserve_formatting [Boolean] forwarded to the transcript's own fetch
  # @return [FetchedTranscript] the fetched transcript
  # @raise [NoTranscriptFound] if no transcript matches the requested languages
  # @raise [TranscriptsDisabled] if transcripts are disabled for the video
  # @raise [VideoUnavailable] if the video is not available
  def fetch(video_id, languages: ["en"], preserve_formatting: false)
    available = list(video_id)
    chosen = available.find_transcript(languages)
    chosen.fetch(preserve_formatting: preserve_formatting)
  end

  # List every transcript available for a video by delegating to the fetcher.
  #
  # @param video_id [String] the YouTube video ID
  # @return [TranscriptList] the available transcripts
  # @raise [TranscriptsDisabled] if transcripts are disabled for the video
  # @raise [VideoUnavailable] if the video is not available
  def list(video_id)
    @fetcher.fetch(video_id)
  end

  # Fetch transcripts for several videos, one after another.
  #
  # Each successful transcript is stored in the returned hash and, when a
  # block is given, yielded together with its video ID. A
  # CouldNotRetrieveTranscript failure is re-raised unless
  # +continue_on_error+ is true; in that case the error object is yielded
  # instead and the video is left out of the returned hash.
  #
  # @param video_ids [Array<String>] the YouTube video IDs
  # @param languages [Array<String>] language codes in order of preference
  # @param preserve_formatting [Boolean] forwarded to #fetch
  # @param continue_on_error [Boolean] keep going after a failed video
  # @yieldparam video_id [String] the video ID being processed
  # @yieldparam result [FetchedTranscript, StandardError] the transcript or the error
  # @return [Hash<String, FetchedTranscript>] video IDs mapped to transcripts
  # @raise [CouldNotRetrieveTranscript] if a video fails and continue_on_error is false
  def fetch_all(video_ids, languages: ["en"], preserve_formatting: false, continue_on_error: false)
    fetched_by_id = {}

    video_ids.each do |current_id|
      fetched = fetch(current_id, languages: languages, preserve_formatting: preserve_formatting)
      fetched_by_id[current_id] = fetched
      yield(current_id, fetched) if block_given?
    rescue CouldNotRetrieveTranscript => failure
      raise unless continue_on_error

      yield(current_id, failure) if block_given?
    end

    fetched_by_id
  end

  private

  # Build the Faraday connection used when the caller supplies no client:
  # default timeouts, URL-encoded requests, redirect following, and
  # Faraday's default adapter.
  #
  # @return [Faraday::Connection] the configured HTTP client
  def build_default_http_client
    Faraday.new do |builder|
      builder.options.timeout = DEFAULT_TIMEOUT
      builder.options.open_timeout = DEFAULT_TIMEOUT
      builder.request :url_encoded
      builder.response :follow_redirects
      builder.adapter Faraday.default_adapter
    end
  end
end
143
+ end
144
+ end
@@ -0,0 +1,215 @@
1
+ # frozen_string_literal: true
2
+
3
module YoutubeRb
  module Transcript
    # Base error class for all YouTube Transcript errors
    class Error < StandardError; end

    # Raised when a transcript could not be retrieved.
    #
    # The exception message is built once, during construction, from the
    # video's watch URL and the text returned by #cause_message. Subclasses
    # customise the text by defining their own CAUSE_MESSAGE constant or by
    # overriding #cause_message.
    class CouldNotRetrieveTranscript < Error
      WATCH_URL = "https://www.youtube.com/watch?v=%<video_id>s"

      # Empty default so the base class can be instantiated directly; without
      # it `self.class::CAUSE_MESSAGE` raises NameError for the base class.
      CAUSE_MESSAGE = ""

      # @return [String] the video ID that caused the error
      attr_reader :video_id

      # @param video_id [String] the YouTube video ID
      def initialize(video_id)
        @video_id = video_id
        super(build_error_message)
      end

      # @return [String] the cause of the error (the most specific
      #   CAUSE_MESSAGE visible from the receiver's class)
      def cause_message
        self.class::CAUSE_MESSAGE
      end

      private

      # Assemble the full message: a headline with the watch URL, followed by
      # the cause and the issue-tracker referral when a cause is available.
      #
      # @return [String]
      def build_error_message
        video_url = format(WATCH_URL, video_id: @video_id)
        headline = "\nCould not retrieve a transcript for the video #{video_url}!"

        cause = cause_message
        return headline if cause.nil? || cause.empty?

        "#{headline} This is most likely caused by:\n\n#{cause}#{github_referral}"
      end

      # @return [String] closing paragraph pointing users at the issue tracker
      def github_referral
        "\n\nIf you are sure that the described cause is not responsible for this error " \
          "and that a transcript should be retrievable, please create an issue at " \
          "https://github.com/stadia/youtube-transcript-rb/issues. " \
          "Please add which version of youtube-transcript-rb you are using " \
          "and provide the information needed to replicate the error. " \
          "Also make sure that there are no open issues which already describe your problem!"
      end
    end

    # Raised when YouTube data cannot be parsed
    class YouTubeDataUnparsable < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "The data required to fetch the transcript is not parsable. This should " \
                      "not happen, please open an issue (make sure to include the video ID)!"
    end

    # Raised when a request to YouTube fails
    class YouTubeRequestFailed < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "Request to YouTube failed: %<reason>s"

      # @return [String] the reason for the failure
      attr_reader :reason

      # @param video_id [String] the YouTube video ID
      # @param http_error [StandardError] the HTTP error that occurred
      def initialize(video_id, http_error)
        # Must be set before super: the parent constructor builds the message,
        # which calls #cause_message.
        @reason = http_error.to_s
        super(video_id)
      end

      # @return [String] CAUSE_MESSAGE with the failure reason filled in
      def cause_message
        format(CAUSE_MESSAGE, reason: @reason)
      end
    end

    # Raised when a video is unplayable
    class VideoUnplayable < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "The video is unplayable for the following reason: %<reason>s"

      # @return [String, nil] the reason the video is unplayable
      attr_reader :reason

      # @return [Array<String>] additional sub-reasons
      attr_reader :sub_reasons

      # @param video_id [String] the YouTube video ID
      # @param reason [String, nil] the reason the video is unplayable
      # @param sub_reasons [Array<String>, nil] additional details; nil is treated as none
      def initialize(video_id, reason = nil, sub_reasons = [])
        @reason = reason
        # Array() turns nil into [] so #cause_message never calls a method on nil.
        @sub_reasons = Array(sub_reasons)
        super(video_id)
      end

      # @return [String] the reason (or a placeholder when none was given),
      #   followed by a bulleted list of sub-reasons when there are any
      def cause_message
        reason_text = @reason || "No reason specified!"

        if @sub_reasons.any?
          sub_reasons_text = @sub_reasons.map { |r| " - #{r}" }.join("\n")
          reason_text = "#{reason_text}\n\nAdditional Details:\n#{sub_reasons_text}"
        end

        format(CAUSE_MESSAGE, reason: reason_text)
      end
    end

    # Raised when a video is unavailable
    class VideoUnavailable < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "The video is no longer available"
    end

    # Raised when an invalid video ID is provided
    class InvalidVideoId < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "You provided an invalid video id. Make sure you are using the video id and NOT the url!\n\n" \
                      'Do NOT run: `YoutubeRb::Transcript.fetch("https://www.youtube.com/watch?v=1234")`' \
                      "\n" \
                      'Instead run: `YoutubeRb::Transcript.fetch("1234")`'
    end

    # Raised when YouTube blocks the request
    class RequestBlocked < CouldNotRetrieveTranscript
      # Shared opening paragraph, reused by IpBlocked.
      BASE_CAUSE_MESSAGE = "YouTube is blocking requests from your IP. This usually is due to one of the " \
                           "following reasons:\n" \
                           "- You have done too many requests and your IP has been blocked by YouTube\n" \
                           "- You are doing requests from an IP belonging to a cloud provider (like AWS, " \
                           "Google Cloud Platform, Azure, etc.). Unfortunately, most IPs from cloud " \
                           "providers are blocked by YouTube.\n\n"

      # Built at load time by concatenation, so it is frozen explicitly.
      CAUSE_MESSAGE = (
        BASE_CAUSE_MESSAGE +
        "There are two things you can do to work around this:\n" \
        "1. Use proxies to hide your IP address.\n" \
        "2. (NOT RECOMMENDED) If you authenticate your requests using cookies, you " \
        "will be able to continue doing requests for a while. However, YouTube will " \
        "eventually permanently ban the account that you have used to authenticate " \
        "with! So only do this if you don't mind your account being banned!"
      ).freeze
    end

    # Raised when YouTube blocks the IP specifically
    class IpBlocked < RequestBlocked
      CAUSE_MESSAGE = (
        RequestBlocked::BASE_CAUSE_MESSAGE +
        "Ways to work around this are using proxies or rotating residential IPs."
      ).freeze
    end

    # Raised when too many requests are made (HTTP 429)
    class TooManyRequests < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "YouTube is rate limiting your requests. Please wait before making more requests."
    end

    # Raised when transcripts are disabled for a video
    class TranscriptsDisabled < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "Subtitles are disabled for this video"
    end

    # Raised when a video is age restricted
    class AgeRestricted < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "This video is age-restricted. Therefore, you are unable to retrieve " \
                      "transcripts for it without authenticating yourself.\n\n" \
                      "Unfortunately, Cookie Authentication is temporarily unsupported, " \
                      "as recent changes in YouTube's API broke the previous implementation."
    end

    # Raised when a transcript is not translatable
    class NotTranslatable < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "The requested language is not translatable"
    end

    # Raised when the requested translation language is not available
    class TranslationLanguageNotAvailable < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "The requested translation language is not available"
    end

    # Raised when consent cookie creation fails
    class FailedToCreateConsentCookie < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "Failed to automatically give consent to saving cookies"
    end

    # Raised when no transcript is found for the requested languages
    class NoTranscriptFound < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "No transcripts were found for any of the requested language codes: " \
                      "%<requested_language_codes>s\n\n%<transcript_data>s"

      # @return [Array<String>] the requested language codes
      attr_reader :requested_language_codes

      # @return [Object] the transcript data (TranscriptList)
      attr_reader :transcript_data

      # @param video_id [String] the YouTube video ID
      # @param requested_language_codes [Array<String>] the language codes that were requested
      # @param transcript_data [Object] the TranscriptList object with available transcripts
      def initialize(video_id, requested_language_codes, transcript_data)
        # Set before super because the parent constructor calls #cause_message.
        @requested_language_codes = requested_language_codes
        @transcript_data = transcript_data
        super(video_id)
      end

      # @return [String] CAUSE_MESSAGE with the inspected language codes and
      #   the string form of the transcript data filled in
      def cause_message
        format(
          CAUSE_MESSAGE,
          requested_language_codes: @requested_language_codes.inspect,
          transcript_data: @transcript_data.to_s
        )
      end
    end

    # Raised when no transcripts are available for a video
    class NoTranscriptAvailable < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "No transcripts are available for this video"
    end

    # Raised when a PO token is required to fetch the transcript
    class PoTokenRequired < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "The requested video cannot be retrieved without a PO Token. " \
                      "If this happens, please open a GitHub issue!"
    end
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
module YoutubeRb
  module Transcript
    # YouTube watch URL template
    # @example
    #   format(WATCH_URL, video_id: "abc123")
    #   # => "https://www.youtube.com/watch?v=abc123"
    WATCH_URL = "https://www.youtube.com/watch?v=%<video_id>s"

    # YouTube Innertube API URL template
    # @example
    #   format(INNERTUBE_API_URL, api_key: "key123")
    #   # => "https://www.youtube.com/youtubei/v1/player?key=key123"
    INNERTUBE_API_URL = "https://www.youtube.com/youtubei/v1/player?key=%<api_key>s"

    # Innertube API context for Android client
    # Used in POST requests to the Innertube API
    #
    # Both the outer hash and the nested "client" hash are frozen: freezing
    # only the outer hash would still let any caller rewrite the client
    # name/version for the whole process.
    INNERTUBE_CONTEXT = {
      "client" => {
        "clientName" => "ANDROID",
        "clientVersion" => "20.10.38"
      }.freeze
    }.freeze
  end
end
@@ -0,0 +1,237 @@
1
+ # frozen_string_literal: true
2
+
3
+ module YoutubeRb
4
+ module Transcript
5
+ # Represents a language available for translation
6
class TranslationLanguage
  # @!attribute [r] language
  #   @return [String] the language name (e.g., "Spanish")
  # @!attribute [r] language_code
  #   @return [String] the language code (e.g., "es")
  attr_reader :language, :language_code

  # Pair a human-readable language name with its language code.
  #
  # @param language [String] the language name
  # @param language_code [String] the language code
  def initialize(language:, language_code:)
    @language_code = language_code
    @language = language
  end
end
20
+
21
+ # Represents a single transcript snippet/segment
22
class TranscriptSnippet
  # @!attribute [r] text
  #   @return [String] the text content of the snippet
  # @!attribute [r] start
  #   @return [Float] the start time in seconds
  # @!attribute [r] duration
  #   @return [Float] the duration in seconds
  attr_reader :text, :start, :duration

  # Both timing values are converted with #to_f, so numeric strings and
  # integers are accepted and stored as floats.
  #
  # @param text [String] the text content
  # @param start [#to_f] the start time in seconds
  # @param duration [#to_f] the duration in seconds
  def initialize(text:, start:, duration:)
    @text = text
    @start, @duration = start.to_f, duration.to_f
  end

  # Hash form of the snippet, using string keys.
  #
  # @return [Hash] hash with "text", "start", and "duration" keys
  def to_h
    { "text" => text, "start" => start, "duration" => duration }
  end
end
51
+
52
+ # Represents a fetched transcript containing multiple snippets
53
+ # This class is Enumerable, allowing iteration over snippets
54
class FetchedTranscript
  include Enumerable

  # @!attribute [r] video_id
  #   @return [String] the video ID
  # @!attribute [r] language
  #   @return [String] the language name (e.g., "English")
  # @!attribute [r] language_code
  #   @return [String] the language code (e.g., "en")
  # @!attribute [r] is_generated
  #   @return [Boolean] whether the transcript was auto-generated
  # @!attribute [r] snippets
  #   @return [Array<TranscriptSnippet>] the transcript snippets
  attr_reader :video_id, :language, :language_code, :is_generated, :snippets

  # @param video_id [String] the YouTube video ID
  # @param language [String] the language name
  # @param language_code [String] the language code
  # @param is_generated [Boolean] whether auto-generated
  # @param snippets [Array<TranscriptSnippet>] initial snippets; the given
  #   array is stored as-is (not copied)
  def initialize(video_id:, language:, language_code:, is_generated:, snippets: [])
    @snippets = snippets
    @is_generated = is_generated
    @language_code = language_code
    @language = language
    @video_id = video_id
  end

  # Append a snippet to the transcript.
  #
  # @param snippet [TranscriptSnippet] the snippet to add
  # @return [self] so calls can be chained
  def add_snippet(snippet)
    @snippets.push(snippet)
    self
  end

  # Iterate over the snippets in order; this is what Enumerable builds on.
  #
  # @yield [TranscriptSnippet] each snippet in the transcript
  def each(&block)
    @snippets.each(&block)
  end

  # @param index [Integer] position of the wanted snippet
  # @return [TranscriptSnippet] the snippet at the given index
  def [](index)
    @snippets[index]
  end

  # @return [Integer] the number of snippets
  def length
    @snippets.length
  end
  alias_method :size, :length

  # Convert to raw data by calling #to_h on every snippet.
  #
  # @return [Array<Hash>] array of snippet hashes
  def to_raw_data
    @snippets.map { |snippet| snippet.to_h }
  end

  # @return [Boolean] whether the transcript was auto-generated
  def generated?
    @is_generated
  end
end
125
+
126
+ # Represents transcript metadata and provides fetch/translate capabilities
127
class TranscriptMetadata
  # @!attribute [r] video_id
  #   @return [String] the video ID
  # @!attribute [r] language
  #   @return [String] the language name
  # @!attribute [r] language_code
  #   @return [String] the language code
  # @!attribute [r] is_generated
  #   @return [Boolean] whether auto-generated
  # @!attribute [r] translation_languages
  #   @return [Array<TranslationLanguage>] available translation languages
  attr_reader :video_id, :language, :language_code, :is_generated, :translation_languages

  # @param http_client [Faraday::Connection] the HTTP client
  # @param video_id [String] the YouTube video ID
  # @param url [String] the transcript URL
  # @param language [String] the language name
  # @param language_code [String] the language code
  # @param is_generated [Boolean] whether auto-generated
  # @param translation_languages [Array<TranslationLanguage>] available translations
  def initialize(http_client:, video_id:, url:, language:, language_code:, is_generated:, translation_languages:)
    @http_client = http_client
    @video_id = video_id
    @url = url
    @language = language
    @language_code = language_code
    @is_generated = is_generated
    @translation_languages = translation_languages
    # Lookup table from language code to language name, used by #translate.
    @translation_languages_dict = translation_languages.to_h do |entry|
      [entry.language_code, entry.language]
    end
  end

  # Download and parse the transcript behind this metadata.
  #
  # @param preserve_formatting [Boolean] forwarded to TranscriptParser
  # @return [FetchedTranscript] the fetched transcript
  # @raise [PoTokenRequired] if the transcript URL contains "&exp=xpe"
  # @raise [IpBlocked] if the response status is 429
  # @raise [YouTubeRequestFailed] for any other status in 400..599
  def fetch(preserve_formatting: false)
    raise PoTokenRequired, @video_id if @url.include?("&exp=xpe")

    response = @http_client.get(@url)
    raise_http_errors(response)

    parsed_snippets = TranscriptParser.new(preserve_formatting: preserve_formatting).parse(response.body)

    FetchedTranscript.new(
      video_id: @video_id,
      language: @language,
      language_code: @language_code,
      is_generated: @is_generated,
      snippets: parsed_snippets
    )
  end

  # @return [Boolean] true when at least one translation language is available
  def translatable?
    !@translation_languages.empty?
  end
  alias_method :is_translatable, :translatable?

  # Build the metadata for a translated version of this transcript. The new
  # object reuses the HTTP client, appends "&tlang=<code>" to the URL, is
  # marked as generated, and has no translation languages of its own.
  #
  # @param language_code [String] the target language code
  # @return [TranscriptMetadata] metadata for the translated version
  # @raise [NotTranslatable] if the transcript cannot be translated
  # @raise [TranslationLanguageNotAvailable] if the language is not available
  def translate(language_code)
    raise NotTranslatable, @video_id unless translatable?

    unless @translation_languages_dict.key?(language_code)
      raise TranslationLanguageNotAvailable, @video_id
    end

    translated_url = "#{@url}&tlang=#{language_code}"
    translated_name = @translation_languages_dict[language_code]

    TranscriptMetadata.new(
      http_client: @http_client,
      video_id: @video_id,
      url: translated_url,
      language: translated_name,
      language_code: language_code,
      is_generated: true,
      translation_languages: []
    )
  end

  # @return [Boolean] whether the transcript was auto-generated
  def generated?
    @is_generated
  end

  # Short description: the language code, the quoted language name, and a
  # "[TRANSLATABLE]" marker when translations are available.
  #
  # @return [String]
  def to_s
    marker = ""
    marker = "[TRANSLATABLE]" if translatable?
    "#{@language_code} (\"#{@language}\")#{marker}"
  end

  private

  # Turn an error status into an exception; other statuses pass through.
  # 429 is checked first, so it raises IpBlocked rather than falling into
  # the generic 400..599 branch.
  #
  # @param response [#status] the HTTP response
  # @return [void]
  def raise_http_errors(response)
    status = response.status

    case status
    when 429
      raise IpBlocked, @video_id
    when 400..599
      raise YouTubeRequestFailed.new(@video_id, StandardError.new("HTTP #{status}"))
    end
  end
end
236
+ end
237
+ end