youtube-transcript-rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,239 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Youtube
4
+ module Transcript
5
+ module Rb
6
# A language that a transcript can be translated into.
class TranslationLanguage
  # @!attribute [r] language
  #   @return [String] the language name (e.g., "Spanish")
  # @!attribute [r] language_code
  #   @return [String] the language code (e.g., "es")
  attr_reader :language, :language_code

  # Stores both values exactly as given; nothing is validated or normalized.
  #
  # @param language [String] the language name
  # @param language_code [String] the language code
  def initialize(language:, language_code:)
    @language_code = language_code
    @language = language
  end
end
21
+
22
# One timed segment of a transcript: the text plus when it starts and how
# long it lasts.
class TranscriptSnippet
  # @!attribute [r] text
  #   @return [String] the text content of the snippet
  # @!attribute [r] start
  #   @return [Float] the start time in seconds
  # @!attribute [r] duration
  #   @return [Float] the duration in seconds
  attr_reader :text, :start, :duration

  # Timing values are coerced with +to_f+, so numeric strings and integers
  # are accepted; the text is stored as given.
  #
  # @param text [String] the text content
  # @param start [Float] the start time in seconds
  # @param duration [Float] the duration in seconds
  def initialize(text:, start:, duration:)
    @text = text
    @start, @duration = start.to_f, duration.to_f
  end

  # Plain-hash form of the snippet, keyed by strings (not symbols).
  #
  # @return [Hash] hash with "text", "start", and "duration" keys
  def to_h
    { "text" => @text, "start" => @start, "duration" => @duration }
  end
end
52
+
53
# The downloaded content of one transcript: descriptive metadata plus an
# ordered list of snippets. Enumerable is mixed in, so the usual collection
# methods (map, select, first, ...) operate on the snippets.
class FetchedTranscript
  include Enumerable

  # @!attribute [r] video_id
  #   @return [String] the video ID
  # @!attribute [r] language
  #   @return [String] the language name (e.g., "English")
  # @!attribute [r] language_code
  #   @return [String] the language code (e.g., "en")
  # @!attribute [r] is_generated
  #   @return [Boolean] whether the transcript was auto-generated
  # @!attribute [r] snippets
  #   @return [Array<TranscriptSnippet>] the transcript snippets
  attr_reader :video_id, :language, :language_code, :is_generated, :snippets

  # The snippets array is kept by reference (not copied), so later calls to
  # {#add_snippet} also append to an array the caller passed in.
  #
  # @param video_id [String] the YouTube video ID
  # @param language [String] the language name
  # @param language_code [String] the language code
  # @param is_generated [Boolean] whether auto-generated
  # @param snippets [Array<TranscriptSnippet>] the snippets (optional)
  def initialize(video_id:, language:, language_code:, is_generated:, snippets: [])
    @snippets = snippets
    @is_generated = is_generated
    @language_code = language_code
    @language = language
    @video_id = video_id
  end

  # Append a snippet; returns the transcript so calls can be chained.
  #
  # @param snippet [TranscriptSnippet] the snippet to add
  # @return [self]
  def add_snippet(snippet)
    snippets.push(snippet)
    self
  end

  # Yield every snippet in order (the hook Enumerable builds on).
  #
  # @yield [TranscriptSnippet] each snippet in the transcript
  def each(&block)
    snippets.each(&block)
  end

  # Positional access, delegated to the underlying array.
  #
  # @param index [Integer] the index
  # @return [TranscriptSnippet] the snippet at the given index
  def [](index)
    snippets[index]
  end

  # @return [Integer] the count of snippets
  def length
    snippets.length
  end
  alias_method :size, :length

  # Snippets converted to plain hashes via their own +to_h+.
  #
  # @return [Array<Hash>] array of snippet hashes
  def to_raw_data
    snippets.map { |snippet| snippet.to_h }
  end

  # Predicate form of {#is_generated}.
  #
  # @return [Boolean]
  def generated?
    is_generated
  end
end
126
+
127
# Metadata for one available transcript of a video. The text itself is not
# loaded until {#fetch} is called; {#translate} derives a new Transcript that
# points at a translated variant of the same track.
class Transcript
  # @!attribute [r] video_id
  #   @return [String] the video ID
  # @!attribute [r] language
  #   @return [String] the language name
  # @!attribute [r] language_code
  #   @return [String] the language code
  # @!attribute [r] is_generated
  #   @return [Boolean] whether auto-generated
  # @!attribute [r] translation_languages
  #   @return [Array<TranslationLanguage>] available translation languages
  attr_reader :video_id, :language, :language_code, :is_generated, :translation_languages

  # @param http_client [Faraday::Connection] the HTTP client
  # @param video_id [String] the YouTube video ID
  # @param url [String] the transcript URL
  # @param language [String] the language name
  # @param language_code [String] the language code
  # @param is_generated [Boolean] whether auto-generated
  # @param translation_languages [Array<TranslationLanguage>] available translations
  def initialize(http_client:, video_id:, url:, language:, language_code:, is_generated:, translation_languages:)
    @http_client = http_client
    @video_id = video_id
    @url = url
    @language = language
    @language_code = language_code
    @is_generated = is_generated
    @translation_languages = translation_languages
    # Lookup table: language code => language name, used by #translate.
    @translation_languages_dict = translation_languages.map { |tl| [tl.language_code, tl.language] }.to_h
  end

  # Download and parse the transcript content.
  #
  # @param preserve_formatting [Boolean] whether to preserve HTML formatting
  # @return [FetchedTranscript] the fetched transcript
  # @raise [PoTokenRequired] if the URL carries the "&exp=xpe" marker
  # @raise [IpBlocked] if the server answers with HTTP 429
  # @raise [YouTubeRequestFailed] for any other 4xx/5xx response
  def fetch(preserve_formatting: false)
    raise PoTokenRequired, @video_id if @url.include?("&exp=xpe")

    response = @http_client.get(@url)
    raise_http_errors(response)

    FetchedTranscript.new(
      video_id: @video_id,
      language: @language,
      language_code: @language_code,
      is_generated: @is_generated,
      snippets: TranscriptParser.new(preserve_formatting: preserve_formatting).parse(response.body)
    )
  end

  # True when at least one translation language is attached.
  #
  # @return [Boolean]
  def translatable?
    !@translation_languages.empty?
  end
  alias_method :is_translatable, :translatable?

  # Build a Transcript for the translated variant of this one. The result
  # reuses the HTTP client, appends "&tlang=<code>" to the URL, is marked as
  # generated and carries no translation languages of its own.
  #
  # @param language_code [String] the target language code
  # @return [Transcript] a new Transcript object for the translated version
  # @raise [NotTranslatable] if the transcript cannot be translated
  # @raise [TranslationLanguageNotAvailable] if the language is not available
  def translate(language_code)
    raise NotTranslatable, @video_id unless translatable?

    target_language = @translation_languages_dict.fetch(language_code) do
      raise TranslationLanguageNotAvailable, @video_id
    end

    Transcript.new(
      http_client: @http_client,
      video_id: @video_id,
      url: "#{@url}&tlang=#{language_code}",
      language: target_language,
      language_code: language_code,
      is_generated: true,
      translation_languages: []
    )
  end

  # Predicate form of {#is_generated}.
  #
  # @return [Boolean]
  def generated?
    @is_generated
  end

  # Short label such as +en ("English")[TRANSLATABLE]+; the suffix is only
  # present when the transcript is translatable.
  #
  # @return [String]
  def to_s
    label = "#{@language_code} (\"#{@language}\")"
    translatable? ? "#{label}[TRANSLATABLE]" : label
  end

  private

  # Map error status codes to library errors: 429 becomes IpBlocked, any
  # other status in 400..599 becomes YouTubeRequestFailed. Other statuses
  # pass through and the method returns nil.
  def raise_http_errors(response)
    status = response.status
    raise IpBlocked, @video_id if status == 429
    return unless (400..599).cover?(status)

    raise YouTubeRequestFailed.new(@video_id, StandardError.new("HTTP #{status}"))
  end
end
237
+ end
238
+ end
239
+ end
@@ -0,0 +1,170 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Youtube
4
+ module Transcript
5
+ module Rb
6
# Represents a list of available transcripts for a YouTube video.
# This class is Enumerable, allowing iteration over all available transcripts.
# It provides functionality to search for transcripts in specific languages.
class TranscriptList
  include Enumerable

  # @return [String] the video ID this TranscriptList is for
  attr_reader :video_id

  # Build a TranscriptList from captions JSON data.
  #
  # Tracks whose "kind" is "asr" are filed as generated, all others as
  # manually created. Within each group a later track with the same language
  # code replaces an earlier one. A track only receives the translation
  # languages when its "isTranslatable" flag is truthy.
  #
  # @param http_client [Faraday::Connection] the HTTP client for fetching transcripts
  # @param video_id [String] the YouTube video ID
  # @param captions_json [Hash] the captions JSON parsed from YouTube
  # @return [TranscriptList] the created TranscriptList
  def self.build(http_client:, video_id:, captions_json:)
    translation_languages = (captions_json["translationLanguages"] || []).map do |tl|
      TranslationLanguage.new(
        language: tl.dig("languageName", "runs", 0, "text") || "",
        language_code: tl["languageCode"]
      )
    end

    manually_created_transcripts = {}
    generated_transcripts = {}

    (captions_json["captionTracks"] || []).each do |caption|
      is_generated = caption.fetch("kind", "") == "asr"
      target_dict = is_generated ? generated_transcripts : manually_created_transcripts

      language_code = caption["languageCode"]
      transcript_translation_languages = caption.fetch("isTranslatable", false) ? translation_languages : []

      target_dict[language_code] = Transcript.new(
        http_client: http_client,
        video_id: video_id,
        # Strip the srv3 format parameter from the track URL.
        url: caption["baseUrl"].to_s.gsub("&fmt=srv3", ""),
        language: caption.dig("name", "runs", 0, "text") || "",
        language_code: language_code,
        is_generated: is_generated,
        translation_languages: transcript_translation_languages
      )
    end

    new(
      video_id: video_id,
      manually_created_transcripts: manually_created_transcripts,
      generated_transcripts: generated_transcripts,
      translation_languages: translation_languages
    )
  end

  # @param video_id [String] the YouTube video ID
  # @param manually_created_transcripts [Hash<String, Transcript>] manually created transcripts by language code
  # @param generated_transcripts [Hash<String, Transcript>] auto-generated transcripts by language code
  # @param translation_languages [Array<TranslationLanguage>] available translation languages
  def initialize(video_id:, manually_created_transcripts:, generated_transcripts:, translation_languages:)
    @video_id = video_id
    @manually_created_transcripts = manually_created_transcripts
    @generated_transcripts = generated_transcripts
    @translation_languages = translation_languages
  end

  # Iterate over all transcripts (manually created first, then generated)
  #
  # @yield [Transcript] each available transcript
  # @return [Enumerator] if no block given
  def each(&block)
    return to_enum(:each) unless block_given?

    @manually_created_transcripts.each_value(&block)
    @generated_transcripts.each_value(&block)
  end

  # Find a transcript for the given language codes.
  # For each requested code, a manually created transcript is preferred over
  # a generated one; codes are tried in the order given.
  #
  # @param language_codes [Array<String>, String] language codes in descending
  #   priority; a single code may be passed without wrapping it in an array
  # @return [Transcript] the found transcript
  # @raise [NoTranscriptFound] if no transcript matches the requested languages
  def find_transcript(language_codes)
    find_transcript_in(
      language_codes,
      [@manually_created_transcripts, @generated_transcripts]
    )
  end

  # Find an automatically generated transcript for the given language codes.
  #
  # @param language_codes [Array<String>, String] language codes in descending priority
  # @return [Transcript] the found transcript
  # @raise [NoTranscriptFound] if no generated transcript matches
  def find_generated_transcript(language_codes)
    find_transcript_in(language_codes, [@generated_transcripts])
  end

  # Find a manually created transcript for the given language codes.
  #
  # @param language_codes [Array<String>, String] language codes in descending priority
  # @return [Transcript] the found transcript
  # @raise [NoTranscriptFound] if no manually created transcript matches
  def find_manually_created_transcript(language_codes)
    find_transcript_in(language_codes, [@manually_created_transcripts])
  end

  # String representation of the transcript list
  #
  # @return [String] human-readable description of available transcripts
  def to_s
    <<~DESC
      For this video (#{@video_id}) transcripts are available in the following languages:

      (MANUALLY CREATED)
      #{format_language_list(@manually_created_transcripts.values)}

      (GENERATED)
      #{format_language_list(@generated_transcripts.values)}

      (TRANSLATION LANGUAGES)
      #{format_translation_languages}
    DESC
  end

  private

  # Find a transcript from the given dictionaries.
  #
  # The outer loop is over language codes, so priority between codes wins
  # over priority between dictionaries.
  #
  # @param language_codes [Array<String>, String] language codes to search for
  # @param transcript_dicts [Array<Hash>] transcript dictionaries to search
  # @return [Transcript] the found transcript
  # @raise [NoTranscriptFound] if no transcript matches
  def find_transcript_in(language_codes, transcript_dicts)
    # Array() accepts a bare code ("en") as well as a list; previously a
    # String argument failed with NoMethodError on #each.
    requested_codes = Array(language_codes)

    requested_codes.each do |language_code|
      transcript_dicts.each do |dict|
        return dict[language_code] if dict.key?(language_code)
      end
    end

    raise NoTranscriptFound.new(@video_id, requested_codes, self)
  end

  # Format a list of transcripts for display
  #
  # @param transcripts [Array<Transcript>] transcripts to format
  # @return [String] formatted list or "None"
  def format_language_list(transcripts)
    return "None" if transcripts.empty?

    transcripts.map { |t| " - #{t}" }.join("\n")
  end

  # Format translation languages for display
  #
  # @return [String] formatted list or "None"
  def format_translation_languages
    return "None" if @translation_languages.empty?

    @translation_languages.map do |tl|
      " - #{tl.language_code} (\"#{tl.language}\")"
    end.join("\n")
  end
end
168
+ end
169
+ end
170
+ end
@@ -0,0 +1,225 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "cgi"
4
+ require "json"
5
+
6
+ module Youtube
7
+ module Transcript
8
+ module Rb
9
# Values of the "status" field inside a "playabilityStatus" response object,
# as compared against by TranscriptListFetcher#assert_playability.
module PlayabilityStatus
  # The video is playable; no error is raised for this status.
  OK = "OK"

  # Generic failure status; combined with the reason text to pick an error.
  ERROR = "ERROR"

  # Status inspected for the bot-detection and age-restriction reasons.
  LOGIN_REQUIRED = "LOGIN_REQUIRED"
end
15
+
16
# Texts of the "reason" field that accompany a non-OK playability status.
#
# NOTE(review): TranscriptListFetcher#assert_playability matches these with
# plain ==, so each string has to equal the response text character for
# character (including the apostrophe in BOT_DETECTED) — confirm against a
# live response.
module PlayabilityFailedReason
  # Reason checked under LOGIN_REQUIRED to signal a blocked request.
  BOT_DETECTED = "Sign in to confirm you're not a bot"

  # Reason checked under LOGIN_REQUIRED to signal an age-restricted video.
  AGE_RESTRICTED = "This video may be inappropriate for some users."

  # Reason checked under ERROR to signal an unavailable video.
  VIDEO_UNAVAILABLE = "This video is unavailable"
end
22
+
23
# Retrieves the list of available transcripts for a YouTube video.
# All HTTP traffic goes through the injected client: the watch page is
# loaded (passing the consent interstitial if one is served), the Innertube
# API key is scraped from it, the Innertube endpoint is queried, and failure
# responses are translated into the library's error classes.
class TranscriptListFetcher
  # @param http_client [Faraday::Connection] the HTTP client to use
  # @param proxy_config [Object, nil] optional proxy configuration; if it
  #   responds to +retries_when_blocked+, that value bounds the number of
  #   attempts made when a request is blocked
  def initialize(http_client:, proxy_config: nil)
    @proxy_config = proxy_config
    @http_client = http_client
  end

  # Fetch the transcript list for a video
  #
  # @param video_id [String] the YouTube video ID
  # @return [TranscriptList] the list of available transcripts
  # @raise [CouldNotRetrieveTranscript] if transcripts cannot be retrieved
  def fetch(video_id)
    captions = fetch_captions_json(video_id)
    TranscriptList.build(http_client: @http_client, video_id: video_id, captions_json: captions)
  end

  private

  # Run the whole page -> API key -> Innertube -> captions pipeline. When
  # RequestBlocked is raised the pipeline is started again, as long as
  # try_number + 1 is still below the configured limit; otherwise the error
  # is re-raised.
  #
  # @param video_id [String] the YouTube video ID
  # @param try_number [Integer] zero-based index of the current attempt
  # @return [Hash] the captions JSON
  def fetch_captions_json(video_id, try_number: 0)
    page = fetch_video_html(video_id)
    key = extract_innertube_api_key(page, video_id)
    extract_captions_json(fetch_innertube_data(video_id, key), video_id)
  rescue RequestBlocked => e
    raise e unless try_number + 1 < blocked_retry_limit

    fetch_captions_json(video_id, try_number: try_number + 1)
  end

  # Attempt limit taken from the proxy configuration; 0 when there is no
  # configuration or it does not expose +retries_when_blocked+.
  #
  # @return [Integer]
  def blocked_retry_limit
    return 0 if @proxy_config.nil?
    return 0 unless @proxy_config.respond_to?(:retries_when_blocked)

    @proxy_config.retries_when_blocked
  end

  # Pull the INNERTUBE_API_KEY value out of the watch page.
  #
  # @param html [String] the HTML content
  # @param video_id [String] the video ID (for error messages)
  # @return [String] the API key
  # @raise [IpBlocked] if the page contains a reCAPTCHA element instead
  # @raise [YouTubeDataUnparsable] if the key cannot be found
  def extract_innertube_api_key(html, video_id)
    api_key = html[/"INNERTUBE_API_KEY":\s*"([a-zA-Z0-9_-]+)"/, 1]
    return api_key if api_key

    failure = html.include?('class="g-recaptcha"') ? IpBlocked : YouTubeDataUnparsable
    raise failure, video_id
  end

  # Validate playability, then return the caption tracklist renderer.
  #
  # @param innertube_data [Hash] the innertube API response
  # @param video_id [String] the video ID
  # @return [Hash] the captions JSON
  # @raise [TranscriptsDisabled] if the renderer or its "captionTracks" key is missing
  def extract_captions_json(innertube_data, video_id)
    assert_playability(innertube_data["playabilityStatus"], video_id)

    tracklist = innertube_data.dig("captions", "playerCaptionsTracklistRenderer")
    raise TranscriptsDisabled, video_id unless tracklist&.key?("captionTracks")

    tracklist
  end

  # Raise the error matching a non-OK playability status. Missing data, a
  # missing status and the OK status all pass silently.
  #
  # @param playability_status_data [Hash, nil] the playability status from API
  # @param video_id [String] the video ID
  # @raise [RequestBlocked] LOGIN_REQUIRED with the bot-detection reason
  # @raise [AgeRestricted] LOGIN_REQUIRED with the age-restriction reason
  # @raise [InvalidVideoId] unavailable video whose "ID" is actually a URL
  # @raise [VideoUnavailable] unavailable video
  # @raise [VideoUnplayable] every other non-OK status
  def assert_playability(playability_status_data, video_id)
    return if playability_status_data.nil?

    status = playability_status_data["status"]
    return if status.nil? || status == PlayabilityStatus::OK

    reason = playability_status_data["reason"]

    if status == PlayabilityStatus::LOGIN_REQUIRED
      raise RequestBlocked, video_id if reason == PlayabilityFailedReason::BOT_DETECTED
      raise AgeRestricted, video_id if reason == PlayabilityFailedReason::AGE_RESTRICTED
    end

    if status == PlayabilityStatus::ERROR && reason == PlayabilityFailedReason::VIDEO_UNAVAILABLE
      # A full URL passed as the ID is reported as a caller mistake.
      raise InvalidVideoId, video_id if video_id.start_with?("http://", "https://")

      raise VideoUnavailable, video_id
    end

    # Anything else: surface the reason together with any sub-reason texts.
    runs = playability_status_data.dig("errorScreen", "playerErrorMessageRenderer", "subreason", "runs") || []
    raise VideoUnplayable.new(video_id, reason, runs.map { |run| run["text"] || "" })
  end

  # Remember the consent value found in the consent form so that later
  # page requests can send it as a cookie header (see #fetch_html).
  #
  # @param html [String] the HTML content
  # @param video_id [String] the video ID
  # @raise [FailedToCreateConsentCookie] if the form has no "v" value
  def create_consent_cookie(html, video_id)
    token = html[/name="v" value="(.*?)"/, 1]
    raise FailedToCreateConsentCookie, video_id if token.nil?

    @consent_value = "YES+#{token}"
  end

  # Load the watch page, retrying once with a consent cookie when the
  # consent form is served instead.
  #
  # @param video_id [String] the video ID
  # @return [String] the HTML content
  # @raise [FailedToCreateConsentCookie] if the consent form is still served
  #   after the cookie was set
  def fetch_video_html(video_id)
    first_page = fetch_html(video_id)
    return first_page unless consent_page?(first_page)

    create_consent_cookie(first_page, video_id)
    second_page = fetch_html(video_id)
    raise FailedToCreateConsentCookie, video_id if consent_page?(second_page)

    second_page
  end

  # Whether the page contains the consent form.
  #
  # @param html [String] the HTML content
  # @return [Boolean]
  def consent_page?(html)
    html.include?('action="https://consent.youtube.com/s"')
  end

  # Request the watch page and return its body with HTML entities decoded.
  #
  # @param video_id [String] the video ID
  # @return [String] the HTML content (unescaped)
  def fetch_html(video_id)
    request_headers = { "Accept-Language" => "en-US" }
    # Only sent once #create_consent_cookie has stored a value.
    request_headers["Cookie"] = "CONSENT=#{@consent_value}" if @consent_value

    response = @http_client.get(format(WATCH_URL, video_id: video_id)) do |req|
      request_headers.each_pair { |name, value| req.headers[name] = value }
    end

    raise_http_errors(response, video_id)
    CGI.unescapeHTML(response.body)
  end

  # POST the video ID and client context to the Innertube endpoint and
  # return the parsed JSON response.
  #
  # @param video_id [String] the video ID
  # @param api_key [String] the API key
  # @return [Hash] the API response
  def fetch_innertube_data(video_id, api_key)
    endpoint = format(INNERTUBE_API_URL, api_key: api_key)

    response = @http_client.post(endpoint) do |req|
      req.headers["Content-Type"] = "application/json"
      req.body = JSON.generate({ "context" => INNERTUBE_CONTEXT, "videoId" => video_id })
    end

    raise_http_errors(response, video_id)
    JSON.parse(response.body)
  end

  # Map error status codes to library errors; other statuses return nil.
  #
  # @param response [Faraday::Response] the HTTP response
  # @param video_id [String] the video ID
  # @raise [IpBlocked] for 429 responses
  # @raise [YouTubeRequestFailed] for other responses in 400..599
  def raise_http_errors(response, video_id)
    status = response.status
    raise IpBlocked, video_id if status == 429
    return unless (400..599).cover?(status)

    raise YouTubeRequestFailed.new(video_id, StandardError.new("HTTP #{status}"))
  end
end
223
+ end
224
+ end
225
+ end