youtube-transcript-rb 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of youtube-transcript-rb might be problematic. Click here for more details.

Files changed (35)
  1. checksums.yaml +4 -4
  2. data/README.md +42 -42
  3. data/lib/youtube-transcript-rb.rb +3 -0
  4. data/lib/youtube_rb/transcript/api.rb +148 -0
  5. data/lib/youtube_rb/transcript/errors.rb +215 -0
  6. data/lib/youtube_rb/transcript/formatters.rb +267 -0
  7. data/lib/youtube_rb/transcript/settings.rb +26 -0
  8. data/lib/youtube_rb/transcript/transcript.rb +237 -0
  9. data/lib/youtube_rb/transcript/transcript_list.rb +168 -0
  10. data/lib/youtube_rb/transcript/transcript_list_fetcher.rb +223 -0
  11. data/lib/youtube_rb/transcript/transcript_parser.rb +81 -0
  12. data/lib/{youtube/transcript/rb → youtube_rb/transcript}/version.rb +2 -4
  13. data/lib/youtube_rb/transcript.rb +35 -0
  14. data/sig/youtube_rb/transcript.rbs +6 -0
  15. data/spec/api_spec.rb +20 -20
  16. data/spec/errors_spec.rb +39 -39
  17. data/spec/formatters_spec.rb +36 -36
  18. data/spec/integration_spec.rb +32 -32
  19. data/spec/settings_spec.rb +16 -16
  20. data/spec/spec_helper.rb +1 -1
  21. data/spec/transcript_list_fetcher_spec.rb +27 -27
  22. data/spec/transcript_list_spec.rb +6 -6
  23. data/spec/transcript_parser_spec.rb +3 -3
  24. data/spec/transcript_spec.rb +16 -16
  25. metadata +13 -12
  26. data/lib/youtube/transcript/rb/api.rb +0 -150
  27. data/lib/youtube/transcript/rb/errors.rb +0 -217
  28. data/lib/youtube/transcript/rb/formatters.rb +0 -269
  29. data/lib/youtube/transcript/rb/settings.rb +0 -28
  30. data/lib/youtube/transcript/rb/transcript.rb +0 -239
  31. data/lib/youtube/transcript/rb/transcript_list.rb +0 -170
  32. data/lib/youtube/transcript/rb/transcript_list_fetcher.rb +0 -225
  33. data/lib/youtube/transcript/rb/transcript_parser.rb +0 -83
  34. data/lib/youtube/transcript/rb.rb +0 -37
  35. data/sig/youtube/transcript/rb.rbs +0 -8
@@ -1,170 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Youtube
4
- module Transcript
5
- module Rb
6
- # Represents a list of available transcripts for a YouTube video.
7
- # This class is Enumerable, allowing iteration over all available transcripts.
8
- # It provides functionality to search for transcripts in specific languages.
9
- class TranscriptList
10
- include Enumerable
11
-
12
- # @return [String] the video ID this TranscriptList is for
13
- attr_reader :video_id
14
-
15
- # Build a TranscriptList from captions JSON data
16
- #
17
- # @param http_client [Faraday::Connection] the HTTP client for fetching transcripts
18
- # @param video_id [String] the YouTube video ID
19
- # @param captions_json [Hash] the captions JSON parsed from YouTube
20
- # @return [TranscriptList] the created TranscriptList
21
- def self.build(http_client:, video_id:, captions_json:)
22
- translation_languages = (captions_json["translationLanguages"] || []).map do |tl|
23
- TranslationLanguage.new(
24
- language: tl.dig("languageName", "runs", 0, "text") || "",
25
- language_code: tl["languageCode"]
26
- )
27
- end
28
-
29
- manually_created_transcripts = {}
30
- generated_transcripts = {}
31
-
32
- (captions_json["captionTracks"] || []).each do |caption|
33
- is_generated = caption.fetch("kind", "") == "asr"
34
- target_dict = is_generated ? generated_transcripts : manually_created_transcripts
35
-
36
- language_code = caption["languageCode"]
37
- transcript_translation_languages = caption.fetch("isTranslatable", false) ? translation_languages : []
38
-
39
- target_dict[language_code] = Transcript.new(
40
- http_client: http_client,
41
- video_id: video_id,
42
- url: caption["baseUrl"].to_s.gsub("&fmt=srv3", ""),
43
- language: caption.dig("name", "runs", 0, "text") || "",
44
- language_code: language_code,
45
- is_generated: is_generated,
46
- translation_languages: transcript_translation_languages
47
- )
48
- end
49
-
50
- new(
51
- video_id: video_id,
52
- manually_created_transcripts: manually_created_transcripts,
53
- generated_transcripts: generated_transcripts,
54
- translation_languages: translation_languages
55
- )
56
- end
57
-
58
- # @param video_id [String] the YouTube video ID
59
- # @param manually_created_transcripts [Hash<String, Transcript>] manually created transcripts by language code
60
- # @param generated_transcripts [Hash<String, Transcript>] auto-generated transcripts by language code
61
- # @param translation_languages [Array<TranslationLanguage>] available translation languages
62
- def initialize(video_id:, manually_created_transcripts:, generated_transcripts:, translation_languages:)
63
- @video_id = video_id
64
- @manually_created_transcripts = manually_created_transcripts
65
- @generated_transcripts = generated_transcripts
66
- @translation_languages = translation_languages
67
- end
68
-
69
- # Iterate over all transcripts (manually created first, then generated)
70
- #
71
- # @yield [Transcript] each available transcript
72
- # @return [Enumerator] if no block given
73
- def each(&block)
74
- return to_enum(:each) unless block_given?
75
-
76
- @manually_created_transcripts.each_value(&block)
77
- @generated_transcripts.each_value(&block)
78
- end
79
-
80
- # Find a transcript for the given language codes.
81
- # Manually created transcripts are preferred over generated ones.
82
- #
83
- # @param language_codes [Array<String>] language codes in descending priority
84
- # @return [Transcript] the found transcript
85
- # @raise [NoTranscriptFound] if no transcript matches the requested languages
86
- def find_transcript(language_codes)
87
- find_transcript_in(
88
- language_codes,
89
- [@manually_created_transcripts, @generated_transcripts]
90
- )
91
- end
92
-
93
- # Find an automatically generated transcript for the given language codes.
94
- #
95
- # @param language_codes [Array<String>] language codes in descending priority
96
- # @return [Transcript] the found transcript
97
- # @raise [NoTranscriptFound] if no generated transcript matches
98
- def find_generated_transcript(language_codes)
99
- find_transcript_in(language_codes, [@generated_transcripts])
100
- end
101
-
102
- # Find a manually created transcript for the given language codes.
103
- #
104
- # @param language_codes [Array<String>] language codes in descending priority
105
- # @return [Transcript] the found transcript
106
- # @raise [NoTranscriptFound] if no manually created transcript matches
107
- def find_manually_created_transcript(language_codes)
108
- find_transcript_in(language_codes, [@manually_created_transcripts])
109
- end
110
-
111
- # String representation of the transcript list
112
- #
113
- # @return [String] human-readable description of available transcripts
114
- def to_s
115
- <<~DESC
116
- For this video (#{@video_id}) transcripts are available in the following languages:
117
-
118
- (MANUALLY CREATED)
119
- #{format_language_list(@manually_created_transcripts.values)}
120
-
121
- (GENERATED)
122
- #{format_language_list(@generated_transcripts.values)}
123
-
124
- (TRANSLATION LANGUAGES)
125
- #{format_translation_languages}
126
- DESC
127
- end
128
-
129
- private
130
-
131
- # Find a transcript from the given dictionaries
132
- #
133
- # @param language_codes [Array<String>] language codes to search for
134
- # @param transcript_dicts [Array<Hash>] transcript dictionaries to search
135
- # @return [Transcript] the found transcript
136
- # @raise [NoTranscriptFound] if no transcript matches
137
- def find_transcript_in(language_codes, transcript_dicts)
138
- language_codes.each do |language_code|
139
- transcript_dicts.each do |dict|
140
- return dict[language_code] if dict.key?(language_code)
141
- end
142
- end
143
-
144
- raise NoTranscriptFound.new(@video_id, language_codes, self)
145
- end
146
-
147
- # Format a list of transcripts for display
148
- #
149
- # @param transcripts [Array<Transcript>] transcripts to format
150
- # @return [String] formatted list or "None"
151
- def format_language_list(transcripts)
152
- return "None" if transcripts.empty?
153
-
154
- transcripts.map { |t| " - #{t}" }.join("\n")
155
- end
156
-
157
- # Format translation languages for display
158
- #
159
- # @return [String] formatted list or "None"
160
- def format_translation_languages
161
- return "None" if @translation_languages.empty?
162
-
163
- @translation_languages.map do |tl|
164
- " - #{tl.language_code} (\"#{tl.language}\")"
165
- end.join("\n")
166
- end
167
- end
168
- end
169
- end
170
- end
@@ -1,225 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "cgi"
4
- require "json"
5
-
6
- module Youtube
7
- module Transcript
8
- module Rb
9
- # Playability status values returned by YouTube
10
- module PlayabilityStatus
11
- OK = "OK"
12
- ERROR = "ERROR"
13
- LOGIN_REQUIRED = "LOGIN_REQUIRED"
14
- end
15
-
16
- # Reason messages for playability failures
17
- module PlayabilityFailedReason
18
- BOT_DETECTED = "Sign in to confirm you're not a bot"
19
- AGE_RESTRICTED = "This video may be inappropriate for some users."
20
- VIDEO_UNAVAILABLE = "This video is unavailable"
21
- end
22
-
23
- # Fetches transcript lists from YouTube videos.
24
- # This class handles all the HTTP communication with YouTube,
25
- # including consent cookie handling and error detection.
26
- class TranscriptListFetcher
27
- # @param http_client [Faraday::Connection] the HTTP client to use
28
- # @param proxy_config [Object, nil] optional proxy configuration
29
- def initialize(http_client:, proxy_config: nil)
30
- @http_client = http_client
31
- @proxy_config = proxy_config
32
- end
33
-
34
- # Fetch the transcript list for a video
35
- #
36
- # @param video_id [String] the YouTube video ID
37
- # @return [TranscriptList] the list of available transcripts
38
- # @raise [CouldNotRetrieveTranscript] if transcripts cannot be retrieved
39
- def fetch(video_id)
40
- TranscriptList.build(
41
- http_client: @http_client,
42
- video_id: video_id,
43
- captions_json: fetch_captions_json(video_id)
44
- )
45
- end
46
-
47
- private
48
-
49
- # Fetch captions JSON with retry support
50
- #
51
- # @param video_id [String] the YouTube video ID
52
- # @param try_number [Integer] current retry attempt
53
- # @return [Hash] the captions JSON
54
- def fetch_captions_json(video_id, try_number: 0)
55
- html = fetch_video_html(video_id)
56
- api_key = extract_innertube_api_key(html, video_id)
57
- innertube_data = fetch_innertube_data(video_id, api_key)
58
- extract_captions_json(innertube_data, video_id)
59
- rescue RequestBlocked => e
60
- retries = @proxy_config.nil? ? 0 : (@proxy_config.respond_to?(:retries_when_blocked) ? @proxy_config.retries_when_blocked : 0)
61
- if try_number + 1 < retries
62
- return fetch_captions_json(video_id, try_number: try_number + 1)
63
- end
64
- raise e
65
- end
66
-
67
- # Extract the INNERTUBE_API_KEY from the video page HTML
68
- #
69
- # @param html [String] the HTML content
70
- # @param video_id [String] the video ID (for error messages)
71
- # @return [String] the API key
72
- # @raise [IpBlocked] if a CAPTCHA is detected
73
- # @raise [YouTubeDataUnparsable] if the key cannot be found
74
- def extract_innertube_api_key(html, video_id)
75
- match = html.match(/"INNERTUBE_API_KEY":\s*"([a-zA-Z0-9_-]+)"/)
76
- if match && match[1]
77
- return match[1]
78
- end
79
-
80
- raise IpBlocked, video_id if html.include?('class="g-recaptcha"')
81
- raise YouTubeDataUnparsable, video_id
82
- end
83
-
84
- # Extract captions JSON from innertube data
85
- #
86
- # @param innertube_data [Hash] the innertube API response
87
- # @param video_id [String] the video ID
88
- # @return [Hash] the captions JSON
89
- # @raise [TranscriptsDisabled] if no captions are available
90
- def extract_captions_json(innertube_data, video_id)
91
- assert_playability(innertube_data["playabilityStatus"], video_id)
92
-
93
- captions_json = innertube_data.dig("captions", "playerCaptionsTracklistRenderer")
94
- if captions_json.nil? || !captions_json.key?("captionTracks")
95
- raise TranscriptsDisabled, video_id
96
- end
97
-
98
- captions_json
99
- end
100
-
101
- # Assert that the video is playable
102
- #
103
- # @param playability_status_data [Hash, nil] the playability status from API
104
- # @param video_id [String] the video ID
105
- # @raise [Various] depending on the playability status
106
- def assert_playability(playability_status_data, video_id)
107
- return if playability_status_data.nil?
108
-
109
- status = playability_status_data["status"]
110
- return if status == PlayabilityStatus::OK || status.nil?
111
-
112
- reason = playability_status_data["reason"]
113
-
114
- if status == PlayabilityStatus::LOGIN_REQUIRED
115
- if reason == PlayabilityFailedReason::BOT_DETECTED
116
- raise RequestBlocked, video_id
117
- elsif reason == PlayabilityFailedReason::AGE_RESTRICTED
118
- raise AgeRestricted, video_id
119
- end
120
- end
121
-
122
- if status == PlayabilityStatus::ERROR && reason == PlayabilityFailedReason::VIDEO_UNAVAILABLE
123
- if video_id.start_with?("http://") || video_id.start_with?("https://")
124
- raise InvalidVideoId, video_id
125
- end
126
- raise VideoUnavailable, video_id
127
- end
128
-
129
- # Extract subreasons for more detailed error messages
130
- subreasons = playability_status_data.dig("errorScreen", "playerErrorMessageRenderer", "subreason", "runs") || []
131
- subreason_texts = subreasons.map { |run| run["text"] || "" }
132
-
133
- raise VideoUnplayable.new(video_id, reason, subreason_texts)
134
- end
135
-
136
- # Create a consent cookie from the HTML
137
- #
138
- # @param html [String] the HTML content
139
- # @param video_id [String] the video ID
140
- # @raise [FailedToCreateConsentCookie] if the cookie cannot be created
141
- def create_consent_cookie(html, video_id)
142
- match = html.match(/name="v" value="(.*?)"/)
143
- raise FailedToCreateConsentCookie, video_id if match.nil?
144
-
145
- # Set the consent cookie
146
- # Note: Faraday doesn't have built-in cookie management like requests.Session
147
- # We'll need to handle this via headers or middleware
148
- @consent_value = "YES+#{match[1]}"
149
- end
150
-
151
- # Fetch the video HTML page
152
- #
153
- # @param video_id [String] the video ID
154
- # @return [String] the HTML content
155
- def fetch_video_html(video_id)
156
- html = fetch_html(video_id)
157
-
158
- if html.include?('action="https://consent.youtube.com/s"')
159
- create_consent_cookie(html, video_id)
160
- html = fetch_html(video_id)
161
- if html.include?('action="https://consent.youtube.com/s"')
162
- raise FailedToCreateConsentCookie, video_id
163
- end
164
- end
165
-
166
- html
167
- end
168
-
169
- # Fetch raw HTML from YouTube
170
- #
171
- # @param video_id [String] the video ID
172
- # @return [String] the HTML content (unescaped)
173
- def fetch_html(video_id)
174
- url = format(WATCH_URL, video_id: video_id)
175
- headers = { "Accept-Language" => "en-US" }
176
-
177
- # Add consent cookie if we have one
178
- headers["Cookie"] = "CONSENT=#{@consent_value}" if @consent_value
179
-
180
- response = @http_client.get(url) do |req|
181
- headers.each { |k, v| req.headers[k] = v }
182
- end
183
-
184
- raise_http_errors(response, video_id)
185
- CGI.unescapeHTML(response.body)
186
- end
187
-
188
- # Fetch data from the Innertube API
189
- #
190
- # @param video_id [String] the video ID
191
- # @param api_key [String] the API key
192
- # @return [Hash] the API response
193
- def fetch_innertube_data(video_id, api_key)
194
- url = format(INNERTUBE_API_URL, api_key: api_key)
195
-
196
- response = @http_client.post(url) do |req|
197
- req.headers["Content-Type"] = "application/json"
198
- req.body = JSON.generate({
199
- "context" => INNERTUBE_CONTEXT,
200
- "videoId" => video_id
201
- })
202
- end
203
-
204
- raise_http_errors(response, video_id)
205
- JSON.parse(response.body)
206
- end
207
-
208
- # Raise appropriate errors for HTTP responses
209
- #
210
- # @param response [Faraday::Response] the HTTP response
211
- # @param video_id [String] the video ID
212
- # @raise [IpBlocked] for 429 responses
213
- # @raise [YouTubeRequestFailed] for other error responses
214
- def raise_http_errors(response, video_id)
215
- case response.status
216
- when 429
217
- raise IpBlocked, video_id
218
- when 400..599
219
- raise YouTubeRequestFailed.new(video_id, StandardError.new("HTTP #{response.status}"))
220
- end
221
- end
222
- end
223
- end
224
- end
225
- end
@@ -1,83 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "nokogiri"
4
- require "cgi"
5
-
6
- module Youtube
7
- module Transcript
8
- module Rb
9
- # Parses XML transcript data from YouTube
10
- class TranscriptParser
11
- # HTML formatting tags to preserve when preserve_formatting is enabled
12
- FORMATTING_TAGS = %w[
13
- strong
14
- em
15
- b
16
- i
17
- mark
18
- small
19
- del
20
- ins
21
- sub
22
- sup
23
- ].freeze
24
-
25
- # @param preserve_formatting [Boolean] whether to preserve HTML formatting tags
26
- def initialize(preserve_formatting: false)
27
- @preserve_formatting = preserve_formatting
28
- @html_regex = build_html_regex
29
- end
30
-
31
- # Parse XML transcript data into TranscriptSnippet objects
32
- # @param raw_data [String] the raw XML data from YouTube
33
- # @return [Array<TranscriptSnippet>] parsed transcript snippets
34
- def parse(raw_data)
35
- doc = Nokogiri::XML(raw_data)
36
- snippets = []
37
-
38
- doc.xpath("//text").each do |element|
39
- text_content = element.text
40
- next if text_content.nil? || text_content.empty?
41
-
42
- # Unescape HTML entities and remove unwanted HTML tags
43
- text = process_text(text_content)
44
-
45
- snippets << TranscriptSnippet.new(
46
- text: text,
47
- start: element["start"].to_f,
48
- duration: (element["dur"] || "0.0").to_f
49
- )
50
- end
51
-
52
- snippets
53
- end
54
-
55
- private
56
-
57
- # Build regex for removing HTML tags
58
- # @return [Regexp]
59
- def build_html_regex
60
- if @preserve_formatting
61
- # Remove all tags except formatting tags
62
- formats_pattern = FORMATTING_TAGS.join("|")
63
- # Match tags that are NOT the formatting tags
64
- Regexp.new("</?(?!/?(?:#{formats_pattern})\\b)[^>]*>", Regexp::IGNORECASE)
65
- else
66
- # Remove all HTML tags
67
- Regexp.new("<[^>]*>", Regexp::IGNORECASE)
68
- end
69
- end
70
-
71
- # Process text by unescaping HTML entities and removing unwanted tags
72
- # @param text [String] the raw text
73
- # @return [String] processed text
74
- def process_text(text)
75
- # Unescape HTML entities
76
- unescaped = CGI.unescapeHTML(text)
77
- # Remove unwanted HTML tags
78
- unescaped.gsub(@html_regex, "")
79
- end
80
- end
81
- end
82
- end
83
- end
@@ -1,37 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative "rb/version"
4
- require_relative "rb/settings"
5
- require_relative "rb/errors"
6
- require_relative "rb/transcript_parser"
7
- require_relative "rb/transcript"
8
- require_relative "rb/transcript_list"
9
- require_relative "rb/transcript_list_fetcher"
10
- require_relative "rb/api"
11
- require_relative "rb/formatters"
12
-
13
- module Youtube
14
- module Transcript
15
- module Rb
16
- class << self
17
- # Convenience method to fetch a transcript
18
- # @param video_id [String] YouTube video ID
19
- # @param languages [Array<String>] Language codes in order of preference
20
- # @param preserve_formatting [Boolean] Whether to preserve HTML formatting
21
- # @return [FetchedTranscript] The fetched transcript
22
- def fetch(video_id, languages: ["en"], preserve_formatting: false)
23
- api = YouTubeTranscriptApi.new
24
- api.fetch(video_id, languages: languages, preserve_formatting: preserve_formatting)
25
- end
26
-
27
- # Convenience method to list available transcripts
28
- # @param video_id [String] YouTube video ID
29
- # @return [TranscriptList] List of available transcripts
30
- def list(video_id)
31
- api = YouTubeTranscriptApi.new
32
- api.list(video_id)
33
- end
34
- end
35
- end
36
- end
37
- end
@@ -1,8 +0,0 @@
1
- module Youtube
2
- module Transcript
3
- module Rb
4
- VERSION: String
5
- # See the writing guide of rbs: https://github.com/ruby/rbs#guides
6
- end
7
- end
8
- end