youtube-transcript-rb 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of youtube-transcript-rb might be problematic. Click here for more details.

Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +42 -42
  3. data/lib/youtube-transcript-rb.rb +3 -0
  4. data/lib/youtube_rb/transcript/api.rb +148 -0
  5. data/lib/youtube_rb/transcript/errors.rb +215 -0
  6. data/lib/youtube_rb/transcript/formatters.rb +267 -0
  7. data/lib/youtube_rb/transcript/settings.rb +26 -0
  8. data/lib/youtube_rb/transcript/transcript.rb +237 -0
  9. data/lib/youtube_rb/transcript/transcript_list.rb +168 -0
  10. data/lib/youtube_rb/transcript/transcript_list_fetcher.rb +223 -0
  11. data/lib/youtube_rb/transcript/transcript_parser.rb +81 -0
  12. data/lib/{youtube/transcript/rb → youtube_rb/transcript}/version.rb +2 -4
  13. data/lib/youtube_rb/transcript.rb +35 -0
  14. data/sig/youtube_rb/transcript.rbs +6 -0
  15. data/spec/api_spec.rb +20 -20
  16. data/spec/errors_spec.rb +39 -39
  17. data/spec/formatters_spec.rb +36 -36
  18. data/spec/integration_spec.rb +32 -32
  19. data/spec/settings_spec.rb +16 -16
  20. data/spec/spec_helper.rb +1 -1
  21. data/spec/transcript_list_fetcher_spec.rb +27 -27
  22. data/spec/transcript_list_spec.rb +6 -6
  23. data/spec/transcript_parser_spec.rb +3 -3
  24. data/spec/transcript_spec.rb +16 -16
  25. metadata +13 -12
  26. data/lib/youtube/transcript/rb/api.rb +0 -150
  27. data/lib/youtube/transcript/rb/errors.rb +0 -217
  28. data/lib/youtube/transcript/rb/formatters.rb +0 -269
  29. data/lib/youtube/transcript/rb/settings.rb +0 -28
  30. data/lib/youtube/transcript/rb/transcript.rb +0 -239
  31. data/lib/youtube/transcript/rb/transcript_list.rb +0 -170
  32. data/lib/youtube/transcript/rb/transcript_list_fetcher.rb +0 -225
  33. data/lib/youtube/transcript/rb/transcript_parser.rb +0 -83
  34. data/lib/youtube/transcript/rb.rb +0 -37
  35. data/sig/youtube/transcript/rb.rbs +0 -8
@@ -0,0 +1,223 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "cgi"
4
+ require "json"
5
+
6
+ module YoutubeRb
7
+ module Transcript
8
# Values of the "status" field in YouTube's "playabilityStatus" object
# that the transcript fetcher distinguishes.
module PlayabilityStatus
  OK             = "OK"
  ERROR          = "ERROR"
  LOGIN_REQUIRED = "LOGIN_REQUIRED"
end
14
+
15
# Exact "reason" strings of a failed playability status that are compared
# against when deciding which error to raise.
module PlayabilityFailedReason
  BOT_DETECTED      = "Sign in to confirm you're not a bot"
  AGE_RESTRICTED    = "This video may be inappropriate for some users."
  VIDEO_UNAVAILABLE = "This video is unavailable"
end
21
+
22
# Fetches transcript lists from YouTube videos.
#
# Performs the HTTP requests against YouTube (the watch page, then the
# Innertube endpoint), handles the consent-cookie round trip and maps
# playability and HTTP failures onto the library's error classes.
class TranscriptListFetcher
  # @param http_client [Faraday::Connection] the HTTP client to use
  # @param proxy_config [Object, nil] optional proxy configuration; when it
  #   responds to +retries_when_blocked+, that value bounds the number of
  #   attempts made after a RequestBlocked error
  def initialize(http_client:, proxy_config: nil)
    @http_client = http_client
    @proxy_config = proxy_config
    # Filled in by #create_consent_cookie once a consent page has been seen.
    @consent_value = nil
  end

  # Fetch the transcript list for a video
  #
  # @param video_id [String] the YouTube video ID
  # @return [TranscriptList] the list of available transcripts
  # @raise [CouldNotRetrieveTranscript] if transcripts cannot be retrieved
  def fetch(video_id)
    TranscriptList.build(
      http_client: @http_client,
      video_id: video_id,
      captions_json: fetch_captions_json(video_id)
    )
  end

  private

  # Fetch captions JSON, starting over when the request was blocked and the
  # proxy configuration allows further attempts.
  #
  # @param video_id [String] the YouTube video ID
  # @param try_number [Integer] zero-based index of the current attempt
  # @return [Hash] the captions JSON
  # @raise [RequestBlocked] once the allowed attempts are exhausted
  def fetch_captions_json(video_id, try_number: 0)
    html = fetch_video_html(video_id)
    api_key = extract_innertube_api_key(html, video_id)
    innertube_data = fetch_innertube_data(video_id, api_key)
    extract_captions_json(innertube_data, video_id)
  rescue RequestBlocked
    # Bare raise re-raises the RequestBlocked currently being handled.
    raise unless try_number + 1 < retries_when_blocked

    fetch_captions_json(video_id, try_number: try_number + 1)
  end

  # Number of attempts allowed when YouTube blocks the request.
  #
  # @return [Integer] 0 when no proxy config is set or it does not respond to
  #   +retries_when_blocked+; otherwise whatever the proxy config returns
  def retries_when_blocked
    # nil.respond_to?(:retries_when_blocked) is false, so a missing proxy
    # configuration is covered by the same guard.
    return 0 unless @proxy_config.respond_to?(:retries_when_blocked)

    @proxy_config.retries_when_blocked
  end

  # Extract the INNERTUBE_API_KEY from the video page HTML
  #
  # @param html [String] the HTML content
  # @param video_id [String] the video ID (for error messages)
  # @return [String] the API key
  # @raise [IpBlocked] if a CAPTCHA is detected
  # @raise [YouTubeDataUnparsable] if the key cannot be found
  def extract_innertube_api_key(html, video_id)
    api_key = html[/"INNERTUBE_API_KEY":\s*"([a-zA-Z0-9_-]+)"/, 1]
    return api_key if api_key

    raise IpBlocked, video_id if html.include?('class="g-recaptcha"')

    raise YouTubeDataUnparsable, video_id
  end

  # Extract captions JSON from innertube data
  #
  # @param innertube_data [Hash] the innertube API response
  # @param video_id [String] the video ID
  # @return [Hash] the "playerCaptionsTracklistRenderer" object
  # @raise [TranscriptsDisabled] if that object is missing or has no
  #   "captionTracks" key
  def extract_captions_json(innertube_data, video_id)
    assert_playability(innertube_data["playabilityStatus"], video_id)

    captions_json = innertube_data.dig("captions", "playerCaptionsTracklistRenderer")
    raise TranscriptsDisabled, video_id unless captions_json&.key?("captionTracks")

    captions_json
  end

  # Assert that the video is playable
  #
  # Returns silently when there is no status data, no status, or the status
  # is OK; otherwise raises the most specific matching error.
  #
  # @param playability_status_data [Hash, nil] the playability status from API
  # @param video_id [String] the video ID
  # @raise [RequestBlocked] LOGIN_REQUIRED with the bot-detection reason
  # @raise [AgeRestricted] LOGIN_REQUIRED with the age-restriction reason
  # @raise [InvalidVideoId] video unavailable and the "ID" is actually a URL
  # @raise [VideoUnavailable] video unavailable
  # @raise [VideoUnplayable] any other non-OK status
  def assert_playability(playability_status_data, video_id)
    return if playability_status_data.nil?

    status = playability_status_data["status"]
    return if status.nil? || status == PlayabilityStatus::OK

    reason = playability_status_data["reason"]

    if status == PlayabilityStatus::LOGIN_REQUIRED
      raise RequestBlocked, video_id if reason == PlayabilityFailedReason::BOT_DETECTED
      raise AgeRestricted, video_id if reason == PlayabilityFailedReason::AGE_RESTRICTED
    end

    if status == PlayabilityStatus::ERROR && reason == PlayabilityFailedReason::VIDEO_UNAVAILABLE
      # A full URL was passed where a bare video ID is expected.
      raise InvalidVideoId, video_id if video_id.start_with?("http://", "https://")

      raise VideoUnavailable, video_id
    end

    # Collect subreasons for a more detailed error message.
    subreasons = playability_status_data.dig("errorScreen", "playerErrorMessageRenderer", "subreason", "runs") || []
    subreason_texts = subreasons.map { |run| run["text"] || "" }

    raise VideoUnplayable.new(video_id, reason, subreason_texts)
  end

  # Create a consent cookie from the HTML
  #
  # The value is only remembered on this instance; #fetch_html sends it as a
  # Cookie header on later requests.
  #
  # @param html [String] the HTML content
  # @param video_id [String] the video ID
  # @raise [FailedToCreateConsentCookie] if the cookie cannot be created
  def create_consent_cookie(html, video_id)
    match = html.match(/name="v" value="(.*?)"/)
    raise FailedToCreateConsentCookie, video_id if match.nil?

    @consent_value = "YES+#{match[1]}"
  end

  # Fetch the video HTML page, going through the consent form once if needed
  #
  # @param video_id [String] the video ID
  # @return [String] the HTML content
  # @raise [FailedToCreateConsentCookie] if the consent form is still served
  #   after the consent cookie has been sent
  def fetch_video_html(video_id)
    html = fetch_html(video_id)
    return html unless consent_required?(html)

    create_consent_cookie(html, video_id)
    html = fetch_html(video_id)
    raise FailedToCreateConsentCookie, video_id if consent_required?(html)

    html
  end

  # @param html [String] the HTML content
  # @return [Boolean] whether the page contains the consent form
  def consent_required?(html)
    html.include?('action="https://consent.youtube.com/s"')
  end

  # Fetch raw HTML from YouTube
  #
  # @param video_id [String] the video ID
  # @return [String] the HTML content (unescaped)
  def fetch_html(video_id)
    url = format(WATCH_URL, video_id: video_id)
    headers = { "Accept-Language" => "en-US" }

    # Add consent cookie if we have one
    headers["Cookie"] = "CONSENT=#{@consent_value}" if @consent_value

    response = @http_client.get(url) do |req|
      headers.each { |name, value| req.headers[name] = value }
    end

    raise_http_errors(response, video_id)
    # to_s: a nil body is treated as an empty page so callers hit the regular
    # "unparsable" error path instead of a TypeError/NoMethodError.
    CGI.unescapeHTML(response.body.to_s)
  end

  # Fetch data from the Innertube API
  #
  # @param video_id [String] the video ID
  # @param api_key [String] the API key
  # @return [Hash] the API response
  def fetch_innertube_data(video_id, api_key)
    url = format(INNERTUBE_API_URL, api_key: api_key)

    response = @http_client.post(url) do |req|
      req.headers["Content-Type"] = "application/json"
      req.body = JSON.generate({
        "context" => INNERTUBE_CONTEXT,
        "videoId" => video_id
      })
    end

    raise_http_errors(response, video_id)
    JSON.parse(response.body)
  end

  # Raise appropriate errors for HTTP responses
  #
  # @param response [Faraday::Response] the HTTP response
  # @param video_id [String] the video ID
  # @raise [IpBlocked] for 429 responses
  # @raise [YouTubeRequestFailed] for other error responses
  def raise_http_errors(response, video_id)
    case response.status
    when 429
      raise IpBlocked, video_id
    when 400..599
      raise YouTubeRequestFailed.new(video_id, StandardError.new("HTTP #{response.status}"))
    end
  end
end
222
+ end
223
+ end
@@ -0,0 +1,81 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+ require "cgi"
5
+
6
+ module YoutubeRb
7
+ module Transcript
8
# Converts YouTube's XML transcript payload into TranscriptSnippet objects.
class TranscriptParser
  # Tag names left in the snippet text when preserve_formatting is enabled.
  FORMATTING_TAGS = %w[strong em b i mark small del ins sub sup].freeze

  # @param preserve_formatting [Boolean] keep the tags listed in
  #   FORMATTING_TAGS instead of stripping every tag
  def initialize(preserve_formatting: false)
    @preserve_formatting = preserve_formatting
    @html_regex = build_html_regex
  end

  # Build one snippet per non-empty <text> element of the document.
  #
  # @param raw_data [String] the raw XML data from YouTube
  # @return [Array<TranscriptSnippet>] parsed transcript snippets
  def parse(raw_data)
    Nokogiri::XML(raw_data).xpath("//text").each_with_object([]) do |node, snippets|
      raw_text = node.text
      next if raw_text.nil? || raw_text.empty?

      cleaned = process_text(raw_text)
      snippets << TranscriptSnippet.new(
        text: cleaned,
        start: node["start"].to_f,
        # A missing "dur" attribute counts as a zero-length snippet.
        duration: (node["dur"] || "0.0").to_f
      )
    end
  end

  private

  # Pattern matching the tags that #process_text deletes.
  #
  # @return [Regexp] every tag, or every tag except the formatting tags when
  #   preserve_formatting is enabled
  def build_html_regex
    return /<[^>]*>/i unless @preserve_formatting

    kept_tags = FORMATTING_TAGS.join("|")
    # The negative lookahead skips opening and closing formatting tags.
    %r{</?(?!/?(?:#{kept_tags})\b)[^>]*>}i
  end

  # Decode HTML entities, then delete the unwanted tags.
  #
  # @param text [String] the raw text
  # @return [String] processed text
  def process_text(text)
    CGI.unescapeHTML(text).gsub(@html_regex, "")
  end
end
80
+ end
81
+ end
@@ -1,9 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- module Youtube
3
+ module YoutubeRb
4
4
  module Transcript
5
- module Rb
6
- VERSION = "0.1.0"
7
- end
5
+ VERSION = "0.2.1"
8
6
  end
9
7
  end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "transcript/version"
4
+ require_relative "transcript/settings"
5
+ require_relative "transcript/errors"
6
+ require_relative "transcript/transcript_parser"
7
+ require_relative "transcript/transcript"
8
+ require_relative "transcript/transcript_list"
9
+ require_relative "transcript/transcript_list_fetcher"
10
+ require_relative "transcript/api"
11
+ require_relative "transcript/formatters"
12
+
13
module YoutubeRb
  module Transcript
    # Fetch a transcript using a freshly created YouTubeTranscriptApi.
    #
    # @param video_id [String] YouTube video ID
    # @param languages [Array<String>] Language codes in order of preference
    # @param preserve_formatting [Boolean] Whether to preserve HTML formatting
    # @return [FetchedTranscript] The fetched transcript
    def self.fetch(video_id, languages: ["en"], preserve_formatting: false)
      YouTubeTranscriptApi.new.fetch(
        video_id,
        languages: languages,
        preserve_formatting: preserve_formatting
      )
    end

    # List the available transcripts using a freshly created YouTubeTranscriptApi.
    #
    # @param video_id [String] YouTube video ID
    # @return [TranscriptList] List of available transcripts
    def self.list(video_id)
      YouTubeTranscriptApi.new.list(video_id)
    end
  end
end
@@ -0,0 +1,6 @@
1
+ module YoutubeRb
2
+ module Transcript
3
+ VERSION: String
4
+ # VERSION above is the gem's version string (assigned in lib/youtube_rb/transcript/version.rb). RBS writing guide: https://github.com/ruby/rbs#guides
5
+ end
6
+ end
data/spec/api_spec.rb CHANGED
@@ -3,7 +3,7 @@
3
3
  require "spec_helper"
4
4
  require "webmock/rspec"
5
5
 
6
- RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
6
+ RSpec.describe YoutubeRb::Transcript::YouTubeTranscriptApi do
7
7
  let(:api) { described_class.new }
8
8
  let(:video_id) { "dQw4w9WgXcQ" }
9
9
  let(:api_key) { "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8" }
@@ -84,7 +84,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
84
84
 
85
85
  it "creates a TranscriptListFetcher" do
86
86
  api = described_class.new
87
- expect(api.instance_variable_get(:@fetcher)).to be_a(Youtube::Transcript::Rb::TranscriptListFetcher)
87
+ expect(api.instance_variable_get(:@fetcher)).to be_a(YoutubeRb::Transcript::TranscriptListFetcher)
88
88
  end
89
89
  end
90
90
 
@@ -102,7 +102,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
102
102
 
103
103
  it "returns a FetchedTranscript" do
104
104
  result = api.fetch(video_id)
105
- expect(result).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
105
+ expect(result).to be_a(YoutubeRb::Transcript::FetchedTranscript)
106
106
  end
107
107
 
108
108
  it "fetches the transcript with correct video_id" do
@@ -138,7 +138,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
138
138
  it "raises NoTranscriptFound when no language matches" do
139
139
  expect {
140
140
  api.fetch(video_id, languages: ["ja", "ko", "zh"])
141
- }.to raise_error(Youtube::Transcript::Rb::NoTranscriptFound)
141
+ }.to raise_error(YoutubeRb::Transcript::NoTranscriptFound)
142
142
  end
143
143
 
144
144
  context "with preserve_formatting option" do
@@ -182,7 +182,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
182
182
 
183
183
  it "returns a TranscriptList" do
184
184
  result = api.list(video_id)
185
- expect(result).to be_a(Youtube::Transcript::Rb::TranscriptList)
185
+ expect(result).to be_a(YoutubeRb::Transcript::TranscriptList)
186
186
  end
187
187
 
188
188
  it "returns a list with the correct video_id" do
@@ -213,7 +213,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
213
213
  end
214
214
 
215
215
  it "raises VideoUnavailable error" do
216
- expect { api.list(video_id) }.to raise_error(Youtube::Transcript::Rb::VideoUnavailable)
216
+ expect { api.list(video_id) }.to raise_error(YoutubeRb::Transcript::VideoUnavailable)
217
217
  end
218
218
  end
219
219
 
@@ -227,7 +227,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
227
227
  end
228
228
 
229
229
  it "raises TranscriptsDisabled error" do
230
- expect { api.list(video_id) }.to raise_error(Youtube::Transcript::Rb::TranscriptsDisabled)
230
+ expect { api.list(video_id) }.to raise_error(YoutubeRb::Transcript::TranscriptsDisabled)
231
231
  end
232
232
  end
233
233
  end
@@ -272,7 +272,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
272
272
  it "fetches all video transcripts" do
273
273
  results = api.fetch_all(video_ids)
274
274
  results.each do |vid, transcript|
275
- expect(transcript).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
275
+ expect(transcript).to be_a(YoutubeRb::Transcript::FetchedTranscript)
276
276
  expect(transcript.video_id).to eq(vid)
277
277
  end
278
278
  end
@@ -292,7 +292,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
292
292
  expect(yielded.length).to eq(3)
293
293
  yielded.each do |vid, klass|
294
294
  expect(video_ids).to include(vid)
295
- expect(klass).to eq(Youtube::Transcript::Rb::FetchedTranscript)
295
+ expect(klass).to eq(YoutubeRb::Transcript::FetchedTranscript)
296
296
  end
297
297
  end
298
298
 
@@ -316,7 +316,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
316
316
  end
317
317
 
318
318
  it "raises error by default" do
319
- expect { api.fetch_all(failing_video_ids) }.to raise_error(Youtube::Transcript::Rb::VideoUnavailable)
319
+ expect { api.fetch_all(failing_video_ids) }.to raise_error(YoutubeRb::Transcript::VideoUnavailable)
320
320
  end
321
321
 
322
322
  it "continues on error when configured" do
@@ -332,7 +332,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
332
332
  end
333
333
  expect(errors.length).to eq(1)
334
334
  expect(errors.first[0]).to eq("fail_video")
335
- expect(errors.first[1]).to be_a(Youtube::Transcript::Rb::VideoUnavailable)
335
+ expect(errors.first[1]).to be_a(YoutubeRb::Transcript::VideoUnavailable)
336
336
  end
337
337
  end
338
338
 
@@ -356,27 +356,27 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
356
356
  .to_return(status: 200, body: sample_transcript_xml)
357
357
  end
358
358
 
359
- describe "Youtube::Transcript::Rb.fetch" do
359
+ describe "YoutubeRb::Transcript.fetch" do
360
360
  it "fetches a transcript" do
361
- result = Youtube::Transcript::Rb.fetch(video_id)
362
- expect(result).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
361
+ result = YoutubeRb::Transcript.fetch(video_id)
362
+ expect(result).to be_a(YoutubeRb::Transcript::FetchedTranscript)
363
363
  end
364
364
 
365
365
  it "accepts language option" do
366
- result = Youtube::Transcript::Rb.fetch(video_id, languages: ["en"])
366
+ result = YoutubeRb::Transcript.fetch(video_id, languages: ["en"])
367
367
  expect(result.language_code).to eq("en")
368
368
  end
369
369
 
370
370
  it "accepts preserve_formatting option" do
371
- result = Youtube::Transcript::Rb.fetch(video_id, preserve_formatting: false)
372
- expect(result).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
371
+ result = YoutubeRb::Transcript.fetch(video_id, preserve_formatting: false)
372
+ expect(result).to be_a(YoutubeRb::Transcript::FetchedTranscript)
373
373
  end
374
374
  end
375
375
 
376
- describe "Youtube::Transcript::Rb.list" do
376
+ describe "YoutubeRb::Transcript.list" do
377
377
  it "lists available transcripts" do
378
- result = Youtube::Transcript::Rb.list(video_id)
379
- expect(result).to be_a(Youtube::Transcript::Rb::TranscriptList)
378
+ result = YoutubeRb::Transcript.list(video_id)
379
+ expect(result).to be_a(YoutubeRb::Transcript::TranscriptList)
380
380
  end
381
381
  end
382
382
  end