RubyGems - youtube-transcript-rb - Versions diffs - 0.1.0 → 0.2.1 - Mend

youtube-transcript-rb 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of youtube-transcript-rb might be problematic. Click here for more details.

Files changed (35) hide show

checksums.yaml +4 -4
data/README.md +42 -42
data/lib/youtube-transcript-rb.rb +3 -0
data/lib/youtube_rb/transcript/api.rb +148 -0
data/lib/youtube_rb/transcript/errors.rb +215 -0
data/lib/youtube_rb/transcript/formatters.rb +267 -0
data/lib/youtube_rb/transcript/settings.rb +26 -0
data/lib/youtube_rb/transcript/transcript.rb +237 -0
data/lib/youtube_rb/transcript/transcript_list.rb +168 -0
data/lib/youtube_rb/transcript/transcript_list_fetcher.rb +223 -0
data/lib/youtube_rb/transcript/transcript_parser.rb +81 -0
data/lib/{youtube/transcript/rb → youtube_rb/transcript}/version.rb +2 -4
data/lib/youtube_rb/transcript.rb +35 -0
data/sig/youtube_rb/transcript.rbs +6 -0
data/spec/api_spec.rb +20 -20
data/spec/errors_spec.rb +39 -39
data/spec/formatters_spec.rb +36 -36
data/spec/integration_spec.rb +32 -32
data/spec/settings_spec.rb +16 -16
data/spec/spec_helper.rb +1 -1
data/spec/transcript_list_fetcher_spec.rb +27 -27
data/spec/transcript_list_spec.rb +6 -6
data/spec/transcript_parser_spec.rb +3 -3
data/spec/transcript_spec.rb +16 -16
metadata +13 -12
data/lib/youtube/transcript/rb/api.rb +0 -150
data/lib/youtube/transcript/rb/errors.rb +0 -217
data/lib/youtube/transcript/rb/formatters.rb +0 -269
data/lib/youtube/transcript/rb/settings.rb +0 -28
data/lib/youtube/transcript/rb/transcript.rb +0 -239
data/lib/youtube/transcript/rb/transcript_list.rb +0 -170
data/lib/youtube/transcript/rb/transcript_list_fetcher.rb +0 -225
data/lib/youtube/transcript/rb/transcript_parser.rb +0 -83
data/lib/youtube/transcript/rb.rb +0 -37
data/sig/youtube/transcript/rb.rbs +0 -8

data/lib/youtube_rb/transcript/transcript_list_fetcher.rb ADDED Viewed

@@ -0,0 +1,223 @@
+# frozen_string_literal: true
+require "cgi"
+require "json"
+module YoutubeRb
+  module Transcript
+    # Playability status values returned by YouTube (compared with the "status" field in TranscriptListFetcher#assert_playability)
+    module PlayabilityStatus
+      OK = "OK" # playable; assert_playability returns without raising
+      ERROR = "ERROR" # with reason VIDEO_UNAVAILABLE => VideoUnavailable (InvalidVideoId when the ID is a URL)
+      LOGIN_REQUIRED = "LOGIN_REQUIRED" # with reason BOT_DETECTED => RequestBlocked, AGE_RESTRICTED => AgeRestricted
+    end
+    # Reason messages for playability failures; compared by exact string equality with the "reason" field
+    module PlayabilityFailedReason
+      BOT_DETECTED = "Sign in to confirm you're not a bot" # checked only when status is LOGIN_REQUIRED
+      AGE_RESTRICTED = "This video may be inappropriate for some users." # checked only when status is LOGIN_REQUIRED
+      VIDEO_UNAVAILABLE = "This video is unavailable" # checked only when status is ERROR
+    end
+    # Fetches transcript lists from YouTube videos.
+    # This class handles all the HTTP communication with YouTube,
+    # including consent cookie handling and error detection.
+    class TranscriptListFetcher
+      # @param http_client [Faraday::Connection] the HTTP client to use
+      # @param proxy_config [Object, nil] optional proxy configuration
+      def initialize(http_client:, proxy_config: nil)
+        @http_client = http_client
+        @proxy_config = proxy_config
+      end
+      # Fetch the transcript list for a video
+      #
+      # @param video_id [String] the YouTube video ID
+      # @return [TranscriptList] the list of available transcripts
+      # @raise [CouldNotRetrieveTranscript] if transcripts cannot be retrieved
+      def fetch(video_id)
+        TranscriptList.build(
+          http_client: @http_client,
+          video_id: video_id,
+          captions_json: fetch_captions_json(video_id)
+        )
+      end
+      private
+      # Fetch captions JSON with retry support
+      #
+      # @param video_id [String] the YouTube video ID
+      # @param try_number [Integer] current retry attempt
+      # @return [Hash] the captions JSON
+      def fetch_captions_json(video_id, try_number: 0)
+        html = fetch_video_html(video_id)
+        api_key = extract_innertube_api_key(html, video_id)
+        innertube_data = fetch_innertube_data(video_id, api_key)
+        extract_captions_json(innertube_data, video_id)
+      rescue RequestBlocked => e
+        retries = @proxy_config.nil? ? 0 : (@proxy_config.respond_to?(:retries_when_blocked) ? @proxy_config.retries_when_blocked : 0)
+        if try_number + 1 < retries
+          return fetch_captions_json(video_id, try_number: try_number + 1)
+        end
+        raise e
+      end
+      # Extract the INNERTUBE_API_KEY from the video page HTML
+      #
+      # @param html [String] the HTML content
+      # @param video_id [String] the video ID (for error messages)
+      # @return [String] the API key
+      # @raise [IpBlocked] if a CAPTCHA is detected
+      # @raise [YouTubeDataUnparsable] if the key cannot be found
+      def extract_innertube_api_key(html, video_id)
+        match = html.match(/"INNERTUBE_API_KEY":\s*"([a-zA-Z0-9_-]+)"/)
+        if match && match[1]
+          return match[1]
+        end
+        raise IpBlocked, video_id if html.include?('class="g-recaptcha"')
+        raise YouTubeDataUnparsable, video_id
+      end
+      # Extract captions JSON from innertube data
+      #
+      # @param innertube_data [Hash] the innertube API response
+      # @param video_id [String] the video ID
+      # @return [Hash] the captions JSON
+      # @raise [TranscriptsDisabled] if no captions are available
+      def extract_captions_json(innertube_data, video_id)
+        assert_playability(innertube_data["playabilityStatus"], video_id)
+        captions_json = innertube_data.dig("captions", "playerCaptionsTracklistRenderer")
+        if captions_json.nil? || !captions_json.key?("captionTracks")
+          raise TranscriptsDisabled, video_id
+        end
+        captions_json
+      end
+      # Assert that the video is playable
+      #
+      # @param playability_status_data [Hash, nil] the playability status from API
+      # @param video_id [String] the video ID
+      # @raise [Various] depending on the playability status
+      def assert_playability(playability_status_data, video_id)
+        return if playability_status_data.nil?
+        status = playability_status_data["status"]
+        return if status == PlayabilityStatus::OK || status.nil?
+        reason = playability_status_data["reason"]
+        if status == PlayabilityStatus::LOGIN_REQUIRED
+          if reason == PlayabilityFailedReason::BOT_DETECTED
+            raise RequestBlocked, video_id
+          elsif reason == PlayabilityFailedReason::AGE_RESTRICTED
+            raise AgeRestricted, video_id
+          end
+        end
+        if status == PlayabilityStatus::ERROR && reason == PlayabilityFailedReason::VIDEO_UNAVAILABLE
+          if video_id.start_with?("http://") || video_id.start_with?("https://")
+            raise InvalidVideoId, video_id
+          end
+          raise VideoUnavailable, video_id
+        end
+        # Extract subreasons for more detailed error messages
+        subreasons = playability_status_data.dig("errorScreen", "playerErrorMessageRenderer", "subreason", "runs") || []
+        subreason_texts = subreasons.map { |run| run["text"] || "" }
+        raise VideoUnplayable.new(video_id, reason, subreason_texts)
+      end
+      # Create a consent cookie from the HTML
+      #
+      # @param html [String] the HTML content
+      # @param video_id [String] the video ID
+      # @raise [FailedToCreateConsentCookie] if the cookie cannot be created
+      def create_consent_cookie(html, video_id)
+        match = html.match(/name="v" value="(.*?)"/)
+        raise FailedToCreateConsentCookie, video_id if match.nil?
+        # Set the consent cookie
+        # Note: Faraday doesn't have built-in cookie management like requests.Session
+        # We'll need to handle this via headers or middleware
+        @consent_value = "YES+#{match[1]}"
+      end
+      # Fetch the video HTML page
+      #
+      # @param video_id [String] the video ID
+      # @return [String] the HTML content
+      def fetch_video_html(video_id)
+        html = fetch_html(video_id)
+        if html.include?('action="https://consent.youtube.com/s"')
+          create_consent_cookie(html, video_id)
+          html = fetch_html(video_id)
+          if html.include?('action="https://consent.youtube.com/s"')
+            raise FailedToCreateConsentCookie, video_id
+          end
+        end
+        html
+      end
+      # Fetch raw HTML from YouTube
+      #
+      # @param video_id [String] the video ID
+      # @return [String] the HTML content (unescaped)
+      def fetch_html(video_id)
+        url = format(WATCH_URL, video_id: video_id)
+        headers = { "Accept-Language" => "en-US" }
+        # Add consent cookie if we have one
+        headers["Cookie"] = "CONSENT=#{@consent_value}" if @consent_value
+        response = @http_client.get(url) do |req|
+          headers.each { |k, v| req.headers[k] = v }
+        end
+        raise_http_errors(response, video_id)
+        CGI.unescapeHTML(response.body)
+      end
+      # Fetch data from the Innertube API
+      #
+      # @param video_id [String] the video ID
+      # @param api_key [String] the API key
+      # @return [Hash] the API response
+      def fetch_innertube_data(video_id, api_key)
+        url = format(INNERTUBE_API_URL, api_key: api_key)
+        response = @http_client.post(url) do |req|
+          req.headers["Content-Type"] = "application/json"
+          req.body = JSON.generate({
+            "context" => INNERTUBE_CONTEXT,
+            "videoId" => video_id
+          })
+        end
+        raise_http_errors(response, video_id)
+        JSON.parse(response.body)
+      end
+      # Raise appropriate errors for HTTP responses
+      #
+      # @param response [Faraday::Response] the HTTP response
+      # @param video_id [String] the video ID
+      # @raise [IpBlocked] for 429 responses
+      # @raise [YouTubeRequestFailed] for other error responses
+      def raise_http_errors(response, video_id)
+        case response.status
+        when 429
+          raise IpBlocked, video_id
+        when 400..599
+          raise YouTubeRequestFailed.new(video_id, StandardError.new("HTTP #{response.status}"))
+        end
+      end
+    end
+  end
+end

data/lib/youtube_rb/transcript/transcript_parser.rb ADDED Viewed

@@ -0,0 +1,81 @@
+# frozen_string_literal: true
+require "nokogiri"
+require "cgi"
+module YoutubeRb
+  module Transcript
+    # Parses XML transcript data from YouTube
+    class TranscriptParser
+      # HTML formatting tags to preserve when preserve_formatting is enabled
+      FORMATTING_TAGS = %w[
+        strong
+        em
+        b
+        i
+        mark
+        small
+        del
+        ins
+        sub
+        sup
+      ].freeze
+      # @param preserve_formatting [Boolean] whether to preserve HTML formatting tags
+      def initialize(preserve_formatting: false)
+        @preserve_formatting = preserve_formatting
+        @html_regex = build_html_regex
+      end
+      # Parse XML transcript data into TranscriptSnippet objects
+      # @param raw_data [String] the raw XML data from YouTube
+      # @return [Array<TranscriptSnippet>] parsed transcript snippets
+      def parse(raw_data)
+        doc = Nokogiri::XML(raw_data)
+        snippets = []
+        doc.xpath("//text").each do |element|
+          text_content = element.text
+          next if text_content.nil? || text_content.empty?
+          # Unescape HTML entities and remove unwanted HTML tags
+          text = process_text(text_content)
+          snippets << TranscriptSnippet.new(
+            text: text,
+            start: element["start"].to_f,
+            duration: (element["dur"] || "0.0").to_f
+          )
+        end
+        snippets
+      end
+      private
+      # Build regex for removing HTML tags
+      # @return [Regexp]
+      def build_html_regex
+        if @preserve_formatting
+          # Remove all tags except formatting tags
+          formats_pattern = FORMATTING_TAGS.join("|")
+          # Match tags that are NOT the formatting tags
+          Regexp.new("</?(?!/?(?:#{formats_pattern})\\b)[^>]*>", Regexp::IGNORECASE)
+        else
+          # Remove all HTML tags
+          Regexp.new("<[^>]*>", Regexp::IGNORECASE)
+        end
+      end
+      # Process text by unescaping HTML entities and removing unwanted tags
+      # @param text [String] the raw text
+      # @return [String] processed text
+      def process_text(text)
+        # Unescape HTML entities
+        unescaped = CGI.unescapeHTML(text)
+        # Remove unwanted HTML tags
+        unescaped.gsub(@html_regex, "")
+      end
+    end
+  end
+end

data/lib/{youtube/transcript/rb → youtube_rb/transcript}/version.rb RENAMED Viewed

@@ -1,9 +1,7 @@
 # frozen_string_literal: true
-module Youtube
+module YoutubeRb
   module Transcript
-    module Rb
-      VERSION = "0.1.0"
-    end
+    VERSION = "0.2.1"
   end
 end

data/lib/youtube_rb/transcript.rb ADDED Viewed

@@ -0,0 +1,35 @@
+# frozen_string_literal: true
+require_relative "transcript/version"
+require_relative "transcript/settings"
+require_relative "transcript/errors"
+require_relative "transcript/transcript_parser"
+require_relative "transcript/transcript"
+require_relative "transcript/transcript_list"
+require_relative "transcript/transcript_list_fetcher"
+require_relative "transcript/api"
+require_relative "transcript/formatters"
+module YoutubeRb
+  module Transcript
+    class << self
+      # Convenience method to fetch a transcript
+      # @param video_id [String] YouTube video ID
+      # @param languages [Array<String>] Language codes in order of preference
+      # @param preserve_formatting [Boolean] Whether to preserve HTML formatting
+      # @return [FetchedTranscript] The fetched transcript
+      def fetch(video_id, languages: ["en"], preserve_formatting: false)
+        api = YouTubeTranscriptApi.new
+        api.fetch(video_id, languages: languages, preserve_formatting: preserve_formatting)
+      end
+      # Convenience method to list available transcripts
+      # @param video_id [String] YouTube video ID
+      # @return [TranscriptList] List of available transcripts
+      def list(video_id)
+        api = YouTubeTranscriptApi.new
+        api.list(video_id)
+      end
+    end
+  end
+end

data/sig/youtube_rb/transcript.rbs ADDED Viewed

@@ -0,0 +1,6 @@
+module YoutubeRb
+  module Transcript
+    VERSION: String # gem version string, assigned in lib/youtube_rb/transcript/version.rb
+    # See the writing guide of rbs: https://github.com/ruby/rbs#guides
+  end
+end

data/spec/api_spec.rb CHANGED Viewed

@@ -3,7 +3,7 @@
 require "spec_helper"
 require "webmock/rspec"
-RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
+RSpec.describe YoutubeRb::Transcript::YouTubeTranscriptApi do
   let(:api) { described_class.new }
   let(:video_id) { "dQw4w9WgXcQ" }
   let(:api_key) { "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8" }
@@ -84,7 +84,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
     it "creates a TranscriptListFetcher" do
       api = described_class.new
-      expect(api.instance_variable_get(:@fetcher)).to be_a(Youtube::Transcript::Rb::TranscriptListFetcher)
+      expect(api.instance_variable_get(:@fetcher)).to be_a(YoutubeRb::Transcript::TranscriptListFetcher)
     end
   end
@@ -102,7 +102,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
     it "returns a FetchedTranscript" do
       result = api.fetch(video_id)
-      expect(result).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
+      expect(result).to be_a(YoutubeRb::Transcript::FetchedTranscript)
     end
     it "fetches the transcript with correct video_id" do
@@ -138,7 +138,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
     it "raises NoTranscriptFound when no language matches" do
       expect {
         api.fetch(video_id, languages: ["ja", "ko", "zh"])
-      }.to raise_error(Youtube::Transcript::Rb::NoTranscriptFound)
+      }.to raise_error(YoutubeRb::Transcript::NoTranscriptFound)
     end
     context "with preserve_formatting option" do
@@ -182,7 +182,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
     it "returns a TranscriptList" do
       result = api.list(video_id)
-      expect(result).to be_a(Youtube::Transcript::Rb::TranscriptList)
+      expect(result).to be_a(YoutubeRb::Transcript::TranscriptList)
     end
     it "returns a list with the correct video_id" do
@@ -213,7 +213,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
       end
       it "raises VideoUnavailable error" do
-        expect { api.list(video_id) }.to raise_error(Youtube::Transcript::Rb::VideoUnavailable)
+        expect { api.list(video_id) }.to raise_error(YoutubeRb::Transcript::VideoUnavailable)
       end
     end
@@ -227,7 +227,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
       end
       it "raises TranscriptsDisabled error" do
-        expect { api.list(video_id) }.to raise_error(Youtube::Transcript::Rb::TranscriptsDisabled)
+        expect { api.list(video_id) }.to raise_error(YoutubeRb::Transcript::TranscriptsDisabled)
       end
     end
   end
@@ -272,7 +272,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
     it "fetches all video transcripts" do
       results = api.fetch_all(video_ids)
       results.each do |vid, transcript|
-        expect(transcript).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
+        expect(transcript).to be_a(YoutubeRb::Transcript::FetchedTranscript)
         expect(transcript.video_id).to eq(vid)
       end
     end
@@ -292,7 +292,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
       expect(yielded.length).to eq(3)
       yielded.each do |vid, klass|
         expect(video_ids).to include(vid)
-        expect(klass).to eq(Youtube::Transcript::Rb::FetchedTranscript)
+        expect(klass).to eq(YoutubeRb::Transcript::FetchedTranscript)
       end
     end
@@ -316,7 +316,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
       end
       it "raises error by default" do
-        expect { api.fetch_all(failing_video_ids) }.to raise_error(Youtube::Transcript::Rb::VideoUnavailable)
+        expect { api.fetch_all(failing_video_ids) }.to raise_error(YoutubeRb::Transcript::VideoUnavailable)
       end
       it "continues on error when configured" do
@@ -332,7 +332,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
         end
         expect(errors.length).to eq(1)
         expect(errors.first[0]).to eq("fail_video")
-        expect(errors.first[1]).to be_a(Youtube::Transcript::Rb::VideoUnavailable)
+        expect(errors.first[1]).to be_a(YoutubeRb::Transcript::VideoUnavailable)
       end
     end
@@ -356,27 +356,27 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
         .to_return(status: 200, body: sample_transcript_xml)
     end
-    describe "Youtube::Transcript::Rb.fetch" do
+    describe "YoutubeRb::Transcript.fetch" do
       it "fetches a transcript" do
-        result = Youtube::Transcript::Rb.fetch(video_id)
-        expect(result).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
+        result = YoutubeRb::Transcript.fetch(video_id)
+        expect(result).to be_a(YoutubeRb::Transcript::FetchedTranscript)
       end
       it "accepts language option" do
-        result = Youtube::Transcript::Rb.fetch(video_id, languages: ["en"])
+        result = YoutubeRb::Transcript.fetch(video_id, languages: ["en"])
         expect(result.language_code).to eq("en")
       end
       it "accepts preserve_formatting option" do
-        result = Youtube::Transcript::Rb.fetch(video_id, preserve_formatting: false)
-        expect(result).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
+        result = YoutubeRb::Transcript.fetch(video_id, preserve_formatting: false)
+        expect(result).to be_a(YoutubeRb::Transcript::FetchedTranscript)
       end
     end
-    describe "Youtube::Transcript::Rb.list" do
+    describe "YoutubeRb::Transcript.list" do
       it "lists available transcripts" do
-        result = Youtube::Transcript::Rb.list(video_id)
-        expect(result).to be_a(Youtube::Transcript::Rb::TranscriptList)
+        result = YoutubeRb::Transcript.list(video_id)
+        expect(result).to be_a(YoutubeRb::Transcript::TranscriptList)
       end
     end
   end