RubyGems - youtube-transcript-rb - Versions diffs - 0.1.0 - Mend

youtube-transcript-rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

checksums.yaml +7 -0
data/.rspec +1 -0
data/.serena/.gitignore +1 -0
data/.serena/memories/code_style_and_conventions.md +35 -0
data/.serena/memories/project_overview.md +40 -0
data/.serena/memories/suggested_commands.md +50 -0
data/.serena/memories/task_completion_checklist.md +25 -0
data/.serena/memories/tech_stack.md +20 -0
data/.serena/project.yml +84 -0
data/LICENSE +21 -0
data/PLAN.md +422 -0
data/README.md +496 -0
data/Rakefile +4 -0
data/lib/youtube/transcript/rb/api.rb +150 -0
data/lib/youtube/transcript/rb/errors.rb +217 -0
data/lib/youtube/transcript/rb/formatters.rb +269 -0
data/lib/youtube/transcript/rb/settings.rb +28 -0
data/lib/youtube/transcript/rb/transcript.rb +239 -0
data/lib/youtube/transcript/rb/transcript_list.rb +170 -0
data/lib/youtube/transcript/rb/transcript_list_fetcher.rb +225 -0
data/lib/youtube/transcript/rb/transcript_parser.rb +83 -0
data/lib/youtube/transcript/rb/version.rb +9 -0
data/lib/youtube/transcript/rb.rb +37 -0
data/sig/youtube/transcript/rb.rbs +8 -0
data/spec/api_spec.rb +397 -0
data/spec/errors_spec.rb +240 -0
data/spec/formatters_spec.rb +436 -0
data/spec/integration_spec.rb +363 -0
data/spec/settings_spec.rb +67 -0
data/spec/spec_helper.rb +109 -0
data/spec/transcript_list_fetcher_spec.rb +520 -0
data/spec/transcript_list_spec.rb +380 -0
data/spec/transcript_parser_spec.rb +355 -0
data/spec/transcript_spec.rb +435 -0
metadata +118 -0

data/lib/youtube/transcript/rb/transcript.rb ADDED Viewed

@@ -0,0 +1,239 @@
+# frozen_string_literal: true
+module Youtube
+  module Transcript
+    module Rb
+      # Represents a language available for translation
+      class TranslationLanguage
+        # @return [String] the language name (e.g., "Spanish")
+        attr_reader :language
+        # @return [String] the language code (e.g., "es")
+        attr_reader :language_code
+        # @param language [String] the language name
+        # @param language_code [String] the language code
+        def initialize(language:, language_code:)
+          @language = language
+          @language_code = language_code
+        end
+      end
+      # Represents a single transcript snippet/segment
+      class TranscriptSnippet
+        # @return [String] the text content of the snippet
+        attr_reader :text
+        # @return [Float] the start time in seconds
+        attr_reader :start
+        # @return [Float] the duration in seconds
+        attr_reader :duration
+        # @param text [String] the text content
+        # @param start [Float] the start time in seconds
+        # @param duration [Float] the duration in seconds
+        def initialize(text:, start:, duration:)
+          @text = text
+          @start = start.to_f
+          @duration = duration.to_f
+        end
+        # Convert to hash representation
+        # @return [Hash] hash with text, start, and duration keys
+        def to_h
+          {
+            "text" => @text,
+            "start" => @start,
+            "duration" => @duration
+          }
+        end
+      end
+      # Represents a fetched transcript containing multiple snippets
+      # This class is Enumerable, allowing iteration over snippets
+      class FetchedTranscript
+        include Enumerable
+        # @return [String] the video ID
+        attr_reader :video_id
+        # @return [String] the language name (e.g., "English")
+        attr_reader :language
+        # @return [String] the language code (e.g., "en")
+        attr_reader :language_code
+        # @return [Boolean] whether the transcript was auto-generated
+        attr_reader :is_generated
+        # @return [Array<TranscriptSnippet>] the transcript snippets
+        attr_reader :snippets
+        # @param video_id [String] the YouTube video ID
+        # @param language [String] the language name
+        # @param language_code [String] the language code
+        # @param is_generated [Boolean] whether auto-generated
+        # @param snippets [Array<TranscriptSnippet>] the snippets (optional)
+        def initialize(video_id:, language:, language_code:, is_generated:, snippets: [])
+          @video_id = video_id
+          @language = language
+          @language_code = language_code
+          @is_generated = is_generated
+          @snippets = snippets
+        end
+        # Add a snippet to the transcript
+        # @param snippet [TranscriptSnippet] the snippet to add
+        # @return [self]
+        def add_snippet(snippet)
+          @snippets << snippet
+          self
+        end
+        # Iterate over each snippet
+        # @yield [TranscriptSnippet] each snippet in the transcript
+        def each(&block)
+          @snippets.each(&block)
+        end
+        # Get a snippet by index
+        # @param index [Integer] the index
+        # @return [TranscriptSnippet] the snippet at the given index
+        def [](index)
+          @snippets[index]
+        end
+        # Get the number of snippets
+        # @return [Integer] the count of snippets
+        def length
+          @snippets.length
+        end
+        alias size length
+        # Convert to raw data (array of hashes)
+        # @return [Array<Hash>] array of snippet hashes
+        def to_raw_data
+          @snippets.map(&:to_h)
+        end
+        # Check if transcript was auto-generated
+        # @return [Boolean]
+        def generated?
+          @is_generated
+        end
+      end
+      # Represents transcript metadata and provides fetch/translate capabilities
+      class Transcript
+        # @return [String] the video ID
+        attr_reader :video_id
+        # @return [String] the language name
+        attr_reader :language
+        # @return [String] the language code
+        attr_reader :language_code
+        # @return [Boolean] whether auto-generated
+        attr_reader :is_generated
+        # @return [Array<TranslationLanguage>] available translation languages
+        attr_reader :translation_languages
+        # @param http_client [Faraday::Connection] the HTTP client
+        # @param video_id [String] the YouTube video ID
+        # @param url [String] the transcript URL
+        # @param language [String] the language name
+        # @param language_code [String] the language code
+        # @param is_generated [Boolean] whether auto-generated
+        # @param translation_languages [Array<TranslationLanguage>] available translations
+        def initialize(http_client:, video_id:, url:, language:, language_code:, is_generated:, translation_languages:)
+          @http_client = http_client
+          @video_id = video_id
+          @url = url
+          @language = language
+          @language_code = language_code
+          @is_generated = is_generated
+          @translation_languages = translation_languages
+          @translation_languages_dict = translation_languages.each_with_object({}) do |tl, hash|
+            hash[tl.language_code] = tl.language
+          end
+        end
+        # Fetch the actual transcript data
+        # @param preserve_formatting [Boolean] whether to preserve HTML formatting
+        # @return [FetchedTranscript] the fetched transcript
+        # @raise [PoTokenRequired] if a PO token is required
+        def fetch(preserve_formatting: false)
+          raise PoTokenRequired, @video_id if @url.include?("&exp=xpe")
+          response = @http_client.get(@url)
+          raise_http_errors(response)
+          parser = TranscriptParser.new(preserve_formatting: preserve_formatting)
+          snippets = parser.parse(response.body)
+          FetchedTranscript.new(
+            video_id: @video_id,
+            language: @language,
+            language_code: @language_code,
+            is_generated: @is_generated,
+            snippets: snippets
+          )
+        end
+        # Check if this transcript can be translated
+        # @return [Boolean]
+        def translatable?
+          !@translation_languages.empty?
+        end
+        alias is_translatable translatable?
+        # Translate this transcript to another language
+        # @param language_code [String] the target language code
+        # @return [Transcript] a new Transcript object for the translated version
+        # @raise [NotTranslatable] if the transcript cannot be translated
+        # @raise [TranslationLanguageNotAvailable] if the language is not available
+        def translate(language_code)
+          raise NotTranslatable, @video_id unless translatable?
+          raise TranslationLanguageNotAvailable, @video_id unless @translation_languages_dict.key?(language_code)
+          Transcript.new(
+            http_client: @http_client,
+            video_id: @video_id,
+            url: "#{@url}&tlang=#{language_code}",
+            language: @translation_languages_dict[language_code],
+            language_code: language_code,
+            is_generated: true,
+            translation_languages: []
+          )
+        end
+        # Check if transcript was auto-generated
+        # @return [Boolean]
+        def generated?
+          @is_generated
+        end
+        # String representation of the transcript
+        # @return [String]
+        def to_s
+          translation_desc = translatable? ? "[TRANSLATABLE]" : ""
+          "#{@language_code} (\"#{@language}\")#{translation_desc}"
+        end
+        private
+        def raise_http_errors(response)
+          case response.status
+          when 429
+            raise IpBlocked, @video_id
+          when 400..599
+            raise YouTubeRequestFailed.new(@video_id, StandardError.new("HTTP #{response.status}"))
+          end
+        end
+      end
+    end
+  end
+end

data/lib/youtube/transcript/rb/transcript_list.rb ADDED Viewed

@@ -0,0 +1,170 @@
+# frozen_string_literal: true
+module Youtube
+  module Transcript
+    module Rb
+      # Represents a list of available transcripts for a YouTube video.
+      # This class is Enumerable, allowing iteration over all available transcripts.
+      # It provides functionality to search for transcripts in specific languages.
+      class TranscriptList
+        include Enumerable
+        # @return [String] the video ID this TranscriptList is for
+        attr_reader :video_id
+        # Build a TranscriptList from captions JSON data
+        #
+        # @param http_client [Faraday::Connection] the HTTP client for fetching transcripts
+        # @param video_id [String] the YouTube video ID
+        # @param captions_json [Hash] the captions JSON parsed from YouTube
+        # @return [TranscriptList] the created TranscriptList
+        def self.build(http_client:, video_id:, captions_json:)
+          translation_languages = (captions_json["translationLanguages"] || []).map do |tl|
+            TranslationLanguage.new(
+              language: tl.dig("languageName", "runs", 0, "text") || "",
+              language_code: tl["languageCode"]
+            )
+          end
+          manually_created_transcripts = {}
+          generated_transcripts = {}
+          (captions_json["captionTracks"] || []).each do |caption|
+            is_generated = caption.fetch("kind", "") == "asr"
+            target_dict = is_generated ? generated_transcripts : manually_created_transcripts
+            language_code = caption["languageCode"]
+            transcript_translation_languages = caption.fetch("isTranslatable", false) ? translation_languages : []
+            target_dict[language_code] = Transcript.new(
+              http_client: http_client,
+              video_id: video_id,
+              url: caption["baseUrl"].to_s.gsub("&fmt=srv3", ""),
+              language: caption.dig("name", "runs", 0, "text") || "",
+              language_code: language_code,
+              is_generated: is_generated,
+              translation_languages: transcript_translation_languages
+            )
+          end
+          new(
+            video_id: video_id,
+            manually_created_transcripts: manually_created_transcripts,
+            generated_transcripts: generated_transcripts,
+            translation_languages: translation_languages
+          )
+        end
+        # @param video_id [String] the YouTube video ID
+        # @param manually_created_transcripts [Hash<String, Transcript>] manually created transcripts by language code
+        # @param generated_transcripts [Hash<String, Transcript>] auto-generated transcripts by language code
+        # @param translation_languages [Array<TranslationLanguage>] available translation languages
+        def initialize(video_id:, manually_created_transcripts:, generated_transcripts:, translation_languages:)
+          @video_id = video_id
+          @manually_created_transcripts = manually_created_transcripts
+          @generated_transcripts = generated_transcripts
+          @translation_languages = translation_languages
+        end
+        # Iterate over all transcripts (manually created first, then generated)
+        #
+        # @yield [Transcript] each available transcript
+        # @return [Enumerator] if no block given
+        def each(&block)
+          return to_enum(:each) unless block_given?
+          @manually_created_transcripts.each_value(&block)
+          @generated_transcripts.each_value(&block)
+        end
+        # Find a transcript for the given language codes.
+        # Manually created transcripts are preferred over generated ones.
+        #
+        # @param language_codes [Array<String>] language codes in descending priority
+        # @return [Transcript] the found transcript
+        # @raise [NoTranscriptFound] if no transcript matches the requested languages
+        def find_transcript(language_codes)
+          find_transcript_in(
+            language_codes,
+            [@manually_created_transcripts, @generated_transcripts]
+          )
+        end
+        # Find an automatically generated transcript for the given language codes.
+        #
+        # @param language_codes [Array<String>] language codes in descending priority
+        # @return [Transcript] the found transcript
+        # @raise [NoTranscriptFound] if no generated transcript matches
+        def find_generated_transcript(language_codes)
+          find_transcript_in(language_codes, [@generated_transcripts])
+        end
+        # Find a manually created transcript for the given language codes.
+        #
+        # @param language_codes [Array<String>] language codes in descending priority
+        # @return [Transcript] the found transcript
+        # @raise [NoTranscriptFound] if no manually created transcript matches
+        def find_manually_created_transcript(language_codes)
+          find_transcript_in(language_codes, [@manually_created_transcripts])
+        end
+        # String representation of the transcript list
+        #
+        # @return [String] human-readable description of available transcripts
+        def to_s
+          <<~DESC
+            For this video (#{@video_id}) transcripts are available in the following languages:
+            (MANUALLY CREATED)
+            #{format_language_list(@manually_created_transcripts.values)}
+            (GENERATED)
+            #{format_language_list(@generated_transcripts.values)}
+            (TRANSLATION LANGUAGES)
+            #{format_translation_languages}
+          DESC
+        end
+        private
+        # Find a transcript from the given dictionaries
+        #
+        # @param language_codes [Array<String>] language codes to search for
+        # @param transcript_dicts [Array<Hash>] transcript dictionaries to search
+        # @return [Transcript] the found transcript
+        # @raise [NoTranscriptFound] if no transcript matches
+        def find_transcript_in(language_codes, transcript_dicts)
+          language_codes.each do |language_code|
+            transcript_dicts.each do |dict|
+              return dict[language_code] if dict.key?(language_code)
+            end
+          end
+          raise NoTranscriptFound.new(@video_id, language_codes, self)
+        end
+        # Format a list of transcripts for display
+        #
+        # @param transcripts [Array<Transcript>] transcripts to format
+        # @return [String] formatted list or "None"
+        def format_language_list(transcripts)
+          return "None" if transcripts.empty?
+          transcripts.map { |t| " - #{t}" }.join("\n")
+        end
+        # Format translation languages for display
+        #
+        # @return [String] formatted list or "None"
+        def format_translation_languages
+          return "None" if @translation_languages.empty?
+          @translation_languages.map do |tl|
+            " - #{tl.language_code} (\"#{tl.language}\")"
+          end.join("\n")
+        end
+      end
+    end
+  end
+end

data/lib/youtube/transcript/rb/transcript_list_fetcher.rb ADDED Viewed

@@ -0,0 +1,225 @@
+# frozen_string_literal: true
+require "cgi"
+require "json"
+module Youtube
+  module Transcript
+    module Rb
+      # Playability status values returned by YouTube
+      module PlayabilityStatus
+        OK = "OK"
+        ERROR = "ERROR"
+        LOGIN_REQUIRED = "LOGIN_REQUIRED"
+      end
+      # Reason messages for playability failures
+      module PlayabilityFailedReason
+        BOT_DETECTED = "Sign in to confirm you're not a bot"
+        AGE_RESTRICTED = "This video may be inappropriate for some users."
+        VIDEO_UNAVAILABLE = "This video is unavailable"
+      end
+      # Fetches transcript lists from YouTube videos.
+      # This class handles all the HTTP communication with YouTube,
+      # including consent cookie handling and error detection.
+      class TranscriptListFetcher
+        # @param http_client [Faraday::Connection] the HTTP client to use
+        # @param proxy_config [Object, nil] optional proxy configuration
+        def initialize(http_client:, proxy_config: nil)
+          @http_client = http_client
+          @proxy_config = proxy_config
+        end
+        # Fetch the transcript list for a video
+        #
+        # @param video_id [String] the YouTube video ID
+        # @return [TranscriptList] the list of available transcripts
+        # @raise [CouldNotRetrieveTranscript] if transcripts cannot be retrieved
+        def fetch(video_id)
+          TranscriptList.build(
+            http_client: @http_client,
+            video_id: video_id,
+            captions_json: fetch_captions_json(video_id)
+          )
+        end
+        private
+        # Fetch captions JSON with retry support
+        #
+        # @param video_id [String] the YouTube video ID
+        # @param try_number [Integer] current retry attempt
+        # @return [Hash] the captions JSON
+        def fetch_captions_json(video_id, try_number: 0)
+          html = fetch_video_html(video_id)
+          api_key = extract_innertube_api_key(html, video_id)
+          innertube_data = fetch_innertube_data(video_id, api_key)
+          extract_captions_json(innertube_data, video_id)
+        rescue RequestBlocked => e
+          retries = @proxy_config.nil? ? 0 : (@proxy_config.respond_to?(:retries_when_blocked) ? @proxy_config.retries_when_blocked : 0)
+          if try_number + 1 < retries
+            return fetch_captions_json(video_id, try_number: try_number + 1)
+          end
+          raise e
+        end
+        # Extract the INNERTUBE_API_KEY from the video page HTML
+        #
+        # @param html [String] the HTML content
+        # @param video_id [String] the video ID (for error messages)
+        # @return [String] the API key
+        # @raise [IpBlocked] if a CAPTCHA is detected
+        # @raise [YouTubeDataUnparsable] if the key cannot be found
+        def extract_innertube_api_key(html, video_id)
+          match = html.match(/"INNERTUBE_API_KEY":\s*"([a-zA-Z0-9_-]+)"/)
+          if match && match[1]
+            return match[1]
+          end
+          raise IpBlocked, video_id if html.include?('class="g-recaptcha"')
+          raise YouTubeDataUnparsable, video_id
+        end
+        # Extract captions JSON from innertube data
+        #
+        # @param innertube_data [Hash] the innertube API response
+        # @param video_id [String] the video ID
+        # @return [Hash] the captions JSON
+        # @raise [TranscriptsDisabled] if no captions are available
+        def extract_captions_json(innertube_data, video_id)
+          assert_playability(innertube_data["playabilityStatus"], video_id)
+          captions_json = innertube_data.dig("captions", "playerCaptionsTracklistRenderer")
+          if captions_json.nil? || !captions_json.key?("captionTracks")
+            raise TranscriptsDisabled, video_id
+          end
+          captions_json
+        end
+        # Assert that the video is playable
+        #
+        # @param playability_status_data [Hash, nil] the playability status from API
+        # @param video_id [String] the video ID
+        # @raise [Various] depending on the playability status
+        def assert_playability(playability_status_data, video_id)
+          return if playability_status_data.nil?
+          status = playability_status_data["status"]
+          return if status == PlayabilityStatus::OK || status.nil?
+          reason = playability_status_data["reason"]
+          if status == PlayabilityStatus::LOGIN_REQUIRED
+            if reason == PlayabilityFailedReason::BOT_DETECTED
+              raise RequestBlocked, video_id
+            elsif reason == PlayabilityFailedReason::AGE_RESTRICTED
+              raise AgeRestricted, video_id
+            end
+          end
+          if status == PlayabilityStatus::ERROR && reason == PlayabilityFailedReason::VIDEO_UNAVAILABLE
+            if video_id.start_with?("http://") || video_id.start_with?("https://")
+              raise InvalidVideoId, video_id
+            end
+            raise VideoUnavailable, video_id
+          end
+          # Extract subreasons for more detailed error messages
+          subreasons = playability_status_data.dig("errorScreen", "playerErrorMessageRenderer", "subreason", "runs") || []
+          subreason_texts = subreasons.map { |run| run["text"] || "" }
+          raise VideoUnplayable.new(video_id, reason, subreason_texts)
+        end
+        # Create a consent cookie from the HTML
+        #
+        # @param html [String] the HTML content
+        # @param video_id [String] the video ID
+        # @raise [FailedToCreateConsentCookie] if the cookie cannot be created
+        def create_consent_cookie(html, video_id)
+          match = html.match(/name="v" value="(.*?)"/)
+          raise FailedToCreateConsentCookie, video_id if match.nil?
+          # Set the consent cookie
+          # Note: Faraday doesn't have built-in cookie management like requests.Session
+          # We'll need to handle this via headers or middleware
+          @consent_value = "YES+#{match[1]}"
+        end
+        # Fetch the video HTML page
+        #
+        # @param video_id [String] the video ID
+        # @return [String] the HTML content
+        def fetch_video_html(video_id)
+          html = fetch_html(video_id)
+          if html.include?('action="https://consent.youtube.com/s"')
+            create_consent_cookie(html, video_id)
+            html = fetch_html(video_id)
+            if html.include?('action="https://consent.youtube.com/s"')
+              raise FailedToCreateConsentCookie, video_id
+            end
+          end
+          html
+        end
+        # Fetch raw HTML from YouTube
+        #
+        # @param video_id [String] the video ID
+        # @return [String] the HTML content (unescaped)
+        def fetch_html(video_id)
+          url = format(WATCH_URL, video_id: video_id)
+          headers = { "Accept-Language" => "en-US" }
+          # Add consent cookie if we have one
+          headers["Cookie"] = "CONSENT=#{@consent_value}" if @consent_value
+          response = @http_client.get(url) do |req|
+            headers.each { |k, v| req.headers[k] = v }
+          end
+          raise_http_errors(response, video_id)
+          CGI.unescapeHTML(response.body)
+        end
+        # Fetch data from the Innertube API
+        #
+        # @param video_id [String] the video ID
+        # @param api_key [String] the API key
+        # @return [Hash] the API response
+        def fetch_innertube_data(video_id, api_key)
+          url = format(INNERTUBE_API_URL, api_key: api_key)
+          response = @http_client.post(url) do |req|
+            req.headers["Content-Type"] = "application/json"
+            req.body = JSON.generate({
+              "context" => INNERTUBE_CONTEXT,
+              "videoId" => video_id
+            })
+          end
+          raise_http_errors(response, video_id)
+          JSON.parse(response.body)
+        end
+        # Raise appropriate errors for HTTP responses
+        #
+        # @param response [Faraday::Response] the HTTP response
+        # @param video_id [String] the video ID
+        # @raise [IpBlocked] for 429 responses
+        # @raise [YouTubeRequestFailed] for other error responses
+        def raise_http_errors(response, video_id)
+          case response.status
+          when 429
+            raise IpBlocked, video_id
+          when 400..599
+            raise YouTubeRequestFailed.new(video_id, StandardError.new("HTTP #{response.status}"))
+          end
+        end
+      end
+    end
+  end
+end