RubyGems - youtube-transcript-rb - Versions diffs - 0.1.0 - Mend

youtube-transcript-rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

checksums.yaml +7 -0
data/.rspec +1 -0
data/.serena/.gitignore +1 -0
data/.serena/memories/code_style_and_conventions.md +35 -0
data/.serena/memories/project_overview.md +40 -0
data/.serena/memories/suggested_commands.md +50 -0
data/.serena/memories/task_completion_checklist.md +25 -0
data/.serena/memories/tech_stack.md +20 -0
data/.serena/project.yml +84 -0
data/LICENSE +21 -0
data/PLAN.md +422 -0
data/README.md +496 -0
data/Rakefile +4 -0
data/lib/youtube/transcript/rb/api.rb +150 -0
data/lib/youtube/transcript/rb/errors.rb +217 -0
data/lib/youtube/transcript/rb/formatters.rb +269 -0
data/lib/youtube/transcript/rb/settings.rb +28 -0
data/lib/youtube/transcript/rb/transcript.rb +239 -0
data/lib/youtube/transcript/rb/transcript_list.rb +170 -0
data/lib/youtube/transcript/rb/transcript_list_fetcher.rb +225 -0
data/lib/youtube/transcript/rb/transcript_parser.rb +83 -0
data/lib/youtube/transcript/rb/version.rb +9 -0
data/lib/youtube/transcript/rb.rb +37 -0
data/sig/youtube/transcript/rb.rbs +8 -0
data/spec/api_spec.rb +397 -0
data/spec/errors_spec.rb +240 -0
data/spec/formatters_spec.rb +436 -0
data/spec/integration_spec.rb +363 -0
data/spec/settings_spec.rb +67 -0
data/spec/spec_helper.rb +109 -0
data/spec/transcript_list_fetcher_spec.rb +520 -0
data/spec/transcript_list_spec.rb +380 -0
data/spec/transcript_parser_spec.rb +355 -0
data/spec/transcript_spec.rb +435 -0
metadata +118 -0

data/lib/youtube/transcript/rb/errors.rb ADDED Viewed

@@ -0,0 +1,217 @@
+# frozen_string_literal: true
+module Youtube
+  module Transcript
+    module Rb
+      # Base error class for all YouTube Transcript errors
+      class Error < StandardError; end
+      # Raised when a transcript could not be retrieved
+      class CouldNotRetrieveTranscript < Error
+        WATCH_URL = "https://www.youtube.com/watch?v=%<video_id>s"
+        # @return [String] the video ID that caused the error
+        attr_reader :video_id
+        # @param video_id [String] the YouTube video ID
+        def initialize(video_id)
+          @video_id = video_id
+          super(build_error_message)
+        end
+        # @return [String] the cause of the error
+        def cause_message
+          self.class::CAUSE_MESSAGE
+        end
+        private
+        def build_error_message
+          video_url = format(WATCH_URL, video_id: @video_id)
+          message = "\nCould not retrieve a transcript for the video #{video_url}!"
+          if cause_message && !cause_message.empty?
+            message += " This is most likely caused by:\n\n#{cause_message}"
+            message += github_referral
+          end
+          message
+        end
+        def github_referral
+          "\n\nIf you are sure that the described cause is not responsible for this error " \
+            "and that a transcript should be retrievable, please create an issue at " \
+            "https://github.com/jdepoix/youtube-transcript-api/issues. " \
+            "Please add which version of youtube_transcript_api you are using " \
+            "and provide the information needed to replicate the error. " \
+            "Also make sure that there are no open issues which already describe your problem!"
+        end
+      end
+      # Raised when YouTube data cannot be parsed.
+      # Adds no behaviour of its own: it only supplies the CAUSE_MESSAGE that
+      # CouldNotRetrieveTranscript#cause_message reads via self.class::CAUSE_MESSAGE.
+      class YouTubeDataUnparsable < CouldNotRetrieveTranscript
+        CAUSE_MESSAGE = "The data required to fetch the transcript is not parsable. This should " \
+                        "not happen, please open an issue (make sure to include the video ID)!"
+      end
+      # Raised when a request to YouTube fails.
+      # The stringified HTTP error is kept as the failure reason and inserted
+      # into the cause text.
+      class YouTubeRequestFailed < CouldNotRetrieveTranscript
+        # Template for the cause; %<reason>s receives the stringified HTTP error
+        CAUSE_MESSAGE = "Request to YouTube failed: %<reason>s"
+        # @return [String] the reason for the failure
+        attr_reader :reason
+        # @param video_id [String] the YouTube video ID
+        # @param http_error [StandardError] the HTTP error that occurred
+        def initialize(video_id, http_error)
+          # Assigned before super because the parent constructor builds the
+          # message immediately and that calls #cause_message.
+          @reason = http_error.to_s
+          super(video_id)
+        end
+        # @return [String] CAUSE_MESSAGE with the failure reason filled in
+        def cause_message
+          failure_reason = @reason
+          format(CAUSE_MESSAGE, reason: failure_reason)
+        end
+      end
+      # Raised when a video is unplayable
+      class VideoUnplayable < CouldNotRetrieveTranscript
+        CAUSE_MESSAGE = "The video is unplayable for the following reason: %<reason>s"
+        # @return [String, nil] the reason the video is unplayable
+        attr_reader :reason
+        # @return [Array<String>] additional sub-reasons
+        attr_reader :sub_reasons
+        # @param video_id [String] the YouTube video ID
+        # @param reason [String, nil] the reason the video is unplayable
+        # @param sub_reasons [Array<String>] additional details
+        def initialize(video_id, reason = nil, sub_reasons = [])
+          @reason = reason
+          @sub_reasons = sub_reasons
+          super(video_id)
+        end
+        def cause_message
+          reason_text = @reason || "No reason specified!"
+          if @sub_reasons.any?
+            sub_reasons_text = @sub_reasons.map { |r| " - #{r}" }.join("\n")
+            reason_text = "#{reason_text}\n\nAdditional Details:\n#{sub_reasons_text}"
+          end
+          format(CAUSE_MESSAGE, reason: reason_text)
+        end
+      end
+      # Raised when a video is unavailable.
+      # Only supplies the CAUSE_MESSAGE embedded into the exception message.
+      class VideoUnavailable < CouldNotRetrieveTranscript
+        CAUSE_MESSAGE = "The video is no longer available"
+      end
+      # Raised when an invalid video ID is provided.
+      # The cause text shows a wrong call (full URL) next to the right one (bare ID).
+      class InvalidVideoId < CouldNotRetrieveTranscript
+        # The example calls sit in single-quoted pieces so their double quotes
+        # need no escaping; the "\n" pieces stay double-quoted to yield newlines.
+        CAUSE_MESSAGE = "You provided an invalid video id. Make sure you are using the video id and NOT the url!\n\n" \
+                        'Do NOT run: `Youtube::Transcript::Rb.fetch("https://www.youtube.com/watch?v=1234")`' \
+                        "\n" \
+                        'Instead run: `Youtube::Transcript::Rb.fetch("1234")`'
+      end
+      # Raised when YouTube blocks the request
+      class RequestBlocked < CouldNotRetrieveTranscript
+        BASE_CAUSE_MESSAGE = "YouTube is blocking requests from your IP. This usually is due to one of the " \
+                             "following reasons:\n" \
+                             "- You have done too many requests and your IP has been blocked by YouTube\n" \
+                             "- You are doing requests from an IP belonging to a cloud provider (like AWS, " \
+                             "Google Cloud Platform, Azure, etc.). Unfortunately, most IPs from cloud " \
+                             "providers are blocked by YouTube.\n\n"
+        CAUSE_MESSAGE = "#{BASE_CAUSE_MESSAGE}" \
+                        "There are two things you can do to work around this:\n" \
+                        "1. Use proxies to hide your IP address.\n" \
+                        "2. (NOT RECOMMENDED) If you authenticate your requests using cookies, you " \
+                        "will be able to continue doing requests for a while. However, YouTube will " \
+                        "eventually permanently ban the account that you have used to authenticate " \
+                        "with! So only do this if you don't mind your account being banned!"
+      end
+      # Raised when YouTube blocks the IP specifically
+      class IpBlocked < RequestBlocked
+        CAUSE_MESSAGE = "#{RequestBlocked::BASE_CAUSE_MESSAGE}" \
+                        "Ways to work around this are using proxies or rotating residential IPs."
+      end
+      # Raised when too many requests are made (HTTP 429).
+      # Only supplies the CAUSE_MESSAGE embedded into the exception message.
+      class TooManyRequests < CouldNotRetrieveTranscript
+        CAUSE_MESSAGE = "YouTube is rate limiting your requests. Please wait before making more requests."
+      end
+      # Raised when transcripts are disabled for a video.
+      # Only supplies the CAUSE_MESSAGE embedded into the exception message.
+      class TranscriptsDisabled < CouldNotRetrieveTranscript
+        CAUSE_MESSAGE = "Subtitles are disabled for this video"
+      end
+      # Raised when a video is age restricted.
+      # Only supplies the CAUSE_MESSAGE embedded into the exception message.
+      # NOTE(review): the text says cookie authentication is "temporarily
+      # unsupported" - confirm that this still describes this gem's behaviour.
+      class AgeRestricted < CouldNotRetrieveTranscript
+        CAUSE_MESSAGE = "This video is age-restricted. Therefore, you are unable to retrieve " \
+                        "transcripts for it without authenticating yourself.\n\n" \
+                        "Unfortunately, Cookie Authentication is temporarily unsupported, " \
+                        "as recent changes in YouTube's API broke the previous implementation."
+      end
+      # Raised when a transcript is not translatable.
+      # Only supplies the CAUSE_MESSAGE embedded into the exception message.
+      class NotTranslatable < CouldNotRetrieveTranscript
+        CAUSE_MESSAGE = "The requested language is not translatable"
+      end
+      # Raised when the requested translation language is not available.
+      # Only supplies the CAUSE_MESSAGE embedded into the exception message.
+      class TranslationLanguageNotAvailable < CouldNotRetrieveTranscript
+        CAUSE_MESSAGE = "The requested translation language is not available"
+      end
+      # Raised when consent cookie creation fails.
+      # Only supplies the CAUSE_MESSAGE embedded into the exception message.
+      class FailedToCreateConsentCookie < CouldNotRetrieveTranscript
+        CAUSE_MESSAGE = "Failed to automatically give consent to saving cookies"
+      end
+      # Raised when no transcript is found for the requested languages
+      class NoTranscriptFound < CouldNotRetrieveTranscript
+        CAUSE_MESSAGE = "No transcripts were found for any of the requested language codes: %<requested_language_codes>s\n\n%<transcript_data>s"
+        # @return [Array<String>] the requested language codes
+        attr_reader :requested_language_codes
+        # @return [Object] the transcript data (TranscriptList)
+        attr_reader :transcript_data
+        # @param video_id [String] the YouTube video ID
+        # @param requested_language_codes [Array<String>] the language codes that were requested
+        # @param transcript_data [Object] the TranscriptList object with available transcripts
+        def initialize(video_id, requested_language_codes, transcript_data)
+          @requested_language_codes = requested_language_codes
+          @transcript_data = transcript_data
+          super(video_id)
+        end
+        def cause_message
+          format(
+            CAUSE_MESSAGE,
+            requested_language_codes: @requested_language_codes.inspect,
+            transcript_data: @transcript_data.to_s
+          )
+        end
+      end
+      # Raised when no transcripts are available for a video.
+      # Only supplies the CAUSE_MESSAGE embedded into the exception message.
+      class NoTranscriptAvailable < CouldNotRetrieveTranscript
+        CAUSE_MESSAGE = "No transcripts are available for this video"
+      end
+      # Raised when a PO token is required to fetch the transcript.
+      # Only supplies the CAUSE_MESSAGE embedded into the exception message.
+      class PoTokenRequired < CouldNotRetrieveTranscript
+        CAUSE_MESSAGE = "The requested video cannot be retrieved without a PO Token. " \
+                        "If this happens, please open a GitHub issue!"
+      end
+    end
+  end
+end

data/lib/youtube/transcript/rb/formatters.rb ADDED Viewed

@@ -0,0 +1,269 @@
+# frozen_string_literal: true
+require "json"
+module Youtube
+  module Transcript
+    module Rb
+      # Module containing all transcript formatters
+      module Formatters
+        # Base formatter class. All formatters should inherit from this class
+        # and implement their own format_transcript and format_transcripts methods.
+        # Both methods here are abstract stubs that always raise NotImplementedError.
+        class Formatter
+          # Format a single transcript
+          #
+          # @param transcript [FetchedTranscript] The transcript to format
+          # @param options [Hash] Additional formatting options
+          # @return [String] The formatted transcript
+          # @raise [NotImplementedError] always, unless overridden by a subclass
+          def format_transcript(transcript, **options)
+            raise NotImplementedError, "Subclass must implement #format_transcript"
+          end
+          # Format multiple transcripts
+          #
+          # @param transcripts [Array<FetchedTranscript>] The transcripts to format
+          # @param options [Hash] Additional formatting options
+          # @return [String] The formatted transcripts
+          # @raise [NotImplementedError] always, unless overridden by a subclass
+          def format_transcripts(transcripts, **options)
+            raise NotImplementedError, "Subclass must implement #format_transcripts"
+          end
+        end
+        # Formats transcript as pretty-printed Ruby data structures
+        class PrettyPrintFormatter < Formatter
+          # Format a single transcript as pretty-printed output
+          #
+          # @param transcript [FetchedTranscript] The transcript to format
+          # @param options [Hash] Options passed to PP.pp
+          # @return [String] Pretty-printed transcript data
+          def format_transcript(transcript, **options)
+            require "pp"
+            PP.pp(transcript.to_raw_data, +"", options[:width] || 79)
+          end
+          # Format multiple transcripts as pretty-printed output
+          #
+          # @param transcripts [Array<FetchedTranscript>] The transcripts to format
+          # @param options [Hash] Options passed to PP.pp
+          # @return [String] Pretty-printed transcripts data
+          def format_transcripts(transcripts, **options)
+            require "pp"
+            data = transcripts.map(&:to_raw_data)
+            PP.pp(data, +"", options[:width] || 79)
+          end
+        end
+        # Formats transcript as JSON
+        class JSONFormatter < Formatter
+          # Format a single transcript as JSON
+          #
+          # @param transcript [FetchedTranscript] The transcript to format
+          # @param options [Hash] Options passed to JSON.generate (e.g., :indent, :space)
+          # @return [String] JSON representation of the transcript
+          def format_transcript(transcript, **options)
+            JSON.generate(transcript.to_raw_data, options)
+          end
+          # Format multiple transcripts as JSON array
+          #
+          # @param transcripts [Array<FetchedTranscript>] The transcripts to format
+          # @param options [Hash] Options passed to JSON.generate
+          # @return [String] JSON array representation of the transcripts
+          def format_transcripts(transcripts, **options)
+            data = transcripts.map(&:to_raw_data)
+            JSON.generate(data, options)
+          end
+        end
+        # Formats transcript as plain text (text only, no timestamps)
+        class TextFormatter < Formatter
+          # Format a single transcript as plain text
+          #
+          # @param transcript [FetchedTranscript] The transcript to format
+          # @param options [Hash] Unused options
+          # @return [String] Plain text with each line separated by newlines
+          def format_transcript(transcript, **options)
+            transcript.map(&:text).join("\n")
+          end
+          # Format multiple transcripts as plain text
+          #
+          # @param transcripts [Array<FetchedTranscript>] The transcripts to format
+          # @param options [Hash] Unused options
+          # @return [String] Plain text with transcripts separated by triple newlines
+          def format_transcripts(transcripts, **options)
+            transcripts.map { |t| format_transcript(t, **options) }.join("\n\n\n")
+          end
+        end
+        # Base class for timestamp-based formatters (SRT, WebVTT)
+        class TextBasedFormatter < TextFormatter
+          # Format a single transcript with timestamps
+          #
+          # @param transcript [FetchedTranscript] The transcript to format
+          # @param options [Hash] Unused options
+          # @return [String] Formatted transcript with timestamps
+          def format_transcript(transcript, **options)
+            lines = []
+            snippets = transcript.to_a
+            snippets.each_with_index do |snippet, i|
+              end_time = snippet.start + snippet.duration
+              # Use next snippet's start time if it starts before current end time
+              if i < snippets.length - 1 && snippets[i + 1].start < end_time
+                end_time = snippets[i + 1].start
+              end
+              time_text = "#{seconds_to_timestamp(snippet.start)} --> #{seconds_to_timestamp(end_time)}"
+              lines << format_transcript_helper(i, time_text, snippet)
+            end
+            format_transcript_header(lines)
+          end
+          protected
+          # Format a timestamp from components
+          #
+          # @param hours [Integer] Hours component
+          # @param mins [Integer] Minutes component
+          # @param secs [Integer] Seconds component
+          # @param ms [Integer] Milliseconds component
+          # @return [String] Formatted timestamp
+          def format_timestamp(hours, mins, secs, ms)
+            raise NotImplementedError, "Subclass must implement #format_timestamp"
+          end
+          # Format the transcript header/wrapper
+          #
+          # @param lines [Array<String>] The formatted lines
+          # @return [String] The complete formatted transcript
+          def format_transcript_header(lines)
+            raise NotImplementedError, "Subclass must implement #format_transcript_header"
+          end
+          # Format a single transcript entry
+          #
+          # @param index [Integer] The entry index (0-based)
+          # @param time_text [String] The formatted time range
+          # @param snippet [TranscriptSnippet] The snippet to format
+          # @return [String] The formatted entry
+          def format_transcript_helper(index, time_text, snippet)
+            raise NotImplementedError, "Subclass must implement #format_transcript_helper"
+          end
+          private
+          # Convert seconds to timestamp string
+          #
+          # @param time [Float] Time in seconds
+          # @return [String] Formatted timestamp
+          def seconds_to_timestamp(time)
+            time = time.to_f
+            hours, remainder = time.divmod(3600)
+            mins, secs_float = remainder.divmod(60)
+            secs = secs_float.to_i
+            ms = ((time - time.to_i) * 1000).round
+            format_timestamp(hours.to_i, mins.to_i, secs, ms)
+          end
+        end
+        # Formats transcript as SRT (SubRip) subtitle format
+        #
+        # @example SRT format
+        #   1
+        #   00:00:00,000 --> 00:00:02,500
+        #   Hello world
+        #
+        #   2
+        #   00:00:02,500 --> 00:00:05,000
+        #   This is a test
+        #
+        class SRTFormatter < TextBasedFormatter
+          protected
+          def format_timestamp(hours, mins, secs, ms)
+            format("%02d:%02d:%02d,%03d", hours, mins, secs, ms)
+          end
+          def format_transcript_header(lines)
+            lines.join("\n\n") + "\n"
+          end
+          def format_transcript_helper(index, time_text, snippet)
+            "#{index + 1}\n#{time_text}\n#{snippet.text}"
+          end
+        end
+        # Formats transcript as WebVTT (Web Video Text Tracks) format
+        #
+        # @example WebVTT format
+        #   WEBVTT
+        #
+        #   00:00:00.000 --> 00:00:02.500
+        #   Hello world
+        #
+        #   00:00:02.500 --> 00:00:05.000
+        #   This is a test
+        #
+        class WebVTTFormatter < TextBasedFormatter
+          protected
+          def format_timestamp(hours, mins, secs, ms)
+            format("%02d:%02d:%02d.%03d", hours, mins, secs, ms)
+          end
+          def format_transcript_header(lines)
+            "WEBVTT\n\n" + lines.join("\n\n") + "\n"
+          end
+          def format_transcript_helper(index, time_text, snippet)
+            "#{time_text}\n#{snippet.text}"
+          end
+        end
+        # Utility class to load formatters by type name
+        class FormatterLoader
+          # Mapping of format names to formatter classes
+          TYPES = {
+            "json" => JSONFormatter,
+            "pretty" => PrettyPrintFormatter,
+            "text" => TextFormatter,
+            "webvtt" => WebVTTFormatter,
+            "srt" => SRTFormatter
+          }.freeze
+          # Error raised when an unknown formatter type is requested
+          class UnknownFormatterType < StandardError
+            # @param formatter_type [String] the name that was not recognised
+            def initialize(formatter_type)
+              supported = TYPES.keys.join(", ")
+              super(
+                "The format '#{formatter_type}' is not supported. " \
+                "Choose one of the following formats: #{supported}"
+              )
+            end
+          end
+          # Load a formatter by type name
+          #
+          # @param formatter_type [String] The formatter type (json, pretty, text, webvtt, srt)
+          # @return [Formatter] An instance of the requested formatter
+          # @raise [UnknownFormatterType] If the formatter type is not supported
+          #
+          # @example
+          #   loader = FormatterLoader.new
+          #   formatter = loader.load("json")
+          #   output = formatter.format_transcript(transcript)
+          #
+          def load(formatter_type = "pretty")
+            # Keys of TYPES are Strings, so the argument is normalised with to_s
+            type_name = formatter_type.to_s
+            formatter_class = TYPES[type_name]
+            raise UnknownFormatterType, type_name if formatter_class.nil?
+            formatter_class.new
+          end
+        end
+      end
+    end
+  end
+end

data/lib/youtube/transcript/rb/settings.rb ADDED Viewed

@@ -0,0 +1,28 @@
+# frozen_string_literal: true
+module Youtube
+  module Transcript
+    module Rb
+      # YouTube watch URL template
+      # NOTE(review): CouldNotRetrieveTranscript in errors.rb defines its own
+      # WATCH_URL with the same value - consider keeping a single definition.
+      # @example
+      #   format(WATCH_URL, video_id: "abc123")
+      #   # => "https://www.youtube.com/watch?v=abc123"
+      WATCH_URL = "https://www.youtube.com/watch?v=%<video_id>s"
+      # YouTube Innertube API URL template
+      # @example
+      #   format(INNERTUBE_API_URL, api_key: "key123")
+      #   # => "https://www.youtube.com/youtubei/v1/player?key=key123"
+      INNERTUBE_API_URL = "https://www.youtube.com/youtubei/v1/player?key=%<api_key>s"
+      # Innertube API context for Android client
+      # Used in POST requests to the Innertube API
+      # NOTE(review): freeze is shallow - only the outer Hash is frozen, the
+      # nested "client" Hash stays mutable. Confirm no caller relies on
+      # mutating it before deep-freezing.
+      INNERTUBE_CONTEXT = {
+        "client" => {
+          "clientName" => "ANDROID",
+          "clientVersion" => "20.10.38"
+        }
+      }.freeze
+    end
+  end
+end