RubyGems - youtube-transcript-rb - Versions diffs - 0.1.0 - Mend

youtube-transcript-rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

checksums.yaml +7 -0
data/.rspec +1 -0
data/.serena/.gitignore +1 -0
data/.serena/memories/code_style_and_conventions.md +35 -0
data/.serena/memories/project_overview.md +40 -0
data/.serena/memories/suggested_commands.md +50 -0
data/.serena/memories/task_completion_checklist.md +25 -0
data/.serena/memories/tech_stack.md +20 -0
data/.serena/project.yml +84 -0
data/LICENSE +21 -0
data/PLAN.md +422 -0
data/README.md +496 -0
data/Rakefile +4 -0
data/lib/youtube/transcript/rb/api.rb +150 -0
data/lib/youtube/transcript/rb/errors.rb +217 -0
data/lib/youtube/transcript/rb/formatters.rb +269 -0
data/lib/youtube/transcript/rb/settings.rb +28 -0
data/lib/youtube/transcript/rb/transcript.rb +239 -0
data/lib/youtube/transcript/rb/transcript_list.rb +170 -0
data/lib/youtube/transcript/rb/transcript_list_fetcher.rb +225 -0
data/lib/youtube/transcript/rb/transcript_parser.rb +83 -0
data/lib/youtube/transcript/rb/version.rb +9 -0
data/lib/youtube/transcript/rb.rb +37 -0
data/sig/youtube/transcript/rb.rbs +8 -0
data/spec/api_spec.rb +397 -0
data/spec/errors_spec.rb +240 -0
data/spec/formatters_spec.rb +436 -0
data/spec/integration_spec.rb +363 -0
data/spec/settings_spec.rb +67 -0
data/spec/spec_helper.rb +109 -0
data/spec/transcript_list_fetcher_spec.rb +520 -0
data/spec/transcript_list_spec.rb +380 -0
data/spec/transcript_parser_spec.rb +355 -0
data/spec/transcript_spec.rb +435 -0
metadata +118 -0

data/lib/youtube/transcript/rb/transcript_parser.rb ADDED Viewed

@@ -0,0 +1,83 @@
+# frozen_string_literal: true
+require "nokogiri"
+require "cgi"
+module Youtube
+  module Transcript
+    module Rb
+      # Parses XML transcript data from YouTube
+      class TranscriptParser
+        # HTML formatting tags to preserve when preserve_formatting is enabled
+        FORMATTING_TAGS = %w[
+          strong
+          em
+          b
+          i
+          mark
+          small
+          del
+          ins
+          sub
+          sup
+        ].freeze
+        # @param preserve_formatting [Boolean] whether to preserve HTML formatting tags
+        def initialize(preserve_formatting: false)
+          @preserve_formatting = preserve_formatting
+          @html_regex = build_html_regex
+        end
+        # Parse XML transcript data into TranscriptSnippet objects
+        # @param raw_data [String] the raw XML data from YouTube
+        # @return [Array<TranscriptSnippet>] parsed transcript snippets
+        def parse(raw_data)
+          doc = Nokogiri::XML(raw_data)
+          snippets = []
+          doc.xpath("//text").each do |element|
+            text_content = element.text
+            next if text_content.nil? || text_content.empty?
+            # Unescape HTML entities and remove unwanted HTML tags
+            text = process_text(text_content)
+            snippets << TranscriptSnippet.new(
+              text: text,
+              start: element["start"].to_f,
+              duration: (element["dur"] || "0.0").to_f
+            )
+          end
+          snippets
+        end
+        private
+        # Build regex for removing HTML tags
+        # @return [Regexp]
+        def build_html_regex
+          if @preserve_formatting
+            # Remove all tags except formatting tags
+            formats_pattern = FORMATTING_TAGS.join("|")
+            # Match tags that are NOT the formatting tags
+            Regexp.new("</?(?!/?(?:#{formats_pattern})\\b)[^>]*>", Regexp::IGNORECASE)
+          else
+            # Remove all HTML tags
+            Regexp.new("<[^>]*>", Regexp::IGNORECASE)
+          end
+        end
+        # Process text by unescaping HTML entities and removing unwanted tags
+        # @param text [String] the raw text
+        # @return [String] processed text
+        def process_text(text)
+          # Unescape HTML entities
+          unescaped = CGI.unescapeHTML(text)
+          # Remove unwanted HTML tags
+          unescaped.gsub(@html_regex, "")
+        end
+      end
+    end
+  end
+end

data/lib/youtube/transcript/rb/version.rb ADDED Viewed

@@ -0,0 +1,9 @@
+# frozen_string_literal: true
+module Youtube
+  module Transcript
+    module Rb
+      VERSION = "0.1.0" # release version of the youtube-transcript-rb gem
+    end
+  end
+end

data/lib/youtube/transcript/rb.rb ADDED Viewed

@@ -0,0 +1,37 @@
+# frozen_string_literal: true
+require_relative "rb/version"
+require_relative "rb/settings"
+require_relative "rb/errors"
+require_relative "rb/transcript_parser"
+require_relative "rb/transcript"
+require_relative "rb/transcript_list"
+require_relative "rb/transcript_list_fetcher"
+require_relative "rb/api"
+require_relative "rb/formatters"
+module Youtube
+  module Transcript
+    module Rb
+      class << self
+        # Convenience method to fetch a transcript
+        # @param video_id [String] YouTube video ID
+        # @param languages [Array<String>] Language codes in order of preference
+        # @param preserve_formatting [Boolean] Whether to preserve HTML formatting
+        # @return [FetchedTranscript] The fetched transcript
+        def fetch(video_id, languages: ["en"], preserve_formatting: false)
+          api = YouTubeTranscriptApi.new
+          api.fetch(video_id, languages: languages, preserve_formatting: preserve_formatting)
+        end
+        # Convenience method to list available transcripts
+        # @param video_id [String] YouTube video ID
+        # @return [TranscriptList] List of available transcripts
+        def list(video_id)
+          api = YouTubeTranscriptApi.new
+          api.list(video_id)
+        end
+      end
+    end
+  end
+end

data/sig/youtube/transcript/rb.rbs ADDED Viewed

@@ -0,0 +1,8 @@
+module Youtube
+  module Transcript
+    module Rb
+      VERSION: String # gem version constant (set in lib/youtube/transcript/rb/version.rb)
+      # See the RBS writing guides: https://github.com/ruby/rbs#guides
+    end
+  end
+end

data/spec/api_spec.rb ADDED Viewed

@@ -0,0 +1,397 @@
+# frozen_string_literal: true
+require "spec_helper"
+require "webmock/rspec"
+RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do # API specs; HTTP endpoints are stubbed with WebMock
+  let(:api) { described_class.new }
+  let(:video_id) { "dQw4w9WgXcQ" }
+  let(:api_key) { "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8" } # embedded in sample_html and expected in innertube_url
+  let(:watch_url) { "https://www.youtube.com/watch?v=#{video_id}" }
+  let(:innertube_url) { "https://www.youtube.com/youtubei/v1/player?key=#{api_key}" }
+  let(:transcript_url) { "https://www.youtube.com/api/timedtext?v=#{video_id}&lang=en" }
+  let(:sample_html) do # minimal watch page whose script exposes INNERTUBE_API_KEY
+    <<~HTML
+      <!DOCTYPE html>
+      <html>
+      <head><title>Test Video</title></head>
+      <body>
+        <script>
+          var ytcfg = {"INNERTUBE_API_KEY": "#{api_key}"};
+        </script>
+      </body>
+      </html>
+    HTML
+  end
+  let(:sample_innertube_response) do # playable video with "en" (translatable) and "es" caption tracks
+    {
+      "playabilityStatus" => { "status" => "OK" },
+      "captions" => {
+        "playerCaptionsTracklistRenderer" => {
+          "captionTracks" => [
+            {
+              "baseUrl" => transcript_url,
+              "name" => { "runs" => [{ "text" => "English" }] },
+              "languageCode" => "en",
+              "isTranslatable" => true
+            },
+            {
+              "baseUrl" => "https://www.youtube.com/api/timedtext?v=#{video_id}&lang=es",
+              "name" => { "runs" => [{ "text" => "Spanish" }] },
+              "languageCode" => "es",
+              "isTranslatable" => false
+            }
+          ],
+          "translationLanguages" => [
+            { "languageCode" => "fr", "languageName" => { "runs" => [{ "text" => "French" }] } }
+          ]
+        }
+      }
+    }
+  end
+  let(:sample_transcript_xml) do # three <text> snippets served by the timedtext stubs
+    <<~XML
+      <?xml version="1.0" encoding="utf-8"?>
+      <transcript>
+        <text start="0.0" dur="2.5">Hello world</text>
+        <text start="2.5" dur="3.0">This is a test</text>
+        <text start="5.5" dur="2.0">Thank you</text>
+      </transcript>
+    XML
+  end
+  describe "#initialize" do # inspects private ivars to check constructor wiring
+    it "creates a default HTTP client when none provided" do
+      api = described_class.new
+      expect(api.instance_variable_get(:@http_client)).to be_a(Faraday::Connection)
+    end
+    it "accepts a custom HTTP client" do
+      custom_client = Faraday.new
+      api = described_class.new(http_client: custom_client)
+      expect(api.instance_variable_get(:@http_client)).to eq(custom_client)
+    end
+    it "accepts a proxy configuration" do
+      proxy_config = double("proxy_config")
+      api = described_class.new(proxy_config: proxy_config)
+      expect(api.instance_variable_get(:@proxy_config)).to eq(proxy_config)
+    end
+    it "creates a TranscriptListFetcher" do
+      api = described_class.new
+      expect(api.instance_variable_get(:@fetcher)).to be_a(Youtube::Transcript::Rb::TranscriptListFetcher)
+    end
+  end
+  describe "#fetch" do
+    before do # stub the watch page, the innertube player call and the English timedtext URL
+      stub_request(:get, watch_url)
+        .to_return(status: 200, body: sample_html)
+      stub_request(:post, innertube_url)
+        .to_return(status: 200, body: sample_innertube_response.to_json)
+      stub_request(:get, transcript_url)
+        .to_return(status: 200, body: sample_transcript_xml)
+    end
+    it "returns a FetchedTranscript" do
+      result = api.fetch(video_id)
+      expect(result).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
+    end
+    it "fetches the transcript with correct video_id" do
+      result = api.fetch(video_id)
+      expect(result.video_id).to eq(video_id)
+    end
+    it "fetches the transcript with correct language" do
+      result = api.fetch(video_id, languages: ["en"])
+      expect(result.language_code).to eq("en")
+      expect(result.language).to eq("English")
+    end
+    it "contains transcript snippets" do
+      result = api.fetch(video_id)
+      expect(result.length).to eq(3)
+      expect(result.first.text).to eq("Hello world")
+    end
+    it "respects language preference order" do
+      stub_request(:get, "https://www.youtube.com/api/timedtext?v=#{video_id}&lang=es") # Spanish track URL is not stubbed by the shared before block
+        .to_return(status: 200, body: sample_transcript_xml)
+      result = api.fetch(video_id, languages: ["es", "en"])
+      expect(result.language_code).to eq("es")
+    end
+    it "falls back to next language if first not available" do
+      result = api.fetch(video_id, languages: ["ja", "en"]) # "ja" is absent from sample_innertube_response
+      expect(result.language_code).to eq("en")
+    end
+    it "raises NoTranscriptFound when no language matches" do
+      expect {
+        api.fetch(video_id, languages: ["ja", "ko", "zh"])
+      }.to raise_error(Youtube::Transcript::Rb::NoTranscriptFound)
+    end
+    context "with preserve_formatting option" do
+      let(:formatted_transcript_xml) do # markup is entity-escaped inside the <text> nodes
+        <<~XML
+          <?xml version="1.0" encoding="utf-8"?>
+          <transcript>
+            <text start="0.0" dur="2.5">Hello &lt;b&gt;world&lt;/b&gt;</text>
+            <text start="2.5" dur="3.0">This is &lt;i&gt;important&lt;/i&gt;</text>
+          </transcript>
+        XML
+      end
+      before do # re-stubs transcript_url so this context is served the escaped-markup body
+        stub_request(:get, transcript_url)
+          .to_return(status: 200, body: formatted_transcript_xml)
+      end
+      it "preserves formatting when requested" do
+        result = api.fetch(video_id, preserve_formatting: true)
+        expect(result.first.text).to include("<b>")
+        expect(result.first.text).to eq("Hello <b>world</b>")
+      end
+      it "removes formatting by default" do # NOTE(review): passes preserve_formatting: false explicitly, so the default itself is not exercised
+        result = api.fetch(video_id, preserve_formatting: false)
+        expect(result.first.text).not_to include("<b>")
+        expect(result.first.text).to eq("Hello world")
+      end
+    end
+  end
+  describe "#list" do
+    before do # only the watch page and innertube call are stubbed; no timedtext stub here
+      stub_request(:get, watch_url)
+        .to_return(status: 200, body: sample_html)
+      stub_request(:post, innertube_url)
+        .to_return(status: 200, body: sample_innertube_response.to_json)
+    end
+    it "returns a TranscriptList" do
+      result = api.list(video_id)
+      expect(result).to be_a(Youtube::Transcript::Rb::TranscriptList)
+    end
+    it "returns a list with the correct video_id" do
+      result = api.list(video_id)
+      expect(result.video_id).to eq(video_id)
+    end
+    it "includes all available transcripts" do
+      result = api.list(video_id)
+      expect(result.count).to eq(2) # the "en" and "es" caption tracks
+    end
+    it "allows finding specific transcripts" do
+      result = api.list(video_id)
+      transcript = result.find_transcript(["en"])
+      expect(transcript.language_code).to eq("en")
+    end
+    context "when video is unavailable" do
+      before do # re-stubs the innertube call with an ERROR playability status
+        stub_request(:post, innertube_url)
+          .to_return(status: 200, body: {
+            "playabilityStatus" => {
+              "status" => "ERROR",
+              "reason" => "This video is unavailable"
+            }
+          }.to_json)
+      end
+      it "raises VideoUnavailable error" do
+        expect { api.list(video_id) }.to raise_error(Youtube::Transcript::Rb::VideoUnavailable)
+      end
+    end
+    context "when transcripts are disabled" do
+      before do
+        stub_request(:post, innertube_url)
+          .to_return(status: 200, body: {
+            "playabilityStatus" => { "status" => "OK" },
+            "captions" => {} # playable, but no playerCaptionsTracklistRenderer entry
+          }.to_json)
+      end
+      it "raises TranscriptsDisabled error" do
+        expect { api.list(video_id) }.to raise_error(Youtube::Transcript::Rb::TranscriptsDisabled)
+      end
+    end
+  end
+  describe "#fetch_all" do
+    let(:video_ids) { ["video1", "video2", "video3"] }
+    before do
+      video_ids.each do |vid|
+        stub_request(:get, "https://www.youtube.com/watch?v=#{vid}")
+          .to_return(status: 200, body: sample_html)
+        stub_request(:post, innertube_url) # NOTE(review): same URL on every iteration, so these stubs overlap; presumably the last one (video3) wins - confirm
+          .to_return(status: 200, body: {
+            "playabilityStatus" => { "status" => "OK" },
+            "captions" => {
+              "playerCaptionsTracklistRenderer" => {
+                "captionTracks" => [
+                  {
+                    "baseUrl" => "https://www.youtube.com/api/timedtext?v=#{vid}&lang=en",
+                    "name" => { "runs" => [{ "text" => "English" }] },
+                    "languageCode" => "en",
+                    "isTranslatable" => false
+                  }
+                ],
+                "translationLanguages" => []
+              }
+            }
+          }.to_json)
+        stub_request(:get, "https://www.youtube.com/api/timedtext?v=#{vid}&lang=en")
+          .to_return(status: 200, body: sample_transcript_xml)
+      end
+    end
+    it "returns a hash of transcripts" do
+      results = api.fetch_all(video_ids)
+      expect(results).to be_a(Hash)
+      expect(results.keys).to contain_exactly(*video_ids)
+    end
+    it "fetches all video transcripts" do
+      results = api.fetch_all(video_ids)
+      results.each do |vid, transcript|
+        expect(transcript).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
+        expect(transcript.video_id).to eq(vid)
+      end
+    end
+    it "respects language preference" do
+      results = api.fetch_all(video_ids, languages: ["en"])
+      results.each do |_, transcript|
+        expect(transcript.language_code).to eq("en")
+      end
+    end
+    it "yields each result when block given" do
+      yielded = []
+      api.fetch_all(video_ids) do |video_id, result|
+        yielded << [video_id, result.class] # record the class only, so the check below stays simple
+      end
+      expect(yielded.length).to eq(3)
+      yielded.each do |vid, klass|
+        expect(video_ids).to include(vid)
+        expect(klass).to eq(Youtube::Transcript::Rb::FetchedTranscript)
+      end
+    end
+    context "when a video fails" do
+      let(:failing_video_ids) { ["fail_video"] }
+      before do
+        WebMock.reset! # discard the stubs registered by the enclosing before block
+        # Set up a failing video: the watch page loads, but innertube reports ERROR
+        stub_request(:get, "https://www.youtube.com/watch?v=fail_video")
+          .to_return(status: 200, body: sample_html)
+        stub_request(:post, innertube_url)
+          .to_return(status: 200, body: {
+            "playabilityStatus" => {
+              "status" => "ERROR",
+              "reason" => "This video is unavailable"
+            }
+          }.to_json)
+      end
+      it "raises error by default" do
+        expect { api.fetch_all(failing_video_ids) }.to raise_error(Youtube::Transcript::Rb::VideoUnavailable)
+      end
+      it "continues on error when configured" do
+        results = api.fetch_all(failing_video_ids, continue_on_error: true)
+        # The only requested video failed, so no successful results are expected
+        expect(results).to be_empty
+      end
+      it "yields errors when continue_on_error is true" do
+        errors = []
+        api.fetch_all(failing_video_ids, continue_on_error: true) do |video_id, result|
+          errors << [video_id, result] if result.is_a?(StandardError) # keep only yielded exception objects
+        end
+        expect(errors.length).to eq(1)
+        expect(errors.first[0]).to eq("fail_video")
+        expect(errors.first[1]).to be_a(Youtube::Transcript::Rb::VideoUnavailable)
+      end
+    end
+    context "with empty video list" do
+      it "returns empty hash" do
+        results = api.fetch_all([])
+        expect(results).to eq({})
+      end
+    end
+  end
+  describe "convenience module methods" do # exercises Youtube::Transcript::Rb.fetch / .list
+    before do
+      stub_request(:get, watch_url)
+        .to_return(status: 200, body: sample_html)
+      stub_request(:post, innertube_url)
+        .to_return(status: 200, body: sample_innertube_response.to_json)
+      stub_request(:get, transcript_url)
+        .to_return(status: 200, body: sample_transcript_xml)
+    end
+    describe "Youtube::Transcript::Rb.fetch" do
+      it "fetches a transcript" do
+        result = Youtube::Transcript::Rb.fetch(video_id)
+        expect(result).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
+      end
+      it "accepts language option" do
+        result = Youtube::Transcript::Rb.fetch(video_id, languages: ["en"])
+        expect(result.language_code).to eq("en")
+      end
+      it "accepts preserve_formatting option" do # only checks the keyword is accepted; the resulting text is not asserted
+        result = Youtube::Transcript::Rb.fetch(video_id, preserve_formatting: false)
+        expect(result).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
+      end
+    end
+    describe "Youtube::Transcript::Rb.list" do
+      it "lists available transcripts" do
+        result = Youtube::Transcript::Rb.list(video_id)
+        expect(result).to be_a(Youtube::Transcript::Rb::TranscriptList)
+      end
+    end
+  end
+  describe "default HTTP client configuration" do # reads the private @http_client to inspect its options
+    it "sets timeout" do
+      api = described_class.new
+      client = api.instance_variable_get(:@http_client)
+      expect(client.options.timeout).to eq(30)
+    end
+    it "sets open_timeout" do
+      api = described_class.new
+      client = api.instance_variable_get(:@http_client)
+      expect(client.options.open_timeout).to eq(30)
+    end
+  end
+end