RubyGems - youtube-transcript-rb - Versions diffs - 0.1.0 - Mend

youtube-transcript-rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

checksums.yaml +7 -0
data/.rspec +1 -0
data/.serena/.gitignore +1 -0
data/.serena/memories/code_style_and_conventions.md +35 -0
data/.serena/memories/project_overview.md +40 -0
data/.serena/memories/suggested_commands.md +50 -0
data/.serena/memories/task_completion_checklist.md +25 -0
data/.serena/memories/tech_stack.md +20 -0
data/.serena/project.yml +84 -0
data/LICENSE +21 -0
data/PLAN.md +422 -0
data/README.md +496 -0
data/Rakefile +4 -0
data/lib/youtube/transcript/rb/api.rb +150 -0
data/lib/youtube/transcript/rb/errors.rb +217 -0
data/lib/youtube/transcript/rb/formatters.rb +269 -0
data/lib/youtube/transcript/rb/settings.rb +28 -0
data/lib/youtube/transcript/rb/transcript.rb +239 -0
data/lib/youtube/transcript/rb/transcript_list.rb +170 -0
data/lib/youtube/transcript/rb/transcript_list_fetcher.rb +225 -0
data/lib/youtube/transcript/rb/transcript_parser.rb +83 -0
data/lib/youtube/transcript/rb/version.rb +9 -0
data/lib/youtube/transcript/rb.rb +37 -0
data/sig/youtube/transcript/rb.rbs +8 -0
data/spec/api_spec.rb +397 -0
data/spec/errors_spec.rb +240 -0
data/spec/formatters_spec.rb +436 -0
data/spec/integration_spec.rb +363 -0
data/spec/settings_spec.rb +67 -0
data/spec/spec_helper.rb +109 -0
data/spec/transcript_list_fetcher_spec.rb +520 -0
data/spec/transcript_list_spec.rb +380 -0
data/spec/transcript_parser_spec.rb +355 -0
data/spec/transcript_spec.rb +435 -0
metadata +118 -0

data/spec/integration_spec.rb ADDED Viewed

@@ -0,0 +1,363 @@
+# frozen_string_literal: true
+# Integration tests that make real HTTP requests to YouTube.
+# These tests are skipped by default to avoid network dependencies in CI.
+#
+# To run integration tests:
+#   INTEGRATION=1 bundle exec rspec spec/integration_spec.rb
+#
+# Or run all tests including integration:
+#   INTEGRATION=1 bundle exec rspec
+require "spec_helper"
+RSpec.describe "Integration Tests", :integration do
+  # Skip all integration tests unless INTEGRATION env var is set
+  before(:all) do
+    skip "Integration tests skipped. Set INTEGRATION=1 to run." unless ENV["INTEGRATION"]
+    WebMock.allow_net_connect!
+  end
+  after(:all) do
+    WebMock.disable_net_connect!(allow_localhost: true) if ENV["INTEGRATION"]
+  end
+  # Well-known video IDs that are expected to have transcripts available.
+  # Popular, long-lived videos are used because they are unlikely to be removed.
+  let(:ted_talk_video_id) { "8jPQjjsBbIc" } # TED Talk - usually has good transcripts
+  let(:google_video_id) { "dQw4w9WgXcQ" }   # Rick Astley - Never Gonna Give You Up (very stable). NOTE(review): the name says "google" but the ID is the Rick Astley video, and no example in this file references it
+  describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
+    let(:api) { described_class.new }
+    describe "#list" do
+      it "fetches available transcripts for a video" do
+        transcript_list = api.list(ted_talk_video_id)
+        expect(transcript_list).to be_a(Youtube::Transcript::Rb::TranscriptList)
+        expect(transcript_list.video_id).to eq(ted_talk_video_id)
+        expect(transcript_list.count).to be > 0
+        # Print available transcripts for debugging
+        puts "\nAvailable transcripts for video #{ted_talk_video_id}:"
+        puts transcript_list.to_s
+      end
+      it "returns a TranscriptList that is enumerable" do
+        transcript_list = api.list(ted_talk_video_id)
+        transcript_list.each do |transcript|
+          expect(transcript).to be_a(Youtube::Transcript::Rb::Transcript)
+          expect(transcript.language_code).to be_a(String)
+          expect(transcript.language).to be_a(String)
+        end
+      end
+    end
+    describe "#fetch" do
+      it "fetches English transcript by default" do
+        transcript = api.fetch(ted_talk_video_id)
+        expect(transcript).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
+        expect(transcript.video_id).to eq(ted_talk_video_id)
+        expect(transcript.snippets).not_to be_empty
+        first_snippet = transcript.first
+        expect(first_snippet).to be_a(Youtube::Transcript::Rb::TranscriptSnippet)
+        expect(first_snippet.text).to be_a(String)
+        expect(first_snippet.start).to be_a(Float)
+        expect(first_snippet.duration).to be_a(Float)
+        puts "\nFetched #{transcript.length} snippets"
+        puts "First snippet: #{first_snippet.text[0..50]}..."
+      end
+      it "fetches transcript with specific language" do
+        # Try to fetch English transcript
+        transcript = api.fetch(ted_talk_video_id, languages: ["en"])
+        expect(transcript.language_code).to eq("en")
+        expect(transcript.snippets).not_to be_empty
+      end
+      it "falls back to alternative language if primary not available" do
+        # Request Japanese first, then English as fallback
+        transcript = api.fetch(ted_talk_video_id, languages: ["ja", "en"])
+        expect(["ja", "en"]).to include(transcript.language_code)
+        expect(transcript.snippets).not_to be_empty
+      end
+      it "preserves HTML formatting when requested" do
+        transcript = api.fetch(ted_talk_video_id, preserve_formatting: true)
+        expect(transcript).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
+        # Note: Not all videos have HTML formatting, so we just verify it doesn't break
+      end
+    end
+    describe "#fetch_all" do
+      it "fetches transcripts for multiple videos" do
+        video_ids = [ted_talk_video_id]
+        results = api.fetch_all(video_ids)
+        expect(results).to be_a(Hash)
+        expect(results.keys).to include(ted_talk_video_id)
+        expect(results[ted_talk_video_id]).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
+      end
+      it "continues on error when option is set" do
+        video_ids = [ted_talk_video_id, "invalid_video_id_xyz"]
+        errors = []
+        results = api.fetch_all(video_ids, continue_on_error: true) do |video_id, result|
+          if result.is_a?(StandardError)
+            errors << { video_id: video_id, error: result }
+          end
+        end
+        expect(results).to have_key(ted_talk_video_id)
+        expect(errors.length).to be >= 0 # May or may not have errors
+      end
+    end
+  end
+  describe Youtube::Transcript::Rb do
+    describe ".fetch" do
+      it "provides convenience method for fetching transcripts" do
+        transcript = described_class.fetch(ted_talk_video_id)
+        expect(transcript).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
+        expect(transcript.snippets).not_to be_empty
+      end
+    end
+    describe ".list" do
+      it "provides convenience method for listing transcripts" do
+        transcript_list = described_class.list(ted_talk_video_id)
+        expect(transcript_list).to be_a(Youtube::Transcript::Rb::TranscriptList)
+        expect(transcript_list.count).to be > 0
+      end
+    end
+  end
+  describe "Transcript Translation" do
+    it "translates a transcript to another language" do
+      api = Youtube::Transcript::Rb::YouTubeTranscriptApi.new
+      transcript_list = api.list(ted_talk_video_id)
+      # Find an English transcript
+      begin
+        transcript = transcript_list.find_transcript(["en"])
+      rescue Youtube::Transcript::Rb::NoTranscriptFound
+        skip "No English transcript available for this video"
+      end
+      if transcript.translatable?
+        # Try to translate to Spanish
+        begin
+          translated = transcript.translate("es")
+          fetched = translated.fetch
+          expect(fetched).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
+          expect(fetched.language_code).to eq("es")
+          expect(fetched.snippets).not_to be_empty
+          puts "\nTranslated to Spanish: #{fetched.first.text[0..50]}..."
+        rescue Youtube::Transcript::Rb::TranslationLanguageNotAvailable
+          skip "Spanish translation not available for this video"
+        rescue Youtube::Transcript::Rb::IpBlocked
+          skip "IP blocked by YouTube - try again later or use a proxy"
+        end
+      else
+        skip "Transcript is not translatable"
+      end
+    end
+  end
+  describe "Formatters with Real Data" do
+    let(:api) { Youtube::Transcript::Rb::YouTubeTranscriptApi.new }
+    let(:transcript) { api.fetch(ted_talk_video_id) }
+    describe Youtube::Transcript::Rb::Formatters::JSONFormatter do
+      it "formats real transcript as JSON" do
+        formatter = described_class.new
+        output = formatter.format_transcript(transcript)
+        expect { JSON.parse(output) }.not_to raise_error
+        parsed = JSON.parse(output)
+        expect(parsed).to be_an(Array)
+        expect(parsed.first).to include("text", "start", "duration")
+      end
+    end
+    describe Youtube::Transcript::Rb::Formatters::TextFormatter do
+      it "formats real transcript as plain text" do
+        formatter = described_class.new
+        output = formatter.format_transcript(transcript)
+        expect(output).to be_a(String)
+        # Each snippet becomes one entry, but text may contain newlines
+        # so we just verify it's not empty and has reasonable content
+        expect(output).not_to be_empty
+        expect(output.length).to be > transcript.length
+      end
+    end
+    describe Youtube::Transcript::Rb::Formatters::SRTFormatter do
+      it "formats real transcript as SRT" do
+        formatter = described_class.new
+        output = formatter.format_transcript(transcript)
+        expect(output).to include("-->")
+        expect(output).to match(/^\d+$/m) # Sequence numbers
+        # Verify SRT timestamp format (HH:MM:SS,mmm)
+        expect(output).to match(/\d{2}:\d{2}:\d{2},\d{3}/)
+      end
+    end
+    describe Youtube::Transcript::Rb::Formatters::WebVTTFormatter do
+      it "formats real transcript as WebVTT" do
+        formatter = described_class.new
+        output = formatter.format_transcript(transcript)
+        expect(output).to start_with("WEBVTT")
+        expect(output).to include("-->")
+        # Verify WebVTT timestamp format (HH:MM:SS.mmm)
+        expect(output).to match(/\d{2}:\d{2}:\d{2}\.\d{3}/)
+      end
+    end
+    describe Youtube::Transcript::Rb::Formatters::PrettyPrintFormatter do
+      it "formats real transcript as pretty-printed output" do
+        formatter = described_class.new
+        output = formatter.format_transcript(transcript)
+        expect(output).to be_a(String)
+        expect(output).to include("text")
+        expect(output).to include("start")
+        expect(output).to include("duration")
+      end
+    end
+  end
+  describe "Error Handling" do
+    let(:api) { Youtube::Transcript::Rb::YouTubeTranscriptApi.new }
+    it "raises NoTranscriptFound for unavailable language" do
+      expect {
+        api.fetch(ted_talk_video_id, languages: ["xx"]) # Invalid language code
+      }.to raise_error(Youtube::Transcript::Rb::NoTranscriptFound)
+    end
+    it "raises appropriate error for invalid video ID" do
+      expect {
+        api.fetch("this_is_not_a_valid_video_id_12345")
+      }.to raise_error(Youtube::Transcript::Rb::CouldNotRetrieveTranscript)
+    end
+    it "raises TranscriptsDisabled for video without transcripts" do
+      # This test may need to be updated if the video gets transcripts
+      # or use a known video without transcripts
+      skip "Need a known video ID without transcripts"
+    end
+  end
+  describe "FetchedTranscript Interface" do
+    let(:api) { Youtube::Transcript::Rb::YouTubeTranscriptApi.new }
+    let(:transcript) { api.fetch(ted_talk_video_id) }
+    it "is enumerable" do
+      expect(transcript).to respond_to(:each)
+      expect(transcript).to respond_to(:map)
+      expect(transcript).to respond_to(:select)
+      expect(transcript).to respond_to(:first)
+      # Note: Enumerable doesn't provide #last by default, but we can use to_a.last
+      expect(transcript.to_a.last).to be_a(Youtube::Transcript::Rb::TranscriptSnippet)
+    end
+    it "is indexable" do
+      expect(transcript[0]).to be_a(Youtube::Transcript::Rb::TranscriptSnippet)
+      expect(transcript[-1]).to be_a(Youtube::Transcript::Rb::TranscriptSnippet)
+    end
+    it "has length" do
+      expect(transcript.length).to be > 0
+      expect(transcript.size).to eq(transcript.length)
+    end
+    it "converts to raw data" do
+      raw = transcript.to_raw_data
+      expect(raw).to be_an(Array)
+      expect(raw.first).to be_a(Hash)
+      expect(raw.first).to include("text", "start", "duration")
+    end
+    it "provides metadata" do
+      expect(transcript.video_id).to eq(ted_talk_video_id)
+      expect(transcript.language).to be_a(String)
+      expect(transcript.language_code).to be_a(String)
+      expect([true, false]).to include(transcript.is_generated)
+    end
+  end
+  describe "TranscriptList Interface" do
+    let(:api) { Youtube::Transcript::Rb::YouTubeTranscriptApi.new }
+    let(:transcript_list) { api.list(ted_talk_video_id) }
+    it "is enumerable" do
+      expect(transcript_list).to respond_to(:each)
+      expect(transcript_list).to respond_to(:map)
+      expect(transcript_list).to respond_to(:count)
+    end
+    it "finds transcripts by language" do
+      transcript = transcript_list.find_transcript(["en"])
+      expect(transcript).to be_a(Youtube::Transcript::Rb::Transcript)
+    end
+    it "provides string representation" do
+      output = transcript_list.to_s
+      expect(output).to include("MANUALLY CREATED")
+      expect(output).to include("GENERATED")
+      expect(output).to include(ted_talk_video_id)
+    end
+  end
+  describe "Transcript Object" do
+    let(:api) { Youtube::Transcript::Rb::YouTubeTranscriptApi.new }
+    let(:transcript_list) { api.list(ted_talk_video_id) }
+    let(:transcript) { transcript_list.find_transcript(["en"]) }
+    it "provides metadata properties" do
+      expect(transcript.video_id).to eq(ted_talk_video_id)
+      expect(transcript.language).to be_a(String)
+      expect(transcript.language_code).to eq("en")
+      expect([true, false]).to include(transcript.is_generated)
+    end
+    it "indicates translatability" do
+      expect([true, false]).to include(transcript.translatable?)
+      expect(transcript.translation_languages).to be_an(Array)
+    end
+    it "fetches transcript data" do
+      fetched = transcript.fetch
+      expect(fetched).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
+      expect(fetched.snippets).not_to be_empty
+    end
+    it "provides string representation" do
+      output = transcript.to_s
+      expect(output).to include(transcript.language_code)
+      expect(output).to include(transcript.language)
+    end
+  end
+end

data/spec/settings_spec.rb ADDED Viewed

@@ -0,0 +1,67 @@
+# frozen_string_literal: true
+require "spec_helper"
+require "youtube/transcript/rb"
+RSpec.describe "Youtube::Transcript::Rb Settings" do
+  describe "WATCH_URL" do
+    it "is defined" do
+      expect(Youtube::Transcript::Rb::WATCH_URL).not_to be_nil
+    end
+    it "is a YouTube watch URL template" do
+      expect(Youtube::Transcript::Rb::WATCH_URL).to include("youtube.com/watch")
+    end
+    it "contains video_id placeholder" do
+      expect(Youtube::Transcript::Rb::WATCH_URL).to include("%<video_id>s")
+    end
+    it "can be formatted with a video_id" do
+      url = format(Youtube::Transcript::Rb::WATCH_URL, video_id: "abc123")
+      expect(url).to eq("https://www.youtube.com/watch?v=abc123")
+    end
+  end
+  describe "INNERTUBE_API_URL" do
+    it "is defined" do
+      expect(Youtube::Transcript::Rb::INNERTUBE_API_URL).not_to be_nil
+    end
+    it "is a YouTube API URL" do
+      expect(Youtube::Transcript::Rb::INNERTUBE_API_URL).to include("youtube.com/youtubei")
+    end
+    it "contains api_key placeholder" do
+      expect(Youtube::Transcript::Rb::INNERTUBE_API_URL).to include("%<api_key>s")
+    end
+    it "can be formatted with an api_key" do
+      url = format(Youtube::Transcript::Rb::INNERTUBE_API_URL, api_key: "my_api_key")
+      expect(url).to eq("https://www.youtube.com/youtubei/v1/player?key=my_api_key")
+    end
+  end
+  describe "INNERTUBE_CONTEXT" do
+    it "is defined" do
+      expect(Youtube::Transcript::Rb::INNERTUBE_CONTEXT).not_to be_nil
+    end
+    it "is a frozen hash" do
+      expect(Youtube::Transcript::Rb::INNERTUBE_CONTEXT).to be_frozen
+    end
+    it "contains client configuration" do
+      expect(Youtube::Transcript::Rb::INNERTUBE_CONTEXT).to have_key("client")
+    end
+    it "specifies clientName as ANDROID" do
+      expect(Youtube::Transcript::Rb::INNERTUBE_CONTEXT["client"]["clientName"]).to eq("ANDROID")
+    end
+    it "specifies a clientVersion" do
+      expect(Youtube::Transcript::Rb::INNERTUBE_CONTEXT["client"]["clientVersion"]).not_to be_nil
+      expect(Youtube::Transcript::Rb::INNERTUBE_CONTEXT["client"]["clientVersion"]).to be_a(String)
+    end
+  end
+end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,109 @@
+# This file was generated by the `rspec --init` command. Conventionally, all
+# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
+# The generated `.rspec` file contains `--require spec_helper` which will cause
+# this file to always be loaded, without a need to explicitly require it in any
+# files.
+#
+# Given that it is always loaded, you are encouraged to keep this file as
+# light-weight as possible. Requiring heavyweight dependencies from this file
+# will add to the boot time of your test suite on EVERY test run, even for an
+# individual file that may not need all of that loaded. Instead, consider making
+# a separate helper file that requires the additional dependencies and performs
+# the additional setup, and require it from the spec files that actually need
+# it.
+#
+# See https://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+require "bundler/setup"
+require "youtube/transcript/rb"
+require "webmock/rspec"
+require "faraday"
+WebMock.disable_net_connect!(allow_localhost: true) # default for the suite: no outbound connections except localhost
+RSpec.configure do |config|
+  # Exclude examples tagged `integration: true` unless the INTEGRATION env var is set.
+  # To include them, run: INTEGRATION=1 bundle exec rspec
+  config.filter_run_excluding integration: true unless ENV["INTEGRATION"]
+  # rspec-expectations config goes here. You can use an alternate
+  # assertion/expectation library such as wrong or the stdlib/minitest
+  # assertions if you prefer.
+  config.expect_with :rspec do |expectations|
+    # This option will default to `true` in RSpec 4. It makes the `description`
+    # and `failure_message` of custom matchers include text for helper methods
+    # defined using `chain`, e.g.:
+    #     be_bigger_than(2).and_smaller_than(4).description
+    #     # => "be bigger than 2 and smaller than 4"
+    # ...rather than:
+    #     # => "be bigger than 2"
+    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
+  end
+  # rspec-mocks config goes here. You can use an alternate test double
+  # library (such as bogus or mocha) by changing the `mock_with` option here.
+  config.mock_with :rspec do |mocks|
+    # Prevents you from mocking or stubbing a method that does not exist on
+    # a real object. This is generally recommended, and will default to
+    # `true` in RSpec 4.
+    mocks.verify_partial_doubles = true
+  end
+  # This option will default to `:apply_to_host_groups` in RSpec 4 (and will
+  # have no way to turn it off -- the option exists only for backwards
+  # compatibility in RSpec 3). It causes shared context metadata to be
+  # inherited by the metadata hash of host groups and examples, rather than
+  # triggering implicit auto-inclusion in groups with matching metadata.
+  config.shared_context_metadata_behavior = :apply_to_host_groups
+# The settings below are suggested to provide a good initial experience
+# with RSpec, but feel free to customize to your heart's content. They are currently disabled: everything between the =begin and =end markers below is commented out.
+=begin
+  # This allows you to limit a spec run to individual examples or groups
+  # you care about by tagging them with `:focus` metadata. When nothing
+  # is tagged with `:focus`, all examples get run. RSpec also provides
+  # aliases for `it`, `describe`, and `context` that include `:focus`
+  # metadata: `fit`, `fdescribe` and `fcontext`, respectively.
+  config.filter_run_when_matching :focus
+  # Allows RSpec to persist some state between runs in order to support
+  # the `--only-failures` and `--next-failure` CLI options. We recommend
+  # you configure your source control system to ignore this file.
+  config.example_status_persistence_file_path = "spec/examples.txt"
+  # Limits the available syntax to the non-monkey patched syntax that is
+  # recommended. For more details, see:
+  # https://rspec.info/features/3-12/rspec-core/configuration/zero-monkey-patching-mode/
+  config.disable_monkey_patching!
+  # This setting enables warnings. It's recommended, but in some cases may
+  # be too noisy due to issues in dependencies.
+  config.warnings = true
+  # Many RSpec users commonly either run the entire suite or an individual
+  # file, and it's useful to allow more verbose output when running an
+  # individual spec file.
+  if config.files_to_run.one?
+    # Use the documentation formatter for detailed output,
+    # unless a formatter has already been configured
+    # (e.g. via a command-line flag).
+    config.default_formatter = "doc"
+  end
+  # Print the 10 slowest examples and example groups at the
+  # end of the spec run, to help surface which specs are running
+  # particularly slow.
+  config.profile_examples = 10
+  # Run specs in random order to surface order dependencies. If you find an
+  # order dependency and want to debug it, you can fix the order by providing
+  # the seed, which is printed after each run.
+  #     --seed 1234
+  config.order = :random
+  # Seed global randomization in this process using the `--seed` CLI option.
+  # Setting this allows you to use `--seed` to deterministically reproduce
+  # test failures related to randomization by passing the same `--seed` value
+  # as the one that triggered the failure.
+  Kernel.srand config.seed
+=end
+end