youtube-transcript-rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,363 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Integration tests that make real HTTP requests to YouTube.
4
+ # These tests are skipped by default to avoid network dependencies in CI.
5
+ #
6
+ # To run integration tests:
7
+ # INTEGRATION=1 bundle exec rspec spec/integration_spec.rb
8
+ #
9
+ # Or run all tests including integration:
10
+ # INTEGRATION=1 bundle exec rspec
11
+
12
+ require "spec_helper"
13
+
14
# End-to-end suite for the gem. Every example performs real HTTP requests
# against YouTube, so the whole group is tagged :integration and is skipped
# unless the INTEGRATION environment variable is set.
RSpec.describe "Integration Tests", :integration do
  # Skip all integration tests unless INTEGRATION env var is set; only lift
  # the WebMock network block once we know the group is really going to run.
  before(:all) do
    skip "Integration tests skipped. Set INTEGRATION=1 to run." unless ENV["INTEGRATION"]
    WebMock.allow_net_connect!
  end

  # Put the WebMock network block back (localhost stays allowed) so groups
  # that run after this one are not affected.
  after(:all) do
    WebMock.disable_net_connect!(allow_localhost: true) if ENV["INTEGRATION"]
  end

  # Well-known videos that should have transcripts available.
  # Using popular, stable videos that are unlikely to be removed.
  let(:ted_talk_video_id) { "8jPQjjsBbIc" } # TED Talk - usually has good transcripts
  # NOTE: despite its name this is Rick Astley - Never Gonna Give You Up (very
  # stable). No example in this group references it at the moment.
  let(:google_video_id) { "dQw4w9WgXcQ" }

  # One API client definition shared by every nested group instead of each
  # group (and one example) building its own.
  let(:api) { Youtube::Transcript::Rb::YouTubeTranscriptApi.new }

  describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
    describe "#list" do
      it "fetches available transcripts for a video" do
        transcript_list = api.list(ted_talk_video_id)

        expect(transcript_list).to be_a(Youtube::Transcript::Rb::TranscriptList)
        expect(transcript_list.video_id).to eq(ted_talk_video_id)
        expect(transcript_list.count).to be > 0

        # Print available transcripts for debugging
        puts "\nAvailable transcripts for video #{ted_talk_video_id}:"
        puts transcript_list.to_s
      end

      it "returns a TranscriptList that is enumerable" do
        transcript_list = api.list(ted_talk_video_id)

        transcript_list.each do |transcript|
          expect(transcript).to be_a(Youtube::Transcript::Rb::Transcript)
          expect(transcript.language_code).to be_a(String)
          expect(transcript.language).to be_a(String)
        end
      end
    end

    describe "#fetch" do
      it "fetches English transcript by default" do
        transcript = api.fetch(ted_talk_video_id)

        expect(transcript).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
        expect(transcript.video_id).to eq(ted_talk_video_id)
        expect(transcript.snippets).not_to be_empty

        first_snippet = transcript.first
        expect(first_snippet).to be_a(Youtube::Transcript::Rb::TranscriptSnippet)
        expect(first_snippet.text).to be_a(String)
        expect(first_snippet.start).to be_a(Float)
        expect(first_snippet.duration).to be_a(Float)

        puts "\nFetched #{transcript.length} snippets"
        puts "First snippet: #{first_snippet.text[0..50]}..."
      end

      it "fetches transcript with specific language" do
        # Try to fetch English transcript
        transcript = api.fetch(ted_talk_video_id, languages: ["en"])

        expect(transcript.language_code).to eq("en")
        expect(transcript.snippets).not_to be_empty
      end

      it "falls back to alternative language if primary not available" do
        # Request Japanese first, then English as fallback
        transcript = api.fetch(ted_talk_video_id, languages: ["ja", "en"])

        expect(["ja", "en"]).to include(transcript.language_code)
        expect(transcript.snippets).not_to be_empty
      end

      it "preserves HTML formatting when requested" do
        transcript = api.fetch(ted_talk_video_id, preserve_formatting: true)

        expect(transcript).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
        # Note: Not all videos have HTML formatting, so we just verify it doesn't break
      end
    end

    describe "#fetch_all" do
      it "fetches transcripts for multiple videos" do
        video_ids = [ted_talk_video_id]
        results = api.fetch_all(video_ids)

        expect(results).to be_a(Hash)
        expect(results.keys).to include(ted_talk_video_id)
        expect(results[ted_talk_video_id]).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
      end

      it "continues on error when option is set" do
        invalid_video_id = "invalid_video_id_xyz"
        video_ids = [ted_talk_video_id, invalid_video_id]
        errors = []

        results = api.fetch_all(video_ids, continue_on_error: true) do |video_id, result|
          errors << { video_id: video_id, error: result } if result.is_a?(StandardError)
        end

        expect(results).to have_key(ted_talk_video_id)
        # An error may or may not be reported through the block, but any error
        # that is reported has to belong to the bogus ID, never to the valid
        # video. (`all` passes on an empty collection.)
        expect(errors).to all(include(video_id: invalid_video_id))
      end
    end
  end

  describe Youtube::Transcript::Rb do
    describe ".fetch" do
      it "provides convenience method for fetching transcripts" do
        transcript = described_class.fetch(ted_talk_video_id)

        expect(transcript).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
        expect(transcript.snippets).not_to be_empty
      end
    end

    describe ".list" do
      it "provides convenience method for listing transcripts" do
        transcript_list = described_class.list(ted_talk_video_id)

        expect(transcript_list).to be_a(Youtube::Transcript::Rb::TranscriptList)
        expect(transcript_list.count).to be > 0
      end
    end
  end

  describe "Transcript Translation" do
    it "translates a transcript to another language" do
      transcript_list = api.list(ted_talk_video_id)

      # Find an English transcript; without one there is nothing to translate.
      transcript =
        begin
          transcript_list.find_transcript(["en"])
        rescue Youtube::Transcript::Rb::NoTranscriptFound
          skip "No English transcript available for this video"
        end

      skip "Transcript is not translatable" unless transcript.translatable?

      # Try to translate to Spanish. Conditions outside the gem's control
      # (language not offered, IP blocked) skip the example instead of
      # failing it.
      begin
        translated = transcript.translate("es")
        fetched = translated.fetch

        expect(fetched).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
        expect(fetched.language_code).to eq("es")
        expect(fetched.snippets).not_to be_empty

        puts "\nTranslated to Spanish: #{fetched.first.text[0..50]}..."
      rescue Youtube::Transcript::Rb::TranslationLanguageNotAvailable
        skip "Spanish translation not available for this video"
      rescue Youtube::Transcript::Rb::IpBlocked
        skip "IP blocked by YouTube - try again later or use a proxy"
      end
    end
  end

  describe "Formatters with Real Data" do
    let(:transcript) { api.fetch(ted_talk_video_id) }

    describe Youtube::Transcript::Rb::Formatters::JSONFormatter do
      it "formats real transcript as JSON" do
        formatter = described_class.new
        output = formatter.format_transcript(transcript)

        expect { JSON.parse(output) }.not_to raise_error
        parsed = JSON.parse(output)
        expect(parsed).to be_an(Array)
        expect(parsed.first).to include("text", "start", "duration")
      end
    end

    describe Youtube::Transcript::Rb::Formatters::TextFormatter do
      it "formats real transcript as plain text" do
        formatter = described_class.new
        output = formatter.format_transcript(transcript)

        expect(output).to be_a(String)
        # Each snippet becomes one entry, but text may contain newlines
        # so we just verify it's not empty and has reasonable content
        expect(output).not_to be_empty
        expect(output.length).to be > transcript.length
      end
    end

    describe Youtube::Transcript::Rb::Formatters::SRTFormatter do
      it "formats real transcript as SRT" do
        formatter = described_class.new
        output = formatter.format_transcript(transcript)

        expect(output).to include("-->")
        # Sequence numbers sit alone on a line. In Ruby `^`/`$` already anchor
        # at line boundaries, so no regexp flag is needed.
        expect(output).to match(/^\d+$/)

        # Verify SRT timestamp format (HH:MM:SS,mmm)
        expect(output).to match(/\d{2}:\d{2}:\d{2},\d{3}/)
      end
    end

    describe Youtube::Transcript::Rb::Formatters::WebVTTFormatter do
      it "formats real transcript as WebVTT" do
        formatter = described_class.new
        output = formatter.format_transcript(transcript)

        expect(output).to start_with("WEBVTT")
        expect(output).to include("-->")

        # Verify WebVTT timestamp format (HH:MM:SS.mmm)
        expect(output).to match(/\d{2}:\d{2}:\d{2}\.\d{3}/)
      end
    end

    describe Youtube::Transcript::Rb::Formatters::PrettyPrintFormatter do
      it "formats real transcript as pretty-printed output" do
        formatter = described_class.new
        output = formatter.format_transcript(transcript)

        expect(output).to be_a(String)
        expect(output).to include("text")
        expect(output).to include("start")
        expect(output).to include("duration")
      end
    end
  end

  describe "Error Handling" do
    it "raises NoTranscriptFound for unavailable language" do
      expect {
        api.fetch(ted_talk_video_id, languages: ["xx"]) # Invalid language code
      }.to raise_error(Youtube::Transcript::Rb::NoTranscriptFound)
    end

    it "raises appropriate error for invalid video ID" do
      expect {
        api.fetch("this_is_not_a_valid_video_id_12345")
      }.to raise_error(Youtube::Transcript::Rb::CouldNotRetrieveTranscript)
    end

    it "raises TranscriptsDisabled for video without transcripts" do
      # Placeholder: needs the ID of a video that is known to have transcripts
      # disabled before a real assertion can be written.
      skip "Need a known video ID without transcripts"
    end
  end

  describe "FetchedTranscript Interface" do
    let(:transcript) { api.fetch(ted_talk_video_id) }

    it "is enumerable" do
      expect(transcript).to respond_to(:each)
      expect(transcript).to respond_to(:map)
      expect(transcript).to respond_to(:select)
      expect(transcript).to respond_to(:first)
      # Note: Enumerable doesn't provide #last by default, but we can use to_a.last
      expect(transcript.to_a.last).to be_a(Youtube::Transcript::Rb::TranscriptSnippet)
    end

    it "is indexable" do
      expect(transcript[0]).to be_a(Youtube::Transcript::Rb::TranscriptSnippet)
      expect(transcript[-1]).to be_a(Youtube::Transcript::Rb::TranscriptSnippet)
    end

    it "has length" do
      expect(transcript.length).to be > 0
      expect(transcript.size).to eq(transcript.length)
    end

    it "converts to raw data" do
      raw = transcript.to_raw_data

      expect(raw).to be_an(Array)
      expect(raw.first).to be_a(Hash)
      expect(raw.first).to include("text", "start", "duration")
    end

    it "provides metadata" do
      expect(transcript.video_id).to eq(ted_talk_video_id)
      expect(transcript.language).to be_a(String)
      expect(transcript.language_code).to be_a(String)
      expect([true, false]).to include(transcript.is_generated)
    end
  end

  describe "TranscriptList Interface" do
    let(:transcript_list) { api.list(ted_talk_video_id) }

    it "is enumerable" do
      expect(transcript_list).to respond_to(:each)
      expect(transcript_list).to respond_to(:map)
      expect(transcript_list).to respond_to(:count)
    end

    it "finds transcripts by language" do
      transcript = transcript_list.find_transcript(["en"])
      expect(transcript).to be_a(Youtube::Transcript::Rb::Transcript)
    end

    it "provides string representation" do
      output = transcript_list.to_s

      expect(output).to include("MANUALLY CREATED")
      expect(output).to include("GENERATED")
      expect(output).to include(ted_talk_video_id)
    end
  end

  describe "Transcript Object" do
    let(:transcript_list) { api.list(ted_talk_video_id) }
    let(:transcript) { transcript_list.find_transcript(["en"]) }

    it "provides metadata properties" do
      expect(transcript.video_id).to eq(ted_talk_video_id)
      expect(transcript.language).to be_a(String)
      expect(transcript.language_code).to eq("en")
      expect([true, false]).to include(transcript.is_generated)
    end

    it "indicates translatability" do
      expect([true, false]).to include(transcript.translatable?)
      expect(transcript.translation_languages).to be_an(Array)
    end

    it "fetches transcript data" do
      fetched = transcript.fetch

      expect(fetched).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
      expect(fetched.snippets).not_to be_empty
    end

    it "provides string representation" do
      output = transcript.to_s

      expect(output).to include(transcript.language_code)
      expect(output).to include(transcript.language)
    end
  end
end
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "spec_helper"
4
+ require "youtube/transcript/rb"
5
+
6
# Sanity checks for the URL templates and the InnerTube client context that
# are exposed as constants directly under Youtube::Transcript::Rb.
RSpec.describe "Youtube::Transcript::Rb Settings" do
  describe "WATCH_URL" do
    let(:watch_url) { Youtube::Transcript::Rb::WATCH_URL }

    it "is defined" do
      expect(watch_url).not_to be_nil
    end

    it "is a YouTube watch URL template" do
      expect(watch_url).to include("youtube.com/watch")
    end

    it "contains video_id placeholder" do
      expect(watch_url).to include("%<video_id>s")
    end

    it "can be formatted with a video_id" do
      formatted = format(watch_url, video_id: "abc123")

      expect(formatted).to eq("https://www.youtube.com/watch?v=abc123")
    end
  end

  describe "INNERTUBE_API_URL" do
    let(:api_url) { Youtube::Transcript::Rb::INNERTUBE_API_URL }

    it "is defined" do
      expect(api_url).not_to be_nil
    end

    it "is a YouTube API URL" do
      expect(api_url).to include("youtube.com/youtubei")
    end

    it "contains api_key placeholder" do
      expect(api_url).to include("%<api_key>s")
    end

    it "can be formatted with an api_key" do
      formatted = format(api_url, api_key: "my_api_key")

      expect(formatted).to eq("https://www.youtube.com/youtubei/v1/player?key=my_api_key")
    end
  end

  describe "INNERTUBE_CONTEXT" do
    let(:innertube_context) { Youtube::Transcript::Rb::INNERTUBE_CONTEXT }
    # The nested hash stored under the "client" key of the context.
    let(:client_settings) { innertube_context["client"] }

    it "is defined" do
      expect(innertube_context).not_to be_nil
    end

    it "is a frozen hash" do
      expect(innertube_context).to be_frozen
    end

    it "contains client configuration" do
      expect(innertube_context).to have_key("client")
    end

    it "specifies clientName as ANDROID" do
      expect(client_settings["clientName"]).to eq("ANDROID")
    end

    it "specifies a clientVersion" do
      client_version = client_settings["clientVersion"]

      expect(client_version).not_to be_nil
      expect(client_version).to be_a(String)
    end
  end
end
@@ -0,0 +1,109 @@
1
+ # This file was generated by the `rspec --init` command. Conventionally, all
2
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
3
+ # The generated `.rspec` file contains `--require spec_helper` which will cause
4
+ # this file to always be loaded, without a need to explicitly require it in any
5
+ # files.
6
+ #
7
+ # Given that it is always loaded, you are encouraged to keep this file as
8
+ # light-weight as possible. Requiring heavyweight dependencies from this file
9
+ # will add to the boot time of your test suite on EVERY test run, even for an
10
+ # individual file that may not need all of that loaded. Instead, consider making
11
+ # a separate helper file that requires the additional dependencies and performs
12
+ # the additional setup, and require it from the spec files that actually need
13
+ # it.
14
+ #
15
+ # See https://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
16
+
17
+ require "bundler/setup"
18
+ require "youtube/transcript/rb"
19
+ require "webmock/rspec"
20
+ require "faraday"
21
+
22
# Block outgoing HTTP for the whole suite by default; only localhost stays
# reachable.
WebMock.disable_net_connect!(allow_localhost: true)

RSpec.configure do |config|
  # Integration examples are filtered out unless INTEGRATION is set.
  # Run them with: INTEGRATION=1 bundle exec rspec
  config.filter_run_excluding(integration: true) unless ENV["INTEGRATION"]

  # rspec-expectations settings. Another assertion/expectation library
  # (such as wrong or the stdlib/minitest assertions) could be plugged in here.
  config.expect_with(:rspec) do |expectation_settings|
    # Will default to `true` in RSpec 4. Custom matcher `description` and
    # `failure_message` then include the text of helpers defined via `chain`:
    #     be_bigger_than(2).and_smaller_than(4).description
    #     # => "be bigger than 2 and smaller than 4"
    # instead of just:
    #     # => "be bigger than 2"
    expectation_settings.include_chain_clauses_in_custom_matcher_descriptions = true
  end

  # rspec-mocks settings. Another test double library (such as bogus or
  # mocha) could be selected by changing the `mock_with` argument.
  config.mock_with(:rspec) do |mock_settings|
    # Refuse to mock or stub a method that does not exist on the real object.
    # Generally recommended; will default to `true` in RSpec 4.
    mock_settings.verify_partial_doubles = true
  end

  # Will be the only behaviour in RSpec 4 (the option exists purely for
  # backwards compatibility in RSpec 3). Shared context metadata is inherited
  # by the metadata hash of host groups and examples, rather than triggering
  # implicit auto-inclusion in groups with matching metadata.
  config.shared_context_metadata_behavior = :apply_to_host_groups

# The settings below are suggested to provide a good initial experience
# with RSpec, but feel free to customize to your heart's content.
=begin
  # This allows you to limit a spec run to individual examples or groups
  # you care about by tagging them with `:focus` metadata. When nothing
  # is tagged with `:focus`, all examples get run. RSpec also provides
  # aliases for `it`, `describe`, and `context` that include `:focus`
  # metadata: `fit`, `fdescribe` and `fcontext`, respectively.
  config.filter_run_when_matching :focus

  # Allows RSpec to persist some state between runs in order to support
  # the `--only-failures` and `--next-failure` CLI options. We recommend
  # you configure your source control system to ignore this file.
  config.example_status_persistence_file_path = "spec/examples.txt"

  # Limits the available syntax to the non-monkey patched syntax that is
  # recommended. For more details, see:
  # https://rspec.info/features/3-12/rspec-core/configuration/zero-monkey-patching-mode/
  config.disable_monkey_patching!

  # This setting enables warnings. It's recommended, but in some cases may
  # be too noisy due to issues in dependencies.
  config.warnings = true

  # Many RSpec users commonly either run the entire suite or an individual
  # file, and it's useful to allow more verbose output when running an
  # individual spec file.
  if config.files_to_run.one?
    # Use the documentation formatter for detailed output,
    # unless a formatter has already been configured
    # (e.g. via a command-line flag).
    config.default_formatter = "doc"
  end

  # Print the 10 slowest examples and example groups at the
  # end of the spec run, to help surface which specs are running
  # particularly slow.
  config.profile_examples = 10

  # Run specs in random order to surface order dependencies. If you find an
  # order dependency and want to debug it, you can fix the order by providing
  # the seed, which is printed after each run.
  #     --seed 1234
  config.order = :random

  # Seed global randomization in this process using the `--seed` CLI option.
  # Setting this allows you to use `--seed` to deterministically reproduce
  # test failures related to randomization by passing the same `--seed` value
  # as the one that triggered the failure.
  Kernel.srand config.seed
=end
end