youtube-transcript-rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,520 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "spec_helper"
4
+ require "webmock/rspec"
5
+
6
+ RSpec.describe Youtube::Transcript::Rb::TranscriptListFetcher do
7
# Fixed identifiers that every stubbed URL below is built from.
let(:video_id) { "dQw4w9WgXcQ" }
let(:api_key) { "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8" }

# Endpoints the WebMock stubs in this spec are registered against.
let(:watch_url) { "https://www.youtube.com/watch?v=#{video_id}" }
let(:innertube_url) { "https://www.youtube.com/youtubei/v1/player?key=#{api_key}" }

# Fetcher under test, built around a Faraday connection created with no options.
let(:http_client) { Faraday.new }
let(:fetcher) { described_class.new(http_client: http_client) }
14
+
15
# Minimal watch page: a single script tag whose ytcfg object carries
# INNERTUBE_API_KEY set to +api_key+.
let(:sample_html) do
  ytcfg_json = %({"INNERTUBE_API_KEY": "#{api_key}", "OTHER_KEY": "value"})

  <<~PAGE
    <!DOCTYPE html>
    <html>
    <head><title>Test Video</title></head>
    <body>
    <script>
    var ytcfg = #{ytcfg_json};
    </script>
    </body>
    </html>
  PAGE
end
29
+
30
# Player payload with status "OK", one translatable English caption track
# and Spanish listed as a translation language.
let(:sample_innertube_response) do
  english_track = {
    "baseUrl" => "https://www.youtube.com/api/timedtext?v=#{video_id}&lang=en",
    "name" => { "runs" => [{ "text" => "English" }] },
    "languageCode" => "en",
    "isTranslatable" => true
  }
  spanish_translation = {
    "languageCode" => "es",
    "languageName" => { "runs" => [{ "text" => "Spanish" }] }
  }

  {
    "playabilityStatus" => { "status" => "OK" },
    "captions" => {
      "playerCaptionsTracklistRenderer" => {
        "captionTracks" => [english_track],
        "translationLanguages" => [spanish_translation]
      }
    }
  }
end
51
+
52
describe "#initialize" do
  # Reads the instance variable +name+ off +object+. These examples check
  # constructor state by inspecting instance variables directly.
  def ivar(object, name)
    object.instance_variable_get(name)
  end

  it "stores the http_client" do
    expect(ivar(fetcher, :@http_client)).to eq(http_client)
  end

  it "stores the proxy_config when provided" do
    stand_in = double("proxy_config")
    configured = described_class.new(http_client: http_client, proxy_config: stand_in)

    expect(ivar(configured, :@proxy_config)).to eq(stand_in)
  end

  it "defaults proxy_config to nil" do
    expect(ivar(fetcher, :@proxy_config)).to be_nil
  end
end
69
+
70
describe "#fetch" do
  # Happy path: the watch page exposes the API key and the innertube call
  # answers with a JSON payload that lists caption tracks.
  before do
    stub_request(:get, watch_url)
      .to_return(status: 200, body: sample_html)

    stub_request(:post, innertube_url)
      .to_return(status: 200, body: sample_innertube_response.to_json, headers: { "Content-Type" => "application/json" })
  end

  it "returns a TranscriptList" do
    result = fetcher.fetch(video_id)
    expect(result).to be_a(Youtube::Transcript::Rb::TranscriptList)
  end

  it "returns a TranscriptList with the correct video_id" do
    result = fetcher.fetch(video_id)
    expect(result.video_id).to eq(video_id)
  end

  it "makes a GET request to the watch URL" do
    fetcher.fetch(video_id)
    expect(WebMock).to have_requested(:get, watch_url)
  end

  it "makes a POST request to the innertube API" do
    fetcher.fetch(video_id)
    expect(WebMock).to have_requested(:post, innertube_url)
  end

  it "includes Accept-Language header in watch request" do
    fetcher.fetch(video_id)
    expect(WebMock).to have_requested(:get, watch_url)
      .with(headers: { "Accept-Language" => "en-US" })
  end

  it "includes proper body in innertube request" do
    fetcher.fetch(video_id)
    # Braces rather than do/end: a do/end block would bind to `to`
    # instead of `with`, and the body check would never run.
    expect(WebMock).to have_requested(:post, innertube_url)
      .with { |req|
        body = JSON.parse(req.body)
        # `dig` turns a payload lacking "context" or "client" into a plain
        # mismatch instead of a NoMethodError raised inside the matcher.
        body["videoId"] == video_id && body.dig("context", "client", "clientName") == "ANDROID"
      }
  end
end
114
+
115
describe "error handling" do
  # Stubs the watch-page GET. With no arguments it answers 200 with
  # +sample_html+ at +watch_url+.
  def stub_watch_page(url: watch_url, status: 200, body: sample_html)
    stub_request(:get, url).to_return(status: status, body: body)
  end

  # Stubs the innertube POST to answer 200 with +payload+ serialized as JSON.
  def stub_innertube(payload)
    stub_request(:post, innertube_url).to_return(status: 200, body: payload.to_json)
  end

  # Builds a player payload whose playabilityStatus holds +status+ and
  # +reason+, with any +extra+ keys appended after them.
  def playability_failure(status, reason, extra = {})
    { "playabilityStatus" => { "status" => status, "reason" => reason }.merge(extra) }
  end

  # Expects that fetching +id+ (the shared video id by default) raises +error_class+.
  def expect_fetch_to_raise(error_class, id: video_id)
    expect { fetcher.fetch(id) }.to raise_error(error_class)
  end

  describe "when IP is blocked (429 response)" do
    before { stub_watch_page(status: 429, body: "Too Many Requests") }

    it "raises IpBlocked error" do
      expect_fetch_to_raise(Youtube::Transcript::Rb::IpBlocked)
    end
  end

  describe "when CAPTCHA is detected" do
    let(:captcha_html) do
      '<html><body><div class="g-recaptcha" data-sitekey="abc"></div></body></html>'
    end

    before { stub_watch_page(body: captcha_html) }

    it "raises IpBlocked error" do
      expect_fetch_to_raise(Youtube::Transcript::Rb::IpBlocked)
    end
  end

  describe "when API key cannot be found" do
    let(:no_api_key_html) do
      "<html><body>No API key here</body></html>"
    end

    before { stub_watch_page(body: no_api_key_html) }

    it "raises YouTubeDataUnparsable error" do
      expect_fetch_to_raise(Youtube::Transcript::Rb::YouTubeDataUnparsable)
    end
  end

  describe "when video is unavailable" do
    before do
      stub_watch_page
      stub_innertube(playability_failure("ERROR", "This video is unavailable"))
    end

    it "raises VideoUnavailable error" do
      expect_fetch_to_raise(Youtube::Transcript::Rb::VideoUnavailable)
    end
  end

  describe "when video ID looks like a URL" do
    let(:url_video_id) { "https://www.youtube.com/watch?v=abc123" }

    before do
      # The whole URL is used as the id, so it ends up in the v= parameter.
      stub_watch_page(url: "https://www.youtube.com/watch?v=#{url_video_id}")
      stub_innertube(playability_failure("ERROR", "This video is unavailable"))
    end

    it "raises InvalidVideoId error" do
      expect_fetch_to_raise(Youtube::Transcript::Rb::InvalidVideoId, id: url_video_id)
    end
  end

  describe "when video is age restricted" do
    before do
      stub_watch_page
      stub_innertube(playability_failure("LOGIN_REQUIRED", "This video may be inappropriate for some users."))
    end

    it "raises AgeRestricted error" do
      expect_fetch_to_raise(Youtube::Transcript::Rb::AgeRestricted)
    end
  end

  describe "when bot is detected" do
    before do
      stub_watch_page
      stub_innertube(playability_failure("LOGIN_REQUIRED", "Sign in to confirm you're not a bot"))
    end

    it "raises RequestBlocked error" do
      expect_fetch_to_raise(Youtube::Transcript::Rb::RequestBlocked)
    end
  end

  describe "when video is unplayable with subreasons" do
    before do
      error_screen = {
        "errorScreen" => {
          "playerErrorMessageRenderer" => {
            "subreason" => {
              "runs" => [
                { "text" => "This video is private" },
                { "text" => "Please contact the owner" }
              ]
            }
          }
        }
      }

      stub_watch_page
      stub_innertube(playability_failure("ERROR", "Video unavailable", error_screen))
    end

    it "raises VideoUnplayable error" do
      expect_fetch_to_raise(Youtube::Transcript::Rb::VideoUnplayable)
    end
  end

  describe "when transcripts are disabled" do
    before do
      stub_watch_page
      stub_innertube({ "playabilityStatus" => { "status" => "OK" }, "captions" => {} })
    end

    it "raises TranscriptsDisabled error" do
      expect_fetch_to_raise(Youtube::Transcript::Rb::TranscriptsDisabled)
    end
  end

  describe "when captions is nil" do
    before do
      stub_watch_page
      # No "captions" key at all in the payload.
      stub_innertube({ "playabilityStatus" => { "status" => "OK" } })
    end

    it "raises TranscriptsDisabled error" do
      expect_fetch_to_raise(Youtube::Transcript::Rb::TranscriptsDisabled)
    end
  end

  describe "when captionTracks is missing" do
    before do
      renderer_without_tracks = { "translationLanguages" => [] }

      stub_watch_page
      stub_innertube(
        {
          "playabilityStatus" => { "status" => "OK" },
          "captions" => { "playerCaptionsTracklistRenderer" => renderer_without_tracks }
        }
      )
    end

    it "raises TranscriptsDisabled error" do
      expect_fetch_to_raise(Youtube::Transcript::Rb::TranscriptsDisabled)
    end
  end

  describe "when HTTP request fails" do
    before { stub_watch_page(status: 500, body: "Internal Server Error") }

    it "raises YouTubeRequestFailed error" do
      expect_fetch_to_raise(Youtube::Transcript::Rb::YouTubeRequestFailed)
    end
  end

  describe "when innertube API returns error" do
    before do
      stub_watch_page
      # Non-JSON 403 body, so this stub does not go through stub_innertube.
      stub_request(:post, innertube_url).to_return(status: 403, body: "Forbidden")
    end

    it "raises YouTubeRequestFailed error" do
      expect_fetch_to_raise(Youtube::Transcript::Rb::YouTubeRequestFailed)
    end
  end
end
343
+
344
describe "consent cookie handling" do
  # Renders a page containing a form that posts to consent.youtube.com,
  # with +input_tag+ as the form's only child.
  def consent_page(input_tag)
    <<~PAGE
      <!DOCTYPE html>
      <html>
      <body>
      <form action="https://consent.youtube.com/s">
      #{input_tag}
      </form>
      </body>
      </html>
    PAGE
  end

  # Consent page whose form carries the "v" input.
  let(:consent_html) do
    consent_page('<input name="v" value="cb.20231201-01-p1.en+FX+999">')
  end

  context "when consent is required and resolved" do
    before do
      # First GET answers with the consent page, every later GET with the real page.
      stub_request(:get, watch_url)
        .to_return({ status: 200, body: consent_html })
        .then
        .to_return({ status: 200, body: sample_html })

      stub_request(:post, innertube_url)
        .to_return(status: 200, body: sample_innertube_response.to_json)
    end

    it "retries after setting consent cookie" do
      transcript_list = fetcher.fetch(video_id)

      expect(transcript_list).to be_kind_of(Youtube::Transcript::Rb::TranscriptList)
      expect(a_request(:get, watch_url)).to have_been_made.twice
    end

    it "includes consent cookie in second request" do
      fetcher.fetch(video_id)

      with_consent_cookie = a_request(:get, watch_url).with(headers: { "Cookie" => /CONSENT=YES\+/ })
      expect(with_consent_cookie).to have_been_made
    end
  end

  context "when consent cannot be resolved" do
    # Same consent form, but without an input named "v".
    let(:no_value_consent_html) do
      consent_page('<input name="other" value="something">')
    end

    before do
      stub_request(:get, watch_url).to_return(status: 200, body: no_value_consent_html)
    end

    it "raises FailedToCreateConsentCookie error" do
      expect { fetcher.fetch(video_id) }.to raise_error(Youtube::Transcript::Rb::FailedToCreateConsentCookie)
    end
  end

  context "when consent page persists after cookie" do
    before do
      # Every GET, including any retry, gets the consent page again.
      stub_request(:get, watch_url).to_return(status: 200, body: consent_html)
    end

    it "raises FailedToCreateConsentCookie error" do
      expect { fetcher.fetch(video_id) }.to raise_error(Youtube::Transcript::Rb::FailedToCreateConsentCookie)
    end
  end
end
418
+
419
describe "HTML unescaping" do
  # Watch page in which the ytcfg object's quotes appear as &quot; entities
  # and the title contains &amp;.
  let(:escaped_html) do
    escaped_ytcfg = %({&quot;INNERTUBE_API_KEY&quot;: &quot;#{api_key}&quot;})

    <<~PAGE
      <!DOCTYPE html>
      <html>
      <head><title>Test &amp; Video</title></head>
      <body>
      <script>
      var ytcfg = #{escaped_ytcfg};
      </script>
      </body>
      </html>
    PAGE
  end

  before do
    stub_request(:get, watch_url).to_return(status: 200, body: escaped_html)
    stub_request(:post, innertube_url).to_return(status: 200, body: sample_innertube_response.to_json)
  end

  it "properly unescapes HTML entities" do
    transcript_list = fetcher.fetch(video_id)

    expect(transcript_list).to be_kind_of(Youtube::Transcript::Rb::TranscriptList)
  end
end
447
+
448
describe "with proxy config" do
  # Stand-in proxy configuration that answers retries_when_blocked with 3.
  let(:proxy_config) { double("proxy_config", retries_when_blocked: 3) }

  let(:fetcher_with_proxy) do
    described_class.new(http_client: http_client, proxy_config: proxy_config)
  end

  # Fresh WebMock response hash for an innertube answer that reports bot detection.
  def bot_blocked_response
    payload = {
      "playabilityStatus" => { "status" => "LOGIN_REQUIRED", "reason" => "Sign in to confirm you're not a bot" }
    }
    { status: 200, body: payload.to_json }
  end

  # Both contexts serve the same healthy watch page; only the innertube answers differ.
  before do
    stub_request(:get, watch_url).to_return(status: 200, body: sample_html)
  end

  context "when request is blocked and retries configured" do
    before do
      # Two blocked answers, then a successful one on the third POST.
      stub_request(:post, innertube_url).to_return(
        bot_blocked_response,
        bot_blocked_response,
        { status: 200, body: sample_innertube_response.to_json }
      )
    end

    it "retries the request" do
      transcript_list = fetcher_with_proxy.fetch(video_id)

      expect(transcript_list).to be_kind_of(Youtube::Transcript::Rb::TranscriptList)
      expect(a_request(:post, innertube_url)).to have_been_made.times(3)
    end
  end

  context "when all retries fail" do
    before do
      stub_request(:post, innertube_url).to_return(bot_blocked_response)
    end

    it "raises RequestBlocked after exhausting retries" do
      expect { fetcher_with_proxy.fetch(video_id) }.to raise_error(Youtube::Transcript::Rb::RequestBlocked)
      expect(a_request(:post, innertube_url)).to have_been_made.times(3)
    end
  end
end
492
+
493
describe "PlayabilityStatus module" do
  # Each constant is expected to hold its own name as a string.
  %w[OK ERROR LOGIN_REQUIRED].each do |status_name|
    it "defines #{status_name} status" do
      # `false` restricts the lookup to the module itself.
      value = Youtube::Transcript::Rb::PlayabilityStatus.const_get(status_name, false)

      expect(value).to eq(status_name)
    end
  end
end
506
+
507
describe "PlayabilityFailedReason module" do
  # Constant name => the exact reason text it is expected to hold.
  {
    "BOT_DETECTED" => "Sign in to confirm you're not a bot",
    "AGE_RESTRICTED" => "This video may be inappropriate for some users.",
    "VIDEO_UNAVAILABLE" => "This video is unavailable"
  }.each do |reason_name, reason_text|
    it "defines #{reason_name} reason" do
      # `false` restricts the lookup to the module itself.
      value = Youtube::Transcript::Rb::PlayabilityFailedReason.const_get(reason_name, false)

      expect(value).to eq(reason_text)
    end
  end
end
520
+ end