youtube-transcript-rb 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +42 -42
- data/lib/youtube_rb/transcript/api.rb +148 -0
- data/lib/youtube_rb/transcript/errors.rb +215 -0
- data/lib/youtube_rb/transcript/formatters.rb +267 -0
- data/lib/youtube_rb/transcript/settings.rb +26 -0
- data/lib/youtube_rb/transcript/transcript.rb +237 -0
- data/lib/youtube_rb/transcript/transcript_list.rb +168 -0
- data/lib/youtube_rb/transcript/transcript_list_fetcher.rb +223 -0
- data/lib/youtube_rb/transcript/transcript_parser.rb +81 -0
- data/lib/{youtube/transcript/rb → youtube_rb/transcript}/version.rb +2 -4
- data/lib/youtube_rb/transcript.rb +35 -0
- data/sig/youtube_rb/transcript.rbs +6 -0
- data/spec/api_spec.rb +20 -20
- data/spec/errors_spec.rb +39 -39
- data/spec/formatters_spec.rb +36 -36
- data/spec/integration_spec.rb +32 -32
- data/spec/settings_spec.rb +16 -16
- data/spec/spec_helper.rb +1 -1
- data/spec/transcript_list_fetcher_spec.rb +27 -27
- data/spec/transcript_list_spec.rb +6 -6
- data/spec/transcript_parser_spec.rb +3 -3
- data/spec/transcript_spec.rb +16 -16
- metadata +12 -12
- data/lib/youtube/transcript/rb/api.rb +0 -150
- data/lib/youtube/transcript/rb/errors.rb +0 -217
- data/lib/youtube/transcript/rb/formatters.rb +0 -269
- data/lib/youtube/transcript/rb/settings.rb +0 -28
- data/lib/youtube/transcript/rb/transcript.rb +0 -239
- data/lib/youtube/transcript/rb/transcript_list.rb +0 -170
- data/lib/youtube/transcript/rb/transcript_list_fetcher.rb +0 -225
- data/lib/youtube/transcript/rb/transcript_parser.rb +0 -83
- data/lib/youtube/transcript/rb.rb +0 -37
- data/sig/youtube/transcript/rb.rbs +0 -8
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 6d97023aca42aac08e42c3857d940e3a42ba735c895685522048caee415fe4af
|
|
4
|
+
data.tar.gz: 1d435b06743716beb8f3e892bc97a2c7d105f6ab710f61659cd24dd6ce438a81
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 5b5f345ebcef944ba98ae4adcc42d9951c5e04bf0230e98c18dc41e0b3b715cfe57a34722924042cbcf9b7244d00ad31767bc2ce287136656c2b05b3378f2db8
|
|
7
|
+
data.tar.gz: 07ba07e26a8ff0f895c64767869bda918122c35b21b39b945fc608067d864643712386987155e3ae1dbe24f5310410e642aa58fd03f1c15f4df358567a51fae2
|
data/README.md
CHANGED
|
@@ -47,9 +47,9 @@ gem install youtube-transcript-rb
|
|
|
47
47
|
The easiest way to get a transcript for a given video is to execute:
|
|
48
48
|
|
|
49
49
|
```ruby
|
|
50
|
-
require '
|
|
50
|
+
require 'youtube_rb/transcript'
|
|
51
51
|
|
|
52
|
-
api =
|
|
52
|
+
api = YoutubeRb::Transcript::YouTubeTranscriptApi.new
|
|
53
53
|
api.fetch(video_id)
|
|
54
54
|
```
|
|
55
55
|
|
|
@@ -62,14 +62,14 @@ api.fetch(video_id)
|
|
|
62
62
|
This will return a `FetchedTranscript` object looking somewhat like this:
|
|
63
63
|
|
|
64
64
|
```ruby
|
|
65
|
-
#<
|
|
65
|
+
#<YoutubeRb::Transcript::FetchedTranscript
|
|
66
66
|
@video_id="12345",
|
|
67
67
|
@language="English",
|
|
68
68
|
@language_code="en",
|
|
69
69
|
@is_generated=false,
|
|
70
70
|
@snippets=[
|
|
71
|
-
#<
|
|
72
|
-
#<
|
|
71
|
+
#<YoutubeRb::Transcript::TranscriptSnippet @text="Hey there", @start=0.0, @duration=1.54>,
|
|
72
|
+
#<YoutubeRb::Transcript::TranscriptSnippet @text="how are you", @start=1.54, @duration=4.16>,
|
|
73
73
|
# ...
|
|
74
74
|
]
|
|
75
75
|
>
|
|
@@ -78,7 +78,7 @@ This will return a `FetchedTranscript` object looking somewhat like this:
|
|
|
78
78
|
This object implements `Enumerable`, so you can iterate over it:
|
|
79
79
|
|
|
80
80
|
```ruby
|
|
81
|
-
api =
|
|
81
|
+
api = YoutubeRb::Transcript::YouTubeTranscriptApi.new
|
|
82
82
|
fetched_transcript = api.fetch(video_id)
|
|
83
83
|
|
|
84
84
|
# is iterable
|
|
@@ -117,13 +117,13 @@ an array of hashes:
|
|
|
117
117
|
You can also use the convenience methods on the module directly:
|
|
118
118
|
|
|
119
119
|
```ruby
|
|
120
|
-
require '
|
|
120
|
+
require 'youtube_rb/transcript'
|
|
121
121
|
|
|
122
122
|
# Fetch a transcript
|
|
123
|
-
transcript =
|
|
123
|
+
transcript = YoutubeRb::Transcript.fetch(video_id)
|
|
124
124
|
|
|
125
125
|
# List available transcripts
|
|
126
|
-
transcript_list =
|
|
126
|
+
transcript_list = YoutubeRb::Transcript.list(video_id)
|
|
127
127
|
```
|
|
128
128
|
|
|
129
129
|
### Retrieve different languages
|
|
@@ -132,7 +132,7 @@ You can add the `languages` param if you want to make sure the transcripts are r
|
|
|
132
132
|
(it defaults to English).
|
|
133
133
|
|
|
134
134
|
```ruby
|
|
135
|
-
|
|
135
|
+
YoutubeRb::Transcript::YouTubeTranscriptApi.new.fetch(video_id, languages: ['de', 'en'])
|
|
136
136
|
```
|
|
137
137
|
|
|
138
138
|
It's an array of language codes in descending priority. In this example it will first try to fetch the German
|
|
@@ -142,7 +142,7 @@ which languages are available first, [have a look at `list`](#list-available-tra
|
|
|
142
142
|
If you only want one language, you still need to format the `languages` argument as an array:
|
|
143
143
|
|
|
144
144
|
```ruby
|
|
145
|
-
|
|
145
|
+
YoutubeRb::Transcript::YouTubeTranscriptApi.new.fetch(video_id, languages: ['de'])
|
|
146
146
|
```
|
|
147
147
|
|
|
148
148
|
### Preserve formatting
|
|
@@ -151,7 +151,7 @@ You can also add `preserve_formatting: true` if you'd like to keep HTML formatti
|
|
|
151
151
|
and `<b>` (bold).
|
|
152
152
|
|
|
153
153
|
```ruby
|
|
154
|
-
|
|
154
|
+
YoutubeRb::Transcript::YouTubeTranscriptApi.new.fetch(video_id, languages: ['de', 'en'], preserve_formatting: true)
|
|
155
155
|
```
|
|
156
156
|
|
|
157
157
|
### List available transcripts
|
|
@@ -159,7 +159,7 @@ Youtube::Transcript::Rb::YouTubeTranscriptApi.new.fetch(video_id, languages: ['d
|
|
|
159
159
|
If you want to list all transcripts which are available for a given video you can call:
|
|
160
160
|
|
|
161
161
|
```ruby
|
|
162
|
-
api =
|
|
162
|
+
api = YoutubeRb::Transcript::YouTubeTranscriptApi.new
|
|
163
163
|
transcript_list = api.list(video_id)
|
|
164
164
|
```
|
|
165
165
|
|
|
@@ -220,9 +220,9 @@ puts translated_transcript.fetch
|
|
|
220
220
|
### By example
|
|
221
221
|
|
|
222
222
|
```ruby
|
|
223
|
-
require '
|
|
223
|
+
require 'youtube_rb/transcript'
|
|
224
224
|
|
|
225
|
-
api =
|
|
225
|
+
api = YoutubeRb::Transcript::YouTubeTranscriptApi.new
|
|
226
226
|
|
|
227
227
|
# retrieve the available transcripts
|
|
228
228
|
transcript_list = api.list('video_id')
|
|
@@ -262,7 +262,7 @@ transcript = transcript_list.find_generated_transcript(['de', 'en'])
|
|
|
262
262
|
You can fetch transcripts for multiple videos at once:
|
|
263
263
|
|
|
264
264
|
```ruby
|
|
265
|
-
api =
|
|
265
|
+
api = YoutubeRb::Transcript::YouTubeTranscriptApi.new
|
|
266
266
|
|
|
267
267
|
# Fetch multiple videos
|
|
268
268
|
transcripts = api.fetch_all(['video1', 'video2', 'video3'])
|
|
@@ -297,14 +297,14 @@ The `Formatters` module provides a few basic formatters:
|
|
|
297
297
|
Here is how to import from the `Formatters` module:
|
|
298
298
|
|
|
299
299
|
```ruby
|
|
300
|
-
require '
|
|
300
|
+
require 'youtube_rb/transcript'
|
|
301
301
|
|
|
302
302
|
# Some provided formatter classes, each outputs a different string format.
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
303
|
+
YoutubeRb::Transcript::Formatters::JSONFormatter
|
|
304
|
+
YoutubeRb::Transcript::Formatters::TextFormatter
|
|
305
|
+
YoutubeRb::Transcript::Formatters::PrettyPrintFormatter
|
|
306
|
+
YoutubeRb::Transcript::Formatters::WebVTTFormatter
|
|
307
|
+
YoutubeRb::Transcript::Formatters::SRTFormatter
|
|
308
308
|
```
|
|
309
309
|
|
|
310
310
|
### Formatter Example
|
|
@@ -312,12 +312,12 @@ Youtube::Transcript::Rb::Formatters::SRTFormatter
|
|
|
312
312
|
Let's say we wanted to retrieve a transcript and store it to a JSON file. That would look something like this:
|
|
313
313
|
|
|
314
314
|
```ruby
|
|
315
|
-
require '
|
|
315
|
+
require 'youtube_rb/transcript'
|
|
316
316
|
|
|
317
|
-
api =
|
|
317
|
+
api = YoutubeRb::Transcript::YouTubeTranscriptApi.new
|
|
318
318
|
transcript = api.fetch(video_id)
|
|
319
319
|
|
|
320
|
-
formatter =
|
|
320
|
+
formatter = YoutubeRb::Transcript::Formatters::JSONFormatter.new
|
|
321
321
|
|
|
322
322
|
# .format_transcript(transcript) turns the transcript into a JSON string.
|
|
323
323
|
json_formatted = formatter.format_transcript(transcript)
|
|
@@ -334,7 +334,7 @@ Since `JSONFormatter` leverages `JSON.generate` you can also forward keyword arg
|
|
|
334
334
|
`.format_transcript(transcript)` such as making your file output prettier:
|
|
335
335
|
|
|
336
336
|
```ruby
|
|
337
|
-
json_formatted =
|
|
337
|
+
json_formatted = YoutubeRb::Transcript::Formatters::JSONFormatter.new.format_transcript(
|
|
338
338
|
transcript,
|
|
339
339
|
indent: ' ',
|
|
340
340
|
space: ' '
|
|
@@ -346,9 +346,9 @@ json_formatted = Youtube::Transcript::Rb::Formatters::JSONFormatter.new.format_t
|
|
|
346
346
|
You can also use the `FormatterLoader` to dynamically load formatters by name:
|
|
347
347
|
|
|
348
348
|
```ruby
|
|
349
|
-
require '
|
|
349
|
+
require 'youtube_rb/transcript'
|
|
350
350
|
|
|
351
|
-
loader =
|
|
351
|
+
loader = YoutubeRb::Transcript::Formatters::FormatterLoader.new
|
|
352
352
|
|
|
353
353
|
# Load by type name: "json", "pretty", "text", "webvtt", "srt"
|
|
354
354
|
formatter = loader.load("json")
|
|
@@ -364,7 +364,7 @@ You can implement your own formatter class. Just inherit from the `Formatter` ba
|
|
|
364
364
|
`format_transcript` and `format_transcripts` methods which should ultimately return a string:
|
|
365
365
|
|
|
366
366
|
```ruby
|
|
367
|
-
class MyCustomFormatter <
|
|
367
|
+
class MyCustomFormatter < YoutubeRb::Transcript::Formatters::Formatter
|
|
368
368
|
def format_transcript(transcript, **options)
|
|
369
369
|
# Do your custom work in here, but return a string.
|
|
370
370
|
'your processed output data as a string.'
|
|
@@ -382,28 +382,28 @@ end
|
|
|
382
382
|
The library provides a comprehensive set of exceptions for different error scenarios:
|
|
383
383
|
|
|
384
384
|
```ruby
|
|
385
|
-
require '
|
|
385
|
+
require 'youtube_rb/transcript'
|
|
386
386
|
|
|
387
387
|
begin
|
|
388
|
-
transcript =
|
|
389
|
-
rescue
|
|
388
|
+
transcript = YoutubeRb::Transcript.fetch(video_id)
|
|
389
|
+
rescue YoutubeRb::Transcript::TranscriptsDisabled => e
|
|
390
390
|
puts "Subtitles are disabled for this video"
|
|
391
|
-
rescue
|
|
391
|
+
rescue YoutubeRb::Transcript::NoTranscriptFound => e
|
|
392
392
|
puts "No transcript found for the requested languages"
|
|
393
393
|
puts e.requested_language_codes
|
|
394
|
-
rescue
|
|
394
|
+
rescue YoutubeRb::Transcript::NoTranscriptAvailable => e
|
|
395
395
|
puts "No transcripts are available for this video"
|
|
396
|
-
rescue
|
|
396
|
+
rescue YoutubeRb::Transcript::VideoUnavailable => e
|
|
397
397
|
puts "The video is no longer available"
|
|
398
|
-
rescue
|
|
398
|
+
rescue YoutubeRb::Transcript::TooManyRequests => e
|
|
399
399
|
puts "Rate limited by YouTube"
|
|
400
|
-
rescue
|
|
400
|
+
rescue YoutubeRb::Transcript::RequestBlocked => e
|
|
401
401
|
puts "Request blocked by YouTube"
|
|
402
|
-
rescue
|
|
402
|
+
rescue YoutubeRb::Transcript::IpBlocked => e
|
|
403
403
|
puts "Your IP has been blocked by YouTube"
|
|
404
|
-
rescue
|
|
404
|
+
rescue YoutubeRb::Transcript::PoTokenRequired => e
|
|
405
405
|
puts "PO token required - this is a YouTube limitation"
|
|
406
|
-
rescue
|
|
406
|
+
rescue YoutubeRb::Transcript::CouldNotRetrieveTranscript => e
|
|
407
407
|
puts "Could not retrieve transcript: #{e.message}"
|
|
408
408
|
end
|
|
409
409
|
```
|
|
@@ -456,11 +456,11 @@ http_client = Faraday.new do |conn|
|
|
|
456
456
|
conn.adapter Faraday.default_adapter
|
|
457
457
|
end
|
|
458
458
|
|
|
459
|
-
api =
|
|
459
|
+
api = YoutubeRb::Transcript::YouTubeTranscriptApi.new(http_client: http_client)
|
|
460
460
|
api.fetch(video_id)
|
|
461
461
|
|
|
462
462
|
# Share same connection between two instances
|
|
463
|
-
api_2 =
|
|
463
|
+
api_2 = YoutubeRb::Transcript::YouTubeTranscriptApi.new(http_client: http_client)
|
|
464
464
|
api_2.fetch(video_id)
|
|
465
465
|
```
|
|
466
466
|
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "faraday"
|
|
4
|
+
require "faraday/follow_redirects"
|
|
5
|
+
|
|
6
|
+
module YoutubeRb
  module Transcript
    # Main entry point for fetching YouTube transcripts.
    #
    # An instance holds one HTTP client and one TranscriptListFetcher built
    # from it; every public method goes through that fetcher.
    #
    # @example Basic usage
    #   api = YoutubeRb::Transcript::YouTubeTranscriptApi.new
    #   transcript = api.fetch("dQw4w9WgXcQ")
    #   transcript.each { |snippet| puts snippet.text }
    #
    # @example With language preference
    #   api = YoutubeRb::Transcript::YouTubeTranscriptApi.new
    #   transcript = api.fetch("dQw4w9WgXcQ", languages: ["es", "en"])
    #
    # @example Listing available transcripts
    #   api = YoutubeRb::Transcript::YouTubeTranscriptApi.new
    #   transcript_list = api.list("dQw4w9WgXcQ")
    #   transcript_list.each { |t| puts t }
    #
    class YouTubeTranscriptApi
      # Default timeout for HTTP requests in seconds. Applied to both the
      # read timeout and the open (connect) timeout of the default client.
      DEFAULT_TIMEOUT = 30

      # @param http_client [Faraday::Connection, nil] Custom HTTP client. When nil,
      #   a default client is built by {#build_default_http_client}.
      # @param proxy_config [Object, nil] Proxy configuration. It is stored and
      #   handed to the TranscriptListFetcher unchanged.
      def initialize(http_client: nil, proxy_config: nil)
        @http_client = http_client || build_default_http_client
        @proxy_config = proxy_config
        @fetcher = TranscriptListFetcher.new(
          http_client: @http_client,
          proxy_config: @proxy_config
        )
      end

      # Fetch a transcript for a video.
      # This is a convenience method that chains `list`, `find_transcript`
      # and the found transcript's own `fetch`.
      #
      # @param video_id [String] The YouTube video ID
      # @param languages [Array<String>] Language codes in order of preference (default: ["en"])
      # @param preserve_formatting [Boolean] Whether to preserve HTML formatting (default: false)
      # @return [FetchedTranscript] The fetched transcript
      # @raise [NoTranscriptFound] If no transcript matches the requested languages
      # @raise [TranscriptsDisabled] If transcripts are disabled for the video
      # @raise [VideoUnavailable] If the video is not available
      #
      # @example
      #   api = YouTubeTranscriptApi.new
      #   transcript = api.fetch("dQw4w9WgXcQ", languages: ["en", "es"])
      #   puts transcript.first.text
      #
      def fetch(video_id, languages: ["en"], preserve_formatting: false)
        list(video_id)
          .find_transcript(languages)
          .fetch(preserve_formatting: preserve_formatting)
      end

      # List all available transcripts for a video.
      #
      # @param video_id [String] The YouTube video ID
      # @return [TranscriptList] A list of available transcripts
      # @raise [TranscriptsDisabled] If transcripts are disabled for the video
      # @raise [VideoUnavailable] If the video is not available
      #
      # @example
      #   api = YouTubeTranscriptApi.new
      #   transcript_list = api.list("dQw4w9WgXcQ")
      #
      #   # Find a specific transcript
      #   transcript = transcript_list.find_transcript(["en"])
      #
      #   # Or iterate over all available transcripts
      #   transcript_list.each do |transcript|
      #     puts "#{transcript.language_code}: #{transcript.language}"
      #   end
      #
      def list(video_id)
        @fetcher.fetch(video_id)
      end

      # Fetch transcripts for multiple videos, one after another.
      #
      # Only successfully fetched videos appear in the returned hash. When
      # `continue_on_error` is true, a failed video is reported to the block
      # (if one is given) and is otherwise skipped.
      #
      # Only the fetch itself is guarded by the rescue. An exception raised by
      # the caller's block always propagates to the caller and never causes the
      # block to be invoked a second time for the same video.
      #
      # @param video_ids [Array<String>] Array of YouTube video IDs
      # @param languages [Array<String>] Language codes in order of preference (default: ["en"])
      # @param preserve_formatting [Boolean] Whether to preserve HTML formatting (default: false)
      # @param continue_on_error [Boolean] Whether to continue if a video fails (default: false)
      # @yield [video_id, result] Block called for each video with either transcript or error
      # @yieldparam video_id [String] The video ID being processed
      # @yieldparam result [FetchedTranscript, StandardError] The transcript or error
      # @return [Hash<String, FetchedTranscript>] Hash mapping video IDs to transcripts
      # @raise [CouldNotRetrieveTranscript] If any video fails and continue_on_error is false
      #
      # @example Fetch multiple videos
      #   api = YouTubeTranscriptApi.new
      #   transcripts = api.fetch_all(["video1", "video2", "video3"])
      #   transcripts.each { |id, t| puts "#{id}: #{t.length} snippets" }
      #
      # @example With error handling
      #   api = YouTubeTranscriptApi.new
      #   api.fetch_all(["video1", "video2"], continue_on_error: true) do |video_id, result|
      #     if result.is_a?(StandardError)
      #       puts "Error for #{video_id}: #{result.message}"
      #     else
      #       puts "Got #{result.length} snippets for #{video_id}"
      #     end
      #   end
      #
      def fetch_all(video_ids, languages: ["en"], preserve_formatting: false, continue_on_error: false)
        results = {}

        video_ids.each do |video_id|
          begin
            transcript = fetch(video_id, languages: languages, preserve_formatting: preserve_formatting)
          rescue CouldNotRetrieveTranscript => e
            # Re-raise the original error untouched unless the caller opted in
            # to best-effort processing.
            raise unless continue_on_error

            yield(video_id, e) if block_given?
          else
            # Runs only when the fetch succeeded; errors raised from here
            # (including from the caller's block) are not rescued above.
            results[video_id] = transcript
            yield(video_id, transcript) if block_given?
          end
        end

        results
      end

      private

      # Build the default Faraday HTTP client: DEFAULT_TIMEOUT for both the
      # read and open timeouts, URL-encoded request bodies, redirect following,
      # and Faraday's default adapter.
      #
      # @return [Faraday::Connection] The configured HTTP client
      def build_default_http_client
        Faraday.new do |conn|
          conn.options.timeout = DEFAULT_TIMEOUT
          conn.options.open_timeout = DEFAULT_TIMEOUT
          conn.request :url_encoded
          conn.response :follow_redirects
          conn.adapter Faraday.default_adapter
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module YoutubeRb
  module Transcript
    # Base error class for all YouTube Transcript errors
    class Error < StandardError; end

    # Raised when a transcript could not be retrieved.
    #
    # The exception message is assembled once, inside #initialize, from the
    # watch URL of the video and the value of #cause_message. Subclasses whose
    # #cause_message reads instance variables must therefore assign them
    # BEFORE calling `super`.
    class CouldNotRetrieveTranscript < Error
      WATCH_URL = "https://www.youtube.com/watch?v=%<video_id>s"

      # Default (empty) cause. Subclasses override this constant. Defining it
      # here keeps the base class directly instantiable, because #cause_message
      # resolves the constant through `self.class`.
      CAUSE_MESSAGE = ""

      # @return [String] the video ID that caused the error
      attr_reader :video_id

      # @param video_id [String] the YouTube video ID
      def initialize(video_id)
        @video_id = video_id
        super(build_error_message)
      end

      # @return [String] the cause of the error (empty when no cause is known)
      def cause_message
        self.class::CAUSE_MESSAGE
      end

      private

      # Compose the full exception message. The cause and the issue-tracker
      # referral are appended only when a non-empty cause is available.
      #
      # @return [String]
      def build_error_message
        video_url = format(WATCH_URL, video_id: @video_id)
        message = "\nCould not retrieve a transcript for the video #{video_url}!"

        cause = cause_message
        if cause && !cause.empty?
          message += " This is most likely caused by:\n\n#{cause}"
          message += github_referral
        end

        message
      end

      # NOTE(review): this text names `youtube_transcript_api` and links to the
      # jdepoix/youtube-transcript-api tracker - confirm this is the intended
      # destination for reports about this gem.
      #
      # @return [String]
      def github_referral
        "\n\nIf you are sure that the described cause is not responsible for this error " \
          "and that a transcript should be retrievable, please create an issue at " \
          "https://github.com/jdepoix/youtube-transcript-api/issues. " \
          "Please add which version of youtube_transcript_api you are using " \
          "and provide the information needed to replicate the error. " \
          "Also make sure that there are no open issues which already describe your problem!"
      end
    end

    # Raised when YouTube data cannot be parsed
    class YouTubeDataUnparsable < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "The data required to fetch the transcript is not parsable. This should " \
                      "not happen, please open an issue (make sure to include the video ID)!"
    end

    # Raised when a request to YouTube fails
    class YouTubeRequestFailed < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "Request to YouTube failed: %<reason>s"

      # @return [String] the reason for the failure
      attr_reader :reason

      # @param video_id [String] the YouTube video ID
      # @param http_error [StandardError] the HTTP error that occurred
      def initialize(video_id, http_error)
        # Must be set before super: the message is built during super.
        @reason = http_error.to_s
        super(video_id)
      end

      # @return [String] CAUSE_MESSAGE with the failure reason filled in
      def cause_message
        format(CAUSE_MESSAGE, reason: @reason)
      end
    end

    # Raised when a video is unplayable
    class VideoUnplayable < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "The video is unplayable for the following reason: %<reason>s"

      # @return [String, nil] the reason the video is unplayable
      attr_reader :reason

      # @return [Array<String>] additional sub-reasons
      attr_reader :sub_reasons

      # @param video_id [String] the YouTube video ID
      # @param reason [String, nil] the reason the video is unplayable
      # @param sub_reasons [Array<String>, nil] additional details; nil is treated as "none"
      def initialize(video_id, reason = nil, sub_reasons = [])
        @reason = reason
        # Normalize so a nil value cannot crash #cause_message while the
        # exception itself is being constructed.
        @sub_reasons = Array(sub_reasons)
        super(video_id)
      end

      # @return [String] CAUSE_MESSAGE with the reason and, when present, a
      #   bulleted list of sub-reasons filled in
      def cause_message
        reason_text = @reason || "No reason specified!"

        if @sub_reasons.any?
          sub_reasons_text = @sub_reasons.map { |r| " - #{r}" }.join("\n")
          reason_text = "#{reason_text}\n\nAdditional Details:\n#{sub_reasons_text}"
        end

        format(CAUSE_MESSAGE, reason: reason_text)
      end
    end

    # Raised when a video is unavailable
    class VideoUnavailable < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "The video is no longer available"
    end

    # Raised when an invalid video ID is provided
    class InvalidVideoId < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "You provided an invalid video id. Make sure you are using the video id and NOT the url!\n\n" \
                      'Do NOT run: `YoutubeRb::Transcript.fetch("https://www.youtube.com/watch?v=1234")`' \
                      "\n" \
                      'Instead run: `YoutubeRb::Transcript.fetch("1234")`'
    end

    # Raised when YouTube blocks the request
    class RequestBlocked < CouldNotRetrieveTranscript
      # Shared explanation; reused by IpBlocked with different advice appended.
      BASE_CAUSE_MESSAGE = "YouTube is blocking requests from your IP. This usually is due to one of the " \
                           "following reasons:\n" \
                           "- You have done too many requests and your IP has been blocked by YouTube\n" \
                           "- You are doing requests from an IP belonging to a cloud provider (like AWS, " \
                           "Google Cloud Platform, Azure, etc.). Unfortunately, most IPs from cloud " \
                           "providers are blocked by YouTube.\n\n"

      CAUSE_MESSAGE = "#{BASE_CAUSE_MESSAGE}" \
                      "There are two things you can do to work around this:\n" \
                      "1. Use proxies to hide your IP address.\n" \
                      "2. (NOT RECOMMENDED) If you authenticate your requests using cookies, you " \
                      "will be able to continue doing requests for a while. However, YouTube will " \
                      "eventually permanently ban the account that you have used to authenticate " \
                      "with! So only do this if you don't mind your account being banned!"
    end

    # Raised when YouTube blocks the IP specifically
    class IpBlocked < RequestBlocked
      CAUSE_MESSAGE = "#{RequestBlocked::BASE_CAUSE_MESSAGE}" \
                      "Ways to work around this are using proxies or rotating residential IPs."
    end

    # Raised when too many requests are made (HTTP 429)
    class TooManyRequests < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "YouTube is rate limiting your requests. Please wait before making more requests."
    end

    # Raised when transcripts are disabled for a video
    class TranscriptsDisabled < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "Subtitles are disabled for this video"
    end

    # Raised when a video is age restricted
    class AgeRestricted < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "This video is age-restricted. Therefore, you are unable to retrieve " \
                      "transcripts for it without authenticating yourself.\n\n" \
                      "Unfortunately, Cookie Authentication is temporarily unsupported, " \
                      "as recent changes in YouTube's API broke the previous implementation."
    end

    # Raised when a transcript is not translatable
    class NotTranslatable < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "The requested language is not translatable"
    end

    # Raised when the requested translation language is not available
    class TranslationLanguageNotAvailable < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "The requested translation language is not available"
    end

    # Raised when consent cookie creation fails
    class FailedToCreateConsentCookie < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "Failed to automatically give consent to saving cookies"
    end

    # Raised when no transcript is found for the requested languages
    class NoTranscriptFound < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "No transcripts were found for any of the requested language codes: %<requested_language_codes>s\n\n%<transcript_data>s"

      # @return [Array<String>] the requested language codes
      attr_reader :requested_language_codes

      # @return [Object] the transcript data (TranscriptList)
      attr_reader :transcript_data

      # @param video_id [String] the YouTube video ID
      # @param requested_language_codes [Array<String>] the language codes that were requested
      # @param transcript_data [Object] the TranscriptList object with available transcripts
      def initialize(video_id, requested_language_codes, transcript_data)
        # Both must be set before super: the message is built during super.
        @requested_language_codes = requested_language_codes
        @transcript_data = transcript_data
        super(video_id)
      end

      # @return [String] CAUSE_MESSAGE filled with the inspected language codes
      #   and the string form of the transcript data
      def cause_message
        format(
          CAUSE_MESSAGE,
          requested_language_codes: @requested_language_codes.inspect,
          transcript_data: @transcript_data.to_s
        )
      end
    end

    # Raised when no transcripts are available for a video
    class NoTranscriptAvailable < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "No transcripts are available for this video"
    end

    # Raised when a PO token is required to fetch the transcript
    class PoTokenRequired < CouldNotRetrieveTranscript
      CAUSE_MESSAGE = "The requested video cannot be retrieved without a PO Token. " \
                      "If this happens, please open a GitHub issue!"
    end
  end
end
|