youtube-transcript-rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,217 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Youtube
4
+ module Transcript
5
+ module Rb
6
# Base error class for all YouTube Transcript errors.
class Error < StandardError; end

# Raised when a transcript could not be retrieved.
#
# The exception message is assembled once, in the initializer, from the
# watch URL of the video and the text returned by {#cause_message}.
# Subclasses customise the message by defining their own +CAUSE_MESSAGE+
# constant and/or overriding {#cause_message}.
class CouldNotRetrieveTranscript < Error
  # Template for the watch URL shown in the error message.
  WATCH_URL = "https://www.youtube.com/watch?v=%<video_id>s"

  # Default (empty) cause. Defining it here lets +self.class::CAUSE_MESSAGE+
  # resolve for this class and for subclasses that do not define their own,
  # instead of raising NameError while the exception is being constructed.
  CAUSE_MESSAGE = ""

  # @return [String] the video ID that caused the error
  attr_reader :video_id

  # @param video_id [String] the YouTube video ID
  def initialize(video_id)
    @video_id = video_id
    super(build_error_message)
  end

  # @return [String] the cause of the error (empty when no cause is known)
  def cause_message
    self.class::CAUSE_MESSAGE
  end

  private

  # Builds the full exception message. The "most likely caused by" section
  # and the issue-tracker referral are appended only when a non-empty cause
  # is available.
  #
  # @return [String]
  def build_error_message
    video_url = format(WATCH_URL, video_id: @video_id)
    message = "\nCould not retrieve a transcript for the video #{video_url}!"

    # Evaluate the (possibly overridden) cause exactly once.
    cause = cause_message
    return message if cause.nil? || cause.empty?

    "#{message} This is most likely caused by:\n\n#{cause}#{github_referral}"
  end

  # NOTE(review): this text names `youtube_transcript_api` and the
  # jdepoix/youtube-transcript-api issue tracker rather than this gem
  # (youtube-transcript-rb) -- confirm that this is intended.
  #
  # @return [String] closing paragraph asking users to report unexpected failures
  def github_referral
    "\n\nIf you are sure that the described cause is not responsible for this error " \
      "and that a transcript should be retrievable, please create an issue at " \
      "https://github.com/jdepoix/youtube-transcript-api/issues. " \
      "Please add which version of youtube_transcript_api you are using " \
      "and provide the information needed to replicate the error. " \
      "Also make sure that there are no open issues which already describe your problem!"
  end
end
50
+
51
# Raised when YouTube data cannot be parsed.
class YouTubeDataUnparsable < CouldNotRetrieveTranscript
  # Cause text reported for this error.
  CAUSE_MESSAGE =
    "The data required to fetch the transcript is not parsable. " \
    "This should not happen, please open an issue (make sure to include the video ID)!"
end
56
+
57
# Raised when a request to YouTube fails.
class YouTubeRequestFailed < CouldNotRetrieveTranscript
  # Template for the cause text; +reason+ is substituted in {#cause_message}.
  CAUSE_MESSAGE = "Request to YouTube failed: %<reason>s"

  # @return [String] string form of the error that caused the failure
  attr_reader :reason

  # @param video_id [String] the YouTube video ID
  # @param http_error [StandardError] the HTTP error that occurred
  def initialize(video_id, http_error)
    # Assigned before calling super, because #cause_message reads it.
    @reason = http_error.to_s
    super(video_id)
  end

  # @return [String] the cause template with the failure reason filled in
  def cause_message
    CAUSE_MESSAGE % { reason: reason }
  end
end
75
+
76
# Raised when a video is unplayable.
class VideoUnplayable < CouldNotRetrieveTranscript
  # Template for the cause text; +reason+ is substituted in {#cause_message}.
  CAUSE_MESSAGE = "The video is unplayable for the following reason: %<reason>s"

  # @return [String, nil] the reason the video is unplayable
  attr_reader :reason

  # @return [Array<String>] additional sub-reasons (never nil)
  attr_reader :sub_reasons

  # @param video_id [String] the YouTube video ID
  # @param reason [String, nil] the reason the video is unplayable
  # @param sub_reasons [Array<String>, nil] additional details; nil is treated as "none"
  def initialize(video_id, reason = nil, sub_reasons = [])
    @reason = reason
    # Normalise to an array so that a nil value cannot raise NoMethodError
    # while this exception is being constructed.
    @sub_reasons = Array(sub_reasons)
    # State must be set before super, because #cause_message reads it.
    super(video_id)
  end

  # @return [String] the cause template filled with the reason and, when
  #   present, a bulleted "Additional Details" list of the sub-reasons
  def cause_message
    reason_text = @reason || "No reason specified!"

    if @sub_reasons.any?
      bullet_list = @sub_reasons.map { |detail| " - #{detail}" }.join("\n")
      reason_text = "#{reason_text}\n\nAdditional Details:\n#{bullet_list}"
    end

    format(CAUSE_MESSAGE, reason: reason_text)
  end
end
106
+
107
# Raised when a video is unavailable.
class VideoUnavailable < CouldNotRetrieveTranscript
  # Cause text reported for this error.
  CAUSE_MESSAGE = "The video is no longer available"
end
111
+
112
# Raised when an invalid video ID is provided.
class InvalidVideoId < CouldNotRetrieveTranscript
  # Cause text reported for this error; it shows a wrong and a right call.
  CAUSE_MESSAGE =
    "You provided an invalid video id. " \
    "Make sure you are using the video id and NOT the url!\n\n" \
    "Do NOT run: `Youtube::Transcript::Rb.fetch(\"https://www.youtube.com/watch?v=1234\")`\n" \
    "Instead run: `Youtube::Transcript::Rb.fetch(\"1234\")`"
end
119
+
120
# Raised when YouTube blocks the request.
class RequestBlocked < CouldNotRetrieveTranscript
  # Shared explanation of why requests get blocked. It is the prefix of
  # CAUSE_MESSAGE below and is kept separate so it can be reused.
  BASE_CAUSE_MESSAGE =
    "YouTube is blocking requests from your IP. " \
    "This usually is due to one of the following reasons:\n" \
    "- You have done too many requests and your IP has been blocked by YouTube\n" \
    "- You are doing requests from an IP belonging to a cloud provider " \
    "(like AWS, Google Cloud Platform, Azure, etc.). " \
    "Unfortunately, most IPs from cloud providers are blocked by YouTube.\n\n"

  # Cause text reported for this error: the shared explanation followed by
  # the suggested workarounds.
  CAUSE_MESSAGE =
    "#{BASE_CAUSE_MESSAGE}There are two things you can do to work around this:\n" \
    "1. Use proxies to hide your IP address.\n" \
    "2. (NOT RECOMMENDED) If you authenticate your requests using cookies, " \
    "you will be able to continue doing requests for a while. " \
    "However, YouTube will eventually permanently ban the account " \
    "that you have used to authenticate with! " \
    "So only do this if you don't mind your account being banned!"
end
137
+
138
# Raised when YouTube blocks the IP specifically.
class IpBlocked < RequestBlocked
  # Cause text reported for this error: the parent's shared explanation
  # followed by an IP-specific workaround hint.
  CAUSE_MESSAGE =
    "#{RequestBlocked::BASE_CAUSE_MESSAGE}Ways to work around this are using proxies or rotating residential IPs."
end
143
+
144
# Raised when too many requests are made (HTTP 429).
class TooManyRequests < CouldNotRetrieveTranscript
  # Cause text reported for this error.
  CAUSE_MESSAGE =
    "YouTube is rate limiting your requests. " \
    "Please wait before making more requests."
end
148
+
149
# Raised when transcripts are disabled for a video.
class TranscriptsDisabled < CouldNotRetrieveTranscript
  # Cause text reported for this error.
  CAUSE_MESSAGE = "Subtitles are disabled for this video"
end
153
+
154
# Raised when a video is age restricted.
class AgeRestricted < CouldNotRetrieveTranscript
  # Cause text reported for this error.
  CAUSE_MESSAGE =
    "This video is age-restricted. " \
    "Therefore, you are unable to retrieve transcripts for it without authenticating yourself.\n\n" \
    "Unfortunately, Cookie Authentication is temporarily unsupported, as recent changes " \
    "in YouTube's API broke the previous implementation."
end
161
+
162
# Raised when a transcript is not translatable.
class NotTranslatable < CouldNotRetrieveTranscript
  # Cause text reported for this error.
  CAUSE_MESSAGE = "The requested language is not translatable"
end
166
+
167
# Raised when the requested translation language is not available.
class TranslationLanguageNotAvailable < CouldNotRetrieveTranscript
  # Cause text reported for this error.
  CAUSE_MESSAGE = "The requested translation language is not available"
end
171
+
172
# Raised when consent cookie creation fails.
class FailedToCreateConsentCookie < CouldNotRetrieveTranscript
  # Cause text reported for this error.
  CAUSE_MESSAGE = "Failed to automatically give consent to saving cookies"
end
176
+
177
# Raised when no transcript is found for the requested languages.
class NoTranscriptFound < CouldNotRetrieveTranscript
  # Template for the cause text; both placeholders are filled in by
  # {#cause_message}.
  CAUSE_MESSAGE =
    "No transcripts were found for any of the requested language codes: " \
    "%<requested_language_codes>s\n\n%<transcript_data>s"

  # @return [Array<String>] the requested language codes
  attr_reader :requested_language_codes

  # @return [Object] the transcript data (TranscriptList)
  attr_reader :transcript_data

  # @param video_id [String] the YouTube video ID
  # @param requested_language_codes [Array<String>] the language codes that were requested
  # @param transcript_data [Object] the TranscriptList object with available transcripts
  def initialize(video_id, requested_language_codes, transcript_data)
    # Both values are stored before super, because #cause_message reads them.
    @requested_language_codes = requested_language_codes
    @transcript_data = transcript_data
    super(video_id)
  end

  # @return [String] the cause template filled with the inspected language
  #   codes and the string form of the transcript data
  def cause_message
    substitutions = {
      requested_language_codes: @requested_language_codes.inspect,
      transcript_data: @transcript_data.to_s
    }
    format(CAUSE_MESSAGE, substitutions)
  end
end
204
+
205
# Raised when no transcripts are available for a video.
class NoTranscriptAvailable < CouldNotRetrieveTranscript
  # Cause text reported for this error.
  CAUSE_MESSAGE = "No transcripts are available for this video"
end
209
+
210
# Raised when a PO token is required to fetch the transcript.
class PoTokenRequired < CouldNotRetrieveTranscript
  # Cause text reported for this error.
  CAUSE_MESSAGE =
    "The requested video cannot be retrieved without a PO Token. If this happens, please open a GitHub issue!"
end
215
+ end
216
+ end
217
+ end
@@ -0,0 +1,269 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
+ module Youtube
6
+ module Transcript
7
+ module Rb
8
+ # Module containing all transcript formatters
9
+ module Formatters
10
# Abstract base class for formatters. Both methods raise NotImplementedError
# here; concrete formatters are expected to override them.
class Formatter
  # Format a single transcript.
  #
  # @param transcript [FetchedTranscript] The transcript to format
  # @param options [Hash] Additional formatting options
  # @return [String] The formatted transcript
  # @raise [NotImplementedError] always, in this base class
  def format_transcript(transcript, **options)
    message = "Subclass must implement ##{__method__}"
    raise NotImplementedError, message
  end

  # Format multiple transcripts.
  #
  # @param transcripts [Array<FetchedTranscript>] The transcripts to format
  # @param options [Hash] Additional formatting options
  # @return [String] The formatted transcripts
  # @raise [NotImplementedError] always, in this base class
  def format_transcripts(transcripts, **options)
    message = "Subclass must implement ##{__method__}"
    raise NotImplementedError, message
  end
end
31
+
32
# Formats transcripts as pretty-printed Ruby data structures.
class PrettyPrintFormatter < Formatter
  # Pretty-print the raw data of a single transcript.
  #
  # @param transcript [FetchedTranscript] The transcript to format
  # @param options [Hash] Only +:width+ is read (line width, default 79)
  # @return [String] Pretty-printed transcript data
  def format_transcript(transcript, **options)
    require "pp" # loaded lazily, only when this formatter is used
    raw = transcript.to_raw_data
    buffer = +"" # unary plus yields an unfrozen string for PP to append to
    PP.pp(raw, buffer, options[:width] || 79)
  end

  # Pretty-print the raw data of several transcripts as one array.
  #
  # @param transcripts [Array<FetchedTranscript>] The transcripts to format
  # @param options [Hash] Only +:width+ is read (line width, default 79)
  # @return [String] Pretty-printed transcripts data
  def format_transcripts(transcripts, **options)
    require "pp" # loaded lazily, only when this formatter is used
    raw_list = transcripts.map { |transcript| transcript.to_raw_data }
    buffer = +""
    PP.pp(raw_list, buffer, options[:width] || 79)
  end
end
55
+
56
# Formats transcripts as JSON.
class JSONFormatter < Formatter
  # Serialise the raw data of a single transcript.
  #
  # @param transcript [FetchedTranscript] The transcript to format
  # @param options [Hash] Options passed to JSON.generate (e.g., :indent, :space)
  # @return [String] JSON representation of the transcript
  def format_transcript(transcript, **options)
    raw = transcript.to_raw_data
    JSON.generate(raw, options)
  end

  # Serialise the raw data of several transcripts as one JSON array.
  #
  # @param transcripts [Array<FetchedTranscript>] The transcripts to format
  # @param options [Hash] Options passed to JSON.generate
  # @return [String] JSON array representation of the transcripts
  def format_transcripts(transcripts, **options)
    raw_list = transcripts.map { |transcript| transcript.to_raw_data }
    JSON.generate(raw_list, options)
  end
end
77
+
78
# Formats transcripts as plain text (text only, no timestamps).
class TextFormatter < Formatter
  # Join the text of every snippet with single newlines.
  #
  # @param transcript [FetchedTranscript] The transcript to format
  # @param options [Hash] Unused options
  # @return [String] Plain text with each line separated by newlines
  def format_transcript(transcript, **options)
    texts = transcript.map { |snippet| snippet.text }
    texts.join("\n")
  end

  # Format each transcript with #format_transcript and join the results.
  #
  # @param transcripts [Array<FetchedTranscript>] The transcripts to format
  # @param options [Hash] Forwarded unchanged to #format_transcript
  # @return [String] Plain text with transcripts separated by triple newlines
  def format_transcripts(transcripts, **options)
    rendered = transcripts.map { |single| format_transcript(single, **options) }
    rendered.join("\n\n\n")
  end
end
98
+
99
+ # Base class for timestamp-based formatters (SRT, WebVTT)
100
+ class TextBasedFormatter < TextFormatter
101
# Format a single transcript with timestamps.
#
# Each snippet becomes one entry with a "start --> end" time range. The end
# is start + duration, unless the following snippet starts earlier, in which
# case that start is used so that consecutive entries do not overlap.
#
# @param transcript [FetchedTranscript] The transcript to format
# @param options [Hash] Unused options
# @return [String] Formatted transcript with timestamps
def format_transcript(transcript, **options)
  cues = transcript.to_a

  entries = cues.each_with_index.map do |cue, position|
    finish = cue.start + cue.duration
    upcoming = cues[position + 1] if position < cues.length - 1
    # Clip the end time when the next cue begins before this one ends.
    finish = upcoming.start if upcoming && upcoming.start < finish

    range_text = "#{seconds_to_timestamp(cue.start)} --> #{seconds_to_timestamp(finish)}"
    format_transcript_helper(position, range_text, cue)
  end

  format_transcript_header(entries)
end
124
+
125
+ protected
126
+
127
# Format a timestamp from its components. Abstract here: this
# implementation always raises.
#
# @param hours [Integer] Hours component
# @param mins [Integer] Minutes component
# @param secs [Integer] Seconds component
# @param ms [Integer] Milliseconds component
# @return [String] Formatted timestamp
# @raise [NotImplementedError] always, unless overridden
def format_timestamp(hours, mins, secs, ms)
  message = "Subclass must implement ##{__method__}"
  raise NotImplementedError, message
end
137
+
138
# Assemble the formatted entries into the final document. Abstract here:
# this implementation always raises.
#
# @param lines [Array<String>] The formatted lines
# @return [String] The complete formatted transcript
# @raise [NotImplementedError] always, unless overridden
def format_transcript_header(lines)
  message = "Subclass must implement ##{__method__}"
  raise NotImplementedError, message
end
145
+
146
# Format a single transcript entry. Abstract here: this implementation
# always raises.
#
# @param index [Integer] The entry index (0-based)
# @param time_text [String] The formatted time range
# @param snippet [TranscriptSnippet] The snippet to format
# @return [String] The formatted entry
# @raise [NotImplementedError] always, unless overridden
def format_transcript_helper(index, time_text, snippet)
  message = "Subclass must implement ##{__method__}"
  raise NotImplementedError, message
end
155
+
156
+ private
157
+
158
# Convert seconds to a timestamp string.
#
# The time is rounded to whole milliseconds first and then split into
# components, so that rounding carries into the seconds, minutes and hours.
# Rounding the fractional part on its own could produce a milliseconds value
# of 1000 (e.g. 1.9996 s would render with ",1000" instead of rolling over
# to the next second).
#
# @param time [Float] Time in seconds
# @return [String] Formatted timestamp, as produced by #format_timestamp
def seconds_to_timestamp(time)
  total_ms = (time.to_f * 1000).round
  total_secs, ms = total_ms.divmod(1000)
  total_mins, secs = total_secs.divmod(60)
  hours, mins = total_mins.divmod(60)

  format_timestamp(hours, mins, secs, ms)
end
171
+ end
172
+
173
# Formats transcripts in the SRT (SubRip) subtitle format: numbered entries
# separated by blank lines, with a comma before the milliseconds.
#
# @example SRT format
#   1
#   00:00:00,000 --> 00:00:02,500
#   Hello world
#
#   2
#   00:00:02,500 --> 00:00:05,000
#   This is a test
#
class SRTFormatter < TextBasedFormatter
  protected

  # @return [String] timestamp in HH:MM:SS,mmm form
  def format_timestamp(hours, mins, secs, ms)
    "%02d:%02d:%02d,%03d" % [hours, mins, secs, ms]
  end

  # @param lines [Array<String>] the formatted entries
  # @return [String] entries separated by blank lines, ending with a newline
  def format_transcript_header(lines)
    "#{lines.join("\n\n")}\n"
  end

  # @param index [Integer] 0-based entry index; the output is numbered from 1
  # @return [String] sequence number, time range and text on separate lines
  def format_transcript_helper(index, time_text, snippet)
    sequence_number = index + 1
    "#{sequence_number}\n#{time_text}\n#{snippet.text}"
  end
end
199
+
200
# Formats transcripts in the WebVTT (Web Video Text Tracks) format: a
# "WEBVTT" header followed by unnumbered entries, with a dot before the
# milliseconds.
#
# @example WebVTT format
#   WEBVTT
#
#   00:00:00.000 --> 00:00:02.500
#   Hello world
#
#   00:00:02.500 --> 00:00:05.000
#   This is a test
#
class WebVTTFormatter < TextBasedFormatter
  protected

  # @return [String] timestamp in HH:MM:SS.mmm form
  def format_timestamp(hours, mins, secs, ms)
    "%02d:%02d:%02d.%03d" % [hours, mins, secs, ms]
  end

  # @param lines [Array<String>] the formatted entries
  # @return [String] the header, then the entries separated by blank lines,
  #   ending with a newline
  def format_transcript_header(lines)
    "WEBVTT\n\n#{lines.join("\n\n")}\n"
  end

  # The index is accepted but not used in the output.
  #
  # @return [String] time range and text on separate lines
  def format_transcript_helper(index, time_text, snippet)
    [time_text, "\n", snippet.text.to_s].join
  end
end
226
+
227
# Looks up and instantiates formatters by their type name.
class FormatterLoader
  # Mapping of format names to formatter classes. The key order is also the
  # order in which the names are listed in the UnknownFormatterType message.
  TYPES = {
    "json" => JSONFormatter,
    "pretty" => PrettyPrintFormatter,
    "text" => TextFormatter,
    "webvtt" => WebVTTFormatter,
    "srt" => SRTFormatter
  }.freeze

  # Error raised when an unknown formatter type is requested.
  class UnknownFormatterType < StandardError
    # @param formatter_type [String] the name that was not recognised
    def initialize(formatter_type)
      supported = TYPES.keys.join(", ")
      super("The format '#{formatter_type}' is not supported. Choose one of the following formats: #{supported}")
    end
  end

  # Load a formatter by type name.
  #
  # @param formatter_type [String] The formatter type (json, pretty, text, webvtt, srt);
  #   converted with #to_s before the lookup
  # @return [Formatter] A new instance of the requested formatter
  # @raise [UnknownFormatterType] If the formatter type is not supported
  #
  # @example
  #   loader = FormatterLoader.new
  #   formatter = loader.load("json")
  #   output = formatter.format_transcript(transcript)
  #
  def load(formatter_type = "pretty")
    type_name = formatter_type.to_s
    formatter_class = TYPES.fetch(type_name) { raise UnknownFormatterType, type_name }
    formatter_class.new
  end
end
266
+ end
267
+ end
268
+ end
269
+ end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Youtube
4
+ module Transcript
5
+ module Rb
6
# YouTube watch URL template.
# @example
#   format(WATCH_URL, video_id: "abc123")
#   # => "https://www.youtube.com/watch?v=abc123"
WATCH_URL = "https://www.youtube.com/watch?v=%<video_id>s"

# YouTube Innertube API URL template.
# @example
#   format(INNERTUBE_API_URL, api_key: "key123")
#   # => "https://www.youtube.com/youtubei/v1/player?key=key123"
INNERTUBE_API_URL = "https://www.youtube.com/youtubei/v1/player?key=%<api_key>s"

# Innertube API context for the Android client.
# Used in POST requests to the Innertube API.
#
# The nested "client" hash is frozen as well: freezing only the outer hash
# would still allow the shared client data to be mutated in place.
INNERTUBE_CONTEXT = {
  "client" => {
    "clientName" => "ANDROID",
    "clientVersion" => "20.10.38"
  }.freeze
}.freeze
26
+ end
27
+ end
28
+ end