youtube-transcript-rb 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of youtube-transcript-rb might be problematic. Click here for more details.

Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +42 -42
  3. data/lib/youtube-transcript-rb.rb +3 -0
  4. data/lib/youtube_rb/transcript/api.rb +148 -0
  5. data/lib/youtube_rb/transcript/errors.rb +215 -0
  6. data/lib/youtube_rb/transcript/formatters.rb +267 -0
  7. data/lib/youtube_rb/transcript/settings.rb +26 -0
  8. data/lib/youtube_rb/transcript/transcript.rb +237 -0
  9. data/lib/youtube_rb/transcript/transcript_list.rb +168 -0
  10. data/lib/youtube_rb/transcript/transcript_list_fetcher.rb +223 -0
  11. data/lib/youtube_rb/transcript/transcript_parser.rb +81 -0
  12. data/lib/{youtube/transcript/rb → youtube_rb/transcript}/version.rb +2 -4
  13. data/lib/youtube_rb/transcript.rb +35 -0
  14. data/sig/youtube_rb/transcript.rbs +6 -0
  15. data/spec/api_spec.rb +20 -20
  16. data/spec/errors_spec.rb +39 -39
  17. data/spec/formatters_spec.rb +36 -36
  18. data/spec/integration_spec.rb +32 -32
  19. data/spec/settings_spec.rb +16 -16
  20. data/spec/spec_helper.rb +1 -1
  21. data/spec/transcript_list_fetcher_spec.rb +27 -27
  22. data/spec/transcript_list_spec.rb +6 -6
  23. data/spec/transcript_parser_spec.rb +3 -3
  24. data/spec/transcript_spec.rb +16 -16
  25. metadata +13 -12
  26. data/lib/youtube/transcript/rb/api.rb +0 -150
  27. data/lib/youtube/transcript/rb/errors.rb +0 -217
  28. data/lib/youtube/transcript/rb/formatters.rb +0 -269
  29. data/lib/youtube/transcript/rb/settings.rb +0 -28
  30. data/lib/youtube/transcript/rb/transcript.rb +0 -239
  31. data/lib/youtube/transcript/rb/transcript_list.rb +0 -170
  32. data/lib/youtube/transcript/rb/transcript_list_fetcher.rb +0 -225
  33. data/lib/youtube/transcript/rb/transcript_parser.rb +0 -83
  34. data/lib/youtube/transcript/rb.rb +0 -37
  35. data/sig/youtube/transcript/rb.rbs +0 -8
@@ -1,269 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "json"
4
-
5
- module Youtube
6
- module Transcript
7
- module Rb
# Namespace for all transcript formatters.
#
# A formatter turns a fetched transcript into a String. The formatters below
# rely on the transcript being Enumerable over snippets that respond to
# #text, #start and #duration, and (for the JSON / pretty-print formatters)
# on the transcript responding to #to_raw_data.
module Formatters
  # Abstract base class. Concrete formatters inherit from it and override
  # #format_transcript and #format_transcripts.
  class Formatter
    # Format a single transcript.
    #
    # @param transcript [FetchedTranscript] The transcript to format
    # @param options [Hash] Additional formatting options
    # @return [String] The formatted transcript
    # @raise [NotImplementedError] always, unless overridden by a subclass
    def format_transcript(transcript, **options)
      raise NotImplementedError, "Subclass must implement #format_transcript"
    end

    # Format multiple transcripts.
    #
    # @param transcripts [Array<FetchedTranscript>] The transcripts to format
    # @param options [Hash] Additional formatting options
    # @return [String] The formatted transcripts
    # @raise [NotImplementedError] always, unless overridden by a subclass
    def format_transcripts(transcripts, **options)
      raise NotImplementedError, "Subclass must implement #format_transcripts"
    end
  end

  # Formats transcripts as pretty-printed Ruby data structures.
  class PrettyPrintFormatter < Formatter
    # Output width used when the caller does not pass :width.
    DEFAULT_WIDTH = 79

    # Format a single transcript as pretty-printed output.
    #
    # @param transcript [FetchedTranscript] The transcript to format
    # @param options [Hash] Only :width is read (defaults to 79)
    # @return [String] Pretty-printed transcript data
    def format_transcript(transcript, **options)
      pp_dump(transcript.to_raw_data, options)
    end

    # Format multiple transcripts as pretty-printed output.
    #
    # @param transcripts [Array<FetchedTranscript>] The transcripts to format
    # @param options [Hash] Only :width is read (defaults to 79)
    # @return [String] Pretty-printed array with one entry per transcript
    def format_transcripts(transcripts, **options)
      pp_dump(transcripts.map(&:to_raw_data), options)
    end

    private

    # Pretty-print +data+ into a fresh, unfrozen String and return it.
    def pp_dump(data, options)
      require "pp" # loaded lazily, only when this formatter is actually used
      PP.pp(data, +"", options[:width] || DEFAULT_WIDTH)
    end
  end

  # Formats transcripts as JSON.
  class JSONFormatter < Formatter
    # Format a single transcript as JSON.
    #
    # @param transcript [FetchedTranscript] The transcript to format
    # @param options [Hash] Passed as the second argument to JSON.generate
    #   (e.g., :indent, :space)
    # @return [String] JSON representation of the transcript
    def format_transcript(transcript, **options)
      JSON.generate(transcript.to_raw_data, options)
    end

    # Format multiple transcripts as a JSON array.
    #
    # @param transcripts [Array<FetchedTranscript>] The transcripts to format
    # @param options [Hash] Passed as the second argument to JSON.generate
    # @return [String] JSON array with one element per transcript
    def format_transcripts(transcripts, **options)
      JSON.generate(transcripts.map(&:to_raw_data), options)
    end
  end

  # Formats transcripts as plain text (text only, no timestamps).
  class TextFormatter < Formatter
    # Format a single transcript as plain text.
    #
    # @param transcript [FetchedTranscript] The transcript to format
    # @param options [Hash] Unused
    # @return [String] Snippet texts joined by single newlines
    def format_transcript(transcript, **options)
      transcript.map(&:text).join("\n")
    end

    # Format multiple transcripts as plain text.
    #
    # Each transcript is rendered with #format_transcript, so subclasses that
    # override #format_transcript get this method for free.
    #
    # @param transcripts [Array<FetchedTranscript>] The transcripts to format
    # @param options [Hash] Forwarded to #format_transcript
    # @return [String] Transcripts separated by triple newlines
    def format_transcripts(transcripts, **options)
      transcripts.map { |t| format_transcript(t, **options) }.join("\n\n\n")
    end
  end

  # Base class for timestamp-based formatters (SRT, WebVTT).
  #
  # Subclasses supply #format_timestamp, #format_transcript_header and
  # #format_transcript_helper; this class drives the iteration and the
  # seconds-to-components conversion.
  class TextBasedFormatter < TextFormatter
    # Format a single transcript with timestamps.
    #
    # @param transcript [FetchedTranscript] The transcript to format
    # @param options [Hash] Unused
    # @return [String] Formatted transcript with timestamps
    def format_transcript(transcript, **options)
      snippets = transcript.to_a

      lines = snippets.each_with_index.map do |snippet, i|
        end_time = snippet.start + snippet.duration

        # Clamp the cue so it never overlaps the following one.
        following = snippets[i + 1]
        end_time = following.start if following && following.start < end_time

        time_text = "#{seconds_to_timestamp(snippet.start)} --> #{seconds_to_timestamp(end_time)}"
        format_transcript_helper(i, time_text, snippet)
      end

      format_transcript_header(lines)
    end

    protected

    # Format a timestamp from components.
    #
    # @param hours [Integer] Hours component
    # @param mins [Integer] Minutes component
    # @param secs [Integer] Seconds component
    # @param ms [Integer] Milliseconds component
    # @return [String] Formatted timestamp
    def format_timestamp(hours, mins, secs, ms)
      raise NotImplementedError, "Subclass must implement #format_timestamp"
    end

    # Format the transcript header/wrapper.
    #
    # @param lines [Array<String>] The formatted entries
    # @return [String] The complete formatted transcript
    def format_transcript_header(lines)
      raise NotImplementedError, "Subclass must implement #format_transcript_header"
    end

    # Format a single transcript entry.
    #
    # @param index [Integer] The entry index (0-based)
    # @param time_text [String] The formatted time range
    # @param snippet [TranscriptSnippet] The snippet to format
    # @return [String] The formatted entry
    def format_transcript_helper(index, time_text, snippet)
      raise NotImplementedError, "Subclass must implement #format_transcript_helper"
    end

    private

    # Convert seconds to a timestamp string via #format_timestamp.
    #
    # The value is rounded to whole milliseconds *first* and then split with
    # integer arithmetic. Rounding only the fractional part (as was done
    # previously) turns e.g. 1.9996 s into secs=1, ms=1000, which renders as
    # the invalid timestamp "00:00:01,1000"; rounding up front lets the carry
    # propagate into seconds, minutes and hours.
    #
    # @param time [Float] Time in seconds
    # @return [String] Formatted timestamp
    def seconds_to_timestamp(time)
      total_ms = (time.to_f * 1000).round
      total_secs, ms = total_ms.divmod(1000)
      total_mins, secs = total_secs.divmod(60)
      hours, mins = total_mins.divmod(60)

      format_timestamp(hours, mins, secs, ms)
    end
  end

  # Formats transcripts as SRT (SubRip) subtitles.
  #
  # @example SRT format
  #   1
  #   00:00:00,000 --> 00:00:02,500
  #   Hello world
  #
  #   2
  #   00:00:02,500 --> 00:00:05,000
  #   This is a test
  #
  class SRTFormatter < TextBasedFormatter
    protected

    # SRT separates seconds and milliseconds with a comma.
    def format_timestamp(hours, mins, secs, ms)
      format("%02d:%02d:%02d,%03d", hours, mins, secs, ms)
    end

    # Entries are separated by a blank line; the output ends with a newline.
    def format_transcript_header(lines)
      lines.join("\n\n") + "\n"
    end

    # SRT entries are numbered starting from 1.
    def format_transcript_helper(index, time_text, snippet)
      "#{index + 1}\n#{time_text}\n#{snippet.text}"
    end
  end

  # Formats transcripts as WebVTT (Web Video Text Tracks).
  #
  # @example WebVTT format
  #   WEBVTT
  #
  #   00:00:00.000 --> 00:00:02.500
  #   Hello world
  #
  #   00:00:02.500 --> 00:00:05.000
  #   This is a test
  #
  class WebVTTFormatter < TextBasedFormatter
    protected

    # WebVTT separates seconds and milliseconds with a period.
    def format_timestamp(hours, mins, secs, ms)
      format("%02d:%02d:%02d.%03d", hours, mins, secs, ms)
    end

    # Prefix the entries with the "WEBVTT" header line.
    def format_transcript_header(lines)
      "WEBVTT\n\n" + lines.join("\n\n") + "\n"
    end

    # This formatter emits no cue identifiers, so the index is ignored.
    def format_transcript_helper(_index, time_text, snippet)
      "#{time_text}\n#{snippet.text}"
    end
  end

  # Utility class to load formatters by type name.
  class FormatterLoader
    # Mapping of format names to formatter classes.
    TYPES = {
      "json" => JSONFormatter,
      "pretty" => PrettyPrintFormatter,
      "text" => TextFormatter,
      "webvtt" => WebVTTFormatter,
      "srt" => SRTFormatter
    }.freeze

    # Error raised when an unknown formatter type is requested.
    class UnknownFormatterType < StandardError
      # @param formatter_type [String] the name that was requested
      def initialize(formatter_type)
        super(
          "The format '#{formatter_type}' is not supported. " \
          "Choose one of the following formats: #{TYPES.keys.join(", ")}"
        )
      end
    end

    # Load a formatter by type name.
    #
    # @param formatter_type [String, #to_s] The formatter type
    #   (json, pretty, text, webvtt, srt); converted with #to_s before lookup
    # @return [Formatter] A new instance of the requested formatter
    # @raise [UnknownFormatterType] If the formatter type is not supported
    #
    # @example
    #   loader = FormatterLoader.new
    #   formatter = loader.load("json")
    #   output = formatter.format_transcript(transcript)
    #
    def load(formatter_type = "pretty")
      formatter_type = formatter_type.to_s
      raise UnknownFormatterType, formatter_type unless TYPES.key?(formatter_type)

      TYPES[formatter_type].new
    end
  end
end
267
- end
268
- end
269
- end
@@ -1,28 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Youtube
4
- module Transcript
5
- module Rb
# YouTube watch URL template; fill in with Kernel#format.
# @example
#   format(WATCH_URL, video_id: "abc123")
#   # => "https://www.youtube.com/watch?v=abc123"
WATCH_URL = "https://www.youtube.com/watch?v=%<video_id>s"

# YouTube Innertube API URL template; fill in with Kernel#format.
# @example
#   format(INNERTUBE_API_URL, api_key: "key123")
#   # => "https://www.youtube.com/youtubei/v1/player?key=key123"
INNERTUBE_API_URL = "https://www.youtube.com/youtubei/v1/player?key=%<api_key>s"

# Innertube API context for the Android client.
# Used in POST requests to the Innertube API.
#
# Both the outer and the nested hash are frozen: freezing only the outer
# hash would still let callers mutate the shared "client" entry in place.
INNERTUBE_CONTEXT = {
  "client" => {
    "clientName" => "ANDROID",
    "clientVersion" => "20.10.38"
  }.freeze
}.freeze
26
- end
27
- end
28
- end
@@ -1,239 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Youtube
4
- module Transcript
5
- module Rb
# A language a transcript can be translated into: a human-readable name
# paired with its language code.
class TranslationLanguage
  # @!attribute [r] language
  #   @return [String] the language name (e.g., "Spanish")
  # @!attribute [r] language_code
  #   @return [String] the language code (e.g., "es")
  attr_reader :language, :language_code

  # Build a translation language entry.
  #
  # @param language [String] the language name
  # @param language_code [String] the language code
  def initialize(language:, language_code:)
    @language, @language_code = language, language_code
  end
end
21
-
# One segment of a transcript: a piece of text plus when it starts and how
# long it lasts.
class TranscriptSnippet
  # @!attribute [r] text
  #   @return [String] the text content of the snippet
  # @!attribute [r] start
  #   @return [Float] the start time in seconds
  # @!attribute [r] duration
  #   @return [Float] the duration in seconds
  attr_reader :text, :start, :duration

  # Build a snippet. +start+ and +duration+ are coerced with #to_f.
  #
  # @param text [String] the text content
  # @param start [Float] the start time in seconds
  # @param duration [Float] the duration in seconds
  def initialize(text:, start:, duration:)
    @text = text
    @start, @duration = [start, duration].map(&:to_f)
  end

  # Hash form of the snippet, keyed by the strings "text", "start" and
  # "duration".
  #
  # @return [Hash] hash with text, start, and duration keys
  def to_h
    { "text" => text, "start" => start, "duration" => duration }
  end
end
52
-
# A fetched transcript: metadata about the video/language plus the ordered
# list of snippets. Includes Enumerable, iterating over the snippets.
class FetchedTranscript
  include Enumerable

  # @!attribute [r] video_id
  #   @return [String] the video ID
  # @!attribute [r] language
  #   @return [String] the language name (e.g., "English")
  # @!attribute [r] language_code
  #   @return [String] the language code (e.g., "en")
  # @!attribute [r] is_generated
  #   @return [Boolean] whether the transcript was auto-generated
  # @!attribute [r] snippets
  #   @return [Array<TranscriptSnippet>] the transcript snippets
  attr_reader :video_id, :language, :language_code, :is_generated, :snippets

  # @param video_id [String] the YouTube video ID
  # @param language [String] the language name
  # @param language_code [String] the language code
  # @param is_generated [Boolean] whether auto-generated
  # @param snippets [Array<TranscriptSnippet>] the snippets (optional);
  #   the given array is stored as-is, not copied
  def initialize(video_id:, language:, language_code:, is_generated:, snippets: [])
    @video_id = video_id
    @language, @language_code = language, language_code
    @is_generated = is_generated
    @snippets = snippets
  end

  # Append a snippet to the transcript.
  #
  # @param snippet [TranscriptSnippet] the snippet to add
  # @return [self] to allow chaining
  def add_snippet(snippet)
    @snippets.push(snippet)
    self
  end

  # Iterate over the snippets in order.
  #
  # @yield [TranscriptSnippet] each snippet in the transcript
  def each(&blk)
    @snippets.each(&blk)
  end

  # Look up a snippet by position.
  #
  # @param index [Integer] the index
  # @return [TranscriptSnippet] the snippet at the given index
  def [](index)
    @snippets[index]
  end

  # Number of snippets in the transcript.
  #
  # @return [Integer] the count of snippets
  def length
    @snippets.length
  end
  alias size length

  # Raw data form: each snippet converted with #to_h.
  #
  # @return [Array<Hash>] array of snippet hashes
  def to_raw_data
    @snippets.map { |snippet| snippet.to_h }
  end

  # Whether the transcript was auto-generated.
  #
  # @return [Boolean]
  def generated?
    @is_generated
  end
end
126
-
# Metadata for one available transcript of a video, with the ability to
# download it (#fetch) or derive a translated variant (#translate).
class Transcript
  # @!attribute [r] video_id
  #   @return [String] the video ID
  # @!attribute [r] language
  #   @return [String] the language name
  # @!attribute [r] language_code
  #   @return [String] the language code
  # @!attribute [r] is_generated
  #   @return [Boolean] whether auto-generated
  # @!attribute [r] translation_languages
  #   @return [Array<TranslationLanguage>] available translation languages
  attr_reader :video_id, :language, :language_code, :is_generated, :translation_languages

  # @param http_client [Faraday::Connection] the HTTP client
  # @param video_id [String] the YouTube video ID
  # @param url [String] the transcript URL
  # @param language [String] the language name
  # @param language_code [String] the language code
  # @param is_generated [Boolean] whether auto-generated
  # @param translation_languages [Array<TranslationLanguage>] available translations
  def initialize(http_client:, video_id:, url:, language:, language_code:, is_generated:, translation_languages:)
    @http_client = http_client
    @video_id = video_id
    @url = url
    @language = language
    @language_code = language_code
    @is_generated = is_generated
    @translation_languages = translation_languages

    # Lookup table: language code => language name.
    @translation_languages_dict = {}
    translation_languages.each do |entry|
      @translation_languages_dict[entry.language_code] = entry.language
    end
  end

  # Download and parse the transcript.
  #
  # @param preserve_formatting [Boolean] whether to preserve HTML formatting
  # @return [FetchedTranscript] the fetched transcript
  # @raise [PoTokenRequired] if the URL contains "&exp=xpe"
  # @raise [IpBlocked] if the response status is 429
  # @raise [YouTubeRequestFailed] for any other status in 400..599
  def fetch(preserve_formatting: false)
    raise PoTokenRequired, @video_id if @url.include?("&exp=xpe")

    reply = @http_client.get(@url)
    raise_http_errors(reply)

    parsed = TranscriptParser.new(preserve_formatting: preserve_formatting).parse(reply.body)

    FetchedTranscript.new(
      video_id: @video_id,
      language: @language,
      language_code: @language_code,
      is_generated: @is_generated,
      snippets: parsed
    )
  end

  # Whether at least one translation language is available.
  #
  # @return [Boolean]
  def translatable?
    !@translation_languages.empty?
  end
  alias_method :is_translatable, :translatable?

  # Derive a Transcript for a translated version of this one.
  #
  # The new transcript reuses the HTTP client and video ID, appends
  # "&tlang=<code>" to the URL, is marked as generated and has no further
  # translation languages.
  #
  # @param language_code [String] the target language code
  # @return [Transcript] a new Transcript object for the translated version
  # @raise [NotTranslatable] if the transcript cannot be translated
  # @raise [TranslationLanguageNotAvailable] if the language is not available
  def translate(language_code)
    raise NotTranslatable, @video_id unless translatable?
    raise TranslationLanguageNotAvailable, @video_id unless @translation_languages_dict.key?(language_code)

    target_name = @translation_languages_dict[language_code]
    target_url = "#{@url}&tlang=#{language_code}"

    Transcript.new(
      http_client: @http_client,
      video_id: @video_id,
      url: target_url,
      language: target_name,
      language_code: language_code,
      is_generated: true,
      translation_languages: []
    )
  end

  # Whether the transcript was auto-generated.
  #
  # @return [Boolean]
  def generated?
    @is_generated
  end

  # Short description: language code, quoted language name and a
  # "[TRANSLATABLE]" marker when translations are available.
  #
  # @return [String]
  def to_s
    label = "#{@language_code} (\"#{@language}\")"
    translatable? ? "#{label}[TRANSLATABLE]" : label
  end

  private

  # Map HTTP error statuses onto the library's exceptions; returns nil for
  # any status outside 400..599.
  def raise_http_errors(response)
    status = response.status

    if status == 429
      raise IpBlocked, @video_id
    elsif (400..599).cover?(status)
      raise YouTubeRequestFailed.new(@video_id, StandardError.new("HTTP #{status}"))
    end
  end
end
237
- end
238
- end
239
- end