youtube-transcript-rb 0.2.0 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -141,8 +141,7 @@ RSpec.describe YoutubeRb::Transcript::TranscriptList do
141
141
 
142
142
  describe "#each" do
143
143
  it "yields each transcript" do
144
- transcripts = []
145
- list.each { |t| transcripts << t }
144
+ transcripts = list.map { |t| t }
146
145
  expect(transcripts.length).to eq(3)
147
146
  end
148
147
 
@@ -196,22 +195,20 @@ RSpec.describe YoutubeRb::Transcript::TranscriptList do
196
195
  end
197
196
 
198
197
  it "tries language codes in order of priority" do
199
- transcript = list.find_transcript(["ja", "es", "en"])
198
+ transcript = list.find_transcript(%w[ja es en])
200
199
  expect(transcript.language_code).to eq("es")
201
200
  end
202
201
 
203
202
  it "raises NoTranscriptFound when no match" do
204
- expect {
205
- list.find_transcript(["ja", "ko", "zh"])
206
- }.to raise_error(YoutubeRb::Transcript::NoTranscriptFound)
203
+ expect do
204
+ list.find_transcript(%w[ja ko zh])
205
+ end.to raise_error(YoutubeRb::Transcript::NoTranscriptFound)
207
206
  end
208
207
 
209
208
  it "includes requested languages in error" do
210
- begin
211
- list.find_transcript(["ja", "ko"])
212
- rescue YoutubeRb::Transcript::NoTranscriptFound => e
213
- expect(e.requested_language_codes).to eq(["ja", "ko"])
214
- end
209
+ list.find_transcript(%w[ja ko])
210
+ rescue YoutubeRb::Transcript::NoTranscriptFound => e
211
+ expect(e.requested_language_codes).to eq(%w[ja ko])
215
212
  end
216
213
  end
217
214
 
@@ -231,15 +228,15 @@ RSpec.describe YoutubeRb::Transcript::TranscriptList do
231
228
  end
232
229
 
233
230
  it "does not return manually created transcripts" do
234
- expect {
231
+ expect do
235
232
  list.find_generated_transcript(["en"])
236
- }.to raise_error(YoutubeRb::Transcript::NoTranscriptFound)
233
+ end.to raise_error(YoutubeRb::Transcript::NoTranscriptFound)
237
234
  end
238
235
 
239
236
  it "raises NoTranscriptFound when no match" do
240
- expect {
237
+ expect do
241
238
  list.find_generated_transcript(["ja"])
242
- }.to raise_error(YoutubeRb::Transcript::NoTranscriptFound)
239
+ end.to raise_error(YoutubeRb::Transcript::NoTranscriptFound)
243
240
  end
244
241
  end
245
242
 
@@ -259,13 +256,13 @@ RSpec.describe YoutubeRb::Transcript::TranscriptList do
259
256
  end
260
257
 
261
258
  it "does not return generated transcripts" do
262
- expect {
259
+ expect do
263
260
  list.find_manually_created_transcript(["en-auto"])
264
- }.to raise_error(YoutubeRb::Transcript::NoTranscriptFound)
261
+ end.to raise_error(YoutubeRb::Transcript::NoTranscriptFound)
265
262
  end
266
263
 
267
264
  it "tries language codes in order" do
268
- transcript = list.find_manually_created_transcript(["ja", "es"])
265
+ transcript = list.find_manually_created_transcript(%w[ja es])
269
266
  expect(transcript.language_code).to eq("es")
270
267
  end
271
268
  end
@@ -85,7 +85,7 @@ RSpec.describe YoutubeRb::Transcript do
85
85
  end
86
86
 
87
87
  it "sets is_generated" do
88
- expect(transcript.is_generated).to eq(false)
88
+ expect(transcript.is_generated).to be(false)
89
89
  end
90
90
 
91
91
  it "initializes with empty snippets by default" do
@@ -131,13 +131,12 @@ RSpec.describe YoutubeRb::Transcript do
131
131
 
132
132
  it "iterates over snippets" do
133
133
  texts = transcript.map(&:text)
134
- expect(texts).to eq(["Hello", "World"])
134
+ expect(texts).to eq(%w[Hello World])
135
135
  end
136
136
 
137
137
  describe "#each" do
138
138
  it "yields each snippet" do
139
- yielded = []
140
- transcript.each { |s| yielded << s }
139
+ yielded = transcript.map { |s| s }
141
140
  expect(yielded).to eq([snippet1, snippet2])
142
141
  end
143
142
  end
@@ -261,7 +260,7 @@ RSpec.describe YoutubeRb::Transcript do
261
260
  end
262
261
 
263
262
  it "sets is_generated" do
264
- expect(transcript.is_generated).to eq(false)
263
+ expect(transcript.is_generated).to be(false)
265
264
  end
266
265
 
267
266
  it "sets translation_languages" do
@@ -293,15 +292,15 @@ RSpec.describe YoutubeRb::Transcript do
293
292
 
294
293
  describe "#translate" do
295
294
  it "raises NotTranslatable when not translatable" do
296
- expect {
295
+ expect do
297
296
  transcript_without_translations.translate("es")
298
- }.to raise_error(YoutubeRb::Transcript::NotTranslatable)
297
+ end.to raise_error(YoutubeRb::Transcript::NotTranslatable)
299
298
  end
300
299
 
301
300
  it "raises TranslationLanguageNotAvailable for unavailable language" do
302
- expect {
301
+ expect do
303
302
  transcript.translate("de")
304
- }.to raise_error(YoutubeRb::Transcript::TranslationLanguageNotAvailable)
303
+ end.to raise_error(YoutubeRb::Transcript::TranslationLanguageNotAvailable)
305
304
  end
306
305
 
307
306
  it "returns a new Transcript for available language" do
@@ -362,7 +361,7 @@ RSpec.describe YoutubeRb::Transcript do
362
361
  expect(result.video_id).to eq("test_video")
363
362
  expect(result.language).to eq("English")
364
363
  expect(result.language_code).to eq("en")
365
- expect(result.is_generated).to eq(false)
364
+ expect(result.is_generated).to be(false)
366
365
  end
367
366
 
368
367
  it "raises PoTokenRequired when URL contains &exp=xpe" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: youtube-transcript-rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - jeff.dean
@@ -60,6 +60,8 @@ extensions: []
60
60
  extra_rdoc_files: []
61
61
  files:
62
62
  - ".rspec"
63
+ - ".rubocop.yml"
64
+ - ".rubocop_todo.yml"
63
65
  - ".serena/.gitignore"
64
66
  - ".serena/memories/code_style_and_conventions.md"
65
67
  - ".serena/memories/project_overview.md"
@@ -71,16 +73,17 @@ files:
71
73
  - PLAN.md
72
74
  - README.md
73
75
  - Rakefile
76
+ - lib/youtube-transcript-rb.rb
77
+ - lib/youtube_rb/formatters.rb
74
78
  - lib/youtube_rb/transcript.rb
75
79
  - lib/youtube_rb/transcript/api.rb
76
80
  - lib/youtube_rb/transcript/errors.rb
77
- - lib/youtube_rb/transcript/formatters.rb
78
81
  - lib/youtube_rb/transcript/settings.rb
79
82
  - lib/youtube_rb/transcript/transcript.rb
80
83
  - lib/youtube_rb/transcript/transcript_list.rb
81
84
  - lib/youtube_rb/transcript/transcript_list_fetcher.rb
82
85
  - lib/youtube_rb/transcript/transcript_parser.rb
83
- - lib/youtube_rb/transcript/version.rb
86
+ - lib/youtube_rb/version.rb
84
87
  - sig/youtube_rb/transcript.rbs
85
88
  - spec/api_spec.rb
86
89
  - spec/errors_spec.rb
@@ -98,6 +101,7 @@ licenses:
98
101
  metadata:
99
102
  bug_tracker_uri: https://github.com/stadia/youtube-transcript-rb/issues
100
103
  documentation_uri: https://github.com/stadia/youtube-transcript-rb#readme
104
+ rubygems_mfa_required: 'true'
101
105
  rdoc_options: []
102
106
  require_paths:
103
107
  - lib
@@ -112,7 +116,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
112
116
  - !ruby/object:Gem::Version
113
117
  version: '0'
114
118
  requirements: []
115
- rubygems_version: 3.6.9
119
+ rubygems_version: 4.0.3
116
120
  specification_version: 4
117
121
  summary: Fetch YouTube video transcripts and subtitles
118
122
  test_files: []
@@ -1,267 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "json"
4
-
5
- module YoutubeRb
6
- module Transcript
7
- # Module containing all transcript formatters
8
- module Formatters
9
- # Base formatter class. All formatters should inherit from this class
10
- # and implement their own format_transcript and format_transcripts methods.
11
- class Formatter
12
- # Format a single transcript
13
- #
14
- # @param transcript [FetchedTranscript] The transcript to format
15
- # @param options [Hash] Additional formatting options
16
- # @return [String] The formatted transcript
17
- def format_transcript(transcript, **options)
18
- raise NotImplementedError, "Subclass must implement #format_transcript"
19
- end
20
-
21
- # Format multiple transcripts
22
- #
23
- # @param transcripts [Array<FetchedTranscript>] The transcripts to format
24
- # @param options [Hash] Additional formatting options
25
- # @return [String] The formatted transcripts
26
- def format_transcripts(transcripts, **options)
27
- raise NotImplementedError, "Subclass must implement #format_transcripts"
28
- end
29
- end
30
-
31
- # Formats transcript as pretty-printed Ruby data structures
32
- class PrettyPrintFormatter < Formatter
33
- # Format a single transcript as pretty-printed output
34
- #
35
- # @param transcript [FetchedTranscript] The transcript to format
36
- # @param options [Hash] Options passed to PP.pp
37
- # @return [String] Pretty-printed transcript data
38
- def format_transcript(transcript, **options)
39
- require "pp"
40
- PP.pp(transcript.to_raw_data, +"", options[:width] || 79)
41
- end
42
-
43
- # Format multiple transcripts as pretty-printed output
44
- #
45
- # @param transcripts [Array<FetchedTranscript>] The transcripts to format
46
- # @param options [Hash] Options passed to PP.pp
47
- # @return [String] Pretty-printed transcripts data
48
- def format_transcripts(transcripts, **options)
49
- require "pp"
50
- data = transcripts.map(&:to_raw_data)
51
- PP.pp(data, +"", options[:width] || 79)
52
- end
53
- end
54
-
55
- # Formats transcript as JSON
56
- class JSONFormatter < Formatter
57
- # Format a single transcript as JSON
58
- #
59
- # @param transcript [FetchedTranscript] The transcript to format
60
- # @param options [Hash] Options passed to JSON.generate (e.g., :indent, :space)
61
- # @return [String] JSON representation of the transcript
62
- def format_transcript(transcript, **options)
63
- JSON.generate(transcript.to_raw_data, options)
64
- end
65
-
66
- # Format multiple transcripts as JSON array
67
- #
68
- # @param transcripts [Array<FetchedTranscript>] The transcripts to format
69
- # @param options [Hash] Options passed to JSON.generate
70
- # @return [String] JSON array representation of the transcripts
71
- def format_transcripts(transcripts, **options)
72
- data = transcripts.map(&:to_raw_data)
73
- JSON.generate(data, options)
74
- end
75
- end
76
-
77
- # Formats transcript as plain text (text only, no timestamps)
78
- class TextFormatter < Formatter
79
- # Format a single transcript as plain text
80
- #
81
- # @param transcript [FetchedTranscript] The transcript to format
82
- # @param options [Hash] Unused options
83
- # @return [String] Plain text with each line separated by newlines
84
- def format_transcript(transcript, **options)
85
- transcript.map(&:text).join("\n")
86
- end
87
-
88
- # Format multiple transcripts as plain text
89
- #
90
- # @param transcripts [Array<FetchedTranscript>] The transcripts to format
91
- # @param options [Hash] Unused options
92
- # @return [String] Plain text with transcripts separated by triple newlines
93
- def format_transcripts(transcripts, **options)
94
- transcripts.map { |t| format_transcript(t, **options) }.join("\n\n\n")
95
- end
96
- end
97
-
98
- # Base class for timestamp-based formatters (SRT, WebVTT)
99
- class TextBasedFormatter < TextFormatter
100
- # Format a single transcript with timestamps
101
- #
102
- # @param transcript [FetchedTranscript] The transcript to format
103
- # @param options [Hash] Unused options
104
- # @return [String] Formatted transcript with timestamps
105
- def format_transcript(transcript, **options)
106
- lines = []
107
- snippets = transcript.to_a
108
-
109
- snippets.each_with_index do |snippet, i|
110
- end_time = snippet.start + snippet.duration
111
-
112
- # Use next snippet's start time if it starts before current end time
113
- if i < snippets.length - 1 && snippets[i + 1].start < end_time
114
- end_time = snippets[i + 1].start
115
- end
116
-
117
- time_text = "#{seconds_to_timestamp(snippet.start)} --> #{seconds_to_timestamp(end_time)}"
118
- lines << format_transcript_helper(i, time_text, snippet)
119
- end
120
-
121
- format_transcript_header(lines)
122
- end
123
-
124
- protected
125
-
126
- # Format a timestamp from components
127
- #
128
- # @param hours [Integer] Hours component
129
- # @param mins [Integer] Minutes component
130
- # @param secs [Integer] Seconds component
131
- # @param ms [Integer] Milliseconds component
132
- # @return [String] Formatted timestamp
133
- def format_timestamp(hours, mins, secs, ms)
134
- raise NotImplementedError, "Subclass must implement #format_timestamp"
135
- end
136
-
137
- # Format the transcript header/wrapper
138
- #
139
- # @param lines [Array<String>] The formatted lines
140
- # @return [String] The complete formatted transcript
141
- def format_transcript_header(lines)
142
- raise NotImplementedError, "Subclass must implement #format_transcript_header"
143
- end
144
-
145
- # Format a single transcript entry
146
- #
147
- # @param index [Integer] The entry index (0-based)
148
- # @param time_text [String] The formatted time range
149
- # @param snippet [TranscriptSnippet] The snippet to format
150
- # @return [String] The formatted entry
151
- def format_transcript_helper(index, time_text, snippet)
152
- raise NotImplementedError, "Subclass must implement #format_transcript_helper"
153
- end
154
-
155
- private
156
-
157
- # Convert seconds to timestamp string
158
- #
159
- # @param time [Float] Time in seconds
160
- # @return [String] Formatted timestamp
161
- def seconds_to_timestamp(time)
162
- time = time.to_f
163
- hours, remainder = time.divmod(3600)
164
- mins, secs_float = remainder.divmod(60)
165
- secs = secs_float.to_i
166
- ms = ((time - time.to_i) * 1000).round
167
-
168
- format_timestamp(hours.to_i, mins.to_i, secs, ms)
169
- end
170
- end
171
-
172
- # Formats transcript as SRT (SubRip) subtitle format
173
- #
174
- # @example SRT format
175
- # 1
176
- # 00:00:00,000 --> 00:00:02,500
177
- # Hello world
178
- #
179
- # 2
180
- # 00:00:02,500 --> 00:00:05,000
181
- # This is a test
182
- #
183
- class SRTFormatter < TextBasedFormatter
184
- protected
185
-
186
- def format_timestamp(hours, mins, secs, ms)
187
- format("%02d:%02d:%02d,%03d", hours, mins, secs, ms)
188
- end
189
-
190
- def format_transcript_header(lines)
191
- lines.join("\n\n") + "\n"
192
- end
193
-
194
- def format_transcript_helper(index, time_text, snippet)
195
- "#{index + 1}\n#{time_text}\n#{snippet.text}"
196
- end
197
- end
198
-
199
- # Formats transcript as WebVTT (Web Video Text Tracks) format
200
- #
201
- # @example WebVTT format
202
- # WEBVTT
203
- #
204
- # 00:00:00.000 --> 00:00:02.500
205
- # Hello world
206
- #
207
- # 00:00:02.500 --> 00:00:05.000
208
- # This is a test
209
- #
210
- class WebVTTFormatter < TextBasedFormatter
211
- protected
212
-
213
- def format_timestamp(hours, mins, secs, ms)
214
- format("%02d:%02d:%02d.%03d", hours, mins, secs, ms)
215
- end
216
-
217
- def format_transcript_header(lines)
218
- "WEBVTT\n\n" + lines.join("\n\n") + "\n"
219
- end
220
-
221
- def format_transcript_helper(index, time_text, snippet)
222
- "#{time_text}\n#{snippet.text}"
223
- end
224
- end
225
-
226
- # Utility class to load formatters by type name
227
- class FormatterLoader
228
- # Mapping of format names to formatter classes
229
- TYPES = {
230
- "json" => JSONFormatter,
231
- "pretty" => PrettyPrintFormatter,
232
- "text" => TextFormatter,
233
- "webvtt" => WebVTTFormatter,
234
- "srt" => SRTFormatter
235
- }.freeze
236
-
237
- # Error raised when an unknown formatter type is requested
238
- class UnknownFormatterType < StandardError
239
- def initialize(formatter_type)
240
- super(
241
- "The format '#{formatter_type}' is not supported. " \
242
- "Choose one of the following formats: #{TYPES.keys.join(", ")}"
243
- )
244
- end
245
- end
246
-
247
- # Load a formatter by type name
248
- #
249
- # @param formatter_type [String] The formatter type (json, pretty, text, webvtt, srt)
250
- # @return [Formatter] An instance of the requested formatter
251
- # @raise [UnknownFormatterType] If the formatter type is not supported
252
- #
253
- # @example
254
- # loader = FormatterLoader.new
255
- # formatter = loader.load("json")
256
- # output = formatter.format_transcript(transcript)
257
- #
258
- def load(formatter_type = "pretty")
259
- formatter_type = formatter_type.to_s
260
- raise UnknownFormatterType, formatter_type unless TYPES.key?(formatter_type)
261
-
262
- TYPES[formatter_type].new
263
- end
264
- end
265
- end
266
- end
267
- end