youtube-transcript-rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,435 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "spec_helper"
4
+ require "youtube/transcript/rb"
5
+
6
+ RSpec.describe Youtube::Transcript::Rb do
7
# Specs for TranslationLanguage: values passed to the constructor as keyword
# arguments must be readable back through the matching attribute readers.
describe Youtube::Transcript::Rb::TranslationLanguage do
  let(:spanish) do
    described_class.new(language: "Spanish", language_code: "es")
  end

  describe "#initialize" do
    it "sets the language" do
      name = spanish.language
      expect(name).to eq("Spanish")
    end

    it "sets the language_code" do
      code = spanish.language_code
      expect(code).to eq("es")
    end
  end
end
20
+
21
# Specs for TranscriptSnippet: attribute readers, Float coercion of the
# timing fields, and the string-keyed hash returned by #to_h.
describe Youtube::Transcript::Rb::TranscriptSnippet do
  let(:snippet) do
    described_class.new(text: "Hello world", start: 1.5, duration: 2.0)
  end

  describe "#initialize" do
    it "sets the text" do
      expect(snippet.text).to eq("Hello world")
    end

    it "sets the start time" do
      expect(snippet.start).to eq(1.5)
    end

    it "sets the duration" do
      expect(snippet.duration).to eq(2.0)
    end

    it "converts start to float" do
      # Start is given as a String and duration as an Integer on purpose.
      from_string = described_class.new(text: "test", start: "1.5", duration: 2)
      start_time = from_string.start

      expect(start_time).to eq(1.5)
      expect(start_time).to be_a(Float)
    end

    it "converts duration to float" do
      # Duration is given as a String and start as an Integer on purpose.
      from_string = described_class.new(text: "test", start: 1, duration: "2.5")
      length = from_string.duration

      expect(length).to eq(2.5)
      expect(length).to be_a(Float)
    end
  end

  describe "#to_h" do
    it "returns a hash representation" do
      as_hash = snippet.to_h
      expect(as_hash).to be_a(Hash)

      # Keys are Strings, not Symbols.
      { "text" => "Hello world", "start" => 1.5, "duration" => 2.0 }.each do |key, expected|
        expect(as_hash[key]).to eq(expected)
      end
    end
  end
end
60
+
61
# Specs for FetchedTranscript: metadata readers, snippet accumulation through
# #add_snippet, iteration (#each / #map / #select), indexing, #length / #size,
# #to_raw_data and the #generated? predicate.
describe Youtube::Transcript::Rb::FetchedTranscript do
  # Builds a FetchedTranscript with the English fixture metadata used by every
  # example; callers supply only what differs (video id, generated flag,
  # optional snippets).
  def build_fetched(video_id:, is_generated: false, **extra)
    described_class.new(
      video_id: video_id,
      language: "English",
      language_code: "en",
      is_generated: is_generated,
      **extra
    )
  end

  let(:transcript) { build_fetched(video_id: "test_video") }

  let(:hello_snippet) do
    Youtube::Transcript::Rb::TranscriptSnippet.new(text: "Hello", start: 0.0, duration: 1.5)
  end

  let(:world_snippet) do
    Youtube::Transcript::Rb::TranscriptSnippet.new(text: "World", start: 1.5, duration: 2.0)
  end

  describe "#initialize" do
    it "sets the video_id" do
      expect(transcript.video_id).to eq("test_video")
    end

    it "sets the language" do
      expect(transcript.language).to eq("English")
    end

    it "sets the language_code" do
      expect(transcript.language_code).to eq("en")
    end

    it "sets is_generated" do
      expect(transcript.is_generated).to eq(false)
    end

    it "initializes with empty snippets by default" do
      expect(transcript.snippets).to eq([])
    end

    it "can initialize with snippets" do
      populated = build_fetched(video_id: "test", snippets: [hello_snippet, world_snippet])
      expect(populated.snippets.length).to eq(2)
    end
  end

  describe "#add_snippet" do
    it "adds a snippet" do
      transcript.add_snippet(hello_snippet)
      stored = transcript.snippets

      expect(stored.length).to eq(1)
      expect(stored.first).to eq(hello_snippet)
    end

    it "returns self for chaining" do
      expect(transcript.add_snippet(hello_snippet)).to eq(transcript)
    end
  end

  describe "Enumerable" do
    before do
      [hello_snippet, world_snippet].each { |item| transcript.add_snippet(item) }
    end

    it "is enumerable" do
      %i[each map select].each do |method_name|
        expect(transcript).to respond_to(method_name)
      end
    end

    it "iterates over snippets" do
      spoken = transcript.map { |item| item.text }
      expect(spoken).to eq(["Hello", "World"])
    end

    describe "#each" do
      it "yields each snippet" do
        collected = []
        transcript.each { |item| collected.push(item) }
        expect(collected).to eq([hello_snippet, world_snippet])
      end
    end
  end

  describe "#[]" do
    before do
      [hello_snippet, world_snippet].each { |item| transcript.add_snippet(item) }
    end

    it "returns snippet by index" do
      expect(transcript[0]).to eq(hello_snippet)
      expect(transcript[1]).to eq(world_snippet)
    end

    it "supports negative indices" do
      expect(transcript[-1]).to eq(world_snippet)
    end
  end

  describe "#length" do
    it "returns 0 for empty transcript" do
      expect(transcript.length).to eq(0)
    end

    it "returns the number of snippets" do
      [hello_snippet, world_snippet].each { |item| transcript.add_snippet(item) }
      expect(transcript.length).to eq(2)
    end
  end

  describe "#size" do
    it "is an alias for length" do
      transcript.add_snippet(hello_snippet)
      expect(transcript.size).to eq(transcript.length)
    end
  end

  describe "#to_raw_data" do
    before do
      [hello_snippet, world_snippet].each { |item| transcript.add_snippet(item) }
    end

    it "returns an array of hashes" do
      raw = transcript.to_raw_data
      expect(raw).to be_an(Array)
      expect(raw.length).to eq(2)
    end

    it "contains snippet data as hashes" do
      first_row, second_row = transcript.to_raw_data.values_at(0, 1)

      expect(first_row).to eq({ "text" => "Hello", "start" => 0.0, "duration" => 1.5 })
      expect(second_row).to eq({ "text" => "World", "start" => 1.5, "duration" => 2.0 })
    end
  end

  describe "#generated?" do
    it "returns true when is_generated is true" do
      auto_generated = build_fetched(video_id: "test", is_generated: true)
      expect(auto_generated.generated?).to be true
    end

    it "returns false when is_generated is false" do
      expect(transcript.generated?).to be false
    end
  end
end
216
+
217
# Specs for Transcript: metadata readers, the translatable / generated
# predicates, #translate, #fetch (against a stubbed HTTP client) and #to_s.
describe Youtube::Transcript::Rb::Transcript do
  # Builds a Transcript for the "test_video" fixture. Only the URL and the
  # list of translation languages vary between examples.
  def build_transcript(translation_languages:, url: "https://www.youtube.com/api/timedtext?v=test_video")
    described_class.new(
      http_client: http_client,
      video_id: "test_video",
      url: url,
      language: "English",
      language_code: "en",
      is_generated: false,
      translation_languages: translation_languages
    )
  end

  # Makes the stubbed client answer every #get with a canned response.
  def stub_get(status:, body:)
    canned = double("Response", status: status, body: body)
    allow(http_client).to receive(:get).and_return(canned)
  end

  let(:http_client) { double("Faraday::Connection") }

  let(:translation_languages) do
    { "Spanish" => "es", "French" => "fr" }.map do |name, code|
      Youtube::Transcript::Rb::TranslationLanguage.new(language: name, language_code: code)
    end
  end

  let(:transcript) { build_transcript(translation_languages: translation_languages) }
  let(:transcript_without_translations) { build_transcript(translation_languages: []) }

  describe "#initialize" do
    it "sets the video_id" do
      expect(transcript.video_id).to eq("test_video")
    end

    it "sets the language" do
      expect(transcript.language).to eq("English")
    end

    it "sets the language_code" do
      expect(transcript.language_code).to eq("en")
    end

    it "sets is_generated" do
      expect(transcript.is_generated).to eq(false)
    end

    it "sets translation_languages" do
      expect(transcript.translation_languages.length).to eq(2)
    end
  end

  describe "#translatable?" do
    it "returns true when translation_languages is not empty" do
      expect(transcript.translatable?).to be true
    end

    it "returns false when translation_languages is empty" do
      expect(transcript_without_translations.translatable?).to be false
    end
  end

  describe "#is_translatable" do
    it "is an alias for translatable?" do
      expect(transcript.is_translatable).to eq(transcript.translatable?)
    end
  end

  describe "#generated?" do
    it "returns the value of is_generated" do
      expect(transcript.generated?).to be false
    end
  end

  describe "#translate" do
    it "raises NotTranslatable when not translatable" do
      expect do
        transcript_without_translations.translate("es")
      end.to raise_error(Youtube::Transcript::Rb::NotTranslatable)
    end

    it "raises TranslationLanguageNotAvailable for unavailable language" do
      # "de" is not among the fixture's translation languages (es, fr).
      expect do
        transcript.translate("de")
      end.to raise_error(Youtube::Transcript::Rb::TranslationLanguageNotAvailable)
    end

    it "returns a new Transcript for available language" do
      spanish = transcript.translate("es")

      expect(spanish).to be_a(described_class)
      expect(spanish.language_code).to eq("es")
      expect(spanish.language).to eq("Spanish")
    end

    it "appends tlang to URL" do
      # The spec reads the @url instance variable directly and checks that it
      # contains &tlang=fr.
      translated_url = transcript.translate("fr").instance_variable_get(:@url)
      expect(translated_url).to include("&tlang=fr")
    end

    it "marks translated transcript as generated" do
      expect(transcript.translate("es").is_generated).to be true
    end

    it "translated transcript has no translation languages" do
      expect(transcript.translate("es").translation_languages).to eq([])
    end
  end

  describe "#fetch" do
    let(:xml_response) do
      <<~XML
        <?xml version="1.0" encoding="utf-8" ?>
        <transcript>
        <text start="0.0" dur="2.5">Hello world</text>
        <text start="2.5" dur="3.0">This is a test</text>
        </transcript>
      XML
    end

    # Default: every request succeeds with the two-snippet document above.
    before { stub_get(status: 200, body: xml_response) }

    it "returns a FetchedTranscript" do
      expect(transcript.fetch).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
    end

    it "parses the transcript snippets" do
      fetched = transcript.fetch

      expect(fetched.length).to eq(2)
      expect(fetched[0].text).to eq("Hello world")
      expect(fetched[1].text).to eq("This is a test")
    end

    it "sets metadata on FetchedTranscript" do
      fetched = transcript.fetch

      expect(fetched.video_id).to eq("test_video")
      expect(fetched.language).to eq("English")
      expect(fetched.language_code).to eq("en")
      expect(fetched.is_generated).to eq(false)
    end

    it "raises PoTokenRequired when URL contains &exp=xpe" do
      po_transcript = build_transcript(
        translation_languages: [],
        url: "https://www.youtube.com/api/timedtext?v=test_video&exp=xpe"
      )

      expect { po_transcript.fetch }.to raise_error(Youtube::Transcript::Rb::PoTokenRequired)
    end

    context "when HTTP error occurs" do
      it "raises IpBlocked for 429 status" do
        stub_get(status: 429, body: "")
        expect { transcript.fetch }.to raise_error(Youtube::Transcript::Rb::IpBlocked)
      end

      # NOTE(review): only status 500 is exercised here; a 4xx example would
      # be needed to fully back the description. Confirm against the library.
      it "raises YouTubeRequestFailed for 4xx/5xx status" do
        stub_get(status: 500, body: "")
        expect { transcript.fetch }.to raise_error(Youtube::Transcript::Rb::YouTubeRequestFailed)
      end
    end

    context "with preserve_formatting option" do
      # The <b> tag is entity-escaped inside the XML payload.
      let(:xml_with_formatting) do
        <<~XML
          <?xml version="1.0" encoding="utf-8" ?>
          <transcript>
          <text start="0.0" dur="2.5">Hello &lt;b&gt;world&lt;/b&gt;</text>
          </transcript>
        XML
      end

      before { stub_get(status: 200, body: xml_with_formatting) }

      it "preserves formatting tags when preserve_formatting is true" do
        first_text = transcript.fetch(preserve_formatting: true)[0].text
        expect(first_text).to include("<b>")
      end

      it "strips formatting tags when preserve_formatting is false" do
        fetched = transcript.fetch(preserve_formatting: false)

        expect(fetched[0].text).not_to include("<b>")
        expect(fetched[0].text).to include("world")
      end
    end
  end

  describe "#to_s" do
    it "includes language_code and language" do
      rendered = transcript.to_s

      expect(rendered).to include("en")
      expect(rendered).to include("English")
    end

    it "includes [TRANSLATABLE] when translatable" do
      expect(transcript.to_s).to include("[TRANSLATABLE]")
    end

    it "does not include [TRANSLATABLE] when not translatable" do
      expect(transcript_without_translations.to_s).not_to include("[TRANSLATABLE]")
    end
  end
end
435
+ end
metadata ADDED
@@ -0,0 +1,118 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: youtube-transcript-rb
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - jeff.dean
8
+ bindir: exe
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: faraday
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: '2.0'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: '2.0'
26
+ - !ruby/object:Gem::Dependency
27
+ name: faraday-follow_redirects
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: '0.3'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '0.3'
40
+ - !ruby/object:Gem::Dependency
41
+ name: nokogiri
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '1.15'
47
+ type: :runtime
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '1.15'
54
+ description: A Ruby library to retrieve transcripts/subtitles for YouTube videos.
55
+ Port of the Python youtube-transcript-api.
56
+ email:
57
+ - stadia@gmail.com
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - ".rspec"
63
+ - ".serena/.gitignore"
64
+ - ".serena/memories/code_style_and_conventions.md"
65
+ - ".serena/memories/project_overview.md"
66
+ - ".serena/memories/suggested_commands.md"
67
+ - ".serena/memories/task_completion_checklist.md"
68
+ - ".serena/memories/tech_stack.md"
69
+ - ".serena/project.yml"
70
+ - LICENSE
71
+ - PLAN.md
72
+ - README.md
73
+ - Rakefile
74
+ - lib/youtube/transcript/rb.rb
75
+ - lib/youtube/transcript/rb/api.rb
76
+ - lib/youtube/transcript/rb/errors.rb
77
+ - lib/youtube/transcript/rb/formatters.rb
78
+ - lib/youtube/transcript/rb/settings.rb
79
+ - lib/youtube/transcript/rb/transcript.rb
80
+ - lib/youtube/transcript/rb/transcript_list.rb
81
+ - lib/youtube/transcript/rb/transcript_list_fetcher.rb
82
+ - lib/youtube/transcript/rb/transcript_parser.rb
83
+ - lib/youtube/transcript/rb/version.rb
84
+ - sig/youtube/transcript/rb.rbs
85
+ - spec/api_spec.rb
86
+ - spec/errors_spec.rb
87
+ - spec/formatters_spec.rb
88
+ - spec/integration_spec.rb
89
+ - spec/settings_spec.rb
90
+ - spec/spec_helper.rb
91
+ - spec/transcript_list_fetcher_spec.rb
92
+ - spec/transcript_list_spec.rb
93
+ - spec/transcript_parser_spec.rb
94
+ - spec/transcript_spec.rb
95
+ homepage: https://github.com/stadia/youtube-transcript-rb
96
+ licenses:
97
+ - MIT
98
+ metadata:
99
+ bug_tracker_uri: https://github.com/stadia/youtube-transcript-rb/issues
100
+ documentation_uri: https://github.com/stadia/youtube-transcript-rb#readme
101
+ rdoc_options: []
102
+ require_paths:
103
+ - lib
104
+ required_ruby_version: !ruby/object:Gem::Requirement
105
+ requirements:
106
+ - - ">="
107
+ - !ruby/object:Gem::Version
108
+ version: 3.2.0
109
+ required_rubygems_version: !ruby/object:Gem::Requirement
110
+ requirements:
111
+ - - ">="
112
+ - !ruby/object:Gem::Version
113
+ version: '0'
114
+ requirements: []
115
+ rubygems_version: 3.6.9
116
+ specification_version: 4
117
+ summary: Fetch YouTube video transcripts and subtitles
118
+ test_files: []