youtube-transcript-rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,83 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+ require "cgi"
5
+
6
+ module Youtube
7
+ module Transcript
8
+ module Rb
9
# Parses XML transcript data from YouTube into TranscriptSnippet objects.
#
# Each <text> element's content is HTML-unescaped and then stripped of
# tags. With preserve_formatting enabled, the tags listed in
# FORMATTING_TAGS survive the stripping step; every other tag is removed.
class TranscriptParser
  # HTML formatting tags to preserve when preserve_formatting is enabled
  FORMATTING_TAGS = %w[
    strong
    em
    b
    i
    mark
    small
    del
    ins
    sub
    sup
  ].freeze

  # Matches any HTML tag. The pattern contains no letters, so no
  # case-insensitivity flag is needed.
  ALL_TAGS_REGEX = /<[^>]*>/.freeze

  # Matches any HTML tag whose name is NOT one of FORMATTING_TAGS.
  # The negative lookahead rejects both the opening (<b>) and the closing
  # (</b>) form; \b keeps longer names such as <br> from being mistaken
  # for <b>. Tag names are compared case-insensitively.
  NON_FORMATTING_TAGS_REGEX = Regexp.new(
    "</?(?!/?(?:#{FORMATTING_TAGS.join("|")})\\b)[^>]*>",
    Regexp::IGNORECASE
  ).freeze

  # @param preserve_formatting [Boolean] whether to preserve HTML formatting tags
  def initialize(preserve_formatting: false)
    @preserve_formatting = preserve_formatting
    @html_regex = build_html_regex
  end

  # Parse XML transcript data into TranscriptSnippet objects.
  #
  # <text> elements whose content is nil or empty are skipped. A missing
  # "dur" attribute is treated as a duration of 0.0.
  #
  # @param raw_data [String] the raw XML data from YouTube
  # @return [Array<TranscriptSnippet>] parsed transcript snippets
  def parse(raw_data)
    Nokogiri::XML(raw_data).xpath("//text").filter_map do |element|
      raw_text = element.text
      # `next` yields nil, which filter_map drops from the result.
      next if raw_text.nil? || raw_text.empty?

      TranscriptSnippet.new(
        text: process_text(raw_text),
        start: element["start"].to_f,
        duration: (element["dur"] || "0.0").to_f
      )
    end
  end

  private

  # Select the precompiled regex used to remove HTML tags.
  # @return [Regexp] NON_FORMATTING_TAGS_REGEX when formatting is preserved,
  #   ALL_TAGS_REGEX otherwise
  def build_html_regex
    @preserve_formatting ? NON_FORMATTING_TAGS_REGEX : ALL_TAGS_REGEX
  end

  # Process text by unescaping HTML entities and removing unwanted tags.
  #
  # Unescaping happens first, so escaped markup such as &lt;b&gt; is
  # turned into a real tag and is then subject to the tag filter.
  #
  # @param text [String] the raw text
  # @return [String] processed text
  def process_text(text)
    CGI.unescapeHTML(text).gsub(@html_regex, "")
  end
end
81
+ end
82
+ end
83
+ end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
module Youtube
  module Transcript
    module Rb
      # Release version of the youtube-transcript-rb gem.
      VERSION = "0.1.0"
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "rb/version"
4
+ require_relative "rb/settings"
5
+ require_relative "rb/errors"
6
+ require_relative "rb/transcript_parser"
7
+ require_relative "rb/transcript"
8
+ require_relative "rb/transcript_list"
9
+ require_relative "rb/transcript_list_fetcher"
10
+ require_relative "rb/api"
11
+ require_relative "rb/formatters"
12
+
13
module Youtube
  module Transcript
    # Module-level shortcuts that build a throwaway YouTubeTranscriptApi
    # and delegate a single call to it.
    module Rb
      # Fetch a transcript for one video using a fresh API instance.
      #
      # @param video_id [String] YouTube video ID
      # @param languages [Array<String>] Language codes in order of preference
      # @param preserve_formatting [Boolean] Whether to preserve HTML formatting
      # @return [FetchedTranscript] The fetched transcript
      def self.fetch(video_id, languages: ["en"], preserve_formatting: false)
        YouTubeTranscriptApi.new.fetch(
          video_id,
          languages: languages,
          preserve_formatting: preserve_formatting
        )
      end

      # List the transcripts available for one video using a fresh API instance.
      #
      # @param video_id [String] YouTube video ID
      # @return [TranscriptList] List of available transcripts
      def self.list(video_id)
        YouTubeTranscriptApi.new.list(video_id)
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
module Youtube
  module Transcript
    module Rb
      # Type signature for the gem's version constant.
      VERSION: String
      # See the writing guide of rbs: https://github.com/ruby/rbs#guides
    end
  end
end
data/spec/api_spec.rb ADDED
@@ -0,0 +1,397 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "spec_helper"
4
+ require "webmock/rspec"
5
+
6
# Exercises YouTubeTranscriptApi (#fetch, #list, #fetch_all), the
# module-level convenience wrappers, and the default HTTP client settings.
# All network traffic is replaced by WebMock stubs.
RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
  let(:api) { described_class.new }
  let(:video_id) { "dQw4w9WgXcQ" }
  let(:api_key) { "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8" }

  let(:watch_url) { "https://www.youtube.com/watch?v=#{video_id}" }
  let(:innertube_url) { "https://www.youtube.com/youtubei/v1/player?key=#{api_key}" }
  let(:transcript_url) { "https://www.youtube.com/api/timedtext?v=#{video_id}&lang=en" }

  # Watch-page HTML that embeds the innertube API key.
  let(:sample_html) do
    <<~HTML
      <!DOCTYPE html>
      <html>
      <head><title>Test Video</title></head>
      <body>
      <script>
      var ytcfg = {"INNERTUBE_API_KEY": "#{api_key}"};
      </script>
      </body>
      </html>
    HTML
  end

  # Player response advertising an English and a Spanish caption track.
  let(:sample_innertube_response) do
    {
      "playabilityStatus" => { "status" => "OK" },
      "captions" => {
        "playerCaptionsTracklistRenderer" => {
          "captionTracks" => [
            caption_track(transcript_url, "English", "en", true),
            caption_track("https://www.youtube.com/api/timedtext?v=#{video_id}&lang=es", "Spanish", "es", false)
          ],
          "translationLanguages" => [
            { "languageCode" => "fr", "languageName" => { "runs" => [{ "text" => "French" }] } }
          ]
        }
      }
    }
  end

  # Player response for a video that cannot be played.
  let(:unavailable_response) do
    {
      "playabilityStatus" => {
        "status" => "ERROR",
        "reason" => "This video is unavailable"
      }
    }
  end

  let(:sample_transcript_xml) do
    <<~XML
      <?xml version="1.0" encoding="utf-8"?>
      <transcript>
        <text start="0.0" dur="2.5">Hello world</text>
        <text start="2.5" dur="3.0">This is a test</text>
        <text start="5.5" dur="2.0">Thank you</text>
      </transcript>
    XML
  end

  # Builds one captionTracks entry of a player response.
  def caption_track(base_url, label, language_code, translatable)
    {
      "baseUrl" => base_url,
      "name" => { "runs" => [{ "text" => label }] },
      "languageCode" => language_code,
      "isTranslatable" => translatable
    }
  end

  # Stubs a GET of a watch page with the sample HTML.
  def stub_watch_page(url = watch_url)
    stub_request(:get, url).to_return(status: 200, body: sample_html)
  end

  # Stubs the innertube player POST with the given payload, sent as JSON.
  def stub_innertube(payload = sample_innertube_response)
    stub_request(:post, innertube_url).to_return(status: 200, body: payload.to_json)
  end

  # Stubs a GET of a timedtext URL with the given XML body.
  def stub_timedtext(url = transcript_url, xml = sample_transcript_xml)
    stub_request(:get, url).to_return(status: 200, body: xml)
  end

  describe "#initialize" do
    it "creates a default HTTP client when none provided" do
      instance = described_class.new
      expect(instance.instance_variable_get(:@http_client)).to be_a(Faraday::Connection)
    end

    it "accepts a custom HTTP client" do
      custom_client = Faraday.new
      instance = described_class.new(http_client: custom_client)
      expect(instance.instance_variable_get(:@http_client)).to eq(custom_client)
    end

    it "accepts a proxy configuration" do
      proxy_config = double("proxy_config")
      instance = described_class.new(proxy_config: proxy_config)
      expect(instance.instance_variable_get(:@proxy_config)).to eq(proxy_config)
    end

    it "creates a TranscriptListFetcher" do
      instance = described_class.new
      expect(instance.instance_variable_get(:@fetcher)).to be_a(Youtube::Transcript::Rb::TranscriptListFetcher)
    end
  end

  describe "#fetch" do
    before do
      stub_watch_page
      stub_innertube
      stub_timedtext
    end

    it "returns a FetchedTranscript" do
      expect(api.fetch(video_id)).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
    end

    it "fetches the transcript with correct video_id" do
      expect(api.fetch(video_id).video_id).to eq(video_id)
    end

    it "fetches the transcript with correct language" do
      transcript = api.fetch(video_id, languages: ["en"])
      expect(transcript.language_code).to eq("en")
      expect(transcript.language).to eq("English")
    end

    it "contains transcript snippets" do
      transcript = api.fetch(video_id)
      expect(transcript.length).to eq(3)
      expect(transcript.first.text).to eq("Hello world")
    end

    it "respects language preference order" do
      stub_timedtext("https://www.youtube.com/api/timedtext?v=#{video_id}&lang=es")

      transcript = api.fetch(video_id, languages: ["es", "en"])
      expect(transcript.language_code).to eq("es")
    end

    it "falls back to next language if first not available" do
      transcript = api.fetch(video_id, languages: ["ja", "en"])
      expect(transcript.language_code).to eq("en")
    end

    it "raises NoTranscriptFound when no language matches" do
      expect do
        api.fetch(video_id, languages: ["ja", "ko", "zh"])
      end.to raise_error(Youtube::Transcript::Rb::NoTranscriptFound)
    end

    context "with preserve_formatting option" do
      let(:formatted_transcript_xml) do
        <<~XML
          <?xml version="1.0" encoding="utf-8"?>
          <transcript>
            <text start="0.0" dur="2.5">Hello &lt;b&gt;world&lt;/b&gt;</text>
            <text start="2.5" dur="3.0">This is &lt;i&gt;important&lt;/i&gt;</text>
          </transcript>
        XML
      end

      # Declared after the outer hook's stub for the same URL, so this
      # body is the one WebMock serves.
      before { stub_timedtext(transcript_url, formatted_transcript_xml) }

      it "preserves formatting when requested" do
        first_text = api.fetch(video_id, preserve_formatting: true).first.text
        expect(first_text).to include("<b>")
        expect(first_text).to eq("Hello <b>world</b>")
      end

      it "removes formatting by default" do
        first_text = api.fetch(video_id, preserve_formatting: false).first.text
        expect(first_text).not_to include("<b>")
        expect(first_text).to eq("Hello world")
      end
    end
  end

  describe "#list" do
    before do
      stub_watch_page
      stub_innertube
    end

    it "returns a TranscriptList" do
      expect(api.list(video_id)).to be_a(Youtube::Transcript::Rb::TranscriptList)
    end

    it "returns a list with the correct video_id" do
      expect(api.list(video_id).video_id).to eq(video_id)
    end

    it "includes all available transcripts" do
      expect(api.list(video_id).count).to eq(2)
    end

    it "allows finding specific transcripts" do
      transcript = api.list(video_id).find_transcript(["en"])
      expect(transcript.language_code).to eq("en")
    end

    context "when video is unavailable" do
      before { stub_innertube(unavailable_response) }

      it "raises VideoUnavailable error" do
        expect { api.list(video_id) }.to raise_error(Youtube::Transcript::Rb::VideoUnavailable)
      end
    end

    context "when transcripts are disabled" do
      before do
        stub_innertube(
          "playabilityStatus" => { "status" => "OK" },
          "captions" => {}
        )
      end

      it "raises TranscriptsDisabled error" do
        expect { api.list(video_id) }.to raise_error(Youtube::Transcript::Rb::TranscriptsDisabled)
      end
    end
  end

  describe "#fetch_all" do
    let(:video_ids) { ["video1", "video2", "video3"] }

    before do
      # NOTE(review): every iteration registers a stub for the same
      # innertube URL. WebMock applies the most recently declared matching
      # stub, so all videos appear to receive the caption track of the last
      # id. Confirm whether the stubs should be told apart by request body.
      video_ids.each do |vid|
        timedtext_url = "https://www.youtube.com/api/timedtext?v=#{vid}&lang=en"

        stub_watch_page("https://www.youtube.com/watch?v=#{vid}")
        stub_innertube(
          "playabilityStatus" => { "status" => "OK" },
          "captions" => {
            "playerCaptionsTracklistRenderer" => {
              "captionTracks" => [caption_track(timedtext_url, "English", "en", false)],
              "translationLanguages" => []
            }
          }
        )
        stub_timedtext(timedtext_url)
      end
    end

    it "returns a hash of transcripts" do
      results = api.fetch_all(video_ids)
      expect(results).to be_a(Hash)
      expect(results.keys).to match_array(video_ids)
    end

    it "fetches all video transcripts" do
      api.fetch_all(video_ids).each do |vid, transcript|
        expect(transcript).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
        expect(transcript.video_id).to eq(vid)
      end
    end

    it "respects language preference" do
      api.fetch_all(video_ids, languages: ["en"]).each_value do |transcript|
        expect(transcript.language_code).to eq("en")
      end
    end

    it "yields each result when block given" do
      yielded = []
      api.fetch_all(video_ids) { |id, result| yielded << [id, result.class] }

      expect(yielded.length).to eq(3)
      yielded.each do |id, klass|
        expect(video_ids).to include(id)
        expect(klass).to eq(Youtube::Transcript::Rb::FetchedTranscript)
      end
    end

    context "when a video fails" do
      let(:failing_video_ids) { ["fail_video"] }

      before do
        # Drop the stubs registered by the outer hook before adding the
        # failing ones.
        WebMock.reset!

        stub_watch_page("https://www.youtube.com/watch?v=fail_video")
        stub_innertube(unavailable_response)
      end

      it "raises error by default" do
        expect { api.fetch_all(failing_video_ids) }.to raise_error(Youtube::Transcript::Rb::VideoUnavailable)
      end

      it "continues on error when configured" do
        results = api.fetch_all(failing_video_ids, continue_on_error: true)
        # No successful ones
        expect(results).to be_empty
      end

      it "yields errors when continue_on_error is true" do
        errors = []
        api.fetch_all(failing_video_ids, continue_on_error: true) do |id, result|
          errors << [id, result] if result.is_a?(StandardError)
        end

        expect(errors.length).to eq(1)
        failed_id, error = errors.first
        expect(failed_id).to eq("fail_video")
        expect(error).to be_a(Youtube::Transcript::Rb::VideoUnavailable)
      end
    end

    context "with empty video list" do
      it "returns empty hash" do
        expect(api.fetch_all([])).to eq({})
      end
    end
  end

  describe "convenience module methods" do
    before do
      stub_watch_page
      stub_innertube
      stub_timedtext
    end

    describe "Youtube::Transcript::Rb.fetch" do
      it "fetches a transcript" do
        transcript = Youtube::Transcript::Rb.fetch(video_id)
        expect(transcript).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
      end

      it "accepts language option" do
        transcript = Youtube::Transcript::Rb.fetch(video_id, languages: ["en"])
        expect(transcript.language_code).to eq("en")
      end

      it "accepts preserve_formatting option" do
        transcript = Youtube::Transcript::Rb.fetch(video_id, preserve_formatting: false)
        expect(transcript).to be_a(Youtube::Transcript::Rb::FetchedTranscript)
      end
    end

    describe "Youtube::Transcript::Rb.list" do
      it "lists available transcripts" do
        listing = Youtube::Transcript::Rb.list(video_id)
        expect(listing).to be_a(Youtube::Transcript::Rb::TranscriptList)
      end
    end
  end

  describe "default HTTP client configuration" do
    it "sets timeout" do
      client = described_class.new.instance_variable_get(:@http_client)
      expect(client.options.timeout).to eq(30)
    end

    it "sets open_timeout" do
      client = described_class.new.instance_variable_get(:@http_client)
      expect(client.options.open_timeout).to eq(30)
    end
  end
end