youtube-transcript-rb 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +1 -0
- data/.serena/.gitignore +1 -0
- data/.serena/memories/code_style_and_conventions.md +35 -0
- data/.serena/memories/project_overview.md +40 -0
- data/.serena/memories/suggested_commands.md +50 -0
- data/.serena/memories/task_completion_checklist.md +25 -0
- data/.serena/memories/tech_stack.md +20 -0
- data/.serena/project.yml +84 -0
- data/LICENSE +21 -0
- data/PLAN.md +422 -0
- data/README.md +496 -0
- data/Rakefile +4 -0
- data/lib/youtube/transcript/rb/api.rb +150 -0
- data/lib/youtube/transcript/rb/errors.rb +217 -0
- data/lib/youtube/transcript/rb/formatters.rb +269 -0
- data/lib/youtube/transcript/rb/settings.rb +28 -0
- data/lib/youtube/transcript/rb/transcript.rb +239 -0
- data/lib/youtube/transcript/rb/transcript_list.rb +170 -0
- data/lib/youtube/transcript/rb/transcript_list_fetcher.rb +225 -0
- data/lib/youtube/transcript/rb/transcript_parser.rb +83 -0
- data/lib/youtube/transcript/rb/version.rb +9 -0
- data/lib/youtube/transcript/rb.rb +37 -0
- data/sig/youtube/transcript/rb.rbs +8 -0
- data/spec/api_spec.rb +397 -0
- data/spec/errors_spec.rb +240 -0
- data/spec/formatters_spec.rb +436 -0
- data/spec/integration_spec.rb +363 -0
- data/spec/settings_spec.rb +67 -0
- data/spec/spec_helper.rb +109 -0
- data/spec/transcript_list_fetcher_spec.rb +520 -0
- data/spec/transcript_list_spec.rb +380 -0
- data/spec/transcript_parser_spec.rb +355 -0
- data/spec/transcript_spec.rb +435 -0
- metadata +118 -0
|
@@ -0,0 +1,380 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "spec_helper"
|
|
4
|
+
|
|
5
|
+
RSpec.describe Youtube::Transcript::Rb::TranscriptList do
|
|
6
|
+
# Verified double standing in for the Faraday connection; no calls are stubbed on it.
let(:http_client) { instance_double(Faraday::Connection) }
let(:video_id) { "test_video_123" }

# Captions payload modelled on what YouTube returns: two manually created
# tracks (en, es), one auto-generated track (en-auto, kind "asr"), and two
# translation target languages (fr, de).
let(:sample_captions_json) do
  # Wraps a display label in the nested "runs" structure used by the payload.
  label = ->(text) { { "runs" => [{ "text" => text }] } }

  manual_english = {
    "baseUrl" => "https://www.youtube.com/api/timedtext?v=test&lang=en&fmt=srv3",
    "name" => label.call("English"),
    "languageCode" => "en",
    "kind" => "",
    "isTranslatable" => true
  }

  manual_spanish = {
    "baseUrl" => "https://www.youtube.com/api/timedtext?v=test&lang=es&fmt=srv3",
    "name" => label.call("Spanish"),
    "languageCode" => "es",
    "kind" => "",
    "isTranslatable" => false
  }

  # Shares the lang=en URL with the manual English track; only the
  # languageCode and kind differ.
  generated_english = {
    "baseUrl" => "https://www.youtube.com/api/timedtext?v=test&lang=en&fmt=srv3",
    "name" => label.call("English (auto-generated)"),
    "languageCode" => "en-auto",
    "kind" => "asr",
    "isTranslatable" => true
  }

  {
    "captionTracks" => [manual_english, manual_spanish, generated_english],
    "translationLanguages" => [
      { "languageCode" => "fr", "languageName" => label.call("French") },
      { "languageCode" => "de", "languageName" => label.call("German") }
    ]
  }
end
|
|
41
|
+
|
|
42
|
+
describe ".build" do
  # Builds a TranscriptList from the given captions payload, reusing the
  # shared HTTP client double and video ID.
  def build_list(captions_json)
    described_class.build(
      http_client: http_client,
      video_id: video_id,
      captions_json: captions_json
    )
  end

  it "creates a TranscriptList from captions JSON" do
    list = build_list(sample_captions_json)

    expect(list).to be_a(described_class)
    expect(list.video_id).to eq(video_id)
  end

  it "separates manually created and generated transcripts" do
    list = build_list(sample_captions_json)

    # Should have 3 total transcripts (2 manual + 1 generated)
    expect(list.count).to eq(3)

    # A bare count cannot tell the two groups apart, so verify which
    # language codes ended up on each side of the is_generated flag.
    generated, manual = list.partition(&:is_generated)
    expect(manual.map(&:language_code)).to contain_exactly("en", "es")
    expect(generated.map(&:language_code)).to contain_exactly("en-auto")
  end

  it "removes &fmt=srv3 from base URLs" do
    list = build_list(sample_captions_json)

    list.each do |transcript|
      # No public reader for the URL is visible in this spec, so the
      # instance variable is inspected directly.
      expect(transcript.instance_variable_get(:@url)).not_to include("&fmt=srv3")
    end
  end

  it "handles empty captions JSON" do
    list = build_list({ "captionTracks" => [], "translationLanguages" => [] })

    expect(list.count).to eq(0)
  end

  it "handles missing translationLanguages" do
    list = build_list({ "captionTracks" => sample_captions_json["captionTracks"] })

    expect(list.count).to eq(3)
  end

  it "assigns translation languages only to translatable transcripts" do
    transcripts = build_list(sample_captions_json).to_a
    en_transcript = transcripts.find { |t| t.language_code == "en" }
    es_transcript = transcripts.find { |t| t.language_code == "es" }

    # "en" is marked isTranslatable in the fixture, "es" is not.
    expect(en_transcript.translation_languages).not_to be_empty
    expect(es_transcript.translation_languages).to be_empty
  end
end
|
|
115
|
+
|
|
116
|
+
describe "#initialize" do
  # A list constructed directly, with no transcripts and no translation languages.
  subject(:list) do
    described_class.new(
      video_id: video_id,
      manually_created_transcripts: {},
      generated_transcripts: {},
      translation_languages: []
    )
  end

  it "stores the video ID" do
    expect(list.video_id).to eq(video_id)
  end
end
|
|
128
|
+
|
|
129
|
+
describe "Enumerable" do
  # List built from the three-track fixture (en, es, en-auto).
  let(:list) do
    described_class.build(
      http_client: http_client,
      video_id: video_id,
      captions_json: sample_captions_json
    )
  end

  it "includes Enumerable" do
    expect(described_class).to include(Enumerable)
  end

  describe "#each" do
    it "yields each transcript" do
      yielded = []
      list.each { |transcript| yielded.push(transcript) }

      expect(yielded.length).to eq(3)
    end

    it "returns an enumerator when no block given" do
      enumerator = list.each

      expect(enumerator).to be_an(Enumerator)
    end

    it "yields manually created transcripts first" do
      first, second, third = list.to_a

      # The two manual tracks (en, es) come before the generated one (en-auto).
      expect(first.is_generated).to be false
      expect(second.is_generated).to be false
      expect(third.is_generated).to be true
    end
  end

  it "supports #map" do
    language_codes = list.map { |transcript| transcript.language_code }

    expect(language_codes).to contain_exactly("en", "es", "en-auto")
  end

  it "supports #count" do
    expect(list.count).to eq(3)
  end

  it "supports #select" do
    auto_generated = list.select { |transcript| transcript.is_generated }

    expect(auto_generated.length).to eq(1)
  end
end
|
|
177
|
+
|
|
178
|
+
describe "#find_transcript" do
  # List built from the three-track fixture (manual en/es, generated en-auto).
  let(:list) do
    described_class.build(
      http_client: http_client,
      video_id: video_id,
      captions_json: sample_captions_json
    )
  end

  it "finds a transcript by language code" do
    transcript = list.find_transcript(["en"])
    expect(transcript.language_code).to eq("en")
    expect(transcript.language).to eq("English")
  end

  it "prefers manually created over generated transcripts" do
    transcript = list.find_transcript(["en"])
    expect(transcript.is_generated).to be false
  end

  it "tries language codes in order of priority" do
    # "ja" is absent from the fixture, so the next requested code ("es") wins.
    transcript = list.find_transcript(["ja", "es", "en"])
    expect(transcript.language_code).to eq("es")
  end

  it "raises NoTranscriptFound when no match" do
    expect {
      list.find_transcript(["ja", "ko", "zh"])
    }.to raise_error(Youtube::Transcript::Rb::NoTranscriptFound)
  end

  it "includes requested languages in error" do
    # Use raise_error with a block rather than begin/rescue: a rescue-only
    # expectation passes silently when no error is raised at all.
    # Braces (not do/end) are required so the block binds to raise_error.
    expect {
      list.find_transcript(["ja", "ko"])
    }.to raise_error(Youtube::Transcript::Rb::NoTranscriptFound) { |error|
      expect(error.requested_language_codes).to eq(["ja", "ko"])
    }
  end
end
|
|
217
|
+
|
|
218
|
+
describe "#find_generated_transcript" do
  # List built from the three-track fixture; only "en-auto" is generated.
  let(:list) do
    described_class.build(
      http_client: http_client,
      video_id: video_id,
      captions_json: sample_captions_json
    )
  end

  let(:not_found) { Youtube::Transcript::Rb::NoTranscriptFound }

  it "finds only generated transcripts" do
    found = list.find_generated_transcript(["en-auto"])

    expect(found.language_code).to eq("en-auto")
    expect(found.is_generated).to be true
  end

  it "does not return manually created transcripts" do
    # "en" exists in the fixture only as a manually created track.
    expect { list.find_generated_transcript(["en"]) }.to raise_error(not_found)
  end

  it "raises NoTranscriptFound when no match" do
    expect { list.find_generated_transcript(["ja"]) }.to raise_error(not_found)
  end
end
|
|
245
|
+
|
|
246
|
+
describe "#find_manually_created_transcript" do
  # List built from the three-track fixture; "en" and "es" are manual tracks.
  let(:list) do
    described_class.build(
      http_client: http_client,
      video_id: video_id,
      captions_json: sample_captions_json
    )
  end

  let(:not_found) { Youtube::Transcript::Rb::NoTranscriptFound }

  it "finds only manually created transcripts" do
    found = list.find_manually_created_transcript(["en"])

    expect(found.language_code).to eq("en")
    expect(found.is_generated).to be false
  end

  it "does not return generated transcripts" do
    # "en-auto" exists in the fixture only as a generated track.
    expect { list.find_manually_created_transcript(["en-auto"]) }.to raise_error(not_found)
  end

  it "tries language codes in order" do
    # "ja" is absent from the fixture, so the lookup falls through to "es".
    found = list.find_manually_created_transcript(["ja", "es"])

    expect(found.language_code).to eq("es")
  end
end
|
|
272
|
+
|
|
273
|
+
describe "#to_s" do
  # List built from the three-track fixture, plus its string rendering.
  let(:list) do
    described_class.build(
      http_client: http_client,
      video_id: video_id,
      captions_json: sample_captions_json
    )
  end

  let(:rendered) { list.to_s }

  it "includes the video ID" do
    expect(rendered).to include(video_id)
  end

  it "includes MANUALLY CREATED section" do
    expect(rendered).to include("(MANUALLY CREATED)")
  end

  it "includes GENERATED section" do
    expect(rendered).to include("(GENERATED)")
  end

  it "includes TRANSLATION LANGUAGES section" do
    expect(rendered).to include("(TRANSLATION LANGUAGES)")
  end

  it "lists manually created transcripts" do
    # Both the language code and the display name must appear.
    expect(rendered).to include("en", "English")
  end

  it "lists translation languages" do
    expect(rendered).to include("fr", "French")
  end

  context "with empty transcript list" do
    # A list constructed directly with nothing in any section.
    let(:empty_list) do
      described_class.new(
        video_id: video_id,
        manually_created_transcripts: {},
        generated_transcripts: {},
        translation_languages: []
      )
    end

    it "shows None for empty sections" do
      expect(empty_list.to_s).to include("None")
    end
  end
end
|
|
326
|
+
|
|
327
|
+
describe "edge cases" do
  # Builds a list from an arbitrary captions payload with the shared
  # client double and video ID.
  def list_from(captions_json)
    described_class.build(
      http_client: http_client,
      video_id: video_id,
      captions_json: captions_json
    )
  end

  it "handles missing name in caption tracks" do
    # "name" is present but has no "runs" entry.
    nameless_track = {
      "baseUrl" => "https://example.com",
      "name" => {},
      "languageCode" => "en"
    }

    list = list_from({ "captionTracks" => [nameless_track] })

    expect(list.count).to eq(1)
  end

  it "handles nil baseUrl" do
    track_without_url = {
      "baseUrl" => nil,
      "name" => { "runs" => [{ "text" => "English" }] },
      "languageCode" => "en"
    }

    list = list_from({ "captionTracks" => [track_without_url] })

    expect(list.count).to eq(1)
  end

  it "handles missing captionTracks" do
    # Payload carries no "captionTracks" key at all.
    list = list_from({ "translationLanguages" => [] })

    expect(list.count).to eq(0)
  end
end
|
|
380
|
+
end
|