youtube-transcript-rb 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +1 -0
- data/.serena/.gitignore +1 -0
- data/.serena/memories/code_style_and_conventions.md +35 -0
- data/.serena/memories/project_overview.md +40 -0
- data/.serena/memories/suggested_commands.md +50 -0
- data/.serena/memories/task_completion_checklist.md +25 -0
- data/.serena/memories/tech_stack.md +20 -0
- data/.serena/project.yml +84 -0
- data/LICENSE +21 -0
- data/PLAN.md +422 -0
- data/README.md +496 -0
- data/Rakefile +4 -0
- data/lib/youtube/transcript/rb/api.rb +150 -0
- data/lib/youtube/transcript/rb/errors.rb +217 -0
- data/lib/youtube/transcript/rb/formatters.rb +269 -0
- data/lib/youtube/transcript/rb/settings.rb +28 -0
- data/lib/youtube/transcript/rb/transcript.rb +239 -0
- data/lib/youtube/transcript/rb/transcript_list.rb +170 -0
- data/lib/youtube/transcript/rb/transcript_list_fetcher.rb +225 -0
- data/lib/youtube/transcript/rb/transcript_parser.rb +83 -0
- data/lib/youtube/transcript/rb/version.rb +9 -0
- data/lib/youtube/transcript/rb.rb +37 -0
- data/sig/youtube/transcript/rb.rbs +8 -0
- data/spec/api_spec.rb +397 -0
- data/spec/errors_spec.rb +240 -0
- data/spec/formatters_spec.rb +436 -0
- data/spec/integration_spec.rb +363 -0
- data/spec/settings_spec.rb +67 -0
- data/spec/spec_helper.rb +109 -0
- data/spec/transcript_list_fetcher_spec.rb +520 -0
- data/spec/transcript_list_spec.rb +380 -0
- data/spec/transcript_parser_spec.rb +355 -0
- data/spec/transcript_spec.rb +435 -0
- metadata +118 -0
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "spec_helper"
|
|
4
|
+
require "youtube/transcript/rb"
|
|
5
|
+
|
|
6
|
+
RSpec.describe Youtube::Transcript::Rb::TranscriptParser do
|
|
7
|
+
describe "#initialize" do
|
|
8
|
+
it "creates a parser with preserve_formatting false by default" do
|
|
9
|
+
parser = described_class.new
|
|
10
|
+
expect(parser.instance_variable_get(:@preserve_formatting)).to be false
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
it "creates a parser with preserve_formatting true when specified" do
|
|
14
|
+
parser = described_class.new(preserve_formatting: true)
|
|
15
|
+
expect(parser.instance_variable_get(:@preserve_formatting)).to be true
|
|
16
|
+
end
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
describe "#parse" do
|
|
20
|
+
let(:parser) { described_class.new }
|
|
21
|
+
|
|
22
|
+
context "with basic XML" do
|
|
23
|
+
let(:xml) do
|
|
24
|
+
<<~XML
|
|
25
|
+
<?xml version="1.0" encoding="utf-8" ?>
|
|
26
|
+
<transcript>
|
|
27
|
+
<text start="0.0" dur="2.5">Hello world</text>
|
|
28
|
+
<text start="2.5" dur="3.0">This is a test</text>
|
|
29
|
+
</transcript>
|
|
30
|
+
XML
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
it "returns an array of TranscriptSnippet objects" do
|
|
34
|
+
result = parser.parse(xml)
|
|
35
|
+
expect(result).to be_an(Array)
|
|
36
|
+
expect(result.length).to eq(2)
|
|
37
|
+
expect(result.first).to be_a(Youtube::Transcript::Rb::TranscriptSnippet)
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
it "parses text content correctly" do
|
|
41
|
+
result = parser.parse(xml)
|
|
42
|
+
expect(result[0].text).to eq("Hello world")
|
|
43
|
+
expect(result[1].text).to eq("This is a test")
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
it "parses start time correctly" do
|
|
47
|
+
result = parser.parse(xml)
|
|
48
|
+
expect(result[0].start).to eq(0.0)
|
|
49
|
+
expect(result[1].start).to eq(2.5)
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
it "parses duration correctly" do
|
|
53
|
+
result = parser.parse(xml)
|
|
54
|
+
expect(result[0].duration).to eq(2.5)
|
|
55
|
+
expect(result[1].duration).to eq(3.0)
|
|
56
|
+
end
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
context "with missing duration attribute" do
|
|
60
|
+
let(:xml) do
|
|
61
|
+
<<~XML
|
|
62
|
+
<?xml version="1.0" encoding="utf-8" ?>
|
|
63
|
+
<transcript>
|
|
64
|
+
<text start="0.0">Hello world</text>
|
|
65
|
+
</transcript>
|
|
66
|
+
XML
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
it "defaults duration to 0.0" do
|
|
70
|
+
result = parser.parse(xml)
|
|
71
|
+
expect(result[0].duration).to eq(0.0)
|
|
72
|
+
end
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
context "with empty text elements" do
|
|
76
|
+
let(:xml) do
|
|
77
|
+
<<~XML
|
|
78
|
+
<?xml version="1.0" encoding="utf-8" ?>
|
|
79
|
+
<transcript>
|
|
80
|
+
<text start="0.0" dur="2.5">Hello</text>
|
|
81
|
+
<text start="2.5" dur="1.0"></text>
|
|
82
|
+
<text start="3.5" dur="2.0">World</text>
|
|
83
|
+
</transcript>
|
|
84
|
+
XML
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
it "skips empty text elements" do
|
|
88
|
+
result = parser.parse(xml)
|
|
89
|
+
expect(result.length).to eq(2)
|
|
90
|
+
expect(result[0].text).to eq("Hello")
|
|
91
|
+
expect(result[1].text).to eq("World")
|
|
92
|
+
end
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
context "with HTML entities" do
|
|
96
|
+
let(:xml) do
|
|
97
|
+
<<~XML
|
|
98
|
+
<?xml version="1.0" encoding="utf-8" ?>
|
|
99
|
+
<transcript>
|
|
100
|
+
<text start="0.0" dur="2.5">Hello &amp; goodbye</text>
|
|
101
|
+
<text start="2.5" dur="3.0">Quote: &quot;hello&quot;</text>
|
|
102
|
+
</transcript>
|
|
103
|
+
XML
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
it "unescapes HTML entities" do
|
|
107
|
+
result = parser.parse(xml)
|
|
108
|
+
expect(result[0].text).to eq("Hello & goodbye")
|
|
109
|
+
expect(result[1].text).to eq('Quote: "hello"')
|
|
110
|
+
end
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
context "with escaped HTML that looks like tags" do
|
|
114
|
+
let(:xml) do
|
|
115
|
+
<<~XML
|
|
116
|
+
<?xml version="1.0" encoding="utf-8" ?>
|
|
117
|
+
<transcript>
|
|
118
|
+
<text start="0.0" dur="2.5">Test &lt;value&gt;</text>
|
|
119
|
+
</transcript>
|
|
120
|
+
XML
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
it "unescapes and then strips the tag (expected behavior)" do
|
|
124
|
+
# When HTML entities are unescaped, <value> becomes a tag and gets stripped
|
|
125
|
+
result = parser.parse(xml)
|
|
126
|
+
expect(result[0].text).to eq("Test ")
|
|
127
|
+
end
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
context "with HTML tags and preserve_formatting: false" do
|
|
131
|
+
let(:parser) { described_class.new(preserve_formatting: false) }
|
|
132
|
+
let(:xml) do
|
|
133
|
+
<<~XML
|
|
134
|
+
<?xml version="1.0" encoding="utf-8" ?>
|
|
135
|
+
<transcript>
|
|
136
|
+
<text start="0.0" dur="2.5">Hello <b>world</b></text>
|
|
137
|
+
<text start="2.5" dur="3.0"><i>Italic</i> text</text>
|
|
138
|
+
<text start="5.5" dur="2.0"><span class="highlight">Span</span></text>
|
|
139
|
+
</transcript>
|
|
140
|
+
XML
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
it "strips all HTML tags" do
|
|
144
|
+
result = parser.parse(xml)
|
|
145
|
+
expect(result[0].text).to eq("Hello world")
|
|
146
|
+
expect(result[1].text).to eq("Italic text")
|
|
147
|
+
expect(result[2].text).to eq("Span")
|
|
148
|
+
end
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
context "with HTML tags and preserve_formatting: true" do
|
|
152
|
+
let(:parser) { described_class.new(preserve_formatting: true) }
|
|
153
|
+
let(:xml) do
|
|
154
|
+
<<~XML
|
|
155
|
+
<?xml version="1.0" encoding="utf-8" ?>
|
|
156
|
+
<transcript>
|
|
157
|
+
<text start="0.0" dur="2.5">Hello <b>world</b></text>
|
|
158
|
+
<text start="2.5" dur="3.0"><i>Italic</i> text</text>
|
|
159
|
+
<text start="5.5" dur="2.0"><em>Emphasis</em></text>
|
|
160
|
+
<text start="8.5" dur="2.0"><strong>Strong</strong></text>
|
|
161
|
+
</transcript>
|
|
162
|
+
XML
|
|
163
|
+
end
|
|
164
|
+
|
|
165
|
+
it "preserves formatting tags like <b>" do
|
|
166
|
+
result = parser.parse(xml)
|
|
167
|
+
expect(result[0].text).to eq("Hello <b>world</b>")
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
it "preserves formatting tags like <i>" do
|
|
171
|
+
result = parser.parse(xml)
|
|
172
|
+
expect(result[1].text).to eq("<i>Italic</i> text")
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
it "preserves formatting tags like <em>" do
|
|
176
|
+
result = parser.parse(xml)
|
|
177
|
+
expect(result[2].text).to eq("<em>Emphasis</em>")
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
it "preserves formatting tags like <strong>" do
|
|
181
|
+
result = parser.parse(xml)
|
|
182
|
+
expect(result[3].text).to eq("<strong>Strong</strong>")
|
|
183
|
+
end
|
|
184
|
+
end
|
|
185
|
+
|
|
186
|
+
context "with non-formatting HTML tags and preserve_formatting: true" do
|
|
187
|
+
let(:parser) { described_class.new(preserve_formatting: true) }
|
|
188
|
+
let(:xml) do
|
|
189
|
+
<<~XML
|
|
190
|
+
<?xml version="1.0" encoding="utf-8" ?>
|
|
191
|
+
<transcript>
|
|
192
|
+
<text start="0.0" dur="2.5"><span class="x">Span</span></text>
|
|
193
|
+
<text start="2.5" dur="3.0"><div>Div</div></text>
|
|
194
|
+
<text start="5.5" dur="2.0"><a href="url">Link</a></text>
|
|
195
|
+
</transcript>
|
|
196
|
+
XML
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
it "strips non-formatting tags like <span>" do
|
|
200
|
+
result = parser.parse(xml)
|
|
201
|
+
expect(result[0].text).to eq("Span")
|
|
202
|
+
end
|
|
203
|
+
|
|
204
|
+
it "strips non-formatting tags like <div>" do
|
|
205
|
+
result = parser.parse(xml)
|
|
206
|
+
expect(result[1].text).to eq("Div")
|
|
207
|
+
end
|
|
208
|
+
|
|
209
|
+
it "strips non-formatting tags like <a>" do
|
|
210
|
+
result = parser.parse(xml)
|
|
211
|
+
expect(result[2].text).to eq("Link")
|
|
212
|
+
end
|
|
213
|
+
end
|
|
214
|
+
|
|
215
|
+
context "with all supported formatting tags" do
|
|
216
|
+
let(:parser) { described_class.new(preserve_formatting: true) }
|
|
217
|
+
|
|
218
|
+
described_class::FORMATTING_TAGS.each do |tag|
|
|
219
|
+
it "preserves <#{tag}> tags" do
|
|
220
|
+
xml = <<~XML
|
|
221
|
+
<?xml version="1.0" encoding="utf-8" ?>
|
|
222
|
+
<transcript>
|
|
223
|
+
<text start="0.0" dur="2.5"><#{tag}>content</#{tag}></text>
|
|
224
|
+
</transcript>
|
|
225
|
+
XML
|
|
226
|
+
result = parser.parse(xml)
|
|
227
|
+
expect(result[0].text).to eq("<#{tag}>content</#{tag}>")
|
|
228
|
+
end
|
|
229
|
+
end
|
|
230
|
+
end
|
|
231
|
+
|
|
232
|
+
context "with mixed content" do
|
|
233
|
+
let(:xml) do
|
|
234
|
+
<<~XML
|
|
235
|
+
<?xml version="1.0" encoding="utf-8" ?>
|
|
236
|
+
<transcript>
|
|
237
|
+
<text start="0.0" dur="2.5">Line 1</text>
|
|
238
|
+
<text start="2.5" dur="3.0">Line 2 with &amp; ampersand</text>
|
|
239
|
+
<text start="5.5" dur="2.0">Line 3</text>
|
|
240
|
+
</transcript>
|
|
241
|
+
XML
|
|
242
|
+
end
|
|
243
|
+
|
|
244
|
+
it "parses multiple elements correctly" do
|
|
245
|
+
result = parser.parse(xml)
|
|
246
|
+
expect(result.length).to eq(3)
|
|
247
|
+
expect(result.map(&:text)).to eq(["Line 1", "Line 2 with & ampersand", "Line 3"])
|
|
248
|
+
end
|
|
249
|
+
end
|
|
250
|
+
|
|
251
|
+
context "with integer times" do
|
|
252
|
+
let(:xml) do
|
|
253
|
+
<<~XML
|
|
254
|
+
<?xml version="1.0" encoding="utf-8" ?>
|
|
255
|
+
<transcript>
|
|
256
|
+
<text start="5" dur="10">Content</text>
|
|
257
|
+
</transcript>
|
|
258
|
+
XML
|
|
259
|
+
end
|
|
260
|
+
|
|
261
|
+
it "converts to float" do
|
|
262
|
+
result = parser.parse(xml)
|
|
263
|
+
expect(result[0].start).to eq(5.0)
|
|
264
|
+
expect(result[0].duration).to eq(10.0)
|
|
265
|
+
expect(result[0].start).to be_a(Float)
|
|
266
|
+
expect(result[0].duration).to be_a(Float)
|
|
267
|
+
end
|
|
268
|
+
end
|
|
269
|
+
|
|
270
|
+
context "with empty transcript" do
|
|
271
|
+
let(:xml) do
|
|
272
|
+
<<~XML
|
|
273
|
+
<?xml version="1.0" encoding="utf-8" ?>
|
|
274
|
+
<transcript>
|
|
275
|
+
</transcript>
|
|
276
|
+
XML
|
|
277
|
+
end
|
|
278
|
+
|
|
279
|
+
it "returns an empty array" do
|
|
280
|
+
result = parser.parse(xml)
|
|
281
|
+
expect(result).to eq([])
|
|
282
|
+
end
|
|
283
|
+
end
|
|
284
|
+
|
|
285
|
+
context "with whitespace-only text" do
|
|
286
|
+
let(:xml) do
|
|
287
|
+
<<~XML
|
|
288
|
+
<?xml version="1.0" encoding="utf-8" ?>
|
|
289
|
+
<transcript>
|
|
290
|
+
<text start="0.0" dur="2.5"> </text>
|
|
291
|
+
<text start="2.5" dur="3.0">Valid text</text>
|
|
292
|
+
</transcript>
|
|
293
|
+
XML
|
|
294
|
+
end
|
|
295
|
+
|
|
296
|
+
it "includes whitespace-only text since it's not empty" do
|
|
297
|
+
result = parser.parse(xml)
|
|
298
|
+
# Whitespace is still valid content
|
|
299
|
+
expect(result.length).to eq(2)
|
|
300
|
+
end
|
|
301
|
+
end
|
|
302
|
+
|
|
303
|
+
context "with Unicode content" do
|
|
304
|
+
let(:xml) do
|
|
305
|
+
<<~XML
|
|
306
|
+
<?xml version="1.0" encoding="utf-8" ?>
|
|
307
|
+
<transcript>
|
|
308
|
+
<text start="0.0" dur="2.5">こんにちは世界</text>
|
|
309
|
+
<text start="2.5" dur="3.0">Привет мир</text>
|
|
310
|
+
<text start="5.5" dur="2.0">🎉 Emoji test 🚀</text>
|
|
311
|
+
</transcript>
|
|
312
|
+
XML
|
|
313
|
+
end
|
|
314
|
+
|
|
315
|
+
it "handles Japanese characters" do
|
|
316
|
+
result = parser.parse(xml)
|
|
317
|
+
expect(result[0].text).to eq("こんにちは世界")
|
|
318
|
+
end
|
|
319
|
+
|
|
320
|
+
it "handles Cyrillic characters" do
|
|
321
|
+
result = parser.parse(xml)
|
|
322
|
+
expect(result[1].text).to eq("Привет мир")
|
|
323
|
+
end
|
|
324
|
+
|
|
325
|
+
it "handles emoji" do
|
|
326
|
+
result = parser.parse(xml)
|
|
327
|
+
expect(result[2].text).to eq("🎉 Emoji test 🚀")
|
|
328
|
+
end
|
|
329
|
+
end
|
|
330
|
+
|
|
331
|
+
context "with newlines in text" do
|
|
332
|
+
let(:xml) do
|
|
333
|
+
<<~XML
|
|
334
|
+
<?xml version="1.0" encoding="utf-8" ?>
|
|
335
|
+
<transcript>
|
|
336
|
+
<text start="0.0" dur="2.5">Line one
|
|
337
|
+
Line two</text>
|
|
338
|
+
</transcript>
|
|
339
|
+
XML
|
|
340
|
+
end
|
|
341
|
+
|
|
342
|
+
it "preserves newlines" do
|
|
343
|
+
result = parser.parse(xml)
|
|
344
|
+
expect(result[0].text).to include("\n")
|
|
345
|
+
end
|
|
346
|
+
end
|
|
347
|
+
end
|
|
348
|
+
|
|
349
|
+
describe "FORMATTING_TAGS" do
|
|
350
|
+
it "includes all expected formatting tags" do
|
|
351
|
+
expected_tags = %w[strong em b i mark small del ins sub sup]
|
|
352
|
+
expect(described_class::FORMATTING_TAGS).to match_array(expected_tags)
|
|
353
|
+
end
|
|
354
|
+
end
|
|
355
|
+
end
|