youtube-transcript-rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,355 @@
# frozen_string_literal: true

require "spec_helper"
require "youtube/transcript/rb"

# Exercises TranscriptParser construction and #parse against inline XML
# fixtures, and checks the contents of the FORMATTING_TAGS constant.
#
# Fixture lines inside each squiggly heredoc are deliberately kept at a single
# indent level so the resulting XML string carries no leading whitespace.
RSpec.describe Youtube::Transcript::Rb::TranscriptParser do
  describe "#initialize" do
    # Reads the @preserve_formatting instance variable straight off the parser.
    def formatting_flag(parser)
      parser.instance_variable_get(:@preserve_formatting)
    end

    it "creates a parser with preserve_formatting false by default" do
      expect(formatting_flag(described_class.new)).to be(false)
    end

    it "creates a parser with preserve_formatting true when specified" do
      expect(formatting_flag(described_class.new(preserve_formatting: true))).to be(true)
    end
  end

  describe "#parse" do
    let(:parser) { described_class.new }

    # Result of parsing the current context's `xml`; memoized per example, so
    # each example triggers exactly one parse.
    let(:snippets) { parser.parse(xml) }

    context "with basic XML" do
      let(:xml) do
        <<~XML
          <?xml version="1.0" encoding="utf-8" ?>
          <transcript>
          <text start="0.0" dur="2.5">Hello world</text>
          <text start="2.5" dur="3.0">This is a test</text>
          </transcript>
        XML
      end

      it "returns an array of TranscriptSnippet objects" do
        expect(snippets).to be_a_kind_of(Array)
        expect(snippets.length).to eq(2)
        expect(snippets.first).to be_a_kind_of(Youtube::Transcript::Rb::TranscriptSnippet)
      end

      it "parses text content correctly" do
        expect(snippets.first(2).map(&:text)).to eq(["Hello world", "This is a test"])
      end

      it "parses start time correctly" do
        expect(snippets.first(2).map(&:start)).to eq([0.0, 2.5])
      end

      it "parses duration correctly" do
        expect(snippets.first(2).map(&:duration)).to eq([2.5, 3.0])
      end
    end

    context "with missing duration attribute" do
      let(:xml) do
        <<~XML
          <?xml version="1.0" encoding="utf-8" ?>
          <transcript>
          <text start="0.0">Hello world</text>
          </transcript>
        XML
      end

      it "defaults duration to 0.0" do
        expect(snippets.first.duration).to eq(0.0)
      end
    end

    context "with empty text elements" do
      let(:xml) do
        <<~XML
          <?xml version="1.0" encoding="utf-8" ?>
          <transcript>
          <text start="0.0" dur="2.5">Hello</text>
          <text start="2.5" dur="1.0"></text>
          <text start="3.5" dur="2.0">World</text>
          </transcript>
        XML
      end

      it "skips empty text elements" do
        # Comparing the full list of texts checks both the count and the order.
        expect(snippets.map(&:text)).to eq(%w[Hello World])
      end
    end

    context "with HTML entities" do
      let(:xml) do
        <<~XML
          <?xml version="1.0" encoding="utf-8" ?>
          <transcript>
          <text start="0.0" dur="2.5">Hello &amp; goodbye</text>
          <text start="2.5" dur="3.0">Quote: &quot;hello&quot;</text>
          </transcript>
        XML
      end

      it "unescapes HTML entities" do
        expect(snippets.first(2).map(&:text)).to eq(["Hello & goodbye", 'Quote: "hello"'])
      end
    end

    context "with escaped HTML that looks like tags" do
      let(:xml) do
        <<~XML
          <?xml version="1.0" encoding="utf-8" ?>
          <transcript>
          <text start="0.0" dur="2.5">Test &lt;value&gt;</text>
          </transcript>
        XML
      end

      it "unescapes and then strips the tag (expected behavior)" do
        # Once the entities are unescaped, <value> is treated as a tag and removed.
        expect(snippets.first.text).to eq("Test ")
      end
    end

    context "with HTML tags and preserve_formatting: false" do
      let(:parser) { described_class.new(preserve_formatting: false) }
      let(:xml) do
        <<~XML
          <?xml version="1.0" encoding="utf-8" ?>
          <transcript>
          <text start="0.0" dur="2.5">Hello &lt;b&gt;world&lt;/b&gt;</text>
          <text start="2.5" dur="3.0">&lt;i&gt;Italic&lt;/i&gt; text</text>
          <text start="5.5" dur="2.0">&lt;span class="highlight"&gt;Span&lt;/span&gt;</text>
          </transcript>
        XML
      end

      it "strips all HTML tags" do
        expect(snippets.first(3).map(&:text)).to eq(["Hello world", "Italic text", "Span"])
      end
    end

    context "with HTML tags and preserve_formatting: true" do
      let(:parser) { described_class.new(preserve_formatting: true) }
      let(:xml) do
        <<~XML
          <?xml version="1.0" encoding="utf-8" ?>
          <transcript>
          <text start="0.0" dur="2.5">Hello &lt;b&gt;world&lt;/b&gt;</text>
          <text start="2.5" dur="3.0">&lt;i&gt;Italic&lt;/i&gt; text</text>
          <text start="5.5" dur="2.0">&lt;em&gt;Emphasis&lt;/em&gt;</text>
          <text start="8.5" dur="2.0">&lt;strong&gt;Strong&lt;/strong&gt;</text>
          </transcript>
        XML
      end

      # tag name => [position of the snippet in the fixture, text expected back]
      {
        "b" => [0, "Hello <b>world</b>"],
        "i" => [1, "<i>Italic</i> text"],
        "em" => [2, "<em>Emphasis</em>"],
        "strong" => [3, "<strong>Strong</strong>"]
      }.each do |tag, (position, expected_text)|
        it "preserves formatting tags like <#{tag}>" do
          expect(snippets[position].text).to eq(expected_text)
        end
      end
    end

    context "with non-formatting HTML tags and preserve_formatting: true" do
      let(:parser) { described_class.new(preserve_formatting: true) }
      let(:xml) do
        <<~XML
          <?xml version="1.0" encoding="utf-8" ?>
          <transcript>
          <text start="0.0" dur="2.5">&lt;span class="x"&gt;Span&lt;/span&gt;</text>
          <text start="2.5" dur="3.0">&lt;div&gt;Div&lt;/div&gt;</text>
          <text start="5.5" dur="2.0">&lt;a href="url"&gt;Link&lt;/a&gt;</text>
          </transcript>
        XML
      end

      # tag name => [position of the snippet in the fixture, text expected back]
      {
        "span" => [0, "Span"],
        "div" => [1, "Div"],
        "a" => [2, "Link"]
      }.each do |tag, (position, expected_text)|
        it "strips non-formatting tags like <#{tag}>" do
          expect(snippets[position].text).to eq(expected_text)
        end
      end
    end

    context "with all supported formatting tags" do
      let(:parser) { described_class.new(preserve_formatting: true) }

      # One example is generated per entry of the parser's FORMATTING_TAGS.
      described_class::FORMATTING_TAGS.each do |tag|
        it "preserves <#{tag}> tags" do
          markup = <<~XML
            <?xml version="1.0" encoding="utf-8" ?>
            <transcript>
            <text start="0.0" dur="2.5">&lt;#{tag}&gt;content&lt;/#{tag}&gt;</text>
            </transcript>
          XML
          wrapped = "<#{tag}>content</#{tag}>"

          expect(parser.parse(markup)[0].text).to eq(wrapped)
        end
      end
    end

    context "with mixed content" do
      let(:xml) do
        <<~XML
          <?xml version="1.0" encoding="utf-8" ?>
          <transcript>
          <text start="0.0" dur="2.5">Line 1</text>
          <text start="2.5" dur="3.0">Line 2 with &amp; ampersand</text>
          <text start="5.5" dur="2.0">Line 3</text>
          </transcript>
        XML
      end

      it "parses multiple elements correctly" do
        expected_lines = ["Line 1", "Line 2 with & ampersand", "Line 3"]

        expect(snippets.length).to eq(3)
        expect(snippets.map(&:text)).to eq(expected_lines)
      end
    end

    context "with integer times" do
      let(:xml) do
        <<~XML
          <?xml version="1.0" encoding="utf-8" ?>
          <transcript>
          <text start="5" dur="10">Content</text>
          </transcript>
        XML
      end

      it "converts to float" do
        snippet = snippets[0]
        timings = [snippet.start, snippet.duration]

        expect(timings).to eq([5.0, 10.0])
        expect(timings).to all(be_a(Float))
      end
    end

    context "with empty transcript" do
      let(:xml) do
        <<~XML
          <?xml version="1.0" encoding="utf-8" ?>
          <transcript>
          </transcript>
        XML
      end

      it "returns an empty array" do
        expect(snippets).to eq([])
      end
    end

    context "with whitespace-only text" do
      let(:xml) do
        <<~XML
          <?xml version="1.0" encoding="utf-8" ?>
          <transcript>
          <text start="0.0" dur="2.5"> </text>
          <text start="2.5" dur="3.0">Valid text</text>
          </transcript>
        XML
      end

      it "includes whitespace-only text since it's not empty" do
        # Whitespace counts as content, so both elements are expected back.
        expect(snippets.length).to eq(2)
      end
    end

    context "with Unicode content" do
      let(:xml) do
        <<~XML
          <?xml version="1.0" encoding="utf-8" ?>
          <transcript>
          <text start="0.0" dur="2.5">こんにちは世界</text>
          <text start="2.5" dur="3.0">Привет мир</text>
          <text start="5.5" dur="2.0">🎉 Emoji test 🚀</text>
          </transcript>
        XML
      end

      # [example label, text expected back], listed in fixture order.
      [
        ["Japanese characters", "こんにちは世界"],
        ["Cyrillic characters", "Привет мир"],
        ["emoji", "🎉 Emoji test 🚀"]
      ].each_with_index do |(label, expected_text), position|
        it "handles #{label}" do
          expect(snippets[position].text).to eq(expected_text)
        end
      end
    end

    context "with newlines in text" do
      let(:xml) do
        <<~XML
          <?xml version="1.0" encoding="utf-8" ?>
          <transcript>
          <text start="0.0" dur="2.5">Line one
          Line two</text>
          </transcript>
        XML
      end

      it "preserves newlines" do
        expect(snippets.first.text).to include("\n")
      end
    end
  end

  describe "FORMATTING_TAGS" do
    it "includes all expected formatting tags" do
      # Order-insensitive comparison: only membership of the constant matters.
      expect(described_class::FORMATTING_TAGS)
        .to contain_exactly("strong", "em", "b", "i", "mark", "small", "del", "ins", "sub", "sup")
    end
  end
end