ruby-readability 0.7.0 → 0.7.2
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.github/workflows/ruby.yml +25 -0
- data/.rspec +1 -1
- data/README.md +3 -6
- data/lib/readability.rb +73 -21
- data/ruby-readability.gemspec +1 -4
- data/spec/fixtures/codinghorror.html +189 -0
- data/spec/fixtures/images/Confusion_of_Tongues.png +0 -0
- data/spec/fixtures/images/JohnPinhole.jpg +0 -0
- data/spec/fixtures/nested_images.html +11 -0
- data/spec/readability_spec.rb +315 -100
- data/spec/spec_helper.rb +0 -6
- metadata +28 -35
- data/.travis.yml +0 -6
data/spec/readability_spec.rb
CHANGED
@@ -19,7 +19,7 @@ describe Readability do
|
|
19
19
|
</body>
|
20
20
|
</html>
|
21
21
|
HTML
|
22
|
-
|
22
|
+
|
23
23
|
@simple_html_with_img_no_text = <<-HTML
|
24
24
|
<html>
|
25
25
|
<head>
|
@@ -32,7 +32,7 @@ describe Readability do
|
|
32
32
|
</body>
|
33
33
|
</html>
|
34
34
|
HTML
|
35
|
-
|
35
|
+
|
36
36
|
@simple_html_with_img_in_noscript = <<-HTML
|
37
37
|
<html>
|
38
38
|
<head>
|
@@ -40,8 +40,8 @@ describe Readability do
|
|
40
40
|
</head>
|
41
41
|
<body class='main'>
|
42
42
|
<div class="article-img">
|
43
|
-
<img src="http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif" width="660"
|
44
|
-
height="317" alt="test" class="lazy"
|
43
|
+
<img src="http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif" width="660"
|
44
|
+
height="317" alt="test" class="lazy"
|
45
45
|
data-original="http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg">
|
46
46
|
<noscript><img src="http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"></noscript>
|
47
47
|
</div>
|
@@ -54,30 +54,70 @@ describe Readability do
|
|
54
54
|
before do
|
55
55
|
@bbc = File.read(File.dirname(__FILE__) + "/fixtures/bbc.html")
|
56
56
|
@nytimes = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html")
|
57
|
-
@
|
57
|
+
@thesun = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
|
58
|
+
@ch = File.read(File.dirname(__FILE__) + "/fixtures/codinghorror.html")
|
59
|
+
@nested = File.read(File.dirname(__FILE__) + "/fixtures/nested_images.html")
|
58
60
|
|
59
61
|
FakeWeb::Registry.instance.clean_registry
|
62
|
+
|
60
63
|
FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg",
|
61
64
|
:body => File.read(File.dirname(__FILE__) + "/fixtures/images/dim_1416768a.jpg"))
|
62
|
-
|
65
|
+
|
63
66
|
FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif",
|
64
67
|
:body => File.read(File.dirname(__FILE__) + "/fixtures/images/sign_up_emails_682__703711a.gif"))
|
65
|
-
|
66
|
-
FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703712a.gif",
|
68
|
+
|
69
|
+
FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703712a.gif",
|
67
70
|
:body => File.read(File.dirname(__FILE__) + "/fixtures/images/sign_up_emails_682__703712a.gif"))
|
68
|
-
|
71
|
+
|
72
|
+
# Register images for codinghorror
|
73
|
+
FakeWeb.register_uri(:get, 'http://blog.codinghorror.com/content/images/2014/Sep/JohnPinhole.jpg',
|
74
|
+
:body => File.read(File.dirname(__FILE__) + "/fixtures/images/JohnPinhole.jpg"))
|
75
|
+
FakeWeb.register_uri(:get, 'http://blog.codinghorror.com/content/images/2014/Sep/Confusion_of_Tongues.png',
|
76
|
+
:body => File.read(File.dirname(__FILE__) + "/fixtures/images/Confusion_of_Tongues.png"))
|
69
77
|
end
|
70
78
|
|
71
79
|
it "should show one image, but outside of the best candidate" do
|
72
|
-
@doc = Readability::Document.new(@
|
73
|
-
@doc.images.
|
74
|
-
@doc.best_candidate_has_image.
|
80
|
+
@doc = Readability::Document.new(@thesun)
|
81
|
+
expect(@doc.images).to eq(["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg", "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif", "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703712a.gif"])
|
82
|
+
expect(@doc.best_candidate_has_image).to eq(false)
|
75
83
|
end
|
76
84
|
|
77
85
|
it "should show one image inside of the best candidate" do
|
78
86
|
@doc = Readability::Document.new(@nytimes)
|
79
|
-
@doc.images.
|
80
|
-
@doc.best_candidate_has_image.
|
87
|
+
expect(@doc.images).to eq(["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"])
|
88
|
+
expect(@doc.best_candidate_has_image).to eq(true)
|
89
|
+
end
|
90
|
+
|
91
|
+
it "should expand relative image url" do
|
92
|
+
url = 'http://blog.codinghorror.com/standard-flavored-markdown/'
|
93
|
+
@doc = Readability::Document.new(@ch, tags: %w[div p img a],
|
94
|
+
attributes: %w[src href],
|
95
|
+
remove_empty_nodes: false)
|
96
|
+
@doc.images_with_fqdn_uris!(url)
|
97
|
+
|
98
|
+
expect(@doc.content).to include('http://blog.codinghorror.com/content/images/2014/Sep/JohnPinhole.jpg')
|
99
|
+
expect(@doc.content).to include('http://blog.codinghorror.com/content/images/2014/Sep/Confusion_of_Tongues.png')
|
100
|
+
|
101
|
+
expect(@doc.images).to match_array([
|
102
|
+
'http://blog.codinghorror.com/content/images/2014/Sep/JohnPinhole.jpg',
|
103
|
+
'http://blog.codinghorror.com/content/images/2014/Sep/Confusion_of_Tongues.png'
|
104
|
+
])
|
105
|
+
end
|
106
|
+
|
107
|
+
it "should be able to preserve deeply nested image tags in the article's content by whitelisting all tags" do
|
108
|
+
@doc = Readability::Document.new(@nested, attributes: ["src"])
|
109
|
+
expect(@doc.images).to be_empty
|
110
|
+
|
111
|
+
@doc = Readability::Document.new(@nested, attributes: ["src"], tags: ["figure", "image"])
|
112
|
+
expect(@doc.images).to be_empty
|
113
|
+
|
114
|
+
@doc = Readability::Document.new(@nested, attributes: ["src"], tags: ["*"])
|
115
|
+
expect(@doc.content).to include('<img src="http://example.com/image.jpeg" />')
|
116
|
+
end
|
117
|
+
|
118
|
+
it "should be able to whitelist all attributes" do
|
119
|
+
@doc = Readability::Document.new(@nested, attributes: ["*"], tags: ["*"])
|
120
|
+
expect(@doc.content).to include('<img src="http://example.com/image.jpeg" />')
|
81
121
|
end
|
82
122
|
|
83
123
|
it "should not try to download local images" do
|
@@ -93,69 +133,69 @@ describe Readability do
|
|
93
133
|
</body>
|
94
134
|
</html>
|
95
135
|
HTML
|
96
|
-
|
97
|
-
@doc.images.
|
136
|
+
expect(@doc).not_to receive(:get_image_size)
|
137
|
+
expect(@doc.images).to eq([])
|
98
138
|
end
|
99
139
|
|
100
140
|
describe "no images" do
|
101
141
|
it "shouldn't show images" do
|
102
142
|
@doc = Readability::Document.new(@bbc, :min_image_height => 600)
|
103
|
-
@doc.images.
|
104
|
-
@doc.best_candidate_has_image.
|
143
|
+
expect(@doc.images).to eq([])
|
144
|
+
expect(@doc.best_candidate_has_image).to eq(false)
|
105
145
|
end
|
106
146
|
end
|
107
147
|
|
108
148
|
describe "poll of images" do
|
109
149
|
it "should show some images inside of the best candidate" do
|
110
150
|
@doc = Readability::Document.new(@bbc)
|
111
|
-
@doc.images.
|
151
|
+
expect(@doc.images).to match_array(["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg",
|
112
152
|
"http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg",
|
113
153
|
"http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif",
|
114
|
-
"http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
|
115
|
-
@doc.best_candidate_has_image.
|
154
|
+
"http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"])
|
155
|
+
expect(@doc.best_candidate_has_image).to eq(true)
|
116
156
|
end
|
117
157
|
|
118
158
|
it "should show some images inside of the best candidate, include gif format" do
|
119
159
|
@doc = Readability::Document.new(@bbc, :ignore_image_format => [])
|
120
|
-
@doc.images.
|
121
|
-
@doc.best_candidate_has_image.
|
160
|
+
expect(@doc.images).to eq(["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"])
|
161
|
+
expect(@doc.best_candidate_has_image).to eq(true)
|
122
162
|
end
|
123
163
|
|
124
164
|
describe "width, height and format" do
|
125
165
|
it "should show some images inside of the best candidate, but with width most equal to 400px" do
|
126
166
|
@doc = Readability::Document.new(@bbc, :min_image_width => 400, :ignore_image_format => [])
|
127
|
-
@doc.images.
|
128
|
-
@doc.best_candidate_has_image.
|
167
|
+
expect(@doc.images).to eq(["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg"])
|
168
|
+
expect(@doc.best_candidate_has_image).to eq(true)
|
129
169
|
end
|
130
170
|
|
131
171
|
it "should show some images inside of the best candidate, but with width most equal to 304px" do
|
132
172
|
@doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => [])
|
133
|
-
@doc.images.
|
134
|
-
@doc.best_candidate_has_image.
|
173
|
+
expect(@doc.images).to eq(["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"])
|
174
|
+
expect(@doc.best_candidate_has_image).to eq(true)
|
135
175
|
end
|
136
176
|
|
137
177
|
it "should show some images inside of the best candidate, but with width most equal to 304px and ignoring JPG format" do
|
138
178
|
@doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => ["jpg"])
|
139
|
-
@doc.images.
|
140
|
-
@doc.best_candidate_has_image.
|
179
|
+
expect(@doc.images).to eq(["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"])
|
180
|
+
expect(@doc.best_candidate_has_image).to eq(true)
|
141
181
|
end
|
142
182
|
|
143
183
|
it "should show some images inside of the best candidate, but with height most equal to 400px, no ignoring no format" do
|
144
184
|
@doc = Readability::Document.new(@bbc, :min_image_height => 400, :ignore_image_format => [])
|
145
|
-
@doc.images.
|
146
|
-
@doc.best_candidate_has_image.
|
185
|
+
expect(@doc.images).to eq(["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"])
|
186
|
+
expect(@doc.best_candidate_has_image).to eq(true)
|
147
187
|
end
|
148
|
-
|
188
|
+
|
149
189
|
it "should not miss an image if it exists by itself in a div without text" do
|
150
190
|
@doc = Readability::Document.new(@simple_html_with_img_no_text,:tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false, :do_not_guess_encoding => true)
|
151
|
-
@doc.images.
|
191
|
+
expect(@doc.images).to eq(["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"])
|
152
192
|
end
|
153
|
-
|
193
|
+
|
154
194
|
it "should not double count an image between script and noscript" do
|
155
195
|
@doc = Readability::Document.new(@simple_html_with_img_in_noscript,:tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false, :do_not_guess_encoding => true)
|
156
|
-
@doc.images.
|
196
|
+
expect(@doc.images).to eq(["http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif", "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"])
|
157
197
|
end
|
158
|
-
|
198
|
+
|
159
199
|
end
|
160
200
|
end
|
161
201
|
end
|
@@ -167,11 +207,11 @@ describe Readability do
|
|
167
207
|
end
|
168
208
|
|
169
209
|
it "should transform divs containing no block elements into <p>s" do
|
170
|
-
@doc.html.css("#body").first.name.
|
210
|
+
expect(@doc.html.css("#body").first.name).to eq("p")
|
171
211
|
end
|
172
212
|
|
173
213
|
it "should not transform divs that contain block elements" do
|
174
|
-
@doc.html.css("#contains_blockquote").first.name.
|
214
|
+
expect(@doc.html.css("#contains_blockquote").first.name).to eq("div")
|
175
215
|
end
|
176
216
|
end
|
177
217
|
|
@@ -185,9 +225,9 @@ describe Readability do
|
|
185
225
|
<body></body>
|
186
226
|
</html>
|
187
227
|
HTML
|
188
|
-
doc.author.
|
228
|
+
expect(doc.author).to eql("Austin Fonacier")
|
189
229
|
end
|
190
|
-
|
230
|
+
|
191
231
|
it "should pick up readability's recommended author format" do
|
192
232
|
doc = Readability::Document.new(<<-HTML)
|
193
233
|
<html>
|
@@ -200,9 +240,9 @@ describe Readability do
|
|
200
240
|
</body>
|
201
241
|
</html>
|
202
242
|
HTML
|
203
|
-
doc.author.
|
243
|
+
expect(doc.author).to eql("Austin Fonacier")
|
204
244
|
end
|
205
|
-
|
245
|
+
|
206
246
|
it "should pick up vcard fn" do
|
207
247
|
doc = Readability::Document.new(<<-HTML)
|
208
248
|
<html>
|
@@ -216,9 +256,9 @@ describe Readability do
|
|
216
256
|
</body>
|
217
257
|
</html>
|
218
258
|
HTML
|
219
|
-
doc.author.
|
259
|
+
expect(doc.author).to eql("Austin Fonacier")
|
220
260
|
end
|
221
|
-
|
261
|
+
|
222
262
|
it "should pick up <a rel='author'>" do
|
223
263
|
doc = Readability::Document.new(<<-HTML)
|
224
264
|
<html>
|
@@ -228,9 +268,9 @@ describe Readability do
|
|
228
268
|
</body>
|
229
269
|
</html>
|
230
270
|
HTML
|
231
|
-
doc.author.
|
271
|
+
expect(doc.author).to eql("Danny Banks (rel)")
|
232
272
|
end
|
233
|
-
|
273
|
+
|
234
274
|
it "should pick up <div id='author'>" do
|
235
275
|
doc = Readability::Document.new(<<-HTML)
|
236
276
|
<html>
|
@@ -240,7 +280,7 @@ describe Readability do
|
|
240
280
|
</body>
|
241
281
|
</html>
|
242
282
|
HTML
|
243
|
-
doc.author.
|
283
|
+
expect(doc.author).to eql("Austin Fonacier (author)")
|
244
284
|
end
|
245
285
|
end
|
246
286
|
|
@@ -263,15 +303,15 @@ describe Readability do
|
|
263
303
|
end
|
264
304
|
|
265
305
|
it "should like <div>s more than <th>s" do
|
266
|
-
@doc.score_node(@elem1)[:content_score].
|
306
|
+
expect(@doc.score_node(@elem1)[:content_score]).to be > @doc.score_node(@elem2)[:content_score]
|
267
307
|
end
|
268
308
|
|
269
309
|
it "should like classes like text more than classes like comment" do
|
270
310
|
@elem2.name = "div"
|
271
|
-
@doc.score_node(@elem1)[:content_score].
|
311
|
+
expect(@doc.score_node(@elem1)[:content_score]).to eq(@doc.score_node(@elem2)[:content_score])
|
272
312
|
@elem1['class'] = "text"
|
273
313
|
@elem2['class'] = "comment"
|
274
|
-
@doc.score_node(@elem1)[:content_score].
|
314
|
+
expect(@doc.score_node(@elem1)[:content_score]).to be > @doc.score_node(@elem2)[:content_score]
|
275
315
|
end
|
276
316
|
end
|
277
317
|
|
@@ -282,15 +322,15 @@ describe Readability do
|
|
282
322
|
end
|
283
323
|
|
284
324
|
it "should remove things that have class comment" do
|
285
|
-
@doc.html.inner_html.
|
325
|
+
expect(@doc.html.inner_html).not_to match(/a comment/)
|
286
326
|
end
|
287
327
|
|
288
328
|
it "should not remove body tags" do
|
289
|
-
@doc.html.inner_html.
|
329
|
+
expect(@doc.html.inner_html).to match(/<\/body>/)
|
290
330
|
end
|
291
331
|
|
292
332
|
it "should not remove things with class comment and id body" do
|
293
|
-
@doc.html.inner_html.
|
333
|
+
expect(@doc.html.inner_html).to match(/real content/)
|
294
334
|
end
|
295
335
|
end
|
296
336
|
|
@@ -318,13 +358,13 @@ describe Readability do
|
|
318
358
|
end
|
319
359
|
|
320
360
|
it "should score elements in the document" do
|
321
|
-
@candidates.values.length.
|
361
|
+
expect(@candidates.values.length).to eq(3)
|
322
362
|
end
|
323
363
|
|
324
364
|
it "should prefer the body in this particular example" do
|
325
|
-
@candidates.values.sort { |a, b|
|
365
|
+
expect(@candidates.values.sort { |a, b|
|
326
366
|
b[:content_score] <=> a[:content_score]
|
327
|
-
}.first[:elem][:id].
|
367
|
+
}.first[:elem][:id]).to eq("body")
|
328
368
|
end
|
329
369
|
|
330
370
|
context "when two consequent br tags are used instead of p" do
|
@@ -349,9 +389,162 @@ describe Readability do
|
|
349
389
|
</html>
|
350
390
|
HTML
|
351
391
|
@candidates = @doc.score_paragraphs(0)
|
352
|
-
@candidates.values.sort_by { |a| -a[:content_score] }.first[:elem][:id].
|
392
|
+
expect(@candidates.values.sort_by { |a| -a[:content_score] }.first[:elem][:id]).to eq('post1')
|
353
393
|
end
|
354
394
|
end
|
395
|
+
|
396
|
+
it "does not include short paragraphs as related siblings in the output" do
|
397
|
+
@doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
|
398
|
+
<html>
|
399
|
+
<head>
|
400
|
+
<title>title!</title>
|
401
|
+
</head>
|
402
|
+
<body>
|
403
|
+
<section>
|
404
|
+
<p>Paragraph 1</p>
|
405
|
+
<p>Paragraph 2</p>
|
406
|
+
</section>
|
407
|
+
<section>
|
408
|
+
<p>Too short</p>
|
409
|
+
</section>
|
410
|
+
#{'<a href="/">This link lowers the body score.</a>' * 5}
|
411
|
+
</body>
|
412
|
+
</html>
|
413
|
+
HTML
|
414
|
+
|
415
|
+
expect(@doc.content).to include("Paragraph 1")
|
416
|
+
expect(@doc.content).to include("Paragraph 2")
|
417
|
+
expect(@doc.content).not_to include("Too short")
|
418
|
+
end
|
419
|
+
|
420
|
+
it "includes long paragraphs as related siblings in the output" do
|
421
|
+
@doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
|
422
|
+
<html>
|
423
|
+
<head>
|
424
|
+
<title>title!</title>
|
425
|
+
</head>
|
426
|
+
<body>
|
427
|
+
<section>
|
428
|
+
<p>Paragraph 1</p>
|
429
|
+
<p>Paragraph 2</p>
|
430
|
+
</section>
|
431
|
+
<p>This paragraph is longer than 80 characters so should be included as a sibling in the output.</p>
|
432
|
+
#{'<a href="/">This link lowers the body score.</a>' * 5}
|
433
|
+
</body>
|
434
|
+
</html>
|
435
|
+
HTML
|
436
|
+
|
437
|
+
expect(@doc.content).to include("Paragraph 1")
|
438
|
+
expect(@doc.content).to include("Paragraph 2")
|
439
|
+
expect(@doc.content).to include("This paragraph is longer")
|
440
|
+
end
|
441
|
+
|
442
|
+
it "does not include non-paragraph tags in the output, even when longer than 80 characters" do
|
443
|
+
@doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
|
444
|
+
<html>
|
445
|
+
<head>
|
446
|
+
<title>title!</title>
|
447
|
+
</head>
|
448
|
+
<body>
|
449
|
+
<section>
|
450
|
+
<p>Paragraph 1</p>
|
451
|
+
<p>Paragraph 2</p>
|
452
|
+
</section>
|
453
|
+
<section>
|
454
|
+
<p>Although this paragraph is longer than 80 characters, the sibling is the section so it should not be included.</p>
|
455
|
+
</section>
|
456
|
+
#{'<a href="/">This link lowers the body score.</a>' * 5}
|
457
|
+
</body>
|
458
|
+
</html>
|
459
|
+
HTML
|
460
|
+
|
461
|
+
expect(@doc.content).to include("Paragraph 1")
|
462
|
+
expect(@doc.content).to include("Paragraph 2")
|
463
|
+
expect(@doc.content).not_to include("Although this paragraph")
|
464
|
+
end
|
465
|
+
|
466
|
+
it "does include non-paragraph tags in the output if their content score is high enough" do
|
467
|
+
@doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
|
468
|
+
<html>
|
469
|
+
<head>
|
470
|
+
<title>title!</title>
|
471
|
+
</head>
|
472
|
+
<body>
|
473
|
+
<section>
|
474
|
+
<p>Paragraph 1</p>
|
475
|
+
#{'<p>Paragraph 2</p>' * 10} <!-- Ensure this section remains the best_candidate. -->
|
476
|
+
</section>
|
477
|
+
<section>
|
478
|
+
<p>This should be included in the output because the content is score is high enough.<p>
|
479
|
+
<p>The, inclusion, of, lots, of, commas, increases, the, score, of, an, element.</p>
|
480
|
+
</section>
|
481
|
+
#{'<a href="/">This link lowers the body score.</a>' * 5}
|
482
|
+
</body>
|
483
|
+
</html>
|
484
|
+
HTML
|
485
|
+
|
486
|
+
expect(@doc.content).to include("Paragraph 1")
|
487
|
+
expect(@doc.content).to include("Paragraph 2")
|
488
|
+
expect(@doc.content).to include("This should be included")
|
489
|
+
end
|
490
|
+
|
491
|
+
it "can optionally include other related siblings in the output if they meet the 80 character threshold" do
|
492
|
+
@doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"])
|
493
|
+
<html>
|
494
|
+
<head>
|
495
|
+
<title>title!</title>
|
496
|
+
</head>
|
497
|
+
<body>
|
498
|
+
<section>
|
499
|
+
<p>Paragraph 1</p>
|
500
|
+
#{'<p>Paragraph 2</p>' * 10} <!-- Ensure this section remains the best_candidate. -->
|
501
|
+
</section>
|
502
|
+
<section>
|
503
|
+
<p>This paragraph is longer than 80 characters and inside a section that is a sibling of the best_candidate.</p>
|
504
|
+
<p>The likely_siblings now include the section tag so it should be included in the output.</p>
|
505
|
+
</section>
|
506
|
+
<section>
|
507
|
+
<p>too short when stripped </p>
|
508
|
+
</section>
|
509
|
+
#{'<a href="/">This link lowers the body score.</a>' * 5}
|
510
|
+
</body>
|
511
|
+
</html>
|
512
|
+
HTML
|
513
|
+
|
514
|
+
expect(@doc.content).to include("Paragraph 1")
|
515
|
+
expect(@doc.content).to include("Paragraph 2")
|
516
|
+
expect(@doc.content).to include("should be included")
|
517
|
+
expect(@doc.content).not_to include("too short when stripped")
|
518
|
+
end
|
519
|
+
|
520
|
+
it "climbs the DOM tree to the closest ancestor that has siblings when checking for related siblings" do
|
521
|
+
@doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"], ignore_redundant_nesting: true)
|
522
|
+
<html>
|
523
|
+
<head>
|
524
|
+
<title>title!</title>
|
525
|
+
</head>
|
526
|
+
<body>
|
527
|
+
<div> <!-- This is the closest node of the best candidate that has siblings. -->
|
528
|
+
<div>
|
529
|
+
<section>
|
530
|
+
<p>Paragraph 1</p>
|
531
|
+
#{'<p>Paragraph 2</p>' * 10} <!-- Ensure this section remains the best_candidate. -->
|
532
|
+
</section>
|
533
|
+
</div>
|
534
|
+
</div>
|
535
|
+
<section>
|
536
|
+
<p>This paragraph is longer than 80 characters and inside a section that is a sibling of the ancestor node.</p>
|
537
|
+
<p>The likely_siblings now include the section tag so it should be included in the output.</p>
|
538
|
+
</section>
|
539
|
+
#{'<a href="/">This link lowers the body score.</a>' * 5}
|
540
|
+
</body>
|
541
|
+
</html>
|
542
|
+
HTML
|
543
|
+
|
544
|
+
expect(@doc.content).to include("Paragraph 1")
|
545
|
+
expect(@doc.content).to include("Paragraph 2")
|
546
|
+
expect(@doc.content).to include("should be included")
|
547
|
+
end
|
355
548
|
end
|
356
549
|
|
357
550
|
describe "the cant_read.html fixture" do
|
@@ -359,7 +552,7 @@ describe Readability do
|
|
359
552
|
allowed_tags = %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a]
|
360
553
|
allowed_attributes = %w[href]
|
361
554
|
html = File.read(File.dirname(__FILE__) + "/fixtures/cant_read.html")
|
362
|
-
Readability::Document.new(html, :tags => allowed_tags, :attributes => allowed_attributes).content.
|
555
|
+
expect(Readability::Document.new(html, :tags => allowed_tags, :attributes => allowed_attributes).content).to match(/Can you talk a little about how you developed the looks for the/)
|
363
556
|
end
|
364
557
|
end
|
365
558
|
|
@@ -370,15 +563,15 @@ describe Readability do
|
|
370
563
|
end
|
371
564
|
|
372
565
|
it "should return the main page content" do
|
373
|
-
@doc.content.
|
566
|
+
expect(@doc.content).to match("Some content")
|
374
567
|
end
|
375
568
|
|
376
569
|
it "should return the page title if present" do
|
377
|
-
@doc.title.
|
570
|
+
expect(@doc.title).to match("title!")
|
378
571
|
|
379
572
|
doc = Readability::Document.new("<html><head></head><body><div><p>Some content</p></div></body>",
|
380
573
|
:min_text_length => 0, :retry_length => 1)
|
381
|
-
doc.title.
|
574
|
+
expect(doc.title).to be_nil
|
382
575
|
end
|
383
576
|
end
|
384
577
|
|
@@ -389,7 +582,7 @@ describe Readability do
|
|
389
582
|
end
|
390
583
|
|
391
584
|
it "should not return the sidebar" do
|
392
|
-
@doc.content.
|
585
|
+
expect(@doc.content).not_to match("sidebar")
|
393
586
|
end
|
394
587
|
end
|
395
588
|
|
@@ -407,7 +600,7 @@ describe Readability do
|
|
407
600
|
end
|
408
601
|
|
409
602
|
it "should not return the sidebar" do
|
410
|
-
@doc.content.
|
603
|
+
expect(@doc.content).not_to match("a b c d f")
|
411
604
|
end
|
412
605
|
end
|
413
606
|
|
@@ -427,12 +620,12 @@ describe Readability do
|
|
427
620
|
#puts "testing #{sample}..."
|
428
621
|
|
429
622
|
$required_fragments.each do |required_text|
|
430
|
-
doc.
|
623
|
+
expect(doc).to include(required_text)
|
431
624
|
checks += 1
|
432
625
|
end
|
433
626
|
|
434
627
|
$excluded_fragments.each do |text_to_avoid|
|
435
|
-
doc.
|
628
|
+
expect(doc).not_to include(text_to_avoid)
|
436
629
|
checks += 1
|
437
630
|
end
|
438
631
|
end
|
@@ -446,18 +639,18 @@ describe Readability do
|
|
446
639
|
it "should correctly guess and enforce HTML encoding" do
|
447
640
|
doc = Readability::Document.new("<html><head><meta http-equiv='content-type' content='text/html; charset=LATIN1'></head><body><div>hi!</div></body></html>")
|
448
641
|
content = doc.content
|
449
|
-
content.encoding.to_s.
|
450
|
-
content.
|
642
|
+
expect(content.encoding.to_s).to eq("ISO-8859-1")
|
643
|
+
expect(content).to be_valid_encoding
|
451
644
|
end
|
452
645
|
|
453
646
|
it "should allow encoding guessing to be skipped" do
|
454
|
-
|
647
|
+
expect(GuessHtmlEncoding).to_not receive(:encode)
|
455
648
|
doc = Readability::Document.new(@simple_html_fixture, :do_not_guess_encoding => true)
|
456
649
|
doc.content
|
457
650
|
end
|
458
651
|
|
459
652
|
it "should allow encoding guessing to be overridden" do
|
460
|
-
|
653
|
+
expect(GuessHtmlEncoding).to_not receive(:encode)
|
461
654
|
doc = Readability::Document.new(@simple_html_fixture, :encoding => "UTF-8")
|
462
655
|
doc.content
|
463
656
|
end
|
@@ -469,54 +662,54 @@ describe Readability do
|
|
469
662
|
it "should strip the html comments tag" do
|
470
663
|
doc = Readability::Document.new("<html><head><meta http-equiv='content-type' content='text/html; charset=LATIN1'></head><body><div>hi!<!-- bye~ --></div></body></html>")
|
471
664
|
content = doc.content
|
472
|
-
content.
|
473
|
-
content.
|
665
|
+
expect(content).to include("hi!")
|
666
|
+
expect(content).not_to include("bye")
|
474
667
|
end
|
475
668
|
|
476
669
|
it "should not error with empty content" do
|
477
|
-
Readability::Document.new('').content.
|
670
|
+
expect(Readability::Document.new('').content).to eq('<div><div></div></div>')
|
478
671
|
end
|
479
672
|
|
480
673
|
it "should not error with a document with no <body>" do
|
481
|
-
Readability::Document.new('<html><head><meta http-equiv="refresh" content="0;URL=http://example.com"></head></html>').content.
|
674
|
+
expect(Readability::Document.new('<html><head><meta http-equiv="refresh" content="0;URL=http://example.com"></head></html>').content).to eq('<div><div></div></div>')
|
482
675
|
end
|
483
676
|
end
|
484
|
-
|
677
|
+
|
485
678
|
describe "No side-effects" do
|
486
679
|
before do
|
487
680
|
@bbc = File.read(File.dirname(__FILE__) + "/fixtures/bbc.html")
|
488
681
|
@nytimes = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html")
|
489
|
-
@
|
682
|
+
@thesun = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
|
490
683
|
end
|
491
|
-
|
684
|
+
|
492
685
|
it "should not have any side-effects when calling content() and then images()" do
|
493
|
-
@doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
|
686
|
+
@doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
|
494
687
|
:do_not_guess_encoding => true)
|
495
|
-
@doc.images.
|
688
|
+
expect(@doc.images).to eq(["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"])
|
496
689
|
@doc.content
|
497
|
-
@doc.images.
|
690
|
+
expect(@doc.images).to eq(["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"])
|
498
691
|
end
|
499
|
-
|
692
|
+
|
500
693
|
it "should not have any side-effects when calling content() multiple times" do
|
501
|
-
@doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
|
694
|
+
@doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
|
502
695
|
:do_not_guess_encoding => true)
|
503
|
-
@doc.content.
|
696
|
+
expect(@doc.content).to eq(@doc.content)
|
504
697
|
end
|
505
|
-
|
698
|
+
|
506
699
|
it "should not have any side-effects when calling content and images multiple times" do
|
507
|
-
@doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
|
700
|
+
@doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
|
508
701
|
:do_not_guess_encoding => true)
|
509
|
-
@doc.images.
|
510
|
-
@doc.content.
|
511
|
-
@doc.images.
|
702
|
+
expect(@doc.images).to eq(["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"])
|
703
|
+
expect(@doc.content).to eq(@doc.content)
|
704
|
+
expect(@doc.images).to eq(["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"])
|
512
705
|
end
|
513
|
-
|
706
|
+
|
514
707
|
end
|
515
|
-
|
708
|
+
|
516
709
|
describe "Code blocks" do
|
517
710
|
before do
|
518
711
|
@code = File.read(File.dirname(__FILE__) + "/fixtures/code.html")
|
519
|
-
@content = Readability::Document.new(@code,
|
712
|
+
@content = Readability::Document.new(@code,
|
520
713
|
:tags => %w[div p img a ul ol li h1 h2 h3 h4 h5 h6 blockquote strong em b code pre],
|
521
714
|
:attributes => %w[src href],
|
522
715
|
:remove_empty_nodes => false).content
|
@@ -524,29 +717,29 @@ describe Readability do
|
|
524
717
|
end
|
525
718
|
|
526
719
|
it "preserve the code blocks" do
|
527
|
-
@doc.css("code pre").text.
|
720
|
+
expect(@doc.css("code pre").text).to eq("\nroot\n indented\n ")
|
528
721
|
end
|
529
722
|
|
530
723
|
it "preserve backwards code blocks" do
|
531
|
-
@doc.css("pre code").text.
|
724
|
+
expect(@doc.css("pre code").text).to eq("\nsecond\n indented\n ")
|
532
725
|
end
|
533
726
|
end
|
534
727
|
|
535
728
|
describe "remove all tags" do
|
536
729
|
it "should work for an incomplete piece of HTML" do
|
537
730
|
doc = Readability::Document.new('<div>test</div', :tags => [])
|
538
|
-
doc.content.
|
731
|
+
expect(doc.content).to eq('test')
|
539
732
|
end
|
540
733
|
|
541
734
|
it "should work for a HTML document" do
|
542
735
|
doc = Readability::Document.new('<html><head><title>title!</title></head><body><div><p>test</p></div></body></html>',
|
543
736
|
:tags => [])
|
544
|
-
doc.content.
|
737
|
+
expect(doc.content).to eq('test')
|
545
738
|
end
|
546
739
|
|
547
740
|
it "should work for a plain text" do
|
548
741
|
doc = Readability::Document.new('test', :tags => [])
|
549
|
-
doc.content.
|
742
|
+
expect(doc.content).to eq('test')
|
550
743
|
end
|
551
744
|
end
|
552
745
|
|
@@ -563,8 +756,8 @@ describe Readability do
|
|
563
756
|
doc = Readability::Document.new(boing_boing)
|
564
757
|
|
565
758
|
content = doc.content
|
566
|
-
(content !~ /Bees and Bombs/).
|
567
|
-
content.
|
759
|
+
expect(content !~ /Bees and Bombs/).to eq(true)
|
760
|
+
expect(content).to match(/ADVERTISE/)
|
568
761
|
end
|
569
762
|
|
570
763
|
it "should apply whitelist" do
|
@@ -572,23 +765,45 @@ describe Readability do
|
|
572
765
|
doc = Readability::Document.new(boing_boing,
|
573
766
|
whitelist: ".post-content")
|
574
767
|
content = doc.content
|
575
|
-
content.
|
768
|
+
expect(content).to match(/Bees and Bombs/)
|
576
769
|
end
|
577
770
|
|
578
771
|
it "should apply blacklist" do
|
579
772
|
doc = Readability::Document.new(boing_boing, blacklist: "#sidebar_adblock")
|
580
773
|
content = doc.content
|
581
|
-
(content !~ /ADVERTISE/).
|
774
|
+
expect(content !~ /ADVERTISE/).to eq(true)
|
582
775
|
|
583
776
|
end
|
584
777
|
end
|
585
778
|
|
586
779
|
describe "clean_conditionally_reason?" do
|
587
|
-
let
|
780
|
+
let(:list_fixture) { "<div><p>test</p>#{'<li></li>' * 102}" }
|
588
781
|
|
589
782
|
it "does not raise error" do
|
590
783
|
@doc = Readability::Document.new(list_fixture)
|
591
784
|
expect { @doc.content }.to_not raise_error
|
592
785
|
end
|
593
786
|
end
|
787
|
+
|
788
|
+
describe "clean_conditionally" do
|
789
|
+
let(:fixture) { "<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>" }
|
790
|
+
|
791
|
+
it "can set a clean_conditionally function to allow overriding the default decision" do
|
792
|
+
clean_conditionally_fn = lambda { |context| !context[:remove] } # Flip the decision.
|
793
|
+
content = Readability::Document.new(fixture, clean_conditionally: clean_conditionally_fn, min_text_length: 0, retry_length: 1).content
|
794
|
+
|
795
|
+
expect(content).to include("sidebar")
|
796
|
+
expect(content).not_to include('Some content')
|
797
|
+
end
|
798
|
+
end
|
799
|
+
|
800
|
+
describe "debug" do
|
801
|
+
it "can set a debug function, e.g. to send output to Rails logger" do
|
802
|
+
output = []
|
803
|
+
debug_fn = lambda { |str| output << str }
|
804
|
+
|
805
|
+
Readability::Document.new(@simple_html_fixture, debug: debug_fn).content
|
806
|
+
expect(output).not_to be_empty
|
807
|
+
end
|
808
|
+
end
|
594
809
|
end
|