ruby-readability 0.7.0 → 0.7.2

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -19,7 +19,7 @@ describe Readability do
19
19
  </body>
20
20
  </html>
21
21
  HTML
22
-
22
+
23
23
  @simple_html_with_img_no_text = <<-HTML
24
24
  <html>
25
25
  <head>
@@ -32,7 +32,7 @@ describe Readability do
32
32
  </body>
33
33
  </html>
34
34
  HTML
35
-
35
+
36
36
  @simple_html_with_img_in_noscript = <<-HTML
37
37
  <html>
38
38
  <head>
@@ -40,8 +40,8 @@ describe Readability do
40
40
  </head>
41
41
  <body class='main'>
42
42
  <div class="article-img">
43
- <img src="http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif" width="660"
44
- height="317" alt="test" class="lazy"
43
+ <img src="http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif" width="660"
44
+ height="317" alt="test" class="lazy"
45
45
  data-original="http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg">
46
46
  <noscript><img src="http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"></noscript>
47
47
  </div>
@@ -54,30 +54,70 @@ describe Readability do
54
54
  before do
55
55
  @bbc = File.read(File.dirname(__FILE__) + "/fixtures/bbc.html")
56
56
  @nytimes = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html")
57
- @thesum = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
57
+ @thesun = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
58
+ @ch = File.read(File.dirname(__FILE__) + "/fixtures/codinghorror.html")
59
+ @nested = File.read(File.dirname(__FILE__) + "/fixtures/nested_images.html")
58
60
 
59
61
  FakeWeb::Registry.instance.clean_registry
62
+
60
63
  FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg",
61
64
  :body => File.read(File.dirname(__FILE__) + "/fixtures/images/dim_1416768a.jpg"))
62
-
65
+
63
66
  FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif",
64
67
  :body => File.read(File.dirname(__FILE__) + "/fixtures/images/sign_up_emails_682__703711a.gif"))
65
-
66
- FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703712a.gif",
68
+
69
+ FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703712a.gif",
67
70
  :body => File.read(File.dirname(__FILE__) + "/fixtures/images/sign_up_emails_682__703712a.gif"))
68
-
71
+
72
+ # Register images for codinghorror
73
+ FakeWeb.register_uri(:get, 'http://blog.codinghorror.com/content/images/2014/Sep/JohnPinhole.jpg',
74
+ :body => File.read(File.dirname(__FILE__) + "/fixtures/images/JohnPinhole.jpg"))
75
+ FakeWeb.register_uri(:get, 'http://blog.codinghorror.com/content/images/2014/Sep/Confusion_of_Tongues.png',
76
+ :body => File.read(File.dirname(__FILE__) + "/fixtures/images/Confusion_of_Tongues.png"))
69
77
  end
70
78
 
71
79
  it "should show one image, but outside of the best candidate" do
72
- @doc = Readability::Document.new(@thesum)
73
- @doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg", "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif", "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703712a.gif"]
74
- @doc.best_candidate_has_image.should == false
80
+ @doc = Readability::Document.new(@thesun)
81
+ expect(@doc.images).to eq(["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg", "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif", "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703712a.gif"])
82
+ expect(@doc.best_candidate_has_image).to eq(false)
75
83
  end
76
84
 
77
85
  it "should show one image inside of the best candidate" do
78
86
  @doc = Readability::Document.new(@nytimes)
79
- @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
80
- @doc.best_candidate_has_image.should == true
87
+ expect(@doc.images).to eq(["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"])
88
+ expect(@doc.best_candidate_has_image).to eq(true)
89
+ end
90
+
91
+ it "should expand relative image url" do
92
+ url = 'http://blog.codinghorror.com/standard-flavored-markdown/'
93
+ @doc = Readability::Document.new(@ch, tags: %w[div p img a],
94
+ attributes: %w[src href],
95
+ remove_empty_nodes: false)
96
+ @doc.images_with_fqdn_uris!(url)
97
+
98
+ expect(@doc.content).to include('http://blog.codinghorror.com/content/images/2014/Sep/JohnPinhole.jpg')
99
+ expect(@doc.content).to include('http://blog.codinghorror.com/content/images/2014/Sep/Confusion_of_Tongues.png')
100
+
101
+ expect(@doc.images).to match_array([
102
+ 'http://blog.codinghorror.com/content/images/2014/Sep/JohnPinhole.jpg',
103
+ 'http://blog.codinghorror.com/content/images/2014/Sep/Confusion_of_Tongues.png'
104
+ ])
105
+ end
106
+
107
+ it "should be able to preserve deeply nested image tags in the article's content by whitelisting all tags" do
108
+ @doc = Readability::Document.new(@nested, attributes: ["src"])
109
+ expect(@doc.images).to be_empty
110
+
111
+ @doc = Readability::Document.new(@nested, attributes: ["src"], tags: ["figure", "image"])
112
+ expect(@doc.images).to be_empty
113
+
114
+ @doc = Readability::Document.new(@nested, attributes: ["src"], tags: ["*"])
115
+ expect(@doc.content).to include('<img src="http://example.com/image.jpeg" />')
116
+ end
117
+
118
+ it "should be able to whitelist all attributes" do
119
+ @doc = Readability::Document.new(@nested, attributes: ["*"], tags: ["*"])
120
+ expect(@doc.content).to include('<img src="http://example.com/image.jpeg" />')
81
121
  end
82
122
 
83
123
  it "should not try to download local images" do
@@ -93,69 +133,69 @@ describe Readability do
93
133
  </body>
94
134
  </html>
95
135
  HTML
96
- do_not_allow(@doc).load_image(anything)
97
- @doc.images.should == []
136
+ expect(@doc).not_to receive(:get_image_size)
137
+ expect(@doc.images).to eq([])
98
138
  end
99
139
 
100
140
  describe "no images" do
101
141
  it "shouldn't show images" do
102
142
  @doc = Readability::Document.new(@bbc, :min_image_height => 600)
103
- @doc.images.should == []
104
- @doc.best_candidate_has_image.should == false
143
+ expect(@doc.images).to eq([])
144
+ expect(@doc.best_candidate_has_image).to eq(false)
105
145
  end
106
146
  end
107
147
 
108
148
  describe "poll of images" do
109
149
  it "should show some images inside of the best candidate" do
110
150
  @doc = Readability::Document.new(@bbc)
111
- @doc.images.should =~ ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg",
151
+ expect(@doc.images).to match_array(["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg",
112
152
  "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg",
113
153
  "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif",
114
- "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
115
- @doc.best_candidate_has_image.should == true
154
+ "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"])
155
+ expect(@doc.best_candidate_has_image).to eq(true)
116
156
  end
117
157
 
118
158
  it "should show some images inside of the best candidate, include gif format" do
119
159
  @doc = Readability::Document.new(@bbc, :ignore_image_format => [])
120
- @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
121
- @doc.best_candidate_has_image.should == true
160
+ expect(@doc.images).to eq(["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"])
161
+ expect(@doc.best_candidate_has_image).to eq(true)
122
162
  end
123
163
 
124
164
  describe "width, height and format" do
125
165
  it "should show some images inside of the best candidate, but with width most equal to 400px" do
126
166
  @doc = Readability::Document.new(@bbc, :min_image_width => 400, :ignore_image_format => [])
127
- @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg"]
128
- @doc.best_candidate_has_image.should == true
167
+ expect(@doc.images).to eq(["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg"])
168
+ expect(@doc.best_candidate_has_image).to eq(true)
129
169
  end
130
170
 
131
171
  it "should show some images inside of the best candidate, but with width most equal to 304px" do
132
172
  @doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => [])
133
- @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
134
- @doc.best_candidate_has_image.should == true
173
+ expect(@doc.images).to eq(["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"])
174
+ expect(@doc.best_candidate_has_image).to eq(true)
135
175
  end
136
176
 
137
177
  it "should show some images inside of the best candidate, but with width most equal to 304px and ignoring JPG format" do
138
178
  @doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => ["jpg"])
139
- @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
140
- @doc.best_candidate_has_image.should == true
179
+ expect(@doc.images).to eq(["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"])
180
+ expect(@doc.best_candidate_has_image).to eq(true)
141
181
  end
142
182
 
143
183
  it "should show some images inside of the best candidate, but with height most equal to 400px, no ignoring no format" do
144
184
  @doc = Readability::Document.new(@bbc, :min_image_height => 400, :ignore_image_format => [])
145
- @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
146
- @doc.best_candidate_has_image.should == true
185
+ expect(@doc.images).to eq(["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"])
186
+ expect(@doc.best_candidate_has_image).to eq(true)
147
187
  end
148
-
188
+
149
189
  it "should not miss an image if it exists by itself in a div without text" do
150
190
  @doc = Readability::Document.new(@simple_html_with_img_no_text,:tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false, :do_not_guess_encoding => true)
151
- @doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"]
191
+ expect(@doc.images).to eq(["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"])
152
192
  end
153
-
193
+
154
194
  it "should not double count an image between script and noscript" do
155
195
  @doc = Readability::Document.new(@simple_html_with_img_in_noscript,:tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false, :do_not_guess_encoding => true)
156
- @doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif", "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"]
196
+ expect(@doc.images).to eq(["http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif", "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"])
157
197
  end
158
-
198
+
159
199
  end
160
200
  end
161
201
  end
@@ -167,11 +207,11 @@ describe Readability do
167
207
  end
168
208
 
169
209
  it "should transform divs containing no block elements into <p>s" do
170
- @doc.html.css("#body").first.name.should == "p"
210
+ expect(@doc.html.css("#body").first.name).to eq("p")
171
211
  end
172
212
 
173
213
  it "should not transform divs that contain block elements" do
174
- @doc.html.css("#contains_blockquote").first.name.should == "div"
214
+ expect(@doc.html.css("#contains_blockquote").first.name).to eq("div")
175
215
  end
176
216
  end
177
217
 
@@ -185,9 +225,9 @@ describe Readability do
185
225
  <body></body>
186
226
  </html>
187
227
  HTML
188
- doc.author.should eql("Austin Fonacier")
228
+ expect(doc.author).to eql("Austin Fonacier")
189
229
  end
190
-
230
+
191
231
  it "should pick up readability's recommended author format" do
192
232
  doc = Readability::Document.new(<<-HTML)
193
233
  <html>
@@ -200,9 +240,9 @@ describe Readability do
200
240
  </body>
201
241
  </html>
202
242
  HTML
203
- doc.author.should eql("Austin Fonacier")
243
+ expect(doc.author).to eql("Austin Fonacier")
204
244
  end
205
-
245
+
206
246
  it "should pick up vcard fn" do
207
247
  doc = Readability::Document.new(<<-HTML)
208
248
  <html>
@@ -216,9 +256,9 @@ describe Readability do
216
256
  </body>
217
257
  </html>
218
258
  HTML
219
- doc.author.should eql("Austin Fonacier")
259
+ expect(doc.author).to eql("Austin Fonacier")
220
260
  end
221
-
261
+
222
262
  it "should pick up <a rel='author'>" do
223
263
  doc = Readability::Document.new(<<-HTML)
224
264
  <html>
@@ -228,9 +268,9 @@ describe Readability do
228
268
  </body>
229
269
  </html>
230
270
  HTML
231
- doc.author.should eql("Danny Banks (rel)")
271
+ expect(doc.author).to eql("Danny Banks (rel)")
232
272
  end
233
-
273
+
234
274
  it "should pick up <div id='author'>" do
235
275
  doc = Readability::Document.new(<<-HTML)
236
276
  <html>
@@ -240,7 +280,7 @@ describe Readability do
240
280
  </body>
241
281
  </html>
242
282
  HTML
243
- doc.author.should eql("Austin Fonacier (author)")
283
+ expect(doc.author).to eql("Austin Fonacier (author)")
244
284
  end
245
285
  end
246
286
 
@@ -263,15 +303,15 @@ describe Readability do
263
303
  end
264
304
 
265
305
  it "should like <div>s more than <th>s" do
266
- @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score]
306
+ expect(@doc.score_node(@elem1)[:content_score]).to be > @doc.score_node(@elem2)[:content_score]
267
307
  end
268
308
 
269
309
  it "should like classes like text more than classes like comment" do
270
310
  @elem2.name = "div"
271
- @doc.score_node(@elem1)[:content_score].should == @doc.score_node(@elem2)[:content_score]
311
+ expect(@doc.score_node(@elem1)[:content_score]).to eq(@doc.score_node(@elem2)[:content_score])
272
312
  @elem1['class'] = "text"
273
313
  @elem2['class'] = "comment"
274
- @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score]
314
+ expect(@doc.score_node(@elem1)[:content_score]).to be > @doc.score_node(@elem2)[:content_score]
275
315
  end
276
316
  end
277
317
 
@@ -282,15 +322,15 @@ describe Readability do
282
322
  end
283
323
 
284
324
  it "should remove things that have class comment" do
285
- @doc.html.inner_html.should_not =~ /a comment/
325
+ expect(@doc.html.inner_html).not_to match(/a comment/)
286
326
  end
287
327
 
288
328
  it "should not remove body tags" do
289
- @doc.html.inner_html.should =~ /<\/body>/
329
+ expect(@doc.html.inner_html).to match(/<\/body>/)
290
330
  end
291
331
 
292
332
  it "should not remove things with class comment and id body" do
293
- @doc.html.inner_html.should =~ /real content/
333
+ expect(@doc.html.inner_html).to match(/real content/)
294
334
  end
295
335
  end
296
336
 
@@ -318,13 +358,13 @@ describe Readability do
318
358
  end
319
359
 
320
360
  it "should score elements in the document" do
321
- @candidates.values.length.should == 3
361
+ expect(@candidates.values.length).to eq(3)
322
362
  end
323
363
 
324
364
  it "should prefer the body in this particular example" do
325
- @candidates.values.sort { |a, b|
365
+ expect(@candidates.values.sort { |a, b|
326
366
  b[:content_score] <=> a[:content_score]
327
- }.first[:elem][:id].should == "body"
367
+ }.first[:elem][:id]).to eq("body")
328
368
  end
329
369
 
330
370
  context "when two consequent br tags are used instead of p" do
@@ -349,9 +389,162 @@ describe Readability do
349
389
  </html>
350
390
  HTML
351
391
  @candidates = @doc.score_paragraphs(0)
352
- @candidates.values.sort_by { |a| -a[:content_score] }.first[:elem][:id].should == 'post1'
392
+ expect(@candidates.values.sort_by { |a| -a[:content_score] }.first[:elem][:id]).to eq('post1')
353
393
  end
354
394
  end
395
+
396
+ it "does not include short paragraphs as related siblings in the output" do
397
+ @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
398
+ <html>
399
+ <head>
400
+ <title>title!</title>
401
+ </head>
402
+ <body>
403
+ <section>
404
+ <p>Paragraph 1</p>
405
+ <p>Paragraph 2</p>
406
+ </section>
407
+ <section>
408
+ <p>Too short</p>
409
+ </section>
410
+ #{'<a href="/">This link lowers the body score.</a>' * 5}
411
+ </body>
412
+ </html>
413
+ HTML
414
+
415
+ expect(@doc.content).to include("Paragraph 1")
416
+ expect(@doc.content).to include("Paragraph 2")
417
+ expect(@doc.content).not_to include("Too short")
418
+ end
419
+
420
+ it "includes long paragraphs as related siblings in the output" do
421
+ @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
422
+ <html>
423
+ <head>
424
+ <title>title!</title>
425
+ </head>
426
+ <body>
427
+ <section>
428
+ <p>Paragraph 1</p>
429
+ <p>Paragraph 2</p>
430
+ </section>
431
+ <p>This paragraph is longer than 80 characters so should be included as a sibling in the output.</p>
432
+ #{'<a href="/">This link lowers the body score.</a>' * 5}
433
+ </body>
434
+ </html>
435
+ HTML
436
+
437
+ expect(@doc.content).to include("Paragraph 1")
438
+ expect(@doc.content).to include("Paragraph 2")
439
+ expect(@doc.content).to include("This paragraph is longer")
440
+ end
441
+
442
+ it "does not include non-paragraph tags in the output, even when longer than 80 characters" do
443
+ @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
444
+ <html>
445
+ <head>
446
+ <title>title!</title>
447
+ </head>
448
+ <body>
449
+ <section>
450
+ <p>Paragraph 1</p>
451
+ <p>Paragraph 2</p>
452
+ </section>
453
+ <section>
454
+ <p>Although this paragraph is longer than 80 characters, the sibling is the section so it should not be included.</p>
455
+ </section>
456
+ #{'<a href="/">This link lowers the body score.</a>' * 5}
457
+ </body>
458
+ </html>
459
+ HTML
460
+
461
+ expect(@doc.content).to include("Paragraph 1")
462
+ expect(@doc.content).to include("Paragraph 2")
463
+ expect(@doc.content).not_to include("Although this paragraph")
464
+ end
465
+
466
+ it "does include non-paragraph tags in the output if their content score is high enough" do
467
+ @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
468
+ <html>
469
+ <head>
470
+ <title>title!</title>
471
+ </head>
472
+ <body>
473
+ <section>
474
+ <p>Paragraph 1</p>
475
+ #{'<p>Paragraph 2</p>' * 10} <!-- Ensure this section remains the best_candidate. -->
476
+ </section>
477
+ <section>
478
+ <p>This should be included in the output because the content is score is high enough.<p>
479
+ <p>The, inclusion, of, lots, of, commas, increases, the, score, of, an, element.</p>
480
+ </section>
481
+ #{'<a href="/">This link lowers the body score.</a>' * 5}
482
+ </body>
483
+ </html>
484
+ HTML
485
+
486
+ expect(@doc.content).to include("Paragraph 1")
487
+ expect(@doc.content).to include("Paragraph 2")
488
+ expect(@doc.content).to include("This should be included")
489
+ end
490
+
491
+ it "can optionally include other related siblings in the output if they meet the 80 character threshold" do
492
+ @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"])
493
+ <html>
494
+ <head>
495
+ <title>title!</title>
496
+ </head>
497
+ <body>
498
+ <section>
499
+ <p>Paragraph 1</p>
500
+ #{'<p>Paragraph 2</p>' * 10} <!-- Ensure this section remains the best_candidate. -->
501
+ </section>
502
+ <section>
503
+ <p>This paragraph is longer than 80 characters and inside a section that is a sibling of the best_candidate.</p>
504
+ <p>The likely_siblings now include the section tag so it should be included in the output.</p>
505
+ </section>
506
+ <section>
507
+ <p>too short when stripped </p>
508
+ </section>
509
+ #{'<a href="/">This link lowers the body score.</a>' * 5}
510
+ </body>
511
+ </html>
512
+ HTML
513
+
514
+ expect(@doc.content).to include("Paragraph 1")
515
+ expect(@doc.content).to include("Paragraph 2")
516
+ expect(@doc.content).to include("should be included")
517
+ expect(@doc.content).not_to include("too short when stripped")
518
+ end
519
+
520
+ it "climbs the DOM tree to the closest ancestor that has siblings when checking for related siblings" do
521
+ @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"], ignore_redundant_nesting: true)
522
+ <html>
523
+ <head>
524
+ <title>title!</title>
525
+ </head>
526
+ <body>
527
+ <div> <!-- This is the closest node of the best candidate that has siblings. -->
528
+ <div>
529
+ <section>
530
+ <p>Paragraph 1</p>
531
+ #{'<p>Paragraph 2</p>' * 10} <!-- Ensure this section remains the best_candidate. -->
532
+ </section>
533
+ </div>
534
+ </div>
535
+ <section>
536
+ <p>This paragraph is longer than 80 characters and inside a section that is a sibling of the ancestor node.</p>
537
+ <p>The likely_siblings now include the section tag so it should be included in the output.</p>
538
+ </section>
539
+ #{'<a href="/">This link lowers the body score.</a>' * 5}
540
+ </body>
541
+ </html>
542
+ HTML
543
+
544
+ expect(@doc.content).to include("Paragraph 1")
545
+ expect(@doc.content).to include("Paragraph 2")
546
+ expect(@doc.content).to include("should be included")
547
+ end
355
548
  end
356
549
 
357
550
  describe "the cant_read.html fixture" do
@@ -359,7 +552,7 @@ describe Readability do
359
552
  allowed_tags = %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a]
360
553
  allowed_attributes = %w[href]
361
554
  html = File.read(File.dirname(__FILE__) + "/fixtures/cant_read.html")
362
- Readability::Document.new(html, :tags => allowed_tags, :attributes => allowed_attributes).content.should match(/Can you talk a little about how you developed the looks for the/)
555
+ expect(Readability::Document.new(html, :tags => allowed_tags, :attributes => allowed_attributes).content).to match(/Can you talk a little about how you developed the looks for the/)
363
556
  end
364
557
  end
365
558
 
@@ -370,15 +563,15 @@ describe Readability do
370
563
  end
371
564
 
372
565
  it "should return the main page content" do
373
- @doc.content.should match("Some content")
566
+ expect(@doc.content).to match("Some content")
374
567
  end
375
568
 
376
569
  it "should return the page title if present" do
377
- @doc.title.should match("title!")
570
+ expect(@doc.title).to match("title!")
378
571
 
379
572
  doc = Readability::Document.new("<html><head></head><body><div><p>Some content</p></div></body>",
380
573
  :min_text_length => 0, :retry_length => 1)
381
- doc.title.should be_nil
574
+ expect(doc.title).to be_nil
382
575
  end
383
576
  end
384
577
 
@@ -389,7 +582,7 @@ describe Readability do
389
582
  end
390
583
 
391
584
  it "should not return the sidebar" do
392
- @doc.content.should_not match("sidebar")
585
+ expect(@doc.content).not_to match("sidebar")
393
586
  end
394
587
  end
395
588
 
@@ -407,7 +600,7 @@ describe Readability do
407
600
  end
408
601
 
409
602
  it "should not return the sidebar" do
410
- @doc.content.should_not match("a b c d f")
603
+ expect(@doc.content).not_to match("a b c d f")
411
604
  end
412
605
  end
413
606
 
@@ -427,12 +620,12 @@ describe Readability do
427
620
  #puts "testing #{sample}..."
428
621
 
429
622
  $required_fragments.each do |required_text|
430
- doc.should include(required_text)
623
+ expect(doc).to include(required_text)
431
624
  checks += 1
432
625
  end
433
626
 
434
627
  $excluded_fragments.each do |text_to_avoid|
435
- doc.should_not include(text_to_avoid)
628
+ expect(doc).not_to include(text_to_avoid)
436
629
  checks += 1
437
630
  end
438
631
  end
@@ -446,18 +639,18 @@ describe Readability do
446
639
  it "should correctly guess and enforce HTML encoding" do
447
640
  doc = Readability::Document.new("<html><head><meta http-equiv='content-type' content='text/html; charset=LATIN1'></head><body><div>hi!</div></body></html>")
448
641
  content = doc.content
449
- content.encoding.to_s.should == "ISO-8859-1"
450
- content.should be_valid_encoding
642
+ expect(content.encoding.to_s).to eq("ISO-8859-1")
643
+ expect(content).to be_valid_encoding
451
644
  end
452
645
 
453
646
  it "should allow encoding guessing to be skipped" do
454
- do_not_allow(GuessHtmlEncoding).encode
647
+ expect(GuessHtmlEncoding).to_not receive(:encode)
455
648
  doc = Readability::Document.new(@simple_html_fixture, :do_not_guess_encoding => true)
456
649
  doc.content
457
650
  end
458
651
 
459
652
  it "should allow encoding guessing to be overridden" do
460
- do_not_allow(GuessHtmlEncoding).encode
653
+ expect(GuessHtmlEncoding).to_not receive(:encode)
461
654
  doc = Readability::Document.new(@simple_html_fixture, :encoding => "UTF-8")
462
655
  doc.content
463
656
  end
@@ -469,54 +662,54 @@ describe Readability do
469
662
  it "should strip the html comments tag" do
470
663
  doc = Readability::Document.new("<html><head><meta http-equiv='content-type' content='text/html; charset=LATIN1'></head><body><div>hi!<!-- bye~ --></div></body></html>")
471
664
  content = doc.content
472
- content.should include("hi!")
473
- content.should_not include("bye")
665
+ expect(content).to include("hi!")
666
+ expect(content).not_to include("bye")
474
667
  end
475
668
 
476
669
  it "should not error with empty content" do
477
- Readability::Document.new('').content.should == '<div><div></div></div>'
670
+ expect(Readability::Document.new('').content).to eq('<div><div></div></div>')
478
671
  end
479
672
 
480
673
  it "should not error with a document with no <body>" do
481
- Readability::Document.new('<html><head><meta http-equiv="refresh" content="0;URL=http://example.com"></head></html>').content.should == '<div><div></div></div>'
674
+ expect(Readability::Document.new('<html><head><meta http-equiv="refresh" content="0;URL=http://example.com"></head></html>').content).to eq('<div><div></div></div>')
482
675
  end
483
676
  end
484
-
677
+
485
678
  describe "No side-effects" do
486
679
  before do
487
680
  @bbc = File.read(File.dirname(__FILE__) + "/fixtures/bbc.html")
488
681
  @nytimes = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html")
489
- @thesum = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
682
+ @thesun = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
490
683
  end
491
-
684
+
492
685
  it "should not have any side-effects when calling content() and then images()" do
493
- @doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
686
+ @doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
494
687
  :do_not_guess_encoding => true)
495
- @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
688
+ expect(@doc.images).to eq(["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"])
496
689
  @doc.content
497
- @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
690
+ expect(@doc.images).to eq(["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"])
498
691
  end
499
-
692
+
500
693
  it "should not have any side-effects when calling content() multiple times" do
501
- @doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
694
+ @doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
502
695
  :do_not_guess_encoding => true)
503
- @doc.content.should == @doc.content
696
+ expect(@doc.content).to eq(@doc.content)
504
697
  end
505
-
698
+
506
699
  it "should not have any side-effects when calling content and images multiple times" do
507
- @doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
700
+ @doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
508
701
  :do_not_guess_encoding => true)
509
- @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
510
- @doc.content.should == @doc.content
511
- @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
702
+ expect(@doc.images).to eq(["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"])
703
+ expect(@doc.content).to eq(@doc.content)
704
+ expect(@doc.images).to eq(["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"])
512
705
  end
513
-
706
+
514
707
  end
515
-
708
+
516
709
  describe "Code blocks" do
517
710
  before do
518
711
  @code = File.read(File.dirname(__FILE__) + "/fixtures/code.html")
519
- @content = Readability::Document.new(@code,
712
+ @content = Readability::Document.new(@code,
520
713
  :tags => %w[div p img a ul ol li h1 h2 h3 h4 h5 h6 blockquote strong em b code pre],
521
714
  :attributes => %w[src href],
522
715
  :remove_empty_nodes => false).content
@@ -524,29 +717,29 @@ describe Readability do
524
717
  end
525
718
 
526
719
  it "preserve the code blocks" do
527
- @doc.css("code pre").text.should == "\nroot\n indented\n "
720
+ expect(@doc.css("code pre").text).to eq("\nroot\n indented\n ")
528
721
  end
529
722
 
530
723
  it "preserve backwards code blocks" do
531
- @doc.css("pre code").text.should == "\nsecond\n indented\n "
724
+ expect(@doc.css("pre code").text).to eq("\nsecond\n indented\n ")
532
725
  end
533
726
  end
534
727
 
535
728
  describe "remove all tags" do
536
729
  it "should work for an incomplete piece of HTML" do
537
730
  doc = Readability::Document.new('<div>test</div', :tags => [])
538
- doc.content.should == 'test'
731
+ expect(doc.content).to eq('test')
539
732
  end
540
733
 
541
734
  it "should work for a HTML document" do
542
735
  doc = Readability::Document.new('<html><head><title>title!</title></head><body><div><p>test</p></div></body></html>',
543
736
  :tags => [])
544
- doc.content.should == 'test'
737
+ expect(doc.content).to eq('test')
545
738
  end
546
739
 
547
740
  it "should work for a plain text" do
548
741
  doc = Readability::Document.new('test', :tags => [])
549
- doc.content.should == 'test'
742
+ expect(doc.content).to eq('test')
550
743
  end
551
744
  end
552
745
 
@@ -563,8 +756,8 @@ describe Readability do
563
756
  doc = Readability::Document.new(boing_boing)
564
757
 
565
758
  content = doc.content
566
- (content !~ /Bees and Bombs/).should == true
567
- content.should =~ /ADVERTISE/
759
+ expect(content !~ /Bees and Bombs/).to eq(true)
760
+ expect(content).to match(/ADVERTISE/)
568
761
  end
569
762
 
570
763
  it "should apply whitelist" do
@@ -572,23 +765,45 @@ describe Readability do
572
765
  doc = Readability::Document.new(boing_boing,
573
766
  whitelist: ".post-content")
574
767
  content = doc.content
575
- content.should =~ /Bees and Bombs/
768
+ expect(content).to match(/Bees and Bombs/)
576
769
  end
577
770
 
578
771
  it "should apply blacklist" do
579
772
  doc = Readability::Document.new(boing_boing, blacklist: "#sidebar_adblock")
580
773
  content = doc.content
581
- (content !~ /ADVERTISE/).should == true
774
+ expect(content !~ /ADVERTISE/).to eq(true)
582
775
 
583
776
  end
584
777
  end
585
778
 
586
779
  describe "clean_conditionally_reason?" do
587
- let (:list_fixture) { "<div><p>test</p>#{'<li></li>' * 102}" }
780
+ let(:list_fixture) { "<div><p>test</p>#{'<li></li>' * 102}" }
588
781
 
589
782
  it "does not raise error" do
590
783
  @doc = Readability::Document.new(list_fixture)
591
784
  expect { @doc.content }.to_not raise_error
592
785
  end
593
786
  end
787
+
788
+ describe "clean_conditionally" do
789
+ let(:fixture) { "<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>" }
790
+
791
+ it "can set a clean_conditionally function to allow overriding the default decision" do
792
+ clean_conditionally_fn = lambda { |context| !context[:remove] } # Flip the decision.
793
+ content = Readability::Document.new(fixture, clean_conditionally: clean_conditionally_fn, min_text_length: 0, retry_length: 1).content
794
+
795
+ expect(content).to include("sidebar")
796
+ expect(content).not_to include('Some content')
797
+ end
798
+ end
799
+
800
+ describe "debug" do
801
+ it "can set a debug function, e.g. to send output to Rails logger" do
802
+ output = []
803
+ debug_fn = lambda { |str| output << str }
804
+
805
+ Readability::Document.new(@simple_html_fixture, debug: debug_fn).content
806
+ expect(output).not_to be_empty
807
+ end
808
+ end
594
809
  end