ruby-readability-discourse 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,544 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'spec_helper'
4
+ require 'readability'
5
+
6
+ describe Readability do
7
# Shared fixture for the examples below: a small page whose <body>, one <p>
# and one <div> carry class 'comment', plus a <div> wrapping a <blockquote>.
before(:each) do
  @simple_html_fixture = <<-HTML
    <html>
      <head>
        <title>title!</title>
      </head>
      <body class='comment'>
        <div>
          <p class='comment'>a comment</p>
          <div class='comment' id='body'>real content</div>
          <div id="contains_blockquote"><blockquote>something in a table</blockquote></div>
        </div>
      </body>
    </html>
  HTML
end
23
+
24
# Specs for Readability::Document#images and #best_candidate_has_image,
# exercised against the bbc, nytimes and thesun HTML fixtures.
describe "images" do
  before do
    fixtures_dir = File.dirname(__FILE__) + "/fixtures"

    @bbc     = File.read(fixtures_dir + "/bbc.html")
    @nytimes = File.read(fixtures_dir + "/nytimes.html")
    @thesun  = File.read(fixtures_dir + "/thesun.html")

    # Register the Sun image URLs with FakeWeb, answering each GET with the
    # matching local fixture file.
    FakeWeb::Registry.instance.clean_registry
    [
      ["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg", "dim_1416768a.jpg"],
      ["http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif", "sign_up_emails_682__703711a.gif"],
      ["http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703712a.gif", "sign_up_emails_682__703712a.gif"]
    ].each do |url, image_file|
      FakeWeb.register_uri(:get, url, :body => File.read(fixtures_dir + "/images/" + image_file))
    end
  end

  # The expectation lists three URLs, so the description says "images".
  it "should show images, but outside of the best candidate" do
    @doc = Readability::Document.new(@thesun)
    @doc.images.should == [
      "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg",
      "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif",
      "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703712a.gif"
    ]
    @doc.best_candidate_has_image.should == false
  end

  it "should show one image inside of the best candidate" do
    @doc = Readability::Document.new(@nytimes)
    @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
    @doc.best_candidate_has_image.should == true
  end

  it "should not try to download local images" do
    @doc = Readability::Document.new(<<-HTML)
      <html>
        <head>
          <title>title!</title>
        </head>
        <body class='comment'>
          <div>
            <img src="/something/local.gif" />
          </div>
        </body>
      </html>
    HTML
    # Any call to load_image fails the example.
    do_not_allow(@doc).load_image(anything)
    @doc.images.should == []
  end

  describe "no images" do
    it "shouldn't show images" do
      @doc = Readability::Document.new(@bbc, :min_image_height => 600)
      @doc.images.should == []
      @doc.best_candidate_has_image.should == false
    end
  end

  describe "poll of images" do
    it "should show some images inside of the best candidate" do
      @doc = Readability::Document.new(@bbc)
      # `=~` against an Array compares contents without regard to order.
      @doc.images.should =~ [
        "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg",
        "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg",
        "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif",
        "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"
      ]
      @doc.best_candidate_has_image.should == true
    end

    it "should show some images inside of the best candidate, include gif format" do
      @doc = Readability::Document.new(@bbc, :ignore_image_format => [])
      @doc.images.should == [
        "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg",
        "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg",
        "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif",
        "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"
      ]
      @doc.best_candidate_has_image.should == true
    end

    describe "width, height and format" do
      # The options under test are minimums (:min_image_width / :min_image_height),
      # so the descriptions say "at least".
      it "should show some images inside of the best candidate, but with width at least 400px" do
        @doc = Readability::Document.new(@bbc, :min_image_width => 400, :ignore_image_format => [])
        @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg"]
        @doc.best_candidate_has_image.should == true
      end

      it "should show some images inside of the best candidate, but with width at least 304px" do
        @doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => [])
        @doc.images.should == [
          "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg",
          "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif",
          "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"
        ]
        @doc.best_candidate_has_image.should == true
      end

      it "should show some images inside of the best candidate, but with width at least 304px and ignoring JPG format" do
        @doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => ["jpg"])
        @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
        @doc.best_candidate_has_image.should == true
      end

      it "should show some images inside of the best candidate, but with height at least 400px, ignoring no format" do
        @doc = Readability::Document.new(@bbc, :min_image_height => 400, :ignore_image_format => [])
        @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
        @doc.best_candidate_has_image.should == true
      end
    end
  end
end
121
+
122
# Runs transform_misused_divs_into_paragraphs! on the shared simple fixture
# and checks the resulting tag names of two of its <div>s.
describe "transformMisusedDivsIntoParagraphs" do
  before(:each) do
    @doc = Readability::Document.new(@simple_html_fixture)
    @doc.transform_misused_divs_into_paragraphs!
  end

  it "should transform divs containing no block elements into <p>s" do
    text_only_node = @doc.html.css("#body").first
    text_only_node.name.should == "p"
  end

  it "should not transform divs that contain block elements" do
    wrapper_node = @doc.html.css("#contains_blockquote").first
    wrapper_node.name.should == "div"
  end
end
136
+
137
# Each example feeds a different author markup convention to
# Readability::Document and checks what #author returns.
describe "author" do
  it "should pick up <meta name='dc.creator'></meta> as an author" do
    markup = <<-HTML
      <html>
        <head>
          <meta name='dc.creator' content='Austin Fonacier' />
        </head>
        <body></body>
      </html>
    HTML
    document = Readability::Document.new(markup)
    document.author.should eql("Austin Fonacier")
  end

  it "should pick up readability's recommended author format" do
    # NOTE(review): the <cite> below is closed with </span>; left untouched
    # because the fixture text is part of what this example exercises.
    markup = <<-HTML
      <html>
        <head>
        </head>
        <body>
          <p class="byline author vcard">
          By <cite class="fn">Austin Fonacier</span>
          </p>
        </body>
      </html>
    HTML
    document = Readability::Document.new(markup)
    document.author.should eql("Austin Fonacier")
  end

  it "should pick up vcard fn" do
    markup = <<-HTML
      <html>
        <head>
        </head>
        <body>
          <div class="author">By</div>
          <div class="author vcard">
            <a class="url fn" href="http://austinlivesinyotests.com/">Austin Fonacier</a>
          </div>
        </body>
      </html>
    HTML
    document = Readability::Document.new(markup)
    document.author.should eql("Austin Fonacier")
  end

  it "should pick up <a rel='author'>" do
    markup = <<-HTML
      <html>
        <head></head>
        <body>
          <a rel="author" href="http://google.com">Danny Banks (rel)</a>
        </body>
      </html>
    HTML
    document = Readability::Document.new(markup)
    document.author.should eql("Danny Banks (rel)")
  end

  it "should pick up <div id='author'>" do
    markup = <<-HTML
      <html>
        <head></head>
        <body>
          <div id="author">Austin Fonacier (author)</div>
        </body>
      </html>
    HTML
    document = Readability::Document.new(markup)
    document.author.should eql("Austin Fonacier (author)")
  end
end
205
+
206
# Compares the :content_score that score_node assigns to two sibling
# elements, first by tag name and then by class name.
describe "score_node" do
  before(:each) do
    markup = <<-HTML
      <html>
        <body>
          <div id='elem1'>
            <p>some content</p>
          </div>
          <th id='elem2'>
            <p>some other content</p>
          </th>
        </body>
      </html>
    HTML
    @doc = Readability::Document.new(markup)
    @elem1 = @doc.html.css("#elem1").first
    @elem2 = @doc.html.css("#elem2").first
  end

  it "should like <div>s more than <th>s" do
    div_score = @doc.score_node(@elem1)[:content_score]
    div_score.should > @doc.score_node(@elem2)[:content_score]
  end

  it "should like classes like text more than classes like comment" do
    # With both nodes turned into <div>s the scores start out equal.
    @elem2.name = "div"
    baseline = @doc.score_node(@elem1)[:content_score]
    baseline.should == @doc.score_node(@elem2)[:content_score]

    @elem1['class'] = "text"
    @elem2['class'] = "comment"
    text_score = @doc.score_node(@elem1)[:content_score]
    text_score.should > @doc.score_node(@elem2)[:content_score]
  end
end
236
+
237
# Runs remove_unlikely_candidates! on the shared simple fixture and inspects
# the remaining markup.
describe "remove_unlikely_candidates!" do
  before(:each) do
    @doc = Readability::Document.new(@simple_html_fixture)
    @doc.remove_unlikely_candidates!
  end

  it "should remove things that have class comment" do
    remaining_markup = @doc.html.inner_html
    remaining_markup.should_not =~ /a comment/
  end

  it "should not remove body tags" do
    remaining_markup = @doc.html.inner_html
    remaining_markup.should =~ %r{</body>}
  end

  it "should not remove things with class comment and id body" do
    remaining_markup = @doc.html.inner_html
    remaining_markup.should =~ /real content/
  end
end
255
+
256
# Checks which elements score_paragraphs(0) returns as candidates and how
# they rank by :content_score.
describe "score_paragraphs" do
  before do
    # The fixture is kept exactly as written: the id attribute on div2 has no
    # closing quote, and the trailing `<!-- " -->` supplies a balancing quote.
    markup = <<-HTML
      <html>
        <head>
          <title>title!</title>
        </head>
        <body id="body">
          <div id="div1">
            <div id="div2>
              <p id="some_comment">a comment</p>
            </div>
            <p id="some_text">some text</p>
          </div>
          <div id="div3">
            <p id="some_text2">some more text</p>
          </div>
        </body>
      </html><!-- " -->
    HTML
    @doc = Readability::Document.new(markup)
    @candidates = @doc.score_paragraphs(0)
  end

  it "should score elements in the document" do
    scored = @candidates.values
    scored.length.should == 3
  end

  it "should prefer the body in this particular example" do
    ranked = @candidates.values.sort do |left, right|
      right[:content_score] <=> left[:content_score]
    end
    ranked.first[:elem][:id].should == "body"
  end

  context "when two consequent br tags are used instead of p" do
    it "should assign the higher score to the first paragraph in this particular example" do
      markup = <<-HTML
        <html>
          <head>
            <title>title!</title>
          </head>
          <body id="body">
            <div id="post1">
              This is the main content!<br/><br/>
              Zebra found killed butcher with the chainsaw.<br/><br/>
              If only I could think of an example, oh, wait.
            </div>
            <div id="post2">
              This is not the content and although it's longer if you meaure it in characters,
              it's supposed to have lower score than the previous paragraph. And it's only because
              of the previous paragraph is not one paragraph, it's three subparagraphs
            </div>
          </body>
        </html>
      HTML
      @doc = Readability::Document.new(markup)
      @candidates = @doc.score_paragraphs(0)

      best = @candidates.values.sort_by { |candidate| -candidate[:content_score] }.first
      best[:elem][:id].should == 'post1'
    end
  end
end
315
+
316
# Extracts content from the cant_read.html fixture with a restricted set of
# tags and attributes and looks for a known sentence.
describe "the cant_read.html fixture" do
  it "should work on the cant_read.html fixture with some allowed tags" do
    html = File.read(File.dirname(__FILE__) + "/fixtures/cant_read.html")
    options = {
      :tags       => %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a],
      :attributes => %w[href]
    }

    extracted = Readability::Document.new(html, options).content
    extracted.should match(/Can you talk a little about how you developed the looks for the/)
  end
end
324
+
325
# Smoke tests for #content and #title on a one-paragraph document.
describe "general functionality" do
  before(:each) do
    markup = "<html><head><title>title!</title></head><body><div><p>Some content</p></div></body>"
    @doc = Readability::Document.new(markup, :min_text_length => 0, :retry_length => 1)
  end

  it "should return the main page content" do
    @doc.content.should match("Some content")
  end

  it "should return the page title if present" do
    @doc.title.should match("title!")

    # Same document without a <title> element.
    untitled_markup = "<html><head></head><body><div><p>Some content</p></div></body>"
    untitled = Readability::Document.new(untitled_markup, :min_text_length => 0, :retry_length => 1)
    untitled.title.should be_nil
  end
end
343
+
344
# A document with one content <div> and one <div class='sidebar'>; the
# sidebar text must not appear in #content.
describe "ignoring sidebars" do
  before(:each) do
    markup = "<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>"
    @doc = Readability::Document.new(markup, :min_text_length => 0, :retry_length => 1)
  end

  it "should not return the sidebar" do
    extracted = @doc.content
    extracted.should_not match("sidebar")
  end
end
354
+
355
# Feeds a paragraph mixing inline text with <br>, <hr> and <address> to
# Readability::Document and checks the shape of the extracted content.
describe "inserting space for block elements" do
  before do
    # NOTE(review): the paragraph ends in `f/p>` rather than `f</p>`; left
    # untouched because the fixture text is part of what this example exercises.
    @doc = Readability::Document.new(<<-HTML, :min_text_length => 0, :retry_length => 1)
      <html><head><title>title!</title></head>
        <body>
          <div>
            <p>a<br>b<hr>c<address>d</address>f/p>
          </div>
        </body>
      </html>
    HTML
  end

  # The description used to read "should not return the sidebar", copied from
  # the sidebar spec; this fixture has no sidebar. It now states what the
  # assertion checks.
  # NOTE(review): given the group name, confirm that should_not (rather than
  # should) is the intended expectation.
  it "should not return the text as 'a b c d f'" do
    @doc.content.should_not match("a b c d f")
  end
end
372
+
373
# For every fixtures/samples/*.html file, extracts the content and checks it
# against the fragment lists defined by the companion "<sample>-fragments.rb".
describe "outputs good stuff for known documents" do
  before do
    # Absolute path, so both File.read and load resolve the same directory
    # regardless of the working directory or $LOAD_PATH.
    @samples_dir = File.expand_path("fixtures/samples", File.dirname(__FILE__))
    @html_files  = Dir.glob(@samples_dir + "/*.html")
    @samples     = @html_files.map { |filename| File.basename(filename, '.html') }
  end

  it "should output expected fragments of text" do
    @samples.each do |sample|
      html = File.read("#{@samples_dir}/#{sample}.html")
      doc = Readability::Document.new(html).content

      # The loaded file is expected to assign the globals $required_fragments
      # and $excluded_fragments read below.
      load "#{@samples_dir}/#{sample}-fragments.rb"

      $required_fragments.each do |required_text|
        doc.should include(required_text)
      end

      $excluded_fragments.each do |text_to_avoid|
        doc.should_not include(text_to_avoid)
      end
    end
  end
end
401
+
402
# Checks that the document's declared charset is honoured and that encoding
# guessing can be skipped or overridden through options.
describe "encoding guessing" do
  # Gate on the feature instead of on a 1.9.x version string: the old check
  # `RUBY_VERSION =~ /^1\.9\./` silently skipped these examples on every
  # later Ruby, although String#encoding is available there too.
  if "".respond_to?(:encoding)
    context "with ruby 1.9+" do
      it "should correctly guess and enforce HTML encoding" do
        doc = Readability::Document.new("<html><head><meta http-equiv='content-type' content='text/html; charset=LATIN1'></head><body><div>hi!</div></body></html>")
        content = doc.content
        content.encoding.to_s.should == "ISO-8859-1"
        content.should be_valid_encoding
      end

      it "should allow encoding guessing to be skipped" do
        # Any call to GuessHtmlEncoding.encode fails the example.
        do_not_allow(GuessHtmlEncoding).encode
        doc = Readability::Document.new(@simple_html_fixture, :do_not_guess_encoding => true)
        doc.content
      end

      it "should allow encoding guessing to be overridden" do
        do_not_allow(GuessHtmlEncoding).encode
        doc = Readability::Document.new(@simple_html_fixture, :encoding => "UTF-8")
        doc.content
      end
    end
  end
end
426
+
427
# Edge cases for document parsing: HTML comments, empty input and a
# document that has no <body>.
describe "#make_html" do
  it "should strip the html comments tag" do
    markup = "<html><head><meta http-equiv='content-type' content='text/html; charset=LATIN1'></head><body><div>hi!<!-- bye~ --></div></body></html>"
    extracted = Readability::Document.new(markup).content

    extracted.should include("hi!")
    extracted.should_not include("bye")
  end

  it "should not error with empty content" do
    extracted = Readability::Document.new('').content
    extracted.should == '<div><div></div></div>'
  end

  it "should not error with a document with no <body>" do
    markup = '<html><head><meta http-equiv="refresh" content="0;URL=http://example.com"></head></html>'
    extracted = Readability::Document.new(markup).content
    extracted.should == '<div><div></div></div>'
  end
end
443
+
444
# Verifies that calling #content and #images on the same document, in any
# order and repeatedly, keeps returning the same results.
describe "No side-effects" do
  before do
    # Only the nytimes fixture is used here; the bbc and thesun fixtures were
    # previously read before every example without being referenced.
    @nytimes = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html")

    # Every example starts from an identically configured document.
    @doc = Readability::Document.new(@nytimes,
                                     :tags => %w[div p img a],
                                     :attributes => %w[src href],
                                     :remove_empty_nodes => false,
                                     :do_not_guess_encoding => true)

    @expected_images = ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
  end

  it "should not have any side-effects when calling content() and then images()" do
    @doc.images.should == @expected_images
    @doc.content
    @doc.images.should == @expected_images
  end

  it "should not have any side-effects when calling content() multiple times" do
    @doc.content.should == @doc.content
  end

  it "should not have any side-effects when calling content and images multiple times" do
    @doc.images.should == @expected_images
    @doc.content.should == @doc.content
    @doc.images.should == @expected_images
  end
end
474
+
475
# Extracts content from the code.html fixture, re-parses it with Nokogiri and
# checks the text found inside nested <code>/<pre> elements.
describe "Code blocks" do
  before(:each) do
    @code = File.read(File.dirname(__FILE__) + "/fixtures/code.html")
    options = {
      :tags => %w[div p img a ul ol li h1 h2 h3 h4 h5 h6 blockquote strong em b code pre],
      :attributes => %w[src href],
      :remove_empty_nodes => false
    }
    @content = Readability::Document.new(@code, options).content
    @doc = Nokogiri::HTML(@content)
  end

  it "preserve the code blocks" do
    pre_inside_code = @doc.css("code pre")
    pre_inside_code.text.should == "\nroot\n indented\n "
  end

  it "preserve backwards code blocks" do
    code_inside_pre = @doc.css("pre code")
    code_inside_pre.text.should == "\nsecond\n indented\n "
  end
end
493
+
494
# With an empty :tags list, #content is expected to be the bare text for a
# fragment, a full document and plain text alike.
describe "remove all tags" do
  it "should work for an incomplete piece of HTML" do
    stripped = Readability::Document.new('<div>test</div', :tags => []).content
    stripped.should == 'test'
  end

  it "should work for a HTML document" do
    markup = '<html><head><title>title!</title></head><body><div><p>test</p></div></body></html>'
    stripped = Readability::Document.new(markup, :tags => []).content
    stripped.should == 'test'
  end

  it "should work for a plain text" do
    stripped = Readability::Document.new('test', :tags => []).content
    stripped.should == 'test'
  end
end
511
+
512
# Exercises the :whitelist and :blacklist options against the boing_boing.html
# fixture, and pins the current default output.
describe "boing boing" do
  let(:boing_boing) do
    File.read(File.dirname(__FILE__) + "/fixtures/boing_boing.html")
  end

  it "contains incorrect data by default" do
    # NOTE: in an ideal world this spec starts failing
    #  and readability correctly detects content for the
    #  boing boing sample.
    extracted = Readability::Document.new(boing_boing).content

    extracted.should_not =~ /Bees and Bombs/
    extracted.should =~ /ADVERTISE/
  end

  it "should apply whitelist" do
    extracted = Readability::Document.new(boing_boing, whitelist: ".post-content").content
    extracted.should =~ /Bees and Bombs/
  end

  it "should apply blacklist" do
    extracted = Readability::Document.new(boing_boing, blacklist: "#sidebar_adblock").content
    extracted.should_not =~ /ADVERTISE/
  end
end
544
+ end