ruby-readability-discourse 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,544 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'spec_helper'
4
+ require 'readability'
5
+
6
+ describe Readability do
7
# Shared fixture, rebuilt before every example: a minimal page in which the
# <body>, a <p> and a <div id='body'> all carry class 'comment', alongside a
# <div> that wraps a <blockquote>.
before(:each) do
  @simple_html_fixture = <<-HTML
    <html>
      <head>
        <title>title!</title>
      </head>
      <body class='comment'>
        <div>
          <p class='comment'>a comment</p>
          <div class='comment' id='body'>real content</div>
          <div id="contains_blockquote"><blockquote>something in a table</blockquote></div>
        </div>
      </body>
    </html>
  HTML
end
23
+
24
# Image extraction: which <img> URLs Document#images reports, and what
# Document#best_candidate_has_image returns, under different size and
# format options.
describe "images" do
  # Path of a file below this spec's fixtures directory.
  def fixture_path(name)
    File.dirname(__FILE__) + "/fixtures/" + name
  end

  before do
    @bbc     = File.read(fixture_path("bbc.html"))
    @nytimes = File.read(fixture_path("nytimes.html"))
    @thesun  = File.read(fixture_path("thesun.html"))

    # Start from an empty FakeWeb registry, then register the three Sun
    # image URLs with bodies taken from local fixture files. The images are
    # binary data, so they are read with binread rather than in text mode.
    FakeWeb::Registry.instance.clean_registry
    {
      "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg" =>
        "images/dim_1416768a.jpg",
      "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif" =>
        "images/sign_up_emails_682__703711a.gif",
      "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703712a.gif" =>
        "images/sign_up_emails_682__703712a.gif"
    }.each do |url, file|
      FakeWeb.register_uri(:get, url, :body => File.binread(fixture_path(file)))
    end
  end

  it "should show images, but outside of the best candidate" do
    @doc = Readability::Document.new(@thesun)
    @doc.images.should == [
      "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg",
      "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif",
      "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703712a.gif"
    ]
    @doc.best_candidate_has_image.should == false
  end

  it "should show one image inside of the best candidate" do
    @doc = Readability::Document.new(@nytimes)
    @doc.images.should == [
      "http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"
    ]
    @doc.best_candidate_has_image.should == true
  end

  it "should not try to download local images" do
    @doc = Readability::Document.new(<<-HTML)
      <html>
        <head>
          <title>title!</title>
        </head>
        <body class='comment'>
          <div>
            <img src="/something/local.gif" />
          </div>
        </body>
      </html>
    HTML
    # Mock expectation: load_image must never be invoked for this document.
    do_not_allow(@doc).load_image(anything)
    @doc.images.should == []
  end

  describe "no images" do
    it "shouldn't show images" do
      @doc = Readability::Document.new(@bbc, :min_image_height => 600)
      @doc.images.should == []
      @doc.best_candidate_has_image.should == false
    end
  end

  describe "poll of images" do
    it "should show some images inside of the best candidate" do
      @doc = Readability::Document.new(@bbc)
      # Array =~ compares contents regardless of order.
      @doc.images.should =~ [
        "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg",
        "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg",
        "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif",
        "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"
      ]
      @doc.best_candidate_has_image.should == true
    end

    it "should show some images inside of the best candidate, include gif format" do
      @doc = Readability::Document.new(@bbc, :ignore_image_format => [])
      @doc.images.should == [
        "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg",
        "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg",
        "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif",
        "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"
      ]
      @doc.best_candidate_has_image.should == true
    end

    describe "width, height and format" do
      it "should show some images inside of the best candidate, but with width at least 400px" do
        @doc = Readability::Document.new(@bbc, :min_image_width => 400, :ignore_image_format => [])
        @doc.images.should == [
          "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg"
        ]
        @doc.best_candidate_has_image.should == true
      end

      it "should show some images inside of the best candidate, but with width at least 304px" do
        @doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => [])
        @doc.images.should == [
          "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg",
          "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif",
          "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"
        ]
        @doc.best_candidate_has_image.should == true
      end

      it "should show some images inside of the best candidate, but with width at least 304px and ignoring JPG format" do
        @doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => ["jpg"])
        @doc.images.should == [
          "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"
        ]
        @doc.best_candidate_has_image.should == true
      end

      it "should show some images inside of the best candidate, but with height at least 400px, not ignoring any format" do
        @doc = Readability::Document.new(@bbc, :min_image_height => 400, :ignore_image_format => [])
        @doc.images.should == [
          "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"
        ]
        @doc.best_candidate_has_image.should == true
      end
    end
  end
end
121
+
122
# Runs transform_misused_divs_into_paragraphs! on the shared fixture and
# checks the tag names of two of its <div>s afterwards.
describe "transformMisusedDivsIntoParagraphs" do
  before(:each) do
    @doc = Readability::Document.new(@simple_html_fixture)
    @doc.transform_misused_divs_into_paragraphs!
  end

  # Tag name of the first element matching +selector+ in the document.
  def tag_name_of(selector)
    @doc.html.css(selector).first.name
  end

  it "should transform divs containing no block elements into <p>s" do
    tag_name_of("#body").should == "p"
  end

  it "should not transform divs that contain block elements" do
    tag_name_of("#contains_blockquote").should == "div"
  end
end
136
+
137
# Document#author: each example feeds one markup convention for naming the
# author and expects the author's name back.
describe "author" do
  it "should pick up <meta name='dc.creator'></meta> as an author" do
    doc = Readability::Document.new(<<-HTML)
      <html>
        <head>
          <meta name='dc.creator' content='Austin Fonacier' />
        </head>
        <body></body>
      </html>
    HTML
    doc.author.should eql("Austin Fonacier")
  end

  it "should pick up readability's recommended author format" do
    # The <cite> element is closed with a matching </cite>, so the example
    # does not depend on the parser recovering from a stray </span>.
    doc = Readability::Document.new(<<-HTML)
      <html>
        <head>
        </head>
        <body>
          <p class="byline author vcard">
            By <cite class="fn">Austin Fonacier</cite>
          </p>
        </body>
      </html>
    HTML
    doc.author.should eql("Austin Fonacier")
  end

  it "should pick up vcard fn" do
    doc = Readability::Document.new(<<-HTML)
      <html>
        <head>
        </head>
        <body>
          <div class="author">By</div>
          <div class="author vcard">
            <a class="url fn" href="http://austinlivesinyotests.com/">Austin Fonacier</a>
          </div>
        </body>
      </html>
    HTML
    doc.author.should eql("Austin Fonacier")
  end

  it "should pick up <a rel='author'>" do
    doc = Readability::Document.new(<<-HTML)
      <html>
        <head></head>
        <body>
          <a rel="author" href="http://google.com">Danny Banks (rel)</a>
        </body>
      </html>
    HTML
    doc.author.should eql("Danny Banks (rel)")
  end

  it "should pick up <div id='author'>" do
    doc = Readability::Document.new(<<-HTML)
      <html>
        <head></head>
        <body>
          <div id="author">Austin Fonacier (author)</div>
        </body>
      </html>
    HTML
    doc.author.should eql("Austin Fonacier (author)")
  end
end
205
+
206
# Document#score_node: compares the :content_score given to two sibling
# elements that differ first by tag name and then by class.
describe "score_node" do
  before(:each) do
    @doc = Readability::Document.new(<<-HTML)
      <html>
        <body>
          <div id='elem1'>
            <p>some content</p>
          </div>
          <th id='elem2'>
            <p>some other content</p>
          </th>
        </body>
      </html>
    HTML
    @elem1 = @doc.html.css("#elem1").first
    @elem2 = @doc.html.css("#elem2").first
  end

  # The :content_score entry of the hash score_node returns for +node+.
  def content_score(node)
    @doc.score_node(node)[:content_score]
  end

  it "should like <div>s more than <th>s" do
    content_score(@elem1).should > content_score(@elem2)
  end

  it "should like classes like text more than classes like comment" do
    # Make both elements <div>s first, so that only the class differs.
    @elem2.name = "div"
    content_score(@elem1).should == content_score(@elem2)

    @elem1['class'] = "text"
    @elem2['class'] = "comment"
    content_score(@elem1).should > content_score(@elem2)
  end
end
236
+
237
# Runs remove_unlikely_candidates! on the shared fixture and inspects the
# markup that is left.
describe "remove_unlikely_candidates!" do
  before(:each) do
    @doc = Readability::Document.new(@simple_html_fixture)
    @doc.remove_unlikely_candidates!
  end

  # Serialized markup of the document after pruning.
  def remaining_markup
    @doc.html.inner_html
  end

  it "should remove things that have class comment" do
    remaining_markup.should_not =~ /a comment/
  end

  it "should not remove body tags" do
    remaining_markup.should =~ /<\/body>/
  end

  it "should not remove things with class comment and id body" do
    remaining_markup.should =~ /real content/
  end
end
255
+
256
# Document#score_paragraphs(0): checks how many candidates are produced and
# which one receives the highest :content_score.
describe "score_paragraphs" do
  before(:each) do
    # NOTE(review): the id attribute of div2 is left unterminated; the
    # trailing <!-- " --> comment appears to exist only to balance that
    # quote, so this looks deliberate. Left as is; confirm before tidying.
    @doc = Readability::Document.new(<<-HTML)
      <html>
        <head>
          <title>title!</title>
        </head>
        <body id="body">
          <div id="div1">
            <div id="div2>
              <p id="some_comment">a comment</p>
            </div>
            <p id="some_text">some text</p>
          </div>
          <div id="div3">
            <p id="some_text2">some more text</p>
          </div>
        </body>
      </html><!-- " -->
    HTML
    @candidates = @doc.score_paragraphs(0)
  end

  # id attribute of the element whose candidate entry has the highest
  # :content_score.
  def best_candidate_id(candidates)
    candidates.values.max_by { |candidate| candidate[:content_score] }[:elem][:id]
  end

  it "should score elements in the document" do
    @candidates.values.length.should == 3
  end

  it "should prefer the body in this particular example" do
    best_candidate_id(@candidates).should == "body"
  end

  context "when two consecutive br tags are used instead of p" do
    it "should assign the higher score to the first paragraph in this particular example" do
      # Locals are used so the document built in the before hook is not
      # silently replaced.
      doc = Readability::Document.new(<<-HTML)
        <html>
          <head>
            <title>title!</title>
          </head>
          <body id="body">
            <div id="post1">
              This is the main content!<br/><br/>
              Zebra found killed butcher with the chainsaw.<br/><br/>
              If only I could think of an example, oh, wait.
            </div>
            <div id="post2">
              This is not the content and although it's longer if you meaure it in characters,
              it's supposed to have lower score than the previous paragraph. And it's only because
              of the previous paragraph is not one paragraph, it's three subparagraphs
            </div>
          </body>
        </html>
      HTML
      candidates = doc.score_paragraphs(0)
      best_candidate_id(candidates).should == 'post1'
    end
  end
end
315
+
316
# Regression check against fixtures/cant_read.html with a restricted set of
# allowed tags and attributes.
describe "the cant_read.html fixture" do
  it "should work on the cant_read.html fixture with some allowed tags" do
    markup = File.read(File.dirname(__FILE__) + "/fixtures/cant_read.html")
    options = {
      :tags       => %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a],
      :attributes => %w[href]
    }

    extracted = Readability::Document.new(markup, options).content
    extracted.should match(/Can you talk a little about how you developed the looks for the/)
  end
end
324
+
325
# Smoke tests for Document#content and Document#title on a one-paragraph page.
describe "general functionality" do
  # Builds a document with the length thresholds used throughout this group.
  def document_for(markup)
    Readability::Document.new(markup, :min_text_length => 0, :retry_length => 1)
  end

  before(:each) do
    @doc = document_for("<html><head><title>title!</title></head><body><div><p>Some content</p></div></body>")
  end

  it "should return the main page content" do
    @doc.content.should match("Some content")
  end

  it "should return the page title if present" do
    @doc.title.should match("title!")

    # The same page without a <title> element yields no title.
    untitled = document_for("<html><head></head><body><div><p>Some content</p></div></body>")
    untitled.title.should be_nil
  end
end
343
+
344
# A <div class='sidebar'> next to the main content must not show up in
# Document#content.
describe "ignoring sidebars" do
  before(:each) do
    markup = "<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>"
    @doc = Readability::Document.new(markup, :min_text_length => 0, :retry_length => 1)
  end

  it "should not return the sidebar" do
    extracted = @doc.content
    extracted.should_not match("sidebar")
  end
end
354
+
355
# Checks the text Document#content produces for a paragraph whose pieces are
# separated by <br>, <hr> and <address> elements.
describe "inserting space for block elements" do
  before do
    # NOTE(review): the paragraph ends in "f/p>" rather than "f</p>". That
    # looks like a typo, but correcting it changes the input under test, so
    # the fixture is kept as is; confirm the intended markup.
    @doc = Readability::Document.new(<<-HTML, :min_text_length => 0, :retry_length => 1)
      <html><head><title>title!</title></head>
        <body>
          <div>
            <p>a<br>b<hr>c<address>d</address>f/p>
          </div>
        </body>
      </html>
    HTML
  end

  # The description used to read "should not return the sidebar", copied
  # from the sidebar group; it now states what is actually asserted.
  # NOTE(review): given the group name, a positive match may have been
  # intended here; the assertion itself is unchanged.
  it "should not return the text as 'a b c d f'" do
    @doc.content.should_not match("a b c d f")
  end
end
372
+
373
# Runs every fixtures/samples/*.html page through Document#content and checks
# it against the fragment lists declared in the matching
# <sample>-fragments.rb file.
describe "outputs good stuff for known documents" do
  before do
    # Absolute path, so neither File.read nor load depends on the working
    # directory or on $LOAD_PATH.
    @samples_dir = File.expand_path("../fixtures/samples", __FILE__)
    # Sorted because Dir.glob does not guarantee an order.
    @html_files = Dir.glob(@samples_dir + "/*.html").sort
    @samples = @html_files.map { |filename| File.basename(filename, '.html') }
  end

  it "should output expected fragments of text" do
    @samples.each do |sample|
      html = File.read("#{@samples_dir}/#{sample}.html")
      content = Readability::Document.new(html).content

      # The fragments file communicates through these two globals. Clear
      # them first so a file that sets only one list cannot inherit the
      # other list from the previously loaded sample.
      $required_fragments = []
      $excluded_fragments = []
      load "#{@samples_dir}/#{sample}-fragments.rb"

      $required_fragments.each do |required_text|
        content.should include(required_text)
      end

      $excluded_fragments.each do |text_to_avoid|
        content.should_not include(text_to_avoid)
      end
    end
  end
end
401
+
402
# Encoding detection and the two options that bypass it
# (:do_not_guess_encoding and :encoding).
describe "encoding guessing" do
  # Gate on the capability rather than on the literal version "1.9.x": the
  # old RUBY_VERSION check silently dropped these examples on every later
  # Ruby, although String#encoding is still what they rely on.
  if String.method_defined?(:encoding)
    context "with encoding-aware strings (ruby 1.9+)" do
      it "should correctly guess and enforce HTML encoding" do
        doc = Readability::Document.new("<html><head><meta http-equiv='content-type' content='text/html; charset=LATIN1'></head><body><div>hi!</div></body></html>")
        content = doc.content
        content.encoding.to_s.should == "ISO-8859-1"
        content.should be_valid_encoding
      end

      it "should allow encoding guessing to be skipped" do
        # Mock expectation: GuessHtmlEncoding.encode must not be called.
        do_not_allow(GuessHtmlEncoding).encode
        doc = Readability::Document.new(@simple_html_fixture, :do_not_guess_encoding => true)
        doc.content
      end

      it "should allow encoding guessing to be overridden" do
        # Mock expectation: GuessHtmlEncoding.encode must not be called.
        do_not_allow(GuessHtmlEncoding).encode
        doc = Readability::Document.new(@simple_html_fixture, :encoding => "UTF-8")
        doc.content
      end
    end
  end
end
426
+
427
# Parsing edge cases: HTML comments, empty input, and a page without <body>.
describe "#make_html" do
  it "should strip the html comments tag" do
    markup = "<html><head><meta http-equiv='content-type' content='text/html; charset=LATIN1'></head><body><div>hi!<!-- bye~ --></div></body></html>"
    content = Readability::Document.new(markup).content

    content.should include("hi!")
    content.should_not include("bye")
  end

  it "should not error with empty content" do
    blank = Readability::Document.new('')
    blank.content.should == '<div><div></div></div>'
  end

  it "should not error with a document with no <body>" do
    bodiless = Readability::Document.new('<html><head><meta http-equiv="refresh" content="0;URL=http://example.com"></head></html>')
    bodiless.content.should == '<div><div></div></div>'
  end
end
443
+
444
# Calling Document#content and Document#images repeatedly, in any order,
# must keep giving the same answers.
describe "No side-effects" do
  # The only image expected from the nytimes fixture.
  let(:nytimes_images) {
    ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
  }

  before do
    # Only the nytimes fixture is read: the examples never used the bbc and
    # thesun pages that were previously loaded here as well.
    @nytimes = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html")
    # Every example starts from a fresh document built with the same options.
    @doc = Readability::Document.new(@nytimes,
                                     :tags => %w[div p img a],
                                     :attributes => %w[src href],
                                     :remove_empty_nodes => false,
                                     :do_not_guess_encoding => true)
  end

  it "should not have any side-effects when calling content() and then images()" do
    @doc.images.should == nytimes_images
    @doc.content
    @doc.images.should == nytimes_images
  end

  it "should not have any side-effects when calling content() multiple times" do
    @doc.content.should == @doc.content
  end

  it "should not have any side-effects when calling content and images multiple times" do
    @doc.images.should == nytimes_images
    @doc.content.should == @doc.content
    @doc.images.should == nytimes_images
  end
end
474
+
475
# Extracts fixtures/code.html with <code> and <pre> allowed, re-parses the
# result with Nokogiri and checks the text of the nested code elements.
describe "Code blocks" do
  before(:each) do
    extraction_options = {
      :tags               => %w[div p img a ul ol li h1 h2 h3 h4 h5 h6 blockquote strong em b code pre],
      :attributes         => %w[src href],
      :remove_empty_nodes => false
    }

    @code = File.read(File.dirname(__FILE__) + "/fixtures/code.html")
    @content = Readability::Document.new(@code, extraction_options).content
    @doc = Nokogiri::HTML(@content)
  end

  it "preserve the code blocks" do
    pre_inside_code = @doc.css("code pre")
    pre_inside_code.text.should == "\nroot\n indented\n "
  end

  it "preserve backwards code blocks" do
    code_inside_pre = @doc.css("pre code")
    code_inside_pre.text.should == "\nsecond\n indented\n "
  end
end
493
+
494
# With :tags => [] every element is stripped and only the text 'test' is
# expected back, whatever shape the input has.
describe "remove all tags" do
  [
    ["an incomplete piece of HTML", '<div>test</div'],
    ["a HTML document", '<html><head><title>title!</title></head><body><div><p>test</p></div></body></html>'],
    ["a plain text", 'test']
  ].each do |label, markup|
    it "should work for #{label}" do
      doc = Readability::Document.new(markup, :tags => [])
      doc.content.should == 'test'
    end
  end
end
511
+
512
# Extraction of the boing boing sample page, with and without the
# :whitelist / :blacklist selector options.
describe "boing boing" do
  let(:boing_boing) {
    File.read(File.dirname(__FILE__) + "/fixtures/boing_boing.html")
  }

  it "contains incorrect data by default" do
    # NOTE: in an ideal world this spec starts failing
    # and readability correctly detects content for the
    # boing boing sample.

    doc = Readability::Document.new(boing_boing)

    content = doc.content
    # Matcher form instead of comparing a boolean to true, so a failure
    # reports the offending content.
    content.should_not =~ /Bees and Bombs/
    content.should =~ /ADVERTISE/
  end

  it "should apply whitelist" do
    doc = Readability::Document.new(boing_boing,
                                    whitelist: ".post-content")
    content = doc.content
    content.should =~ /Bees and Bombs/
  end

  it "should apply blacklist" do
    doc = Readability::Document.new(boing_boing, blacklist: "#sidebar_adblock")
    content = doc.content
    content.should_not =~ /ADVERTISE/
  end
end
544
+ end