marcosinger-ruby-readability 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,330 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'spec_helper'
4
+
5
+ describe Readability do
6
+ before do
7
+ @simple_html_fixture = <<-HTML
8
+ <html>
9
+ <head>
10
+ <title>title!</title>
11
+ </head>
12
+ <body class='comment'>
13
+ <div>
14
+ <p class='comment'>a comment</p>
15
+ <div class='comment' id='body'>real content</div>
16
+ <div id="contains_blockquote"><blockquote>something in a table</blockquote></div>
17
+ </div>
18
+ </body>
19
+ </html>
20
+ HTML
21
+ end
22
+
23
+ describe "images" do
24
+ before do
25
+ # bbc => http://www.bbc.co.uk/news/magazine-15959067
26
+ # nytimes => http://opinionator.blogs.nytimes.com/2011/12/01/health-care-for-a-changing-work-force/
27
+ # thesum (The Sun, fixtures/thesun.html) => http://www.thesun.co.uk/sol/homepage/sport/football/3973265/Manchester-United-news-Dimitar-Berbatov-and-Carling-Cup-flops-warned.html
28
+
29
+ @bbc = File.read(File.dirname(__FILE__) + "/fixtures/bbc.html")
30
+ @nytimes = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html")
31
+ @thesum = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
32
+ end
33
+
34
+ it "should show one image, but outside of the best candidate" do
35
+ @doc = Readability::Document.new(@thesum)
36
+ @doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"]
37
+ @doc.best_candidate_has_image.should == false
38
+ end
39
+
40
+ it "should show one image inside of the best candidate" do
41
+ @doc = Readability::Document.new(@nytimes)
42
+ @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
43
+ @doc.best_candidate_has_image.should == true
44
+ end
45
+
46
+ describe "no images" do
47
+ it "shouldn't show images" do
48
+ @doc = Readability::Document.new(@bbc, :min_image_height => 400)
49
+ @doc.images.should == []
50
+ @doc.best_candidate_has_image.should == false
51
+ end
52
+ end
53
+
54
+ describe "poll of images" do
55
+ it "should show some images inside of the best candidate" do
56
+ @doc = Readability::Document.new(@bbc)
57
+ @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
58
+ @doc.best_candidate_has_image.should == true
59
+ end
60
+
61
+ it "should show some images inside of the best candidate, include gif format" do
62
+ @doc = Readability::Document.new(@bbc, :ignore_image_format => [])
63
+ @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
64
+ @doc.best_candidate_has_image.should == true
65
+ end
66
+
67
+ describe "width, height and format" do
68
+ it "should show some images inside of the best candidate, but with width most equal to 400px" do
69
+ @doc = Readability::Document.new(@bbc, :min_image_width => 400, :ignore_image_format => [])
70
+ @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg"]
71
+ @doc.best_candidate_has_image.should == true
72
+ end
73
+
74
+ it "should show some images inside of the best candidate, but with width most equal to 304px" do
75
+ @doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => [])
76
+ @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
77
+ @doc.best_candidate_has_image.should == true
78
+ end
79
+
80
+ it "should show some images inside of the best candidate, but with width most equal to 304px and ignoring JPG format" do
81
+ @doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => ["jpg"])
82
+ @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
83
+ @doc.best_candidate_has_image.should == true
84
+ end
85
+
86
+ it "should show some images inside of the best candidate, but with height most equal to 400px, no ignoring no format" do
87
+ @doc = Readability::Document.new(@bbc, :min_image_height => 400, :ignore_image_format => [])
88
+ @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
89
+ @doc.best_candidate_has_image.should == true
90
+ end
91
+ end
92
+ end
93
+ end
94
+
95
+ describe "transformMisusedDivsIntoParagraphs" do
96
+ before do
97
+ @doc = Readability::Document.new(@simple_html_fixture)
98
+ @doc.transform_misused_divs_into_paragraphs!
99
+ end
100
+
101
+ it "should transform divs containing no block elements into <p>s" do
102
+ @doc.html.css("#body").first.name.should == "p"
103
+ end
104
+
105
+ it "should not transform divs that contain block elements" do
106
+ @doc.html.css("#contains_blockquote").first.name.should == "div"
107
+ end
108
+ end
109
+
110
+ describe "score_node" do
111
+ before do
112
+ @doc = Readability::Document.new(<<-HTML)
113
+ <html>
114
+ <body>
115
+ <div id='elem1'>
116
+ <p>some content</p>
117
+ </div>
118
+ <th id='elem2'>
119
+ <p>some other content</p>
120
+ </th>
121
+ </body>
122
+ </html>
123
+ HTML
124
+ @elem1 = @doc.html.css("#elem1").first
125
+ @elem2 = @doc.html.css("#elem2").first
126
+ end
127
+
128
+ it "should like <div>s more than <th>s" do
129
+ @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score]
130
+ end
131
+
132
+ it "should like classes like text more than classes like comment" do
133
+ @elem2.name = "div"
134
+ @doc.score_node(@elem1)[:content_score].should == @doc.score_node(@elem2)[:content_score]
135
+ @elem1['class'] = "text"
136
+ @elem2['class'] = "comment"
137
+ @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score]
138
+ end
139
+ end
140
+
141
+ describe "remove_unlikely_candidates!" do
142
+ before do
143
+ @doc = Readability::Document.new(@simple_html_fixture)
144
+ @doc.remove_unlikely_candidates!
145
+ end
146
+
147
+ it "should remove things that have class comment" do
148
+ @doc.html.inner_html.should_not =~ /a comment/
149
+ end
150
+
151
+ it "should not remove body tags" do
152
+ @doc.html.inner_html.should =~ /<\/body>/
153
+ end
154
+
155
+ it "should not remove things with class comment and id body" do
156
+ @doc.html.inner_html.should =~ /real content/
157
+ end
158
+ end
159
+
160
+ describe "score_paragraphs" do
161
+ before(:each) do
162
+ @doc = Readability::Document.new(<<-HTML)
163
+ <html>
164
+ <head>
165
+ <title>title!</title>
166
+ </head>
167
+ <body id="body">
168
+ <div id="div1">
169
+ <div id="div2>
170
+ <p id="some_comment">a comment</p>
171
+ </div>
172
+ <p id="some_text">some text</p>
173
+ </div>
174
+ <div id="div3">
175
+ <p id="some_text2">some more text</p>
176
+ </div>
177
+ </body>
178
+ </html><!-- " -->
179
+ HTML
180
+ @candidates = @doc.score_paragraphs(0)
181
+ end
182
+
183
+ it "should score elements in the document" do
184
+ @candidates.values.length.should == 3
185
+ end
186
+
187
+ it "should prefer the body in this particular example" do
188
+ @candidates.values.sort { |a, b|
189
+ b[:content_score] <=> a[:content_score]
190
+ }.first[:elem][:id].should == "body"
191
+ end
192
+
193
+ context "when two consequent br tags are used instead of p" do
194
+ it "should assign the higher score to the first paragraph in this particular example" do
195
+ @doc = Readability::Document.new(<<-HTML)
196
+ <html>
197
+ <head>
198
+ <title>title!</title>
199
+ </head>
200
+ <body id="body">
201
+ <div id="post1">
202
+ This is the main content!<br/><br/>
203
+ Zebra found killed butcher with the chainsaw.<br/><br/>
204
+ If only I could think of an example, oh, wait.
205
+ </div>
206
+ <div id="post2">
207
+ This is not the content and although it's longer if you meaure it in characters,
208
+ it's supposed to have lower score than the previous paragraph. And it's only because
209
+ of the previous paragraph is not one paragraph, it's three subparagraphs
210
+ </div>
211
+ </body>
212
+ </html>
213
+ HTML
214
+ @candidates = @doc.score_paragraphs(0)
215
+ @candidates.values.sort_by { |a| -a[:content_score] }.first[:elem][:id].should == 'post1'
216
+ end
217
+ end
218
+ end
219
+
220
+ describe "the cant_read.html fixture" do
221
+ it "should work on the cant_read.html fixture with some allowed tags" do
222
+ allowed_tags = %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a]
223
+ allowed_attributes = %w[href]
224
+ html = File.read(File.dirname(__FILE__) + "/fixtures/cant_read.html")
225
+ Readability::Document.new(html, :tags => allowed_tags, :attributes => allowed_attributes).content.should match(/Can you talk a little about how you developed the looks for the/)
226
+ end
227
+ end
228
+
229
+ describe "general functionality" do
230
+ before do
231
+ @doc = Readability::Document.new("<html><head><title>title!</title></head><body><div><p>Some content</p></div></body>",
232
+ :min_text_length => 0, :retry_length => 1)
233
+ end
234
+
235
+ it "should return the main page content" do
236
+ @doc.content.should match("Some content")
237
+ end
238
+
239
+ it "should return the page title if present" do
240
+ @doc.title.should match("title!")
241
+
242
+ doc = Readability::Document.new("<html><head></head><body><div><p>Some content</p></div></body>",
243
+ :min_text_length => 0, :retry_length => 1)
244
+ doc.title.should be_nil
245
+ end
246
+ end
247
+
248
+ describe "ignoring sidebars" do
249
+ before do
250
+ @doc = Readability::Document.new("<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>",
251
+ :min_text_length => 0, :retry_length => 1)
252
+ end
253
+
254
+ it "should not return the sidebar" do
255
+ @doc.content.should_not match("sidebar")
256
+ end
257
+ end
258
+
259
+ describe "inserting space for block elements" do
260
+ before do
261
+ @doc = Readability::Document.new(<<-HTML, :min_text_length => 0, :retry_length => 1)
262
+ <html><head><title>title!</title></head>
263
+ <body>
264
+ <div>
265
+ <p>a<br>b<hr>c<address>d</address>f/p>
266
+ </div>
267
+ </body>
268
+ </html>
269
+ HTML
270
+ end
271
+
272
+ it "should not return the sidebar" do
273
+ @doc.content.should_not match("a b c d f")
274
+ end
275
+ end
276
+
277
+ describe "outputs good stuff for known documents" do
278
+ before do
279
+ @html_files = Dir.glob(File.dirname(__FILE__) + "/fixtures/samples/*.html")
280
+ @samples = @html_files.map {|filename| File.basename(filename, '.html') }
281
+ end
282
+
283
+ it "should output expected fragments of text" do
284
+ checks = 0
285
+ @samples.each do |sample|
286
+ html = File.read(File.dirname(__FILE__) + "/fixtures/samples/#{sample}.html")
287
+ doc = Readability::Document.new(html).content
288
+
289
+ load "fixtures/samples/#{sample}-fragments.rb"
290
+ #puts "testing #{sample}..."
291
+
292
+ $required_fragments.each do |required_text|
293
+ doc.should include(required_text)
294
+ checks += 1
295
+ end
296
+
297
+ $excluded_fragments.each do |text_to_avoid|
298
+ doc.should_not include(text_to_avoid)
299
+ checks += 1
300
+ end
301
+ end
302
+ #puts "Performed #{checks} checks."
303
+ end
304
+ end
305
+
306
+ describe "encoding guessing" do
307
+ if RUBY_VERSION =~ /^1\.9\./
308
+ context "with ruby 1.9.2" do
309
+ it "should correctly guess and enforce HTML encoding" do
310
+ doc = Readability::Document.new("<html><head><meta http-equiv='content-type' content='text/html; charset=LATIN1'></head><body><div>hi!</div></body></html>")
311
+ content = doc.content
312
+ content.encoding.to_s.should == "ISO-8859-1"
313
+ content.should be_valid_encoding
314
+ end
315
+
316
+ it "should allow encoding guessing to be skipped" do
317
+ do_not_allow(GuessHtmlEncoding).encode
318
+ doc = Readability::Document.new(@simple_html_fixture, :do_not_guess_encoding => true)
319
+ doc.content
320
+ end
321
+
322
+ it "should allow encoding guessing to be overridden" do
323
+ do_not_allow(GuessHtmlEncoding).encode
324
+ doc = Readability::Document.new(@simple_html_fixture, :encoding => "UTF-8")
325
+ doc.content
326
+ end
327
+ end
328
+ end
329
+ end
330
+ end
data/spec/spec.opts ADDED
@@ -0,0 +1,4 @@
1
+ --colour
2
+ --format s -c
3
+ --loadby mtime
4
+ --reverse
@@ -0,0 +1,11 @@
1
+ require 'rubygems'
2
+ require 'readability'
3
+ require 'rr'
4
+ require 'fakeweb'
5
+
6
+ RSpec.configure do |config|
7
+ config.mock_with :rr
8
+ end
9
+
10
+ FakeWeb.allow_net_connect = false
11
+ FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg", :body => File.read(File.dirname(__FILE__) + "/fixtures/images/dim_1416768a.jpg"))
metadata ADDED
@@ -0,0 +1,176 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: marcosinger-ruby-readability
3
+ version: !ruby/object:Gem::Version
4
+ hash: 7
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 6
9
+ - 0
10
+ version: 0.6.0
11
+ platform: ruby
12
+ authors:
13
+ - Andrew Cantino
14
+ - starrhorne
15
+ - libc
16
+ - Kyle Maxwell
17
+ - Marco Singer
18
+ autorequire:
19
+ bindir: bin
20
+ cert_chain: []
21
+
22
+ date: 2011-12-19 00:00:00 Z
23
+ dependencies:
24
+ - !ruby/object:Gem::Dependency
25
+ name: rspec
26
+ prerelease: false
27
+ requirement: &id001 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ hash: 15
33
+ segments:
34
+ - 2
35
+ - 6
36
+ version: "2.6"
37
+ type: :development
38
+ version_requirements: *id001
39
+ - !ruby/object:Gem::Dependency
40
+ name: rr
41
+ prerelease: false
42
+ requirement: &id002 !ruby/object:Gem::Requirement
43
+ none: false
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ hash: 15
48
+ segments:
49
+ - 1
50
+ - 0
51
+ version: "1.0"
52
+ type: :development
53
+ version_requirements: *id002
54
+ - !ruby/object:Gem::Dependency
55
+ name: nokogiri
56
+ prerelease: false
57
+ requirement: &id003 !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ hash: 3
63
+ segments:
64
+ - 1
65
+ - 4
66
+ - 2
67
+ version: 1.4.2
68
+ type: :runtime
69
+ version_requirements: *id003
70
+ - !ruby/object:Gem::Dependency
71
+ name: guess_html_encoding
72
+ prerelease: false
73
+ requirement: &id004 !ruby/object:Gem::Requirement
74
+ none: false
75
+ requirements:
76
+ - - ">="
77
+ - !ruby/object:Gem::Version
78
+ hash: 27
79
+ segments:
80
+ - 0
81
+ - 0
82
+ - 2
83
+ version: 0.0.2
84
+ type: :runtime
85
+ version_requirements: *id004
86
+ description: Port of arc90's readability project to ruby
87
+ email:
88
+ - andrew@iterationlabs.com
89
+ - markaum@gmail.com
90
+ executables:
91
+ - readability
92
+ extensions: []
93
+
94
+ extra_rdoc_files: []
95
+
96
+ files:
97
+ - .document
98
+ - .gitignore
99
+ - .rspec
100
+ - Gemfile
101
+ - README
102
+ - Rakefile
103
+ - bin/readability
104
+ - lib/readability.rb
105
+ - lib/ruby-readability.rb
106
+ - ruby-readability.gemspec
107
+ - spec/fixtures/bbc.html
108
+ - spec/fixtures/cant_read.html
109
+ - spec/fixtures/images/dim_1416768a.jpg
110
+ - spec/fixtures/nytimes.html
111
+ - spec/fixtures/sample.html
112
+ - spec/fixtures/samples/blogpost_with_links-fragments.rb
113
+ - spec/fixtures/samples/blogpost_with_links.html
114
+ - spec/fixtures/samples/channel4-1-fragments.rb
115
+ - spec/fixtures/samples/channel4-1.html
116
+ - spec/fixtures/samples/foxnews-india1-fragments.rb
117
+ - spec/fixtures/samples/foxnews-india1.html
118
+ - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
119
+ - spec/fixtures/samples/globemail-ottawa-cuts.html
120
+ - spec/fixtures/should_not_truncate.txt
121
+ - spec/fixtures/thesun.html
122
+ - spec/readability_spec.rb
123
+ - spec/spec.opts
124
+ - spec/spec_helper.rb
125
+ homepage: http://github.com/iterationlabs/ruby-readability
126
+ licenses: []
127
+
128
+ post_install_message:
129
+ rdoc_options: []
130
+
131
+ require_paths:
132
+ - lib
133
+ required_ruby_version: !ruby/object:Gem::Requirement
134
+ none: false
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ hash: 3
139
+ segments:
140
+ - 0
141
+ version: "0"
142
+ required_rubygems_version: !ruby/object:Gem::Requirement
143
+ none: false
144
+ requirements:
145
+ - - ">="
146
+ - !ruby/object:Gem::Version
147
+ hash: 3
148
+ segments:
149
+ - 0
150
+ version: "0"
151
+ requirements: []
152
+
153
+ rubyforge_project: ruby-readability
154
+ rubygems_version: 1.8.10
155
+ signing_key:
156
+ specification_version: 3
157
+ summary: Port of arc90's readability project to ruby
158
+ test_files:
159
+ - spec/fixtures/bbc.html
160
+ - spec/fixtures/cant_read.html
161
+ - spec/fixtures/images/dim_1416768a.jpg
162
+ - spec/fixtures/nytimes.html
163
+ - spec/fixtures/sample.html
164
+ - spec/fixtures/samples/blogpost_with_links-fragments.rb
165
+ - spec/fixtures/samples/blogpost_with_links.html
166
+ - spec/fixtures/samples/channel4-1-fragments.rb
167
+ - spec/fixtures/samples/channel4-1.html
168
+ - spec/fixtures/samples/foxnews-india1-fragments.rb
169
+ - spec/fixtures/samples/foxnews-india1.html
170
+ - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
171
+ - spec/fixtures/samples/globemail-ottawa-cuts.html
172
+ - spec/fixtures/should_not_truncate.txt
173
+ - spec/fixtures/thesun.html
174
+ - spec/readability_spec.rb
175
+ - spec/spec.opts
176
+ - spec/spec_helper.rb