marcosinger-ruby-readability 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,330 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'spec_helper'
4
+
5
+ describe Readability do
6
+ before do # shared fixture: body, p and div with class 'comment' (the div also has id 'body'), plus a div wrapping a blockquote
7
+ @simple_html_fixture = <<-HTML
8
+ <html>
9
+ <head>
10
+ <title>title!</title>
11
+ </head>
12
+ <body class='comment'>
13
+ <div>
14
+ <p class='comment'>a comment</p>
15
+ <div class='comment' id='body'>real content</div>
16
+ <div id="contains_blockquote"><blockquote>something in a table</blockquote></div>
17
+ </div>
18
+ </body>
19
+ </html>
20
+ HTML
21
+ end
22
+
23
+ describe "images" do # Document#images and #best_candidate_has_image, exercised on three saved news pages
24
+ before do # reads the three saved HTML pages; the comments below pair each variable with a page URL
25
+ # bbc => http://www.bbc.co.uk/news/magazine-15959067
26
+ # nytimes => http://opinionator.blogs.nytimes.com/2011/12/01/health-care-for-a-changing-work-force/
27
+ # thesum => http://www.thesun.co.uk/sol/homepage/sport/football/3973265/Manchester-United-news-Dimitar-Berbatov-and-Carling-Cup-flops-warned.html
28
+
29
+ @bbc = File.read(File.dirname(__FILE__) + "/fixtures/bbc.html")
30
+ @nytimes = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html")
31
+ @thesum = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html") # NOTE: variable is spelled @thesum although the fixture file is thesun.html
32
+ end
33
+
34
+ it "should show one image, but outside of the best candidate" do
35
+ @doc = Readability::Document.new(@thesum)
36
+ @doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"]
37
+ @doc.best_candidate_has_image.should == false # an image is reported, yet the flag stays false
38
+ end
39
+
40
+ it "should show one image inside of the best candidate" do
41
+ @doc = Readability::Document.new(@nytimes)
42
+ @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
43
+ @doc.best_candidate_has_image.should == true
44
+ end
45
+
46
+ describe "no images" do
47
+ it "shouldn't show images" do
48
+ @doc = Readability::Document.new(@bbc, :min_image_height => 400) # with this minimum height and default options, no BBC image is expected back
49
+ @doc.images.should == []
50
+ @doc.best_candidate_has_image.should == false
51
+ end
52
+ end
53
+
54
+ describe "poll of images" do
55
+ it "should show some images inside of the best candidate" do
56
+ @doc = Readability::Document.new(@bbc) # default options: the expected list below contains only .jpg URLs
57
+ @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
58
+ @doc.best_candidate_has_image.should == true
59
+ end
60
+
61
+ it "should show some images inside of the best candidate, include gif format" do
62
+ @doc = Readability::Document.new(@bbc, :ignore_image_format => []) # empty ignore list: the .gif now appears too (presumably gif is ignored by default — confirm)
63
+ @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
64
+ @doc.best_candidate_has_image.should == true
65
+ end
66
+
67
+ describe "width, height and format" do # combinations of :min_image_width, :min_image_height and :ignore_image_format
68
+ it "should show some images inside of the best candidate, but with width most equal to 400px" do
69
+ @doc = Readability::Document.new(@bbc, :min_image_width => 400, :ignore_image_format => [])
70
+ @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg"]
71
+ @doc.best_candidate_has_image.should == true
72
+ end
73
+
74
+ it "should show some images inside of the best candidate, but with width most equal to 304px" do
75
+ @doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => [])
76
+ @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
77
+ @doc.best_candidate_has_image.should == true
78
+ end
79
+
80
+ it "should show some images inside of the best candidate, but with width most equal to 304px and ignoring JPG format" do
81
+ @doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => ["jpg"]) # ignoring "jpg" leaves only the .gif
82
+ @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
83
+ @doc.best_candidate_has_image.should == true
84
+ end
85
+
86
+ it "should show some images inside of the best candidate, but with height most equal to 400px, no ignoring no format" do
87
+ @doc = Readability::Document.new(@bbc, :min_image_height => 400, :ignore_image_format => []) # same height limit as the "no images" example, but with an empty ignore list
88
+ @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
89
+ @doc.best_candidate_has_image.should == true
90
+ end
91
+ end
92
+ end
93
+ end
94
+
95
+ describe "transformMisusedDivsIntoParagraphs" do # transform_misused_divs_into_paragraphs! applied to the shared @simple_html_fixture
96
+ before do
97
+ @doc = Readability::Document.new(@simple_html_fixture)
98
+ @doc.transform_misused_divs_into_paragraphs! # run the transformation before each example inspects @doc.html
99
+ end
100
+
101
+ it "should transform divs containing no block elements into <p>s" do
102
+ @doc.html.css("#body").first.name.should == "p" # the div with id 'body' holds only the text "real content"
103
+ end
104
+
105
+ it "should not transform divs that contain block elements" do
106
+ @doc.html.css("#contains_blockquote").first.name.should == "div" # this div wraps a <blockquote>
107
+ end
108
+ end
109
+
110
+ describe "score_node" do # compares the :content_score entry that score_node returns for different elements
111
+ before do # two sibling elements, a <div> and a <th>, each wrapping one short paragraph
112
+ @doc = Readability::Document.new(<<-HTML)
113
+ <html>
114
+ <body>
115
+ <div id='elem1'>
116
+ <p>some content</p>
117
+ </div>
118
+ <th id='elem2'>
119
+ <p>some other content</p>
120
+ </th>
121
+ </body>
122
+ </html>
123
+ HTML
124
+ @elem1 = @doc.html.css("#elem1").first
125
+ @elem2 = @doc.html.css("#elem2").first
126
+ end
127
+
128
+ it "should like <div>s more than <th>s" do
129
+ @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score]
130
+ end
131
+
132
+ it "should like classes like text more than classes like comment" do
133
+ @elem2.name = "div" # rename the <th> to <div> so tag name no longer separates the two nodes
134
+ @doc.score_node(@elem1)[:content_score].should == @doc.score_node(@elem2)[:content_score] # baseline: equal scores before any class is set
135
+ @elem1['class'] = "text"
136
+ @elem2['class'] = "comment"
137
+ @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score] # only the class attribute differs now
138
+ end
139
+ end
140
+
141
+ describe "remove_unlikely_candidates!" do # remove_unlikely_candidates! applied to the shared @simple_html_fixture
142
+ before do
143
+ @doc = Readability::Document.new(@simple_html_fixture)
144
+ @doc.remove_unlikely_candidates! # prune before each example inspects the remaining markup
145
+ end
146
+
147
+ it "should remove things that have class comment" do
148
+ @doc.html.inner_html.should_not =~ /a comment/ # text of <p class='comment'> in the fixture
149
+ end
150
+
151
+ it "should not remove body tags" do
152
+ @doc.html.inner_html.should =~ /<\/body>/ # the fixture's <body> itself carries class='comment'
153
+ end
154
+
155
+ it "should not remove things with class comment and id body" do
156
+ @doc.html.inner_html.should =~ /real content/ # text of <div class='comment' id='body'> in the fixture
157
+ end
158
+ end
159
+
160
+ describe "score_paragraphs" do # the examples read score_paragraphs' result via .values, each value exposing :content_score and :elem
161
+ before(:each) do # NOTE: id="div2 in the fixture lacks its closing quote; the trailing <!-- " --> looks like it balances the quote — presumably deliberate, confirm
162
+ @doc = Readability::Document.new(<<-HTML)
163
+ <html>
164
+ <head>
165
+ <title>title!</title>
166
+ </head>
167
+ <body id="body">
168
+ <div id="div1">
169
+ <div id="div2>
170
+ <p id="some_comment">a comment</p>
171
+ </div>
172
+ <p id="some_text">some text</p>
173
+ </div>
174
+ <div id="div3">
175
+ <p id="some_text2">some more text</p>
176
+ </div>
177
+ </body>
178
+ </html><!-- " -->
179
+ HTML
180
+ @candidates = @doc.score_paragraphs(0) # argument 0 is presumably a minimum text length — confirm against Document#score_paragraphs
181
+ end
182
+
183
+ it "should score elements in the document" do
184
+ @candidates.values.length.should == 3 # three scored candidates are expected for this fixture
185
+ end
186
+
187
+ it "should prefer the body in this particular example" do # sorts candidates by descending :content_score and checks the winner's id
188
+ @candidates.values.sort { |a, b|
189
+ b[:content_score] <=> a[:content_score]
190
+ }.first[:elem][:id].should == "body"
191
+ end
192
+
193
+ context "when two consequent br tags are used instead of p" do # text separated by <br/><br/> pairs rather than wrapped in <p>
194
+ it "should assign the higher score to the first paragraph in this particular example" do
195
+ @doc = Readability::Document.new(<<-HTML)
196
+ <html>
197
+ <head>
198
+ <title>title!</title>
199
+ </head>
200
+ <body id="body">
201
+ <div id="post1">
202
+ This is the main content!<br/><br/>
203
+ Zebra found killed butcher with the chainsaw.<br/><br/>
204
+ If only I could think of an example, oh, wait.
205
+ </div>
206
+ <div id="post2">
207
+ This is not the content and although it's longer if you meaure it in characters,
208
+ it's supposed to have lower score than the previous paragraph. And it's only because
209
+ of the previous paragraph is not one paragraph, it's three subparagraphs
210
+ </div>
211
+ </body>
212
+ </html>
213
+ HTML
214
+ @candidates = @doc.score_paragraphs(0)
215
+ @candidates.values.sort_by { |a| -a[:content_score] }.first[:elem][:id].should == 'post1' # negated score gives a descending sort; the top candidate must be #post1
216
+ end
217
+ end
218
+ end
219
+
220
+ describe "the cant_read.html fixture" do # content extraction from a saved page with :tags and :attributes options supplied
221
+ it "should work on the cant_read.html fixture with some allowed tags" do
222
+ allowed_tags = %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a]
223
+ allowed_attributes = %w[href]
224
+ html = File.read(File.dirname(__FILE__) + "/fixtures/cant_read.html")
225
+ Readability::Document.new(html, :tags => allowed_tags, :attributes => allowed_attributes).content.should match(/Can you talk a little about how you developed the looks for the/) # a sentence expected to be present in the extracted content
226
+ end
227
+ end
228
+
229
+ describe "general functionality" do # #content and #title on a minimal inline document
230
+ before do # :min_text_length => 0 and :retry_length => 1 are passed for this tiny document (presumably so it is not rejected as too short — confirm)
231
+ @doc = Readability::Document.new("<html><head><title>title!</title></head><body><div><p>Some content</p></div></body>",
232
+ :min_text_length => 0, :retry_length => 1)
233
+ end
234
+
235
+ it "should return the main page content" do
236
+ @doc.content.should match("Some content")
237
+ end
238
+
239
+ it "should return the page title if present" do
240
+ @doc.title.should match("title!")
241
+
242
+ doc = Readability::Document.new("<html><head></head><body><div><p>Some content</p></div></body>", # same markup without a <title> element
243
+ :min_text_length => 0, :retry_length => 1)
244
+ doc.title.should be_nil
245
+ end
246
+ end
247
+
248
+ describe "ignoring sidebars" do # a div with class 'sidebar' must not show up in the extracted content
249
+ before do # NOTE: the sidebar markup reads <p>sidebar<p> (second tag is not a closing tag) — presumably a typo in the fixture; confirm
250
+ @doc = Readability::Document.new("<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>",
251
+ :min_text_length => 0, :retry_length => 1)
252
+ end
253
+
254
+ it "should not return the sidebar" do
255
+ @doc.content.should_not match("sidebar")
256
+ end
257
+ end
258
+
259
+ describe "inserting space for block elements" do # one paragraph whose text is split by <br>, <hr> and <address>
260
+ before do # NOTE: the fixture paragraph ends with "f/p>" rather than "f</p>" — presumably a typo; confirm
261
+ @doc = Readability::Document.new(<<-HTML, :min_text_length => 0, :retry_length => 1)
262
+ <html><head><title>title!</title></head>
263
+ <body>
264
+ <div>
265
+ <p>a<br>b<hr>c<address>d</address>f/p>
266
+ </div>
267
+ </body>
268
+ </html>
269
+ HTML
270
+ end
271
+
272
+ it "should not return the sidebar" do # NOTE(review): description duplicates the "ignoring sidebars" example; nothing here involves a sidebar
273
+ @doc.content.should_not match("a b c d f") # NOTE(review): asserts the content does NOT contain "a b c d f" — confirm should_not (rather than should) is intended
274
+ end
275
+ end
276
+
277
+ describe "outputs good stuff for known documents" do
278
+ before do
279
+ @html_files = Dir.glob(File.dirname(__FILE__) + "/fixtures/samples/*.html")
280
+ @samples = @html_files.map {|filename| File.basename(filename, '.html') }
281
+ end
282
+
283
+ it "should output expected fragments of text" do
284
+ checks = 0
285
+ @samples.each do |sample|
286
+ html = File.read(File.dirname(__FILE__) + "/fixtures/samples/#{sample}.html")
287
+ doc = Readability::Document.new(html).content
288
+
289
+ load "fixtures/samples/#{sample}-fragments.rb"
290
+ #puts "testing #{sample}..."
291
+
292
+ $required_fragments.each do |required_text|
293
+ doc.should include(required_text)
294
+ checks += 1
295
+ end
296
+
297
+ $excluded_fragments.each do |text_to_avoid|
298
+ doc.should_not include(text_to_avoid)
299
+ checks += 1
300
+ end
301
+ end
302
+ #puts "Performed #{checks} checks."
303
+ end
304
+ end
305
+
306
+ describe "encoding guessing" do # interaction between Readability::Document and GuessHtmlEncoding
307
+ if RUBY_VERSION =~ /^1\.9\./ # NOTE(review): matches only 1.9.x, so these examples are not defined on any other Ruby version
308
+ context "with ruby 1.9.2" do # NOTE: label says 1.9.2 although the guard above accepts any 1.9.x
309
+ it "should correctly guess and enforce HTML encoding" do
310
+ doc = Readability::Document.new("<html><head><meta http-equiv='content-type' content='text/html; charset=LATIN1'></head><body><div>hi!</div></body></html>") # the meta tag declares charset=LATIN1
311
+ content = doc.content
312
+ content.encoding.to_s.should == "ISO-8859-1"
313
+ content.should be_valid_encoding
314
+ end
315
+
316
+ it "should allow encoding guessing to be skipped" do
317
+ do_not_allow(GuessHtmlEncoding).encode # rr expectation: GuessHtmlEncoding.encode must not be called
318
+ doc = Readability::Document.new(@simple_html_fixture, :do_not_guess_encoding => true)
319
+ doc.content
320
+ end
321
+
322
+ it "should allow encoding guessing to be overridden" do
323
+ do_not_allow(GuessHtmlEncoding).encode # rr expectation: GuessHtmlEncoding.encode must not be called
324
+ doc = Readability::Document.new(@simple_html_fixture, :encoding => "UTF-8") # an explicit :encoding is supplied instead
325
+ doc.content
326
+ end
327
+ end
328
+ end
329
+ end
329
+ end
330
+ end
data/spec/spec.opts ADDED
@@ -0,0 +1,4 @@
1
+ --colour
2
+ --format s -c
3
+ --loadby mtime
4
+ --reverse
@@ -0,0 +1,11 @@
1
+ require 'rubygems'
2
+ require 'readability'
3
+ require 'rr'
4
+ require 'fakeweb'
5
+
6
+ RSpec.configure do |config|
7
+ config.mock_with :rr # use rr as the mocking framework (the specs call rr's do_not_allow)
8
+ end
9
+
10
+ FakeWeb.allow_net_connect = false # refuse real HTTP connections during the suite; only URIs registered with FakeWeb are answered
11
+ FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg", :body => File.read(File.dirname(__FILE__) + "/fixtures/images/dim_1416768a.jpg")) # serve the saved JPEG for the one image URL the thesun example expects (presumably fetched to measure image size — confirm)
metadata ADDED
@@ -0,0 +1,176 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: marcosinger-ruby-readability
3
+ version: !ruby/object:Gem::Version
4
+ hash: 7
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 6
9
+ - 0
10
+ version: 0.6.0
11
+ platform: ruby
12
+ authors:
13
+ - Andrew Cantino
14
+ - starrhorne
15
+ - libc
16
+ - Kyle Maxwell
17
+ - Marco Singer
18
+ autorequire:
19
+ bindir: bin
20
+ cert_chain: []
21
+
22
+ date: 2011-12-19 00:00:00 Z
23
+ dependencies:
24
+ - !ruby/object:Gem::Dependency
25
+ name: rspec
26
+ prerelease: false
27
+ requirement: &id001 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ hash: 15
33
+ segments:
34
+ - 2
35
+ - 6
36
+ version: "2.6"
37
+ type: :development
38
+ version_requirements: *id001
39
+ - !ruby/object:Gem::Dependency
40
+ name: rr
41
+ prerelease: false
42
+ requirement: &id002 !ruby/object:Gem::Requirement
43
+ none: false
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ hash: 15
48
+ segments:
49
+ - 1
50
+ - 0
51
+ version: "1.0"
52
+ type: :development
53
+ version_requirements: *id002
54
+ - !ruby/object:Gem::Dependency
55
+ name: nokogiri
56
+ prerelease: false
57
+ requirement: &id003 !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ hash: 3
63
+ segments:
64
+ - 1
65
+ - 4
66
+ - 2
67
+ version: 1.4.2
68
+ type: :runtime
69
+ version_requirements: *id003
70
+ - !ruby/object:Gem::Dependency
71
+ name: guess_html_encoding
72
+ prerelease: false
73
+ requirement: &id004 !ruby/object:Gem::Requirement
74
+ none: false
75
+ requirements:
76
+ - - ">="
77
+ - !ruby/object:Gem::Version
78
+ hash: 27
79
+ segments:
80
+ - 0
81
+ - 0
82
+ - 2
83
+ version: 0.0.2
84
+ type: :runtime
85
+ version_requirements: *id004
86
+ description: Port of arc90's readability project to ruby
87
+ email:
88
+ - andrew@iterationlabs.com
89
+ - markaum@gmail.com
90
+ executables:
91
+ - readability
92
+ extensions: []
93
+
94
+ extra_rdoc_files: []
95
+
96
+ files:
97
+ - .document
98
+ - .gitignore
99
+ - .rspec
100
+ - Gemfile
101
+ - README
102
+ - Rakefile
103
+ - bin/readability
104
+ - lib/readability.rb
105
+ - lib/ruby-readability.rb
106
+ - ruby-readability.gemspec
107
+ - spec/fixtures/bbc.html
108
+ - spec/fixtures/cant_read.html
109
+ - spec/fixtures/images/dim_1416768a.jpg
110
+ - spec/fixtures/nytimes.html
111
+ - spec/fixtures/sample.html
112
+ - spec/fixtures/samples/blogpost_with_links-fragments.rb
113
+ - spec/fixtures/samples/blogpost_with_links.html
114
+ - spec/fixtures/samples/channel4-1-fragments.rb
115
+ - spec/fixtures/samples/channel4-1.html
116
+ - spec/fixtures/samples/foxnews-india1-fragments.rb
117
+ - spec/fixtures/samples/foxnews-india1.html
118
+ - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
119
+ - spec/fixtures/samples/globemail-ottawa-cuts.html
120
+ - spec/fixtures/should_not_truncate.txt
121
+ - spec/fixtures/thesun.html
122
+ - spec/readability_spec.rb
123
+ - spec/spec.opts
124
+ - spec/spec_helper.rb
125
+ homepage: http://github.com/iterationlabs/ruby-readability
126
+ licenses: []
127
+
128
+ post_install_message:
129
+ rdoc_options: []
130
+
131
+ require_paths:
132
+ - lib
133
+ required_ruby_version: !ruby/object:Gem::Requirement
134
+ none: false
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ hash: 3
139
+ segments:
140
+ - 0
141
+ version: "0"
142
+ required_rubygems_version: !ruby/object:Gem::Requirement
143
+ none: false
144
+ requirements:
145
+ - - ">="
146
+ - !ruby/object:Gem::Version
147
+ hash: 3
148
+ segments:
149
+ - 0
150
+ version: "0"
151
+ requirements: []
152
+
153
+ rubyforge_project: ruby-readability
154
+ rubygems_version: 1.8.10
155
+ signing_key:
156
+ specification_version: 3
157
+ summary: Port of arc90's readability project to ruby
158
+ test_files:
159
+ - spec/fixtures/bbc.html
160
+ - spec/fixtures/cant_read.html
161
+ - spec/fixtures/images/dim_1416768a.jpg
162
+ - spec/fixtures/nytimes.html
163
+ - spec/fixtures/sample.html
164
+ - spec/fixtures/samples/blogpost_with_links-fragments.rb
165
+ - spec/fixtures/samples/blogpost_with_links.html
166
+ - spec/fixtures/samples/channel4-1-fragments.rb
167
+ - spec/fixtures/samples/channel4-1.html
168
+ - spec/fixtures/samples/foxnews-india1-fragments.rb
169
+ - spec/fixtures/samples/foxnews-india1.html
170
+ - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
171
+ - spec/fixtures/samples/globemail-ottawa-cuts.html
172
+ - spec/fixtures/should_not_truncate.txt
173
+ - spec/fixtures/thesun.html
174
+ - spec/readability_spec.rb
175
+ - spec/spec.opts
176
+ - spec/spec_helper.rb