ruby-readability 0.5.0 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.rspec +3 -0
- data/Gemfile +6 -0
- data/README.markdown +73 -0
- data/bin/readability +0 -1
- data/lib/readability.rb +79 -14
- data/ruby-readability.gemspec +3 -2
- data/spec/fixtures/bbc.html +2069 -0
- data/spec/fixtures/images/dim_1416768a.jpg +0 -0
- data/spec/fixtures/nytimes.html +58 -0
- data/spec/fixtures/thesun.html +1122 -0
- data/spec/readability_spec.rb +95 -3
- data/spec/spec_helper.rb +4 -0
- metadata +30 -28
- data/README +0 -54
data/spec/readability_spec.rb
CHANGED
@@ -20,6 +20,98 @@ describe Readability do
|
|
20
20
|
HTML
|
21
21
|
end
|
22
22
|
|
23
|
+
describe "images" do
|
24
|
+
before do
|
25
|
+
@bbc = File.read(File.dirname(__FILE__) + "/fixtures/bbc.html")
|
26
|
+
@nytimes = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html")
|
27
|
+
@thesum = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
|
28
|
+
|
29
|
+
FakeWeb::Registry.instance.clean_registry
|
30
|
+
FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg",
|
31
|
+
:body => File.read(File.dirname(__FILE__) + "/fixtures/images/dim_1416768a.jpg"))
|
32
|
+
end
|
33
|
+
|
34
|
+
it "should show one image, but outside of the best candidate" do
|
35
|
+
@doc = Readability::Document.new(@thesum)
|
36
|
+
@doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"]
|
37
|
+
@doc.best_candidate_has_image.should == false
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should show one image inside of the best candidate" do
|
41
|
+
@doc = Readability::Document.new(@nytimes)
|
42
|
+
@doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
|
43
|
+
@doc.best_candidate_has_image.should == true
|
44
|
+
end
|
45
|
+
|
46
|
+
it "should not try to download local images" do
|
47
|
+
@doc = Readability::Document.new(<<-HTML)
|
48
|
+
<html>
|
49
|
+
<head>
|
50
|
+
<title>title!</title>
|
51
|
+
</head>
|
52
|
+
<body class='comment'>
|
53
|
+
<div>
|
54
|
+
<img src="/something/local.gif" />
|
55
|
+
</div>
|
56
|
+
</body>
|
57
|
+
</html>
|
58
|
+
HTML
|
59
|
+
do_not_allow(@doc).load_image(anything)
|
60
|
+
@doc.images.should == []
|
61
|
+
end
|
62
|
+
|
63
|
+
describe "no images" do
|
64
|
+
it "shouldn't show images" do
|
65
|
+
@doc = Readability::Document.new(@bbc, :min_image_height => 600)
|
66
|
+
@doc.images.should == []
|
67
|
+
@doc.best_candidate_has_image.should == false
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
describe "poll of images" do
|
72
|
+
it "should show some images inside of the best candidate" do
|
73
|
+
@doc = Readability::Document.new(@bbc)
|
74
|
+
@doc.images.should =~ ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg",
|
75
|
+
"http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg",
|
76
|
+
"http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif",
|
77
|
+
"http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
|
78
|
+
@doc.best_candidate_has_image.should == true
|
79
|
+
end
|
80
|
+
|
81
|
+
it "should show some images inside of the best candidate, include gif format" do
|
82
|
+
@doc = Readability::Document.new(@bbc, :ignore_image_format => [])
|
83
|
+
@doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
|
84
|
+
@doc.best_candidate_has_image.should == true
|
85
|
+
end
|
86
|
+
|
87
|
+
describe "width, height and format" do
|
88
|
+
it "should show some images inside of the best candidate, but with width most equal to 400px" do
|
89
|
+
@doc = Readability::Document.new(@bbc, :min_image_width => 400, :ignore_image_format => [])
|
90
|
+
@doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg"]
|
91
|
+
@doc.best_candidate_has_image.should == true
|
92
|
+
end
|
93
|
+
|
94
|
+
it "should show some images inside of the best candidate, but with width most equal to 304px" do
|
95
|
+
@doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => [])
|
96
|
+
@doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
|
97
|
+
@doc.best_candidate_has_image.should == true
|
98
|
+
end
|
99
|
+
|
100
|
+
it "should show some images inside of the best candidate, but with width most equal to 304px and ignoring JPG format" do
|
101
|
+
@doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => ["jpg"])
|
102
|
+
@doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
|
103
|
+
@doc.best_candidate_has_image.should == true
|
104
|
+
end
|
105
|
+
|
106
|
+
it "should show some images inside of the best candidate, but with height most equal to 400px, no ignoring no format" do
|
107
|
+
@doc = Readability::Document.new(@bbc, :min_image_height => 400, :ignore_image_format => [])
|
108
|
+
@doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
|
109
|
+
@doc.best_candidate_has_image.should == true
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
23
115
|
describe "transformMisusedDivsIntoParagraphs" do
|
24
116
|
before do
|
25
117
|
@doc = Readability::Document.new(@simple_html_fixture)
|
@@ -207,7 +299,7 @@ describe Readability do
|
|
207
299
|
@html_files = Dir.glob(File.dirname(__FILE__) + "/fixtures/samples/*.html")
|
208
300
|
@samples = @html_files.map {|filename| File.basename(filename, '.html') }
|
209
301
|
end
|
210
|
-
|
302
|
+
|
211
303
|
it "should output expected fragments of text" do
|
212
304
|
checks = 0
|
213
305
|
@samples.each do |sample|
|
@@ -216,12 +308,12 @@ describe Readability do
|
|
216
308
|
|
217
309
|
load "fixtures/samples/#{sample}-fragments.rb"
|
218
310
|
#puts "testing #{sample}..."
|
219
|
-
|
311
|
+
|
220
312
|
$required_fragments.each do |required_text|
|
221
313
|
doc.should include(required_text)
|
222
314
|
checks += 1
|
223
315
|
end
|
224
|
-
|
316
|
+
|
225
317
|
$excluded_fragments.each do |text_to_avoid|
|
226
318
|
doc.should_not include(text_to_avoid)
|
227
319
|
checks += 1
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby-readability
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.0
|
4
|
+
version: 0.5.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -12,22 +12,33 @@ authors:
|
|
12
12
|
autorequire:
|
13
13
|
bindir: bin
|
14
14
|
cert_chain: []
|
15
|
-
date: 2012-
|
15
|
+
date: 2012-03-14 00:00:00.000000000Z
|
16
16
|
dependencies:
|
17
17
|
- !ruby/object:Gem::Dependency
|
18
18
|
name: rspec
|
19
|
-
requirement: &
|
19
|
+
requirement: &86479890 !ruby/object:Gem::Requirement
|
20
20
|
none: false
|
21
21
|
requirements:
|
22
22
|
- - ! '>='
|
23
23
|
- !ruby/object:Gem::Version
|
24
|
-
version: '2.
|
24
|
+
version: '2.8'
|
25
25
|
type: :development
|
26
26
|
prerelease: false
|
27
|
-
version_requirements: *
|
27
|
+
version_requirements: *86479890
|
28
|
+
- !ruby/object:Gem::Dependency
|
29
|
+
name: rspec-expectations
|
30
|
+
requirement: &86479650 !ruby/object:Gem::Requirement
|
31
|
+
none: false
|
32
|
+
requirements:
|
33
|
+
- - ! '>='
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: '2.8'
|
36
|
+
type: :development
|
37
|
+
prerelease: false
|
38
|
+
version_requirements: *86479650
|
28
39
|
- !ruby/object:Gem::Dependency
|
29
40
|
name: rr
|
30
|
-
requirement: &
|
41
|
+
requirement: &86479420 !ruby/object:Gem::Requirement
|
31
42
|
none: false
|
32
43
|
requirements:
|
33
44
|
- - ! '>='
|
@@ -35,10 +46,10 @@ dependencies:
|
|
35
46
|
version: '1.0'
|
36
47
|
type: :development
|
37
48
|
prerelease: false
|
38
|
-
version_requirements: *
|
49
|
+
version_requirements: *86479420
|
39
50
|
- !ruby/object:Gem::Dependency
|
40
51
|
name: nokogiri
|
41
|
-
requirement: &
|
52
|
+
requirement: &86479190 !ruby/object:Gem::Requirement
|
42
53
|
none: false
|
43
54
|
requirements:
|
44
55
|
- - ! '>='
|
@@ -46,10 +57,10 @@ dependencies:
|
|
46
57
|
version: 1.4.2
|
47
58
|
type: :runtime
|
48
59
|
prerelease: false
|
49
|
-
version_requirements: *
|
60
|
+
version_requirements: *86479190
|
50
61
|
- !ruby/object:Gem::Dependency
|
51
62
|
name: guess_html_encoding
|
52
|
-
requirement: &
|
63
|
+
requirement: &86478960 !ruby/object:Gem::Requirement
|
53
64
|
none: false
|
54
65
|
requirements:
|
55
66
|
- - ! '>='
|
@@ -57,7 +68,7 @@ dependencies:
|
|
57
68
|
version: 0.0.2
|
58
69
|
type: :runtime
|
59
70
|
prerelease: false
|
60
|
-
version_requirements: *
|
71
|
+
version_requirements: *86478960
|
61
72
|
description: Port of arc90's readability project to ruby
|
62
73
|
email:
|
63
74
|
- andrew@iterationlabs.com
|
@@ -68,14 +79,18 @@ extra_rdoc_files: []
|
|
68
79
|
files:
|
69
80
|
- .document
|
70
81
|
- .gitignore
|
82
|
+
- .rspec
|
71
83
|
- Gemfile
|
72
|
-
- README
|
84
|
+
- README.markdown
|
73
85
|
- Rakefile
|
74
86
|
- bin/readability
|
75
87
|
- lib/readability.rb
|
76
88
|
- lib/ruby-readability.rb
|
77
89
|
- ruby-readability.gemspec
|
90
|
+
- spec/fixtures/bbc.html
|
78
91
|
- spec/fixtures/cant_read.html
|
92
|
+
- spec/fixtures/images/dim_1416768a.jpg
|
93
|
+
- spec/fixtures/nytimes.html
|
79
94
|
- spec/fixtures/sample.html
|
80
95
|
- spec/fixtures/samples/blogpost_with_links-fragments.rb
|
81
96
|
- spec/fixtures/samples/blogpost_with_links.html
|
@@ -86,6 +101,7 @@ files:
|
|
86
101
|
- spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
|
87
102
|
- spec/fixtures/samples/globemail-ottawa-cuts.html
|
88
103
|
- spec/fixtures/should_not_truncate.txt
|
104
|
+
- spec/fixtures/thesun.html
|
89
105
|
- spec/readability_spec.rb
|
90
106
|
- spec/spec.opts
|
91
107
|
- spec/spec_helper.rb
|
@@ -109,22 +125,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
109
125
|
version: '0'
|
110
126
|
requirements: []
|
111
127
|
rubyforge_project: ruby-readability
|
112
|
-
rubygems_version: 1.8.
|
128
|
+
rubygems_version: 1.8.16
|
113
129
|
signing_key:
|
114
130
|
specification_version: 3
|
115
131
|
summary: Port of arc90's readability project to ruby
|
116
|
-
test_files:
|
117
|
-
- spec/fixtures/cant_read.html
|
118
|
-
- spec/fixtures/sample.html
|
119
|
-
- spec/fixtures/samples/blogpost_with_links-fragments.rb
|
120
|
-
- spec/fixtures/samples/blogpost_with_links.html
|
121
|
-
- spec/fixtures/samples/channel4-1-fragments.rb
|
122
|
-
- spec/fixtures/samples/channel4-1.html
|
123
|
-
- spec/fixtures/samples/foxnews-india1-fragments.rb
|
124
|
-
- spec/fixtures/samples/foxnews-india1.html
|
125
|
-
- spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
|
126
|
-
- spec/fixtures/samples/globemail-ottawa-cuts.html
|
127
|
-
- spec/fixtures/should_not_truncate.txt
|
128
|
-
- spec/readability_spec.rb
|
129
|
-
- spec/spec.opts
|
130
|
-
- spec/spec_helper.rb
|
132
|
+
test_files: []
|
data/README
DELETED
@@ -1,54 +0,0 @@
|
|
1
|
-
Ruby Readability
|
2
|
-
|
3
|
-
Command line:
|
4
|
-
(sudo) gem install ruby-readability
|
5
|
-
|
6
|
-
Bundler:
|
7
|
-
gem "ruby-readability", :require => 'readability'
|
8
|
-
|
9
|
-
Example:
|
10
|
-
|
11
|
-
require 'rubygems'
|
12
|
-
require 'readability'
|
13
|
-
require 'open-uri'
|
14
|
-
|
15
|
-
source = open('http://lab.arc90.com/experiments/readability/').read
|
16
|
-
puts Readability::Document.new(source).content
|
17
|
-
|
18
|
-
Options:
|
19
|
-
|
20
|
-
You may provide additional options to Readability::Document.new, including:
|
21
|
-
|
22
|
-
:tags - the base whitelist of tags to sanitize, defaults to %w[div p]
|
23
|
-
:remove_empty_nodes - remove <p> tags that have no text content; also removes p tags that contain only images
|
24
|
-
:attributes - whitelist of allowed attributes
|
25
|
-
:debug - provide debugging output, defaults false
|
26
|
-
:encoding - if this page is of a known encoding, you can specify it; if left
|
27
|
-
unspecified, the encoding will be guessed (only in Ruby 1.9.x)
|
28
|
-
:html_headers - in Ruby 1.9.x these will be passed to the guess_html_encoding gem
|
29
|
-
to aid with guessing the HTML encoding
|
30
|
-
|
31
|
-
Readability comes with a command-line tool for experimentation in bin/readability.
|
32
|
-
|
33
|
-
Usage: readability [options] URL
|
34
|
-
-d, --debug Show debug output
|
35
|
-
-i, --images Keep images and links
|
36
|
-
-h, --help Show this message
|
37
|
-
|
38
|
-
Potential issues:
|
39
|
-
|
40
|
-
* If you're on a Mac and are getting segmentation faults, see this discussion https://github.com/tenderlove/nokogiri/issues/404 and consider updating your version of libxml2.
|
41
|
-
Version 2.7.8 of libxml2 with the following worked for me:
|
42
|
-
gem install nokogiri -- --with-xml2-include=/usr/local/Cellar/libxml2/2.7.8/include/libxml2 --with-xml2-lib=/usr/local/Cellar/libxml2/2.7.8/lib --with-xslt-dir=/usr/local/Cellar/libxslt/1.1.26
|
43
|
-
|
44
|
-
===
|
45
|
-
|
46
|
-
This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
|
47
|
-
|
48
|
-
This is a ruby port of arc90's readability project
|
49
|
-
|
50
|
-
http://lab.arc90.com/experiments/readability/
|
51
|
-
|
52
|
-
Given a html document, it pulls out the main body text and cleans it up.
|
53
|
-
|
54
|
-
Ruby port by starrhorne, libc, and iterationlabs. Original gemification by fizx.
|