ruby-readability 0.5.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rspec +3 -0
- data/Gemfile +6 -0
- data/README.markdown +73 -0
- data/bin/readability +0 -1
- data/lib/readability.rb +79 -14
- data/ruby-readability.gemspec +3 -2
- data/spec/fixtures/bbc.html +2069 -0
- data/spec/fixtures/images/dim_1416768a.jpg +0 -0
- data/spec/fixtures/nytimes.html +58 -0
- data/spec/fixtures/thesun.html +1122 -0
- data/spec/readability_spec.rb +95 -3
- data/spec/spec_helper.rb +4 -0
- metadata +30 -28
- data/README +0 -54
data/spec/readability_spec.rb
CHANGED
|
@@ -20,6 +20,98 @@ describe Readability do
|
|
|
20
20
|
HTML
|
|
21
21
|
end
|
|
22
22
|
|
|
23
|
+
describe "images" do
|
|
24
|
+
before do
|
|
25
|
+
@bbc = File.read(File.dirname(__FILE__) + "/fixtures/bbc.html")
|
|
26
|
+
@nytimes = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html")
|
|
27
|
+
@thesum = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
|
|
28
|
+
|
|
29
|
+
FakeWeb::Registry.instance.clean_registry
|
|
30
|
+
FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg",
|
|
31
|
+
:body => File.read(File.dirname(__FILE__) + "/fixtures/images/dim_1416768a.jpg"))
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
it "should show one image, but outside of the best candidate" do
|
|
35
|
+
@doc = Readability::Document.new(@thesum)
|
|
36
|
+
@doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"]
|
|
37
|
+
@doc.best_candidate_has_image.should == false
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
it "should show one image inside of the best candidate" do
|
|
41
|
+
@doc = Readability::Document.new(@nytimes)
|
|
42
|
+
@doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
|
|
43
|
+
@doc.best_candidate_has_image.should == true
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
it "should not try to download local images" do
|
|
47
|
+
@doc = Readability::Document.new(<<-HTML)
|
|
48
|
+
<html>
|
|
49
|
+
<head>
|
|
50
|
+
<title>title!</title>
|
|
51
|
+
</head>
|
|
52
|
+
<body class='comment'>
|
|
53
|
+
<div>
|
|
54
|
+
<img src="/something/local.gif" />
|
|
55
|
+
</div>
|
|
56
|
+
</body>
|
|
57
|
+
</html>
|
|
58
|
+
HTML
|
|
59
|
+
do_not_allow(@doc).load_image(anything)
|
|
60
|
+
@doc.images.should == []
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
describe "no images" do
|
|
64
|
+
it "shouldn't show images" do
|
|
65
|
+
@doc = Readability::Document.new(@bbc, :min_image_height => 600)
|
|
66
|
+
@doc.images.should == []
|
|
67
|
+
@doc.best_candidate_has_image.should == false
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
describe "poll of images" do
|
|
72
|
+
it "should show some images inside of the best candidate" do
|
|
73
|
+
@doc = Readability::Document.new(@bbc)
|
|
74
|
+
@doc.images.should =~ ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg",
|
|
75
|
+
"http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg",
|
|
76
|
+
"http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif",
|
|
77
|
+
"http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
|
|
78
|
+
@doc.best_candidate_has_image.should == true
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
it "should show some images inside of the best candidate, include gif format" do
|
|
82
|
+
@doc = Readability::Document.new(@bbc, :ignore_image_format => [])
|
|
83
|
+
@doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
|
|
84
|
+
@doc.best_candidate_has_image.should == true
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
describe "width, height and format" do
|
|
88
|
+
it "should show some images inside of the best candidate, but with width most equal to 400px" do
|
|
89
|
+
@doc = Readability::Document.new(@bbc, :min_image_width => 400, :ignore_image_format => [])
|
|
90
|
+
@doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg"]
|
|
91
|
+
@doc.best_candidate_has_image.should == true
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
it "should show some images inside of the best candidate, but with width most equal to 304px" do
|
|
95
|
+
@doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => [])
|
|
96
|
+
@doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
|
|
97
|
+
@doc.best_candidate_has_image.should == true
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
it "should show some images inside of the best candidate, but with width most equal to 304px and ignoring JPG format" do
|
|
101
|
+
@doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => ["jpg"])
|
|
102
|
+
@doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
|
|
103
|
+
@doc.best_candidate_has_image.should == true
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
it "should show some images inside of the best candidate, but with height most equal to 400px, no ignoring no format" do
|
|
107
|
+
@doc = Readability::Document.new(@bbc, :min_image_height => 400, :ignore_image_format => [])
|
|
108
|
+
@doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
|
|
109
|
+
@doc.best_candidate_has_image.should == true
|
|
110
|
+
end
|
|
111
|
+
end
|
|
112
|
+
end
|
|
113
|
+
end
|
|
114
|
+
|
|
23
115
|
describe "transformMisusedDivsIntoParagraphs" do
|
|
24
116
|
before do
|
|
25
117
|
@doc = Readability::Document.new(@simple_html_fixture)
|
|
@@ -207,7 +299,7 @@ describe Readability do
|
|
|
207
299
|
@html_files = Dir.glob(File.dirname(__FILE__) + "/fixtures/samples/*.html")
|
|
208
300
|
@samples = @html_files.map {|filename| File.basename(filename, '.html') }
|
|
209
301
|
end
|
|
210
|
-
|
|
302
|
+
|
|
211
303
|
it "should output expected fragments of text" do
|
|
212
304
|
checks = 0
|
|
213
305
|
@samples.each do |sample|
|
|
@@ -216,12 +308,12 @@ describe Readability do
|
|
|
216
308
|
|
|
217
309
|
load "fixtures/samples/#{sample}-fragments.rb"
|
|
218
310
|
#puts "testing #{sample}..."
|
|
219
|
-
|
|
311
|
+
|
|
220
312
|
$required_fragments.each do |required_text|
|
|
221
313
|
doc.should include(required_text)
|
|
222
314
|
checks += 1
|
|
223
315
|
end
|
|
224
|
-
|
|
316
|
+
|
|
225
317
|
$excluded_fragments.each do |text_to_avoid|
|
|
226
318
|
doc.should_not include(text_to_avoid)
|
|
227
319
|
checks += 1
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ruby-readability
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.5.0
|
|
4
|
+
version: 0.5.1
|
|
5
5
|
prerelease:
|
|
6
6
|
platform: ruby
|
|
7
7
|
authors:
|
|
@@ -12,22 +12,33 @@ authors:
|
|
|
12
12
|
autorequire:
|
|
13
13
|
bindir: bin
|
|
14
14
|
cert_chain: []
|
|
15
|
-
date: 2012-
|
|
15
|
+
date: 2012-03-14 00:00:00.000000000Z
|
|
16
16
|
dependencies:
|
|
17
17
|
- !ruby/object:Gem::Dependency
|
|
18
18
|
name: rspec
|
|
19
|
-
requirement: &
|
|
19
|
+
requirement: &86479890 !ruby/object:Gem::Requirement
|
|
20
20
|
none: false
|
|
21
21
|
requirements:
|
|
22
22
|
- - ! '>='
|
|
23
23
|
- !ruby/object:Gem::Version
|
|
24
|
-
version: '2.
|
|
24
|
+
version: '2.8'
|
|
25
25
|
type: :development
|
|
26
26
|
prerelease: false
|
|
27
|
-
version_requirements: *
|
|
27
|
+
version_requirements: *86479890
|
|
28
|
+
- !ruby/object:Gem::Dependency
|
|
29
|
+
name: rspec-expectations
|
|
30
|
+
requirement: &86479650 !ruby/object:Gem::Requirement
|
|
31
|
+
none: false
|
|
32
|
+
requirements:
|
|
33
|
+
- - ! '>='
|
|
34
|
+
- !ruby/object:Gem::Version
|
|
35
|
+
version: '2.8'
|
|
36
|
+
type: :development
|
|
37
|
+
prerelease: false
|
|
38
|
+
version_requirements: *86479650
|
|
28
39
|
- !ruby/object:Gem::Dependency
|
|
29
40
|
name: rr
|
|
30
|
-
requirement: &
|
|
41
|
+
requirement: &86479420 !ruby/object:Gem::Requirement
|
|
31
42
|
none: false
|
|
32
43
|
requirements:
|
|
33
44
|
- - ! '>='
|
|
@@ -35,10 +46,10 @@ dependencies:
|
|
|
35
46
|
version: '1.0'
|
|
36
47
|
type: :development
|
|
37
48
|
prerelease: false
|
|
38
|
-
version_requirements: *
|
|
49
|
+
version_requirements: *86479420
|
|
39
50
|
- !ruby/object:Gem::Dependency
|
|
40
51
|
name: nokogiri
|
|
41
|
-
requirement: &
|
|
52
|
+
requirement: &86479190 !ruby/object:Gem::Requirement
|
|
42
53
|
none: false
|
|
43
54
|
requirements:
|
|
44
55
|
- - ! '>='
|
|
@@ -46,10 +57,10 @@ dependencies:
|
|
|
46
57
|
version: 1.4.2
|
|
47
58
|
type: :runtime
|
|
48
59
|
prerelease: false
|
|
49
|
-
version_requirements: *
|
|
60
|
+
version_requirements: *86479190
|
|
50
61
|
- !ruby/object:Gem::Dependency
|
|
51
62
|
name: guess_html_encoding
|
|
52
|
-
requirement: &
|
|
63
|
+
requirement: &86478960 !ruby/object:Gem::Requirement
|
|
53
64
|
none: false
|
|
54
65
|
requirements:
|
|
55
66
|
- - ! '>='
|
|
@@ -57,7 +68,7 @@ dependencies:
|
|
|
57
68
|
version: 0.0.2
|
|
58
69
|
type: :runtime
|
|
59
70
|
prerelease: false
|
|
60
|
-
version_requirements: *
|
|
71
|
+
version_requirements: *86478960
|
|
61
72
|
description: Port of arc90's readability project to ruby
|
|
62
73
|
email:
|
|
63
74
|
- andrew@iterationlabs.com
|
|
@@ -68,14 +79,18 @@ extra_rdoc_files: []
|
|
|
68
79
|
files:
|
|
69
80
|
- .document
|
|
70
81
|
- .gitignore
|
|
82
|
+
- .rspec
|
|
71
83
|
- Gemfile
|
|
72
|
-
- README
|
|
84
|
+
- README.markdown
|
|
73
85
|
- Rakefile
|
|
74
86
|
- bin/readability
|
|
75
87
|
- lib/readability.rb
|
|
76
88
|
- lib/ruby-readability.rb
|
|
77
89
|
- ruby-readability.gemspec
|
|
90
|
+
- spec/fixtures/bbc.html
|
|
78
91
|
- spec/fixtures/cant_read.html
|
|
92
|
+
- spec/fixtures/images/dim_1416768a.jpg
|
|
93
|
+
- spec/fixtures/nytimes.html
|
|
79
94
|
- spec/fixtures/sample.html
|
|
80
95
|
- spec/fixtures/samples/blogpost_with_links-fragments.rb
|
|
81
96
|
- spec/fixtures/samples/blogpost_with_links.html
|
|
@@ -86,6 +101,7 @@ files:
|
|
|
86
101
|
- spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
|
|
87
102
|
- spec/fixtures/samples/globemail-ottawa-cuts.html
|
|
88
103
|
- spec/fixtures/should_not_truncate.txt
|
|
104
|
+
- spec/fixtures/thesun.html
|
|
89
105
|
- spec/readability_spec.rb
|
|
90
106
|
- spec/spec.opts
|
|
91
107
|
- spec/spec_helper.rb
|
|
@@ -109,22 +125,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
109
125
|
version: '0'
|
|
110
126
|
requirements: []
|
|
111
127
|
rubyforge_project: ruby-readability
|
|
112
|
-
rubygems_version: 1.8.
|
|
128
|
+
rubygems_version: 1.8.16
|
|
113
129
|
signing_key:
|
|
114
130
|
specification_version: 3
|
|
115
131
|
summary: Port of arc90's readability project to ruby
|
|
116
|
-
test_files:
|
|
117
|
-
- spec/fixtures/cant_read.html
|
|
118
|
-
- spec/fixtures/sample.html
|
|
119
|
-
- spec/fixtures/samples/blogpost_with_links-fragments.rb
|
|
120
|
-
- spec/fixtures/samples/blogpost_with_links.html
|
|
121
|
-
- spec/fixtures/samples/channel4-1-fragments.rb
|
|
122
|
-
- spec/fixtures/samples/channel4-1.html
|
|
123
|
-
- spec/fixtures/samples/foxnews-india1-fragments.rb
|
|
124
|
-
- spec/fixtures/samples/foxnews-india1.html
|
|
125
|
-
- spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
|
|
126
|
-
- spec/fixtures/samples/globemail-ottawa-cuts.html
|
|
127
|
-
- spec/fixtures/should_not_truncate.txt
|
|
128
|
-
- spec/readability_spec.rb
|
|
129
|
-
- spec/spec.opts
|
|
130
|
-
- spec/spec_helper.rb
|
|
132
|
+
test_files: []
|
data/README
DELETED
|
@@ -1,54 +0,0 @@
|
|
|
1
|
-
Ruby Readability
|
|
2
|
-
|
|
3
|
-
Command line:
|
|
4
|
-
(sudo) gem install ruby-readability
|
|
5
|
-
|
|
6
|
-
Bundler:
|
|
7
|
-
gem "ruby-readability", :require => 'readability'
|
|
8
|
-
|
|
9
|
-
Example:
|
|
10
|
-
|
|
11
|
-
require 'rubygems'
|
|
12
|
-
require 'readability'
|
|
13
|
-
require 'open-uri'
|
|
14
|
-
|
|
15
|
-
source = open('http://lab.arc90.com/experiments/readability/').read
|
|
16
|
-
puts Readability::Document.new(source).content
|
|
17
|
-
|
|
18
|
-
Options:
|
|
19
|
-
|
|
20
|
-
You may provide additions options to Readability::Document.new, including:
|
|
21
|
-
|
|
22
|
-
:tags - the base whitelist of tags to sanitize, defaults to %w[div p]
|
|
23
|
-
:remove_empty_nodes - remove <p> tags that have no text content; also removes p tags that contain only images
|
|
24
|
-
:attributes - whitelist of allowed attributes
|
|
25
|
-
:debug - provide debugging output, defaults false
|
|
26
|
-
:encoding - if this page is of a known encoding, you can specify it; if left
|
|
27
|
-
unspecified, the encoding will be guessed (only in Ruby 1.9.x)
|
|
28
|
-
:html_headers - in Ruby 1.9.x these will be passed to the guess_html_encoding gem
|
|
29
|
-
to aid with guessing the HTML encoding
|
|
30
|
-
|
|
31
|
-
Readability comes with a command-line tool for experimentation in bin/readability.
|
|
32
|
-
|
|
33
|
-
Usage: readability [options] URL
|
|
34
|
-
-d, --debug Show debug output
|
|
35
|
-
-i, --images Keep images and links
|
|
36
|
-
-h, --help Show this message
|
|
37
|
-
|
|
38
|
-
Potential issues:
|
|
39
|
-
|
|
40
|
-
* If you're on a Mac and are getting segmentation faults, see this discussion https://github.com/tenderlove/nokogiri/issues/404 and consider updating your version of libxml2.
|
|
41
|
-
Version 2.7.8 of libxml2 with the following worked for me:
|
|
42
|
-
gem install nokogiri -- --with-xml2-include=/usr/local/Cellar/libxml2/2.7.8/include/libxml2 --with-xml2-lib=/usr/local/Cellar/libxml2/2.7.8/lib --with-xslt-dir=/usr/local/Cellar/libxslt/1.1.26
|
|
43
|
-
|
|
44
|
-
===
|
|
45
|
-
|
|
46
|
-
This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
|
|
47
|
-
|
|
48
|
-
This is a ruby port of arc90's readability project
|
|
49
|
-
|
|
50
|
-
http://lab.arc90.com/experiments/readability/
|
|
51
|
-
|
|
52
|
-
Given a html document, it pulls out the main body text and cleans it up.
|
|
53
|
-
|
|
54
|
-
Ruby port by starrhorne, libc, and iterationlabs. Original gemification by fizx.
|