ruby-readability 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,6 +20,98 @@ describe Readability do
20
20
  HTML
21
21
  end
22
22
 
23
+ describe "images" do
24
+ before do
25
+ @bbc = File.read(File.dirname(__FILE__) + "/fixtures/bbc.html")
26
+ @nytimes = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html")
27
+ @thesum = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
28
+
29
+ FakeWeb::Registry.instance.clean_registry
30
+ FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg",
31
+ :body => File.read(File.dirname(__FILE__) + "/fixtures/images/dim_1416768a.jpg"))
32
+ end
33
+
34
+ it "should show one image, but outside of the best candidate" do
35
+ @doc = Readability::Document.new(@thesum)
36
+ @doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"]
37
+ @doc.best_candidate_has_image.should == false
38
+ end
39
+
40
+ it "should show one image inside of the best candidate" do
41
+ @doc = Readability::Document.new(@nytimes)
42
+ @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
43
+ @doc.best_candidate_has_image.should == true
44
+ end
45
+
46
+ it "should not try to download local images" do
47
+ @doc = Readability::Document.new(<<-HTML)
48
+ <html>
49
+ <head>
50
+ <title>title!</title>
51
+ </head>
52
+ <body class='comment'>
53
+ <div>
54
+ <img src="/something/local.gif" />
55
+ </div>
56
+ </body>
57
+ </html>
58
+ HTML
59
+ do_not_allow(@doc).load_image(anything)
60
+ @doc.images.should == []
61
+ end
62
+
63
+ describe "no images" do
64
+ it "shouldn't show images" do
65
+ @doc = Readability::Document.new(@bbc, :min_image_height => 600)
66
+ @doc.images.should == []
67
+ @doc.best_candidate_has_image.should == false
68
+ end
69
+ end
70
+
71
+ describe "poll of images" do
72
+ it "should show some images inside of the best candidate" do
73
+ @doc = Readability::Document.new(@bbc)
74
+ @doc.images.should =~ ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg",
75
+ "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg",
76
+ "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif",
77
+ "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
78
+ @doc.best_candidate_has_image.should == true
79
+ end
80
+
81
+ it "should show some images inside of the best candidate, include gif format" do
82
+ @doc = Readability::Document.new(@bbc, :ignore_image_format => [])
83
+ @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
84
+ @doc.best_candidate_has_image.should == true
85
+ end
86
+
87
+ describe "width, height and format" do
88
+ it "should show some images inside of the best candidate, but with width most equal to 400px" do
89
+ @doc = Readability::Document.new(@bbc, :min_image_width => 400, :ignore_image_format => [])
90
+ @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg"]
91
+ @doc.best_candidate_has_image.should == true
92
+ end
93
+
94
+ it "should show some images inside of the best candidate, but with width most equal to 304px" do
95
+ @doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => [])
96
+ @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
97
+ @doc.best_candidate_has_image.should == true
98
+ end
99
+
100
+ it "should show some images inside of the best candidate, but with width most equal to 304px and ignoring JPG format" do
101
+ @doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => ["jpg"])
102
+ @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
103
+ @doc.best_candidate_has_image.should == true
104
+ end
105
+
106
+ it "should show some images inside of the best candidate, but with height most equal to 400px, no ignoring no format" do
107
+ @doc = Readability::Document.new(@bbc, :min_image_height => 400, :ignore_image_format => [])
108
+ @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
109
+ @doc.best_candidate_has_image.should == true
110
+ end
111
+ end
112
+ end
113
+ end
114
+
23
115
  describe "transformMisusedDivsIntoParagraphs" do
24
116
  before do
25
117
  @doc = Readability::Document.new(@simple_html_fixture)
@@ -207,7 +299,7 @@ describe Readability do
207
299
  @html_files = Dir.glob(File.dirname(__FILE__) + "/fixtures/samples/*.html")
208
300
  @samples = @html_files.map {|filename| File.basename(filename, '.html') }
209
301
  end
210
-
302
+
211
303
  it "should output expected fragments of text" do
212
304
  checks = 0
213
305
  @samples.each do |sample|
@@ -216,12 +308,12 @@ describe Readability do
216
308
 
217
309
  load "fixtures/samples/#{sample}-fragments.rb"
218
310
  #puts "testing #{sample}..."
219
-
311
+
220
312
  $required_fragments.each do |required_text|
221
313
  doc.should include(required_text)
222
314
  checks += 1
223
315
  end
224
-
316
+
225
317
  $excluded_fragments.each do |text_to_avoid|
226
318
  doc.should_not include(text_to_avoid)
227
319
  checks += 1
@@ -1,7 +1,11 @@
1
1
  require 'rubygems'
2
2
  require 'readability'
3
3
  require 'rr'
4
+ require 'fakeweb'
5
+
6
+ FakeWeb.allow_net_connect = false
4
7
 
5
8
  RSpec.configure do |config|
6
9
  config.mock_with :rr
7
10
  end
11
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-readability
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.5.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -12,22 +12,33 @@ authors:
12
12
  autorequire:
13
13
  bindir: bin
14
14
  cert_chain: []
15
- date: 2012-01-24 00:00:00.000000000Z
15
+ date: 2012-03-14 00:00:00.000000000Z
16
16
  dependencies:
17
17
  - !ruby/object:Gem::Dependency
18
18
  name: rspec
19
- requirement: &70232951278200 !ruby/object:Gem::Requirement
19
+ requirement: &86479890 !ruby/object:Gem::Requirement
20
20
  none: false
21
21
  requirements:
22
22
  - - ! '>='
23
23
  - !ruby/object:Gem::Version
24
- version: '2.6'
24
+ version: '2.8'
25
25
  type: :development
26
26
  prerelease: false
27
- version_requirements: *70232951278200
27
+ version_requirements: *86479890
28
+ - !ruby/object:Gem::Dependency
29
+ name: rspec-expectations
30
+ requirement: &86479650 !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ! '>='
34
+ - !ruby/object:Gem::Version
35
+ version: '2.8'
36
+ type: :development
37
+ prerelease: false
38
+ version_requirements: *86479650
28
39
  - !ruby/object:Gem::Dependency
29
40
  name: rr
30
- requirement: &70232951277680 !ruby/object:Gem::Requirement
41
+ requirement: &86479420 !ruby/object:Gem::Requirement
31
42
  none: false
32
43
  requirements:
33
44
  - - ! '>='
@@ -35,10 +46,10 @@ dependencies:
35
46
  version: '1.0'
36
47
  type: :development
37
48
  prerelease: false
38
- version_requirements: *70232951277680
49
+ version_requirements: *86479420
39
50
  - !ruby/object:Gem::Dependency
40
51
  name: nokogiri
41
- requirement: &70232951277200 !ruby/object:Gem::Requirement
52
+ requirement: &86479190 !ruby/object:Gem::Requirement
42
53
  none: false
43
54
  requirements:
44
55
  - - ! '>='
@@ -46,10 +57,10 @@ dependencies:
46
57
  version: 1.4.2
47
58
  type: :runtime
48
59
  prerelease: false
49
- version_requirements: *70232951277200
60
+ version_requirements: *86479190
50
61
  - !ruby/object:Gem::Dependency
51
62
  name: guess_html_encoding
52
- requirement: &70232951276720 !ruby/object:Gem::Requirement
63
+ requirement: &86478960 !ruby/object:Gem::Requirement
53
64
  none: false
54
65
  requirements:
55
66
  - - ! '>='
@@ -57,7 +68,7 @@ dependencies:
57
68
  version: 0.0.2
58
69
  type: :runtime
59
70
  prerelease: false
60
- version_requirements: *70232951276720
71
+ version_requirements: *86478960
61
72
  description: Port of arc90's readability project to ruby
62
73
  email:
63
74
  - andrew@iterationlabs.com
@@ -68,14 +79,18 @@ extra_rdoc_files: []
68
79
  files:
69
80
  - .document
70
81
  - .gitignore
82
+ - .rspec
71
83
  - Gemfile
72
- - README
84
+ - README.markdown
73
85
  - Rakefile
74
86
  - bin/readability
75
87
  - lib/readability.rb
76
88
  - lib/ruby-readability.rb
77
89
  - ruby-readability.gemspec
90
+ - spec/fixtures/bbc.html
78
91
  - spec/fixtures/cant_read.html
92
+ - spec/fixtures/images/dim_1416768a.jpg
93
+ - spec/fixtures/nytimes.html
79
94
  - spec/fixtures/sample.html
80
95
  - spec/fixtures/samples/blogpost_with_links-fragments.rb
81
96
  - spec/fixtures/samples/blogpost_with_links.html
@@ -86,6 +101,7 @@ files:
86
101
  - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
87
102
  - spec/fixtures/samples/globemail-ottawa-cuts.html
88
103
  - spec/fixtures/should_not_truncate.txt
104
+ - spec/fixtures/thesun.html
89
105
  - spec/readability_spec.rb
90
106
  - spec/spec.opts
91
107
  - spec/spec_helper.rb
@@ -109,22 +125,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
109
125
  version: '0'
110
126
  requirements: []
111
127
  rubyforge_project: ruby-readability
112
- rubygems_version: 1.8.10
128
+ rubygems_version: 1.8.16
113
129
  signing_key:
114
130
  specification_version: 3
115
131
  summary: Port of arc90's readability project to ruby
116
- test_files:
117
- - spec/fixtures/cant_read.html
118
- - spec/fixtures/sample.html
119
- - spec/fixtures/samples/blogpost_with_links-fragments.rb
120
- - spec/fixtures/samples/blogpost_with_links.html
121
- - spec/fixtures/samples/channel4-1-fragments.rb
122
- - spec/fixtures/samples/channel4-1.html
123
- - spec/fixtures/samples/foxnews-india1-fragments.rb
124
- - spec/fixtures/samples/foxnews-india1.html
125
- - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
126
- - spec/fixtures/samples/globemail-ottawa-cuts.html
127
- - spec/fixtures/should_not_truncate.txt
128
- - spec/readability_spec.rb
129
- - spec/spec.opts
130
- - spec/spec_helper.rb
132
+ test_files: []
data/README DELETED
@@ -1,54 +0,0 @@
1
- Ruby Readability
2
-
3
- Command line:
4
- (sudo) gem install ruby-readability
5
-
6
- Bundler:
7
- gem "ruby-readability", :require => 'readability'
8
-
9
- Example:
10
-
11
- require 'rubygems'
12
- require 'readability'
13
- require 'open-uri'
14
-
15
- source = open('http://lab.arc90.com/experiments/readability/').read
16
- puts Readability::Document.new(source).content
17
-
18
- Options:
19
-
20
- You may provide additional options to Readability::Document.new, including:
21
-
22
- :tags - the base whitelist of tags to sanitize, defaults to %w[div p]
23
- :remove_empty_nodes - remove <p> tags that have no text content; also removes p tags that contain only images
24
- :attributes - whitelist of allowed attributes
25
- :debug - provide debugging output, defaults false
26
- :encoding - if this page is of a known encoding, you can specify it; if left
27
- unspecified, the encoding will be guessed (only in Ruby 1.9.x)
28
- :html_headers - in Ruby 1.9.x these will be passed to the guess_html_encoding gem
29
- to aid with guessing the HTML encoding
30
-
31
- Readability comes with a command-line tool for experimentation in bin/readability.
32
-
33
- Usage: readability [options] URL
34
- -d, --debug Show debug output
35
- -i, --images Keep images and links
36
- -h, --help Show this message
37
-
38
- Potential issues:
39
-
40
- * If you're on a Mac and are getting segmentation faults, see this discussion https://github.com/tenderlove/nokogiri/issues/404 and consider updating your version of libxml2.
41
- Version 2.7.8 of libxml2 with the following worked for me:
42
- gem install nokogiri -- --with-xml2-include=/usr/local/Cellar/libxml2/2.7.8/include/libxml2 --with-xml2-lib=/usr/local/Cellar/libxml2/2.7.8/lib --with-xslt-dir=/usr/local/Cellar/libxslt/1.1.26
43
-
44
- ===
45
-
46
- This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
47
-
48
- This is a ruby port of arc90's readability project
49
-
50
- http://lab.arc90.com/experiments/readability/
51
-
52
- Given a html document, it pulls out the main body text and cleans it up.
53
-
54
- Ruby port by starrhorne, libc, and iterationlabs. Original gemification by fizx.